summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/activity
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:17:33 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:17:33 +0000
commit5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree739caf8c461053357daa9f162bef34516c7bf452 /src/backend/utils/activity
parentInitial commit. (diff)
downloadpostgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.tar.xz
postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.zip
Adding upstream version 15.5.upstream/15.5
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/activity')
-rw-r--r--src/backend/utils/activity/Makefile34
-rw-r--r--src/backend/utils/activity/backend_progress.c112
-rw-r--r--src/backend/utils/activity/backend_status.c1151
-rw-r--r--src/backend/utils/activity/pgstat.c1678
-rw-r--r--src/backend/utils/activity/pgstat_archiver.c111
-rw-r--r--src/backend/utils/activity/pgstat_bgwriter.c110
-rw-r--r--src/backend/utils/activity/pgstat_checkpointer.c121
-rw-r--r--src/backend/utils/activity/pgstat_database.c437
-rw-r--r--src/backend/utils/activity/pgstat_function.c243
-rw-r--r--src/backend/utils/activity/pgstat_relation.c938
-rw-r--r--src/backend/utils/activity/pgstat_replslot.c224
-rw-r--r--src/backend/utils/activity/pgstat_shmem.c1003
-rw-r--r--src/backend/utils/activity/pgstat_slru.c248
-rw-r--r--src/backend/utils/activity/pgstat_subscription.c110
-rw-r--r--src/backend/utils/activity/pgstat_wal.c182
-rw-r--r--src/backend/utils/activity/pgstat_xact.c391
-rw-r--r--src/backend/utils/activity/wait_event.c749
17 files changed, 7842 insertions, 0 deletions
diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile
new file mode 100644
index 0000000..a2e8507
--- /dev/null
+++ b/src/backend/utils/activity/Makefile
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for backend/utils/activity
+#
+# Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/backend/utils/activity/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/utils/activity
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+# One object per activity/statistics subsystem; keep the list in
+# alphabetical order.
+OBJS = \
+ backend_progress.o \
+ backend_status.o \
+ pgstat.o \
+ pgstat_archiver.o \
+ pgstat_bgwriter.o \
+ pgstat_checkpointer.o \
+ pgstat_database.o \
+ pgstat_function.o \
+ pgstat_relation.o \
+ pgstat_replslot.o \
+ pgstat_shmem.o \
+ pgstat_slru.o \
+ pgstat_subscription.o \
+ pgstat_wal.o \
+ pgstat_xact.o \
+ wait_event.o
+
+# common.mk supplies the standard rules for building backend subdirectories
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/activity/backend_progress.c b/src/backend/utils/activity/backend_progress.c
new file mode 100644
index 0000000..f291997
--- /dev/null
+++ b/src/backend/utils/activity/backend_progress.c
@@ -0,0 +1,112 @@
+/* ----------
+ * backend_progress.c
+ *
+ * Command progress reporting infrastructure.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/activity/backend_progress.c
+ * ----------
+ */
+#include "postgres.h"
+
+#include "port/atomics.h" /* for memory barriers */
+#include "utils/backend_progress.h"
+#include "utils/backend_status.h"
+
+
+/*-----------
+ * pgstat_progress_start_command() -
+ *
+ * Set st_progress_command (and st_progress_command_target) in own backend
+ * entry. Also, zero-initialize st_progress_param array.
+ *-----------
+ */
+void
+pgstat_progress_start_command(ProgressCommandType cmdtype, Oid relid)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+
+ /* No shared entry yet (pgstat_beinit not run) or tracking disabled */
+ if (!beentry || !pgstat_track_activities)
+ return;
+
+ /* Bump st_changecount to odd so concurrent readers retry the copy */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+ beentry->st_progress_command = cmdtype;
+ beentry->st_progress_command_target = relid;
+ MemSet(&beentry->st_progress_param, 0, sizeof(beentry->st_progress_param));
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+/*-----------
+ * pgstat_progress_update_param() -
+ *
+ * Update index'th member in st_progress_param[] of own backend entry.
+ *-----------
+ */
+void
+pgstat_progress_update_param(int index, int64 val)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+
+ /* Bounds are verified only in assert-enabled builds */
+ Assert(index >= 0 && index < PGSTAT_NUM_PROGRESS_PARAM);
+
+ if (!beentry || !pgstat_track_activities)
+ return;
+
+ /* Single write, still bracketed by the st_changecount protocol */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+ beentry->st_progress_param[index] = val;
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+/*-----------
+ * pgstat_progress_update_multi_param() -
+ *
+ * Update multiple members in st_progress_param[] of own backend entry.
+ * This is atomic; readers won't see intermediate states.
+ *
+ * nparam is the number of entries in the parallel index[]/val[] arrays.
+ *-----------
+ */
+void
+pgstat_progress_update_multi_param(int nparam, const int *index,
+ const int64 *val)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+ int i;
+
+ if (!beentry || !pgstat_track_activities || nparam == 0)
+ return;
+
+ /*
+ * All writes happen inside one changecount critical section, which is
+ * what makes the multi-field update appear atomic to readers.
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+
+ for (i = 0; i < nparam; ++i)
+ {
+ Assert(index[i] >= 0 && index[i] < PGSTAT_NUM_PROGRESS_PARAM);
+
+ beentry->st_progress_param[index[i]] = val[i];
+ }
+
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+/*-----------
+ * pgstat_progress_end_command() -
+ *
+ * Reset st_progress_command (and st_progress_command_target) in own backend
+ * entry. This signals the end of the command.
+ *-----------
+ */
+void
+pgstat_progress_end_command(void)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+
+ if (!beentry || !pgstat_track_activities)
+ return;
+
+ /* Already reset; avoid bumping st_changecount for a no-op write */
+ if (beentry->st_progress_command == PROGRESS_COMMAND_INVALID)
+ return;
+
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+ beentry->st_progress_command = PROGRESS_COMMAND_INVALID;
+ beentry->st_progress_command_target = InvalidOid;
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
new file mode 100644
index 0000000..3ecb15d
--- /dev/null
+++ b/src/backend/utils/activity/backend_status.c
@@ -0,0 +1,1151 @@
+/* ----------
+ * backend_status.c
+ * Backend status reporting infrastructure.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/backend_status.c
+ * ----------
+ */
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "libpq/libpq.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "port/atomics.h" /* for memory barriers */
+#include "storage/ipc.h"
+#include "storage/proc.h" /* for MyProc */
+#include "storage/sinvaladt.h"
+#include "utils/ascii.h"
+#include "utils/backend_status.h"
+#include "utils/guc.h" /* for application_name */
+#include "utils/memutils.h"
+
+
+/* ----------
+ * Total number of backends including auxiliary
+ *
+ * We reserve a slot for each possible BackendId, plus one for each
+ * possible auxiliary process type. (This scheme assumes there is not
+ * more than one of any auxiliary process type at a time.) MaxBackends
+ * includes autovacuum workers and background workers as well.
+ * ----------
+ */
+#define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES)
+
+
+/* ----------
+ * GUC parameters
+ * ----------
+ */
+bool pgstat_track_activities = false;
+int pgstat_track_activity_query_size = 1024;
+
+
+/* exposed so that backend_progress.c can access it */
+PgBackendStatus *MyBEEntry = NULL;
+
+
+static PgBackendStatus *BackendStatusArray = NULL;
+static char *BackendAppnameBuffer = NULL;
+static char *BackendClientHostnameBuffer = NULL;
+static char *BackendActivityBuffer = NULL;
+static Size BackendActivityBufferSize = 0;
+#ifdef USE_SSL
+static PgBackendSSLStatus *BackendSslStatusBuffer = NULL;
+#endif
+#ifdef ENABLE_GSS
+static PgBackendGSSStatus *BackendGssStatusBuffer = NULL;
+#endif
+
+
+/* Status for backends including auxiliary */
+static LocalPgBackendStatus *localBackendStatusTable = NULL;
+
+/* Total number of backends including auxiliary */
+static int localNumBackends = 0;
+
+static MemoryContext backendStatusSnapContext;
+
+
+static void pgstat_beshutdown_hook(int code, Datum arg);
+static void pgstat_read_current_status(void);
+static void pgstat_setup_backend_status_context(void);
+
+
+/*
+ * Report shared-memory space needed by CreateSharedBackendStatus.
+ *
+ * The breakdown here must agree exactly with the allocations made in
+ * CreateSharedBackendStatus; both are sized by NumBackendStatSlots.
+ * mul_size/add_size error out on overflow rather than wrapping.
+ */
+Size
+BackendStatusShmemSize(void)
+{
+ Size size;
+
+ /* BackendStatusArray: */
+ size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots);
+ /* BackendAppnameBuffer: */
+ size = add_size(size,
+ mul_size(NAMEDATALEN, NumBackendStatSlots));
+ /* BackendClientHostnameBuffer: */
+ size = add_size(size,
+ mul_size(NAMEDATALEN, NumBackendStatSlots));
+ /* BackendActivityBuffer: */
+ size = add_size(size,
+ mul_size(pgstat_track_activity_query_size, NumBackendStatSlots));
+#ifdef USE_SSL
+ /* BackendSslStatusBuffer: */
+ size = add_size(size,
+ mul_size(sizeof(PgBackendSSLStatus), NumBackendStatSlots));
+#endif
+#ifdef ENABLE_GSS
+ /* BackendGssStatusBuffer: */
+ size = add_size(size,
+ mul_size(sizeof(PgBackendGSSStatus), NumBackendStatSlots));
+#endif
+ return size;
+}
+
+/*
+ * Initialize the shared status array and several string buffers
+ * during postmaster startup.
+ *
+ * Each out-of-line buffer (appname, client hostname, activity, SSL/GSS
+ * status) is carved into NumBackendStatSlots fixed-size pieces, and each
+ * PgBackendStatus slot gets pointers into its pieces. On re-attach
+ * (found == true) the existing contents are left alone.
+ */
+void
+CreateSharedBackendStatus(void)
+{
+ Size size;
+ bool found;
+ int i;
+ char *buffer;
+
+ /* Create or attach to the shared array */
+ size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots);
+ BackendStatusArray = (PgBackendStatus *)
+ ShmemInitStruct("Backend Status Array", size, &found);
+
+ if (!found)
+ {
+ /*
+ * We're the first - initialize.
+ */
+ MemSet(BackendStatusArray, 0, size);
+ }
+
+ /* Create or attach to the shared appname buffer */
+ size = mul_size(NAMEDATALEN, NumBackendStatSlots);
+ BackendAppnameBuffer = (char *)
+ ShmemInitStruct("Backend Application Name Buffer", size, &found);
+
+ if (!found)
+ {
+ MemSet(BackendAppnameBuffer, 0, size);
+
+ /* Initialize st_appname pointers: one NAMEDATALEN chunk per slot. */
+ buffer = BackendAppnameBuffer;
+ for (i = 0; i < NumBackendStatSlots; i++)
+ {
+ BackendStatusArray[i].st_appname = buffer;
+ buffer += NAMEDATALEN;
+ }
+ }
+
+ /* Create or attach to the shared client hostname buffer */
+ size = mul_size(NAMEDATALEN, NumBackendStatSlots);
+ BackendClientHostnameBuffer = (char *)
+ ShmemInitStruct("Backend Client Host Name Buffer", size, &found);
+
+ if (!found)
+ {
+ MemSet(BackendClientHostnameBuffer, 0, size);
+
+ /* Initialize st_clienthostname pointers. */
+ buffer = BackendClientHostnameBuffer;
+ for (i = 0; i < NumBackendStatSlots; i++)
+ {
+ BackendStatusArray[i].st_clienthostname = buffer;
+ buffer += NAMEDATALEN;
+ }
+ }
+
+ /*
+ * Create or attach to the shared activity buffer; its per-slot chunk
+ * size is the pgstat_track_activity_query_size GUC.
+ */
+ BackendActivityBufferSize = mul_size(pgstat_track_activity_query_size,
+ NumBackendStatSlots);
+ BackendActivityBuffer = (char *)
+ ShmemInitStruct("Backend Activity Buffer",
+ BackendActivityBufferSize,
+ &found);
+
+ if (!found)
+ {
+ MemSet(BackendActivityBuffer, 0, BackendActivityBufferSize);
+
+ /* Initialize st_activity pointers. */
+ buffer = BackendActivityBuffer;
+ for (i = 0; i < NumBackendStatSlots; i++)
+ {
+ BackendStatusArray[i].st_activity_raw = buffer;
+ buffer += pgstat_track_activity_query_size;
+ }
+ }
+
+#ifdef USE_SSL
+ /* Create or attach to the shared SSL status buffer */
+ size = mul_size(sizeof(PgBackendSSLStatus), NumBackendStatSlots);
+ BackendSslStatusBuffer = (PgBackendSSLStatus *)
+ ShmemInitStruct("Backend SSL Status Buffer", size, &found);
+
+ if (!found)
+ {
+ PgBackendSSLStatus *ptr;
+
+ MemSet(BackendSslStatusBuffer, 0, size);
+
+ /* Initialize st_sslstatus pointers. */
+ ptr = BackendSslStatusBuffer;
+ for (i = 0; i < NumBackendStatSlots; i++)
+ {
+ BackendStatusArray[i].st_sslstatus = ptr;
+ ptr++;
+ }
+ }
+#endif
+
+#ifdef ENABLE_GSS
+ /* Create or attach to the shared GSSAPI status buffer */
+ size = mul_size(sizeof(PgBackendGSSStatus), NumBackendStatSlots);
+ BackendGssStatusBuffer = (PgBackendGSSStatus *)
+ ShmemInitStruct("Backend GSS Status Buffer", size, &found);
+
+ if (!found)
+ {
+ PgBackendGSSStatus *ptr;
+
+ MemSet(BackendGssStatusBuffer, 0, size);
+
+ /* Initialize st_gssstatus pointers. */
+ ptr = BackendGssStatusBuffer;
+ for (i = 0; i < NumBackendStatSlots; i++)
+ {
+ BackendStatusArray[i].st_gssstatus = ptr;
+ ptr++;
+ }
+ }
+#endif
+}
+
+/*
+ * Initialize pgstats backend activity state, and set up our on-proc-exit
+ * hook. Called from InitPostgres and AuxiliaryProcessMain. For auxiliary
+ * process, MyBackendId is invalid. Otherwise, MyBackendId must be set, but we
+ * must not have started any transaction yet (since the exit hook must run
+ * after the last transaction exit).
+ *
+ * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful.
+ */
+void
+pgstat_beinit(void)
+{
+ /* Initialize MyBEEntry: regular backends use slot MyBackendId - 1 */
+ if (MyBackendId != InvalidBackendId)
+ {
+ Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
+ MyBEEntry = &BackendStatusArray[MyBackendId - 1];
+ }
+ else
+ {
+ /* Must be an auxiliary process */
+ Assert(MyAuxProcType != NotAnAuxProcess);
+
+ /*
+ * Assign the MyBEEntry for an auxiliary process. Since it doesn't
+ * have a BackendId, the slot is statically allocated based on the
+ * auxiliary process type (MyAuxProcType). Backends use slots indexed
+ * in the range from 1 to MaxBackends (inclusive), so we use
+ * MaxBackends + AuxBackendType + 1 as the index of the slot for an
+ * auxiliary process.
+ */
+ MyBEEntry = &BackendStatusArray[MaxBackends + MyAuxProcType];
+ }
+
+ /* Set up a process-exit hook to clean up */
+ on_shmem_exit(pgstat_beshutdown_hook, 0);
+}
+
+
+/* ----------
+ * pgstat_bestart() -
+ *
+ * Initialize this backend's entry in the PgBackendStatus array.
+ * Called from InitPostgres.
+ *
+ * Apart from auxiliary processes, MyBackendId, MyDatabaseId,
+ * session userid, and application_name must be set for a
+ * backend (hence, this cannot be combined with pgstat_beinit).
+ * Note also that we must be inside a transaction if this isn't an aux
+ * process, as we may need to do encoding conversion on some strings.
+ * ----------
+ */
+void
+pgstat_bestart(void)
+{
+ volatile PgBackendStatus *vbeentry = MyBEEntry;
+ PgBackendStatus lbeentry;
+#ifdef USE_SSL
+ PgBackendSSLStatus lsslstatus;
+#endif
+#ifdef ENABLE_GSS
+ PgBackendGSSStatus lgssstatus;
+#endif
+
+ /* pgstats state must be initialized from pgstat_beinit() */
+ Assert(vbeentry != NULL);
+
+ /*
+ * To minimize the time spent modifying the PgBackendStatus entry, and
+ * avoid risk of errors inside the critical section, we first copy the
+ * shared-memory struct to a local variable, then modify the data in the
+ * local variable, then copy the local variable back to shared memory.
+ * Only the last step has to be inside the critical section.
+ *
+ * Most of the data we copy from shared memory is just going to be
+ * overwritten, but the struct's not so large that it's worth the
+ * maintenance hassle to copy only the needful fields.
+ */
+ memcpy(&lbeentry,
+ unvolatize(PgBackendStatus *, vbeentry),
+ sizeof(PgBackendStatus));
+
+ /* These structs can just start from zeroes each time, though */
+#ifdef USE_SSL
+ memset(&lsslstatus, 0, sizeof(lsslstatus));
+#endif
+#ifdef ENABLE_GSS
+ memset(&lgssstatus, 0, sizeof(lgssstatus));
+#endif
+
+ /*
+ * Now fill in all the fields of lbeentry, except for strings that are
+ * out-of-line data. Those have to be handled separately, below.
+ */
+ lbeentry.st_procpid = MyProcPid;
+ lbeentry.st_backendType = MyBackendType;
+ lbeentry.st_proc_start_timestamp = MyStartTimestamp;
+ lbeentry.st_activity_start_timestamp = 0;
+ lbeentry.st_state_start_timestamp = 0;
+ lbeentry.st_xact_start_timestamp = 0;
+ lbeentry.st_databaseid = MyDatabaseId;
+
+ /* We have userid for client-backends, wal-sender and bgworker processes */
+ if (lbeentry.st_backendType == B_BACKEND
+ || lbeentry.st_backendType == B_WAL_SENDER
+ || lbeentry.st_backendType == B_BG_WORKER)
+ lbeentry.st_userid = GetSessionUserId();
+ else
+ lbeentry.st_userid = InvalidOid;
+
+ /*
+ * We may not have a MyProcPort (eg, if this is the autovacuum process).
+ * If so, use all-zeroes client address, which is dealt with specially in
+ * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
+ */
+ if (MyProcPort)
+ memcpy(&lbeentry.st_clientaddr, &MyProcPort->raddr,
+ sizeof(lbeentry.st_clientaddr));
+ else
+ MemSet(&lbeentry.st_clientaddr, 0, sizeof(lbeentry.st_clientaddr));
+
+#ifdef USE_SSL
+ /* Capture SSL connection details into the local SSL-status struct */
+ if (MyProcPort && MyProcPort->ssl_in_use)
+ {
+ lbeentry.st_ssl = true;
+ lsslstatus.ssl_bits = be_tls_get_cipher_bits(MyProcPort);
+ strlcpy(lsslstatus.ssl_version, be_tls_get_version(MyProcPort), NAMEDATALEN);
+ strlcpy(lsslstatus.ssl_cipher, be_tls_get_cipher(MyProcPort), NAMEDATALEN);
+ be_tls_get_peer_subject_name(MyProcPort, lsslstatus.ssl_client_dn, NAMEDATALEN);
+ be_tls_get_peer_serial(MyProcPort, lsslstatus.ssl_client_serial, NAMEDATALEN);
+ be_tls_get_peer_issuer_name(MyProcPort, lsslstatus.ssl_issuer_dn, NAMEDATALEN);
+ }
+ else
+ {
+ lbeentry.st_ssl = false;
+ }
+#else
+ lbeentry.st_ssl = false;
+#endif
+
+#ifdef ENABLE_GSS
+ /* Likewise for GSSAPI state; principal may legitimately be absent */
+ if (MyProcPort && MyProcPort->gss != NULL)
+ {
+ const char *princ = be_gssapi_get_princ(MyProcPort);
+
+ lbeentry.st_gss = true;
+ lgssstatus.gss_auth = be_gssapi_get_auth(MyProcPort);
+ lgssstatus.gss_enc = be_gssapi_get_enc(MyProcPort);
+ if (princ)
+ strlcpy(lgssstatus.gss_princ, princ, NAMEDATALEN);
+ }
+ else
+ {
+ lbeentry.st_gss = false;
+ }
+#else
+ lbeentry.st_gss = false;
+#endif
+
+ lbeentry.st_state = STATE_UNDEFINED;
+ lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID;
+ lbeentry.st_progress_command_target = InvalidOid;
+ lbeentry.st_query_id = UINT64CONST(0);
+
+ /*
+ * we don't zero st_progress_param here to save cycles; nobody should
+ * examine it until st_progress_command has been set to something other
+ * than PROGRESS_COMMAND_INVALID
+ */
+
+ /*
+ * We're ready to enter the critical section that fills the shared-memory
+ * status entry. We follow the protocol of bumping st_changecount before
+ * and after; and make sure it's even afterwards. We use a volatile
+ * pointer here to ensure the compiler doesn't try to get cute.
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(vbeentry);
+
+ /* make sure we'll memcpy the same st_changecount back */
+ lbeentry.st_changecount = vbeentry->st_changecount;
+
+ memcpy(unvolatize(PgBackendStatus *, vbeentry),
+ &lbeentry,
+ sizeof(PgBackendStatus));
+
+ /*
+ * We can write the out-of-line strings and structs using the pointers
+ * that are in lbeentry; this saves some de-volatilizing messiness.
+ * (lbeentry's pointer fields still point into shared memory, since the
+ * struct was copied from there above.)
+ */
+ lbeentry.st_appname[0] = '\0';
+ if (MyProcPort && MyProcPort->remote_hostname)
+ strlcpy(lbeentry.st_clienthostname, MyProcPort->remote_hostname,
+ NAMEDATALEN);
+ else
+ lbeentry.st_clienthostname[0] = '\0';
+ lbeentry.st_activity_raw[0] = '\0';
+ /* Also make sure the last byte in each string area is always 0 */
+ lbeentry.st_appname[NAMEDATALEN - 1] = '\0';
+ lbeentry.st_clienthostname[NAMEDATALEN - 1] = '\0';
+ lbeentry.st_activity_raw[pgstat_track_activity_query_size - 1] = '\0';
+
+#ifdef USE_SSL
+ memcpy(lbeentry.st_sslstatus, &lsslstatus, sizeof(PgBackendSSLStatus));
+#endif
+#ifdef ENABLE_GSS
+ memcpy(lbeentry.st_gssstatus, &lgssstatus, sizeof(PgBackendGSSStatus));
+#endif
+
+ PGSTAT_END_WRITE_ACTIVITY(vbeentry);
+
+ /* Update app name to current GUC setting */
+ if (application_name)
+ pgstat_report_appname(application_name);
+}
+
+/*
+ * Clear out our entry in the PgBackendStatus array.
+ *
+ * Registered via on_shmem_exit() in pgstat_beinit(); runs during process
+ * exit, so it only marks the slot invalid rather than doing any heavier
+ * cleanup.
+ */
+static void
+pgstat_beshutdown_hook(int code, Datum arg)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+
+ /*
+ * Clear my status entry, following the protocol of bumping st_changecount
+ * before and after. We use a volatile pointer here to ensure the
+ * compiler doesn't try to get cute.
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+
+ beentry->st_procpid = 0; /* mark invalid */
+
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+
+ /* so that functions can check if backend_status.c is up via MyBEEntry */
+ MyBEEntry = NULL;
+}
+
+/*
+ * Discard any data collected in the current transaction. Any subsequent
+ * request will cause new snapshots to be read.
+ *
+ * This is also invoked during transaction commit or abort to discard the
+ * no-longer-wanted snapshot.
+ */
+void
+pgstat_clear_backend_activity_snapshot(void)
+{
+ /* Release memory, if any was allocated */
+ if (backendStatusSnapContext)
+ {
+ /* Deleting the context frees everything allocated in it at once */
+ MemoryContextDelete(backendStatusSnapContext);
+ backendStatusSnapContext = NULL;
+ }
+
+ /* Reset variables so the next read builds a fresh snapshot */
+ localBackendStatusTable = NULL;
+ localNumBackends = 0;
+}
+
+/*
+ * Create the memory context used for backend-status snapshots, if it
+ * doesn't already exist. Lives under TopMemoryContext so the snapshot
+ * survives until explicitly cleared.
+ */
+static void
+pgstat_setup_backend_status_context(void)
+{
+ if (!backendStatusSnapContext)
+ backendStatusSnapContext = AllocSetContextCreate(TopMemoryContext,
+ "Backend Status Snapshot",
+ ALLOCSET_SMALL_SIZES);
+}
+
+
+/* ----------
+ * pgstat_report_activity() -
+ *
+ * Called from tcop/postgres.c to report what the backend is actually doing
+ * (but note cmd_str can be NULL for certain cases).
+ *
+ * All updates of the status entry follow the protocol of bumping
+ * st_changecount before and after. We use a volatile pointer here to
+ * ensure the compiler doesn't try to get cute.
+ * ----------
+ */
+void
+pgstat_report_activity(BackendState state, const char *cmd_str)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+ TimestampTz start_timestamp;
+ TimestampTz current_timestamp;
+ int len = 0;
+
+ TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str);
+
+ if (!beentry)
+ return;
+
+ if (!pgstat_track_activities)
+ {
+ if (beentry->st_state != STATE_DISABLED)
+ {
+ volatile PGPROC *proc = MyProc;
+
+ /*
+ * track_activities is disabled, but we last reported a
+ * non-disabled state. As our final update, change the state and
+ * clear fields we will not be updating anymore.
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+ beentry->st_state = STATE_DISABLED;
+ beentry->st_state_start_timestamp = 0;
+ beentry->st_activity_raw[0] = '\0';
+ beentry->st_activity_start_timestamp = 0;
+ /* st_xact_start_timestamp and wait_event_info are also disabled */
+ beentry->st_xact_start_timestamp = 0;
+ beentry->st_query_id = UINT64CONST(0);
+ proc->wait_event_info = 0;
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+ }
+ return;
+ }
+
+ /*
+ * To minimize the time spent modifying the entry, and avoid risk of
+ * errors inside the critical section, fetch all the needed data first.
+ */
+ start_timestamp = GetCurrentStatementStartTimestamp();
+ if (cmd_str != NULL)
+ {
+ /*
+ * Compute length of to-be-stored string unaware of multi-byte
+ * characters. For speed reasons that'll get corrected on read, rather
+ * than computed every write.
+ */
+ len = Min(strlen(cmd_str), pgstat_track_activity_query_size - 1);
+ }
+ current_timestamp = GetCurrentTimestamp();
+
+ /*
+ * If the state has changed from "active" or "idle in transaction",
+ * calculate the duration and fold it into the session-level counters.
+ */
+ if ((beentry->st_state == STATE_RUNNING ||
+ beentry->st_state == STATE_FASTPATH ||
+ beentry->st_state == STATE_IDLEINTRANSACTION ||
+ beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED) &&
+ state != beentry->st_state)
+ {
+ long secs;
+ int usecs;
+
+ TimestampDifference(beentry->st_state_start_timestamp,
+ current_timestamp,
+ &secs, &usecs);
+
+ /* active states count as "active time", idle-in-txn as "idle time" */
+ if (beentry->st_state == STATE_RUNNING ||
+ beentry->st_state == STATE_FASTPATH)
+ pgstat_count_conn_active_time((PgStat_Counter) secs * 1000000 + usecs);
+ else
+ pgstat_count_conn_txn_idle_time((PgStat_Counter) secs * 1000000 + usecs);
+ }
+
+ /*
+ * Now update the status entry
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+
+ beentry->st_state = state;
+ beentry->st_state_start_timestamp = current_timestamp;
+
+ /*
+ * If a new query is started, we reset the query identifier as it'll only
+ * be known after parse analysis, to avoid reporting last query's
+ * identifier.
+ */
+ if (state == STATE_RUNNING)
+ beentry->st_query_id = UINT64CONST(0);
+
+ if (cmd_str != NULL)
+ {
+ /* len was clamped above, so the NUL terminator always fits */
+ memcpy((char *) beentry->st_activity_raw, cmd_str, len);
+ beentry->st_activity_raw[len] = '\0';
+ beentry->st_activity_start_timestamp = start_timestamp;
+ }
+
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+/* --------
+ * pgstat_report_query_id() -
+ *
+ * Called to update top-level query identifier.
+ *
+ * With force = false, a nonzero stored identifier is preserved (we assume
+ * we're inside a nested statement); force = true overwrites unconditionally.
+ * --------
+ */
+void
+pgstat_report_query_id(uint64 query_id, bool force)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+
+ /*
+ * if track_activities is disabled, st_query_id should already have been
+ * reset
+ */
+ if (!beentry || !pgstat_track_activities)
+ return;
+
+ /*
+ * We only report the top-level query identifiers. The stored query_id is
+ * reset when a backend calls pgstat_report_activity(STATE_RUNNING), or
+ * with an explicit call to this function using the force flag. If the
+ * saved query identifier is not zero it means that it's not a top-level
+ * command, so ignore the one provided unless it's an explicit call to
+ * reset the identifier.
+ */
+ if (beentry->st_query_id != 0 && !force)
+ return;
+
+ /*
+ * Update my status entry, following the protocol of bumping
+ * st_changecount before and after. We use a volatile pointer here to
+ * ensure the compiler doesn't try to get cute.
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+ beentry->st_query_id = query_id;
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+
+/* ----------
+ * pgstat_report_appname() -
+ *
+ * Called to update our application name.
+ *
+ * Note this runs even when track_activities is off; the name is always
+ * kept current in the shared entry.
+ * ----------
+ */
+void
+pgstat_report_appname(const char *appname)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+ int len;
+
+ if (!beentry)
+ return;
+
+ /*
+ * This should be unnecessary if GUC did its job, but be safe.
+ * pg_mbcliplen avoids truncating in the middle of a multibyte character.
+ */
+ len = pg_mbcliplen(appname, strlen(appname), NAMEDATALEN - 1);
+
+ /*
+ * Update my status entry, following the protocol of bumping
+ * st_changecount before and after. We use a volatile pointer here to
+ * ensure the compiler doesn't try to get cute.
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+
+ memcpy((char *) beentry->st_appname, appname, len);
+ beentry->st_appname[len] = '\0';
+
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+/*
+ * Report current transaction start timestamp as the specified value.
+ * Zero means there is no active transaction.
+ */
+void
+pgstat_report_xact_timestamp(TimestampTz tstamp)
+{
+ volatile PgBackendStatus *beentry = MyBEEntry;
+
+ if (!pgstat_track_activities || !beentry)
+ return;
+
+ /*
+ * Update my status entry, following the protocol of bumping
+ * st_changecount before and after. We use a volatile pointer here to
+ * ensure the compiler doesn't try to get cute.
+ */
+ PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
+
+ beentry->st_xact_start_timestamp = tstamp;
+
+ PGSTAT_END_WRITE_ACTIVITY(beentry);
+}
+
+/* ----------
+ * pgstat_read_current_status() -
+ *
+ * Copy the current contents of the PgBackendStatus array to local memory,
+ * if not already done in this transaction.
+ *
+ * On return, localBackendStatusTable / localNumBackends describe only the
+ * in-use slots; all local storage lives in backendStatusSnapContext.
+ * ----------
+ */
+static void
+pgstat_read_current_status(void)
+{
+ volatile PgBackendStatus *beentry;
+ LocalPgBackendStatus *localtable;
+ LocalPgBackendStatus *localentry;
+ char *localappname,
+ *localclienthostname,
+ *localactivity;
+#ifdef USE_SSL
+ PgBackendSSLStatus *localsslstatus;
+#endif
+#ifdef ENABLE_GSS
+ PgBackendGSSStatus *localgssstatus;
+#endif
+ int i;
+
+ if (localBackendStatusTable)
+ return; /* already done */
+
+ pgstat_setup_backend_status_context();
+
+ /*
+ * Allocate storage for local copy of state data. We can presume that
+ * none of these requests overflow size_t, because we already calculated
+ * the same values using mul_size during shmem setup. However, with
+ * probably-silly values of pgstat_track_activity_query_size and
+ * max_connections, the localactivity buffer could exceed 1GB, so use
+ * "huge" allocation for that one.
+ */
+ localtable = (LocalPgBackendStatus *)
+ MemoryContextAlloc(backendStatusSnapContext,
+ sizeof(LocalPgBackendStatus) * NumBackendStatSlots);
+ localappname = (char *)
+ MemoryContextAlloc(backendStatusSnapContext,
+ NAMEDATALEN * NumBackendStatSlots);
+ localclienthostname = (char *)
+ MemoryContextAlloc(backendStatusSnapContext,
+ NAMEDATALEN * NumBackendStatSlots);
+ localactivity = (char *)
+ MemoryContextAllocHuge(backendStatusSnapContext,
+ (Size) pgstat_track_activity_query_size *
+ (Size) NumBackendStatSlots);
+#ifdef USE_SSL
+ localsslstatus = (PgBackendSSLStatus *)
+ MemoryContextAlloc(backendStatusSnapContext,
+ sizeof(PgBackendSSLStatus) * NumBackendStatSlots);
+#endif
+#ifdef ENABLE_GSS
+ localgssstatus = (PgBackendGSSStatus *)
+ MemoryContextAlloc(backendStatusSnapContext,
+ sizeof(PgBackendGSSStatus) * NumBackendStatSlots);
+#endif
+
+ localNumBackends = 0;
+
+ /* i is 1-based so it can double as the BackendId for valid slots below */
+ beentry = BackendStatusArray;
+ localentry = localtable;
+ for (i = 1; i <= NumBackendStatSlots; i++)
+ {
+ /*
+ * Follow the protocol of retrying if st_changecount changes while we
+ * copy the entry, or if it's odd. (The check for odd is needed to
+ * cover the case where we are able to completely copy the entry while
+ * the source backend is between increment steps.) We use a volatile
+ * pointer here to ensure the compiler doesn't try to get cute.
+ */
+ for (;;)
+ {
+ int before_changecount;
+ int after_changecount;
+
+ pgstat_begin_read_activity(beentry, before_changecount);
+
+ localentry->backendStatus.st_procpid = beentry->st_procpid;
+ /* Skip all the data-copying work if entry is not in use */
+ if (localentry->backendStatus.st_procpid > 0)
+ {
+ memcpy(&localentry->backendStatus, unvolatize(PgBackendStatus *, beentry), sizeof(PgBackendStatus));
+
+ /*
+ * For each PgBackendStatus field that is a pointer, copy the
+ * pointed-to data, then adjust the local copy of the pointer
+ * field to point at the local copy of the data.
+ *
+ * strcpy is safe even if the string is modified concurrently,
+ * because there's always a \0 at the end of the buffer.
+ */
+ strcpy(localappname, (char *) beentry->st_appname);
+ localentry->backendStatus.st_appname = localappname;
+ strcpy(localclienthostname, (char *) beentry->st_clienthostname);
+ localentry->backendStatus.st_clienthostname = localclienthostname;
+ strcpy(localactivity, (char *) beentry->st_activity_raw);
+ localentry->backendStatus.st_activity_raw = localactivity;
+#ifdef USE_SSL
+ if (beentry->st_ssl)
+ {
+ memcpy(localsslstatus, beentry->st_sslstatus, sizeof(PgBackendSSLStatus));
+ localentry->backendStatus.st_sslstatus = localsslstatus;
+ }
+#endif
+#ifdef ENABLE_GSS
+ if (beentry->st_gss)
+ {
+ memcpy(localgssstatus, beentry->st_gssstatus, sizeof(PgBackendGSSStatus));
+ localentry->backendStatus.st_gssstatus = localgssstatus;
+ }
+#endif
+ }
+
+ pgstat_end_read_activity(beentry, after_changecount);
+
+ if (pgstat_read_activity_complete(before_changecount,
+ after_changecount))
+ break;
+
+ /* Make sure we can break out of loop if stuck... */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ beentry++;
+ /* Only valid entries get included into the local array */
+ if (localentry->backendStatus.st_procpid > 0)
+ {
+ /* i is the BackendId of the slot we just copied */
+ BackendIdGetTransactionIds(i,
+ &localentry->backend_xid,
+ &localentry->backend_xmin);
+
+ /* advance all local cursors only when the slot was kept */
+ localentry++;
+ localappname += NAMEDATALEN;
+ localclienthostname += NAMEDATALEN;
+ localactivity += pgstat_track_activity_query_size;
+#ifdef USE_SSL
+ localsslstatus++;
+#endif
+#ifdef ENABLE_GSS
+ localgssstatus++;
+#endif
+ localNumBackends++;
+ }
+ }
+
+ /* Set the pointer only after completion of a valid table */
+ localBackendStatusTable = localtable;
+}
+
+
+/* ----------
+ * pgstat_get_backend_current_activity() -
+ *
+ * Return a string representing the current activity of the backend with
+ * the specified PID. This looks directly at the BackendStatusArray,
+ * and so will provide current information regardless of the age of our
+ * transaction's snapshot of the status array.
+ *
+ * It is the caller's responsibility to invoke this only for backends whose
+ * state is expected to remain stable while the result is in use. The
+ * only current use is in deadlock reporting, where we can expect that
+ * the target backend is blocked on a lock. (There are corner cases
+ * where the target's wait could get aborted while we are looking at it,
+ * but the very worst consequence is to return a pointer to a string
+ * that's been changed, so we won't worry too much.)
+ *
+ * Note: return strings for special cases match pg_stat_get_backend_activity.
+ * ----------
+ */
+const char *
+pgstat_get_backend_current_activity(int pid, bool checkUser)
+{
+ PgBackendStatus *beentry;
+ int i;
+
+ /* Only regular backend slots are scanned, not auxiliary-process ones */
+ beentry = BackendStatusArray;
+ for (i = 1; i <= MaxBackends; i++)
+ {
+ /*
+ * Although we expect the target backend's entry to be stable, that
+ * doesn't imply that anyone else's is. To avoid identifying the
+ * wrong backend, while we check for a match to the desired PID we
+ * must follow the protocol of retrying if st_changecount changes
+ * while we examine the entry, or if it's odd. (This might be
+ * unnecessary, since fetching or storing an int is almost certainly
+ * atomic, but let's play it safe.) We use a volatile pointer here to
+ * ensure the compiler doesn't try to get cute.
+ */
+ volatile PgBackendStatus *vbeentry = beentry;
+ bool found;
+
+ for (;;)
+ {
+ int before_changecount;
+ int after_changecount;
+
+ pgstat_begin_read_activity(vbeentry, before_changecount);
+
+ found = (vbeentry->st_procpid == pid);
+
+ pgstat_end_read_activity(vbeentry, after_changecount);
+
+ if (pgstat_read_activity_complete(before_changecount,
+ after_changecount))
+ break;
+
+ /* Make sure we can break out of loop if stuck... */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (found)
+ {
+ /* Now it is safe to use the non-volatile pointer */
+ if (checkUser && !superuser() && beentry->st_userid != GetUserId())
+ return "<insufficient privilege>";
+ else if (*(beentry->st_activity_raw) == '\0')
+ return "<command string not enabled>";
+ else
+ {
+ /* this'll leak a bit of memory, but that seems acceptable */
+ return pgstat_clip_activity(beentry->st_activity_raw);
+ }
+ }
+
+ beentry++;
+ }
+
+ /* If we get here, caller is in error ... */
+ return "<backend information not available>";
+}
+
+/* ----------
+ * pgstat_get_crashed_backend_activity() -
+ *
+ * Return a string representing the current activity of the backend with
+ * the specified PID. Like the function above, but reads shared memory with
+ * the expectation that it may be corrupt. On success, copy the string
+ * into the "buffer" argument and return that pointer. On failure,
+ * return NULL.
+ *
+ * This function is only intended to be used by the postmaster to report the
+ * query that crashed a backend. In particular, no attempt is made to
+ * follow the correct concurrency protocol when accessing the
+ * BackendStatusArray. But that's OK, in the worst case we'll return a
+ * corrupted message. We also must take care not to trip on ereport(ERROR).
+ * ----------
+ */
+const char *
+pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen)
+{
+ volatile PgBackendStatus *beentry;
+ int i;
+
+ beentry = BackendStatusArray;
+
+ /*
+ * We probably shouldn't get here before shared memory has been set up,
+ * but be safe.
+ */
+ if (beentry == NULL || BackendActivityBuffer == NULL)
+ return NULL;
+
+ for (i = 1; i <= MaxBackends; i++)
+ {
+ if (beentry->st_procpid == pid)
+ {
+ /* Read pointer just once, so it can't change after validation */
+ const char *activity = beentry->st_activity_raw;
+ const char *activity_last;
+
+ /*
+ * We mustn't access activity string before we verify that it
+ * falls within the BackendActivityBuffer. To make sure that the
+ * entire string including its ending is contained within the
+ * buffer, subtract one activity length from the buffer size.
+ */
+ activity_last = BackendActivityBuffer + BackendActivityBufferSize
+ - pgstat_track_activity_query_size;
+
+ if (activity < BackendActivityBuffer ||
+ activity > activity_last)
+ return NULL;
+
+ /* If no string available, no point in a report */
+ if (activity[0] == '\0')
+ return NULL;
+
+ /*
+ * Copy only ASCII-safe characters so we don't run into encoding
+ * problems when reporting the message; and be sure not to run off
+ * the end of memory. As only ASCII characters are reported, it
+ * doesn't seem necessary to perform multibyte aware clipping.
+ */
+ ascii_safe_strlcpy(buffer, activity,
+ Min(buflen, pgstat_track_activity_query_size));
+
+ return buffer;
+ }
+
+ beentry++;
+ }
+
+ /* PID not found */
+ return NULL;
+}
+
+/* ----------
+ * pgstat_get_my_query_id() -
+ *
+ * Return current backend's query identifier.
+ */
+uint64
+pgstat_get_my_query_id(void)
+{
+ if (!MyBEEntry)
+ return 0;
+
+ /*
+ * There's no need for a lock around pgstat_begin_read_activity /
+ * pgstat_end_read_activity here as it's only called from
+ * pg_stat_get_activity which is already protected, or from the same
+ * backend which means that there won't be concurrent writes.
+ */
+ return MyBEEntry->st_query_id;
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_beentry() -
+ *
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * our local copy of the current-activity entry for one backend.
+ *
+ * NB: caller is responsible for a check if the user is permitted to see
+ * this info (especially the querystring).
+ * ----------
+ */
+PgBackendStatus *
+pgstat_fetch_stat_beentry(int beid)
+{
+ pgstat_read_current_status();
+
+ if (beid < 1 || beid > localNumBackends)
+ return NULL;
+
+ return &localBackendStatusTable[beid - 1].backendStatus;
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_local_beentry() -
+ *
+ * Like pgstat_fetch_stat_beentry() but with locally computed additions (like
+ * xid and xmin values of the backend)
+ *
+ * NB: caller is responsible for a check if the user is permitted to see
+ * this info (especially the querystring).
+ * ----------
+ */
+LocalPgBackendStatus *
+pgstat_fetch_stat_local_beentry(int beid)
+{
+ pgstat_read_current_status();
+
+ if (beid < 1 || beid > localNumBackends)
+ return NULL;
+
+ return &localBackendStatusTable[beid - 1];
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_numbackends() -
+ *
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * the maximum current backend id.
+ * ----------
+ */
+int
+pgstat_fetch_stat_numbackends(void)
+{
+ pgstat_read_current_status();
+
+ return localNumBackends;
+}
+
+/*
+ * Convert a potentially unsafely truncated activity string (see
+ * PgBackendStatus.st_activity_raw's documentation) into a correctly truncated
+ * one.
+ *
+ * The returned string is allocated in the caller's memory context and may be
+ * freed.
+ */
+char *
+pgstat_clip_activity(const char *raw_activity)
+{
+ char *activity;
+ int rawlen;
+ int cliplen;
+
+ /*
+ * Some callers, like pgstat_get_backend_current_activity(), do not
+ * guarantee that the buffer isn't concurrently modified. We try to take
+ * care that the buffer is always terminated by a NUL byte regardless, but
+ * let's still be paranoid about the string's length. In those cases the
+ * underlying buffer is guaranteed to be pgstat_track_activity_query_size
+ * large.
+ */
+ activity = pnstrdup(raw_activity, pgstat_track_activity_query_size - 1);
+
+ /* now double-guaranteed to be NUL terminated */
+ rawlen = strlen(activity);
+
+ /*
+ * All supported server-encodings make it possible to determine the length
+ * of a multi-byte character from its first byte (this is not the case for
+ * client encodings, see GB18030). As st_activity is always stored using
+ * server encoding, this allows us to perform multi-byte aware truncation,
+ * even if the string earlier was truncated in the middle of a multi-byte
+ * character.
+ */
+ cliplen = pg_mbcliplen(activity, rawlen,
+ pgstat_track_activity_query_size - 1);
+
+ activity[cliplen] = '\0';
+
+ return activity;
+}
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
new file mode 100644
index 0000000..84d65a7
--- /dev/null
+++ b/src/backend/utils/activity/pgstat.c
@@ -0,0 +1,1678 @@
+/* ----------
+ * pgstat.c
+ * Infrastructure for the cumulative statistics system.
+ *
+ * The cumulative statistics system accumulates statistics for different kinds
+ * of objects. Some kinds of statistics are collected for a fixed number of
+ * objects (most commonly 1), e.g., checkpointer statistics. Other kinds of
+ * statistics are collected for a varying number of objects
+ * (e.g. relations). See PgStat_KindInfo for a list of currently handled
+ * statistics.
+ *
+ * Statistics are loaded from the filesystem during startup (by the startup
+ * process), unless preceded by a crash, in which case all stats are
+ * discarded. They are written out by the checkpointer process just before
+ * shutting down, except when shutting down in immediate mode.
+ *
+ * Fixed-numbered stats are stored in plain (non-dynamic) shared memory.
+ *
+ * Statistics for variable-numbered objects are stored in dynamic shared
+ * memory and can be found via a dshash hashtable. The statistics counters are
+ * not part of the dshash entry (PgStatShared_HashEntry) directly, but are
+ * separately allocated (PgStatShared_HashEntry->body). The separate
+ * allocation allows different kinds of statistics to be stored in the same
+ * hashtable without wasting space in PgStatShared_HashEntry.
+ *
+ * Variable-numbered stats are addressed by PgStat_HashKey while running. It
+ * is not possible to have statistics for an object that cannot be addressed
+ * that way at runtime. A wider identifier can be used when serializing to
+ * disk (used for replication slot stats).
+ *
+ * To avoid contention on the shared hashtable, each backend has a
+ * backend-local hashtable (pgStatEntryRefHash) in front of the shared
+ * hashtable, containing references (PgStat_EntryRef) to shared hashtable
+ * entries. The shared hashtable only needs to be accessed when no prior
+ * reference is found in the local hashtable. Besides pointing to the
+ * shared hashtable entry (PgStatShared_HashEntry) PgStat_EntryRef also
+ * contains a pointer to the shared statistics data, as a process-local
+ * address, to reduce access costs.
+ *
+ * The names for structs stored in shared memory are prefixed with
+ * PgStatShared instead of PgStat. Each stats entry in shared memory is
+ * protected by a dedicated lwlock.
+ *
+ * Most stats updates are first accumulated locally in each process as pending
+ * entries, then later flushed to shared memory (just after commit, or by
+ * idle-timeout). This practically eliminates contention on individual stats
+ * entries. For most kinds of variable-numbered pending stats data is stored
+ * in PgStat_EntryRef->pending. All entries with pending data are in the
+ * pgStatPending list. Pending statistics updates are flushed out by
+ * pgstat_report_stat().
+ *
+ * The behavior of different kinds of statistics is determined by the kind's
+ * entry in pgstat_kind_infos, see PgStat_KindInfo for details.
+ *
+ * The consistency of read accesses to statistics can be configured using the
+ * stats_fetch_consistency GUC (see config.sgml and monitoring.sgml for the
+ * settings). When using PGSTAT_FETCH_CONSISTENCY_CACHE or
+ * PGSTAT_FETCH_CONSISTENCY_SNAPSHOT statistics are stored in
+ * pgStatLocal.snapshot.
+ *
+ * To keep things manageable, stats handling is split across several
+ * files. Infrastructure pieces are in:
+ * - pgstat.c - this file, to tie it all together
+ * - pgstat_shmem.c - nearly everything dealing with shared memory, including
+ * the maintenance of hashtable entries
+ * - pgstat_xact.c - transactional integration, including the transactional
+ * creation and dropping of stats entries
+ *
+ * Each statistics kind is handled in a dedicated file:
+ * - pgstat_archiver.c
+ * - pgstat_bgwriter.c
+ * - pgstat_checkpointer.c
+ * - pgstat_database.c
+ * - pgstat_function.c
+ * - pgstat_relation.c
+ * - pgstat_replslot.c
+ * - pgstat_slru.c
+ * - pgstat_subscription.c
+ * - pgstat_wal.c
+ *
+ * Whenever possible infrastructure files should not contain code related to
+ * specific kinds of stats.
+ *
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat.c
+ * ----------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "access/xact.h"
+#include "lib/dshash.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "storage/shmem.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/pgstat_internal.h"
+#include "utils/timestamp.h"
+
+
+/* ----------
+ * Timer definitions.
+ *
+ * In milliseconds.
+ * ----------
+ */
+
+/* minimum interval non-forced stats flushes.*/
+#define PGSTAT_MIN_INTERVAL 1000
+/* how long until to block flushing pending stats updates */
+#define PGSTAT_MAX_INTERVAL 60000
+/* when to call pgstat_report_stat() again, even when idle */
+#define PGSTAT_IDLE_INTERVAL 10000
+
+/* ----------
+ * Initial size hints for the hash tables used in statistics.
+ * ----------
+ */
+
+#define PGSTAT_SNAPSHOT_HASH_SIZE 512
+
+
+/* hash table entry for statistics snapshots (see pgstat_snapshot simplehash) */
+typedef struct PgStat_SnapshotEntry
+{
+ PgStat_HashKey key;
+ char status; /* for simplehash use */
+ void *data; /* the stats data itself; allocated in
+ * pgStatLocal.snapshot.context (see
+ * pgstat_fetch_entry), may be NULL */
+} PgStat_SnapshotEntry;
+
+
+/* ----------
+ * Backend-local Hash Table Definitions
+ * ----------
+ */
+
+/* for stats snapshot entries */
+#define SH_PREFIX pgstat_snapshot
+#define SH_ELEMENT_TYPE PgStat_SnapshotEntry
+#define SH_KEY_TYPE PgStat_HashKey
+#define SH_KEY key
+#define SH_HASH_KEY(tb, key) \
+ pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL)
+#define SH_EQUAL(tb, a, b) \
+ pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0
+#define SH_SCOPE static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+
+/* ----------
+ * Local function forward declarations
+ * ----------
+ */
+
+static void pgstat_write_statsfile(void);
+static void pgstat_read_statsfile(void);
+
+static void pgstat_reset_after_failure(void);
+
+static bool pgstat_flush_pending_entries(bool nowait);
+
+static void pgstat_prep_snapshot(void);
+static void pgstat_build_snapshot(void);
+static void pgstat_build_snapshot_fixed(PgStat_Kind kind);
+
+static inline bool pgstat_is_kind_valid(int ikind);
+
+
+/* ----------
+ * GUC parameters
+ * ----------
+ */
+
+bool pgstat_track_counts = false;
+int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE;
+
+
+/* ----------
+ * state shared with pgstat_*.c
+ * ----------
+ */
+
+PgStat_LocalState pgStatLocal;
+
+
+/* ----------
+ * Local data
+ *
+ * NB: There should be only variables related to stats infrastructure here,
+ * not for specific kinds of stats.
+ * ----------
+ */
+
+/*
+ * Memory contexts containing the pgStatEntryRefHash table, the
+ * pgStatSharedRef entries, and pending data respectively. Mostly to make it
+ * easier to track / attribute memory usage.
+ */
+
+static MemoryContext pgStatPendingContext = NULL;
+
+/*
+ * Backend local list of PgStat_EntryRef with unflushed pending stats.
+ *
+ * Newly pending entries should only ever be added to the end of the list,
+ * otherwise pgstat_flush_pending_entries() might not see them immediately.
+ */
+static dlist_head pgStatPending = DLIST_STATIC_INIT(pgStatPending);
+
+
+/*
+ * Force the next stats flush to happen regardless of
+ * PGSTAT_MIN_INTERVAL. Useful in test scripts.
+ */
+static bool pgStatForceNextFlush = false;
+
+/*
+ * Force-clear existing snapshot before next use when stats_fetch_consistency
+ * is changed.
+ */
+static bool force_stats_snapshot_clear = false;
+
+
+/*
+ * For assertions that check pgstat is not used before initialization / after
+ * shutdown.
+ */
+#ifdef USE_ASSERT_CHECKING
+static bool pgstat_is_initialized = false;
+static bool pgstat_is_shutdown = false;
+#endif
+
+
+/*
+ * The different kinds of statistics.
+ *
+ * If reasonably possible, handling specific to one kind of stats should go
+ * through this abstraction, rather than making more of pgstat.c aware.
+ *
+ * See comments for struct PgStat_KindInfo for details about the individual
+ * fields.
+ *
+ * XXX: It'd be nicer to define this outside of this file. But there doesn't
+ * seem to be a great way of doing that, given the split across multiple
+ * files.
+ */
+static const PgStat_KindInfo pgstat_kind_infos[PGSTAT_NUM_KINDS] = {
+
+ /* stats kinds for variable-numbered objects */
+
+ [PGSTAT_KIND_DATABASE] = {
+ .name = "database",
+
+ .fixed_amount = false,
+ /* so pg_stat_database entries can be seen in all databases */
+ .accessed_across_databases = true,
+
+ .shared_size = sizeof(PgStatShared_Database),
+ .shared_data_off = offsetof(PgStatShared_Database, stats),
+ .shared_data_len = sizeof(((PgStatShared_Database *) 0)->stats),
+ .pending_size = sizeof(PgStat_StatDBEntry),
+
+ .flush_pending_cb = pgstat_database_flush_cb,
+ .reset_timestamp_cb = pgstat_database_reset_timestamp_cb,
+ },
+
+ [PGSTAT_KIND_RELATION] = {
+ .name = "relation",
+
+ .fixed_amount = false,
+
+ .shared_size = sizeof(PgStatShared_Relation),
+ .shared_data_off = offsetof(PgStatShared_Relation, stats),
+ .shared_data_len = sizeof(((PgStatShared_Relation *) 0)->stats),
+ .pending_size = sizeof(PgStat_TableStatus),
+
+ .flush_pending_cb = pgstat_relation_flush_cb,
+ .delete_pending_cb = pgstat_relation_delete_pending_cb,
+ },
+
+ [PGSTAT_KIND_FUNCTION] = {
+ .name = "function",
+
+ .fixed_amount = false,
+
+ .shared_size = sizeof(PgStatShared_Function),
+ .shared_data_off = offsetof(PgStatShared_Function, stats),
+ .shared_data_len = sizeof(((PgStatShared_Function *) 0)->stats),
+ .pending_size = sizeof(PgStat_BackendFunctionEntry),
+
+ .flush_pending_cb = pgstat_function_flush_cb,
+ },
+
+ [PGSTAT_KIND_REPLSLOT] = {
+ .name = "replslot",
+
+ .fixed_amount = false,
+
+ .accessed_across_databases = true,
+ /* slot stats are identified by slot name when serialized to disk */
+ .named_on_disk = true,
+
+ .shared_size = sizeof(PgStatShared_ReplSlot),
+ .shared_data_off = offsetof(PgStatShared_ReplSlot, stats),
+ .shared_data_len = sizeof(((PgStatShared_ReplSlot *) 0)->stats),
+
+ .reset_timestamp_cb = pgstat_replslot_reset_timestamp_cb,
+ .to_serialized_name = pgstat_replslot_to_serialized_name_cb,
+ .from_serialized_name = pgstat_replslot_from_serialized_name_cb,
+ },
+
+ [PGSTAT_KIND_SUBSCRIPTION] = {
+ .name = "subscription",
+
+ .fixed_amount = false,
+ /* so pg_stat_subscription_stats entries can be seen in all databases */
+ .accessed_across_databases = true,
+
+ .shared_size = sizeof(PgStatShared_Subscription),
+ .shared_data_off = offsetof(PgStatShared_Subscription, stats),
+ .shared_data_len = sizeof(((PgStatShared_Subscription *) 0)->stats),
+ .pending_size = sizeof(PgStat_BackendSubEntry),
+
+ .flush_pending_cb = pgstat_subscription_flush_cb,
+ .reset_timestamp_cb = pgstat_subscription_reset_timestamp_cb,
+ },
+
+
+ /* stats for fixed-numbered (mostly 1) objects */
+ /* (no pending data, so only reset_all / snapshot callbacks are set) */
+
+ [PGSTAT_KIND_ARCHIVER] = {
+ .name = "archiver",
+
+ .fixed_amount = true,
+
+ .reset_all_cb = pgstat_archiver_reset_all_cb,
+ .snapshot_cb = pgstat_archiver_snapshot_cb,
+ },
+
+ [PGSTAT_KIND_BGWRITER] = {
+ .name = "bgwriter",
+
+ .fixed_amount = true,
+
+ .reset_all_cb = pgstat_bgwriter_reset_all_cb,
+ .snapshot_cb = pgstat_bgwriter_snapshot_cb,
+ },
+
+ [PGSTAT_KIND_CHECKPOINTER] = {
+ .name = "checkpointer",
+
+ .fixed_amount = true,
+
+ .reset_all_cb = pgstat_checkpointer_reset_all_cb,
+ .snapshot_cb = pgstat_checkpointer_snapshot_cb,
+ },
+
+ [PGSTAT_KIND_SLRU] = {
+ .name = "slru",
+
+ .fixed_amount = true,
+
+ .reset_all_cb = pgstat_slru_reset_all_cb,
+ .snapshot_cb = pgstat_slru_snapshot_cb,
+ },
+
+ [PGSTAT_KIND_WAL] = {
+ .name = "wal",
+
+ .fixed_amount = true,
+
+ .reset_all_cb = pgstat_wal_reset_all_cb,
+ .snapshot_cb = pgstat_wal_snapshot_cb,
+ },
+};
+
+
+/* ------------------------------------------------------------
+ * Functions managing the state of the stats system for all backends.
+ * ------------------------------------------------------------
+ */
+
+/*
+ * Read on-disk stats into memory at server start.
+ *
+ * Should only be called by the startup process or in single user mode.
+ */
+void
+pgstat_restore_stats(void)
+{
+ pgstat_read_statsfile();
+}
+
+/*
+ * Remove the stats file. This is currently used only if WAL recovery is
+ * needed after a crash.
+ *
+ * Should only be called by the startup process or in single user mode.
+ */
+void
+pgstat_discard_stats(void)
+{
+ int ret;
+
+ /* NB: this needs to be done even in single user mode */
+
+ ret = unlink(PGSTAT_STAT_PERMANENT_FILENAME);
+ if (ret != 0)
+ {
+ if (errno == ENOENT)
+ elog(DEBUG2,
+ "didn't need to unlink permanent stats file \"%s\" - didn't exist",
+ PGSTAT_STAT_PERMANENT_FILENAME);
+ else
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not unlink permanent statistics file \"%s\": %m",
+ PGSTAT_STAT_PERMANENT_FILENAME)));
+ }
+ else
+ {
+ ereport(DEBUG2,
+ (errcode_for_file_access(),
+ errmsg_internal("unlinked permanent statistics file \"%s\"",
+ PGSTAT_STAT_PERMANENT_FILENAME)));
+ }
+
+ /*
+ * Reset stats contents. This will set reset timestamps of fixed-numbered
+ * stats to the current time (no variable stats exist).
+ */
+ pgstat_reset_after_failure();
+}
+
+/*
+ * pgstat_before_server_shutdown() needs to be called by exactly one process
+ * during regular server shutdowns. Otherwise all stats will be lost.
+ *
+ * We currently only write out stats for proc_exit(0). We might want to change
+ * that at some point... But right now pgstat_discard_stats() would be called
+ * during the start after a disorderly shutdown, anyway.
+ */
+void
+pgstat_before_server_shutdown(int code, Datum arg)
+{
+ Assert(pgStatLocal.shmem != NULL);
+ Assert(!pgStatLocal.shmem->is_shutdown);
+
+ /*
+ * Stats should only be reported after pgstat_initialize() and before
+ * pgstat_shutdown(). This is a convenient point to catch most violations
+ * of this rule.
+ */
+ Assert(pgstat_is_initialized && !pgstat_is_shutdown);
+
+ /* flush out our own pending changes before writing out */
+ pgstat_report_stat(true);
+
+ /*
+ * Only write out file during normal shutdown. Don't even signal that
+ * we've shutdown during irregular shutdowns, because the shutdown
+ * sequence isn't coordinated to ensure this backend shuts down last.
+ */
+ if (code == 0)
+ {
+ pgStatLocal.shmem->is_shutdown = true;
+ pgstat_write_statsfile();
+ }
+}
+
+
+/* ------------------------------------------------------------
+ * Backend initialization / shutdown functions
+ * ------------------------------------------------------------
+ */
+
+/*
+ * Shut down a single backend's statistics reporting at process exit.
+ *
+ * Flush out any remaining statistics counts. Without this, operations
+ * triggered during backend exit (such as temp table deletions) won't be
+ * counted.
+ */
+static void
+pgstat_shutdown_hook(int code, Datum arg)
+{
+ Assert(!pgstat_is_shutdown);
+ Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
+
+ /*
+ * If we got as far as discovering our own database ID, we can flush out
+ * what we did so far. Otherwise, we'd be reporting an invalid database
+ * ID, so forget it. (This means that accesses to pg_database during
+ * failed backend starts might never get counted.)
+ */
+ if (OidIsValid(MyDatabaseId))
+ pgstat_report_disconnect(MyDatabaseId);
+
+ pgstat_report_stat(true);
+
+ /* there shouldn't be any pending changes left */
+ Assert(dlist_is_empty(&pgStatPending));
+ dlist_init(&pgStatPending);
+
+ pgstat_detach_shmem();
+
+#ifdef USE_ASSERT_CHECKING
+ pgstat_is_shutdown = true;
+#endif
+}
+
+/*
+ * Initialize pgstats state, and set up our on-proc-exit hook. Called from
+ * BaseInit().
+ *
+ * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful.
+ */
+void
+pgstat_initialize(void)
+{
+ Assert(!pgstat_is_initialized);
+
+ pgstat_attach_shmem();
+
+ pgstat_init_wal();
+
+ /* Set up a process-exit hook to clean up */
+ before_shmem_exit(pgstat_shutdown_hook, 0);
+
+#ifdef USE_ASSERT_CHECKING
+ pgstat_is_initialized = true;
+#endif
+}
+
+
+/* ------------------------------------------------------------
+ * Public functions used by backends follow
+ * ------------------------------------------------------------
+ */
+
+/*
+ * Must be called by processes that performs DML: tcop/postgres.c, logical
+ * receiver processes, SPI worker, etc. to flush pending statistics updates to
+ * shared memory.
+ *
+ * Unless called with 'force', pending stats updates are flushed happen once
+ * per PGSTAT_MIN_INTERVAL (1000ms). When not forced, stats flushes do not
+ * block on lock acquisition, except if stats updates have been pending for
+ * longer than PGSTAT_MAX_INTERVAL (60000ms).
+ *
+ * Whenever pending stats updates remain at the end of pgstat_report_stat() a
+ * suggested idle timeout is returned. Currently this is always
+ * PGSTAT_IDLE_INTERVAL (10000ms). Callers can use the returned time to set up
+ * a timeout after which to call pgstat_report_stat(true), but are not
+ * required to to do so.
+ *
+ * Note that this is called only when not within a transaction, so it is fair
+ * to use transaction stop time as an approximation of current time.
+ */
+long
+pgstat_report_stat(bool force)
+{
+ static TimestampTz pending_since = 0;
+ static TimestampTz last_flush = 0;
+ bool partial_flush;
+ TimestampTz now;
+ bool nowait;
+
+ pgstat_assert_is_up();
+ Assert(!IsTransactionOrTransactionBlock());
+
+ /* "absorb" the forced flush even if there's nothing to flush */
+ if (pgStatForceNextFlush)
+ {
+ force = true;
+ pgStatForceNextFlush = false;
+ }
+
+ /* Don't expend a clock check if nothing to do */
+ if (dlist_is_empty(&pgStatPending) &&
+ !have_slrustats &&
+ !pgstat_have_pending_wal())
+ {
+ Assert(pending_since == 0);
+ return 0;
+ }
+
+ /*
+ * There should never be stats to report once stats are shut down. Can't
+ * assert that before the checks above, as there is an unconditional
+ * pgstat_report_stat() call in pgstat_shutdown_hook() - which at least
+ * the process that ran pgstat_before_server_shutdown() will still call.
+ */
+ Assert(!pgStatLocal.shmem->is_shutdown);
+
+ now = GetCurrentTransactionStopTimestamp();
+
+ if (!force)
+ {
+ if (pending_since > 0 &&
+ TimestampDifferenceExceeds(pending_since, now, PGSTAT_MAX_INTERVAL))
+ {
+ /* don't keep pending updates longer than PGSTAT_MAX_INTERVAL */
+ force = true;
+ }
+ else if (last_flush > 0 &&
+ !TimestampDifferenceExceeds(last_flush, now, PGSTAT_MIN_INTERVAL))
+ {
+ /* don't flush too frequently */
+ if (pending_since == 0)
+ pending_since = now;
+
+ return PGSTAT_IDLE_INTERVAL;
+ }
+ }
+
+ pgstat_update_dbstats(now);
+
+ /* don't wait for lock acquisition when !force */
+ nowait = !force;
+
+ partial_flush = false;
+
+ /* flush database / relation / function / ... stats */
+ partial_flush |= pgstat_flush_pending_entries(nowait);
+
+ /* flush wal stats */
+ partial_flush |= pgstat_flush_wal(nowait);
+
+ /* flush SLRU stats */
+ partial_flush |= pgstat_slru_flush(nowait);
+
+ last_flush = now;
+
+ /*
+ * If some of the pending stats could not be flushed due to lock
+ * contention, let the caller know when to retry.
+ */
+ if (partial_flush)
+ {
+ /* force should have prevented us from getting here */
+ Assert(!force);
+
+ /* remember since when stats have been pending */
+ if (pending_since == 0)
+ pending_since = now;
+
+ return PGSTAT_IDLE_INTERVAL;
+ }
+
+ pending_since = 0;
+
+ return 0;
+}
+
+/*
+ * Force locally pending stats to be flushed during the next
+ * pgstat_report_stat() call. This is useful for writing tests.
+ */
+void
+pgstat_force_next_flush(void)
+{
+ pgStatForceNextFlush = true;
+}
+
+/*
+ * Only for use by pgstat_reset_counters()
+ */
+static bool
+match_db_entries(PgStatShared_HashEntry *entry, Datum match_data)
+{
+ return entry->key.dboid == DatumGetObjectId(MyDatabaseId);
+}
+
+/*
+ * Reset counters for our database.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+void
+pgstat_reset_counters(void)
+{
+ TimestampTz ts = GetCurrentTimestamp();
+
+ pgstat_reset_matching_entries(match_db_entries,
+ ObjectIdGetDatum(MyDatabaseId),
+ ts);
+}
+
+/*
+ * Reset a single variable-numbered entry.
+ *
+ * If the stats kind is within a database, also reset the database's
+ * stat_reset_timestamp.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+void
+pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objoid)
+{
+ const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+ TimestampTz ts = GetCurrentTimestamp();
+
+ /* not needed atm, and doesn't make sense with the current signature */
+ Assert(!pgstat_get_kind_info(kind)->fixed_amount);
+
+ /* reset the "single counter" */
+ pgstat_reset_entry(kind, dboid, objoid, ts);
+
+ if (!kind_info->accessed_across_databases)
+ pgstat_reset_database_timestamp(dboid, ts);
+}
+
+/*
+ * Reset stats for all entries of a kind.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+void
+pgstat_reset_of_kind(PgStat_Kind kind)
+{
+ const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+ TimestampTz ts = GetCurrentTimestamp();
+
+ if (kind_info->fixed_amount)
+ kind_info->reset_all_cb(ts);
+ else
+ pgstat_reset_entries_of_kind(kind, ts);
+}
+
+
+/* ------------------------------------------------------------
+ * Fetching of stats
+ * ------------------------------------------------------------
+ */
+
+/*
+ * Discard any data collected in the current transaction. Any subsequent
+ * request will cause new snapshots to be read.
+ *
+ * This is also invoked during transaction commit or abort to discard
+ * the no-longer-wanted snapshot. Updates of stats_fetch_consistency can
+ * cause this routine to be called.
+ */
+void
+pgstat_clear_snapshot(void)
+{
+ pgstat_assert_is_up();
+
+ memset(&pgStatLocal.snapshot.fixed_valid, 0,
+ sizeof(pgStatLocal.snapshot.fixed_valid));
+ pgStatLocal.snapshot.stats = NULL;
+ pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_NONE;
+
+ /* Release memory, if any was allocated */
+ if (pgStatLocal.snapshot.context)
+ {
+ MemoryContextDelete(pgStatLocal.snapshot.context);
+
+ /* Reset variables */
+ pgStatLocal.snapshot.context = NULL;
+ }
+
+ /*
+ * Historically the backend_status.c facilities lived in this file, and
+ * were reset with the same function. For now keep it that way, and
+ * forward the reset request.
+ */
+ pgstat_clear_backend_activity_snapshot();
+
+ /* Reset this flag, as it may be possible that a cleanup was forced. */
+ force_stats_snapshot_clear = false;
+}
+
+/*
+ * Fetch a pointer to the statistics data for one variable-numbered entry,
+ * identified by (kind, dboid, objoid), or NULL if no such entry exists or
+ * it has been dropped.
+ *
+ * Behavior depends on stats_fetch_consistency: with _NONE the data is
+ * copied fresh from shared memory into the caller's memory context on
+ * every call; with _CACHE results are memoized in pgStatLocal.snapshot;
+ * with _SNAPSHOT a full snapshot is built first and all lookups are
+ * served from it.
+ */
+void *
+pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
+{
+ PgStat_HashKey key;
+ PgStat_EntryRef *entry_ref;
+ void *stats_data;
+ const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+
+ /* should be called from backends */
+ Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
+ AssertArg(!kind_info->fixed_amount);
+
+ pgstat_prep_snapshot();
+
+ key.kind = kind;
+ key.dboid = dboid;
+ key.objoid = objoid;
+
+ /* if we need to build a full snapshot, do so */
+ if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT)
+ pgstat_build_snapshot();
+
+ /* if caching is desired, look up in cache */
+ if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE)
+ {
+ PgStat_SnapshotEntry *entry = NULL;
+
+ entry = pgstat_snapshot_lookup(pgStatLocal.snapshot.stats, key);
+
+ if (entry)
+ return entry->data;
+
+ /*
+ * If we built a full snapshot and the key is not in
+ * pgStatLocal.snapshot.stats, there are no matching stats.
+ */
+ if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT)
+ return NULL;
+ }
+
+ pgStatLocal.snapshot.mode = pgstat_fetch_consistency;
+
+ entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL);
+
+ if (entry_ref == NULL || entry_ref->shared_entry->dropped)
+ {
+ /* create empty entry when using PGSTAT_FETCH_CONSISTENCY_CACHE */
+ if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE)
+ {
+ PgStat_SnapshotEntry *entry = NULL;
+ bool found;
+
+ /* negative-cache the miss so repeated lookups stay cheap */
+ entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found);
+ Assert(!found);
+ entry->data = NULL;
+ }
+ return NULL;
+ }
+
+ /*
+ * Allocate in caller's context for PGSTAT_FETCH_CONSISTENCY_NONE,
+ * otherwise we could quickly end up with a fair bit of memory used due to
+ * repeated accesses.
+ */
+ if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE)
+ stats_data = palloc(kind_info->shared_data_len);
+ else
+ stats_data = MemoryContextAlloc(pgStatLocal.snapshot.context,
+ kind_info->shared_data_len);
+
+ /* copy the stats out under the entry's shared lock */
+ pgstat_lock_entry_shared(entry_ref, false);
+ memcpy(stats_data,
+ pgstat_get_entry_data(kind, entry_ref->shared_stats),
+ kind_info->shared_data_len);
+ pgstat_unlock_entry(entry_ref);
+
+ if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE)
+ {
+ PgStat_SnapshotEntry *entry = NULL;
+ bool found;
+
+ entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found);
+ entry->data = stats_data;
+ }
+
+ return stats_data;
+}
+
+/*
+ * If a stats snapshot has been taken, return the timestamp at which that was
+ * done, and set *have_snapshot to true. Otherwise *have_snapshot is set to
+ * false.
+ */
+TimestampTz
+pgstat_get_stat_snapshot_timestamp(bool *have_snapshot)
+{
+ if (force_stats_snapshot_clear)
+ pgstat_clear_snapshot();
+
+ if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT)
+ {
+ *have_snapshot = true;
+ return pgStatLocal.snapshot.snapshot_timestamp;
+ }
+
+ *have_snapshot = false;
+
+ return 0;
+}
+
+bool
+pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
+{
+ /* fixed-numbered stats always exist */
+ if (pgstat_get_kind_info(kind)->fixed_amount)
+ return true;
+
+ return pgstat_get_entry_ref(kind, dboid, objoid, false, NULL) != NULL;
+}
+
+/*
+ * Ensure snapshot for fixed-numbered 'kind' exists.
+ *
+ * Typically used by the pgstat_fetch_* functions for a kind of stats, before
+ * massaging the data into the desired format.
+ */
+void
+pgstat_snapshot_fixed(PgStat_Kind kind)
+{
+ AssertArg(pgstat_is_kind_valid(kind));
+ AssertArg(pgstat_get_kind_info(kind)->fixed_amount);
+
+ if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT)
+ pgstat_build_snapshot();
+ else
+ pgstat_build_snapshot_fixed(kind);
+
+ Assert(pgStatLocal.snapshot.fixed_valid[kind]);
+}
+
+static void
+pgstat_prep_snapshot(void)
+{
+ if (force_stats_snapshot_clear)
+ pgstat_clear_snapshot();
+
+ if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE ||
+ pgStatLocal.snapshot.stats != NULL)
+ return;
+
+ if (!pgStatLocal.snapshot.context)
+ pgStatLocal.snapshot.context = AllocSetContextCreate(TopMemoryContext,
+ "PgStat Snapshot",
+ ALLOCSET_SMALL_SIZES);
+
+ pgStatLocal.snapshot.stats =
+ pgstat_snapshot_create(pgStatLocal.snapshot.context,
+ PGSTAT_SNAPSHOT_HASH_SIZE,
+ NULL);
+}
+
/*
 * Build a full snapshot of all stats: every variable-numbered entry visible
 * to this backend plus all fixed-numbered kinds.  Subsequent fetches during
 * the snapshot's lifetime then see a consistent view.
 */
static void
pgstat_build_snapshot(void)
{
	dshash_seq_status hstat;
	PgStatShared_HashEntry *p;

	/* should only be called when we need a snapshot */
	Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT);

	/* snapshot already built */
	if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT)
		return;

	pgstat_prep_snapshot();

	Assert(pgStatLocal.snapshot.stats->members == 0);

	pgStatLocal.snapshot.snapshot_timestamp = GetCurrentTimestamp();

	/*
	 * Snapshot all variable stats.
	 */
	dshash_seq_init(&hstat, pgStatLocal.shared_hash, false);
	while ((p = dshash_seq_next(&hstat)) != NULL)
	{
		PgStat_Kind kind = p->key.kind;
		const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
		bool		found;
		PgStat_SnapshotEntry *entry;
		PgStatShared_Common *stats_data;

		/*
		 * Check if the stats object should be included in the snapshot.
		 * Unless the stats kind can be accessed from all databases (e.g.,
		 * database stats themselves), we only include stats for the current
		 * database or objects not associated with a database (e.g. shared
		 * relations).
		 */
		if (p->key.dboid != MyDatabaseId &&
			p->key.dboid != InvalidOid &&
			!kind_info->accessed_across_databases)
			continue;

		if (p->dropped)
			continue;

		Assert(pg_atomic_read_u32(&p->refcount) > 0);

		stats_data = dsa_get_address(pgStatLocal.dsa, p->body);
		Assert(stats_data);

		entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, p->key, &found);
		Assert(!found);

		/* copy the stats body into snapshot-context memory */
		entry->data = MemoryContextAlloc(pgStatLocal.snapshot.context,
										 kind_info->shared_size);

		/*
		 * Acquire the LWLock directly instead of using
		 * pg_stat_lock_entry_shared() which requires a reference.
		 */
		LWLockAcquire(&stats_data->lock, LW_SHARED);
		memcpy(entry->data,
			   pgstat_get_entry_data(kind, stats_data),
			   kind_info->shared_size);
		LWLockRelease(&stats_data->lock);
	}
	dshash_seq_term(&hstat);

	/*
	 * Build snapshot of all fixed-numbered stats.
	 */
	for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++)
	{
		const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);

		if (!kind_info->fixed_amount)
		{
			Assert(kind_info->snapshot_cb == NULL);
			continue;
		}

		pgstat_build_snapshot_fixed(kind);
	}

	/* mark the snapshot as complete only after everything is copied */
	pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_SNAPSHOT;
}
+
/*
 * Ensure the snapshot of fixed-numbered stats for 'kind' is valid, invoking
 * the kind's snapshot callback when (re)building is required.
 */
static void
pgstat_build_snapshot_fixed(PgStat_Kind kind)
{
	const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);

	Assert(kind_info->fixed_amount);
	Assert(kind_info->snapshot_cb != NULL);

	if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE)
	{
		/* rebuild every time */
		pgStatLocal.snapshot.fixed_valid[kind] = false;
	}
	else if (pgStatLocal.snapshot.fixed_valid[kind])
	{
		/* in snapshot mode we shouldn't get called again */
		Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE);
		return;
	}

	Assert(!pgStatLocal.snapshot.fixed_valid[kind]);

	kind_info->snapshot_cb();

	/* the callback fills the snapshot but must not mark it valid itself */
	Assert(!pgStatLocal.snapshot.fixed_valid[kind]);
	pgStatLocal.snapshot.fixed_valid[kind] = true;
}
+
+
+/* ------------------------------------------------------------
+ * Backend-local pending stats infrastructure
+ * ------------------------------------------------------------
+ */
+
+/*
+ * Returns the appropriate PgStat_EntryRef, preparing it to receive pending
+ * stats if not already done.
+ *
+ * If created_entry is non-NULL, it'll be set to true if the entry is newly
+ * created, false otherwise.
+ */
PgStat_EntryRef *
pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry)
{
	PgStat_EntryRef *entry_ref;

	/* need to be able to flush out */
	Assert(pgstat_get_kind_info(kind)->flush_pending_cb != NULL);

	/* lazily create the long-lived context holding all pending data */
	if (unlikely(!pgStatPendingContext))
	{
		pgStatPendingContext =
			AllocSetContextCreate(TopMemoryContext,
								  "PgStat Pending",
								  ALLOCSET_SMALL_SIZES);
	}

	entry_ref = pgstat_get_entry_ref(kind, dboid, objoid,
									 true, created_entry);

	if (entry_ref->pending == NULL)
	{
		size_t		entrysize = pgstat_get_kind_info(kind)->pending_size;

		/* pending_size of -1 means the kind has no pending data */
		Assert(entrysize != (size_t) -1);

		/* zeroed allocation; also links the entry into the pending list */
		entry_ref->pending = MemoryContextAllocZero(pgStatPendingContext, entrysize);
		dlist_push_tail(&pgStatPending, &entry_ref->pending_node);
	}

	return entry_ref;
}
+
+/*
+ * Return an existing stats entry, or NULL.
+ *
+ * This should only be used for helper function for pgstatfuncs.c - outside of
+ * that it shouldn't be needed.
+ */
+PgStat_EntryRef *
+pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
+{
+ PgStat_EntryRef *entry_ref;
+
+ entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL);
+
+ if (entry_ref == NULL || entry_ref->pending == NULL)
+ return NULL;
+
+ return entry_ref;
+}
+
+void
+pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref)
+{
+ PgStat_Kind kind = entry_ref->shared_entry->key.kind;
+ const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+ void *pending_data = entry_ref->pending;
+
+ Assert(pending_data != NULL);
+ /* !fixed_amount stats should be handled explicitly */
+ Assert(!pgstat_get_kind_info(kind)->fixed_amount);
+
+ if (kind_info->delete_pending_cb)
+ kind_info->delete_pending_cb(entry_ref);
+
+ pfree(pending_data);
+ entry_ref->pending = NULL;
+
+ dlist_delete(&entry_ref->pending_node);
+}
+
+/*
+ * Flush out pending stats for database objects (databases, relations,
+ * functions).
+ */
/*
 * Flush out all pending entries via their per-kind flush callbacks.
 *
 * Returns true if some entries could not be flushed; that is only possible
 * with nowait = true (the callbacks must succeed when allowed to block).
 */
static bool
pgstat_flush_pending_entries(bool nowait)
{
	bool		have_pending = false;
	dlist_node *cur = NULL;

	/*
	 * Need to be a bit careful iterating over the list of pending entries.
	 * Processing a pending entry may queue further pending entries to the end
	 * of the list that we want to process, so a simple iteration won't do.
	 * Further complicating matters is that we want to delete the current
	 * entry in each iteration from the list if we flushed successfully.
	 *
	 * So we just keep track of the next pointer in each loop iteration.
	 */
	if (!dlist_is_empty(&pgStatPending))
		cur = dlist_head_node(&pgStatPending);

	while (cur)
	{
		PgStat_EntryRef *entry_ref =
			dlist_container(PgStat_EntryRef, pending_node, cur);
		PgStat_HashKey key = entry_ref->shared_entry->key;
		PgStat_Kind kind = key.kind;
		const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
		bool		did_flush;
		dlist_node *next;

		Assert(!kind_info->fixed_amount);
		Assert(kind_info->flush_pending_cb != NULL);

		/* flush the stats, if possible */
		did_flush = kind_info->flush_pending_cb(entry_ref, nowait);

		/* flushing may only fail when told not to wait for locks */
		Assert(did_flush || nowait);

		/* determine next entry, before deleting the pending entry */
		if (dlist_has_next(&pgStatPending, cur))
			next = dlist_next_node(&pgStatPending, cur);
		else
			next = NULL;

		/* if successfully flushed, remove entry */
		if (did_flush)
			pgstat_delete_pending_entry(entry_ref);
		else
			have_pending = true;

		cur = next;
	}

	Assert(dlist_is_empty(&pgStatPending) == !have_pending);

	return have_pending;
}
+
+
+/* ------------------------------------------------------------
+ * Helper / infrastructure functions
+ * ------------------------------------------------------------
+ */
+
+PgStat_Kind
+pgstat_get_kind_from_str(char *kind_str)
+{
+ for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++)
+ {
+ if (pg_strcasecmp(kind_str, pgstat_kind_infos[kind].name) == 0)
+ return kind;
+ }
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid statistics kind: \"%s\"", kind_str)));
+ return PGSTAT_KIND_DATABASE; /* avoid compiler warnings */
+}
+
+static inline bool
+pgstat_is_kind_valid(int ikind)
+{
+ return ikind >= PGSTAT_KIND_FIRST_VALID && ikind <= PGSTAT_KIND_LAST;
+}
+
/*
 * Return the static metadata describing 'kind'.  The returned pointer refers
 * to constant table data and must not be modified or freed.
 */
const PgStat_KindInfo *
pgstat_get_kind_info(PgStat_Kind kind)
{
	AssertArg(pgstat_is_kind_valid(kind));

	return &pgstat_kind_infos[kind];
}
+
+/*
+ * Stats should only be reported after pgstat_initialize() and before
+ * pgstat_shutdown(). This check is put in a few central places to catch
+ * violations of this rule more easily.
+ */
#ifdef USE_ASSERT_CHECKING
/* Assert that the stats subsystem is initialized and not yet shut down. */
void
pgstat_assert_is_up(void)
{
	Assert(pgstat_is_initialized && !pgstat_is_shutdown);
}
#endif
+
+
+/* ------------------------------------------------------------
+ * reading and writing of on-disk stats file
+ * ------------------------------------------------------------
+ */
+
+/* helpers for pgstat_write_statsfile() */
/* helpers for pgstat_write_statsfile() */
static void
write_chunk(FILE *fpout, void *ptr, size_t len)
{
	size_t		nwritten;

	/* write 'len' bytes as a single item */
	nwritten = fwrite(ptr, len, 1, fpout);

	/* we'll check for errors with ferror once at the end */
	(void) nwritten;
}

#define write_chunk_s(fpout, ptr) write_chunk(fpout, ptr, sizeof(*ptr))
+
+/*
+ * This function is called in the last process that is accessing the shared
+ * stats so locking is not required.
+ */
/*
 * This function is called in the last process that is accessing the shared
 * stats so locking is not required.
 *
 * The file is written to a temporary name first and then atomically renamed
 * into place, so readers never see a partially-written stats file.  Errors
 * are logged but not raised; on any failure the temp file is removed.
 */
static void
pgstat_write_statsfile(void)
{
	FILE	   *fpout;
	int32		format_id;
	const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE;
	const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
	dshash_seq_status hstat;
	PgStatShared_HashEntry *ps;

	pgstat_assert_is_up();

	/* we're shutting down, so it's ok to just override this */
	pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE;

	elog(DEBUG2, "writing stats file \"%s\"", statfile);

	/*
	 * Open the statistics temp file to write out the current values.
	 */
	fpout = AllocateFile(tmpfile, PG_BINARY_W);
	if (fpout == NULL)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open temporary statistics file \"%s\": %m",
						tmpfile)));
		return;
	}

	/*
	 * Write the file header --- currently just a format ID.
	 */
	format_id = PGSTAT_FILE_FORMAT_ID;
	write_chunk_s(fpout, &format_id);

	/*
	 * XXX: The following could now be generalized to just iterate over
	 * pgstat_kind_infos instead of knowing about the different kinds of
	 * stats.
	 */

	/*
	 * Write archiver stats struct
	 */
	pgstat_build_snapshot_fixed(PGSTAT_KIND_ARCHIVER);
	write_chunk_s(fpout, &pgStatLocal.snapshot.archiver);

	/*
	 * Write bgwriter stats struct
	 */
	pgstat_build_snapshot_fixed(PGSTAT_KIND_BGWRITER);
	write_chunk_s(fpout, &pgStatLocal.snapshot.bgwriter);

	/*
	 * Write checkpointer stats struct
	 */
	pgstat_build_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER);
	write_chunk_s(fpout, &pgStatLocal.snapshot.checkpointer);

	/*
	 * Write SLRU stats struct
	 */
	pgstat_build_snapshot_fixed(PGSTAT_KIND_SLRU);
	write_chunk_s(fpout, &pgStatLocal.snapshot.slru);

	/*
	 * Write WAL stats struct
	 */
	pgstat_build_snapshot_fixed(PGSTAT_KIND_WAL);
	write_chunk_s(fpout, &pgStatLocal.snapshot.wal);

	/*
	 * Walk through the stats entries
	 */
	dshash_seq_init(&hstat, pgStatLocal.shared_hash, false);
	while ((ps = dshash_seq_next(&hstat)) != NULL)
	{
		PgStatShared_Common *shstats;
		const PgStat_KindInfo *kind_info = NULL;

		CHECK_FOR_INTERRUPTS();

		/* we may have some "dropped" entries not yet removed, skip them */
		Assert(!ps->dropped);
		if (ps->dropped)
			continue;

		shstats = (PgStatShared_Common *) dsa_get_address(pgStatLocal.dsa, ps->body);

		kind_info = pgstat_get_kind_info(ps->key.kind);

		/* if not dropped the valid-entry refcount should exist */
		Assert(pg_atomic_read_u32(&ps->refcount) > 0);

		if (!kind_info->to_serialized_name)
		{
			/* normal stats entry, identified by PgStat_HashKey */
			fputc('S', fpout);
			write_chunk_s(fpout, &ps->key);
		}
		else
		{
			/* stats entry identified by name on disk (e.g. slots) */
			NameData	name;

			kind_info->to_serialized_name(&ps->key, shstats, &name);

			fputc('N', fpout);
			write_chunk_s(fpout, &ps->key.kind);
			write_chunk_s(fpout, &name);
		}

		/* Write except the header part of the entry */
		write_chunk(fpout,
					pgstat_get_entry_data(ps->key.kind, shstats),
					pgstat_get_entry_len(ps->key.kind));
	}
	dshash_seq_term(&hstat);

	/*
	 * No more output to be done. Close the temp file and replace the old
	 * pgstat.stat with it. The ferror() check replaces testing for error
	 * after each individual fputc or fwrite (in write_chunk()) above.
	 */
	fputc('E', fpout);

	if (ferror(fpout))
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write temporary statistics file \"%s\": %m",
						tmpfile)));
		FreeFile(fpout);
		unlink(tmpfile);
	}
	else if (FreeFile(fpout) < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not close temporary statistics file \"%s\": %m",
						tmpfile)));
		unlink(tmpfile);
	}
	else if (rename(tmpfile, statfile) < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
						tmpfile, statfile)));
		unlink(tmpfile);
	}
}
+
+/* helpers for pgstat_read_statsfile() */
/* helpers for pgstat_read_statsfile() */
static bool
read_chunk(FILE *fpin, void *ptr, size_t len)
{
	size_t		nread = fread(ptr, 1, len, fpin);

	/* success only if exactly 'len' bytes were read */
	return nread == len;
}

#define read_chunk_s(fpin, ptr) read_chunk(fpin, ptr, sizeof(*ptr))
+
+/*
+ * Reads in existing statistics file into the shared stats hash.
+ *
+ * This function is called in the only process that is accessing the shared
+ * stats so locking is not required.
+ */
/*
 * Reads in existing statistics file into the shared stats hash.
 *
 * This function is called in the only process that is accessing the shared
 * stats so locking is not required.
 *
 * On any parse failure all stats are reset via pgstat_reset_after_failure(),
 * since the file may have been partially loaded already.  The on-disk file
 * is removed after a successful read so stale stats are never re-used.
 */
static void
pgstat_read_statsfile(void)
{
	FILE	   *fpin;
	int32		format_id;
	bool		found;
	const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
	PgStat_ShmemControl *shmem = pgStatLocal.shmem;

	/* shouldn't be called from postmaster */
	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);

	elog(DEBUG2, "reading stats file \"%s\"", statfile);

	/*
	 * Try to open the stats file. If it doesn't exist, the backends simply
	 * returns zero for anything and statistics simply starts from scratch
	 * with empty counters.
	 *
	 * ENOENT is a possibility if stats collection was previously disabled or
	 * has not yet written the stats file for the first time. Any other
	 * failure condition is suspicious.
	 */
	if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
	{
		if (errno != ENOENT)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not open statistics file \"%s\": %m",
							statfile)));
		pgstat_reset_after_failure();
		return;
	}

	/*
	 * Verify it's of the expected format.
	 */
	if (!read_chunk_s(fpin, &format_id) ||
		format_id != PGSTAT_FILE_FORMAT_ID)
		goto error;

	/*
	 * XXX: The following could now be generalized to just iterate over
	 * pgstat_kind_infos instead of knowing about the different kinds of
	 * stats.
	 */

	/*
	 * Read archiver stats struct
	 */
	if (!read_chunk_s(fpin, &shmem->archiver.stats))
		goto error;

	/*
	 * Read bgwriter stats struct
	 */
	if (!read_chunk_s(fpin, &shmem->bgwriter.stats))
		goto error;

	/*
	 * Read checkpointer stats struct
	 */
	if (!read_chunk_s(fpin, &shmem->checkpointer.stats))
		goto error;

	/*
	 * Read SLRU stats struct
	 */
	if (!read_chunk_s(fpin, &shmem->slru.stats))
		goto error;

	/*
	 * Read WAL stats struct
	 */
	if (!read_chunk_s(fpin, &shmem->wal.stats))
		goto error;

	/*
	 * We found an existing statistics file. Read it and put all the hash
	 * table entries into place.
	 */
	for (;;)
	{
		int			t = fgetc(fpin);

		switch (t)
		{
			case 'S':
			case 'N':
				{
					PgStat_HashKey key;
					PgStatShared_HashEntry *p;
					PgStatShared_Common *header;

					CHECK_FOR_INTERRUPTS();

					if (t == 'S')
					{
						/* normal stats entry, identified by PgStat_HashKey */
						if (!read_chunk_s(fpin, &key))
							goto error;

						if (!pgstat_is_kind_valid(key.kind))
							goto error;
					}
					else
					{
						/* stats entry identified by name on disk (e.g. slots) */
						const PgStat_KindInfo *kind_info = NULL;
						PgStat_Kind kind;
						NameData	name;

						if (!read_chunk_s(fpin, &kind))
							goto error;
						if (!read_chunk_s(fpin, &name))
							goto error;
						if (!pgstat_is_kind_valid(kind))
							goto error;

						kind_info = pgstat_get_kind_info(kind);

						if (!kind_info->from_serialized_name)
							goto error;

						if (!kind_info->from_serialized_name(&name, &key))
						{
							/* skip over data for entry we don't care about */
							if (fseek(fpin, pgstat_get_entry_len(kind), SEEK_CUR) != 0)
								goto error;

							continue;
						}

						Assert(key.kind == kind);
					}

					/*
					 * This intentionally doesn't use pgstat_get_entry_ref() -
					 * putting all stats into checkpointer's
					 * pgStatEntryRefHash would be wasted effort and memory.
					 */
					p = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &found);

					/* don't allow duplicate entries */
					if (found)
					{
						dshash_release_lock(pgStatLocal.shared_hash, p);
						elog(WARNING, "found duplicate stats entry %d/%u/%u",
							 key.kind, key.dboid, key.objoid);
						goto error;
					}

					header = pgstat_init_entry(key.kind, p);
					dshash_release_lock(pgStatLocal.shared_hash, p);

					/* fill in the entry body directly from the file */
					if (!read_chunk(fpin,
									pgstat_get_entry_data(key.kind, header),
									pgstat_get_entry_len(key.kind)))
						goto error;

					break;
				}
			case 'E':
				/* check that 'E' actually signals end of file */
				if (fgetc(fpin) != EOF)
					goto error;

				goto done;

			default:
				goto error;
		}
	}

done:
	FreeFile(fpin);

	elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
	unlink(statfile);

	return;

error:
	ereport(LOG,
			(errmsg("corrupted statistics file \"%s\"", statfile)));

	pgstat_reset_after_failure();

	goto done;
}
+
+/*
+ * Helper to reset / drop stats after a crash or after restoring stats from
+ * disk failed, potentially after already loading parts.
+ */
+static void
+pgstat_reset_after_failure(void)
+{
+ TimestampTz ts = GetCurrentTimestamp();
+
+ /* reset fixed-numbered stats */
+ for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++)
+ {
+ const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+
+ if (!kind_info->fixed_amount)
+ continue;
+
+ kind_info->reset_all_cb(ts);
+ }
+
+ /* and drop variable-numbered ones */
+ pgstat_drop_all_entries();
+}
+
+/*
+ * GUC assign_hook for stats_fetch_consistency.
+ */
+void
+assign_stats_fetch_consistency(int newval, void *extra)
+{
+ /*
+ * Changing this value in a transaction may cause snapshot state
+ * inconsistencies, so force a clear of the current snapshot on the next
+ * snapshot build attempt.
+ */
+ if (pgstat_fetch_consistency != newval)
+ force_stats_snapshot_clear = true;
+}
diff --git a/src/backend/utils/activity/pgstat_archiver.c b/src/backend/utils/activity/pgstat_archiver.c
new file mode 100644
index 0000000..851726f
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_archiver.c
@@ -0,0 +1,111 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_archiver.c
+ * Implementation of archiver statistics.
+ *
+ * This file contains the implementation of archiver statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_archiver.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+#include "utils/timestamp.h"
+
+
+/*
+ * Report archiver statistics
+ */
void
pgstat_report_archiver(const char *xlog, bool failed)
{
	PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver;
	TimestampTz now = GetCurrentTimestamp();

	/* update under the changecount protocol so readers can detect races */
	pgstat_begin_changecount_write(&stats_shmem->changecount);

	if (failed)
	{
		/* failed archival attempt: bump counter, remember WAL file + time */
		++stats_shmem->stats.failed_count;

		/*
		 * NOTE(review): copies sizeof(last_failed_wal) bytes from 'xlog';
		 * assumes the caller's buffer is at least that large -- confirm.
		 */
		memcpy(&stats_shmem->stats.last_failed_wal, xlog,
			   sizeof(stats_shmem->stats.last_failed_wal));
		stats_shmem->stats.last_failed_timestamp = now;
	}
	else
	{
		/* successful archival */
		++stats_shmem->stats.archived_count;
		memcpy(&stats_shmem->stats.last_archived_wal, xlog,
			   sizeof(stats_shmem->stats.last_archived_wal));
		stats_shmem->stats.last_archived_timestamp = now;
	}

	pgstat_end_changecount_write(&stats_shmem->changecount);
}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * a pointer to the archiver statistics struct.
+ */
+PgStat_ArchiverStats *
+pgstat_fetch_stat_archiver(void)
+{
+ pgstat_snapshot_fixed(PGSTAT_KIND_ARCHIVER);
+
+ return &pgStatLocal.snapshot.archiver;
+}
+
+void
+pgstat_archiver_reset_all_cb(TimestampTz ts)
+{
+ PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver;
+
+ /* see explanation above PgStatShared_Archiver for the reset protocol */
+ LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
+ pgstat_copy_changecounted_stats(&stats_shmem->reset_offset,
+ &stats_shmem->stats,
+ sizeof(stats_shmem->stats),
+ &stats_shmem->changecount);
+ stats_shmem->stats.stat_reset_timestamp = ts;
+ LWLockRelease(&stats_shmem->lock);
+}
+
void
pgstat_archiver_snapshot_cb(void)
{
	PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver;
	PgStat_ArchiverStats *stat_snap = &pgStatLocal.snapshot.archiver;
	PgStat_ArchiverStats *reset_offset = &stats_shmem->reset_offset;
	PgStat_ArchiverStats reset;

	/* read the live counters consistently via the changecount protocol */
	pgstat_copy_changecounted_stats(stat_snap,
									&stats_shmem->stats,
									sizeof(stats_shmem->stats),
									&stats_shmem->changecount);

	LWLockAcquire(&stats_shmem->lock, LW_SHARED);
	memcpy(&reset, reset_offset, sizeof(stats_shmem->stats));
	LWLockRelease(&stats_shmem->lock);

	/* compensate by reset offsets */
	if (stat_snap->archived_count == reset.archived_count)
	{
		/* nothing archived since the last reset: clear last-WAL info too */
		stat_snap->last_archived_wal[0] = 0;
		stat_snap->last_archived_timestamp = 0;
	}
	stat_snap->archived_count -= reset.archived_count;

	if (stat_snap->failed_count == reset.failed_count)
	{
		/* no failures since the last reset */
		stat_snap->last_failed_wal[0] = 0;
		stat_snap->last_failed_timestamp = 0;
	}
	stat_snap->failed_count -= reset.failed_count;
}
diff --git a/src/backend/utils/activity/pgstat_bgwriter.c b/src/backend/utils/activity/pgstat_bgwriter.c
new file mode 100644
index 0000000..fbb1edc
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_bgwriter.c
@@ -0,0 +1,110 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_bgwriter.c
+ * Implementation of bgwriter statistics.
+ *
+ * This file contains the implementation of bgwriter statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_bgwriter.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+
+
/* backend-local pending counters, flushed and cleared by pgstat_report_bgwriter() */
PgStat_BgWriterStats PendingBgWriterStats = {0};
+
+
+/*
+ * Report bgwriter statistics
+ */
void
pgstat_report_bgwriter(void)
{
	PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter;
	static const PgStat_BgWriterStats all_zeroes;

	Assert(!pgStatLocal.shmem->is_shutdown);
	pgstat_assert_is_up();

	/*
	 * This function can be called even if nothing at all has happened. In
	 * this case, avoid unnecessarily modifying the stats entry.
	 */
	if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(all_zeroes)) == 0)
		return;

	/* accumulate pending counters under the changecount protocol */
	pgstat_begin_changecount_write(&stats_shmem->changecount);

#define BGWRITER_ACC(fld) stats_shmem->stats.fld += PendingBgWriterStats.fld
	BGWRITER_ACC(buf_written_clean);
	BGWRITER_ACC(maxwritten_clean);
	BGWRITER_ACC(buf_alloc);
#undef BGWRITER_ACC

	pgstat_end_changecount_write(&stats_shmem->changecount);

	/*
	 * Clear out the statistics buffer, so it can be re-used.
	 */
	MemSet(&PendingBgWriterStats, 0, sizeof(PendingBgWriterStats));
}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * a pointer to the bgwriter statistics struct.
+ */
+PgStat_BgWriterStats *
+pgstat_fetch_stat_bgwriter(void)
+{
+ pgstat_snapshot_fixed(PGSTAT_KIND_BGWRITER);
+
+ return &pgStatLocal.snapshot.bgwriter;
+}
+
+void
+pgstat_bgwriter_reset_all_cb(TimestampTz ts)
+{
+ PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter;
+
+ /* see explanation above PgStatShared_BgWriter for the reset protocol */
+ LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
+ pgstat_copy_changecounted_stats(&stats_shmem->reset_offset,
+ &stats_shmem->stats,
+ sizeof(stats_shmem->stats),
+ &stats_shmem->changecount);
+ stats_shmem->stats.stat_reset_timestamp = ts;
+ LWLockRelease(&stats_shmem->lock);
+}
+
void
pgstat_bgwriter_snapshot_cb(void)
{
	PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter;
	PgStat_BgWriterStats *reset_offset = &stats_shmem->reset_offset;
	PgStat_BgWriterStats reset;

	/* read the live counters consistently via the changecount protocol */
	pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.bgwriter,
									&stats_shmem->stats,
									sizeof(stats_shmem->stats),
									&stats_shmem->changecount);

	LWLockAcquire(&stats_shmem->lock, LW_SHARED);
	memcpy(&reset, reset_offset, sizeof(stats_shmem->stats));
	LWLockRelease(&stats_shmem->lock);

	/* compensate by reset offsets */
#define BGWRITER_COMP(fld) pgStatLocal.snapshot.bgwriter.fld -= reset.fld;
	BGWRITER_COMP(buf_written_clean);
	BGWRITER_COMP(maxwritten_clean);
	BGWRITER_COMP(buf_alloc);
#undef BGWRITER_COMP
}
diff --git a/src/backend/utils/activity/pgstat_checkpointer.c b/src/backend/utils/activity/pgstat_checkpointer.c
new file mode 100644
index 0000000..af8d513
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_checkpointer.c
@@ -0,0 +1,121 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_checkpointer.c
+ * Implementation of checkpoint statistics.
+ *
+ * This file contains the implementation of checkpoint statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_checkpointer.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+
+
/* backend-local pending counters, flushed and cleared by pgstat_report_checkpointer() */
PgStat_CheckpointerStats PendingCheckpointerStats = {0};
+
+
+/*
+ * Report checkpointer statistics
+ */
void
pgstat_report_checkpointer(void)
{
	/* We assume this initializes to zeroes */
	static const PgStat_CheckpointerStats all_zeroes;
	PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer;

	Assert(!pgStatLocal.shmem->is_shutdown);
	pgstat_assert_is_up();

	/*
	 * This function can be called even if nothing at all has happened. In
	 * this case, avoid unnecessarily modifying the stats entry.
	 */
	if (memcmp(&PendingCheckpointerStats, &all_zeroes,
			   sizeof(all_zeroes)) == 0)
		return;

	/* accumulate pending counters under the changecount protocol */
	pgstat_begin_changecount_write(&stats_shmem->changecount);

#define CHECKPOINTER_ACC(fld) stats_shmem->stats.fld += PendingCheckpointerStats.fld
	CHECKPOINTER_ACC(timed_checkpoints);
	CHECKPOINTER_ACC(requested_checkpoints);
	CHECKPOINTER_ACC(checkpoint_write_time);
	CHECKPOINTER_ACC(checkpoint_sync_time);
	CHECKPOINTER_ACC(buf_written_checkpoints);
	CHECKPOINTER_ACC(buf_written_backend);
	CHECKPOINTER_ACC(buf_fsync_backend);
#undef CHECKPOINTER_ACC

	pgstat_end_changecount_write(&stats_shmem->changecount);

	/*
	 * Clear out the statistics buffer, so it can be re-used.
	 */
	MemSet(&PendingCheckpointerStats, 0, sizeof(PendingCheckpointerStats));
}
+
+/*
+ * pgstat_fetch_stat_checkpointer() -
+ *
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * a pointer to the checkpointer statistics struct.
+ */
+PgStat_CheckpointerStats *
+pgstat_fetch_stat_checkpointer(void)
+{
+ pgstat_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER);
+
+ return &pgStatLocal.snapshot.checkpointer;
+}
+
+void
+pgstat_checkpointer_reset_all_cb(TimestampTz ts)
+{
+ PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer;
+
+ /* see explanation above PgStatShared_Checkpointer for the reset protocol */
+ LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
+ pgstat_copy_changecounted_stats(&stats_shmem->reset_offset,
+ &stats_shmem->stats,
+ sizeof(stats_shmem->stats),
+ &stats_shmem->changecount);
+ LWLockRelease(&stats_shmem->lock);
+}
+
void
pgstat_checkpointer_snapshot_cb(void)
{
	PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer;
	PgStat_CheckpointerStats *reset_offset = &stats_shmem->reset_offset;
	PgStat_CheckpointerStats reset;

	/* read the live counters consistently via the changecount protocol */
	pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.checkpointer,
									&stats_shmem->stats,
									sizeof(stats_shmem->stats),
									&stats_shmem->changecount);

	LWLockAcquire(&stats_shmem->lock, LW_SHARED);
	memcpy(&reset, reset_offset, sizeof(stats_shmem->stats));
	LWLockRelease(&stats_shmem->lock);

	/* compensate by reset offsets */
#define CHECKPOINTER_COMP(fld) pgStatLocal.snapshot.checkpointer.fld -= reset.fld;
	CHECKPOINTER_COMP(timed_checkpoints);
	CHECKPOINTER_COMP(requested_checkpoints);
	CHECKPOINTER_COMP(checkpoint_write_time);
	CHECKPOINTER_COMP(checkpoint_sync_time);
	CHECKPOINTER_COMP(buf_written_checkpoints);
	CHECKPOINTER_COMP(buf_written_backend);
	CHECKPOINTER_COMP(buf_fsync_backend);
#undef CHECKPOINTER_COMP
}
diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c
new file mode 100644
index 0000000..4235fa0
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_database.c
@@ -0,0 +1,437 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_database.c
+ * Implementation of database statistics.
+ *
+ * This file contains the implementation of database statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_database.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+#include "utils/timestamp.h"
+#include "storage/procsignal.h"
+
+
+static bool pgstat_should_report_connstat(void);
+
+
+/*
+ * Backend-local accumulators for I/O timings and session times; drained
+ * into the database's pending stats entry by pgstat_update_dbstats().
+ */
+PgStat_Counter pgStatBlockReadTime = 0;
+PgStat_Counter pgStatBlockWriteTime = 0;
+PgStat_Counter pgStatActiveTime = 0;
+PgStat_Counter pgStatTransactionIdleTime = 0;
+/* how the current session ended; consulted by pgstat_report_disconnect() */
+SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL;
+
+
+/* commit/rollback counts accumulated since the last stats flush */
+static int	pgStatXactCommit = 0;
+static int	pgStatXactRollback = 0;
+/* set to MyStartTimestamp by pgstat_report_connect(), advanced at each flush */
+static PgStat_Counter pgLastSessionReportTime = 0;
+
+
+/*
+ * Remove entry for the database being dropped.
+ *
+ * The drop is registered transactionally, so it only takes effect if the
+ * surrounding transaction commits (see pgstat_drop_transactional()).
+ */
+void
+pgstat_drop_database(Oid databaseid)
+{
+	pgstat_drop_transactional(PGSTAT_KIND_DATABASE, databaseid, InvalidOid);
+}
+
+/*
+ * Called from autovacuum.c to report startup of an autovacuum process.
+ * We are called before InitPostgres is done, so can't rely on MyDatabaseId;
+ * the db OID must be passed in, instead.
+ */
+void
+pgstat_report_autovac(Oid dboid)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStatShared_Database *dbentry;
+
+	/* can't get here in single user mode */
+	Assert(IsUnderPostmaster);
+
+	/*
+	 * End-of-vacuum is reported instantly. Report the start the same way for
+	 * consistency. Vacuum doesn't run frequently and is a long-lasting
+	 * operation so it doesn't matter if we get blocked here a little.
+	 */
+	entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE,
+											dboid, InvalidOid, false);
+
+	/* update the shared entry directly, bypassing pending stats */
+	dbentry = (PgStatShared_Database *) entry_ref->shared_stats;
+	dbentry->stats.last_autovac_time = GetCurrentTimestamp();
+
+	pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Report a Hot Standby recovery conflict.
+ *
+ * "reason" is one of the PROCSIG_RECOVERY_CONFLICT_* values; the matching
+ * per-database conflict counter is incremented in the pending stats entry.
+ */
+void
+pgstat_report_recovery_conflict(int reason)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	Assert(IsUnderPostmaster);
+	if (!pgstat_track_counts)
+		return;
+
+	dbentry = pgstat_prep_database_pending(MyDatabaseId);
+
+	switch (reason)
+	{
+		case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+
+			/*
+			 * Since we drop the information about the database as soon as it
+			 * replicates, there is no point in counting these conflicts.
+			 */
+			break;
+		case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+			dbentry->n_conflict_tablespace++;
+			break;
+		case PROCSIG_RECOVERY_CONFLICT_LOCK:
+			dbentry->n_conflict_lock++;
+			break;
+		case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+			dbentry->n_conflict_snapshot++;
+			break;
+		case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+			dbentry->n_conflict_bufferpin++;
+			break;
+		case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+			dbentry->n_conflict_startup_deadlock++;
+			break;
+	}
+}
+
+/*
+ * Report a detected deadlock.
+ */
+void
+pgstat_report_deadlock(void)
+{
+	PgStat_StatDBEntry *dbent;
+
+	if (!pgstat_track_counts)
+		return;
+
+	dbent = pgstat_prep_database_pending(MyDatabaseId);
+	dbent->n_deadlocks++;
+}
+
+/*
+ * Report one or more checksum failures.
+ *
+ * Unlike most reporting functions, this writes to the shared entry
+ * immediately instead of going through the pending-stats machinery.
+ */
+void
+pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStatShared_Database *sharedent;
+
+	if (!pgstat_track_counts)
+		return;
+
+	/*
+	 * Update the shared stats directly - checksum failures should never be
+	 * common enough for that to be a problem.
+	 */
+	entry_ref =
+		pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false);
+
+	sharedent = (PgStatShared_Database *) entry_ref->shared_stats;
+	sharedent->stats.n_checksum_failures += failurecount;
+	sharedent->stats.last_checksum_failure = GetCurrentTimestamp();
+
+	pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Report one checksum failure in the current database.
+ */
+void
+pgstat_report_checksum_failure(void)
+{
+	pgstat_report_checksum_failures_in_db(MyDatabaseId, 1);
+}
+
+/*
+ * Report creation of temporary file.
+ *
+ * Both the file count and the cumulative byte count are tracked.
+ */
+void
+pgstat_report_tempfile(size_t filesize)
+{
+	PgStat_StatDBEntry *dbent;
+
+	if (!pgstat_track_counts)
+		return;
+
+	dbent = pgstat_prep_database_pending(MyDatabaseId);
+	dbent->n_temp_bytes += filesize;
+	dbent->n_temp_files++;
+}
+
+/*
+ * Notify stats system of a new connection.
+ *
+ * NOTE(review): the pending entry is looked up via MyDatabaseId; the dboid
+ * argument is not used in this function's body.
+ */
+void
+pgstat_report_connect(Oid dboid)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	if (!pgstat_should_report_connstat())
+		return;
+
+	/* session time will be measured from backend start */
+	pgLastSessionReportTime = MyStartTimestamp;
+
+	dbentry = pgstat_prep_database_pending(MyDatabaseId);
+	dbentry->n_sessions++;
+}
+
+/*
+ * Notify the stats system of a disconnect.
+ *
+ * Classifies the session end according to pgStatSessionEndCause; normal
+ * disconnects are not counted separately.  As above, dboid is unused and
+ * MyDatabaseId determines the entry.
+ */
+void
+pgstat_report_disconnect(Oid dboid)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	if (!pgstat_should_report_connstat())
+		return;
+
+	dbentry = pgstat_prep_database_pending(MyDatabaseId);
+
+	switch (pgStatSessionEndCause)
+	{
+		case DISCONNECT_NOT_YET:
+		case DISCONNECT_NORMAL:
+			/* we don't collect these */
+			break;
+		case DISCONNECT_CLIENT_EOF:
+			dbentry->n_sessions_abandoned++;
+			break;
+		case DISCONNECT_FATAL:
+			dbentry->n_sessions_fatal++;
+			break;
+		case DISCONNECT_KILLED:
+			dbentry->n_sessions_killed++;
+			break;
+	}
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * the collected statistics for one database or NULL. NULL doesn't mean
+ * that the database doesn't exist, just that there are no statistics, so the
+ * caller is better off to report ZERO instead.
+ */
+PgStat_StatDBEntry *
+pgstat_fetch_stat_dbentry(Oid dboid)
+{
+	return (PgStat_StatDBEntry *)
+		pgstat_fetch_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid);
+}
+
+/*
+ * Perform database-stats end-of-transaction work: record the commit or
+ * abort in backend-local counters, which pgstat_update_dbstats() later
+ * folds into the pending entry for the current database.
+ */
+void
+AtEOXact_PgStat_Database(bool isCommit, bool parallel)
+{
+	/* Don't count parallel worker transaction stats */
+	if (!parallel)
+	{
+		/*
+		 * Count transaction commit or abort. (We use counters, not just
+		 * bools, in case the reporting message isn't sent right away.)
+		 */
+		if (isCommit)
+			pgStatXactCommit++;
+		else
+			pgStatXactRollback++;
+	}
+}
+
+/*
+ * Subroutine for pgstat_report_stat(): Handle xact commit/rollback and I/O
+ * timings.
+ *
+ * Transfers the backend-local accumulators into the pending entry for the
+ * current database and zeroes them afterwards.
+ */
+void
+pgstat_update_dbstats(TimestampTz ts)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	/*
+	 * If not connected to a database yet, don't attribute time to "shared
+	 * state" (InvalidOid is used to track stats for shared relations, etc.).
+	 */
+	if (!OidIsValid(MyDatabaseId))
+		return;
+
+	dbentry = pgstat_prep_database_pending(MyDatabaseId);
+
+	/*
+	 * Accumulate xact commit/rollback and I/O timings to stats entry of the
+	 * current database.
+	 */
+	dbentry->n_xact_commit += pgStatXactCommit;
+	dbentry->n_xact_rollback += pgStatXactRollback;
+	dbentry->n_block_read_time += pgStatBlockReadTime;
+	dbentry->n_block_write_time += pgStatBlockWriteTime;
+
+	if (pgstat_should_report_connstat())
+	{
+		long		secs;
+		int			usecs;
+
+		/*
+		 * pgLastSessionReportTime is initialized to MyStartTimestamp by
+		 * pgstat_report_connect().
+		 */
+		TimestampDifference(pgLastSessionReportTime, ts, &secs, &usecs);
+		pgLastSessionReportTime = ts;
+		/* session time is stored in microseconds */
+		dbentry->total_session_time += (PgStat_Counter) secs * 1000000 + usecs;
+		dbentry->total_active_time += pgStatActiveTime;
+		dbentry->total_idle_in_xact_time += pgStatTransactionIdleTime;
+	}
+
+	/* the accumulators have been transferred; start over */
+	pgStatXactCommit = 0;
+	pgStatXactRollback = 0;
+	pgStatBlockReadTime = 0;
+	pgStatBlockWriteTime = 0;
+	pgStatActiveTime = 0;
+	pgStatTransactionIdleTime = 0;
+}
+
+/*
+ * We report session statistics only for normal backend processes. Parallel
+ * workers run in parallel, so they don't contribute to session times, even
+ * though they use CPU time. Walsender processes could be considered here,
+ * but they have different session characteristics from normal backends (for
+ * example, they are always "active"), so they would skew session statistics.
+ */
+static bool
+pgstat_should_report_connstat(void)
+{
+	return MyBackendType == B_BACKEND;
+}
+
+/*
+ * Find or create a local PgStat_StatDBEntry entry for dboid.
+ *
+ * Returns the pending-stats struct, creating it if necessary; the entry is
+ * flushed to shared memory by pgstat_database_flush_cb().
+ */
+PgStat_StatDBEntry *
+pgstat_prep_database_pending(Oid dboid)
+{
+	PgStat_EntryRef *entry_ref;
+
+	/*
+	 * This should not report stats on database objects before having
+	 * connected to a database.
+	 */
+	Assert(!OidIsValid(dboid) || OidIsValid(MyDatabaseId));
+
+	entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid,
+										  NULL);
+
+	return entry_ref->pending;
+}
+
+/*
+ * Reset the database's reset timestamp, without resetting the contents of the
+ * database stats.
+ *
+ * NOTE(review): the lookup below uses MyDatabaseId, not the dboid argument;
+ * callers appear to pass the current database's OID - confirm intent.
+ */
+void
+pgstat_reset_database_timestamp(Oid dboid, TimestampTz ts)
+{
+	PgStat_EntryRef *dbref;
+	PgStatShared_Database *dbentry;
+
+	dbref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, MyDatabaseId, InvalidOid,
+										false);
+
+	dbentry = (PgStatShared_Database *) dbref->shared_stats;
+	dbentry->stats.stat_reset_timestamp = ts;
+
+	pgstat_unlock_entry(dbref);
+}
+
+/*
+ * Flush out pending stats for the entry
+ *
+ * If nowait is true, this function returns false if the lock could not be
+ * acquired immediately; otherwise true is returned.
+ */
+bool
+pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait)
+{
+	PgStatShared_Database *sharedent;
+	PgStat_StatDBEntry *pendingent;
+
+	pendingent = (PgStat_StatDBEntry *) entry_ref->pending;
+	sharedent = (PgStatShared_Database *) entry_ref->shared_stats;
+
+	if (!pgstat_lock_entry(entry_ref, nowait))
+		return false;
+
+/* add a pending counter to the shared counter of the same name */
+#define PGSTAT_ACCUM_DBCOUNT(item) \
+	(sharedent)->stats.item += (pendingent)->item
+
+	PGSTAT_ACCUM_DBCOUNT(n_xact_commit);
+	PGSTAT_ACCUM_DBCOUNT(n_xact_rollback);
+	PGSTAT_ACCUM_DBCOUNT(n_blocks_fetched);
+	PGSTAT_ACCUM_DBCOUNT(n_blocks_hit);
+
+	PGSTAT_ACCUM_DBCOUNT(n_tuples_returned);
+	PGSTAT_ACCUM_DBCOUNT(n_tuples_fetched);
+	PGSTAT_ACCUM_DBCOUNT(n_tuples_inserted);
+	PGSTAT_ACCUM_DBCOUNT(n_tuples_updated);
+	PGSTAT_ACCUM_DBCOUNT(n_tuples_deleted);
+
+	/* last_autovac_time is reported immediately */
+	Assert(pendingent->last_autovac_time == 0);
+
+	PGSTAT_ACCUM_DBCOUNT(n_conflict_tablespace);
+	PGSTAT_ACCUM_DBCOUNT(n_conflict_lock);
+	PGSTAT_ACCUM_DBCOUNT(n_conflict_snapshot);
+	PGSTAT_ACCUM_DBCOUNT(n_conflict_bufferpin);
+	PGSTAT_ACCUM_DBCOUNT(n_conflict_startup_deadlock);
+
+	PGSTAT_ACCUM_DBCOUNT(n_temp_bytes);
+	PGSTAT_ACCUM_DBCOUNT(n_temp_files);
+	PGSTAT_ACCUM_DBCOUNT(n_deadlocks);
+
+	/* checksum failures are reported immediately */
+	Assert(pendingent->n_checksum_failures == 0);
+	Assert(pendingent->last_checksum_failure == 0);
+
+	PGSTAT_ACCUM_DBCOUNT(n_block_read_time);
+	PGSTAT_ACCUM_DBCOUNT(n_block_write_time);
+
+	PGSTAT_ACCUM_DBCOUNT(n_sessions);
+	PGSTAT_ACCUM_DBCOUNT(total_session_time);
+	PGSTAT_ACCUM_DBCOUNT(total_active_time);
+	PGSTAT_ACCUM_DBCOUNT(total_idle_in_xact_time);
+	PGSTAT_ACCUM_DBCOUNT(n_sessions_abandoned);
+	PGSTAT_ACCUM_DBCOUNT(n_sessions_fatal);
+	PGSTAT_ACCUM_DBCOUNT(n_sessions_killed);
+#undef PGSTAT_ACCUM_DBCOUNT
+
+	pgstat_unlock_entry(entry_ref);
+
+	/* everything is transferred; restart pending counts from zero */
+	memset(pendingent, 0, sizeof(*pendingent));
+
+	return true;
+}
+
+/*
+ * Reset-timestamp callback: record when this database's stats were last
+ * reset.
+ */
+void
+pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts)
+{
+	((PgStatShared_Database *) header)->stats.stat_reset_timestamp = ts;
+}
diff --git a/src/backend/utils/activity/pgstat_function.c b/src/backend/utils/activity/pgstat_function.c
new file mode 100644
index 0000000..427d8c4
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_function.c
@@ -0,0 +1,243 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_function.c
+ * Implementation of function statistics.
+ *
+ * This file contains the implementation of function statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_function.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "utils/inval.h"
+#include "utils/pgstat_internal.h"
+#include "utils/syscache.h"
+
+
+/* ----------
+ * GUC parameters
+ * ----------
+ */
+/* track_functions: TRACK_FUNC_OFF, TRACK_FUNC_PL, or TRACK_FUNC_ALL */
+int			pgstat_track_functions = TRACK_FUNC_OFF;
+
+
+/*
+ * Total time charged to functions so far in the current backend.
+ * We use this to help separate "self" and "other" time charges.
+ * (We assume this initializes to zero.)
+ */
+static instr_time total_func_time;
+
+
+/*
+ * Ensure that stats are dropped if transaction aborts.
+ */
+void
+pgstat_create_function(Oid proid)
+{
+	pgstat_create_transactional(PGSTAT_KIND_FUNCTION,
+								MyDatabaseId,
+								proid);
+}
+
+/*
+ * Ensure that stats are dropped if transaction commits.
+ *
+ * NB: This is only reliable because pgstat_init_function_usage() does some
+ * extra work. If other places start emitting function stats they likely need
+ * similar logic.
+ */
+void
+pgstat_drop_function(Oid proid)
+{
+	pgstat_drop_transactional(PGSTAT_KIND_FUNCTION,
+							  MyDatabaseId,
+							  proid);
+}
+
+/*
+ * Initialize function call usage data.
+ * Called by the executor before invoking a function.
+ *
+ * On exit, fcu->fs is either NULL (stats not wanted for this function) or
+ * points at the pending counters; the save_* fields and f_start let
+ * pgstat_end_function_usage() compute self vs. total time.
+ */
+void
+pgstat_init_function_usage(FunctionCallInfo fcinfo,
+						   PgStat_FunctionCallUsage *fcu)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStat_BackendFunctionEntry *pending;
+	bool		created_entry;
+
+	if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
+	{
+		/* stats not wanted */
+		fcu->fs = NULL;
+		return;
+	}
+
+	entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_FUNCTION,
+										  MyDatabaseId,
+										  fcinfo->flinfo->fn_oid,
+										  &created_entry);
+
+	/*
+	 * If no shared entry already exists, check if the function has been
+	 * deleted concurrently. This can go unnoticed until here because
+	 * executing a statement that just calls a function, does not trigger
+	 * cache invalidation processing. The reason we care about this case is
+	 * that otherwise we could create a new stats entry for an already dropped
+	 * function (for relations etc this is not possible because emitting stats
+	 * requires a lock for the relation to already have been acquired).
+	 *
+	 * It's somewhat ugly to have a behavioral difference based on
+	 * track_functions being enabled/disabled. But it seems acceptable, given
+	 * that there's already behavioral differences depending on whether the
+	 * function is present in the caches etc.
+	 *
+	 * For correctness it'd be sufficient to set ->dropped to true. However,
+	 * the accepted invalidation would commonly cause "low level" failures in
+	 * PL code, with an OID in the error message. Making this harder to
+	 * test...
+	 */
+	if (created_entry)
+	{
+		AcceptInvalidationMessages();
+		if (!SearchSysCacheExists1(PROCOID, ObjectIdGetDatum(fcinfo->flinfo->fn_oid)))
+		{
+			pgstat_drop_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId,
+							  fcinfo->flinfo->fn_oid);
+			ereport(ERROR, errcode(ERRCODE_UNDEFINED_FUNCTION),
+					errmsg("function call to dropped function"));
+		}
+	}
+
+	pending = entry_ref->pending;
+
+	fcu->fs = &pending->f_counts;
+
+	/* save stats for this function, later used to compensate for recursion */
+	fcu->save_f_total_time = pending->f_counts.f_total_time;
+
+	/* save current backend-wide total time */
+	fcu->save_total = total_func_time;
+
+	/* get clock time as of function start */
+	INSTR_TIME_SET_CURRENT(fcu->f_start);
+}
+
+/*
+ * Calculate function call usage and update stat counters.
+ * Called by the executor after invoking a function.
+ *
+ * In the case of a set-returning function that runs in value-per-call mode,
+ * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage
+ * calls for what the user considers a single call of the function. The
+ * finalize flag should be TRUE on the last call.
+ *
+ * Note that f_numcalls is only bumped when finalize is true, so a
+ * value-per-call SRF counts as one call.
+ */
+void
+pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize)
+{
+	PgStat_FunctionCounts *fs = fcu->fs;
+	instr_time	f_total;
+	instr_time	f_others;
+	instr_time	f_self;
+
+	/* stats not wanted? */
+	if (fs == NULL)
+		return;
+
+	/* total elapsed time in this function call */
+	INSTR_TIME_SET_CURRENT(f_total);
+	INSTR_TIME_SUBTRACT(f_total, fcu->f_start);
+
+	/* self usage: elapsed minus anything already charged to other calls */
+	f_others = total_func_time;
+	INSTR_TIME_SUBTRACT(f_others, fcu->save_total);
+	f_self = f_total;
+	INSTR_TIME_SUBTRACT(f_self, f_others);
+
+	/* update backend-wide total time */
+	INSTR_TIME_ADD(total_func_time, f_self);
+
+	/*
+	 * Compute the new f_total_time as the total elapsed time added to the
+	 * pre-call value of f_total_time. This is necessary to avoid
+	 * double-counting any time taken by recursive calls of myself. (We do
+	 * not need any similar kluge for self time, since that already excludes
+	 * any recursive calls.)
+	 */
+	INSTR_TIME_ADD(f_total, fcu->save_f_total_time);
+
+	/* update counters in function stats table */
+	if (finalize)
+		fs->f_numcalls++;
+	fs->f_total_time = f_total;
+	INSTR_TIME_ADD(fs->f_self_time, f_self);
+}
+
+/*
+ * Flush out pending stats for the entry
+ *
+ * If nowait is true, this function returns false if the lock could not be
+ * acquired immediately; otherwise true is returned.
+ */
+bool
+pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait)
+{
+	PgStat_BackendFunctionEntry *localent;
+	PgStatShared_Function *shfuncent;
+
+	localent = (PgStat_BackendFunctionEntry *) entry_ref->pending;
+	shfuncent = (PgStatShared_Function *) entry_ref->shared_stats;
+
+	/* localent always has non-zero content */
+
+	if (!pgstat_lock_entry(entry_ref, nowait))
+		return false;
+
+	/* shared times are kept in microseconds, local ones as instr_time */
+	shfuncent->stats.f_numcalls += localent->f_counts.f_numcalls;
+	shfuncent->stats.f_total_time +=
+		INSTR_TIME_GET_MICROSEC(localent->f_counts.f_total_time);
+	shfuncent->stats.f_self_time +=
+		INSTR_TIME_GET_MICROSEC(localent->f_counts.f_self_time);
+
+	pgstat_unlock_entry(entry_ref);
+
+	return true;
+}
+
+/*
+ * find any existing PgStat_BackendFunctionEntry entry for specified function
+ *
+ * If no entry, return NULL, don't create a new one
+ */
+PgStat_BackendFunctionEntry *
+find_funcstat_entry(Oid func_id)
+{
+	PgStat_EntryRef *entry_ref;
+
+	entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id);
+
+	if (entry_ref)
+		return entry_ref->pending;
+	return NULL;
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * the collected statistics for one function or NULL.
+ */
+PgStat_StatFuncEntry *
+pgstat_fetch_stat_funcentry(Oid func_id)
+{
+	return (PgStat_StatFuncEntry *)
+		pgstat_fetch_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id);
+}
diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c
new file mode 100644
index 0000000..a846d9f
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_relation.c
@@ -0,0 +1,938 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_relation.c
+ * Implementation of relation statistics.
+ *
+ * This file contains the implementation of relation statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_relation.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "catalog/partition.h"
+#include "postmaster/autovacuum.h"
+#include "utils/memutils.h"
+#include "utils/pgstat_internal.h"
+#include "utils/rel.h"
+#include "utils/timestamp.h"
+
+
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+	PgStat_Counter tuples_inserted; /* tuples inserted in xact */
+	PgStat_Counter tuples_updated;	/* tuples updated in xact */
+	PgStat_Counter tuples_deleted;	/* tuples deleted in xact */
+	/* tuples i/u/d prior to truncate/drop */
+	PgStat_Counter inserted_pre_truncdrop;
+	PgStat_Counter updated_pre_truncdrop;
+	PgStat_Counter deleted_pre_truncdrop;
+	Oid			t_id;			/* table's OID */
+	bool		t_shared;		/* is it a shared catalog? */
+	bool		t_truncdropped; /* was the relation truncated/dropped? */
+} TwoPhasePgStatRecord;
+
+
+/* local helpers, defined below in this file */
+static PgStat_TableStatus *pgstat_prep_relation_pending(Oid rel_id, bool isshared);
+static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level);
+static void ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info);
+static void save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop);
+static void restore_truncdrop_counters(PgStat_TableXactStatus *trans);
+
+
+/*
+ * Copy stats between relations. This is used for things like REINDEX
+ * CONCURRENTLY.
+ *
+ * If the source has no stats yet, the destination is left untouched.
+ */
+void
+pgstat_copy_relation_stats(Relation dst, Relation src)
+{
+	PgStat_StatTabEntry *srcstats;
+	PgStatShared_Relation *dstshstats;
+	PgStat_EntryRef *dst_ref;
+
+	srcstats = pgstat_fetch_stat_tabentry_ext(src->rd_rel->relisshared,
+											  RelationGetRelid(src));
+	if (!srcstats)
+		return;
+
+	/* shared relations are keyed by InvalidOid rather than MyDatabaseId */
+	dst_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION,
+										  dst->rd_rel->relisshared ? InvalidOid : MyDatabaseId,
+										  RelationGetRelid(dst),
+										  false);
+
+	dstshstats = (PgStatShared_Relation *) dst_ref->shared_stats;
+	dstshstats->stats = *srcstats;
+
+	pgstat_unlock_entry(dst_ref);
+}
+
+/*
+ * Initialize a relcache entry to count access statistics. Called whenever a
+ * relation is opened.
+ *
+ * We assume that a relcache entry's pgstat_info field is zeroed by relcache.c
+ * when the relcache entry is made; thereafter it is long-lived data.
+ *
+ * This does not create a reference to a stats entry in shared memory, nor
+ * allocate memory for the pending stats. That happens in
+ * pgstat_assoc_relation().
+ */
+void
+pgstat_init_relation(Relation rel)
+{
+	char		relkind = rel->rd_rel->relkind;
+
+	/*
+	 * We only count stats for relations with storage and partitioned tables
+	 */
+	if (!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE)
+	{
+		rel->pgstat_enabled = false;
+		rel->pgstat_info = NULL;
+		return;
+	}
+
+	if (!pgstat_track_counts)
+	{
+		/* drop a stale link left over from when counting was enabled */
+		if (rel->pgstat_info)
+			pgstat_unlink_relation(rel);
+
+		/* We're not counting at all */
+		rel->pgstat_enabled = false;
+		rel->pgstat_info = NULL;
+		return;
+	}
+
+	rel->pgstat_enabled = true;
+}
+
+/*
+ * Prepare for statistics for this relation to be collected.
+ *
+ * This ensures we have a reference to the stats entry before stats can be
+ * generated. That is important because a relation drop in another connection
+ * could otherwise lead to the stats entry being dropped, which then later
+ * would get recreated when flushing stats.
+ *
+ * This is separate from pgstat_init_relation() as it is not uncommon for
+ * relcache entries to be opened without ever getting stats reported.
+ */
+void
+pgstat_assoc_relation(Relation rel)
+{
+	Assert(rel->pgstat_enabled);
+	Assert(rel->pgstat_info == NULL);
+
+	/* Else find or make the PgStat_TableStatus entry, and update link */
+	rel->pgstat_info = pgstat_prep_relation_pending(RelationGetRelid(rel),
+													rel->rd_rel->relisshared);
+
+	/* don't allow link a stats to multiple relcache entries */
+	Assert(rel->pgstat_info->relation == NULL);
+
+	/* mark this relation as the owner */
+	rel->pgstat_info->relation = rel;
+}
+
+/*
+ * Break the mutual link between a relcache entry and pending stats entry.
+ * This must be called whenever one end of the link is removed.
+ */
+void
+pgstat_unlink_relation(Relation rel)
+{
+	/* remove the link to stats info if any */
+	if (rel->pgstat_info == NULL)
+		return;
+
+	/* link sanity check */
+	Assert(rel->pgstat_info->relation == rel);
+	rel->pgstat_info->relation = NULL;
+	rel->pgstat_info = NULL;
+}
+
+/*
+ * Ensure that stats are dropped if transaction aborts.
+ */
+void
+pgstat_create_relation(Relation rel)
+{
+	pgstat_create_transactional(PGSTAT_KIND_RELATION,
+								rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId,
+								RelationGetRelid(rel));
+}
+
+/*
+ * Ensure that stats are dropped if transaction commits.
+ *
+ * Also transactionally zeroes the relation's per-xact tuple counters, so
+ * that pg_stat_xact_all_tables shows 0 for a dropped relation.
+ */
+void
+pgstat_drop_relation(Relation rel)
+{
+	int			nest_level = GetCurrentTransactionNestLevel();
+	PgStat_TableStatus *pgstat_info;
+
+	pgstat_drop_transactional(PGSTAT_KIND_RELATION,
+							  rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId,
+							  RelationGetRelid(rel));
+
+	if (!pgstat_should_count_relation(rel))
+		return;
+
+	/*
+	 * Transactionally set counters to 0. That ensures that accesses to
+	 * pg_stat_xact_all_tables inside the transaction show 0.
+	 */
+	pgstat_info = rel->pgstat_info;
+	if (pgstat_info->trans &&
+		pgstat_info->trans->nest_level == nest_level)
+	{
+		/* stash the current counters so an abort can restore them */
+		save_truncdrop_counters(pgstat_info->trans, true);
+		pgstat_info->trans->tuples_inserted = 0;
+		pgstat_info->trans->tuples_updated = 0;
+		pgstat_info->trans->tuples_deleted = 0;
+	}
+}
+
+/*
+ * Report that the table was just vacuumed.
+ *
+ * "shared" selects the shared-catalog stats namespace; livetuples and
+ * deadtuples become the new absolute n_live_tuples/n_dead_tuples values.
+ */
+void
+pgstat_report_vacuum(Oid tableoid, bool shared,
+					 PgStat_Counter livetuples, PgStat_Counter deadtuples)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStatShared_Relation *shtabentry;
+	PgStat_StatTabEntry *tabentry;
+	Oid			dboid = (shared ? InvalidOid : MyDatabaseId);
+	TimestampTz ts;
+
+	if (!pgstat_track_counts)
+		return;
+
+	/* Store the data in the table's hash table entry. */
+	ts = GetCurrentTimestamp();
+
+	/* block acquiring lock for the same reason as pgstat_report_autovac() */
+	entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION,
+											dboid, tableoid, false);
+
+	shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats;
+	tabentry = &shtabentry->stats;
+
+	/* these are absolute values, not increments */
+	tabentry->n_live_tuples = livetuples;
+	tabentry->n_dead_tuples = deadtuples;
+
+	/*
+	 * It is quite possible that a non-aggressive VACUUM ended up skipping
+	 * various pages, however, we'll zero the insert counter here regardless.
+	 * It's currently used only to track when we need to perform an "insert"
+	 * autovacuum, which are mainly intended to freeze newly inserted tuples.
+	 * Zeroing this may just mean we'll not try to vacuum the table again
+	 * until enough tuples have been inserted to trigger another insert
+	 * autovacuum. An anti-wraparound autovacuum will catch any persistent
+	 * stragglers.
+	 */
+	tabentry->inserts_since_vacuum = 0;
+
+	if (IsAutoVacuumWorkerProcess())
+	{
+		tabentry->autovac_vacuum_timestamp = ts;
+		tabentry->autovac_vacuum_count++;
+	}
+	else
+	{
+		tabentry->vacuum_timestamp = ts;
+		tabentry->vacuum_count++;
+	}
+
+	pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Report that the table was just analyzed.
+ *
+ * Caller must provide new live- and dead-tuples estimates, as well as a
+ * flag indicating whether to reset the changes_since_analyze counter.
+ */
+void
+pgstat_report_analyze(Relation rel,
+					  PgStat_Counter livetuples, PgStat_Counter deadtuples,
+					  bool resetcounter)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStatShared_Relation *shtabentry;
+	PgStat_StatTabEntry *tabentry;
+	Oid			dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId);
+
+	if (!pgstat_track_counts)
+		return;
+
+	/*
+	 * Unlike VACUUM, ANALYZE might be running inside a transaction that has
+	 * already inserted and/or deleted rows in the target table. ANALYZE will
+	 * have counted such rows as live or dead respectively. Because we will
+	 * report our counts of such rows at transaction end, we should subtract
+	 * off these counts from the update we're making now, else they'll be
+	 * double-counted after commit. (This approach also ensures that the
+	 * shared stats entry ends up with the right numbers if we abort instead
+	 * of committing.)
+	 *
+	 * Waste no time on partitioned tables, though.
+	 */
+	if (pgstat_should_count_relation(rel) &&
+		rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+	{
+		PgStat_TableXactStatus *trans;
+
+		/* walk the whole subtransaction stack for this table */
+		for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
+		{
+			livetuples -= trans->tuples_inserted - trans->tuples_deleted;
+			deadtuples -= trans->tuples_updated + trans->tuples_deleted;
+		}
+		/* count stuff inserted by already-aborted subxacts, too */
+		deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples;
+		/* Since ANALYZE's counts are estimates, we could have underflowed */
+		livetuples = Max(livetuples, 0);
+		deadtuples = Max(deadtuples, 0);
+	}
+
+	/* block acquiring lock for the same reason as pgstat_report_autovac() */
+	entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid,
+											RelationGetRelid(rel),
+											false);
+	/* can't get dropped while accessed */
+	Assert(entry_ref != NULL && entry_ref->shared_stats != NULL);
+
+	shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats;
+	tabentry = &shtabentry->stats;
+
+	tabentry->n_live_tuples = livetuples;
+	tabentry->n_dead_tuples = deadtuples;
+
+	/*
+	 * If commanded, reset changes_since_analyze to zero. This forgets any
+	 * changes that were committed while the ANALYZE was in progress, but we
+	 * have no good way to estimate how many of those there were.
+	 */
+	if (resetcounter)
+		tabentry->changes_since_analyze = 0;
+
+	if (IsAutoVacuumWorkerProcess())
+	{
+		tabentry->autovac_analyze_timestamp = GetCurrentTimestamp();
+		tabentry->autovac_analyze_count++;
+	}
+	else
+	{
+		tabentry->analyze_timestamp = GetCurrentTimestamp();
+		tabentry->analyze_count++;
+	}
+
+	pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * count a tuple insertion of n tuples
+ */
+void
+pgstat_count_heap_insert(Relation rel, PgStat_Counter n)
+{
+	if (pgstat_should_count_relation(rel))
+	{
+		PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+		/* make sure a subxact-level entry exists before counting */
+		ensure_tabstat_xact_level(pgstat_info);
+		pgstat_info->trans->tuples_inserted += n;
+	}
+}
+
+/*
+ * count a tuple update
+ *
+ * "hot" indicates a HOT update, which is additionally counted in the
+ * nontransactional t_tuples_hot_updated counter.
+ */
+void
+pgstat_count_heap_update(Relation rel, bool hot)
+{
+	if (pgstat_should_count_relation(rel))
+	{
+		PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+		ensure_tabstat_xact_level(pgstat_info);
+		pgstat_info->trans->tuples_updated++;
+
+		/* t_tuples_hot_updated is nontransactional, so just advance it */
+		if (hot)
+			pgstat_info->t_counts.t_tuples_hot_updated++;
+	}
+}
+
+/*
+ * count a tuple deletion
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+	if (pgstat_should_count_relation(rel))
+	{
+		PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+		ensure_tabstat_xact_level(pgstat_info);
+		pgstat_info->trans->tuples_deleted++;
+	}
+}
+
+/*
+ * update tuple counters due to truncate
+ */
+void
+pgstat_count_truncate(Relation rel)
+{
+	if (pgstat_should_count_relation(rel))
+	{
+		PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+		ensure_tabstat_xact_level(pgstat_info);
+		/* stash the current counters so an abort can restore them */
+		save_truncdrop_counters(pgstat_info->trans, false);
+		pgstat_info->trans->tuples_inserted = 0;
+		pgstat_info->trans->tuples_updated = 0;
+		pgstat_info->trans->tuples_deleted = 0;
+	}
+}
+
+/*
+ * update dead-tuples count
+ *
+ * The semantics of this are that we are reporting the nontransactional
+ * recovery of "delta" dead tuples; so t_delta_dead_tuples decreases
+ * rather than increasing, and the change goes straight into the per-table
+ * counter, not into transactional state.
+ */
+void
+pgstat_update_heap_dead_tuples(Relation rel, int delta)
+{
+	if (pgstat_should_count_relation(rel))
+	{
+		PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+		pgstat_info->t_counts.t_delta_dead_tuples -= delta;
+	}
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * the collected statistics for one table or NULL. NULL doesn't mean
+ * that the table doesn't exist, just that there are no statistics, so the
+ * caller is better off to report ZERO instead.
+ */
+PgStat_StatTabEntry *
+pgstat_fetch_stat_tabentry(Oid relid)
+{
+	PgStat_StatTabEntry *result;
+
+	/* Look among this database's tables first ... */
+	result = pgstat_fetch_stat_tabentry_ext(false, relid);
+
+	/* ... and if that fails, it might be a shared catalog. */
+	if (result == NULL)
+		result = pgstat_fetch_stat_tabentry_ext(true, relid);
+
+	return result;
+}
+
+/*
+ * More efficient version of pgstat_fetch_stat_tabentry(), allowing to specify
+ * whether the to-be-accessed table is a shared relation or not.
+ */
+PgStat_StatTabEntry *
+pgstat_fetch_stat_tabentry_ext(bool shared, Oid reloid)
+{
+	Oid			dboid;
+
+	/* shared relations are tracked under the invalid database oid */
+	if (shared)
+		dboid = InvalidOid;
+	else
+		dboid = MyDatabaseId;
+
+	return (PgStat_StatTabEntry *)
+		pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, reloid);
+}
+
+/*
+ * find any existing PgStat_TableStatus entry for rel
+ *
+ * Find any existing PgStat_TableStatus entry for rel_id in the current
+ * database. If not found, try finding from shared tables.
+ *
+ * If no entry found, return NULL, don't create a new one
+ */
+PgStat_TableStatus *
+find_tabstat_entry(Oid rel_id)
+{
+	PgStat_EntryRef *ref;
+
+	/* first look among this database's tables */
+	ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, MyDatabaseId, rel_id);
+
+	/* failing that, it could be a shared catalog */
+	if (ref == NULL)
+		ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, InvalidOid, rel_id);
+
+	return (ref != NULL) ? ref->pending : NULL;
+}
+
+/*
+ * Perform relation stats specific end-of-transaction work. Helper for
+ * AtEOXact_PgStat.
+ *
+ * Transfer transactional insert/update counts into the base tabstat entries.
+ * We don't bother to free any of the transactional state, since it's all in
+ * TopTransactionContext and will go away anyway.
+ */
+void
+AtEOXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit)
+{
+	PgStat_TableXactStatus *trans;
+
+	for (trans = xact_state->first; trans != NULL; trans = trans->next)
+	{
+		PgStat_TableStatus *tabstat;
+
+		/* at top-level commit/abort, only nest level 1 entries remain */
+		Assert(trans->nest_level == 1);
+		Assert(trans->upper == NULL);
+		tabstat = trans->parent;
+		Assert(tabstat->trans == trans);
+		/* restore pre-truncate/drop stats (if any) in case of aborted xact */
+		if (!isCommit)
+			restore_truncdrop_counters(trans);
+		/* count attempted actions regardless of commit/abort */
+		tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
+		tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
+		tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
+		if (isCommit)
+		{
+			tabstat->t_counts.t_truncdropped = trans->truncdropped;
+			if (trans->truncdropped)
+			{
+				/* forget live/dead stats seen by backend thus far */
+				tabstat->t_counts.t_delta_live_tuples = 0;
+				tabstat->t_counts.t_delta_dead_tuples = 0;
+			}
+			/* insert adds a live tuple, delete removes one */
+			tabstat->t_counts.t_delta_live_tuples +=
+				trans->tuples_inserted - trans->tuples_deleted;
+			/* update and delete each create a dead tuple */
+			tabstat->t_counts.t_delta_dead_tuples +=
+				trans->tuples_updated + trans->tuples_deleted;
+			/* insert, update, delete each count as one change event */
+			tabstat->t_counts.t_changed_tuples +=
+				trans->tuples_inserted + trans->tuples_updated +
+				trans->tuples_deleted;
+		}
+		else
+		{
+			/* inserted tuples are dead, deleted tuples are unaffected */
+			tabstat->t_counts.t_delta_dead_tuples +=
+				trans->tuples_inserted + trans->tuples_updated;
+			/* an aborted xact generates no changed_tuple events */
+		}
+		/* disconnect the (soon to be discarded) transactional state */
+		tabstat->trans = NULL;
+	}
+}
+
+/*
+ * Perform relation stats specific end-of-sub-transaction work. Helper for
+ * AtEOSubXact_PgStat.
+ *
+ * Transfer transactional insert/update counts into the next higher
+ * subtransaction state.
+ */
+void
+AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, int nestDepth)
+{
+	PgStat_TableXactStatus *trans;
+	PgStat_TableXactStatus *next_trans;
+
+	/* trans may be freed or re-linked below, so save its successor first */
+	for (trans = xact_state->first; trans != NULL; trans = next_trans)
+	{
+		PgStat_TableStatus *tabstat;
+
+		next_trans = trans->next;
+		Assert(trans->nest_level == nestDepth);
+		tabstat = trans->parent;
+		Assert(tabstat->trans == trans);
+
+		if (isCommit)
+		{
+			if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+			{
+				if (trans->truncdropped)
+				{
+					/* propagate the truncate/drop status one level up */
+					save_truncdrop_counters(trans->upper, false);
+					/* replace upper xact stats with ours */
+					trans->upper->tuples_inserted = trans->tuples_inserted;
+					trans->upper->tuples_updated = trans->tuples_updated;
+					trans->upper->tuples_deleted = trans->tuples_deleted;
+				}
+				else
+				{
+					trans->upper->tuples_inserted += trans->tuples_inserted;
+					trans->upper->tuples_updated += trans->tuples_updated;
+					trans->upper->tuples_deleted += trans->tuples_deleted;
+				}
+				tabstat->trans = trans->upper;
+				pfree(trans);
+			}
+			else
+			{
+				/*
+				 * When there isn't an immediate parent state, we can just
+				 * reuse the record instead of going through a palloc/pfree
+				 * pushup (this works since it's all in TopTransactionContext
+				 * anyway). We have to re-link it into the parent level,
+				 * though, and that might mean pushing a new entry into the
+				 * pgStatXactStack.
+				 */
+				PgStat_SubXactStatus *upper_xact_state;
+
+				upper_xact_state = pgstat_get_xact_stack_level(nestDepth - 1);
+				trans->next = upper_xact_state->first;
+				upper_xact_state->first = trans;
+				trans->nest_level = nestDepth - 1;
+			}
+		}
+		else
+		{
+			/*
+			 * On abort, update top-level tabstat counts, then forget the
+			 * subtransaction
+			 */
+
+			/* first restore values obliterated by truncate/drop */
+			restore_truncdrop_counters(trans);
+			/* count attempted actions regardless of commit/abort */
+			tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
+			tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
+			tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
+			/* inserted tuples are dead, deleted tuples are unaffected */
+			tabstat->t_counts.t_delta_dead_tuples +=
+				trans->tuples_inserted + trans->tuples_updated;
+			tabstat->trans = trans->upper;
+			pfree(trans);
+		}
+	}
+}
+
+/*
+ * Generate 2PC records for all the pending transaction-dependent relation
+ * stats.
+ *
+ * One TwoPhasePgStatRecord is emitted per modified relation, carrying the
+ * counters needed to replay either the commit or the abort case later.
+ */
+void
+AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state)
+{
+	PgStat_TableXactStatus *trans;
+
+	for (trans = xact_state->first; trans != NULL; trans = trans->next)
+	{
+		PgStat_TableStatus *tabstat PG_USED_FOR_ASSERTS_ONLY;
+		TwoPhasePgStatRecord record;
+
+		/* subtransactions must already have been merged into level 1 */
+		Assert(trans->nest_level == 1);
+		Assert(trans->upper == NULL);
+		tabstat = trans->parent;
+		Assert(tabstat->trans == trans);
+
+		record.tuples_inserted = trans->tuples_inserted;
+		record.tuples_updated = trans->tuples_updated;
+		record.tuples_deleted = trans->tuples_deleted;
+		record.inserted_pre_truncdrop = trans->inserted_pre_truncdrop;
+		record.updated_pre_truncdrop = trans->updated_pre_truncdrop;
+		record.deleted_pre_truncdrop = trans->deleted_pre_truncdrop;
+		record.t_id = tabstat->t_id;
+		record.t_shared = tabstat->t_shared;
+		record.t_truncdropped = trans->truncdropped;
+
+		RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+							   &record, sizeof(TwoPhasePgStatRecord));
+	}
+}
+
+/*
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state. The nontransactional action counts will be
+ * reported to the stats system immediately, while the effects on live and
+ * dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat_Relations is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state)
+{
+	PgStat_TableXactStatus *trans;
+
+	for (trans = xact_state->first; trans != NULL; trans = trans->next)
+	{
+		PgStat_TableStatus *tabstat;
+
+		/* no need to free the state, it's in TopTransactionContext */
+		tabstat = trans->parent;
+		tabstat->trans = NULL;
+	}
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+						   void *recdata, uint32 len)
+{
+	TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+	PgStat_TableStatus *pgstat_info;
+
+	/* Find or create a tabstat entry for the rel */
+	pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared);
+
+	/* Same math as in AtEOXact_PgStat, commit case */
+	pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
+	pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
+	pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
+	pgstat_info->t_counts.t_truncdropped = rec->t_truncdropped;
+	if (rec->t_truncdropped)
+	{
+		/* forget live/dead stats seen by backend thus far */
+		pgstat_info->t_counts.t_delta_live_tuples = 0;
+		pgstat_info->t_counts.t_delta_dead_tuples = 0;
+	}
+	/* insert adds a live tuple, delete removes one */
+	pgstat_info->t_counts.t_delta_live_tuples +=
+		rec->tuples_inserted - rec->tuples_deleted;
+	/* update and delete each create a dead tuple */
+	pgstat_info->t_counts.t_delta_dead_tuples +=
+		rec->tuples_updated + rec->tuples_deleted;
+	/* insert, update, delete each count as one change event */
+	pgstat_info->t_counts.t_changed_tuples +=
+		rec->tuples_inserted + rec->tuples_updated +
+		rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+						  void *recdata, uint32 len)
+{
+	TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+	PgStat_TableStatus *pgstat_info;
+
+	/* Find or create a tabstat entry for the rel */
+	pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared);
+
+	/* Same math as in AtEOXact_PgStat, abort case */
+	if (rec->t_truncdropped)
+	{
+		/* restore pre-truncate/drop counters; modifies rec in place */
+		rec->tuples_inserted = rec->inserted_pre_truncdrop;
+		rec->tuples_updated = rec->updated_pre_truncdrop;
+		rec->tuples_deleted = rec->deleted_pre_truncdrop;
+	}
+	pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
+	pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
+	pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
+	/* inserted tuples are dead, deleted tuples are unaffected */
+	pgstat_info->t_counts.t_delta_dead_tuples +=
+		rec->tuples_inserted + rec->tuples_updated;
+}
+
+/*
+ * Flush out pending stats for the entry
+ *
+ * If nowait is true, this function returns false if the lock could not be
+ * acquired immediately, otherwise true is returned.
+ *
+ * Some of the stats are copied to the corresponding pending database stats
+ * entry when successfully flushing.
+ */
+bool
+pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait)
+{
+	static const PgStat_TableCounts all_zeroes;
+	Oid			dboid;
+	PgStat_TableStatus *lstats; /* pending stats entry  */
+	PgStatShared_Relation *shtabstats;
+	PgStat_StatTabEntry *tabentry;	/* table entry of shared stats */
+	PgStat_StatDBEntry *dbentry;	/* pending database entry */
+
+	dboid = entry_ref->shared_entry->key.dboid;
+	lstats = (PgStat_TableStatus *) entry_ref->pending;
+	shtabstats = (PgStatShared_Relation *) entry_ref->shared_stats;
+
+	/*
+	 * Ignore entries that didn't accumulate any actual counts, such as
+	 * indexes that were opened by the planner but not used.
+	 */
+	if (memcmp(&lstats->t_counts, &all_zeroes,
+			   sizeof(PgStat_TableCounts)) == 0)
+	{
+		return true;
+	}
+
+	if (!pgstat_lock_entry(entry_ref, nowait))
+		return false;
+
+	/* add the values to the shared entry. */
+	tabentry = &shtabstats->stats;
+
+	tabentry->numscans += lstats->t_counts.t_numscans;
+	tabentry->tuples_returned += lstats->t_counts.t_tuples_returned;
+	tabentry->tuples_fetched += lstats->t_counts.t_tuples_fetched;
+	tabentry->tuples_inserted += lstats->t_counts.t_tuples_inserted;
+	tabentry->tuples_updated += lstats->t_counts.t_tuples_updated;
+	tabentry->tuples_deleted += lstats->t_counts.t_tuples_deleted;
+	tabentry->tuples_hot_updated += lstats->t_counts.t_tuples_hot_updated;
+
+	/*
+	 * If table was truncated/dropped, first reset the live/dead counters.
+	 */
+	if (lstats->t_counts.t_truncdropped)
+	{
+		tabentry->n_live_tuples = 0;
+		tabentry->n_dead_tuples = 0;
+		tabentry->inserts_since_vacuum = 0;
+	}
+
+	tabentry->n_live_tuples += lstats->t_counts.t_delta_live_tuples;
+	tabentry->n_dead_tuples += lstats->t_counts.t_delta_dead_tuples;
+	tabentry->changes_since_analyze += lstats->t_counts.t_changed_tuples;
+	tabentry->inserts_since_vacuum += lstats->t_counts.t_tuples_inserted;
+	tabentry->blocks_fetched += lstats->t_counts.t_blocks_fetched;
+	tabentry->blocks_hit += lstats->t_counts.t_blocks_hit;
+
+	/* Clamp n_live_tuples in case of negative delta_live_tuples */
+	tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
+	/* Likewise for n_dead_tuples */
+	tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
+
+	pgstat_unlock_entry(entry_ref);
+
+	/* The entry was successfully flushed, add the same to database stats */
+	dbentry = pgstat_prep_database_pending(dboid);
+	dbentry->n_tuples_returned += lstats->t_counts.t_tuples_returned;
+	dbentry->n_tuples_fetched += lstats->t_counts.t_tuples_fetched;
+	dbentry->n_tuples_inserted += lstats->t_counts.t_tuples_inserted;
+	dbentry->n_tuples_updated += lstats->t_counts.t_tuples_updated;
+	dbentry->n_tuples_deleted += lstats->t_counts.t_tuples_deleted;
+	dbentry->n_blocks_fetched += lstats->t_counts.t_blocks_fetched;
+	dbentry->n_blocks_hit += lstats->t_counts.t_blocks_hit;
+
+	return true;
+}
+
+/*
+ * Callback invoked when a pending relation stats entry is discarded;
+ * break the Relation -> pending-stats link so the relcache entry does not
+ * point at freed memory.
+ */
+void
+pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref)
+{
+	PgStat_TableStatus *pending = (PgStat_TableStatus *) entry_ref->pending;
+
+	if (pending->relation)
+		pgstat_unlink_relation(pending->relation);
+}
+
+/*
+ * Find or create a PgStat_TableStatus entry for rel. New entry is created and
+ * initialized if not exists.
+ */
+static PgStat_TableStatus *
+pgstat_prep_relation_pending(Oid rel_id, bool isshared)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStat_TableStatus *pending;
+
+	/* shared rels are keyed by InvalidOid rather than the current database */
+	entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_RELATION,
+										  isshared ? InvalidOid : MyDatabaseId,
+										  rel_id, NULL);
+	pending = entry_ref->pending;
+	pending->t_id = rel_id;
+	pending->t_shared = isshared;
+
+	return pending;
+}
+
+/*
+ * add a new (sub)transaction state record
+ *
+ * The new record is pushed both onto the per-table stack (pgstat_info->trans)
+ * and onto the per-nest-level list (xact_state->first).
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+	PgStat_SubXactStatus *xact_state;
+	PgStat_TableXactStatus *trans;
+
+	/*
+	 * If this is the first rel to be modified at the current nest level, we
+	 * first have to push a transaction stack entry.
+	 */
+	xact_state = pgstat_get_xact_stack_level(nest_level);
+
+	/* Now make a per-table stack entry */
+	trans = (PgStat_TableXactStatus *)
+		MemoryContextAllocZero(TopTransactionContext,
+							   sizeof(PgStat_TableXactStatus));
+	trans->nest_level = nest_level;
+	trans->upper = pgstat_info->trans;
+	trans->parent = pgstat_info;
+	trans->next = xact_state->first;
+	xact_state->first = trans;
+	pgstat_info->trans = trans;
+}
+
+/*
+ * Add a new (sub)transaction record if needed.
+ *
+ * A new record is required whenever the table has no transactional state yet,
+ * or its topmost state belongs to an outer nesting level.
+ */
+static void
+ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info)
+{
+	int			nest_level = GetCurrentTransactionNestLevel();
+
+	if (pgstat_info->trans == NULL ||
+		pgstat_info->trans->nest_level != nest_level)
+		add_tabstat_xact_level(pgstat_info, nest_level);
+}
+
+/*
+ * Whenever a table is truncated/dropped, we save its i/u/d counters so that
+ * they can be cleared, and if the (sub)xact that executed the truncate/drop
+ * later aborts, the counters can be restored to the saved (pre-truncate/drop)
+ * values.
+ *
+ * Note that for truncate we do this on the first truncate in any particular
+ * subxact level only.
+ */
+static void
+save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop)
+{
+	/* a drop always overwrites any previously saved truncate state */
+	if (!trans->truncdropped || is_drop)
+	{
+		trans->inserted_pre_truncdrop = trans->tuples_inserted;
+		trans->updated_pre_truncdrop = trans->tuples_updated;
+		trans->deleted_pre_truncdrop = trans->tuples_deleted;
+		trans->truncdropped = true;
+	}
+}
+
+/*
+ * restore counters when a truncate/drop aborts
+ */
+static void
+restore_truncdrop_counters(PgStat_TableXactStatus *trans)
+{
+	/* only act if a truncate/drop actually saved counters at this level */
+	if (trans->truncdropped)
+	{
+		trans->tuples_inserted = trans->inserted_pre_truncdrop;
+		trans->tuples_updated = trans->updated_pre_truncdrop;
+		trans->tuples_deleted = trans->deleted_pre_truncdrop;
+	}
+}
diff --git a/src/backend/utils/activity/pgstat_replslot.c b/src/backend/utils/activity/pgstat_replslot.c
new file mode 100644
index 0000000..1a8473b
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_replslot.c
@@ -0,0 +1,224 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_replslot.c
+ * Implementation of replication slot statistics.
+ *
+ * This file contains the implementation of replication slot statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Replication slot stats work a bit differently than other
+ * variable-numbered stats. Slots do not have oids (so they can be created on
+ * physical replicas). Use the slot index as object id while running. However,
+ * the slot index can change when restarting. That is addressed by using the
+ * name when (de-)serializing. After a restart it is possible for slots to
+ * have been dropped while shut down, which is addressed by not restoring
+ * stats for slots that cannot be found by name when starting up.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_replslot.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "replication/slot.h"
+#include "utils/builtins.h" /* for namestrcpy() */
+#include "utils/pgstat_internal.h"
+
+
+static int get_replslot_index(const char *name);
+
+
+/*
+ * Reset counters for a single replication slot.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+/*
+ * Reset counters for a single replication slot.
+ *
+ * Raises an ERROR if no slot with the given name exists; silently does
+ * nothing for physical slots, which carry no stats.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+void
+pgstat_reset_replslot(const char *name)
+{
+	ReplicationSlot *slot;
+
+	AssertArg(name != NULL);
+
+	/* Check if a slot exists with the given name. */
+	slot = SearchNamedReplicationSlot(name, true);
+
+	if (!slot)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("replication slot \"%s\" does not exist",
+						name)));
+
+	/*
+	 * Nothing to do for physical slots as we collect stats only for logical
+	 * slots.
+	 */
+	if (SlotIsPhysical(slot))
+		return;
+
+	/* reset this one entry */
+	pgstat_reset(PGSTAT_KIND_REPLSLOT, InvalidOid,
+				 ReplicationSlotIndex(slot));
+}
+
+/*
+ * Report replication slot statistics.
+ *
+ * We can rely on the stats for the slot to exist and to belong to this
+ * slot. We can only get here if pgstat_create_replslot() or
+ * pgstat_acquire_replslot() have already been called.
+ */
+void
+pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStatShared_ReplSlot *shstatent;
+	PgStat_StatReplSlotEntry *statent;
+
+	/* returns with the shared entry locked */
+	entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid,
+											ReplicationSlotIndex(slot), false);
+	shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats;
+	statent = &shstatent->stats;
+
+	/* Update the replication slot statistics */
+#define REPLSLOT_ACC(fld) statent->fld += repSlotStat->fld
+	REPLSLOT_ACC(spill_txns);
+	REPLSLOT_ACC(spill_count);
+	REPLSLOT_ACC(spill_bytes);
+	REPLSLOT_ACC(stream_txns);
+	REPLSLOT_ACC(stream_count);
+	REPLSLOT_ACC(stream_bytes);
+	REPLSLOT_ACC(total_txns);
+	REPLSLOT_ACC(total_bytes);
+#undef REPLSLOT_ACC
+
+	pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Report replication slot creation.
+ *
+ * NB: This gets called with ReplicationSlotAllocationLock already held, be
+ * careful about calling back into slot.c.
+ */
+void
+pgstat_create_replslot(ReplicationSlot *slot)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStatShared_ReplSlot *shstatent;
+
+	/* create the entry if it doesn't exist; returns with it locked */
+	entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid,
+											ReplicationSlotIndex(slot), false);
+	shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats;
+
+	/*
+	 * NB: need to accept that there might be stats from an older slot, e.g.
+	 * if we previously crashed after dropping a slot.
+	 */
+	memset(&shstatent->stats, 0, sizeof(shstatent->stats));
+
+	pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Report replication slot has been acquired.
+ *
+ * This guarantees that a stats entry exists during later
+ * pgstat_report_replslot() calls.
+ *
+ * If we previously crashed, no stats data exists. But if we did not crash,
+ * the stats do belong to this slot:
+ * - the stats cannot belong to a dropped slot, pgstat_drop_replslot() would
+ *   have been called
+ * - if the slot was removed while shut down,
+ *   pgstat_replslot_from_serialized_name_cb() returning false would have
+ *   caused the stats to be dropped
+ */
+void
+pgstat_acquire_replslot(ReplicationSlot *slot)
+{
+	/* create=true ensures the entry exists; the ref itself isn't needed */
+	pgstat_get_entry_ref(PGSTAT_KIND_REPLSLOT, InvalidOid,
+						 ReplicationSlotIndex(slot), true, NULL);
+}
+
+/*
+ * Report replication slot drop.
+ *
+ * Discards the slot's stats entry, keyed by its index within the shared
+ * slot array.
+ */
+void
+pgstat_drop_replslot(ReplicationSlot *slot)
+{
+	pgstat_drop_entry(PGSTAT_KIND_REPLSLOT, InvalidOid,
+					  ReplicationSlotIndex(slot));
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * a pointer to the replication slot statistics struct.
+ *
+ * Returns NULL if no slot with the given name currently exists.
+ */
+PgStat_StatReplSlotEntry *
+pgstat_fetch_replslot(NameData slotname)
+{
+	int			idx = get_replslot_index(NameStr(slotname));
+
+	if (idx == -1)
+		return NULL;
+
+	return (PgStat_StatReplSlotEntry *)
+		pgstat_fetch_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, idx);
+}
+
+/*
+ * Serialization callback: map a slot stats entry's key (the slot index) to
+ * the slot's name, which is stable across restarts while the index is not.
+ */
+void
+pgstat_replslot_to_serialized_name_cb(const PgStat_HashKey *key, const PgStatShared_Common *header, NameData *name)
+{
+	/*
+	 * This is only called late during shutdown. The set of existing slots
+	 * isn't allowed to change at this point, we can assume that a slot exists
+	 * at the offset.
+	 */
+	if (!ReplicationSlotName(key->objoid, name))
+		elog(ERROR, "could not find name for replication slot index %u",
+			 key->objoid);
+}
+
+/*
+ * Deserialization callback: map a serialized slot name back to a hash key.
+ * Returning false causes the stats for a no-longer-existing slot to be
+ * dropped rather than restored.
+ */
+bool
+pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key)
+{
+	int			idx = get_replslot_index(NameStr(*name));
+
+	/* slot might have been deleted */
+	if (idx == -1)
+		return false;
+
+	key->kind = PGSTAT_KIND_REPLSLOT;
+	key->dboid = InvalidOid;
+	key->objoid = idx;
+
+	return true;
+}
+
+/*
+ * Reset-timestamp callback: record when this slot's stats were last reset.
+ */
+void
+pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts)
+{
+	PgStatShared_ReplSlot *shstatent = (PgStatShared_ReplSlot *) header;
+
+	shstatent->stats.stat_reset_timestamp = ts;
+}
+
+/*
+ * Look up the index of the replication slot with the given name, or -1 if
+ * no such slot exists.
+ */
+static int
+get_replslot_index(const char *name)
+{
+	ReplicationSlot *slot;
+
+	AssertArg(name != NULL);
+
+	slot = SearchNamedReplicationSlot(name, true);
+
+	if (!slot)
+		return -1;
+
+	return ReplicationSlotIndex(slot);
+}
diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c
new file mode 100644
index 0000000..9a4f037
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_shmem.c
@@ -0,0 +1,1003 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_shmem.c
+ * Storage of stats entries in shared memory
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_shmem.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pgstat.h"
+#include "storage/shmem.h"
+#include "utils/memutils.h"
+#include "utils/pgstat_internal.h"
+
+
+#define PGSTAT_ENTRY_REF_HASH_SIZE 128
+
+/* hash table entry for finding the PgStat_EntryRef for a key */
+typedef struct PgStat_EntryRefHashEntry
+{
+ PgStat_HashKey key; /* hash key */
+ char status; /* for simplehash use */
+ PgStat_EntryRef *entry_ref;
+} PgStat_EntryRefHashEntry;
+
+
+/* for references to shared statistics entries */
+#define SH_PREFIX pgstat_entry_ref_hash
+#define SH_ELEMENT_TYPE PgStat_EntryRefHashEntry
+#define SH_KEY_TYPE PgStat_HashKey
+#define SH_KEY key
+#define SH_HASH_KEY(tb, key) \
+ pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL)
+#define SH_EQUAL(tb, a, b) \
+ pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0
+#define SH_SCOPE static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+
+static void pgstat_drop_database_and_contents(Oid dboid);
+
+static void pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat);
+
+static void pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, bool discard_pending);
+static bool pgstat_need_entry_refs_gc(void);
+static void pgstat_gc_entry_refs(void);
+static void pgstat_release_all_entry_refs(bool discard_pending);
+typedef bool (*ReleaseMatchCB) (PgStat_EntryRefHashEntry *, Datum data);
+static void pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, Datum match_data);
+
+static void pgstat_setup_memcxt(void);
+
+
+/* parameter for the shared hash */
+static const dshash_parameters dsh_params = {
+ sizeof(PgStat_HashKey),
+ sizeof(PgStatShared_HashEntry),
+ pgstat_cmp_hash_key,
+ pgstat_hash_hash_key,
+ LWTRANCHE_PGSTATS_HASH
+};
+
+
+/*
+ * Backend local references to shared stats entries. If there are pending
+ * updates to a stats entry, the PgStat_EntryRef is added to the pgStatPending
+ * list.
+ *
+ * When a stats entry is dropped each backend needs to release its reference
+ * to it before the memory can be released. To trigger that
+ * pgStatLocal.shmem->gc_request_count is incremented - which each backend
+ * compares to their copy of pgStatSharedRefAge on a regular basis.
+ */
+static pgstat_entry_ref_hash_hash *pgStatEntryRefHash = NULL;
+static int pgStatSharedRefAge = 0; /* cache age of pgStatShmLookupCache */
+
+/*
+ * Memory contexts containing the pgStatEntryRefHash table and the
+ * pgStatSharedRef entries respectively. Kept separate to make it easier to
+ * track / attribute memory usage.
+ */
+static MemoryContext pgStatSharedRefContext = NULL;
+static MemoryContext pgStatEntryRefHashContext = NULL;
+
+
+/* ------------------------------------------------------------
+ * Public functions called from postmaster follow
+ * ------------------------------------------------------------
+ */
+
+/*
+ * The size of the shared memory allocation for stats stored in the shared
+ * stats hash table. This allocation will be done as part of the main shared
+ * memory, rather than dynamic shared memory, allowing it to be initialized in
+ * postmaster.
+ */
+/*
+ * The size of the shared memory allocation for stats stored in the shared
+ * stats hash table. This allocation will be done as part of the main shared
+ * memory, rather than dynamic shared memory, allowing it to be initialized in
+ * postmaster.
+ */
+static Size
+pgstat_dsa_init_size(void)
+{
+	Size		sz;
+
+	/*
+	 * The dshash header / initial buckets array needs to fit into "plain"
+	 * shared memory, but it's beneficial to not need dsm segments
+	 * immediately. A size of 256kB seems to work well and is not
+	 * disproportional compared to other constant sized shared memory
+	 * allocations. NB: To avoid DSMs further, the user can configure
+	 * min_dynamic_shared_memory.
+	 */
+	sz = 256 * 1024;
+	Assert(dsa_minimum_size() <= sz);
+	return MAXALIGN(sz);
+}
+
+/*
+ * Compute shared memory space needed for cumulative statistics
+ *
+ * The control struct plus the in-place DSA area holding the shared hash.
+ */
+Size
+StatsShmemSize(void)
+{
+	Size		sz;
+
+	sz = MAXALIGN(sizeof(PgStat_ShmemControl));
+	sz = add_size(sz, pgstat_dsa_init_size());
+
+	return sz;
+}
+
+/*
+ * Initialize cumulative statistics system during startup
+ *
+ * Only the postmaster (or a standalone backend) performs the actual
+ * initialization; children merely locate the existing struct.
+ */
+void
+StatsShmemInit(void)
+{
+	bool		found;
+	Size		sz;
+
+	sz = StatsShmemSize();
+	pgStatLocal.shmem = (PgStat_ShmemControl *)
+		ShmemInitStruct("Shared Memory Stats", sz, &found);
+
+	if (!IsUnderPostmaster)
+	{
+		dsa_area   *dsa;
+		dshash_table *dsh;
+		PgStat_ShmemControl *ctl = pgStatLocal.shmem;
+		char	   *p = (char *) ctl;
+
+		Assert(!found);
+
+		/* the allocation of pgStatLocal.shmem itself */
+		p += MAXALIGN(sizeof(PgStat_ShmemControl));
+
+		/*
+		 * Create a small dsa allocation in plain shared memory. This is
+		 * required because postmaster cannot use dsm segments. It also
+		 * provides a small efficiency win.
+		 */
+		ctl->raw_dsa_area = p;
+		p += MAXALIGN(pgstat_dsa_init_size());
+		dsa = dsa_create_in_place(ctl->raw_dsa_area,
+								  pgstat_dsa_init_size(),
+								  LWTRANCHE_PGSTATS_DSA, 0);
+		dsa_pin(dsa);
+
+		/*
+		 * To ensure dshash is created in "plain" shared memory, temporarily
+		 * limit size of dsa to the initial size of the dsa.
+		 */
+		dsa_set_size_limit(dsa, pgstat_dsa_init_size());
+
+		/*
+		 * With the limit in place, create the dshash table. XXX: It'd be nice
+		 * if there were dshash_create_in_place().
+		 */
+		dsh = dshash_create(dsa, &dsh_params, 0);
+		ctl->hash_handle = dshash_get_hash_table_handle(dsh);
+
+		/* lift limit set above */
+		dsa_set_size_limit(dsa, -1);
+
+		/*
+		 * Postmaster will never access these again, thus free the local
+		 * dsa/dshash references.
+		 */
+		dshash_detach(dsh);
+		dsa_detach(dsa);
+
+		/* start at 1 so a zeroed backend-local age is always "stale" */
+		pg_atomic_init_u64(&ctl->gc_request_count, 1);
+
+
+		/* initialize fixed-numbered stats */
+		LWLockInitialize(&ctl->archiver.lock, LWTRANCHE_PGSTATS_DATA);
+		LWLockInitialize(&ctl->bgwriter.lock, LWTRANCHE_PGSTATS_DATA);
+		LWLockInitialize(&ctl->checkpointer.lock, LWTRANCHE_PGSTATS_DATA);
+		LWLockInitialize(&ctl->slru.lock, LWTRANCHE_PGSTATS_DATA);
+		LWLockInitialize(&ctl->wal.lock, LWTRANCHE_PGSTATS_DATA);
+	}
+	else
+	{
+		Assert(found);
+	}
+}
+
+/*
+ * Attach this backend to the shared stats DSA area and hash table created
+ * by StatsShmemInit(). The mappings live in TopMemoryContext and persist
+ * for the backend's lifetime.
+ */
+void
+pgstat_attach_shmem(void)
+{
+	MemoryContext oldcontext;
+
+	Assert(pgStatLocal.dsa == NULL);
+
+	/* stats shared memory persists for the backend lifetime */
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+	pgStatLocal.dsa = dsa_attach_in_place(pgStatLocal.shmem->raw_dsa_area,
+										  NULL);
+	dsa_pin_mapping(pgStatLocal.dsa);
+
+	pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params,
+											pgStatLocal.shmem->hash_handle, 0);
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Detach from the shared stats hash table and DSA area, releasing all of
+ * this backend's entry references first.
+ */
+void
+pgstat_detach_shmem(void)
+{
+	Assert(pgStatLocal.dsa);
+
+	/* we shouldn't leave references to shared stats */
+	pgstat_release_all_entry_refs(false);
+
+	dshash_detach(pgStatLocal.shared_hash);
+	pgStatLocal.shared_hash = NULL;
+
+	dsa_detach(pgStatLocal.dsa);
+	pgStatLocal.dsa = NULL;
+}
+
+
+/* ------------------------------------------------------------
+ * Maintenance of shared memory stats entries
+ * ------------------------------------------------------------
+ */
+
+/*
+ * Allocate and initialize the stats payload for a newly inserted shared
+ * hash entry, and link it from the hash entry. Returns the new payload.
+ */
+PgStatShared_Common *
+pgstat_init_entry(PgStat_Kind kind,
+				  PgStatShared_HashEntry *shhashent)
+{
+	/* Create new stats entry. */
+	dsa_pointer chunk;
+	PgStatShared_Common *shheader;
+
+	/*
+	 * Initialize refcount to 1, marking it as valid / not dropped. The entry
+	 * can't be freed before the initialization because it can't be found as
+	 * long as we hold the dshash partition lock. Caller needs to increase
+	 * further if a longer lived reference is needed.
+	 */
+	pg_atomic_init_u32(&shhashent->refcount, 1);
+	shhashent->dropped = false;
+
+	chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size);
+	shheader = dsa_get_address(pgStatLocal.dsa, chunk);
+	/* magic value used to cross-check validity of entries later on */
+	shheader->magic = 0xdeadbeef;
+
+	/* Link the new entry from the hash entry. */
+	shhashent->body = chunk;
+
+	LWLockInitialize(&shheader->lock, LWTRANCHE_PGSTATS_DATA);
+
+	return shheader;
+}
+
+/*
+ * Revive a previously dropped shared hash entry by clearing its dropped
+ * flag, taking a new reference, and zeroing the kind-specific stats data.
+ */
+static PgStatShared_Common *
+pgstat_reinit_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent)
+{
+	PgStatShared_Common *shheader;
+
+	shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body);
+
+	/* mark as not dropped anymore */
+	pg_atomic_fetch_add_u32(&shhashent->refcount, 1);
+	shhashent->dropped = false;
+
+	/* reinitialize content */
+	Assert(shheader->magic == 0xdeadbeef);
+	memset(pgstat_get_entry_data(kind, shheader), 0,
+		   pgstat_get_entry_len(kind));
+
+	return shheader;
+}
+
+/*
+ * Lazily create the backend-local hash table of shared-entry references,
+ * and initialize its cache age from the shared gc request counter.
+ */
+static void
+pgstat_setup_shared_refs(void)
+{
+	if (likely(pgStatEntryRefHash != NULL))
+		return;
+
+	pgStatEntryRefHash =
+		pgstat_entry_ref_hash_create(pgStatEntryRefHashContext,
+									 PGSTAT_ENTRY_REF_HASH_SIZE, NULL);
+	pgStatSharedRefAge = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count);
+	/* gc_request_count is initialized to 1, so 0 can mean "not set up" */
+	Assert(pgStatSharedRefAge != 0);
+}
+
+/*
+ * Helper function for pgstat_get_entry_ref().
+ *
+ * Takes a reference on the shared entry (preventing it from being freed)
+ * and fills in entry_ref. Releases the dshash partition lock the caller
+ * acquired while looking up shhashent.
+ */
+static void
+pgstat_acquire_entry_ref(PgStat_EntryRef *entry_ref,
+						 PgStatShared_HashEntry *shhashent,
+						 PgStatShared_Common *shheader)
+{
+	Assert(shheader->magic == 0xdeadbeef);
+	Assert(pg_atomic_read_u32(&shhashent->refcount) > 0);
+
+	pg_atomic_fetch_add_u32(&shhashent->refcount, 1);
+
+	dshash_release_lock(pgStatLocal.shared_hash, shhashent);
+
+	entry_ref->shared_stats = shheader;
+	entry_ref->shared_entry = shhashent;
+}
+
+/*
+ * Helper function for pgstat_get_entry_ref().
+ *
+ * Returns true iff a fully set up reference for key was already cached;
+ * otherwise *entry_ref_p points at a (possibly freshly allocated) ref whose
+ * shared pointers the caller still has to fill in.
+ */
+static bool
+pgstat_get_entry_ref_cached(PgStat_HashKey key, PgStat_EntryRef **entry_ref_p)
+{
+	bool		found;
+	PgStat_EntryRefHashEntry *cache_entry;
+
+	/*
+	 * We immediately insert a cache entry, because it avoids 1) multiple
+	 * hashtable lookups in case of a cache miss 2) having to deal with
+	 * out-of-memory errors after incrementing PgStatShared_Common->refcount.
+	 */
+
+	cache_entry = pgstat_entry_ref_hash_insert(pgStatEntryRefHash, key, &found);
+
+	if (!found || !cache_entry->entry_ref)
+	{
+		PgStat_EntryRef *entry_ref;
+
+		/* fresh cache slot: allocate and zero-init an entry ref */
+		cache_entry->entry_ref = entry_ref =
+			MemoryContextAlloc(pgStatSharedRefContext,
+							   sizeof(PgStat_EntryRef));
+		entry_ref->shared_stats = NULL;
+		entry_ref->shared_entry = NULL;
+		entry_ref->pending = NULL;
+
+		found = false;
+	}
+	else if (cache_entry->entry_ref->shared_stats == NULL)
+	{
+		/* cached but never completed; caller must finish the setup */
+		Assert(cache_entry->entry_ref->pending == NULL);
+		found = false;
+	}
+	else
+	{
+		PgStat_EntryRef *entry_ref PG_USED_FOR_ASSERTS_ONLY;
+
+		entry_ref = cache_entry->entry_ref;
+		Assert(entry_ref->shared_entry != NULL);
+		Assert(entry_ref->shared_stats != NULL);
+
+		Assert(entry_ref->shared_stats->magic == 0xdeadbeef);
+		/* should have at least our reference */
+		Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) > 0);
+	}
+
+	*entry_ref_p = cache_entry->entry_ref;
+	return found;
+}
+
/*
 * Get a shared stats reference. If create is true, the shared stats object is
 * created if it does not exist.
 *
 * When create is true, and created_entry is non-NULL, it'll be set to true
 * if the entry is newly created, false otherwise.
 *
 * Returns NULL only when create is false and no (live) entry exists.
 */
PgStat_EntryRef *
pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, bool create,
					 bool *created_entry)
{
	PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid};
	PgStatShared_HashEntry *shhashent;
	PgStatShared_Common *shheader = NULL;
	PgStat_EntryRef *entry_ref;

	/*
	 * passing in created_entry only makes sense if we possibly could create
	 * entry.
	 */
	AssertArg(create || created_entry == NULL);
	pgstat_assert_is_up();
	Assert(pgStatLocal.shared_hash != NULL);
	Assert(!pgStatLocal.shmem->is_shutdown);

	pgstat_setup_memcxt();
	pgstat_setup_shared_refs();

	if (created_entry != NULL)
		*created_entry = false;

	/*
	 * Check if other backends dropped stats that could not be deleted because
	 * somebody held references to it. If so, check this backend's references.
	 * This is not expected to happen often. The location of the check is a
	 * bit random, but this is a relatively frequently called path, so better
	 * than most.
	 */
	if (pgstat_need_entry_refs_gc())
		pgstat_gc_entry_refs();

	/*
	 * First check the lookup cache hashtable in local memory. If we find a
	 * match here we can avoid taking locks / causing contention.
	 */
	if (pgstat_get_entry_ref_cached(key, &entry_ref))
		return entry_ref;

	/* cache miss: entry_ref is allocated but not yet attached */
	Assert(entry_ref != NULL);

	/*
	 * Do a lookup in the hash table first - it's quite likely that the entry
	 * already exists, and that way we only need a shared lock.
	 */
	shhashent = dshash_find(pgStatLocal.shared_hash, &key, false);

	if (create && !shhashent)
	{
		bool		shfound;

		/*
		 * It's possible that somebody created the entry since the above
		 * lookup. If so, fall through to the same path as if we'd have if it
		 * already had been created before the dshash_find() calls.
		 */
		shhashent = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &shfound);
		if (!shfound)
		{
			/* brand new entry: allocate its body and pin it */
			shheader = pgstat_init_entry(kind, shhashent);
			pgstat_acquire_entry_ref(entry_ref, shhashent, shheader);

			if (created_entry != NULL)
				*created_entry = true;

			return entry_ref;
		}
	}

	if (!shhashent)
	{
		/*
		 * If we're not creating, delete the reference again. In all
		 * likelihood it's just a stats lookup - no point wasting memory for a
		 * shared ref to nothing...
		 */
		pgstat_release_entry_ref(key, entry_ref, false);

		return NULL;
	}
	else
	{
		/*
		 * Can get here either because dshash_find() found a match, or if
		 * dshash_find_or_insert() found a concurrently inserted entry.
		 */

		if (shhashent->dropped && create)
		{
			/*
			 * There are legitimate cases where the old stats entry might not
			 * yet have been dropped by the time it's reused. The most obvious
			 * case are replication slot stats, where a new slot can be
			 * created with the same index just after dropping. But oid
			 * wraparound can lead to other cases as well. We just reset the
			 * stats to their plain state.
			 */
			shheader = pgstat_reinit_entry(kind, shhashent);
			pgstat_acquire_entry_ref(entry_ref, shhashent, shheader);

			if (created_entry != NULL)
				*created_entry = true;

			return entry_ref;
		}
		else if (shhashent->dropped)
		{
			/* dropped and not creating: give up the ref we allocated */
			dshash_release_lock(pgStatLocal.shared_hash, shhashent);
			pgstat_release_entry_ref(key, entry_ref, false);

			return NULL;
		}
		else
		{
			/* normal case: live entry found, pin it */
			shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body);
			pgstat_acquire_entry_ref(entry_ref, shhashent, shheader);

			return entry_ref;
		}
	}
}
+
/*
 * Release a local reference to a shared stats entry and remove it from the
 * local cache, freeing the ref.
 *
 * Pending data is discarded when discard_pending is true; otherwise still
 * having pending data is an ERROR.  If we held the last reference to an
 * already-dropped shared entry, the shared entry itself is freed here.
 */
static void
pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref,
						 bool discard_pending)
{
	if (entry_ref && entry_ref->pending)
	{
		if (discard_pending)
			pgstat_delete_pending_entry(entry_ref);
		else
			elog(ERROR, "releasing ref with pending data");
	}

	if (entry_ref && entry_ref->shared_stats)
	{
		Assert(entry_ref->shared_stats->magic == 0xdeadbeef);
		Assert(entry_ref->pending == NULL);

		/*
		 * This can't race with another backend looking up the stats entry and
		 * increasing the refcount because it is not "legal" to create
		 * additional references to dropped entries.
		 */
		if (pg_atomic_fetch_sub_u32(&entry_ref->shared_entry->refcount, 1) == 1)
		{
			PgStatShared_HashEntry *shent;

			/*
			 * We're the last referrer to this entry, try to drop the shared
			 * entry.
			 */

			/* only dropped entries can reach a 0 refcount */
			Assert(entry_ref->shared_entry->dropped);

			shent = dshash_find(pgStatLocal.shared_hash,
								&entry_ref->shared_entry->key,
								true);
			if (!shent)
				elog(ERROR, "could not find just referenced shared stats entry");

			Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) == 0);
			Assert(entry_ref->shared_entry == shent);

			pgstat_free_entry(shent, NULL);
		}
	}

	if (!pgstat_entry_ref_hash_delete(pgStatEntryRefHash, key))
		elog(ERROR, "entry ref vanished before deletion");

	if (entry_ref)
		pfree(entry_ref);
}
+
+bool
+pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait)
+{
+ LWLock *lock = &entry_ref->shared_stats->lock;
+
+ if (nowait)
+ return LWLockConditionalAcquire(lock, LW_EXCLUSIVE);
+
+ LWLockAcquire(lock, LW_EXCLUSIVE);
+ return true;
+}
+
+/*
+ * Separate from pgstat_lock_entry() as most callers will need to lock
+ * exclusively.
+ */
+bool
+pgstat_lock_entry_shared(PgStat_EntryRef *entry_ref, bool nowait)
+{
+ LWLock *lock = &entry_ref->shared_stats->lock;
+
+ if (nowait)
+ return LWLockConditionalAcquire(lock, LW_SHARED);
+
+ LWLockAcquire(lock, LW_SHARED);
+ return true;
+}
+
+void
+pgstat_unlock_entry(PgStat_EntryRef *entry_ref)
+{
+ LWLockRelease(&entry_ref->shared_stats->lock);
+}
+
+/*
+ * Helper function to fetch and lock shared stats.
+ */
+PgStat_EntryRef *
+pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid,
+ bool nowait)
+{
+ PgStat_EntryRef *entry_ref;
+
+ /* find shared table stats entry corresponding to the local entry */
+ entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, true, NULL);
+
+ /* lock the shared entry to protect the content, skip if failed */
+ if (!pgstat_lock_entry(entry_ref, nowait))
+ return NULL;
+
+ return entry_ref;
+}
+
/*
 * Ask all backends to garbage-collect their cached entry refs, by bumping
 * the shared GC generation counter (checked in pgstat_need_entry_refs_gc()).
 */
void
pgstat_request_entry_refs_gc(void)
{
	pg_atomic_fetch_add_u64(&pgStatLocal.shmem->gc_request_count, 1);
}
+
+static bool
+pgstat_need_entry_refs_gc(void)
+{
+ uint64 curage;
+
+ if (!pgStatEntryRefHash)
+ return false;
+
+ /* should have been initialized when creating pgStatEntryRefHash */
+ Assert(pgStatSharedRefAge != 0);
+
+ curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count);
+
+ return pgStatSharedRefAge != curage;
+}
+
/*
 * Release cached references to shared entries that have been dropped by
 * other backends, then record the GC generation we caught up to.
 */
static void
pgstat_gc_entry_refs(void)
{
	pgstat_entry_ref_hash_iterator i;
	PgStat_EntryRefHashEntry *ent;
	uint64		curage;

	curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count);
	Assert(curage != 0);

	/*
	 * Some entries have been dropped. Invalidate cache pointer to them.
	 */
	pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i);
	while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) != NULL)
	{
		PgStat_EntryRef *entry_ref = ent->entry_ref;

		Assert(!entry_ref->shared_stats ||
			   entry_ref->shared_stats->magic == 0xdeadbeef);

		/* only refs to dropped entries need to be released */
		if (!entry_ref->shared_entry->dropped)
			continue;

		/* cannot gc shared ref that has pending data */
		if (entry_ref->pending != NULL)
			continue;

		/* releases the ref and deletes it from pgStatEntryRefHash */
		pgstat_release_entry_ref(ent->key, entry_ref, false);
	}

	pgStatSharedRefAge = curage;
}
+
/*
 * Release all cached entry refs accepted by `match` (all of them when
 * match is NULL).  Pending data is discarded iff discard_pending is set.
 */
static void
pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match,
								   Datum match_data)
{
	pgstat_entry_ref_hash_iterator i;
	PgStat_EntryRefHashEntry *ent;

	if (pgStatEntryRefHash == NULL)
		return;

	pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i);

	while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i))
		   != NULL)
	{
		Assert(ent->entry_ref != NULL);

		if (match && !match(ent, match_data))
			continue;

		/* releases the ref and deletes it from pgStatEntryRefHash */
		pgstat_release_entry_ref(ent->key, ent->entry_ref, discard_pending);
	}
}
+
/*
 * Release all local references to shared stats entries.
 *
 * When a process exits it cannot do so while still holding references onto
 * stats entries, otherwise the shared stats entries could never be freed.
 */
static void
pgstat_release_all_entry_refs(bool discard_pending)
{
	if (pgStatEntryRefHash == NULL)
		return;

	/* NULL match callback releases every cached ref */
	pgstat_release_matching_entry_refs(discard_pending, NULL, 0);
	Assert(pgStatEntryRefHash->members == 0);
	pgstat_entry_ref_hash_destroy(pgStatEntryRefHash);
	pgStatEntryRefHash = NULL;
}
+
+static bool
+match_db(PgStat_EntryRefHashEntry *ent, Datum match_data)
+{
+ Oid dboid = DatumGetObjectId(match_data);
+
+ return ent->key.dboid == dboid;
+}
+
+static void
+pgstat_release_db_entry_refs(Oid dboid)
+{
+ pgstat_release_matching_entry_refs( /* discard pending = */ true,
+ match_db,
+ ObjectIdGetDatum(dboid));
+}
+
+
+/* ------------------------------------------------------------
+ * Dropping and resetting of stats entries
+ * ------------------------------------------------------------
+ */
+
/*
 * Delete the dshash entry and free the DSA memory holding its stats body.
 * `hstat` is non-NULL when called from within a dshash sequential scan.
 */
static void
pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat)
{
	dsa_pointer pdsa;

	/*
	 * Fetch dsa pointer before deleting entry - that way we can free the
	 * memory after releasing the lock.
	 */
	pdsa = shent->body;

	if (!hstat)
		dshash_delete_entry(pgStatLocal.shared_hash, shent);
	else
		dshash_delete_current(hstat);

	dsa_free(pgStatLocal.dsa, pdsa);
}
+
/*
 * Helper for both pgstat_drop_database_and_contents() and
 * pgstat_drop_entry(). If hstat is non-null delete the shared entry using
 * dshash_delete_current(), otherwise use dshash_delete_entry(). In either
 * case the entry needs to be already locked.
 *
 * Returns true if the entry could be freed immediately, false if other
 * backends still hold references (they'll free it on release).
 */
static bool
pgstat_drop_entry_internal(PgStatShared_HashEntry *shent,
						   dshash_seq_status *hstat)
{
	Assert(shent->body != InvalidDsaPointer);

	/* should already have released local reference */
	if (pgStatEntryRefHash)
		Assert(!pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, shent->key));

	/*
	 * Signal that the entry is dropped - this will eventually cause other
	 * backends to release their references.
	 */
	if (shent->dropped)
		elog(ERROR, "can only drop stats once");
	shent->dropped = true;

	/* release refcount marking entry as not dropped */
	if (pg_atomic_sub_fetch_u32(&shent->refcount, 1) == 0)
	{
		pgstat_free_entry(shent, hstat);
		return true;
	}
	else
	{
		/* somebody else still holds a ref; just drop our lock */
		if (!hstat)
			dshash_release_lock(pgStatLocal.shared_hash, shent);
		return false;
	}
}
+
/*
 * Drop stats for the database and all the objects inside that database.
 */
static void
pgstat_drop_database_and_contents(Oid dboid)
{
	dshash_seq_status hstat;
	PgStatShared_HashEntry *p;
	uint64		not_freed_count = 0;

	Assert(OidIsValid(dboid));

	Assert(pgStatLocal.shared_hash != NULL);

	/*
	 * This backend might very well be the only backend holding a reference to
	 * about-to-be-dropped entries. Ensure that we're not preventing it from
	 * being cleaned up till later.
	 *
	 * Doing this separately from the dshash iteration below avoids having to
	 * do so while holding a partition lock on the shared hashtable.
	 */
	pgstat_release_db_entry_refs(dboid);

	/* some of the dshash entries are to be removed, take exclusive lock. */
	dshash_seq_init(&hstat, pgStatLocal.shared_hash, true);
	while ((p = dshash_seq_next(&hstat)) != NULL)
	{
		if (p->dropped)
			continue;

		if (p->key.dboid != dboid)
			continue;

		if (!pgstat_drop_entry_internal(p, &hstat))
		{
			/*
			 * Even statistics for a dropped database might currently be
			 * accessed (consider e.g. database stats for pg_stat_database).
			 */
			not_freed_count++;
		}
	}
	dshash_seq_term(&hstat);

	/*
	 * If some of the stats data could not be freed, signal the reference
	 * holders to run garbage collection of their cached pgStatShmLookupCache.
	 */
	if (not_freed_count > 0)
		pgstat_request_entry_refs_gc();
}
+
/*
 * Drop the stats entry identified by kind/dboid/objoid, first releasing any
 * local reference this backend holds.  Returns true if the shared entry was
 * freed immediately (or did not exist), false if other backends still hold
 * references to it.
 */
bool
pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
{
	PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid};
	PgStatShared_HashEntry *shent;
	bool		freed = true;

	/* delete local reference */
	if (pgStatEntryRefHash)
	{
		PgStat_EntryRefHashEntry *lohashent =
		pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, key);

		if (lohashent)
			pgstat_release_entry_ref(lohashent->key, lohashent->entry_ref,
									 true);
	}

	/* mark entry in shared hashtable as deleted, drop if possible */
	shent = dshash_find(pgStatLocal.shared_hash, &key, true);
	if (shent)
	{
		freed = pgstat_drop_entry_internal(shent, NULL);

		/*
		 * Database stats contain other stats. Drop those as well when
		 * dropping the database. XXX: Perhaps this should be done in a
		 * slightly more principled way? But not obvious what that'd look
		 * like, and so far this is the only case...
		 */
		if (key.kind == PGSTAT_KIND_DATABASE)
			pgstat_drop_database_and_contents(key.dboid);
	}

	return freed;
}
+
/*
 * Drop all variable-numbered stats entries, requesting GC from other
 * backends for any entry that could not be freed immediately.
 */
void
pgstat_drop_all_entries(void)
{
	dshash_seq_status hstat;
	PgStatShared_HashEntry *ps;
	uint64		not_freed_count = 0;

	/* entries will be deleted, so take an exclusive scan */
	dshash_seq_init(&hstat, pgStatLocal.shared_hash, true);
	while ((ps = dshash_seq_next(&hstat)) != NULL)
	{
		if (ps->dropped)
			continue;

		if (!pgstat_drop_entry_internal(ps, &hstat))
			not_freed_count++;
	}
	dshash_seq_term(&hstat);

	if (not_freed_count > 0)
		pgstat_request_entry_refs_gc();
}
+
+static void
+shared_stat_reset_contents(PgStat_Kind kind, PgStatShared_Common *header,
+ TimestampTz ts)
+{
+ const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+
+ memset(pgstat_get_entry_data(kind, header), 0,
+ pgstat_get_entry_len(kind));
+
+ if (kind_info->reset_timestamp_cb)
+ kind_info->reset_timestamp_cb(header, ts);
+}
+
/*
 * Reset one variable-numbered stats entry.
 *
 * Silently does nothing if the entry does not exist or was already dropped.
 */
void
pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts)
{
	PgStat_EntryRef *entry_ref;

	Assert(!pgstat_get_kind_info(kind)->fixed_amount);

	entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL);
	if (!entry_ref || entry_ref->shared_entry->dropped)
		return;

	/* nowait = false, so this always succeeds */
	(void) pgstat_lock_entry(entry_ref, false);
	shared_stat_reset_contents(kind, entry_ref->shared_stats, ts);
	pgstat_unlock_entry(entry_ref);
}
+
/*
 * Scan through the shared hashtable of stats, resetting statistics if
 * approved by the provided do_reset() function.
 */
void
pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum),
							  Datum match_data, TimestampTz ts)
{
	dshash_seq_status hstat;
	PgStatShared_HashEntry *p;

	/* dshash entry is not modified, take shared lock */
	dshash_seq_init(&hstat, pgStatLocal.shared_hash, false);
	while ((p = dshash_seq_next(&hstat)) != NULL)
	{
		PgStatShared_Common *header;

		if (p->dropped)
			continue;

		if (!do_reset(p, match_data))
			continue;

		header = dsa_get_address(pgStatLocal.dsa, p->body);

		/* the entry body is modified, so lock it exclusively */
		LWLockAcquire(&header->lock, LW_EXCLUSIVE);

		shared_stat_reset_contents(p->key.kind, header, ts);

		LWLockRelease(&header->lock);
	}
	dshash_seq_term(&hstat);
}
+
+static bool
+match_kind(PgStatShared_HashEntry *p, Datum match_data)
+{
+ return p->key.kind == DatumGetInt32(match_data);
+}
+
+void
+pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts)
+{
+ pgstat_reset_matching_entries(match_kind, Int32GetDatum(kind), ts);
+}
+
+static void
+pgstat_setup_memcxt(void)
+{
+ if (unlikely(!pgStatSharedRefContext))
+ pgStatSharedRefContext =
+ AllocSetContextCreate(TopMemoryContext,
+ "PgStat Shared Ref",
+ ALLOCSET_SMALL_SIZES);
+ if (unlikely(!pgStatEntryRefHashContext))
+ pgStatEntryRefHashContext =
+ AllocSetContextCreate(TopMemoryContext,
+ "PgStat Shared Ref Hash",
+ ALLOCSET_SMALL_SIZES);
+}
diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c
new file mode 100644
index 0000000..28ef736
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_slru.c
@@ -0,0 +1,248 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_slru.c
+ * Implementation of SLRU statistics.
+ *
+ * This file contains the implementation of SLRU statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_slru.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+#include "utils/timestamp.h"
+
+
+static inline PgStat_SLRUStats *get_slru_entry(int slru_idx);
+static void pgstat_reset_slru_counter_internal(int index, TimestampTz ts);
+
+
+/*
+ * SLRU statistics counts waiting to be flushed out. We assume this variable
+ * inits to zeroes. Entries are one-to-one with slru_names[]. Changes of
+ * SLRU counters are reported within critical sections so we use static memory
+ * in order to avoid memory allocation.
+ */
+static PgStat_SLRUStats pending_SLRUStats[SLRU_NUM_ELEMENTS];
+bool have_slrustats = false;
+
+
+/*
+ * Reset counters for a single SLRU.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+void
+pgstat_reset_slru(const char *name)
+{
+ TimestampTz ts = GetCurrentTimestamp();
+
+ AssertArg(name != NULL);
+
+ pgstat_reset_slru_counter_internal(pgstat_get_slru_index(name), ts);
+}
+
+/*
+ * SLRU statistics count accumulation functions --- called from slru.c
+ */
+
+void
+pgstat_count_slru_page_zeroed(int slru_idx)
+{
+ get_slru_entry(slru_idx)->blocks_zeroed += 1;
+}
+
+void
+pgstat_count_slru_page_hit(int slru_idx)
+{
+ get_slru_entry(slru_idx)->blocks_hit += 1;
+}
+
+void
+pgstat_count_slru_page_exists(int slru_idx)
+{
+ get_slru_entry(slru_idx)->blocks_exists += 1;
+}
+
+void
+pgstat_count_slru_page_read(int slru_idx)
+{
+ get_slru_entry(slru_idx)->blocks_read += 1;
+}
+
+void
+pgstat_count_slru_page_written(int slru_idx)
+{
+ get_slru_entry(slru_idx)->blocks_written += 1;
+}
+
+void
+pgstat_count_slru_flush(int slru_idx)
+{
+ get_slru_entry(slru_idx)->flush += 1;
+}
+
+void
+pgstat_count_slru_truncate(int slru_idx)
+{
+ get_slru_entry(slru_idx)->truncate += 1;
+}
+
/*
 * Support function for the SQL-callable pgstat* functions. Returns
 * a pointer to the slru statistics struct (array of SLRU_NUM_ELEMENTS
 * entries in the local snapshot).
 */
PgStat_SLRUStats *
pgstat_fetch_slru(void)
{
	/* make sure the snapshot contains current SLRU stats */
	pgstat_snapshot_fixed(PGSTAT_KIND_SLRU);

	return pgStatLocal.snapshot.slru;
}
+
+/*
+ * Returns SLRU name for an index. The index may be above SLRU_NUM_ELEMENTS,
+ * in which case this returns NULL. This allows writing code that does not
+ * know the number of entries in advance.
+ */
+const char *
+pgstat_get_slru_name(int slru_idx)
+{
+ if (slru_idx < 0 || slru_idx >= SLRU_NUM_ELEMENTS)
+ return NULL;
+
+ return slru_names[slru_idx];
+}
+
+/*
+ * Determine index of entry for a SLRU with a given name. If there's no exact
+ * match, returns index of the last "other" entry used for SLRUs defined in
+ * external projects.
+ */
+int
+pgstat_get_slru_index(const char *name)
+{
+ int i;
+
+ for (i = 0; i < SLRU_NUM_ELEMENTS; i++)
+ {
+ if (strcmp(slru_names[i], name) == 0)
+ return i;
+ }
+
+ /* return index of the last entry (which is the "other" one) */
+ return (SLRU_NUM_ELEMENTS - 1);
+}
+
/*
 * Flush out locally pending SLRU stats entries
 *
 * If nowait is true, this function returns true if the lock could not be
 * acquired. Otherwise returns false (including when there was nothing to
 * flush).
 */
bool
pgstat_slru_flush(bool nowait)
{
	PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru;
	int			i;

	if (!have_slrustats)
		return false;

	if (!nowait)
		LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
	else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE))
		return true;

	/* accumulate every pending counter into shared memory */
	for (i = 0; i < SLRU_NUM_ELEMENTS; i++)
	{
		PgStat_SLRUStats *sharedent = &stats_shmem->stats[i];
		PgStat_SLRUStats *pendingent = &pending_SLRUStats[i];

#define SLRU_ACC(fld) sharedent->fld += pendingent->fld
		SLRU_ACC(blocks_zeroed);
		SLRU_ACC(blocks_hit);
		SLRU_ACC(blocks_read);
		SLRU_ACC(blocks_written);
		SLRU_ACC(blocks_exists);
		SLRU_ACC(flush);
		SLRU_ACC(truncate);
#undef SLRU_ACC
	}

	/* done, clear the pending entry */
	MemSet(pending_SLRUStats, 0, sizeof(pending_SLRUStats));

	LWLockRelease(&stats_shmem->lock);

	have_slrustats = false;

	return false;
}
+
+void
+pgstat_slru_reset_all_cb(TimestampTz ts)
+{
+ for (int i = 0; i < SLRU_NUM_ELEMENTS; i++)
+ pgstat_reset_slru_counter_internal(i, ts);
+}
+
+void
+pgstat_slru_snapshot_cb(void)
+{
+ PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru;
+
+ LWLockAcquire(&stats_shmem->lock, LW_SHARED);
+
+ memcpy(pgStatLocal.snapshot.slru, &stats_shmem->stats,
+ sizeof(stats_shmem->stats));
+
+ LWLockRelease(&stats_shmem->lock);
+}
+
/*
 * Returns pointer to entry with counters for given SLRU (based on the name
 * stored in SlruCtl as lwlock tranche name).
 *
 * Side effect: marks have_slrustats so the next flush knows there is
 * pending data.
 */
static inline PgStat_SLRUStats *
get_slru_entry(int slru_idx)
{
	pgstat_assert_is_up();

	/*
	 * The postmaster should never register any SLRU statistics counts; if it
	 * did, the counts would be duplicated into child processes via fork().
	 */
	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);

	Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS));

	have_slrustats = true;

	return &pending_SLRUStats[slru_idx];
}
+
+static void
+pgstat_reset_slru_counter_internal(int index, TimestampTz ts)
+{
+ PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru;
+
+ LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
+
+ memset(&stats_shmem->stats[index], 0, sizeof(PgStat_SLRUStats));
+ stats_shmem->stats[index].stat_reset_timestamp = ts;
+
+ LWLockRelease(&stats_shmem->lock);
+}
diff --git a/src/backend/utils/activity/pgstat_subscription.c b/src/backend/utils/activity/pgstat_subscription.c
new file mode 100644
index 0000000..e1072bd
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_subscription.c
@@ -0,0 +1,110 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_subscription.c
+ * Implementation of subscription statistics.
+ *
+ * This file contains the implementation of subscription statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_subscription.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+
+
+/*
+ * Report a subscription error.
+ */
+void
+pgstat_report_subscription_error(Oid subid, bool is_apply_error)
+{
+ PgStat_EntryRef *entry_ref;
+ PgStat_BackendSubEntry *pending;
+
+ entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_SUBSCRIPTION,
+ InvalidOid, subid, NULL);
+ pending = entry_ref->pending;
+
+ if (is_apply_error)
+ pending->apply_error_count++;
+ else
+ pending->sync_error_count++;
+}
+
+/*
+ * Report creating the subscription.
+ *
+ * Ensures that stats are dropped if transaction rolls back.
+ */
+void
+pgstat_create_subscription(Oid subid)
+{
+ pgstat_create_transactional(PGSTAT_KIND_SUBSCRIPTION,
+ InvalidOid, subid);
+}
+
+/*
+ * Report dropping the subscription.
+ *
+ * Ensures that stats are dropped if transaction commits.
+ */
+void
+pgstat_drop_subscription(Oid subid)
+{
+ pgstat_drop_transactional(PGSTAT_KIND_SUBSCRIPTION,
+ InvalidOid, subid);
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * the collected statistics for one subscription or NULL.
+ */
+PgStat_StatSubEntry *
+pgstat_fetch_stat_subscription(Oid subid)
+{
+ return (PgStat_StatSubEntry *)
+ pgstat_fetch_entry(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid);
+}
+
+/*
+ * Flush out pending stats for the entry
+ *
+ * If nowait is true, this function returns false if lock could not
+ * immediately acquired, otherwise true is returned.
+ */
+bool
+pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait)
+{
+ PgStat_BackendSubEntry *localent;
+ PgStatShared_Subscription *shsubent;
+
+ localent = (PgStat_BackendSubEntry *) entry_ref->pending;
+ shsubent = (PgStatShared_Subscription *) entry_ref->shared_stats;
+
+ /* localent always has non-zero content */
+
+ if (!pgstat_lock_entry(entry_ref, nowait))
+ return false;
+
+#define SUB_ACC(fld) shsubent->stats.fld += localent->fld
+ SUB_ACC(apply_error_count);
+ SUB_ACC(sync_error_count);
+#undef SUB_ACC
+
+ pgstat_unlock_entry(entry_ref);
+ return true;
+}
+
+void
+pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts)
+{
+ ((PgStatShared_Subscription *) header)->stats.stat_reset_timestamp = ts;
+}
diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c
new file mode 100644
index 0000000..305a925
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_wal.c
@@ -0,0 +1,182 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_wal.c
+ * Implementation of WAL statistics.
+ *
+ * This file contains the implementation of WAL statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_wal.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+#include "executor/instrument.h"
+
+
+PgStat_WalStats PendingWalStats = {0};
+
+/*
+ * WAL usage counters saved from pgWALUsage at the previous call to
+ * pgstat_report_wal(). This is used to calculate how much WAL usage
+ * happens between pgstat_report_wal() calls, by subtracting
+ * the previous counters from the current ones.
+ */
+static WalUsage prevWalUsage;
+
+
+/*
+ * Calculate how much WAL usage counters have increased and update
+ * shared statistics.
+ *
+ * Must be called by processes that generate WAL, that do not call
+ * pgstat_report_stat(), like walwriter.
+ *
+ * "force" set to true ensures that the statistics are flushed; note that
+ * this needs to acquire the pgstat shmem LWLock, waiting on it. When
+ * set to false, the statistics may not be flushed if the lock could not
+ * be acquired.
+ */
+void
+pgstat_report_wal(bool force)
+{
+ bool nowait;
+
+ /* like in pgstat.c, don't wait for lock acquisition when !force */
+ nowait = !force;
+
+ /* flush wal stats */
+ pgstat_flush_wal(nowait);
+}
+
/*
 * Support function for the SQL-callable pgstat* functions. Returns
 * a pointer to the WAL statistics struct.
 */
PgStat_WalStats *
pgstat_fetch_stat_wal(void)
{
	/* make sure the snapshot contains current WAL stats */
	pgstat_snapshot_fixed(PGSTAT_KIND_WAL);

	return &pgStatLocal.snapshot.wal;
}
+
/*
 * Flush pending WAL statistics into shared memory, after folding in the
 * WAL usage accumulated since the previous flush (pgWalUsage minus
 * prevWalUsage).
 *
 * If nowait is true, this function returns true if the lock could not be
 * acquired. Otherwise return false.
 */
bool
pgstat_flush_wal(bool nowait)
{
	PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal;
	WalUsage	diff = {0};

	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
	Assert(pgStatLocal.shmem != NULL &&
		   !pgStatLocal.shmem->is_shutdown);

	/*
	 * This function can be called even if nothing at all has happened. Avoid
	 * taking lock for nothing in that case.
	 */
	if (!pgstat_have_pending_wal())
		return false;

	/*
	 * We don't update the WAL usage portion of the local WalStats elsewhere.
	 * Calculate how much WAL usage counters were increased by subtracting the
	 * previous counters from the current ones.
	 */
	WalUsageAccumDiff(&diff, &pgWalUsage, &prevWalUsage);
	PendingWalStats.wal_records = diff.wal_records;
	PendingWalStats.wal_fpi = diff.wal_fpi;
	PendingWalStats.wal_bytes = diff.wal_bytes;

	if (!nowait)
		LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
	else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE))
		return true;

	/* accumulate every pending counter into shared memory */
#define WALSTAT_ACC(fld) stats_shmem->stats.fld += PendingWalStats.fld
	WALSTAT_ACC(wal_records);
	WALSTAT_ACC(wal_fpi);
	WALSTAT_ACC(wal_bytes);
	WALSTAT_ACC(wal_buffers_full);
	WALSTAT_ACC(wal_write);
	WALSTAT_ACC(wal_sync);
	WALSTAT_ACC(wal_write_time);
	WALSTAT_ACC(wal_sync_time);
#undef WALSTAT_ACC

	LWLockRelease(&stats_shmem->lock);

	/*
	 * Save the current counters for the subsequent calculation of WAL usage.
	 */
	prevWalUsage = pgWalUsage;

	/*
	 * Clear out the statistics buffer, so it can be re-used.
	 */
	MemSet(&PendingWalStats, 0, sizeof(PendingWalStats));

	return false;
}
+
/*
 * Initialize the WAL statistics baseline for this process.
 */
void
pgstat_init_wal(void)
{
	/*
	 * Initialize prevWalUsage with pgWalUsage so that pgstat_flush_wal() can
	 * calculate how much pgWalUsage counters are increased by subtracting
	 * prevWalUsage from pgWalUsage.
	 */
	prevWalUsage = pgWalUsage;
}
+
+/*
+ * To determine whether any WAL activity has occurred since last time, not
+ * only the number of generated WAL records but also the numbers of WAL
+ * writes and syncs need to be checked. Because even transaction that
+ * generates no WAL records can write or sync WAL data when flushing the
+ * data pages.
+ */
+bool
+pgstat_have_pending_wal(void)
+{
+ return pgWalUsage.wal_records != prevWalUsage.wal_records ||
+ PendingWalStats.wal_write != 0 ||
+ PendingWalStats.wal_sync != 0;
+}
+
+void
+pgstat_wal_reset_all_cb(TimestampTz ts)
+{
+ PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal;
+
+ LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
+ memset(&stats_shmem->stats, 0, sizeof(stats_shmem->stats));
+ stats_shmem->stats.stat_reset_timestamp = ts;
+ LWLockRelease(&stats_shmem->lock);
+}
+
+void
+pgstat_wal_snapshot_cb(void)
+{
+ PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal;
+
+ LWLockAcquire(&stats_shmem->lock, LW_SHARED);
+ memcpy(&pgStatLocal.snapshot.wal, &stats_shmem->stats,
+ sizeof(pgStatLocal.snapshot.wal));
+ LWLockRelease(&stats_shmem->lock);
+}
diff --git a/src/backend/utils/activity/pgstat_xact.c b/src/backend/utils/activity/pgstat_xact.c
new file mode 100644
index 0000000..d6f660e
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_xact.c
@@ -0,0 +1,391 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_xact.c
+ * Transactional integration for the cumulative statistics system.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_xact.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/transam.h"
+#include "access/xact.h"
+#include "pgstat.h"
+#include "utils/memutils.h"
+#include "utils/pgstat_internal.h"
+
+
+/*
+ * A stats-entry drop scheduled within the current (sub-)transaction, linked
+ * into PgStat_SubXactStatus->pending_drops.
+ */
+typedef struct PgStat_PendingDroppedStatsItem
+{
+	xl_xact_stats_item item;	/* identity of the stats object (kind, dboid,
+								 * objoid) */
+	bool		is_create;		/* true: object created in this xact, drop
+								 * stats on abort; false: object dropped in
+								 * this xact, drop stats on commit */
+	dlist_node	node;			/* list membership in pending_drops */
+} PgStat_PendingDroppedStatsItem;
+
+
+static void AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit);
+static void AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state,
+											bool isCommit, int nestDepth);
+
+/* stack of per-(sub-)transaction stats state, most deeply nested level first */
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
+
+
+/*
+ * Called from access/transam/xact.c at top-level transaction commit/abort.
+ *
+ * Flushes out the transactional per-database and per-relation stats state,
+ * executes/cancels pending stats drops, and clears any stats snapshot.
+ */
+void
+AtEOXact_PgStat(bool isCommit, bool parallel)
+{
+	PgStat_SubXactStatus *xact_state;
+
+	AtEOXact_PgStat_Database(isCommit, parallel);
+
+	/* handle transactional stats information */
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL)
+	{
+		/* at top level only the nest-level-1 entry may remain */
+		Assert(xact_state->nest_level == 1);
+		Assert(xact_state->prev == NULL);
+
+		AtEOXact_PgStat_Relations(xact_state, isCommit);
+		AtEOXact_PgStat_DroppedStats(xact_state, isCommit);
+	}
+	/* stack memory lives in TopTransactionContext; just forget the pointer */
+	pgStatXactStack = NULL;
+
+	/* Make sure any stats snapshot is thrown away */
+	pgstat_clear_snapshot();
+}
+
+/*
+ * When committing, drop stats for objects dropped in the transaction. When
+ * aborting, drop stats for objects created in the transaction.
+ *
+ * All pending items are consumed and freed here, whether or not they lead to
+ * a stats drop.  If any entry could not be freed immediately (still
+ * referenced elsewhere), request a garbage-collection pass.
+ */
+static void
+AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit)
+{
+	dlist_mutable_iter iter;
+	int			not_freed_count = 0;
+
+	/* fast path: nothing was scheduled in this transaction */
+	if (xact_state->pending_drops_count == 0)
+	{
+		Assert(dlist_is_empty(&xact_state->pending_drops));
+		return;
+	}
+
+	dlist_foreach_modify(iter, &xact_state->pending_drops)
+	{
+		PgStat_PendingDroppedStatsItem *pending =
+		dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur);
+		xl_xact_stats_item *it = &pending->item;
+
+		if (isCommit && !pending->is_create)
+		{
+			/*
+			 * Transaction that dropped an object committed. Drop the stats
+			 * too.
+			 */
+			if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid))
+				not_freed_count++;
+		}
+		else if (!isCommit && pending->is_create)
+		{
+			/*
+			 * Transaction that created an object aborted. Drop the stats
+			 * associated with the object.
+			 */
+			if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid))
+				not_freed_count++;
+		}
+
+		/* item is consumed either way; unlink and release it */
+		dlist_delete(&pending->node);
+		xact_state->pending_drops_count--;
+		pfree(pending);
+	}
+
+	if (not_freed_count > 0)
+		pgstat_request_entry_refs_gc();
+}
+
+/*
+ * Called from access/transam/xact.c at subtransaction commit/abort.
+ *
+ * On commit the subtransaction's transactional stats are merged into the
+ * parent's stack entry; on abort they are discarded (creations rolled back).
+ */
+void
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
+{
+	PgStat_SubXactStatus *xact_state;
+
+	/* merge the sub-transaction's transactional stats into the parent */
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL &&
+		xact_state->nest_level >= nestDepth)
+	{
+		/* delink xact_state from stack immediately to simplify reuse case */
+		pgStatXactStack = xact_state->prev;
+
+		AtEOSubXact_PgStat_Relations(xact_state, isCommit, nestDepth);
+		AtEOSubXact_PgStat_DroppedStats(xact_state, isCommit, nestDepth);
+
+		pfree(xact_state);
+	}
+}
+
+/*
+ * Like AtEOXact_PgStat_DroppedStats(), but for subtransactions.
+ *
+ * On abort, stats for objects created in the subxact are dropped right away.
+ * On commit, pending drops cannot be executed yet (the surrounding
+ * transaction might still abort), so they are handed up to the parent's
+ * stack entry at nestDepth - 1.
+ */
+static void
+AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state,
+								bool isCommit, int nestDepth)
+{
+	PgStat_SubXactStatus *parent_xact_state;
+	dlist_mutable_iter iter;
+	int			not_freed_count = 0;
+
+	if (xact_state->pending_drops_count == 0)
+		return;
+
+	/* ensure the parent level's stack entry exists to receive our items */
+	parent_xact_state = pgstat_get_xact_stack_level(nestDepth - 1);
+
+	dlist_foreach_modify(iter, &xact_state->pending_drops)
+	{
+		PgStat_PendingDroppedStatsItem *pending =
+		dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur);
+		xl_xact_stats_item *it = &pending->item;
+
+		/* unlink first; on commit the node is re-linked into the parent */
+		dlist_delete(&pending->node);
+		xact_state->pending_drops_count--;
+
+		if (!isCommit && pending->is_create)
+		{
+			/*
+			 * Subtransaction creating a new stats object aborted. Drop the
+			 * stats object.
+			 */
+			if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid))
+				not_freed_count++;
+			pfree(pending);
+		}
+		else if (isCommit)
+		{
+			/*
+			 * Subtransaction dropping a stats object committed. Can't yet
+			 * remove the stats object, the surrounding transaction might
+			 * still abort. Pass it on to the parent.
+			 */
+			dlist_push_tail(&parent_xact_state->pending_drops, &pending->node);
+			parent_xact_state->pending_drops_count++;
+		}
+		else
+		{
+			/* aborted drop of a pre-existing object: nothing to do */
+			pfree(pending);
+		}
+	}
+
+	Assert(xact_state->pending_drops_count == 0);
+	if (not_freed_count > 0)
+		pgstat_request_entry_refs_gc();
+}
+
+/*
+ * Save the transactional stats state at 2PC transaction prepare.
+ *
+ * Only relation-level state needs explicit handling here; pending drops are
+ * carried in the PREPARE WAL record via pgstat_get_transactional_drops().
+ */
+void
+AtPrepare_PgStat(void)
+{
+	PgStat_SubXactStatus *xact_state;
+
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL)
+	{
+		/* at PREPARE all subtransactions must have been closed out */
+		Assert(xact_state->nest_level == 1);
+		Assert(xact_state->prev == NULL);
+
+		AtPrepare_PgStat_Relations(xact_state);
+	}
+}
+
+/*
+ * Clean up after successful PREPARE.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+	PgStat_SubXactStatus *xact_state;
+
+	/*
+	 * We don't bother to free any of the transactional state, since it's all
+	 * in TopTransactionContext and will go away anyway.
+	 */
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL)
+	{
+		Assert(xact_state->nest_level == 1);
+		Assert(xact_state->prev == NULL);
+
+		PostPrepare_PgStat_Relations(xact_state);
+	}
+	pgStatXactStack = NULL;
+
+	/* Make sure any stats snapshot is thrown away */
+	pgstat_clear_snapshot();
+}
+
+/*
+ * Ensure (sub)transaction stack entry for the given nest_level exists, adding
+ * it if needed.
+ *
+ * The stack is kept in TopTransactionContext, so it is released automatically
+ * at transaction end.  Note that intermediate levels are not filled in: a
+ * new entry is pushed only for the requested nest_level.
+ */
+PgStat_SubXactStatus *
+pgstat_get_xact_stack_level(int nest_level)
+{
+	PgStat_SubXactStatus *xact_state;
+
+	xact_state = pgStatXactStack;
+	if (xact_state == NULL || xact_state->nest_level != nest_level)
+	{
+		xact_state = (PgStat_SubXactStatus *)
+			MemoryContextAlloc(TopTransactionContext,
+							   sizeof(PgStat_SubXactStatus));
+		dlist_init(&xact_state->pending_drops);
+		xact_state->pending_drops_count = 0;
+		xact_state->nest_level = nest_level;
+		xact_state->prev = pgStatXactStack;
+		xact_state->first = NULL;
+		pgStatXactStack = xact_state;
+	}
+	return xact_state;
+}
+
+/*
+ * Get stat items that need to be dropped at commit / abort.
+ *
+ * When committing, stats for objects that have been dropped in the
+ * transaction are returned. When aborting, stats for newly created objects are
+ * returned.
+ *
+ * Used by COMMIT / ABORT and 2PC PREPARE processing when building their
+ * respective WAL records, to ensure stats are dropped in case of a crash / on
+ * standbys.
+ *
+ * The list of items is allocated in CurrentMemoryContext and must be freed by
+ * the caller (directly or via memory context reset).
+ *
+ * Returns the number of items stored in *items; 0 if there is no
+ * transactional stats state at all (in which case *items is not set).
+ */
+int
+pgstat_get_transactional_drops(bool isCommit, xl_xact_stats_item **items)
+{
+	PgStat_SubXactStatus *xact_state = pgStatXactStack;
+	int			nitems = 0;
+	dlist_iter	iter;
+
+	if (xact_state == NULL)
+		return 0;
+
+	/*
+	 * We expect to be called for subtransaction abort (which logs a WAL
+	 * record), but not for subtransaction commit (which doesn't).
+	 */
+	Assert(!isCommit || xact_state->nest_level == 1);
+	Assert(!isCommit || xact_state->prev == NULL);
+
+	/* worst-case sized; only matching entries are filled in below */
+	*items = palloc(xact_state->pending_drops_count
+					* sizeof(xl_xact_stats_item));
+
+	dlist_foreach(iter, &xact_state->pending_drops)
+	{
+		PgStat_PendingDroppedStatsItem *pending =
+		dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur);
+
+		/* on commit skip creations, on abort skip drops */
+		if (isCommit && pending->is_create)
+			continue;
+		if (!isCommit && !pending->is_create)
+			continue;
+
+		Assert(nitems < xact_state->pending_drops_count);
+		(*items)[nitems++] = pending->item;
+	}
+
+	return nitems;
+}
+
+/*
+ * Execute scheduled drops post-commit. Called from xact_redo_commit() /
+ * xact_redo_abort() during recovery, and from FinishPreparedTransaction()
+ * during normal 2PC COMMIT/ABORT PREPARED processing.
+ *
+ * is_redo is currently unused here; drops behave the same during recovery
+ * and normal 2PC processing.
+ */
+void
+pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo)
+{
+	int			not_freed_count = 0;
+
+	if (ndrops == 0)
+		return;
+
+	for (int i = 0; i < ndrops; i++)
+	{
+		xl_xact_stats_item *it = &items[i];
+
+		if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid))
+			not_freed_count++;
+	}
+
+	/* some entries were still referenced; have them cleaned up later */
+	if (not_freed_count > 0)
+		pgstat_request_entry_refs_gc();
+}
+
+/*
+ * Common workhorse for pgstat_create_transactional() and
+ * pgstat_drop_transactional(): queue a pending create/drop item on the
+ * stats stack entry for the current transaction nesting level.
+ */
+static void
+create_drop_transactional_internal(PgStat_Kind kind, Oid dboid, Oid objoid, bool is_create)
+{
+	int			nest_level = GetCurrentTransactionNestLevel();
+	PgStat_SubXactStatus *xact_state;
+
+	/* allocated in TopTransactionContext so it survives until xact end */
+	PgStat_PendingDroppedStatsItem *drop = (PgStat_PendingDroppedStatsItem *)
+	MemoryContextAlloc(TopTransactionContext, sizeof(PgStat_PendingDroppedStatsItem));
+
+	xact_state = pgstat_get_xact_stack_level(nest_level);
+
+	drop->is_create = is_create;
+	drop->item.kind = kind;
+	drop->item.dboid = dboid;
+	drop->item.objoid = objoid;
+
+	dlist_push_tail(&xact_state->pending_drops, &drop->node);
+	xact_state->pending_drops_count++;
+}
+
+/*
+ * Create a stats entry for a newly created database object in a transactional
+ * manner.
+ *
+ * I.e. if the current (sub-)transaction aborts, the stats entry will also be
+ * dropped.
+ *
+ * If a stats entry already exists for this object (e.g. leftover from a
+ * previous incarnation), warn and reset it rather than keeping stale counts.
+ */
+void
+pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid)
+{
+	if (pgstat_get_entry_ref(kind, dboid, objoid, false, NULL))
+	{
+		ereport(WARNING,
+				errmsg("resetting existing statistics for kind %s, db=%u, oid=%u",
+					   (pgstat_get_kind_info(kind))->name, dboid, objoid));
+
+		pgstat_reset(kind, dboid, objoid);
+	}
+
+	create_drop_transactional_internal(kind, dboid, objoid, /* create */ true);
+}
+
+/*
+ * Drop a stats entry for a just dropped database object in a transactional
+ * manner.
+ *
+ * I.e. if the current (sub-)transaction aborts, the stats entry will stay
+ * alive.
+ */
+void
+pgstat_drop_transactional(PgStat_Kind kind, Oid dboid, Oid objoid)
+{
+	create_drop_transactional_internal(kind, dboid, objoid, /* create */ false);
+}
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
new file mode 100644
index 0000000..87c15b9
--- /dev/null
+++ b/src/backend/utils/activity/wait_event.c
@@ -0,0 +1,749 @@
+/* ----------
+ * wait_event.c
+ * Wait event reporting infrastructure.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/wait_event.c
+ *
+ * NOTES
+ *
+ * To make pgstat_report_wait_start() and pgstat_report_wait_end() as
+ * lightweight as possible, they do not check if shared memory (MyProc
+ * specifically, where the wait event is stored) is already available. Instead
+ * we initially set my_wait_event_info to a process local variable, which then
+ * is redirected to shared memory using pgstat_set_wait_event_storage(). For
+ * the same reason pgstat_track_activities is not checked - the check adds
+ * more work than it saves.
+ *
+ * ----------
+ */
+#include "postgres.h"
+
+#include "storage/lmgr.h" /* for GetLockNameFromTagType */
+#include "storage/lwlock.h" /* for GetLWLockIdentifier */
+#include "utils/wait_event.h"
+
+
+static const char *pgstat_get_wait_activity(WaitEventActivity w);
+static const char *pgstat_get_wait_client(WaitEventClient w);
+static const char *pgstat_get_wait_ipc(WaitEventIPC w);
+static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
+static const char *pgstat_get_wait_io(WaitEventIO w);
+
+
+/* fallback storage used until/after shared-memory storage is configured */
+static uint32 local_my_wait_event_info;
+
+/* where pgstat_report_wait_start()/end() write the current wait event */
+uint32	   *my_wait_event_info = &local_my_wait_event_info;
+
+
+/*
+ * Configure wait event reporting to report wait events to *wait_event_info.
+ * *wait_event_info needs to be valid until pgstat_reset_wait_event_storage()
+ * is called.
+ *
+ * Expected to be called during backend startup, to point my_wait_event_info
+ * into shared memory.
+ */
+void
+pgstat_set_wait_event_storage(uint32 *wait_event_info)
+{
+	my_wait_event_info = wait_event_info;
+}
+
+/*
+ * Reset wait event storage location.
+ *
+ * Expected to be called during backend shutdown, before the location set up
+ * pgstat_set_wait_event_storage() becomes invalid.  Redirects reporting back
+ * to the process-local fallback variable.
+ */
+void
+pgstat_reset_wait_event_storage(void)
+{
+	my_wait_event_info = &local_my_wait_event_info;
+}
+
+/* ----------
+ * pgstat_get_wait_event_type() -
+ *
+ *	Return a string representing the current wait event type, backend is
+ *	waiting on.  Returns NULL if the backend is not waiting.
+ */
+const char *
+pgstat_get_wait_event_type(uint32 wait_event_info)
+{
+	uint32		classId;
+	const char *event_type;
+
+	/* report process as not waiting. */
+	if (wait_event_info == 0)
+		return NULL;
+
+	/* the top byte of wait_event_info encodes the event class */
+	classId = wait_event_info & 0xFF000000;
+
+	switch (classId)
+	{
+		case PG_WAIT_LWLOCK:
+			event_type = "LWLock";
+			break;
+		case PG_WAIT_LOCK:
+			event_type = "Lock";
+			break;
+		case PG_WAIT_BUFFER_PIN:
+			event_type = "BufferPin";
+			break;
+		case PG_WAIT_ACTIVITY:
+			event_type = "Activity";
+			break;
+		case PG_WAIT_CLIENT:
+			event_type = "Client";
+			break;
+		case PG_WAIT_EXTENSION:
+			event_type = "Extension";
+			break;
+		case PG_WAIT_IPC:
+			event_type = "IPC";
+			break;
+		case PG_WAIT_TIMEOUT:
+			event_type = "Timeout";
+			break;
+		case PG_WAIT_IO:
+			event_type = "IO";
+			break;
+		default:
+			event_type = "???";
+			break;
+	}
+
+	return event_type;
+}
+
+/* ----------
+ * pgstat_get_wait_event() -
+ *
+ *	Return a string representing the current wait event, backend is
+ *	waiting on.  Returns NULL if the backend is not waiting.
+ */
+const char *
+pgstat_get_wait_event(uint32 wait_event_info)
+{
+	uint32		classId;
+	uint16		eventId;
+	const char *event_name;
+
+	/* report process as not waiting. */
+	if (wait_event_info == 0)
+		return NULL;
+
+	/* top byte: event class; bottom two bytes: event id within the class */
+	classId = wait_event_info & 0xFF000000;
+	eventId = wait_event_info & 0x0000FFFF;
+
+	switch (classId)
+	{
+		case PG_WAIT_LWLOCK:
+			event_name = GetLWLockIdentifier(classId, eventId);
+			break;
+		case PG_WAIT_LOCK:
+			event_name = GetLockNameFromTagType(eventId);
+			break;
+		case PG_WAIT_BUFFER_PIN:
+			event_name = "BufferPin";
+			break;
+		case PG_WAIT_ACTIVITY:
+			{
+				WaitEventActivity w = (WaitEventActivity) wait_event_info;
+
+				event_name = pgstat_get_wait_activity(w);
+				break;
+			}
+		case PG_WAIT_CLIENT:
+			{
+				WaitEventClient w = (WaitEventClient) wait_event_info;
+
+				event_name = pgstat_get_wait_client(w);
+				break;
+			}
+		case PG_WAIT_EXTENSION:
+			event_name = "Extension";
+			break;
+		case PG_WAIT_IPC:
+			{
+				WaitEventIPC w = (WaitEventIPC) wait_event_info;
+
+				event_name = pgstat_get_wait_ipc(w);
+				break;
+			}
+		case PG_WAIT_TIMEOUT:
+			{
+				WaitEventTimeout w = (WaitEventTimeout) wait_event_info;
+
+				event_name = pgstat_get_wait_timeout(w);
+				break;
+			}
+		case PG_WAIT_IO:
+			{
+				WaitEventIO w = (WaitEventIO) wait_event_info;
+
+				event_name = pgstat_get_wait_io(w);
+				break;
+			}
+		default:
+			event_name = "unknown wait event";
+			break;
+	}
+
+	return event_name;
+}
+
+/* ----------
+ * pgstat_get_wait_activity() -
+ *
+ *	Convert WaitEventActivity to string.
+ *
+ *	NOTE(review): these names appear to be user-visible identifiers (surfaced
+ *	by pgstat_get_wait_event()); keep spelling stable across versions.
+ * ----------
+ */
+static const char *
+pgstat_get_wait_activity(WaitEventActivity w)
+{
+	const char *event_name = "unknown wait event";
+
+	switch (w)
+	{
+		case WAIT_EVENT_ARCHIVER_MAIN:
+			event_name = "ArchiverMain";
+			break;
+		case WAIT_EVENT_AUTOVACUUM_MAIN:
+			event_name = "AutoVacuumMain";
+			break;
+		case WAIT_EVENT_BGWRITER_HIBERNATE:
+			event_name = "BgWriterHibernate";
+			break;
+		case WAIT_EVENT_BGWRITER_MAIN:
+			event_name = "BgWriterMain";
+			break;
+		case WAIT_EVENT_CHECKPOINTER_MAIN:
+			event_name = "CheckpointerMain";
+			break;
+		case WAIT_EVENT_LOGICAL_APPLY_MAIN:
+			event_name = "LogicalApplyMain";
+			break;
+		case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN:
+			event_name = "LogicalLauncherMain";
+			break;
+		case WAIT_EVENT_RECOVERY_WAL_STREAM:
+			event_name = "RecoveryWalStream";
+			break;
+		case WAIT_EVENT_SYSLOGGER_MAIN:
+			event_name = "SysLoggerMain";
+			break;
+		case WAIT_EVENT_WAL_RECEIVER_MAIN:
+			event_name = "WalReceiverMain";
+			break;
+		case WAIT_EVENT_WAL_SENDER_MAIN:
+			event_name = "WalSenderMain";
+			break;
+		case WAIT_EVENT_WAL_WRITER_MAIN:
+			event_name = "WalWriterMain";
+			break;
+			/* no default case, so that compiler will warn */
+	}
+
+	return event_name;
+}
+
+/* ----------
+ * pgstat_get_wait_client() -
+ *
+ *	Convert WaitEventClient to string.
+ * ----------
+ */
+static const char *
+pgstat_get_wait_client(WaitEventClient w)
+{
+	const char *event_name = "unknown wait event";
+
+	switch (w)
+	{
+		case WAIT_EVENT_CLIENT_READ:
+			event_name = "ClientRead";
+			break;
+		case WAIT_EVENT_CLIENT_WRITE:
+			event_name = "ClientWrite";
+			break;
+		case WAIT_EVENT_GSS_OPEN_SERVER:
+			event_name = "GSSOpenServer";
+			break;
+		case WAIT_EVENT_LIBPQWALRECEIVER_CONNECT:
+			event_name = "LibPQWalReceiverConnect";
+			break;
+		case WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE:
+			event_name = "LibPQWalReceiverReceive";
+			break;
+		case WAIT_EVENT_SSL_OPEN_SERVER:
+			event_name = "SSLOpenServer";
+			break;
+		case WAIT_EVENT_WAL_SENDER_WAIT_WAL:
+			event_name = "WalSenderWaitForWAL";
+			break;
+		case WAIT_EVENT_WAL_SENDER_WRITE_DATA:
+			event_name = "WalSenderWriteData";
+			break;
+			/* no default case, so that compiler will warn */
+	}
+
+	return event_name;
+}
+
+/* ----------
+ * pgstat_get_wait_ipc() -
+ *
+ *	Convert WaitEventIPC to string.
+ * ----------
+ */
+static const char *
+pgstat_get_wait_ipc(WaitEventIPC w)
+{
+	const char *event_name = "unknown wait event";
+
+	switch (w)
+	{
+		case WAIT_EVENT_APPEND_READY:
+			event_name = "AppendReady";
+			break;
+		case WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND:
+			event_name = "ArchiveCleanupCommand";
+			break;
+		case WAIT_EVENT_ARCHIVE_COMMAND:
+			event_name = "ArchiveCommand";
+			break;
+		case WAIT_EVENT_BACKEND_TERMINATION:
+			event_name = "BackendTermination";
+			break;
+		case WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE:
+			event_name = "BackupWaitWalArchive";
+			break;
+		case WAIT_EVENT_BGWORKER_SHUTDOWN:
+			event_name = "BgWorkerShutdown";
+			break;
+		case WAIT_EVENT_BGWORKER_STARTUP:
+			event_name = "BgWorkerStartup";
+			break;
+		case WAIT_EVENT_BTREE_PAGE:
+			event_name = "BtreePage";
+			break;
+		case WAIT_EVENT_BUFFER_IO:
+			event_name = "BufferIO";
+			break;
+		case WAIT_EVENT_CHECKPOINT_DONE:
+			event_name = "CheckpointDone";
+			break;
+		case WAIT_EVENT_CHECKPOINT_START:
+			event_name = "CheckpointStart";
+			break;
+		case WAIT_EVENT_EXECUTE_GATHER:
+			event_name = "ExecuteGather";
+			break;
+		case WAIT_EVENT_HASH_BATCH_ALLOCATE:
+			event_name = "HashBatchAllocate";
+			break;
+		case WAIT_EVENT_HASH_BATCH_ELECT:
+			event_name = "HashBatchElect";
+			break;
+		case WAIT_EVENT_HASH_BATCH_LOAD:
+			event_name = "HashBatchLoad";
+			break;
+		case WAIT_EVENT_HASH_BUILD_ALLOCATE:
+			event_name = "HashBuildAllocate";
+			break;
+		case WAIT_EVENT_HASH_BUILD_ELECT:
+			event_name = "HashBuildElect";
+			break;
+		case WAIT_EVENT_HASH_BUILD_HASH_INNER:
+			event_name = "HashBuildHashInner";
+			break;
+		case WAIT_EVENT_HASH_BUILD_HASH_OUTER:
+			event_name = "HashBuildHashOuter";
+			break;
+		case WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE:
+			event_name = "HashGrowBatchesAllocate";
+			break;
+		case WAIT_EVENT_HASH_GROW_BATCHES_DECIDE:
+			event_name = "HashGrowBatchesDecide";
+			break;
+		case WAIT_EVENT_HASH_GROW_BATCHES_ELECT:
+			event_name = "HashGrowBatchesElect";
+			break;
+		case WAIT_EVENT_HASH_GROW_BATCHES_FINISH:
+			event_name = "HashGrowBatchesFinish";
+			break;
+		case WAIT_EVENT_HASH_GROW_BATCHES_REPARTITION:
+			event_name = "HashGrowBatchesRepartition";
+			break;
+		case WAIT_EVENT_HASH_GROW_BUCKETS_ALLOCATE:
+			event_name = "HashGrowBucketsAllocate";
+			break;
+		case WAIT_EVENT_HASH_GROW_BUCKETS_ELECT:
+			event_name = "HashGrowBucketsElect";
+			break;
+		case WAIT_EVENT_HASH_GROW_BUCKETS_REINSERT:
+			event_name = "HashGrowBucketsReinsert";
+			break;
+		case WAIT_EVENT_LOGICAL_SYNC_DATA:
+			event_name = "LogicalSyncData";
+			break;
+		case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE:
+			event_name = "LogicalSyncStateChange";
+			break;
+		case WAIT_EVENT_MQ_INTERNAL:
+			event_name = "MessageQueueInternal";
+			break;
+		case WAIT_EVENT_MQ_PUT_MESSAGE:
+			event_name = "MessageQueuePutMessage";
+			break;
+		case WAIT_EVENT_MQ_RECEIVE:
+			event_name = "MessageQueueReceive";
+			break;
+		case WAIT_EVENT_MQ_SEND:
+			event_name = "MessageQueueSend";
+			break;
+		case WAIT_EVENT_PARALLEL_BITMAP_SCAN:
+			event_name = "ParallelBitmapScan";
+			break;
+		case WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN:
+			event_name = "ParallelCreateIndexScan";
+			break;
+		case WAIT_EVENT_PARALLEL_FINISH:
+			event_name = "ParallelFinish";
+			break;
+		case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
+			event_name = "ProcArrayGroupUpdate";
+			break;
+		case WAIT_EVENT_PROC_SIGNAL_BARRIER:
+			event_name = "ProcSignalBarrier";
+			break;
+		case WAIT_EVENT_PROMOTE:
+			event_name = "Promote";
+			break;
+		case WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT:
+			event_name = "RecoveryConflictSnapshot";
+			break;
+		case WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE:
+			event_name = "RecoveryConflictTablespace";
+			break;
+		case WAIT_EVENT_RECOVERY_END_COMMAND:
+			event_name = "RecoveryEndCommand";
+			break;
+		case WAIT_EVENT_RECOVERY_PAUSE:
+			event_name = "RecoveryPause";
+			break;
+		case WAIT_EVENT_REPLICATION_ORIGIN_DROP:
+			event_name = "ReplicationOriginDrop";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_DROP:
+			event_name = "ReplicationSlotDrop";
+			break;
+		case WAIT_EVENT_RESTORE_COMMAND:
+			event_name = "RestoreCommand";
+			break;
+		case WAIT_EVENT_SAFE_SNAPSHOT:
+			event_name = "SafeSnapshot";
+			break;
+		case WAIT_EVENT_SYNC_REP:
+			event_name = "SyncRep";
+			break;
+		case WAIT_EVENT_WAL_RECEIVER_EXIT:
+			event_name = "WalReceiverExit";
+			break;
+		case WAIT_EVENT_WAL_RECEIVER_WAIT_START:
+			event_name = "WalReceiverWaitStart";
+			break;
+		case WAIT_EVENT_XACT_GROUP_UPDATE:
+			event_name = "XactGroupUpdate";
+			break;
+			/* no default case, so that compiler will warn */
+	}
+
+	return event_name;
+}
+
+/* ----------
+ * pgstat_get_wait_timeout() -
+ *
+ *	Convert WaitEventTimeout to string.
+ * ----------
+ */
+static const char *
+pgstat_get_wait_timeout(WaitEventTimeout w)
+{
+	const char *event_name = "unknown wait event";
+
+	switch (w)
+	{
+		case WAIT_EVENT_BASE_BACKUP_THROTTLE:
+			event_name = "BaseBackupThrottle";
+			break;
+		case WAIT_EVENT_CHECKPOINT_WRITE_DELAY:
+			event_name = "CheckpointWriteDelay";
+			break;
+		case WAIT_EVENT_PG_SLEEP:
+			event_name = "PgSleep";
+			break;
+		case WAIT_EVENT_RECOVERY_APPLY_DELAY:
+			event_name = "RecoveryApplyDelay";
+			break;
+		case WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL:
+			event_name = "RecoveryRetrieveRetryInterval";
+			break;
+		case WAIT_EVENT_REGISTER_SYNC_REQUEST:
+			event_name = "RegisterSyncRequest";
+			break;
+		case WAIT_EVENT_VACUUM_DELAY:
+			event_name = "VacuumDelay";
+			break;
+		case WAIT_EVENT_VACUUM_TRUNCATE:
+			event_name = "VacuumTruncate";
+			break;
+			/* no default case, so that compiler will warn */
+	}
+
+	return event_name;
+}
+
+/* ----------
+ * pgstat_get_wait_io() -
+ *
+ *	Convert WaitEventIO to string.
+ * ----------
+ */
+static const char *
+pgstat_get_wait_io(WaitEventIO w)
+{
+	const char *event_name = "unknown wait event";
+
+	switch (w)
+	{
+		case WAIT_EVENT_BASEBACKUP_READ:
+			event_name = "BaseBackupRead";
+			break;
+		case WAIT_EVENT_BASEBACKUP_SYNC:
+			event_name = "BaseBackupSync";
+			break;
+		case WAIT_EVENT_BASEBACKUP_WRITE:
+			event_name = "BaseBackupWrite";
+			break;
+		case WAIT_EVENT_BUFFILE_READ:
+			event_name = "BufFileRead";
+			break;
+		case WAIT_EVENT_BUFFILE_WRITE:
+			event_name = "BufFileWrite";
+			break;
+		case WAIT_EVENT_BUFFILE_TRUNCATE:
+			event_name = "BufFileTruncate";
+			break;
+		case WAIT_EVENT_CONTROL_FILE_READ:
+			event_name = "ControlFileRead";
+			break;
+		case WAIT_EVENT_CONTROL_FILE_SYNC:
+			event_name = "ControlFileSync";
+			break;
+		case WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE:
+			event_name = "ControlFileSyncUpdate";
+			break;
+		case WAIT_EVENT_CONTROL_FILE_WRITE:
+			event_name = "ControlFileWrite";
+			break;
+		case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE:
+			event_name = "ControlFileWriteUpdate";
+			break;
+		case WAIT_EVENT_COPY_FILE_READ:
+			event_name = "CopyFileRead";
+			break;
+		case WAIT_EVENT_COPY_FILE_WRITE:
+			event_name = "CopyFileWrite";
+			break;
+		case WAIT_EVENT_DATA_FILE_EXTEND:
+			event_name = "DataFileExtend";
+			break;
+		case WAIT_EVENT_DATA_FILE_FLUSH:
+			event_name = "DataFileFlush";
+			break;
+		case WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC:
+			event_name = "DataFileImmediateSync";
+			break;
+		case WAIT_EVENT_DATA_FILE_PREFETCH:
+			event_name = "DataFilePrefetch";
+			break;
+		case WAIT_EVENT_DATA_FILE_READ:
+			event_name = "DataFileRead";
+			break;
+		case WAIT_EVENT_DATA_FILE_SYNC:
+			event_name = "DataFileSync";
+			break;
+		case WAIT_EVENT_DATA_FILE_TRUNCATE:
+			event_name = "DataFileTruncate";
+			break;
+		case WAIT_EVENT_DATA_FILE_WRITE:
+			event_name = "DataFileWrite";
+			break;
+		case WAIT_EVENT_DSM_FILL_ZERO_WRITE:
+			event_name = "DSMFillZeroWrite";
+			break;
+		case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ:
+			event_name = "LockFileAddToDataDirRead";
+			break;
+		case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC:
+			event_name = "LockFileAddToDataDirSync";
+			break;
+		case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE:
+			event_name = "LockFileAddToDataDirWrite";
+			break;
+		case WAIT_EVENT_LOCK_FILE_CREATE_READ:
+			event_name = "LockFileCreateRead";
+			break;
+		case WAIT_EVENT_LOCK_FILE_CREATE_SYNC:
+			event_name = "LockFileCreateSync";
+			break;
+		case WAIT_EVENT_LOCK_FILE_CREATE_WRITE:
+			event_name = "LockFileCreateWrite";
+			break;
+		case WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ:
+			event_name = "LockFileReCheckDataDirRead";
+			break;
+		case WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC:
+			event_name = "LogicalRewriteCheckpointSync";
+			break;
+		case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC:
+			event_name = "LogicalRewriteMappingSync";
+			break;
+		case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE:
+			event_name = "LogicalRewriteMappingWrite";
+			break;
+		case WAIT_EVENT_LOGICAL_REWRITE_SYNC:
+			event_name = "LogicalRewriteSync";
+			break;
+		case WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE:
+			event_name = "LogicalRewriteTruncate";
+			break;
+		case WAIT_EVENT_LOGICAL_REWRITE_WRITE:
+			event_name = "LogicalRewriteWrite";
+			break;
+		case WAIT_EVENT_RELATION_MAP_READ:
+			event_name = "RelationMapRead";
+			break;
+		case WAIT_EVENT_RELATION_MAP_SYNC:
+			event_name = "RelationMapSync";
+			break;
+		case WAIT_EVENT_RELATION_MAP_WRITE:
+			event_name = "RelationMapWrite";
+			break;
+		case WAIT_EVENT_REORDER_BUFFER_READ:
+			event_name = "ReorderBufferRead";
+			break;
+		case WAIT_EVENT_REORDER_BUFFER_WRITE:
+			event_name = "ReorderBufferWrite";
+			break;
+		case WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ:
+			event_name = "ReorderLogicalMappingRead";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_READ:
+			event_name = "ReplicationSlotRead";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC:
+			event_name = "ReplicationSlotRestoreSync";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_SYNC:
+			event_name = "ReplicationSlotSync";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_WRITE:
+			event_name = "ReplicationSlotWrite";
+			break;
+		case WAIT_EVENT_SLRU_FLUSH_SYNC:
+			event_name = "SLRUFlushSync";
+			break;
+		case WAIT_EVENT_SLRU_READ:
+			event_name = "SLRURead";
+			break;
+		case WAIT_EVENT_SLRU_SYNC:
+			event_name = "SLRUSync";
+			break;
+		case WAIT_EVENT_SLRU_WRITE:
+			event_name = "SLRUWrite";
+			break;
+		case WAIT_EVENT_SNAPBUILD_READ:
+			event_name = "SnapbuildRead";
+			break;
+		case WAIT_EVENT_SNAPBUILD_SYNC:
+			event_name = "SnapbuildSync";
+			break;
+		case WAIT_EVENT_SNAPBUILD_WRITE:
+			event_name = "SnapbuildWrite";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC:
+			event_name = "TimelineHistoryFileSync";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE:
+			event_name = "TimelineHistoryFileWrite";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_READ:
+			event_name = "TimelineHistoryRead";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_SYNC:
+			event_name = "TimelineHistorySync";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_WRITE:
+			event_name = "TimelineHistoryWrite";
+			break;
+		case WAIT_EVENT_TWOPHASE_FILE_READ:
+			event_name = "TwophaseFileRead";
+			break;
+		case WAIT_EVENT_TWOPHASE_FILE_SYNC:
+			event_name = "TwophaseFileSync";
+			break;
+		case WAIT_EVENT_TWOPHASE_FILE_WRITE:
+			event_name = "TwophaseFileWrite";
+			break;
+		case WAIT_EVENT_VERSION_FILE_WRITE:
+			event_name = "VersionFileWrite";
+			break;
+		case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ:
+			event_name = "WALSenderTimelineHistoryRead";
+			break;
+		case WAIT_EVENT_WAL_BOOTSTRAP_SYNC:
+			event_name = "WALBootstrapSync";
+			break;
+		case WAIT_EVENT_WAL_BOOTSTRAP_WRITE:
+			event_name = "WALBootstrapWrite";
+			break;
+		case WAIT_EVENT_WAL_COPY_READ:
+			event_name = "WALCopyRead";
+			break;
+		case WAIT_EVENT_WAL_COPY_SYNC:
+			event_name = "WALCopySync";
+			break;
+		case WAIT_EVENT_WAL_COPY_WRITE:
+			event_name = "WALCopyWrite";
+			break;
+		case WAIT_EVENT_WAL_INIT_SYNC:
+			event_name = "WALInitSync";
+			break;
+		case WAIT_EVENT_WAL_INIT_WRITE:
+			event_name = "WALInitWrite";
+			break;
+		case WAIT_EVENT_WAL_READ:
+			event_name = "WALRead";
+			break;
+		case WAIT_EVENT_WAL_SYNC:
+			event_name = "WALSync";
+			break;
+		case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
+			event_name = "WALSyncMethodAssign";
+			break;
+		case WAIT_EVENT_WAL_WRITE:
+			event_name = "WALWrite";
+			break;
+
+			/* no default case, so that compiler will warn */
+	}
+
+	return event_name;
+}