diff options
Diffstat (limited to 'src/backend/utils/activity/pgstat.c')
-rw-r--r-- | src/backend/utils/activity/pgstat.c | 1678 |
1 files changed, 1678 insertions, 0 deletions
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c new file mode 100644 index 0000000..84d65a7 --- /dev/null +++ b/src/backend/utils/activity/pgstat.c @@ -0,0 +1,1678 @@ +/* ---------- + * pgstat.c + * Infrastructure for the cumulative statistics system. + * + * The cumulative statistics system accumulates statistics for different kinds + * of objects. Some kinds of statistics are collected for a fixed number of + * objects (most commonly 1), e.g., checkpointer statistics. Other kinds of + * statistics are collected for a varying number of objects + * (e.g. relations). See PgStat_KindInfo for a list of currently handled + * statistics. + * + * Statistics are loaded from the filesystem during startup (by the startup + * process), unless preceded by a crash, in which case all stats are + * discarded. They are written out by the checkpointer process just before + * shutting down, except when shutting down in immediate mode. + * + * Fixed-numbered stats are stored in plain (non-dynamic) shared memory. + * + * Statistics for variable-numbered objects are stored in dynamic shared + * memory and can be found via a dshash hashtable. The statistics counters are + * not part of the dshash entry (PgStatShared_HashEntry) directly, but are + * separately allocated (PgStatShared_HashEntry->body). The separate + * allocation allows different kinds of statistics to be stored in the same + * hashtable without wasting space in PgStatShared_HashEntry. + * + * Variable-numbered stats are addressed by PgStat_HashKey while running. It + * is not possible to have statistics for an object that cannot be addressed + * that way at runtime. A wider identifier can be used when serializing to + * disk (used for replication slot stats). + * + * To avoid contention on the shared hashtable, each backend has a + * backend-local hashtable (pgStatEntryRefHash) in front of the shared + * hashtable, containing references (PgStat_EntryRef) to shared hashtable + * entries. The shared hashtable only needs to be accessed when no prior + * reference is found in the local hashtable. Besides pointing to the + * shared hashtable entry (PgStatShared_HashEntry) PgStat_EntryRef also + * contains a pointer to the shared statistics data, as a process-local + * address, to reduce access costs. + * + * The names for structs stored in shared memory are prefixed with + * PgStatShared instead of PgStat. Each stats entry in shared memory is + * protected by a dedicated lwlock. + * + * Most stats updates are first accumulated locally in each process as pending + * entries, then later flushed to shared memory (just after commit, or by + * idle-timeout). This practically eliminates contention on individual stats + * entries. For most kinds of variable-numbered pending stats data is stored + * in PgStat_EntryRef->pending. All entries with pending data are in the + * pgStatPending list. Pending statistics updates are flushed out by + * pgstat_report_stat(). + * + * The behavior of different kinds of statistics is determined by the kind's + * entry in pgstat_kind_infos, see PgStat_KindInfo for details. + * + * The consistency of read accesses to statistics can be configured using the + * stats_fetch_consistency GUC (see config.sgml and monitoring.sgml for the + * settings). When using PGSTAT_FETCH_CONSISTENCY_CACHE or + * PGSTAT_FETCH_CONSISTENCY_SNAPSHOT statistics are stored in + * pgStatLocal.snapshot. + * + * To keep things manageable, stats handling is split across several + * files. Infrastructure pieces are in: + * - pgstat.c - this file, to tie it all together + * - pgstat_shmem.c - nearly everything dealing with shared memory, including + * the maintenance of hashtable entries + * - pgstat_xact.c - transactional integration, including the transactional + * creation and dropping of stats entries + * + * Each statistics kind is handled in a dedicated file: + * - pgstat_archiver.c + * - pgstat_bgwriter.c + * - pgstat_checkpointer.c + * - pgstat_database.c + * - pgstat_function.c + * - pgstat_relation.c + * - pgstat_replslot.c + * - pgstat_slru.c + * - pgstat_subscription.c + * - pgstat_wal.c + * + * Whenever possible infrastructure files should not contain code related to + * specific kinds of stats. + * + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat.c + * ---------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/transam.h" +#include "access/xact.h" +#include "lib/dshash.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "storage/shmem.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" +#include "utils/timestamp.h" + + +/* ---------- + * Timer definitions. + * + * In milliseconds. + * ---------- + */ + +/* minimum interval non-forced stats flushes.*/ +#define PGSTAT_MIN_INTERVAL 1000 +/* how long until to block flushing pending stats updates */ +#define PGSTAT_MAX_INTERVAL 60000 +/* when to call pgstat_report_stat() again, even when idle */ +#define PGSTAT_IDLE_INTERVAL 10000 + +/* ---------- + * Initial size hints for the hash tables used in statistics. + * ---------- + */ + +#define PGSTAT_SNAPSHOT_HASH_SIZE 512 + + +/* hash table for statistics snapshots entry */ +typedef struct PgStat_SnapshotEntry +{ + PgStat_HashKey key; + char status; /* for simplehash use */ + void *data; /* the stats data itself */ +} PgStat_SnapshotEntry; + + +/* ---------- + * Backend-local Hash Table Definitions + * ---------- + */ + +/* for stats snapshot entries */ +#define SH_PREFIX pgstat_snapshot +#define SH_ELEMENT_TYPE PgStat_SnapshotEntry +#define SH_KEY_TYPE PgStat_HashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL) +#define SH_EQUAL(tb, a, b) \ + pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0 +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + + +/* ---------- + * Local function forward declarations + * ---------- + */ + +static void pgstat_write_statsfile(void); +static void pgstat_read_statsfile(void); + +static void pgstat_reset_after_failure(void); + +static bool pgstat_flush_pending_entries(bool nowait); + +static void pgstat_prep_snapshot(void); +static void pgstat_build_snapshot(void); +static void pgstat_build_snapshot_fixed(PgStat_Kind kind); + +static inline bool pgstat_is_kind_valid(int ikind); + + +/* ---------- + * GUC parameters + * ---------- + */ + +bool pgstat_track_counts = false; +int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; + + +/* ---------- + * state shared with pgstat_*.c + * ---------- + */ + +PgStat_LocalState pgStatLocal; + + +/* ---------- + * Local data + * + * NB: There should be only variables related to stats infrastructure here, + * not for specific kinds of stats. + * ---------- + */ + +/* + * Memory contexts containing the pgStatEntryRefHash table, the + * pgStatSharedRef entries, and pending data respectively. Mostly to make it + * easier to track / attribute memory usage. + */ + +static MemoryContext pgStatPendingContext = NULL; + +/* + * Backend local list of PgStat_EntryRef with unflushed pending stats. + * + * Newly pending entries should only ever be added to the end of the list, + * otherwise pgstat_flush_pending_entries() might not see them immediately. + */ +static dlist_head pgStatPending = DLIST_STATIC_INIT(pgStatPending); + + +/* + * Force the next stats flush to happen regardless of + * PGSTAT_MIN_INTERVAL. Useful in test scripts. + */ +static bool pgStatForceNextFlush = false; + +/* + * Force-clear existing snapshot before next use when stats_fetch_consistency + * is changed. + */ +static bool force_stats_snapshot_clear = false; + + +/* + * For assertions that check pgstat is not used before initialization / after + * shutdown. + */ +#ifdef USE_ASSERT_CHECKING +static bool pgstat_is_initialized = false; +static bool pgstat_is_shutdown = false; +#endif + + +/* + * The different kinds of statistics. + * + * If reasonably possible, handling specific to one kind of stats should go + * through this abstraction, rather than making more of pgstat.c aware. + * + * See comments for struct PgStat_KindInfo for details about the individual + * fields. + * + * XXX: It'd be nicer to define this outside of this file. But there doesn't + * seem to be a great way of doing that, given the split across multiple + * files. + */ +static const PgStat_KindInfo pgstat_kind_infos[PGSTAT_NUM_KINDS] = { + + /* stats kinds for variable-numbered objects */ + + [PGSTAT_KIND_DATABASE] = { + .name = "database", + + .fixed_amount = false, + /* so pg_stat_database entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Database), + .shared_data_off = offsetof(PgStatShared_Database, stats), + .shared_data_len = sizeof(((PgStatShared_Database *) 0)->stats), + .pending_size = sizeof(PgStat_StatDBEntry), + + .flush_pending_cb = pgstat_database_flush_cb, + .reset_timestamp_cb = pgstat_database_reset_timestamp_cb, + }, + + [PGSTAT_KIND_RELATION] = { + .name = "relation", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Relation), + .shared_data_off = offsetof(PgStatShared_Relation, stats), + .shared_data_len = sizeof(((PgStatShared_Relation *) 0)->stats), + .pending_size = sizeof(PgStat_TableStatus), + + .flush_pending_cb = pgstat_relation_flush_cb, + .delete_pending_cb = pgstat_relation_delete_pending_cb, + }, + + [PGSTAT_KIND_FUNCTION] = { + .name = "function", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Function), + .shared_data_off = offsetof(PgStatShared_Function, stats), + .shared_data_len = sizeof(((PgStatShared_Function *) 0)->stats), + .pending_size = sizeof(PgStat_BackendFunctionEntry), + + .flush_pending_cb = pgstat_function_flush_cb, + }, + + [PGSTAT_KIND_REPLSLOT] = { + .name = "replslot", + + .fixed_amount = false, + + .accessed_across_databases = true, + .named_on_disk = true, + + .shared_size = sizeof(PgStatShared_ReplSlot), + .shared_data_off = offsetof(PgStatShared_ReplSlot, stats), + .shared_data_len = sizeof(((PgStatShared_ReplSlot *) 0)->stats), + + .reset_timestamp_cb = pgstat_replslot_reset_timestamp_cb, + .to_serialized_name = pgstat_replslot_to_serialized_name_cb, + .from_serialized_name = pgstat_replslot_from_serialized_name_cb, + }, + + [PGSTAT_KIND_SUBSCRIPTION] = { + .name = "subscription", + + .fixed_amount = false, + /* so pg_stat_subscription_stats entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Subscription), + .shared_data_off = offsetof(PgStatShared_Subscription, stats), + .shared_data_len = sizeof(((PgStatShared_Subscription *) 0)->stats), + .pending_size = sizeof(PgStat_BackendSubEntry), + + .flush_pending_cb = pgstat_subscription_flush_cb, + .reset_timestamp_cb = pgstat_subscription_reset_timestamp_cb, + }, + + + /* stats for fixed-numbered (mostly 1) objects */ + + [PGSTAT_KIND_ARCHIVER] = { + .name = "archiver", + + .fixed_amount = true, + + .reset_all_cb = pgstat_archiver_reset_all_cb, + .snapshot_cb = pgstat_archiver_snapshot_cb, + }, + + [PGSTAT_KIND_BGWRITER] = { + .name = "bgwriter", + + .fixed_amount = true, + + .reset_all_cb = pgstat_bgwriter_reset_all_cb, + .snapshot_cb = pgstat_bgwriter_snapshot_cb, + }, + + [PGSTAT_KIND_CHECKPOINTER] = { + .name = "checkpointer", + + .fixed_amount = true, + + .reset_all_cb = pgstat_checkpointer_reset_all_cb, + .snapshot_cb = pgstat_checkpointer_snapshot_cb, + }, + + [PGSTAT_KIND_SLRU] = { + .name = "slru", + + .fixed_amount = true, + + .reset_all_cb = pgstat_slru_reset_all_cb, + .snapshot_cb = pgstat_slru_snapshot_cb, + }, + + [PGSTAT_KIND_WAL] = { + .name = "wal", + + .fixed_amount = true, + + .reset_all_cb = pgstat_wal_reset_all_cb, + .snapshot_cb = pgstat_wal_snapshot_cb, + }, +}; + + +/* ------------------------------------------------------------ + * Functions managing the state of the stats system for all backends. + * ------------------------------------------------------------ + */ + +/* + * Read on-disk stats into memory at server start. + * + * Should only be called by the startup process or in single user mode. + */ +void +pgstat_restore_stats(void) +{ + pgstat_read_statsfile(); +} + +/* + * Remove the stats file. This is currently used only if WAL recovery is + * needed after a crash. + * + * Should only be called by the startup process or in single user mode. + */ +void +pgstat_discard_stats(void) +{ + int ret; + + /* NB: this needs to be done even in single user mode */ + + ret = unlink(PGSTAT_STAT_PERMANENT_FILENAME); + if (ret != 0) + { + if (errno == ENOENT) + elog(DEBUG2, + "didn't need to unlink permanent stats file \"%s\" - didn't exist", + PGSTAT_STAT_PERMANENT_FILENAME); + else + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not unlink permanent statistics file \"%s\": %m", + PGSTAT_STAT_PERMANENT_FILENAME))); + } + else + { + ereport(DEBUG2, + (errcode_for_file_access(), + errmsg_internal("unlinked permanent statistics file \"%s\"", + PGSTAT_STAT_PERMANENT_FILENAME))); + } + + /* + * Reset stats contents. This will set reset timestamps of fixed-numbered + * stats to the current time (no variable stats exist). + */ + pgstat_reset_after_failure(); +} + +/* + * pgstat_before_server_shutdown() needs to be called by exactly one process + * during regular server shutdowns. Otherwise all stats will be lost. + * + * We currently only write out stats for proc_exit(0). We might want to change + * that at some point... But right now pgstat_discard_stats() would be called + * during the start after a disorderly shutdown, anyway. + */ +void +pgstat_before_server_shutdown(int code, Datum arg) +{ + Assert(pgStatLocal.shmem != NULL); + Assert(!pgStatLocal.shmem->is_shutdown); + + /* + * Stats should only be reported after pgstat_initialize() and before + * pgstat_shutdown(). This is a convenient point to catch most violations + * of this rule. + */ + Assert(pgstat_is_initialized && !pgstat_is_shutdown); + + /* flush out our own pending changes before writing out */ + pgstat_report_stat(true); + + /* + * Only write out file during normal shutdown. Don't even signal that + * we've shutdown during irregular shutdowns, because the shutdown + * sequence isn't coordinated to ensure this backend shuts down last. + */ + if (code == 0) + { + pgStatLocal.shmem->is_shutdown = true; + pgstat_write_statsfile(); + } +} + + +/* ------------------------------------------------------------ + * Backend initialization / shutdown functions + * ------------------------------------------------------------ + */ + +/* + * Shut down a single backend's statistics reporting at process exit. + * + * Flush out any remaining statistics counts. Without this, operations + * triggered during backend exit (such as temp table deletions) won't be + * counted. + */ +static void +pgstat_shutdown_hook(int code, Datum arg) +{ + Assert(!pgstat_is_shutdown); + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + + /* + * If we got as far as discovering our own database ID, we can flush out + * what we did so far. Otherwise, we'd be reporting an invalid database + * ID, so forget it. (This means that accesses to pg_database during + * failed backend starts might never get counted.) + */ + if (OidIsValid(MyDatabaseId)) + pgstat_report_disconnect(MyDatabaseId); + + pgstat_report_stat(true); + + /* there shouldn't be any pending changes left */ + Assert(dlist_is_empty(&pgStatPending)); + dlist_init(&pgStatPending); + + pgstat_detach_shmem(); + +#ifdef USE_ASSERT_CHECKING + pgstat_is_shutdown = true; +#endif +} + +/* + * Initialize pgstats state, and set up our on-proc-exit hook. Called from + * BaseInit(). + * + * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful. + */ +void +pgstat_initialize(void) +{ + Assert(!pgstat_is_initialized); + + pgstat_attach_shmem(); + + pgstat_init_wal(); + + /* Set up a process-exit hook to clean up */ + before_shmem_exit(pgstat_shutdown_hook, 0); + +#ifdef USE_ASSERT_CHECKING + pgstat_is_initialized = true; +#endif +} + + +/* ------------------------------------------------------------ + * Public functions used by backends follow + * ------------------------------------------------------------ + */ + +/* + * Must be called by processes that performs DML: tcop/postgres.c, logical + * receiver processes, SPI worker, etc. to flush pending statistics updates to + * shared memory. + * + * Unless called with 'force', pending stats updates are flushed happen once + * per PGSTAT_MIN_INTERVAL (1000ms). When not forced, stats flushes do not + * block on lock acquisition, except if stats updates have been pending for + * longer than PGSTAT_MAX_INTERVAL (60000ms). + * + * Whenever pending stats updates remain at the end of pgstat_report_stat() a + * suggested idle timeout is returned. Currently this is always + * PGSTAT_IDLE_INTERVAL (10000ms). Callers can use the returned time to set up + * a timeout after which to call pgstat_report_stat(true), but are not + * required to to do so. + * + * Note that this is called only when not within a transaction, so it is fair + * to use transaction stop time as an approximation of current time. + */ +long +pgstat_report_stat(bool force) +{ + static TimestampTz pending_since = 0; + static TimestampTz last_flush = 0; + bool partial_flush; + TimestampTz now; + bool nowait; + + pgstat_assert_is_up(); + Assert(!IsTransactionOrTransactionBlock()); + + /* "absorb" the forced flush even if there's nothing to flush */ + if (pgStatForceNextFlush) + { + force = true; + pgStatForceNextFlush = false; + } + + /* Don't expend a clock check if nothing to do */ + if (dlist_is_empty(&pgStatPending) && + !have_slrustats && + !pgstat_have_pending_wal()) + { + Assert(pending_since == 0); + return 0; + } + + /* + * There should never be stats to report once stats are shut down. Can't + * assert that before the checks above, as there is an unconditional + * pgstat_report_stat() call in pgstat_shutdown_hook() - which at least + * the process that ran pgstat_before_server_shutdown() will still call. + */ + Assert(!pgStatLocal.shmem->is_shutdown); + + now = GetCurrentTransactionStopTimestamp(); + + if (!force) + { + if (pending_since > 0 && + TimestampDifferenceExceeds(pending_since, now, PGSTAT_MAX_INTERVAL)) + { + /* don't keep pending updates longer than PGSTAT_MAX_INTERVAL */ + force = true; + } + else if (last_flush > 0 && + !TimestampDifferenceExceeds(last_flush, now, PGSTAT_MIN_INTERVAL)) + { + /* don't flush too frequently */ + if (pending_since == 0) + pending_since = now; + + return PGSTAT_IDLE_INTERVAL; + } + } + + pgstat_update_dbstats(now); + + /* don't wait for lock acquisition when !force */ + nowait = !force; + + partial_flush = false; + + /* flush database / relation / function / ... stats */ + partial_flush |= pgstat_flush_pending_entries(nowait); + + /* flush wal stats */ + partial_flush |= pgstat_flush_wal(nowait); + + /* flush SLRU stats */ + partial_flush |= pgstat_slru_flush(nowait); + + last_flush = now; + + /* + * If some of the pending stats could not be flushed due to lock + * contention, let the caller know when to retry. + */ + if (partial_flush) + { + /* force should have prevented us from getting here */ + Assert(!force); + + /* remember since when stats have been pending */ + if (pending_since == 0) + pending_since = now; + + return PGSTAT_IDLE_INTERVAL; + } + + pending_since = 0; + + return 0; +} + +/* + * Force locally pending stats to be flushed during the next + * pgstat_report_stat() call. This is useful for writing tests. + */ +void +pgstat_force_next_flush(void) +{ + pgStatForceNextFlush = true; +} + +/* + * Only for use by pgstat_reset_counters() + */ +static bool +match_db_entries(PgStatShared_HashEntry *entry, Datum match_data) +{ + return entry->key.dboid == DatumGetObjectId(MyDatabaseId); +} + +/* + * Reset counters for our database. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +void +pgstat_reset_counters(void) +{ + TimestampTz ts = GetCurrentTimestamp(); + + pgstat_reset_matching_entries(match_db_entries, + ObjectIdGetDatum(MyDatabaseId), + ts); +} + +/* + * Reset a single variable-numbered entry. + * + * If the stats kind is within a database, also reset the database's + * stat_reset_timestamp. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +void +pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); + + /* not needed atm, and doesn't make sense with the current signature */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + /* reset the "single counter" */ + pgstat_reset_entry(kind, dboid, objoid, ts); + + if (!kind_info->accessed_across_databases) + pgstat_reset_database_timestamp(dboid, ts); +} + +/* + * Reset stats for all entries of a kind. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +void +pgstat_reset_of_kind(PgStat_Kind kind) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); + + if (kind_info->fixed_amount) + kind_info->reset_all_cb(ts); + else + pgstat_reset_entries_of_kind(kind, ts); +} + + +/* ------------------------------------------------------------ + * Fetching of stats + * ------------------------------------------------------------ + */ + +/* + * Discard any data collected in the current transaction. Any subsequent + * request will cause new snapshots to be read. + * + * This is also invoked during transaction commit or abort to discard + * the no-longer-wanted snapshot. Updates of stats_fetch_consistency can + * cause this routine to be called. + */ +void +pgstat_clear_snapshot(void) +{ + pgstat_assert_is_up(); + + memset(&pgStatLocal.snapshot.fixed_valid, 0, + sizeof(pgStatLocal.snapshot.fixed_valid)); + pgStatLocal.snapshot.stats = NULL; + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_NONE; + + /* Release memory, if any was allocated */ + if (pgStatLocal.snapshot.context) + { + MemoryContextDelete(pgStatLocal.snapshot.context); + + /* Reset variables */ + pgStatLocal.snapshot.context = NULL; + } + + /* + * Historically the backend_status.c facilities lived in this file, and + * were reset with the same function. For now keep it that way, and + * forward the reset request. + */ + pgstat_clear_backend_activity_snapshot(); + + /* Reset this flag, as it may be possible that a cleanup was forced. */ + force_stats_snapshot_clear = false; +} + +void * +pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + PgStat_HashKey key; + PgStat_EntryRef *entry_ref; + void *stats_data; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + /* should be called from backends */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + AssertArg(!kind_info->fixed_amount); + + pgstat_prep_snapshot(); + + key.kind = kind; + key.dboid = dboid; + key.objoid = objoid; + + /* if we need to build a full snapshot, do so */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); + + /* if caching is desired, look up in cache */ + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) + { + PgStat_SnapshotEntry *entry = NULL; + + entry = pgstat_snapshot_lookup(pgStatLocal.snapshot.stats, key); + + if (entry) + return entry->data; + + /* + * If we built a full snapshot and the key is not in + * pgStatLocal.snapshot.stats, there are no matching stats. + */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return NULL; + } + + pgStatLocal.snapshot.mode = pgstat_fetch_consistency; + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->shared_entry->dropped) + { + /* create empty entry when using PGSTAT_FETCH_CONSISTENCY_CACHE */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE) + { + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + Assert(!found); + entry->data = NULL; + } + return NULL; + } + + /* + * Allocate in caller's context for PGSTAT_FETCH_CONSISTENCY_NONE, + * otherwise we could quickly end up with a fair bit of memory used due to + * repeated accesses. + */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + stats_data = palloc(kind_info->shared_data_len); + else + stats_data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_data_len); + + pgstat_lock_entry_shared(entry_ref, false); + memcpy(stats_data, + pgstat_get_entry_data(kind, entry_ref->shared_stats), + kind_info->shared_data_len); + pgstat_unlock_entry(entry_ref); + + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) + { + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + entry->data = stats_data; + } + + return stats_data; +} + +/* + * If a stats snapshot has been taken, return the timestamp at which that was + * done, and set *have_snapshot to true. Otherwise *have_snapshot is set to + * false. + */ +TimestampTz +pgstat_get_stat_snapshot_timestamp(bool *have_snapshot) +{ + if (force_stats_snapshot_clear) + pgstat_clear_snapshot(); + + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + { + *have_snapshot = true; + return pgStatLocal.snapshot.snapshot_timestamp; + } + + *have_snapshot = false; + + return 0; +} + +bool +pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + /* fixed-numbered stats always exist */ + if (pgstat_get_kind_info(kind)->fixed_amount) + return true; + + return pgstat_get_entry_ref(kind, dboid, objoid, false, NULL) != NULL; +} + +/* + * Ensure snapshot for fixed-numbered 'kind' exists. + * + * Typically used by the pgstat_fetch_* functions for a kind of stats, before + * massaging the data into the desired format. + */ +void +pgstat_snapshot_fixed(PgStat_Kind kind) +{ + AssertArg(pgstat_is_kind_valid(kind)); + AssertArg(pgstat_get_kind_info(kind)->fixed_amount); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); + else + pgstat_build_snapshot_fixed(kind); + + Assert(pgStatLocal.snapshot.fixed_valid[kind]); +} + +static void +pgstat_prep_snapshot(void) +{ + if (force_stats_snapshot_clear) + pgstat_clear_snapshot(); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE || + pgStatLocal.snapshot.stats != NULL) + return; + + if (!pgStatLocal.snapshot.context) + pgStatLocal.snapshot.context = AllocSetContextCreate(TopMemoryContext, + "PgStat Snapshot", + ALLOCSET_SMALL_SIZES); + + pgStatLocal.snapshot.stats = + pgstat_snapshot_create(pgStatLocal.snapshot.context, + PGSTAT_SNAPSHOT_HASH_SIZE, + NULL); +} + +static void +pgstat_build_snapshot(void) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* should only be called when we need a snapshot */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT); + + /* snapshot already built */ + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return; + + pgstat_prep_snapshot(); + + Assert(pgStatLocal.snapshot.stats->members == 0); + + pgStatLocal.snapshot.snapshot_timestamp = GetCurrentTimestamp(); + + /* + * Snapshot all variable stats. + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStat_Kind kind = p->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool found; + PgStat_SnapshotEntry *entry; + PgStatShared_Common *stats_data; + + /* + * Check if the stats object should be included in the snapshot. + * Unless the stats kind can be accessed from all databases (e.g., + * database stats themselves), we only include stats for the current + * database or objects not associated with a database (e.g. shared + * relations). + */ + if (p->key.dboid != MyDatabaseId && + p->key.dboid != InvalidOid && + !kind_info->accessed_across_databases) + continue; + + if (p->dropped) + continue; + + Assert(pg_atomic_read_u32(&p->refcount) > 0); + + stats_data = dsa_get_address(pgStatLocal.dsa, p->body); + Assert(stats_data); + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, p->key, &found); + Assert(!found); + + entry->data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_size); + /* + * Acquire the LWLock directly instead of using + * pg_stat_lock_entry_shared() which requires a reference. + */ + LWLockAcquire(&stats_data->lock, LW_SHARED); + memcpy(entry->data, + pgstat_get_entry_data(kind, stats_data), + kind_info->shared_size); + LWLockRelease(&stats_data->lock); + } + dshash_seq_term(&hstat); + + /* + * Build snapshot of all fixed-numbered stats. + */ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->fixed_amount) + { + Assert(kind_info->snapshot_cb == NULL); + continue; + } + + pgstat_build_snapshot_fixed(kind); + } + + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_SNAPSHOT; +} + +static void +pgstat_build_snapshot_fixed(PgStat_Kind kind) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + Assert(kind_info->fixed_amount); + Assert(kind_info->snapshot_cb != NULL); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + { + /* rebuild every time */ + pgStatLocal.snapshot.fixed_valid[kind] = false; + } + else if (pgStatLocal.snapshot.fixed_valid[kind]) + { + /* in snapshot mode we shouldn't get called again */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE); + return; + } + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + + kind_info->snapshot_cb(); + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + pgStatLocal.snapshot.fixed_valid[kind] = true; +} + + +/* ------------------------------------------------------------ + * Backend-local pending stats infrastructure + * ------------------------------------------------------------ + */ + +/* + * Returns the appropriate PgStat_EntryRef, preparing it to receive pending + * stats if not already done. + * + * If created_entry is non-NULL, it'll be set to true if the entry is newly + * created, false otherwise. + */ +PgStat_EntryRef * +pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry) +{ + PgStat_EntryRef *entry_ref; + + /* need to be able to flush out */ + Assert(pgstat_get_kind_info(kind)->flush_pending_cb != NULL); + + if (unlikely(!pgStatPendingContext)) + { + pgStatPendingContext = + AllocSetContextCreate(TopMemoryContext, + "PgStat Pending", + ALLOCSET_SMALL_SIZES); + } + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, + true, created_entry); + + if (entry_ref->pending == NULL) + { + size_t entrysize = pgstat_get_kind_info(kind)->pending_size; + + Assert(entrysize != (size_t) -1); + + entry_ref->pending = MemoryContextAllocZero(pgStatPendingContext, entrysize); + dlist_push_tail(&pgStatPending, &entry_ref->pending_node); + } + + return entry_ref; +} + +/* + * Return an existing stats entry, or NULL. + * + * This should only be used for helper function for pgstatfuncs.c - outside of + * that it shouldn't be needed. + */ +PgStat_EntryRef * +pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->pending == NULL) + return NULL; + + return entry_ref; +} + +void +pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref) +{ + PgStat_Kind kind = entry_ref->shared_entry->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + void *pending_data = entry_ref->pending; + + Assert(pending_data != NULL); + /* !fixed_amount stats should be handled explicitly */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + if (kind_info->delete_pending_cb) + kind_info->delete_pending_cb(entry_ref); + + pfree(pending_data); + entry_ref->pending = NULL; + + dlist_delete(&entry_ref->pending_node); +} + +/* + * Flush out pending stats for database objects (databases, relations, + * functions). + */ +static bool +pgstat_flush_pending_entries(bool nowait) +{ + bool have_pending = false; + dlist_node *cur = NULL; + + /* + * Need to be a bit careful iterating over the list of pending entries. + * Processing a pending entry may queue further pending entries to the end + * of the list that we want to process, so a simple iteration won't do. + * Further complicating matters is that we want to delete the current + * entry in each iteration from the list if we flushed successfully. + * + * So we just keep track of the next pointer in each loop iteration. + */ + if (!dlist_is_empty(&pgStatPending)) + cur = dlist_head_node(&pgStatPending); + + while (cur) + { + PgStat_EntryRef *entry_ref = + dlist_container(PgStat_EntryRef, pending_node, cur); + PgStat_HashKey key = entry_ref->shared_entry->key; + PgStat_Kind kind = key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool did_flush; + dlist_node *next; + + Assert(!kind_info->fixed_amount); + Assert(kind_info->flush_pending_cb != NULL); + + /* flush the stats, if possible */ + did_flush = kind_info->flush_pending_cb(entry_ref, nowait); + + Assert(did_flush || nowait); + + /* determine next entry, before deleting the pending entry */ + if (dlist_has_next(&pgStatPending, cur)) + next = dlist_next_node(&pgStatPending, cur); + else + next = NULL; + + /* if successfully flushed, remove entry */ + if (did_flush) + pgstat_delete_pending_entry(entry_ref); + else + have_pending = true; + + cur = next; + } + + Assert(dlist_is_empty(&pgStatPending) == !have_pending); + + return have_pending; +} + + +/* ------------------------------------------------------------ + * Helper / infrastructure functions + * ------------------------------------------------------------ + */ + +PgStat_Kind +pgstat_get_kind_from_str(char *kind_str) +{ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + if (pg_strcasecmp(kind_str, pgstat_kind_infos[kind].name) == 0) + return kind; + } + + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid statistics kind: \"%s\"", kind_str))); + return PGSTAT_KIND_DATABASE; /* avoid compiler warnings */ +} + +static inline bool +pgstat_is_kind_valid(int ikind) +{ + return ikind >= PGSTAT_KIND_FIRST_VALID && ikind <= PGSTAT_KIND_LAST; +} + +const PgStat_KindInfo * +pgstat_get_kind_info(PgStat_Kind kind) +{ + AssertArg(pgstat_is_kind_valid(kind)); + + return &pgstat_kind_infos[kind]; +} + +/* + * Stats should only be reported after pgstat_initialize() and before + * pgstat_shutdown(). This check is put in a few central places to catch + * violations of this rule more easily. + */ +#ifdef USE_ASSERT_CHECKING +void +pgstat_assert_is_up(void) +{ + Assert(pgstat_is_initialized && !pgstat_is_shutdown); +} +#endif + + +/* ------------------------------------------------------------ + * reading and writing of on-disk stats file + * ------------------------------------------------------------ + */ + +/* helpers for pgstat_write_statsfile() */ +static void +write_chunk(FILE *fpout, void *ptr, size_t len) +{ + int rc; + + rc = fwrite(ptr, len, 1, fpout); + + /* we'll check for errors with ferror once at the end */ + (void) rc; +} + +#define write_chunk_s(fpout, ptr) write_chunk(fpout, ptr, sizeof(*ptr)) + +/* + * This function is called in the last process that is accessing the shared + * stats so locking is not required. + */ +static void +pgstat_write_statsfile(void) +{ + FILE *fpout; + int32 format_id; + const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; + dshash_seq_status hstat; + PgStatShared_HashEntry *ps; + + pgstat_assert_is_up(); + + /* we're shutting down, so it's ok to just override this */ + pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE; + + elog(DEBUG2, "writing stats file \"%s\"", statfile); + + /* + * Open the statistics temp file to write out the current values. + */ + fpout = AllocateFile(tmpfile, PG_BINARY_W); + if (fpout == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open temporary statistics file \"%s\": %m", + tmpfile))); + return; + } + + /* + * Write the file header --- currently just a format ID. + */ + format_id = PGSTAT_FILE_FORMAT_ID; + write_chunk_s(fpout, &format_id); + + /* + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. + */ + + /* + * Write archiver stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + write_chunk_s(fpout, &pgStatLocal.snapshot.archiver); + + /* + * Write bgwriter stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_BGWRITER); + write_chunk_s(fpout, &pgStatLocal.snapshot.bgwriter); + + /* + * Write checkpointer stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + write_chunk_s(fpout, &pgStatLocal.snapshot.checkpointer); + + /* + * Write SLRU stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_SLRU); + write_chunk_s(fpout, &pgStatLocal.snapshot.slru); + + /* + * Write WAL stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_WAL); + write_chunk_s(fpout, &pgStatLocal.snapshot.wal); + + /* + * Walk through the stats entries + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((ps = dshash_seq_next(&hstat)) != NULL) + { + PgStatShared_Common *shstats; + const PgStat_KindInfo *kind_info = NULL; + + CHECK_FOR_INTERRUPTS(); + + /* we may have some "dropped" entries not yet removed, skip them */ + Assert(!ps->dropped); + if (ps->dropped) + continue; + + shstats = (PgStatShared_Common *) dsa_get_address(pgStatLocal.dsa, ps->body); + + kind_info = pgstat_get_kind_info(ps->key.kind); + + /* if not dropped the valid-entry refcount should exist */ + Assert(pg_atomic_read_u32(&ps->refcount) > 0); + + if (!kind_info->to_serialized_name) + { + /* normal stats entry, identified by PgStat_HashKey */ + fputc('S', fpout); + write_chunk_s(fpout, &ps->key); + } + else + { + /* stats entry identified by name on disk (e.g. slots) */ + NameData name; + + kind_info->to_serialized_name(&ps->key, shstats, &name); + + fputc('N', fpout); + write_chunk_s(fpout, &ps->key.kind); + write_chunk_s(fpout, &name); + } + + /* Write except the header part of the entry */ + write_chunk(fpout, + pgstat_get_entry_data(ps->key.kind, shstats), + pgstat_get_entry_len(ps->key.kind)); + } + dshash_seq_term(&hstat); + + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite (in write_chunk()) above. + */ + fputc('E', fpout); + + if (ferror(fpout)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write temporary statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } +} + +/* helpers for pgstat_read_statsfile() */ +static bool +read_chunk(FILE *fpin, void *ptr, size_t len) +{ + return fread(ptr, 1, len, fpin) == len; +} + +#define read_chunk_s(fpin, ptr) read_chunk(fpin, ptr, sizeof(*ptr)) + +/* + * Reads in existing statistics file into the shared stats hash. + * + * This function is called in the only process that is accessing the shared + * stats so locking is not required. + */ +static void +pgstat_read_statsfile(void) +{ + FILE *fpin; + int32 format_id; + bool found; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; + PgStat_ShmemControl *shmem = pgStatLocal.shmem; + + /* shouldn't be called from postmaster */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + + elog(DEBUG2, "reading stats file \"%s\"", statfile); + + /* + * Try to open the stats file. If it doesn't exist, the backends simply + * returns zero for anything and statistics simply starts from scratch + * with empty counters. + * + * ENOENT is a possibility if stats collection was previously disabled or + * has not yet written the stats file for the first time. Any other + * failure condition is suspicious. + */ + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); + pgstat_reset_after_failure(); + return; + } + + /* + * Verify it's of the expected format. + */ + if (!read_chunk_s(fpin, &format_id) || + format_id != PGSTAT_FILE_FORMAT_ID) + goto error; + + /* + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. + */ + + /* + * Read archiver stats struct + */ + if (!read_chunk_s(fpin, &shmem->archiver.stats)) + goto error; + + /* + * Read bgwriter stats struct + */ + if (!read_chunk_s(fpin, &shmem->bgwriter.stats)) + goto error; + + /* + * Read checkpointer stats struct + */ + if (!read_chunk_s(fpin, &shmem->checkpointer.stats)) + goto error; + + /* + * Read SLRU stats struct + */ + if (!read_chunk_s(fpin, &shmem->slru.stats)) + goto error; + + /* + * Read WAL stats struct + */ + if (!read_chunk_s(fpin, &shmem->wal.stats)) + goto error; + + /* + * We found an existing statistics file. Read it and put all the hash + * table entries into place. + */ + for (;;) + { + int t = fgetc(fpin); + + switch (t) + { + case 'S': + case 'N': + { + PgStat_HashKey key; + PgStatShared_HashEntry *p; + PgStatShared_Common *header; + + CHECK_FOR_INTERRUPTS(); + + if (t == 'S') + { + /* normal stats entry, identified by PgStat_HashKey */ + if (!read_chunk_s(fpin, &key)) + goto error; + + if (!pgstat_is_kind_valid(key.kind)) + goto error; + } + else + { + /* stats entry identified by name on disk (e.g. slots) */ + const PgStat_KindInfo *kind_info = NULL; + PgStat_Kind kind; + NameData name; + + if (!read_chunk_s(fpin, &kind)) + goto error; + if (!read_chunk_s(fpin, &name)) + goto error; + if (!pgstat_is_kind_valid(kind)) + goto error; + + kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->from_serialized_name) + goto error; + + if (!kind_info->from_serialized_name(&name, &key)) + { + /* skip over data for entry we don't care about */ + if (fseek(fpin, pgstat_get_entry_len(kind), SEEK_CUR) != 0) + goto error; + + continue; + } + + Assert(key.kind == kind); + } + + /* + * This intentionally doesn't use pgstat_get_entry_ref() - + * putting all stats into checkpointer's + * pgStatEntryRefHash would be wasted effort and memory. + */ + p = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &found); + + /* don't allow duplicate entries */ + if (found) + { + dshash_release_lock(pgStatLocal.shared_hash, p); + elog(WARNING, "found duplicate stats entry %d/%u/%u", + key.kind, key.dboid, key.objoid); + goto error; + } + + header = pgstat_init_entry(key.kind, p); + dshash_release_lock(pgStatLocal.shared_hash, p); + + if (!read_chunk(fpin, + pgstat_get_entry_data(key.kind, header), + pgstat_get_entry_len(key.kind))) + goto error; + + break; + } + case 'E': + /* check that 'E' actually signals end of file */ + if (fgetc(fpin) != EOF) + goto error; + + goto done; + + default: + goto error; + } + } + +done: + FreeFile(fpin); + + elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); + unlink(statfile); + + return; + +error: + ereport(LOG, + (errmsg("corrupted statistics file \"%s\"", statfile))); + + pgstat_reset_after_failure(); + + goto done; +} + +/* + * Helper to reset / drop stats after a crash or after restoring stats from + * disk failed, potentially after already loading parts. + */ +static void +pgstat_reset_after_failure(void) +{ + TimestampTz ts = GetCurrentTimestamp(); + + /* reset fixed-numbered stats */ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->fixed_amount) + continue; + + kind_info->reset_all_cb(ts); + } + + /* and drop variable-numbered ones */ + pgstat_drop_all_entries(); +} + +/* + * GUC assign_hook for stats_fetch_consistency. + */ +void +assign_stats_fetch_consistency(int newval, void *extra) +{ + /* + * Changing this value in a transaction may cause snapshot state + * inconsistencies, so force a clear of the current snapshot on the next + * snapshot build attempt. + */ + if (pgstat_fetch_consistency != newval) + force_stats_snapshot_clear = true; +} |