Diffstat (limited to 'src/backend/access/common/syncscan.c')
-rw-r--r-- | src/backend/access/common/syncscan.c | 322
1 file changed, 322 insertions, 0 deletions
diff --git a/src/backend/access/common/syncscan.c b/src/backend/access/common/syncscan.c
new file mode 100644
index 0000000..b7a28af
--- /dev/null
+++ b/src/backend/access/common/syncscan.c
@@ -0,0 +1,322 @@
+/*-------------------------------------------------------------------------
+ *
+ * syncscan.c
+ *    scan synchronization support
+ *
+ * When multiple backends run a sequential scan on the same table, we try
+ * to keep them synchronized to reduce the overall I/O needed. The goal is
+ * to read each page into shared buffer cache only once, and let all backends
+ * that take part in the shared scan process the page before it falls out of
+ * the cache.
+ *
+ * Since the "leader" in a pack of backends doing a seqscan will have to wait
+ * for I/O, while the "followers" don't, there is a strong self-synchronizing
+ * effect once we can get the backends examining approximately the same part
+ * of the table at the same time. Hence all that is really needed is to get
+ * a new backend beginning a seqscan to begin it close to where other backends
+ * are reading. We can scan the table circularly, from block X up to the
+ * end and then from block 0 to X-1, to ensure we visit all rows while still
+ * participating in the common scan.
+ *
+ * To accomplish that, we keep track of the scan position of each table, and
+ * start new scans close to where the previous scan(s) are. We don't try to
+ * do any extra synchronization to keep the scans together afterwards; some
+ * scans might progress much more slowly than others, for example if the
+ * results need to be transferred to the client over a slow network, and we
+ * don't want such queries to slow down others.
+ *
+ * There can realistically only be a few large sequential scans on different
+ * tables in progress at any time. Therefore we just keep the scan positions
+ * in a small LRU list which we scan every time we need to look up or update a
+ * scan position. The whole mechanism is only applied for tables exceeding
+ * a threshold size (but that is not the concern of this module).
+ *
+ * INTERFACE ROUTINES
+ *    ss_get_location    - return current scan location of a relation
+ *    ss_report_location - update current scan location
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *    src/backend/access/common/syncscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/syncscan.h"
+#include "miscadmin.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/rel.h"
+
+
+/* GUC variables */
+#ifdef TRACE_SYNCSCAN
+bool        trace_syncscan = false;
+#endif
+
+
+/*
+ * Size of the LRU list.
+ *
+ * Note: the code assumes that SYNC_SCAN_NELEM > 1.
+ *
+ * XXX: What's a good value? It should be large enough to hold the
+ * maximum number of large tables scanned simultaneously. But a larger value
+ * means more traversing of the LRU list when starting a new scan.
+ */
+#define SYNC_SCAN_NELEM 20
+
+/*
+ * Interval between reports of the location of the current scan, in pages.
+ *
+ * Note: This should be smaller than the ring size (see buffer/freelist.c)
+ * we use for bulk reads. Otherwise a scan joining other scans might start
+ * from a page that's no longer in the buffer cache. This is a bit fuzzy;
+ * there's no guarantee that the new scan will read the page before it leaves
+ * the buffer cache anyway, and on the other hand the page is most likely
+ * still in the OS cache.
+ */
+#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
+
+
+/*
+ * The scan locations structure is essentially a doubly-linked LRU with head
+ * and tail pointer, but designed to hold a fixed maximum number of elements in
+ * fixed-size shared memory.
+ */
+typedef struct ss_scan_location_t
+{
+    RelFileNode relfilenode;    /* identity of a relation */
+    BlockNumber location;       /* last-reported location in the relation */
+} ss_scan_location_t;
+
+typedef struct ss_lru_item_t
+{
+    struct ss_lru_item_t *prev;
+    struct ss_lru_item_t *next;
+    ss_scan_location_t location;
+} ss_lru_item_t;
+
+typedef struct ss_scan_locations_t
+{
+    ss_lru_item_t *head;
+    ss_lru_item_t *tail;
+    ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
+} ss_scan_locations_t;
+
+#define SizeOfScanLocations(N) \
+    (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
+
+/* Pointer to struct in shared memory */
+static ss_scan_locations_t *scan_locations;
+
+/* prototypes for internal functions */
+static BlockNumber ss_search(RelFileNode relfilenode,
+                             BlockNumber location, bool set);
+
+
+/*
+ * SyncScanShmemSize --- report amount of shared memory space needed
+ */
+Size
+SyncScanShmemSize(void)
+{
+    return SizeOfScanLocations(SYNC_SCAN_NELEM);
+}
+
+/*
+ * SyncScanShmemInit --- initialize this module's shared memory
+ */
+void
+SyncScanShmemInit(void)
+{
+    int         i;
+    bool        found;
+
+    scan_locations = (ss_scan_locations_t *)
+        ShmemInitStruct("Sync Scan Locations List",
+                        SizeOfScanLocations(SYNC_SCAN_NELEM),
+                        &found);
+
+    if (!IsUnderPostmaster)
+    {
+        /* Initialize shared memory area */
+        Assert(!found);
+
+        scan_locations->head = &scan_locations->items[0];
+        scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
+
+        for (i = 0; i < SYNC_SCAN_NELEM; i++)
+        {
+            ss_lru_item_t *item = &scan_locations->items[i];
+
+            /*
+             * Initialize all slots with invalid values. As scans are started,
+             * these invalid entries will fall off the LRU list and get
+             * replaced with real entries.
+             */
+            item->location.relfilenode.spcNode = InvalidOid;
+            item->location.relfilenode.dbNode = InvalidOid;
+            item->location.relfilenode.relNode = InvalidOid;
+            item->location.location = InvalidBlockNumber;
+
+            item->prev = (i > 0) ?
+                (&scan_locations->items[i - 1]) : NULL;
+            item->next = (i < SYNC_SCAN_NELEM - 1) ?
+                (&scan_locations->items[i + 1]) : NULL;
+        }
+    }
+    else
+        Assert(found);
+}
+
+/*
+ * ss_search --- search the scan_locations structure for an entry with the
+ *      given relfilenode.
+ *
+ * If "set" is true, the location is updated to the given location. If no
+ * entry for the given relfilenode is found, it will be created at the head
+ * of the list with the given location, even if "set" is false.
+ *
+ * In any case, the location after possible update is returned.
+ *
+ * Caller is responsible for having acquired suitable lock on the shared
+ * data structure.
+ */
+static BlockNumber
+ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
+{
+    ss_lru_item_t *item;
+
+    item = scan_locations->head;
+    for (;;)
+    {
+        bool        match;
+
+        match = RelFileNodeEquals(item->location.relfilenode, relfilenode);
+
+        if (match || item->next == NULL)
+        {
+            /*
+             * If we reached the end of list and no match was found, take over
+             * the last entry
+             */
+            if (!match)
+            {
+                item->location.relfilenode = relfilenode;
+                item->location.location = location;
+            }
+            else if (set)
+                item->location.location = location;
+
+            /* Move the entry to the front of the LRU list */
+            if (item != scan_locations->head)
+            {
+                /* unlink */
+                if (item == scan_locations->tail)
+                    scan_locations->tail = item->prev;
+                item->prev->next = item->next;
+                if (item->next)
+                    item->next->prev = item->prev;
+
+                /* link */
+                item->prev = NULL;
+                item->next = scan_locations->head;
+                scan_locations->head->prev = item;
+                scan_locations->head = item;
+            }
+
+            return item->location.location;
+        }
+
+        item = item->next;
+    }
+
+    /* not reached */
+}
+
+/*
+ * ss_get_location --- get the optimal starting location for scan
+ *
+ * Returns the last-reported location of a sequential scan on the
+ * relation, or 0 if no valid location is found.
+ *
+ * We expect the caller has just done RelationGetNumberOfBlocks(), and
+ * so that number is passed in rather than computing it again. The result
+ * is guaranteed less than relnblocks (assuming that's > 0).
+ */
+BlockNumber
+ss_get_location(Relation rel, BlockNumber relnblocks)
+{
+    BlockNumber startloc;
+
+    LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
+    startloc = ss_search(rel->rd_node, 0, false);
+    LWLockRelease(SyncScanLock);
+
+    /*
+     * If the location is not a valid block number for this scan, start at 0.
+     *
+     * This can happen if for instance a VACUUM truncated the table since the
+     * location was saved.
+     */
+    if (startloc >= relnblocks)
+        startloc = 0;
+
+#ifdef TRACE_SYNCSCAN
+    if (trace_syncscan)
+        elog(LOG,
+             "SYNC_SCAN: start \"%s\" (size %u) at %u",
+             RelationGetRelationName(rel), relnblocks, startloc);
+#endif
+
+    return startloc;
+}
+
+/*
+ * ss_report_location --- update the current scan location
+ *
+ * Writes an entry into the shared Sync Scan state of the form
+ * (relfilenode, blocknumber), overwriting any existing entry for the
+ * same relfilenode.
+ */
+void
+ss_report_location(Relation rel, BlockNumber location)
+{
+#ifdef TRACE_SYNCSCAN
+    if (trace_syncscan)
+    {
+        if ((location % 1024) == 0)
+            elog(LOG,
+                 "SYNC_SCAN: scanning \"%s\" at %u",
+                 RelationGetRelationName(rel), location);
+    }
+#endif
+
+    /*
+     * To reduce lock contention, only report scan progress every N pages. For
+     * the same reason, don't block if the lock isn't immediately available.
+     * Missing a few updates isn't critical, it just means that a new scan
+     * that wants to join the pack will start a little bit behind the head of
+     * the scan. Hopefully the pages are still in OS cache and the scan
+     * catches up quickly.
+     */
+    if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
+    {
+        if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
+        {
+            (void) ss_search(rel->rd_node, location, true);
+            LWLockRelease(SyncScanLock);
+        }
+#ifdef TRACE_SYNCSCAN
+        else if (trace_syncscan)
+            elog(LOG,
+                 "SYNC_SCAN: missed update for \"%s\" at %u",
+                 RelationGetRelationName(rel), location);
+#endif
+    }
+}
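For context, here is a minimal sketch of how a caller is expected to drive the two interface routines above. In PostgreSQL the real caller is heapam (initscan() and the per-page scan loop); the function name, the NBuffers / 4 size threshold, and the process_block() helper below are assumptions made for illustration and are not part of this patch.

/*
 * Hypothetical caller: scan "rel" circularly, starting near the position
 * other backends have reported, per the module's header comment.
 */
static void
example_synchronized_seqscan(Relation rel)
{
    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
    BlockNumber start;
    BlockNumber blkno;

    if (nblocks == 0)
        return;                 /* nothing to scan */

    /*
     * Thresholding is the caller's job (see header comment); assume we only
     * bother synchronizing for tables larger than a quarter of shared
     * buffers.
     */
    if (nblocks > NBuffers / 4)
        start = ss_get_location(rel, nblocks);  /* join the pack */
    else
        start = 0;

    /* Visit start .. nblocks-1, then wrap around to 0 .. start-1. */
    blkno = start;
    do
    {
        process_block(rel, blkno);  /* hypothetical per-page work */

        /*
         * Report our position for every page; ss_report_location() itself
         * throttles shared-memory updates to every SYNC_SCAN_REPORT_INTERVAL
         * pages (16 pages with the default 8 kB BLCKSZ).
         */
        ss_report_location(rel, blkno);

        if (++blkno >= nblocks)
            blkno = 0;
    } while (blkno != start);
}

Note the circular walk: every block is visited exactly once even though the scan starts at an arbitrary position reported by another backend, which is what lets a newly started scan share its reads with scans already in progress.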