/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *	  scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to get
 * a new backend beginning a seqscan to begin it close to where other backends
 * are reading.  We can scan the table circularly, from block X up to the
 * end and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update a
 * scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *		ss_get_location		- return current scan location of a relation
 *		ss_report_location	- update current scan location
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/common/syncscan.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/syncscan.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/rel.h"


/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool		trace_syncscan = false;
#endif


/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value?  It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20

/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
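
/*
 * A minimal caller-side sketch of how this module's interface routines are
 * meant to be driven, kept under NOT_USED so it is never compiled.  This is
 * an illustration only, not the actual integration (which lives in the heap
 * access method, access/heap/heapam.c); example_syncscan_reader is a
 * hypothetical name, and a real caller would also need "storage/bufmgr.h"
 * for RelationGetNumberOfBlocks().
 */
#ifdef NOT_USED
static void
example_syncscan_reader(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	BlockNumber startblock;
	BlockNumber blkno;

	if (nblocks == 0)
		return;					/* empty table, nothing to scan */

	/* Start close to where other backends are already reading. */
	startblock = ss_get_location(rel, nblocks);

	blkno = startblock;
	do
	{
		/* ... read page blkno and process its tuples here ... */

		/*
		 * Report our position once per page; ss_report_location throttles
		 * itself to every SYNC_SCAN_REPORT_INTERVAL pages, so this is cheap.
		 */
		ss_report_location(rel, blkno);

		/* Advance circularly: from startblock to the end, then 0 to X-1. */
		if (++blkno >= nblocks)
			blkno = 0;
	} while (blkno != startblock);
}
#endif							/* NOT_USED */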

/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileNode relfilenode;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev;
	struct ss_lru_item_t *next;
	ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;
	ss_lru_item_t *tail;
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER];	/* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;

#define SizeOfScanLocations(N) \
	(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
							 BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
	return SizeOfScanLocations(SYNC_SCAN_NELEM);
}

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
	int			i;
	bool		found;

	scan_locations = (ss_scan_locations_t *)
		ShmemInitStruct("Sync Scan Locations List",
						SizeOfScanLocations(SYNC_SCAN_NELEM),
						&found);

	if (!IsUnderPostmaster)
	{
		/* Initialize shared memory area */
		Assert(!found);

		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

		for (i = 0; i < SYNC_SCAN_NELEM; i++)
		{
			ss_lru_item_t *item = &scan_locations->items[i];

			/*
			 * Initialize all slots with invalid values.  As scans are
			 * started, these invalid entries will fall off the LRU list and
			 * get replaced with real entries.
			 */
			item->location.relfilenode.spcNode = InvalidOid;
			item->location.relfilenode.dbNode = InvalidOid;
			item->location.relfilenode.relNode = InvalidOid;
			item->location.location = InvalidBlockNumber;

			item->prev = (i > 0) ?
				(&scan_locations->items[i - 1]) : NULL;
			item->next = (i < SYNC_SCAN_NELEM - 1) ?
				(&scan_locations->items[i + 1]) : NULL;
		}
	}
	else
		Assert(found);
}
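
/*
 * A hypothetical sanity-check sketch (under NOT_USED, never compiled) that
 * walks the list just built by SyncScanShmemInit and spells out the LRU
 * invariants implied above: the head has no predecessor, the tail has no
 * successor, every back-link mirrors the corresponding forward link, and
 * there are exactly SYNC_SCAN_NELEM items.  It assumes no concurrent
 * access, e.g. being called right after initialization in the postmaster.
 */
#ifdef NOT_USED
static void
example_check_lru_invariants(void)
{
	ss_lru_item_t *item;
	int			nitems = 0;

	Assert(scan_locations->head->prev == NULL);
	Assert(scan_locations->tail->next == NULL);

	for (item = scan_locations->head; item != NULL; item = item->next)
	{
		if (item->next)
			Assert(item->next->prev == item);
		else
			Assert(item == scan_locations->tail);
		nitems++;
	}
	Assert(nitems == SYNC_SCAN_NELEM);
}
#endif							/* NOT_USED */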

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take
			 * over the last entry
			 */
			if (!match)
			{
				item->location.relfilenode = relfilenode;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/* unlink */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
	BlockNumber startloc;

	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
	startloc = ss_search(rel->rd_node, 0, false);
	LWLockRelease(SyncScanLock);

	/*
	 * If the location is not a valid block number for this scan, start at 0.
	 *
	 * This can happen if for instance a VACUUM truncated the table since the
	 * location was saved.
	 */
	if (startloc >= relnblocks)
		startloc = 0;

#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
		elog(LOG,
			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
			 RelationGetRelationName(rel), relnblocks, startloc);
#endif

	return startloc;
}

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
	{
		if ((location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	}
#endif

	/*
	 * To reduce lock contention, only report scan progress every N pages.
	 * For the same reason, don't block if the lock isn't immediately
	 * available.  Missing a few updates isn't critical, it just means that a
	 * new scan that wants to join the pack will start a little bit behind
	 * the head of the scan.  Hopefully the pages are still in OS cache and
	 * the scan catches up quickly.
	 */
	if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
	{
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
#endif
	}
}
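
/*
 * A hypothetical end-to-end illustration of the interface semantics, again
 * under NOT_USED so it is never compiled, and assuming no other backend is
 * scanning "rel" concurrently.  It shows two properties documented above:
 * (1) only positions that are multiples of SYNC_SCAN_REPORT_INTERVAL are
 * eligible to be recorded, and even those can be lost when the conditional
 * lock acquisition fails or the entry is evicted from the small LRU;
 * (2) ss_get_location never returns a block number beyond the current
 * relation size, so a location saved before a VACUUM truncation simply
 * degrades to a fresh start at block 0.  As in the earlier sketch,
 * RelationGetNumberOfBlocks() would require "storage/bufmgr.h".
 */
#ifdef NOT_USED
static void
example_interface_semantics(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	BlockNumber blkno = SYNC_SCAN_REPORT_INTERVAL * 2;
	BlockNumber startloc;

	if (nblocks <= blkno)
		return;					/* table too small for this illustration */

	/* A multiple of the report interval, so eligible to be stored. */
	ss_report_location(rel, blkno);

	/*
	 * A scan starting now would normally begin at blkno; it may still get 0
	 * if the report above lost the conditional lock or was evicted.
	 */
	startloc = ss_get_location(rel, nblocks);
	Assert(startloc == blkno || startloc == 0);

	/*
	 * If the relation later shrank below startloc (e.g. VACUUM truncation),
	 * a subsequent ss_get_location call would clamp its result to 0.
	 */
}
#endif							/* NOT_USED */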