Diffstat (limited to 'src/backend/access/common/syncscan.c')
-rw-r--r--  src/backend/access/common/syncscan.c  |  322
1 files changed, 322 insertions, 0 deletions
diff --git a/src/backend/access/common/syncscan.c b/src/backend/access/common/syncscan.c
new file mode 100644
index 0000000..d5b16c5
--- /dev/null
+++ b/src/backend/access/common/syncscan.c
@@ -0,0 +1,322 @@
+/*-------------------------------------------------------------------------
+ *
+ * syncscan.c
+ * scan synchronization support
+ *
+ * When multiple backends run a sequential scan on the same table, we try
+ * to keep them synchronized to reduce the overall I/O needed. The goal is
+ * to read each page into shared buffer cache only once, and let all backends
+ * that take part in the shared scan process the page before it falls out of
+ * the cache.
+ *
+ * Since the "leader" in a pack of backends doing a seqscan will have to wait
+ * for I/O, while the "followers" don't, there is a strong self-synchronizing
+ * effect once we can get the backends examining approximately the same part
+ * of the table at the same time. Hence all that is really needed is to get
+ * a new backend beginning a seqscan to begin it close to where other backends
+ * are reading. We can scan the table circularly, from block X up to the
+ * end and then from block 0 to X-1, to ensure we visit all rows while still
+ * participating in the common scan.
+ *
+ * To accomplish that, we keep track of the scan position of each table, and
+ * start new scans close to where the previous scan(s) are. We don't try to
+ * do any extra synchronization to keep the scans together afterwards; some
+ * scans might progress much more slowly than others, for example if the
+ * results need to be transferred to the client over a slow network, and we
+ * don't want such queries to slow down others.
+ *
+ * There can realistically only be a few large sequential scans on different
+ * tables in progress at any time. Therefore we just keep the scan positions
+ * in a small LRU list which we scan every time we need to look up or update a
+ * scan position. The whole mechanism is only applied for tables exceeding
+ * a threshold size (but that is not the concern of this module).
+ *
+ * INTERFACE ROUTINES
+ * ss_get_location - return current scan location of a relation
+ * ss_report_location - update current scan location
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/common/syncscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/syncscan.h"
+#include "miscadmin.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/rel.h"
+
+
+/* GUC variables */
+#ifdef TRACE_SYNCSCAN
+bool trace_syncscan = false;
+#endif
+
+
+/*
+ * Size of the LRU list.
+ *
+ * Note: the code assumes that SYNC_SCAN_NELEM > 1.
+ *
+ * XXX: What's a good value? It should be large enough to hold the
+ * maximum number of large tables scanned simultaneously. But a larger value
+ * means more traversing of the LRU list when starting a new scan.
+ */
+#define SYNC_SCAN_NELEM 20
+
+/*
+ * Interval between reports of the location of the current scan, in pages.
+ *
+ * Note: This should be smaller than the ring size (see buffer/freelist.c)
+ * we use for bulk reads. Otherwise a scan joining other scans might start
+ * from a page that's no longer in the buffer cache. This is a bit fuzzy;
+ * there's no guarantee that the new scan will read the page before it leaves
+ * the buffer cache anyway, and on the other hand the page is most likely
+ * still in the OS cache.
+ */
+#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
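+
+/*
+ * For instance, with the default 8 kB BLCKSZ this works out to
+ * 128 * 1024 / 8192 = 16, i.e. the scan position is reported once per
+ * 16 pages (128 kB of table data) read.
+ */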
+
+
+/*
+ * The scan locations structure is essentially a doubly-linked LRU with head
+ * and tail pointer, but designed to hold a fixed maximum number of elements in
+ * fixed-size shared memory.
+ */
+typedef struct ss_scan_location_t
+{
+ RelFileNode relfilenode; /* identity of a relation */
+ BlockNumber location; /* last-reported location in the relation */
+} ss_scan_location_t;
+
+typedef struct ss_lru_item_t
+{
+ struct ss_lru_item_t *prev;
+ struct ss_lru_item_t *next;
+ ss_scan_location_t location;
+} ss_lru_item_t;
+
+typedef struct ss_scan_locations_t
+{
+ ss_lru_item_t *head;
+ ss_lru_item_t *tail;
+ ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
+} ss_scan_locations_t;
+
+#define SizeOfScanLocations(N) \
+ (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
+
+/* Pointer to struct in shared memory */
+static ss_scan_locations_t *scan_locations;
+
+/* prototypes for internal functions */
+static BlockNumber ss_search(RelFileNode relfilenode,
+ BlockNumber location, bool set);
+
+
+/*
+ * SyncScanShmemSize --- report amount of shared memory space needed
+ */
+Size
+SyncScanShmemSize(void)
+{
+ return SizeOfScanLocations(SYNC_SCAN_NELEM);
+}
+
+/*
+ * SyncScanShmemInit --- initialize this module's shared memory
+ */
+void
+SyncScanShmemInit(void)
+{
+ int i;
+ bool found;
+
+ scan_locations = (ss_scan_locations_t *)
+ ShmemInitStruct("Sync Scan Locations List",
+ SizeOfScanLocations(SYNC_SCAN_NELEM),
+ &found);
+
+ if (!IsUnderPostmaster)
+ {
+ /* Initialize shared memory area */
+ Assert(!found);
+
+ scan_locations->head = &scan_locations->items[0];
+ scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
+
+ for (i = 0; i < SYNC_SCAN_NELEM; i++)
+ {
+ ss_lru_item_t *item = &scan_locations->items[i];
+
+ /*
+ * Initialize all slots with invalid values. As scans are started,
+ * these invalid entries will fall off the LRU list and get
+ * replaced with real entries.
+ */
+ item->location.relfilenode.spcNode = InvalidOid;
+ item->location.relfilenode.dbNode = InvalidOid;
+ item->location.relfilenode.relNode = InvalidOid;
+ item->location.location = InvalidBlockNumber;
+
+ item->prev = (i > 0) ?
+ (&scan_locations->items[i - 1]) : NULL;
+ item->next = (i < SYNC_SCAN_NELEM - 1) ?
+ (&scan_locations->items[i + 1]) : NULL;
+ }
+ }
+ else
+ Assert(found);
+}
+
+/*
+ * ss_search --- search the scan_locations structure for an entry with the
+ * given relfilenode.
+ *
+ * If "set" is true, the location is updated to the given location. If no
+ * entry for the given relfilenode is found, it will be created at the head
+ * of the list with the given location, even if "set" is false.
+ *
+ * In any case, the location after possible update is returned.
+ *
+ * Caller is responsible for having acquired suitable lock on the shared
+ * data structure.
+ */
+static BlockNumber
+ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
+{
+ ss_lru_item_t *item;
+
+ item = scan_locations->head;
+ for (;;)
+ {
+ bool match;
+
+ match = RelFileNodeEquals(item->location.relfilenode, relfilenode);
+
+ if (match || item->next == NULL)
+ {
+ /*
+ * If we reached the end of list and no match was found, take over
+ * the last entry
+ */
+ if (!match)
+ {
+ item->location.relfilenode = relfilenode;
+ item->location.location = location;
+ }
+ else if (set)
+ item->location.location = location;
+
+ /* Move the entry to the front of the LRU list */
+ if (item != scan_locations->head)
+ {
+ /* unlink */
+ if (item == scan_locations->tail)
+ scan_locations->tail = item->prev;
+ item->prev->next = item->next;
+ if (item->next)
+ item->next->prev = item->prev;
+
+ /* link */
+ item->prev = NULL;
+ item->next = scan_locations->head;
+ scan_locations->head->prev = item;
+ scan_locations->head = item;
+ }
+
+ return item->location.location;
+ }
+
+ item = item->next;
+ }
+
+ /* not reached */
+}
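+
+/*
+ * As a concrete example of the LRU behaviour above (hypothetical relations
+ * A, B, C in a 3-slot list, with C at the tail): a lookup for an unlisted
+ * relation D reaches the end of the list, takes over C's slot, and moves it
+ * to the head, giving [D, A, B]; a subsequent lookup of B finds a match and
+ * simply moves B to the head, giving [B, D, A].
+ */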
+
+/*
+ * ss_get_location --- get the optimal starting location for scan
+ *
+ * Returns the last-reported location of a sequential scan on the
+ * relation, or 0 if no valid location is found.
+ *
+ * We expect the caller has just done RelationGetNumberOfBlocks(), and
+ * so that number is passed in rather than computing it again. The result
+ * is guaranteed less than relnblocks (assuming that's > 0).
+ */
+BlockNumber
+ss_get_location(Relation rel, BlockNumber relnblocks)
+{
+ BlockNumber startloc;
+
+ LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
+ startloc = ss_search(rel->rd_node, 0, false);
+ LWLockRelease(SyncScanLock);
+
+ /*
+ * If the location is not a valid block number for this scan, start at 0.
+ *
+ * This can happen if for instance a VACUUM truncated the table since the
+ * location was saved.
+ */
+ if (startloc >= relnblocks)
+ startloc = 0;
+
+#ifdef TRACE_SYNCSCAN
+ if (trace_syncscan)
+ elog(LOG,
+ "SYNC_SCAN: start \"%s\" (size %u) at %u",
+ RelationGetRelationName(rel), relnblocks, startloc);
+#endif
+
+ return startloc;
+}
+
+/*
+ * ss_report_location --- update the current scan location
+ *
+ * Writes an entry into the shared Sync Scan state of the form
+ * (relfilenode, blocknumber), overwriting any existing entry for the
+ * same relfilenode.
+ */
+void
+ss_report_location(Relation rel, BlockNumber location)
+{
+#ifdef TRACE_SYNCSCAN
+ if (trace_syncscan)
+ {
+ if ((location % 1024) == 0)
+ elog(LOG,
+ "SYNC_SCAN: scanning \"%s\" at %u",
+ RelationGetRelationName(rel), location);
+ }
+#endif
+
+ /*
+ * To reduce lock contention, only report scan progress every N pages. For
+ * the same reason, don't block if the lock isn't immediately available.
+ * Missing a few updates isn't critical, it just means that a new scan
+ * that wants to join the pack will start a little bit behind the head of
+ * the scan. Hopefully the pages are still in OS cache and the scan
+ * catches up quickly.
+ */
+ if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
+ {
+ if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
+ {
+ (void) ss_search(rel->rd_node, location, true);
+ LWLockRelease(SyncScanLock);
+ }
+#ifdef TRACE_SYNCSCAN
+ else if (trace_syncscan)
+ elog(LOG,
+ "SYNC_SCAN: missed update for \"%s\" at %u",
+ RelationGetRelationName(rel), location);
+#endif
+ }
+}