1 files changed, 257 insertions, 0 deletions
diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c
new file mode 100644
index 0000000..c387f3d
--- /dev/null
+++ b/src/backend/access/tablesample/system.c
@@ -0,0 +1,257 @@
+/*-------------------------------------------------------------------------
+ *
+ * system.c
+ *	  support routines for SYSTEM tablesample method
+ *
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
+ *
+ * To achieve that, we proceed by hashing each candidate block number together
+ * with the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/tablesample/system.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/relscan.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
+#include "common/hashfn.h"
+#include "optimizer/optimizer.h"
+#include "utils/builtins.h"
+
+
+/* Private state */
+typedef struct
+{
+	uint64		cutoff;			/* select blocks with hash less than this */
+	uint32		seed;			/* random seed */
+	BlockNumber nextblock;		/* next block to consider sampling */
+	OffsetNumber lt;			/* last tuple returned from current block */
+} SystemSamplerData;
+
+
+static void system_samplescangetsamplesize(PlannerInfo *root,
+										   RelOptInfo *baserel,
+										   List *paramexprs,
+										   BlockNumber *pages,
+										   double *tuples);
+static void system_initsamplescan(SampleScanState *node,
+								  int eflags);
+static void system_beginsamplescan(SampleScanState *node,
+								   Datum *params,
+								   int nparams,
+								   uint32 seed);
+static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
+static OffsetNumber system_nextsampletuple(SampleScanState *node,
+										   BlockNumber blockno,
+										   OffsetNumber maxoffset);
+
+
+/*
+ * Create a TsmRoutine descriptor for the SYSTEM method.
+ */
+Datum
+tsm_system_handler(PG_FUNCTION_ARGS)
+{
+	TsmRoutine *tsm = makeNode(TsmRoutine);
+
+	tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+	tsm->repeatable_across_queries = true;
+	tsm->repeatable_across_scans = true;
+	tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;
+	tsm->InitSampleScan = system_initsamplescan;
+	tsm->BeginSampleScan = system_beginsamplescan;
+	tsm->NextSampleBlock = system_nextsampleblock;
+	tsm->NextSampleTuple = system_nextsampletuple;
+	tsm->EndSampleScan = NULL;
+
+	PG_RETURN_POINTER(tsm);
+}
+
+/*
+ * Sample size estimation.
+ */
+static void
+system_samplescangetsamplesize(PlannerInfo *root,
+							   RelOptInfo *baserel,
+							   List *paramexprs,
+							   BlockNumber *pages,
+							   double *tuples)
+{
+	Node	   *pctnode;
+	float4		samplefract;
+
+	/* Try to extract an estimate for the sample percentage */
+	pctnode = (Node *) linitial(paramexprs);
+	pctnode = estimate_expression_value(root, pctnode);
+
+	if (IsA(pctnode, Const) &&
+		!((Const *) pctnode)->constisnull)
+	{
+		samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+		if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+			samplefract /= 100.0f;
+		else
+		{
+			/* Default samplefract if the value is bogus */
+			samplefract = 0.1f;
+		}
+	}
+	else
+	{
+		/* Default samplefract if we didn't obtain a non-null Const */
+		samplefract = 0.1f;
+	}
+
+	/* We'll visit a sample of the pages ... */
+	*pages = clamp_row_est(baserel->pages * samplefract);
+
+	/* ... and hopefully get a representative number of tuples from them */
+	*tuples = clamp_row_est(baserel->tuples * samplefract);
+}
+
+/*
+ * Initialize during executor setup.
+ */
+static void
+system_initsamplescan(SampleScanState *node, int eflags)
+{
+	node->tsm_state = palloc0(sizeof(SystemSamplerData));
+}
+
+/*
+ * Examine parameters and prepare for a sample scan.
+ */
+static void
+system_beginsamplescan(SampleScanState *node,
+					   Datum *params,
+					   int nparams,
+					   uint32 seed)
+{
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	double		percent = DatumGetFloat4(params[0]);
+	double		dcutoff;
+
+	if (percent < 0 || percent > 100 || isnan(percent))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+				 errmsg("sample percentage must be between 0 and 100")));
+
+	/*
+	 * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+	 * store that as a uint64, of course.  Note that this gives strictly
+	 * correct behavior at the limits of zero or one probability.
+	 */
+	dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+	sampler->cutoff = (uint64) dcutoff;
+	sampler->seed = seed;
+	sampler->nextblock = 0;
+	sampler->lt = InvalidOffsetNumber;
+
+	/*
+	 * Bulkread buffer access strategy probably makes sense unless we're
+	 * scanning a very small fraction of the table.  The 1% cutoff here is a
+	 * guess.  We should use pagemode visibility checking, since we scan all
+	 * tuples on each selected page.
+	 */
+	node->use_bulkread = (percent >= 1);
+	node->use_pagemode = true;
+}
+
+/*
+ * Select next block to sample.
+ */
+static BlockNumber
+system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
+{
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	BlockNumber nextblock = sampler->nextblock;
+	uint32		hashinput[2];
+
+	/*
+	 * We compute the hash by applying hash_any to an array of 2 uint32's
+	 * containing the block number and seed.  This is efficient to set up, and
+	 * with the current implementation of hash_any, it gives
+	 * machine-independent results, which is a nice property for regression
+	 * testing.
+	 *
+	 * These words in the hash input are the same throughout the block:
+	 */
+	hashinput[1] = sampler->seed;
+
+	/*
+	 * Loop over block numbers until finding suitable block or reaching end of
+	 * relation.
+	 */
+	for (; nextblock < nblocks; nextblock++)
+	{
+		uint32		hash;
+
+		hashinput[0] = nextblock;
+
+		hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+									   (int) sizeof(hashinput)));
+		if (hash < sampler->cutoff)
+			break;
+	}
+
+	if (nextblock < nblocks)
+	{
+		/* Found a suitable block; remember where we should start next time */
+		sampler->nextblock = nextblock + 1;
+		return nextblock;
+	}
+
+	/* Done, but let's reset nextblock to 0 for safety. */
+	sampler->nextblock = 0;
+	return InvalidBlockNumber;
+}
+
+/*
+ * Select next sampled tuple in current block.
+ *
+ * In block sampling, we just want to sample all the tuples in each selected
+ * block.
+ *
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists); nodeSamplescan.c will deal with that.
+ *
+ * When we reach end of the block, return InvalidOffsetNumber which tells
+ * SampleScan to go to next block.
+ */
+static OffsetNumber
+system_nextsampletuple(SampleScanState *node,
+					   BlockNumber blockno,
+					   OffsetNumber maxoffset)
+{
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	OffsetNumber tupoffset = sampler->lt;
+
+	/* Advance to next possible offset on page */
+	if (tupoffset == InvalidOffsetNumber)
+		tupoffset = FirstOffsetNumber;
+	else
+		tupoffset++;
+
+	/* Done? */
+	if (tupoffset > maxoffset)
+		tupoffset = InvalidOffsetNumber;
+
+	sampler->lt = tupoffset;
+
+	return tupoffset;
+}