/*------------------------------------------------------------------------- * * system.c * support routines for SYSTEM tablesample method * * To ensure repeatability of samples, it is necessary that selection of a * given tuple be history-independent; otherwise syncscanning would break * repeatability, to say nothing of logically-irrelevant maintenance such * as physical extension or shortening of the relation. * * To achieve that, we proceed by hashing each candidate block number together * with the active seed, and then selecting it if the hash is less than the * cutoff value computed from the selection probability by BeginSampleScan. * * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/access/tablesample/system.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include #include "access/relscan.h" #include "access/tsmapi.h" #include "catalog/pg_type.h" #include "common/hashfn.h" #include "optimizer/optimizer.h" #include "utils/builtins.h" /* Private state */ typedef struct { uint64 cutoff; /* select blocks with hash less than this */ uint32 seed; /* random seed */ BlockNumber nextblock; /* next block to consider sampling */ OffsetNumber lt; /* last tuple returned from current block */ } SystemSamplerData; static void system_samplescangetsamplesize(PlannerInfo *root, RelOptInfo *baserel, List *paramexprs, BlockNumber *pages, double *tuples); static void system_initsamplescan(SampleScanState *node, int eflags); static void system_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed); static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks); static OffsetNumber system_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset); /* * Create a TsmRoutine descriptor for the SYSTEM method. */ Datum tsm_system_handler(PG_FUNCTION_ARGS) { TsmRoutine *tsm = makeNode(TsmRoutine); tsm->parameterTypes = list_make1_oid(FLOAT4OID); tsm->repeatable_across_queries = true; tsm->repeatable_across_scans = true; tsm->SampleScanGetSampleSize = system_samplescangetsamplesize; tsm->InitSampleScan = system_initsamplescan; tsm->BeginSampleScan = system_beginsamplescan; tsm->NextSampleBlock = system_nextsampleblock; tsm->NextSampleTuple = system_nextsampletuple; tsm->EndSampleScan = NULL; PG_RETURN_POINTER(tsm); } /* * Sample size estimation. */ static void system_samplescangetsamplesize(PlannerInfo *root, RelOptInfo *baserel, List *paramexprs, BlockNumber *pages, double *tuples) { Node *pctnode; float4 samplefract; /* Try to extract an estimate for the sample percentage */ pctnode = (Node *) linitial(paramexprs); pctnode = estimate_expression_value(root, pctnode); if (IsA(pctnode, Const) && !((Const *) pctnode)->constisnull) { samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) samplefract /= 100.0f; else { /* Default samplefract if the value is bogus */ samplefract = 0.1f; } } else { /* Default samplefract if we didn't obtain a non-null Const */ samplefract = 0.1f; } /* We'll visit a sample of the pages ... */ *pages = clamp_row_est(baserel->pages * samplefract); /* ... and hopefully get a representative number of tuples from them */ *tuples = clamp_row_est(baserel->tuples * samplefract); } /* * Initialize during executor setup. */ static void system_initsamplescan(SampleScanState *node, int eflags) { node->tsm_state = palloc0(sizeof(SystemSamplerData)); } /* * Examine parameters and prepare for a sample scan. */ static void system_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed) { SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; double percent = DatumGetFloat4(params[0]); double dcutoff; if (percent < 0 || percent > 100 || isnan(percent)) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), errmsg("sample percentage must be between 0 and 100"))); /* * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to * store that as a uint64, of course. Note that this gives strictly * correct behavior at the limits of zero or one probability. */ dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); sampler->cutoff = (uint64) dcutoff; sampler->seed = seed; sampler->nextblock = 0; sampler->lt = InvalidOffsetNumber; /* * Bulkread buffer access strategy probably makes sense unless we're * scanning a very small fraction of the table. The 1% cutoff here is a * guess. We should use pagemode visibility checking, since we scan all * tuples on each selected page. */ node->use_bulkread = (percent >= 1); node->use_pagemode = true; } /* * Select next block to sample. */ static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) { SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; BlockNumber nextblock = sampler->nextblock; uint32 hashinput[2]; /* * We compute the hash by applying hash_any to an array of 2 uint32's * containing the block number and seed. This is efficient to set up, and * with the current implementation of hash_any, it gives * machine-independent results, which is a nice property for regression * testing. * * These words in the hash input are the same throughout the block: */ hashinput[1] = sampler->seed; /* * Loop over block numbers until finding suitable block or reaching end of * relation. */ for (; nextblock < nblocks; nextblock++) { uint32 hash; hashinput[0] = nextblock; hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, (int) sizeof(hashinput))); if (hash < sampler->cutoff) break; } if (nextblock < nblocks) { /* Found a suitable block; remember where we should start next time */ sampler->nextblock = nextblock + 1; return nextblock; } /* Done, but let's reset nextblock to 0 for safety. */ sampler->nextblock = 0; return InvalidBlockNumber; } /* * Select next sampled tuple in current block. * * In block sampling, we just want to sample all the tuples in each selected * block. * * It is OK here to return an offset without knowing if the tuple is visible * (or even exists); nodeSamplescan.c will deal with that. * * When we reach end of the block, return InvalidOffsetNumber which tells * SampleScan to go to next block. */ static OffsetNumber system_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset) { SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; OffsetNumber tupoffset = sampler->lt; /* Advance to next possible offset on page */ if (tupoffset == InvalidOffsetNumber) tupoffset = FirstOffsetNumber; else tupoffset++; /* Done? */ if (tupoffset > maxoffset) tupoffset = InvalidOffsetNumber; sampler->lt = tupoffset; return tupoffset; }