diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:17:33 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:17:33 +0000 |
commit | 5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch) | |
tree | 739caf8c461053357daa9f162bef34516c7bf452 /src/backend/statistics/mvdistinct.c | |
parent | Initial commit. (diff) | |
download | postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.tar.xz postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.zip |
Adding upstream version 15.5.upstream/15.5
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/statistics/mvdistinct.c')
-rw-r--r-- | src/backend/statistics/mvdistinct.c | 700 |
1 files changed, 700 insertions, 0 deletions
diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c new file mode 100644 index 0000000..9b216af --- /dev/null +++ b/src/backend/statistics/mvdistinct.c @@ -0,0 +1,700 @@ +/*------------------------------------------------------------------------- + * + * mvdistinct.c + * POSTGRES multivariate ndistinct coefficients + * + * Estimating number of groups in a combination of columns (e.g. for GROUP BY) + * is tricky, and the estimation error is often significant. + + * The multivariate ndistinct coefficients address this by storing ndistinct + * estimates for combinations of the user-specified columns. So for example + * given a statistics object on three columns (a,b,c), this module estimates + * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column + * estimates are already available in pg_statistic. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/statistics/mvdistinct.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/htup_details.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_statistic_ext_data.h" +#include "lib/stringinfo.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + +static double ndistinct_for_combination(double totalrows, StatsBuildData *data, + int k, int *combination); +static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); +static int n_choose_k(int n, int k); +static int num_combinations(int n); + +/* size of the struct header fields (magic, type, nitems) */ +#define SizeOfHeader (3 * sizeof(uint32)) + +/* size of a serialized ndistinct item (coefficient, natts, atts) */ +#define SizeOfItem(natts) \ + (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber)) + +/* minimal size of a ndistinct item (with two attributes) */ +#define MinSizeOfItem SizeOfItem(2) + +/* minimal size of mvndistinct, when all items are minimal */ +#define MinSizeOfItems(nitems) \ + (SizeOfHeader + (nitems) * MinSizeOfItem) + +/* Combination generator API */ + +/* internal state for generator of k-combinations of n elements */ +typedef struct CombinationGenerator +{ + int k; /* size of the combination */ + int n; /* total number of elements */ + int current; /* index of the next combination to return */ + int ncombinations; /* number of combinations (size of array) */ + int *combinations; /* array of pre-built combinations */ +} CombinationGenerator; + +static CombinationGenerator *generator_init(int n, int k); +static void generator_free(CombinationGenerator *state); +static int *generator_next(CombinationGenerator *state); +static void generate_combinations(CombinationGenerator *state); + + +/* + * statext_ndistinct_build + * Compute ndistinct coefficient for the combination of attributes. + * + * This computes the ndistinct estimate using the same estimator used + * in analyze.c and then computes the coefficient. + * + * To handle expressions easily, we treat them as system attributes with + * negative attnums, and offset everything by number of expressions to + * allow using Bitmapsets. + */ +MVNDistinct * +statext_ndistinct_build(double totalrows, StatsBuildData *data) +{ + MVNDistinct *result; + int k; + int itemcnt; + int numattrs = data->nattnums; + int numcombs = num_combinations(numattrs); + + result = palloc(offsetof(MVNDistinct, items) + + numcombs * sizeof(MVNDistinctItem)); + result->magic = STATS_NDISTINCT_MAGIC; + result->type = STATS_NDISTINCT_TYPE_BASIC; + result->nitems = numcombs; + + itemcnt = 0; + for (k = 2; k <= numattrs; k++) + { + int *combination; + CombinationGenerator *generator; + + /* generate combinations of K out of N elements */ + generator = generator_init(numattrs, k); + + while ((combination = generator_next(generator))) + { + MVNDistinctItem *item = &result->items[itemcnt]; + int j; + + item->attributes = palloc(sizeof(AttrNumber) * k); + item->nattributes = k; + + /* translate the indexes to attnums */ + for (j = 0; j < k; j++) + { + item->attributes[j] = data->attnums[combination[j]]; + + Assert(AttributeNumberIsValid(item->attributes[j])); + } + + item->ndistinct = + ndistinct_for_combination(totalrows, data, k, combination); + + itemcnt++; + Assert(itemcnt <= result->nitems); + } + + generator_free(generator); + } + + /* must consume exactly the whole output array */ + Assert(itemcnt == result->nitems); + + return result; +} + +/* + * statext_ndistinct_load + * Load the ndistinct value for the indicated pg_statistic_ext tuple + */ +MVNDistinct * +statext_ndistinct_load(Oid mvoid, bool inh) +{ + MVNDistinct *result; + bool isnull; + Datum ndist; + HeapTuple htup; + + htup = SearchSysCache2(STATEXTDATASTXOID, + ObjectIdGetDatum(mvoid), BoolGetDatum(inh)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for statistics object %u", mvoid); + + ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup, + Anum_pg_statistic_ext_data_stxdndistinct, &isnull); + if (isnull) + elog(ERROR, + "requested statistics kind \"%c\" is not yet built for statistics object %u", + STATS_EXT_NDISTINCT, mvoid); + + result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist)); + + ReleaseSysCache(htup); + + return result; +} + +/* + * statext_ndistinct_serialize + * serialize ndistinct to the on-disk bytea format + */ +bytea * +statext_ndistinct_serialize(MVNDistinct *ndistinct) +{ + int i; + bytea *output; + char *tmp; + Size len; + + Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC); + Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC); + + /* + * Base size is size of scalar fields in the struct, plus one base struct + * for each item, including number of items for each. + */ + len = VARHDRSZ + SizeOfHeader; + + /* and also include space for the actual attribute numbers */ + for (i = 0; i < ndistinct->nitems; i++) + { + int nmembers; + + nmembers = ndistinct->items[i].nattributes; + Assert(nmembers >= 2); + + len += SizeOfItem(nmembers); + } + + output = (bytea *) palloc(len); + SET_VARSIZE(output, len); + + tmp = VARDATA(output); + + /* Store the base struct values (magic, type, nitems) */ + memcpy(tmp, &ndistinct->magic, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(tmp, &ndistinct->type, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(tmp, &ndistinct->nitems, sizeof(uint32)); + tmp += sizeof(uint32); + + /* + * store number of attributes and attribute numbers for each entry + */ + for (i = 0; i < ndistinct->nitems; i++) + { + MVNDistinctItem item = ndistinct->items[i]; + int nmembers = item.nattributes; + + memcpy(tmp, &item.ndistinct, sizeof(double)); + tmp += sizeof(double); + memcpy(tmp, &nmembers, sizeof(int)); + tmp += sizeof(int); + + memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers); + tmp += nmembers * sizeof(AttrNumber); + + /* protect against overflows */ + Assert(tmp <= ((char *) output + len)); + } + + /* check we used exactly the expected space */ + Assert(tmp == ((char *) output + len)); + + return output; +} + +/* + * statext_ndistinct_deserialize + * Read an on-disk bytea format MVNDistinct to in-memory format + */ +MVNDistinct * +statext_ndistinct_deserialize(bytea *data) +{ + int i; + Size minimum_size; + MVNDistinct ndist; + MVNDistinct *ndistinct; + char *tmp; + + if (data == NULL) + return NULL; + + /* we expect at least the basic fields of MVNDistinct struct */ + if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader) + elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)", + VARSIZE_ANY_EXHDR(data), SizeOfHeader); + + /* initialize pointer to the data part (skip the varlena header) */ + tmp = VARDATA_ANY(data); + + /* read the header fields and perform basic sanity checks */ + memcpy(&ndist.magic, tmp, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(&ndist.type, tmp, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(&ndist.nitems, tmp, sizeof(uint32)); + tmp += sizeof(uint32); + + if (ndist.magic != STATS_NDISTINCT_MAGIC) + elog(ERROR, "invalid ndistinct magic %08x (expected %08x)", + ndist.magic, STATS_NDISTINCT_MAGIC); + if (ndist.type != STATS_NDISTINCT_TYPE_BASIC) + elog(ERROR, "invalid ndistinct type %d (expected %d)", + ndist.type, STATS_NDISTINCT_TYPE_BASIC); + if (ndist.nitems == 0) + elog(ERROR, "invalid zero-length item array in MVNDistinct"); + + /* what minimum bytea size do we expect for those parameters */ + minimum_size = MinSizeOfItems(ndist.nitems); + if (VARSIZE_ANY_EXHDR(data) < minimum_size) + elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)", + VARSIZE_ANY_EXHDR(data), minimum_size); + + /* + * Allocate space for the ndistinct items (no space for each item's + * attnos: those live in bitmapsets allocated separately) + */ + ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) + + (ndist.nitems * sizeof(MVNDistinctItem))); + ndistinct->magic = ndist.magic; + ndistinct->type = ndist.type; + ndistinct->nitems = ndist.nitems; + + for (i = 0; i < ndistinct->nitems; i++) + { + MVNDistinctItem *item = &ndistinct->items[i]; + + /* ndistinct value */ + memcpy(&item->ndistinct, tmp, sizeof(double)); + tmp += sizeof(double); + + /* number of attributes */ + memcpy(&item->nattributes, tmp, sizeof(int)); + tmp += sizeof(int); + Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS)); + + item->attributes + = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber)); + + memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes); + tmp += sizeof(AttrNumber) * item->nattributes; + + /* still within the bytea */ + Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); + } + + /* we should have consumed the whole bytea exactly */ + Assert(tmp == ((char *) data + VARSIZE_ANY(data))); + + return ndistinct; +} + +/* + * pg_ndistinct_in + * input routine for type pg_ndistinct + * + * pg_ndistinct is real enough to be a table column, but it has no + * operations of its own, and disallows input (just like pg_node_tree). + */ +Datum +pg_ndistinct_in(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ndistinct"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_ndistinct + * output routine for type pg_ndistinct + * + * Produces a human-readable representation of the value. + */ +Datum +pg_ndistinct_out(PG_FUNCTION_ARGS) +{ + bytea *data = PG_GETARG_BYTEA_PP(0); + MVNDistinct *ndist = statext_ndistinct_deserialize(data); + int i; + StringInfoData str; + + initStringInfo(&str); + appendStringInfoChar(&str, '{'); + + for (i = 0; i < ndist->nitems; i++) + { + int j; + MVNDistinctItem item = ndist->items[i]; + + if (i > 0) + appendStringInfoString(&str, ", "); + + for (j = 0; j < item.nattributes; j++) + { + AttrNumber attnum = item.attributes[j]; + + appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum); + } + appendStringInfo(&str, "\": %d", (int) item.ndistinct); + } + + appendStringInfoChar(&str, '}'); + + PG_RETURN_CSTRING(str.data); +} + +/* + * pg_ndistinct_recv + * binary input routine for type pg_ndistinct + */ +Datum +pg_ndistinct_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ndistinct"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_ndistinct_send + * binary output routine for type pg_ndistinct + * + * n-distinct is serialized into a bytea value, so let's send that. + */ +Datum +pg_ndistinct_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} + +/* + * ndistinct_for_combination + * Estimates number of distinct values in a combination of columns. + * + * This uses the same ndistinct estimator as compute_scalar_stats() in + * ANALYZE, i.e., + * n*d / (n - f1 + f1*n/N) + * + * except that instead of values in a single column we are dealing with + * combination of multiple columns. + */ +static double +ndistinct_for_combination(double totalrows, StatsBuildData *data, + int k, int *combination) +{ + int i, + j; + int f1, + cnt, + d; + bool *isnull; + Datum *values; + SortItem *items; + MultiSortSupport mss; + int numrows = data->numrows; + + mss = multi_sort_init(k); + + /* + * In order to determine the number of distinct elements, create separate + * values[]/isnull[] arrays with all the data we have, then sort them + * using the specified column combination as dimensions. We could try to + * sort in place, but it'd probably be more complex and bug-prone. + */ + items = (SortItem *) palloc(numrows * sizeof(SortItem)); + values = (Datum *) palloc0(sizeof(Datum) * numrows * k); + isnull = (bool *) palloc0(sizeof(bool) * numrows * k); + + for (i = 0; i < numrows; i++) + { + items[i].values = &values[i * k]; + items[i].isnull = &isnull[i * k]; + } + + /* + * For each dimension, set up sort-support and fill in the values from the + * sample data. + * + * We use the column data types' default sort operators and collations; + * perhaps at some point it'd be worth using column-specific collations? + */ + for (i = 0; i < k; i++) + { + Oid typid; + TypeCacheEntry *type; + Oid collid = InvalidOid; + VacAttrStats *colstat = data->stats[combination[i]]; + + typid = colstat->attrtypid; + collid = colstat->attrcollid; + + type = lookup_type_cache(typid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) /* shouldn't happen */ + elog(ERROR, "cache lookup failed for ordering operator for type %u", + typid); + + /* prepare the sort function for this dimension */ + multi_sort_add_dimension(mss, i, type->lt_opr, collid); + + /* accumulate all the data for this dimension into the arrays */ + for (j = 0; j < numrows; j++) + { + items[j].values[i] = data->values[combination[i]][j]; + items[j].isnull[i] = data->nulls[combination[i]][j]; + } + } + + /* We can sort the array now ... */ + qsort_interruptible((void *) items, numrows, sizeof(SortItem), + multi_sort_compare, mss); + + /* ... and count the number of distinct combinations */ + + f1 = 0; + cnt = 1; + d = 1; + for (i = 1; i < numrows; i++) + { + if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0) + { + if (cnt == 1) + f1 += 1; + + d++; + cnt = 0; + } + + cnt += 1; + } + + if (cnt == 1) + f1 += 1; + + return estimate_ndistinct(totalrows, numrows, d, f1); +} + +/* The Duj1 estimator (already used in analyze.c). */ +static double +estimate_ndistinct(double totalrows, int numrows, int d, int f1) +{ + double numer, + denom, + ndistinct; + + numer = (double) numrows * (double) d; + + denom = (double) (numrows - f1) + + (double) f1 * (double) numrows / totalrows; + + ndistinct = numer / denom; + + /* Clamp to sane range in case of roundoff error */ + if (ndistinct < (double) d) + ndistinct = (double) d; + + if (ndistinct > totalrows) + ndistinct = totalrows; + + return floor(ndistinct + 0.5); +} + +/* + * n_choose_k + * computes binomial coefficients using an algorithm that is both + * efficient and prevents overflows + */ +static int +n_choose_k(int n, int k) +{ + int d, + r; + + Assert((k > 0) && (n >= k)); + + /* use symmetry of the binomial coefficients */ + k = Min(k, n - k); + + r = 1; + for (d = 1; d <= k; ++d) + { + r *= n--; + r /= d; + } + + return r; +} + +/* + * num_combinations + * number of combinations, excluding single-value combinations + */ +static int +num_combinations(int n) +{ + return (1 << n) - (n + 1); +} + +/* + * generator_init + * initialize the generator of combinations + * + * The generator produces combinations of K elements in the interval (0..N). + * We prebuild all the combinations in this method, which is simpler than + * generating them on the fly. + */ +static CombinationGenerator * +generator_init(int n, int k) +{ + CombinationGenerator *state; + + Assert((n >= k) && (k > 0)); + + /* allocate the generator state as a single chunk of memory */ + state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator)); + + state->ncombinations = n_choose_k(n, k); + + /* pre-allocate space for all combinations */ + state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations); + + state->current = 0; + state->k = k; + state->n = n; + + /* now actually pre-generate all the combinations of K elements */ + generate_combinations(state); + + /* make sure we got the expected number of combinations */ + Assert(state->current == state->ncombinations); + + /* reset the number, so we start with the first one */ + state->current = 0; + + return state; +} + +/* + * generator_next + * returns the next combination from the prebuilt list + * + * Returns a combination of K array indexes (0 .. N), as specified to + * generator_init), or NULL when there are no more combination. + */ +static int * +generator_next(CombinationGenerator *state) +{ + if (state->current == state->ncombinations) + return NULL; + + return &state->combinations[state->k * state->current++]; +} + +/* + * generator_free + * free the internal state of the generator + * + * Releases the generator internal state (pre-built combinations). + */ +static void +generator_free(CombinationGenerator *state) +{ + pfree(state->combinations); + pfree(state); +} + +/* + * generate_combinations_recurse + * given a prefix, generate all possible combinations + * + * Given a prefix (first few elements of the combination), generate following + * elements recursively. We generate the combinations in lexicographic order, + * which eliminates permutations of the same combination. + */ +static void +generate_combinations_recurse(CombinationGenerator *state, + int index, int start, int *current) +{ + /* If we haven't filled all the elements, simply recurse. */ + if (index < state->k) + { + int i; + + /* + * The values have to be in ascending order, so make sure we start + * with the value passed by parameter. + */ + + for (i = start; i < state->n; i++) + { + current[index] = i; + generate_combinations_recurse(state, (index + 1), (i + 1), current); + } + + return; + } + else + { + /* we got a valid combination, add it to the array */ + memcpy(&state->combinations[(state->k * state->current)], + current, state->k * sizeof(int)); + state->current++; + } +} + +/* + * generate_combinations + * generate all k-combinations of N elements + */ +static void +generate_combinations(CombinationGenerator *state) +{ + int *current = (int *) palloc0(sizeof(int) * state->k); + + generate_combinations_recurse(state, 0, 0, current); + + pfree(current); +} |