summaryrefslogtreecommitdiffstats
path: root/src/backend/statistics/mvdistinct.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/statistics/mvdistinct.c')
-rw-r--r--src/backend/statistics/mvdistinct.c699
1 files changed, 699 insertions, 0 deletions
diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c
new file mode 100644
index 0000000..4b4ecec
--- /dev/null
+++ b/src/backend/statistics/mvdistinct.c
@@ -0,0 +1,699 @@
+/*-------------------------------------------------------------------------
+ *
+ * mvdistinct.c
+ * POSTGRES multivariate ndistinct coefficients
+ *
+ * Estimating number of groups in a combination of columns (e.g. for GROUP BY)
+ * is tricky, and the estimation error is often significant.
+
+ * The multivariate ndistinct coefficients address this by storing ndistinct
+ * estimates for combinations of the user-specified columns. So for example
+ * given a statistics object on three columns (a,b,c), this module estimates
+ * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
+ * estimates are already available in pg_statistic.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/statistics/mvdistinct.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/htup_details.h"
+#include "catalog/pg_statistic_ext.h"
+#include "catalog/pg_statistic_ext_data.h"
+#include "lib/stringinfo.h"
+#include "statistics/extended_stats_internal.h"
+#include "statistics/statistics.h"
+#include "utils/fmgrprotos.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+#include "utils/typcache.h"
+
+static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
+ int k, int *combination);
+static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
+static int n_choose_k(int n, int k);
+static int num_combinations(int n);
+
+/* size of the struct header fields (magic, type, nitems) */
+#define SizeOfHeader (3 * sizeof(uint32))
+
+/* size of a serialized ndistinct item (coefficient, natts, atts) */
+#define SizeOfItem(natts) \
+ (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
+
+/* minimal size of a ndistinct item (with two attributes) */
+#define MinSizeOfItem SizeOfItem(2)
+
+/* minimal size of mvndistinct, when all items are minimal */
+#define MinSizeOfItems(nitems) \
+ (SizeOfHeader + (nitems) * MinSizeOfItem)
+
+/* Combination generator API */
+
+/* internal state for generator of k-combinations of n elements */
+typedef struct CombinationGenerator
+{
+ int k; /* size of the combination */
+ int n; /* total number of elements */
+ int current; /* index of the next combination to return */
+ int ncombinations; /* number of combinations (size of array) */
+ int *combinations; /* array of pre-built combinations */
+} CombinationGenerator;
+
+static CombinationGenerator *generator_init(int n, int k);
+static void generator_free(CombinationGenerator *state);
+static int *generator_next(CombinationGenerator *state);
+static void generate_combinations(CombinationGenerator *state);
+
+
+/*
+ * statext_ndistinct_build
+ * Compute ndistinct coefficient for the combination of attributes.
+ *
+ * This computes the ndistinct estimate using the same estimator used
+ * in analyze.c and then computes the coefficient.
+ *
+ * To handle expressions easily, we treat them as system attributes with
+ * negative attnums, and offset everything by number of expressions to
+ * allow using Bitmapsets.
+ */
+MVNDistinct *
+statext_ndistinct_build(double totalrows, StatsBuildData *data)
+{
+ MVNDistinct *result;
+ int k;
+ int itemcnt;
+ int numattrs = data->nattnums;
+ int numcombs = num_combinations(numattrs);
+
+ result = palloc(offsetof(MVNDistinct, items) +
+ numcombs * sizeof(MVNDistinctItem));
+ result->magic = STATS_NDISTINCT_MAGIC;
+ result->type = STATS_NDISTINCT_TYPE_BASIC;
+ result->nitems = numcombs;
+
+ itemcnt = 0;
+ for (k = 2; k <= numattrs; k++)
+ {
+ int *combination;
+ CombinationGenerator *generator;
+
+ /* generate combinations of K out of N elements */
+ generator = generator_init(numattrs, k);
+
+ while ((combination = generator_next(generator)))
+ {
+ MVNDistinctItem *item = &result->items[itemcnt];
+ int j;
+
+ item->attributes = palloc(sizeof(AttrNumber) * k);
+ item->nattributes = k;
+
+ /* translate the indexes to attnums */
+ for (j = 0; j < k; j++)
+ {
+ item->attributes[j] = data->attnums[combination[j]];
+
+ Assert(AttributeNumberIsValid(item->attributes[j]));
+ }
+
+ item->ndistinct =
+ ndistinct_for_combination(totalrows, data, k, combination);
+
+ itemcnt++;
+ Assert(itemcnt <= result->nitems);
+ }
+
+ generator_free(generator);
+ }
+
+ /* must consume exactly the whole output array */
+ Assert(itemcnt == result->nitems);
+
+ return result;
+}
+
+/*
+ * statext_ndistinct_load
+ * Load the ndistinct value for the indicated pg_statistic_ext tuple
+ */
+MVNDistinct *
+statext_ndistinct_load(Oid mvoid)
+{
+ MVNDistinct *result;
+ bool isnull;
+ Datum ndist;
+ HeapTuple htup;
+
+ htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
+
+ ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
+ Anum_pg_statistic_ext_data_stxdndistinct, &isnull);
+ if (isnull)
+ elog(ERROR,
+ "requested statistics kind \"%c\" is not yet built for statistics object %u",
+ STATS_EXT_NDISTINCT, mvoid);
+
+ result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist));
+
+ ReleaseSysCache(htup);
+
+ return result;
+}
+
+/*
+ * statext_ndistinct_serialize
+ * serialize ndistinct to the on-disk bytea format
+ */
+bytea *
+statext_ndistinct_serialize(MVNDistinct *ndistinct)
+{
+ int i;
+ bytea *output;
+ char *tmp;
+ Size len;
+
+ Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC);
+ Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC);
+
+ /*
+ * Base size is size of scalar fields in the struct, plus one base struct
+ * for each item, including number of items for each.
+ */
+ len = VARHDRSZ + SizeOfHeader;
+
+ /* and also include space for the actual attribute numbers */
+ for (i = 0; i < ndistinct->nitems; i++)
+ {
+ int nmembers;
+
+ nmembers = ndistinct->items[i].nattributes;
+ Assert(nmembers >= 2);
+
+ len += SizeOfItem(nmembers);
+ }
+
+ output = (bytea *) palloc(len);
+ SET_VARSIZE(output, len);
+
+ tmp = VARDATA(output);
+
+ /* Store the base struct values (magic, type, nitems) */
+ memcpy(tmp, &ndistinct->magic, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(tmp, &ndistinct->type, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(tmp, &ndistinct->nitems, sizeof(uint32));
+ tmp += sizeof(uint32);
+
+ /*
+ * store number of attributes and attribute numbers for each entry
+ */
+ for (i = 0; i < ndistinct->nitems; i++)
+ {
+ MVNDistinctItem item = ndistinct->items[i];
+ int nmembers = item.nattributes;
+
+ memcpy(tmp, &item.ndistinct, sizeof(double));
+ tmp += sizeof(double);
+ memcpy(tmp, &nmembers, sizeof(int));
+ tmp += sizeof(int);
+
+ memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
+ tmp += nmembers * sizeof(AttrNumber);
+
+ /* protect against overflows */
+ Assert(tmp <= ((char *) output + len));
+ }
+
+ /* check we used exactly the expected space */
+ Assert(tmp == ((char *) output + len));
+
+ return output;
+}
+
+/*
+ * statext_ndistinct_deserialize
+ * Read an on-disk bytea format MVNDistinct to in-memory format
+ */
+MVNDistinct *
+statext_ndistinct_deserialize(bytea *data)
+{
+ int i;
+ Size minimum_size;
+ MVNDistinct ndist;
+ MVNDistinct *ndistinct;
+ char *tmp;
+
+ if (data == NULL)
+ return NULL;
+
+ /* we expect at least the basic fields of MVNDistinct struct */
+ if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
+ elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)",
+ VARSIZE_ANY_EXHDR(data), SizeOfHeader);
+
+ /* initialize pointer to the data part (skip the varlena header) */
+ tmp = VARDATA_ANY(data);
+
+ /* read the header fields and perform basic sanity checks */
+ memcpy(&ndist.magic, tmp, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(&ndist.type, tmp, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(&ndist.nitems, tmp, sizeof(uint32));
+ tmp += sizeof(uint32);
+
+ if (ndist.magic != STATS_NDISTINCT_MAGIC)
+ elog(ERROR, "invalid ndistinct magic %08x (expected %08x)",
+ ndist.magic, STATS_NDISTINCT_MAGIC);
+ if (ndist.type != STATS_NDISTINCT_TYPE_BASIC)
+ elog(ERROR, "invalid ndistinct type %d (expected %d)",
+ ndist.type, STATS_NDISTINCT_TYPE_BASIC);
+ if (ndist.nitems == 0)
+ elog(ERROR, "invalid zero-length item array in MVNDistinct");
+
+ /* what minimum bytea size do we expect for those parameters */
+ minimum_size = MinSizeOfItems(ndist.nitems);
+ if (VARSIZE_ANY_EXHDR(data) < minimum_size)
+ elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)",
+ VARSIZE_ANY_EXHDR(data), minimum_size);
+
+ /*
+ * Allocate space for the ndistinct items (no space for each item's
+ * attnos: those live in bitmapsets allocated separately)
+ */
+ ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) +
+ (ndist.nitems * sizeof(MVNDistinctItem)));
+ ndistinct->magic = ndist.magic;
+ ndistinct->type = ndist.type;
+ ndistinct->nitems = ndist.nitems;
+
+ for (i = 0; i < ndistinct->nitems; i++)
+ {
+ MVNDistinctItem *item = &ndistinct->items[i];
+
+ /* ndistinct value */
+ memcpy(&item->ndistinct, tmp, sizeof(double));
+ tmp += sizeof(double);
+
+ /* number of attributes */
+ memcpy(&item->nattributes, tmp, sizeof(int));
+ tmp += sizeof(int);
+ Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
+
+ item->attributes
+ = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
+
+ memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
+ tmp += sizeof(AttrNumber) * item->nattributes;
+
+ /* still within the bytea */
+ Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
+ }
+
+ /* we should have consumed the whole bytea exactly */
+ Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
+
+ return ndistinct;
+}
+
+/*
+ * pg_ndistinct_in
+ * input routine for type pg_ndistinct
+ *
+ * pg_ndistinct is real enough to be a table column, but it has no
+ * operations of its own, and disallows input (just like pg_node_tree).
+ */
+Datum
+pg_ndistinct_in(PG_FUNCTION_ARGS)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot accept a value of type %s", "pg_ndistinct")));
+
+ PG_RETURN_VOID(); /* keep compiler quiet */
+}
+
+/*
+ * pg_ndistinct
+ * output routine for type pg_ndistinct
+ *
+ * Produces a human-readable representation of the value.
+ */
+Datum
+pg_ndistinct_out(PG_FUNCTION_ARGS)
+{
+ bytea *data = PG_GETARG_BYTEA_PP(0);
+ MVNDistinct *ndist = statext_ndistinct_deserialize(data);
+ int i;
+ StringInfoData str;
+
+ initStringInfo(&str);
+ appendStringInfoChar(&str, '{');
+
+ for (i = 0; i < ndist->nitems; i++)
+ {
+ int j;
+ MVNDistinctItem item = ndist->items[i];
+
+ if (i > 0)
+ appendStringInfoString(&str, ", ");
+
+ for (j = 0; j < item.nattributes; j++)
+ {
+ AttrNumber attnum = item.attributes[j];
+
+ appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
+ }
+ appendStringInfo(&str, "\": %d", (int) item.ndistinct);
+ }
+
+ appendStringInfoChar(&str, '}');
+
+ PG_RETURN_CSTRING(str.data);
+}
+
+/*
+ * pg_ndistinct_recv
+ * binary input routine for type pg_ndistinct
+ */
+Datum
+pg_ndistinct_recv(PG_FUNCTION_ARGS)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot accept a value of type %s", "pg_ndistinct")));
+
+ PG_RETURN_VOID(); /* keep compiler quiet */
+}
+
+/*
+ * pg_ndistinct_send
+ * binary output routine for type pg_ndistinct
+ *
+ * n-distinct is serialized into a bytea value, so let's send that.
+ */
+Datum
+pg_ndistinct_send(PG_FUNCTION_ARGS)
+{
+ return byteasend(fcinfo);
+}
+
+/*
+ * ndistinct_for_combination
+ * Estimates number of distinct values in a combination of columns.
+ *
+ * This uses the same ndistinct estimator as compute_scalar_stats() in
+ * ANALYZE, i.e.,
+ * n*d / (n - f1 + f1*n/N)
+ *
+ * except that instead of values in a single column we are dealing with
+ * combination of multiple columns.
+ */
+static double
+ndistinct_for_combination(double totalrows, StatsBuildData *data,
+ int k, int *combination)
+{
+ int i,
+ j;
+ int f1,
+ cnt,
+ d;
+ bool *isnull;
+ Datum *values;
+ SortItem *items;
+ MultiSortSupport mss;
+ int numrows = data->numrows;
+
+ mss = multi_sort_init(k);
+
+ /*
+ * In order to determine the number of distinct elements, create separate
+ * values[]/isnull[] arrays with all the data we have, then sort them
+ * using the specified column combination as dimensions. We could try to
+ * sort in place, but it'd probably be more complex and bug-prone.
+ */
+ items = (SortItem *) palloc(numrows * sizeof(SortItem));
+ values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
+ isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
+
+ for (i = 0; i < numrows; i++)
+ {
+ items[i].values = &values[i * k];
+ items[i].isnull = &isnull[i * k];
+ }
+
+ /*
+ * For each dimension, set up sort-support and fill in the values from the
+ * sample data.
+ *
+ * We use the column data types' default sort operators and collations;
+ * perhaps at some point it'd be worth using column-specific collations?
+ */
+ for (i = 0; i < k; i++)
+ {
+ Oid typid;
+ TypeCacheEntry *type;
+ Oid collid = InvalidOid;
+ VacAttrStats *colstat = data->stats[combination[i]];
+
+ typid = colstat->attrtypid;
+ collid = colstat->attrcollid;
+
+ type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
+ if (type->lt_opr == InvalidOid) /* shouldn't happen */
+ elog(ERROR, "cache lookup failed for ordering operator for type %u",
+ typid);
+
+ /* prepare the sort function for this dimension */
+ multi_sort_add_dimension(mss, i, type->lt_opr, collid);
+
+ /* accumulate all the data for this dimension into the arrays */
+ for (j = 0; j < numrows; j++)
+ {
+ items[j].values[i] = data->values[combination[i]][j];
+ items[j].isnull[i] = data->nulls[combination[i]][j];
+ }
+ }
+
+ /* We can sort the array now ... */
+ qsort_interruptible((void *) items, numrows, sizeof(SortItem),
+ multi_sort_compare, mss);
+
+ /* ... and count the number of distinct combinations */
+
+ f1 = 0;
+ cnt = 1;
+ d = 1;
+ for (i = 1; i < numrows; i++)
+ {
+ if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
+ {
+ if (cnt == 1)
+ f1 += 1;
+
+ d++;
+ cnt = 0;
+ }
+
+ cnt += 1;
+ }
+
+ if (cnt == 1)
+ f1 += 1;
+
+ return estimate_ndistinct(totalrows, numrows, d, f1);
+}
+
+/* The Duj1 estimator (already used in analyze.c). */
+static double
+estimate_ndistinct(double totalrows, int numrows, int d, int f1)
+{
+ double numer,
+ denom,
+ ndistinct;
+
+ numer = (double) numrows * (double) d;
+
+ denom = (double) (numrows - f1) +
+ (double) f1 * (double) numrows / totalrows;
+
+ ndistinct = numer / denom;
+
+ /* Clamp to sane range in case of roundoff error */
+ if (ndistinct < (double) d)
+ ndistinct = (double) d;
+
+ if (ndistinct > totalrows)
+ ndistinct = totalrows;
+
+ return floor(ndistinct + 0.5);
+}
+
+/*
+ * n_choose_k
+ * computes binomial coefficients using an algorithm that is both
+ * efficient and prevents overflows
+ */
+static int
+n_choose_k(int n, int k)
+{
+ int d,
+ r;
+
+ Assert((k > 0) && (n >= k));
+
+ /* use symmetry of the binomial coefficients */
+ k = Min(k, n - k);
+
+ r = 1;
+ for (d = 1; d <= k; ++d)
+ {
+ r *= n--;
+ r /= d;
+ }
+
+ return r;
+}
+
+/*
+ * num_combinations
+ * number of combinations, excluding single-value combinations
+ */
+static int
+num_combinations(int n)
+{
+ return (1 << n) - (n + 1);
+}
+
+/*
+ * generator_init
+ * initialize the generator of combinations
+ *
+ * The generator produces combinations of K elements in the interval (0..N).
+ * We prebuild all the combinations in this method, which is simpler than
+ * generating them on the fly.
+ */
+static CombinationGenerator *
+generator_init(int n, int k)
+{
+ CombinationGenerator *state;
+
+ Assert((n >= k) && (k > 0));
+
+ /* allocate the generator state as a single chunk of memory */
+ state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator));
+
+ state->ncombinations = n_choose_k(n, k);
+
+ /* pre-allocate space for all combinations */
+ state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations);
+
+ state->current = 0;
+ state->k = k;
+ state->n = n;
+
+ /* now actually pre-generate all the combinations of K elements */
+ generate_combinations(state);
+
+ /* make sure we got the expected number of combinations */
+ Assert(state->current == state->ncombinations);
+
+ /* reset the number, so we start with the first one */
+ state->current = 0;
+
+ return state;
+}
+
+/*
+ * generator_next
+ * returns the next combination from the prebuilt list
+ *
+ * Returns a combination of K array indexes (0 .. N), as specified to
+ * generator_init), or NULL when there are no more combination.
+ */
+static int *
+generator_next(CombinationGenerator *state)
+{
+ if (state->current == state->ncombinations)
+ return NULL;
+
+ return &state->combinations[state->k * state->current++];
+}
+
+/*
+ * generator_free
+ * free the internal state of the generator
+ *
+ * Releases the generator internal state (pre-built combinations).
+ */
+static void
+generator_free(CombinationGenerator *state)
+{
+ pfree(state->combinations);
+ pfree(state);
+}
+
+/*
+ * generate_combinations_recurse
+ * given a prefix, generate all possible combinations
+ *
+ * Given a prefix (first few elements of the combination), generate following
+ * elements recursively. We generate the combinations in lexicographic order,
+ * which eliminates permutations of the same combination.
+ */
+static void
+generate_combinations_recurse(CombinationGenerator *state,
+ int index, int start, int *current)
+{
+ /* If we haven't filled all the elements, simply recurse. */
+ if (index < state->k)
+ {
+ int i;
+
+ /*
+ * The values have to be in ascending order, so make sure we start
+ * with the value passed by parameter.
+ */
+
+ for (i = start; i < state->n; i++)
+ {
+ current[index] = i;
+ generate_combinations_recurse(state, (index + 1), (i + 1), current);
+ }
+
+ return;
+ }
+ else
+ {
+ /* we got a valid combination, add it to the array */
+ memcpy(&state->combinations[(state->k * state->current)],
+ current, state->k * sizeof(int));
+ state->current++;
+ }
+}
+
+/*
+ * generate_combinations
+ * generate all k-combinations of N elements
+ */
+static void
+generate_combinations(CombinationGenerator *state)
+{
+ int *current = (int *) palloc0(sizeof(int) * state->k);
+
+ generate_combinations_recurse(state, 0, 0, current);
+
+ pfree(current);
+}