diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/statistics/mcv.c | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/statistics/mcv.c')
-rw-r--r-- | src/backend/statistics/mcv.c | 2180 |
1 files changed, 2180 insertions, 0 deletions
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c new file mode 100644 index 0000000..0183775 --- /dev/null +++ b/src/backend/statistics/mcv.c @@ -0,0 +1,2180 @@ +/*------------------------------------------------------------------------- + * + * mcv.c + * POSTGRES multivariate MCV lists + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/statistics/mcv.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/htup_details.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_statistic_ext_data.h" +#include "fmgr.h" +#include "funcapi.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/bytea.h" +#include "utils/fmgroids.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" +#include "utils/selfuncs.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + +/* + * Computes size of a serialized MCV item, depending on the number of + * dimensions (columns) the statistic is defined on. The datum values are + * stored in a separate array (deduplicated, to minimize the size), and + * so the serialized items only store uint16 indexes into that array. + * + * Each serialized item stores (in this order): + * + * - indexes to values (ndim * sizeof(uint16)) + * - null flags (ndim * sizeof(bool)) + * - frequency (sizeof(double)) + * - base_frequency (sizeof(double)) + * + * There is no alignment padding within an MCV item. + * So in total each MCV item requires this many bytes: + * + * ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double) + */ +#define ITEM_SIZE(ndims) \ + ((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)) + +/* + * Used to compute size of serialized MCV list representation. + */ +#define MinSizeOfMCVList \ + (VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber)) + +/* + * Size of the serialized MCV list, excluding the space needed for + * deduplicated per-dimension values. The macro is meant to be used + * when it's not yet safe to access the serialized info about amount + * of data for each column. + */ +#define SizeOfMCVList(ndims,nitems) \ + ((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \ + ((ndims) * sizeof(DimensionInfo)) + \ + ((nitems) * ITEM_SIZE(ndims))) + +static MultiSortSupport build_mss(StatsBuildData *data); + +static SortItem *build_distinct_groups(int numrows, SortItem *items, + MultiSortSupport mss, int *ndistinct); + +static SortItem **build_column_frequencies(SortItem *groups, int ngroups, + MultiSortSupport mss, int *ncounts); + +static int count_distinct_groups(int numrows, SortItem *items, + MultiSortSupport mss); + +/* + * Compute new value for bitmap item, considering whether it's used for + * clauses connected by AND/OR. + */ +#define RESULT_MERGE(value, is_or, match) \ + ((is_or) ? ((value) || (match)) : ((value) && (match))) + +/* + * When processing a list of clauses, the bitmap item may get set to a value + * such that additional clauses can't change it. For example, when processing + * a list of clauses connected to AND, as soon as the item gets set to 'false' + * then it'll remain like that. Similarly clauses connected by OR and 'true'. + * + * Returns true when the value in the bitmap can't change no matter how the + * remaining clauses are evaluated. + */ +#define RESULT_IS_FINAL(value, is_or) ((is_or) ? (value) : (!(value))) + +/* + * get_mincount_for_mcv_list + * Determine the minimum number of times a value needs to appear in + * the sample for it to be included in the MCV list. + * + * We want to keep only values that appear sufficiently often in the + * sample that it is reasonable to extrapolate their sample frequencies to + * the entire table. We do this by placing an upper bound on the relative + * standard error of the sample frequency, so that any estimates the + * planner generates from the MCV statistics can be expected to be + * reasonably accurate. + * + * Since we are sampling without replacement, the sample frequency of a + * particular value is described by a hypergeometric distribution. A + * common rule of thumb when estimating errors in this situation is to + * require at least 10 instances of the value in the sample, in which case + * the distribution can be approximated by a normal distribution, and + * standard error analysis techniques can be applied. Given a sample size + * of n, a population size of N, and a sample frequency of p=cnt/n, the + * standard error of the proportion p is given by + * SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1)) + * where the second term is the finite population correction. To get + * reasonably accurate planner estimates, we impose an upper bound on the + * relative standard error of 20% -- i.e., SE/p < 0.2. This 20% relative + * error bound is fairly arbitrary, but has been found empirically to work + * well. Rearranging this formula gives a lower bound on the number of + * instances of the value seen: + * cnt > n*(N-n) / (N-n+0.04*n*(N-1)) + * This bound is at most 25, and approaches 0 as n approaches 0 or N. The + * case where n approaches 0 cannot happen in practice, since the sample + * size is at least 300. The case where n approaches N corresponds to + * sampling the whole the table, in which case it is reasonable to keep + * the whole MCV list (have no lower bound), so it makes sense to apply + * this formula for all inputs, even though the above derivation is + * technically only valid when the right hand side is at least around 10. + * + * An alternative way to look at this formula is as follows -- assume that + * the number of instances of the value seen scales up to the entire + * table, so that the population count is K=N*cnt/n. Then the distribution + * in the sample is a hypergeometric distribution parameterised by N, n + * and K, and the bound above is mathematically equivalent to demanding + * that the standard deviation of that distribution is less than 20% of + * its mean. Thus the relative errors in any planner estimates produced + * from the MCV statistics are likely to be not too large. + */ +static double +get_mincount_for_mcv_list(int samplerows, double totalrows) +{ + double n = samplerows; + double N = totalrows; + double numer, + denom; + + numer = n * (N - n); + denom = N - n + 0.04 * n * (N - 1); + + /* Guard against division by zero (possible if n = N = 1) */ + if (denom == 0.0) + return 0.0; + + return numer / denom; +} + +/* + * Builds MCV list from the set of sampled rows. + * + * The algorithm is quite simple: + * + * (1) sort the data (default collation, '<' for the data type) + * + * (2) count distinct groups, decide how many to keep + * + * (3) build the MCV list using the threshold determined in (2) + * + * (4) remove rows represented by the MCV from the sample + * + */ +MCVList * +statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget) +{ + int i, + numattrs, + numrows, + ngroups, + nitems; + double mincount; + SortItem *items; + SortItem *groups; + MCVList *mcvlist = NULL; + MultiSortSupport mss; + + /* comparator for all the columns */ + mss = build_mss(data); + + /* sort the rows */ + items = build_sorted_items(data, &nitems, mss, + data->nattnums, data->attnums); + + if (!items) + return NULL; + + /* for convenience */ + numattrs = data->nattnums; + numrows = data->numrows; + + /* transform the sorted rows into groups (sorted by frequency) */ + groups = build_distinct_groups(nitems, items, mss, &ngroups); + + /* + * The maximum number of MCV items to store, based on the statistics + * target we computed for the statistics object (from the target set for + * the object itself, attributes and the system default). In any case, we + * can't keep more groups than we have available. + */ + nitems = stattarget; + if (nitems > ngroups) + nitems = ngroups; + + /* + * Decide how many items to keep in the MCV list. We can't use the same + * algorithm as per-column MCV lists, because that only considers the + * actual group frequency - but we're primarily interested in how the + * actual frequency differs from the base frequency (product of simple + * per-column frequencies, as if the columns were independent). + * + * Using the same algorithm might exclude items that are close to the + * "average" frequency of the sample. But that does not say whether the + * observed frequency is close to the base frequency or not. We also need + * to consider unexpectedly uncommon items (again, compared to the base + * frequency), and the single-column algorithm does not have to. + * + * We simply decide how many items to keep by computing the minimum count + * using get_mincount_for_mcv_list() and then keep all items that seem to + * be more common than that. + */ + mincount = get_mincount_for_mcv_list(numrows, totalrows); + + /* + * Walk the groups until we find the first group with a count below the + * mincount threshold (the index of that group is the number of groups we + * want to keep). + */ + for (i = 0; i < nitems; i++) + { + if (groups[i].count < mincount) + { + nitems = i; + break; + } + } + + /* + * At this point, we know the number of items for the MCV list. There + * might be none (for uniform distribution with many groups), and in that + * case, there will be no MCV list. Otherwise, construct the MCV list. + */ + if (nitems > 0) + { + int j; + SortItem key; + MultiSortSupport tmp; + + /* frequencies for values in each attribute */ + SortItem **freqs; + int *nfreqs; + + /* used to search values */ + tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup) + + sizeof(SortSupportData)); + + /* compute frequencies for values in each column */ + nfreqs = (int *) palloc0(sizeof(int) * numattrs); + freqs = build_column_frequencies(groups, ngroups, mss, nfreqs); + + /* + * Allocate the MCV list structure, set the global parameters. + */ + mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) + + sizeof(MCVItem) * nitems); + + mcvlist->magic = STATS_MCV_MAGIC; + mcvlist->type = STATS_MCV_TYPE_BASIC; + mcvlist->ndimensions = numattrs; + mcvlist->nitems = nitems; + + /* store info about data type OIDs */ + for (i = 0; i < numattrs; i++) + mcvlist->types[i] = data->stats[i]->attrtypid; + + /* Copy the first chunk of groups into the result. */ + for (i = 0; i < nitems; i++) + { + /* just pointer to the proper place in the list */ + MCVItem *item = &mcvlist->items[i]; + + item->values = (Datum *) palloc(sizeof(Datum) * numattrs); + item->isnull = (bool *) palloc(sizeof(bool) * numattrs); + + /* copy values for the group */ + memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs); + memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs); + + /* groups should be sorted by frequency in descending order */ + Assert((i == 0) || (groups[i - 1].count >= groups[i].count)); + + /* group frequency */ + item->frequency = (double) groups[i].count / numrows; + + /* base frequency, if the attributes were independent */ + item->base_frequency = 1.0; + for (j = 0; j < numattrs; j++) + { + SortItem *freq; + + /* single dimension */ + tmp->ndims = 1; + tmp->ssup[0] = mss->ssup[j]; + + /* fill search key */ + key.values = &groups[i].values[j]; + key.isnull = &groups[i].isnull[j]; + + freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j], + sizeof(SortItem), + multi_sort_compare, tmp); + + item->base_frequency *= ((double) freq->count) / numrows; + } + } + + pfree(nfreqs); + pfree(freqs); + } + + pfree(items); + pfree(groups); + + return mcvlist; +} + +/* + * build_mss + * Build a MultiSortSupport for the given StatsBuildData. + */ +static MultiSortSupport +build_mss(StatsBuildData *data) +{ + int i; + int numattrs = data->nattnums; + + /* Sort by multiple columns (using array of SortSupport) */ + MultiSortSupport mss = multi_sort_init(numattrs); + + /* prepare the sort functions for all the attributes */ + for (i = 0; i < numattrs; i++) + { + VacAttrStats *colstat = data->stats[i]; + TypeCacheEntry *type; + + type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) /* shouldn't happen */ + elog(ERROR, "cache lookup failed for ordering operator for type %u", + colstat->attrtypid); + + multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid); + } + + return mss; +} + +/* + * count_distinct_groups + * Count distinct combinations of SortItems in the array. + * + * The array is assumed to be sorted according to the MultiSortSupport. + */ +static int +count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss) +{ + int i; + int ndistinct; + + ndistinct = 1; + for (i = 1; i < numrows; i++) + { + /* make sure the array really is sorted */ + Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0); + + if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0) + ndistinct += 1; + } + + return ndistinct; +} + +/* + * compare_sort_item_count + * Comparator for sorting items by count (frequencies) in descending + * order. + */ +static int +compare_sort_item_count(const void *a, const void *b, void *arg) +{ + SortItem *ia = (SortItem *) a; + SortItem *ib = (SortItem *) b; + + if (ia->count == ib->count) + return 0; + else if (ia->count > ib->count) + return -1; + + return 1; +} + +/* + * build_distinct_groups + * Build an array of SortItems for distinct groups and counts matching + * items. + * + * The 'items' array is assumed to be sorted. + */ +static SortItem * +build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss, + int *ndistinct) +{ + int i, + j; + int ngroups = count_distinct_groups(numrows, items, mss); + + SortItem *groups = (SortItem *) palloc(ngroups * sizeof(SortItem)); + + j = 0; + groups[0] = items[0]; + groups[0].count = 1; + + for (i = 1; i < numrows; i++) + { + /* Assume sorted in ascending order. */ + Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0); + + /* New distinct group detected. */ + if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0) + { + groups[++j] = items[i]; + groups[j].count = 0; + } + + groups[j].count++; + } + + /* ensure we filled the expected number of distinct groups */ + Assert(j + 1 == ngroups); + + /* Sort the distinct groups by frequency (in descending order). */ + qsort_interruptible((void *) groups, ngroups, sizeof(SortItem), + compare_sort_item_count, NULL); + + *ndistinct = ngroups; + return groups; +} + +/* compare sort items (single dimension) */ +static int +sort_item_compare(const void *a, const void *b, void *arg) +{ + SortSupport ssup = (SortSupport) arg; + SortItem *ia = (SortItem *) a; + SortItem *ib = (SortItem *) b; + + return ApplySortComparator(ia->values[0], ia->isnull[0], + ib->values[0], ib->isnull[0], + ssup); +} + +/* + * build_column_frequencies + * Compute frequencies of values in each column. + * + * This returns an array of SortItems for each attribute the MCV is built + * on, with a frequency (number of occurrences) for each value. This is + * then used to compute "base" frequency of MCV items. + * + * All the memory is allocated in a single chunk, so that a single pfree + * is enough to release it. We do not allocate space for values/isnull + * arrays in the SortItems, because we can simply point into the input + * groups directly. + */ +static SortItem ** +build_column_frequencies(SortItem *groups, int ngroups, + MultiSortSupport mss, int *ncounts) +{ + int i, + dim; + SortItem **result; + char *ptr; + + Assert(groups); + Assert(ncounts); + + /* allocate arrays for all columns as a single chunk */ + ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) + + mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups)); + + /* initial array of pointers */ + result = (SortItem **) ptr; + ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims); + + for (dim = 0; dim < mss->ndims; dim++) + { + SortSupport ssup = &mss->ssup[dim]; + + /* array of values for a single column */ + result[dim] = (SortItem *) ptr; + ptr += MAXALIGN(sizeof(SortItem) * ngroups); + + /* extract data for the dimension */ + for (i = 0; i < ngroups; i++) + { + /* point into the input groups */ + result[dim][i].values = &groups[i].values[dim]; + result[dim][i].isnull = &groups[i].isnull[dim]; + result[dim][i].count = groups[i].count; + } + + /* sort the values, deduplicate */ + qsort_interruptible((void *) result[dim], ngroups, sizeof(SortItem), + sort_item_compare, ssup); + + /* + * Identify distinct values, compute frequency (there might be + * multiple MCV items containing this value, so we need to sum counts + * from all of them. + */ + ncounts[dim] = 1; + for (i = 1; i < ngroups; i++) + { + if (sort_item_compare(&result[dim][i - 1], &result[dim][i], ssup) == 0) + { + result[dim][ncounts[dim] - 1].count += result[dim][i].count; + continue; + } + + result[dim][ncounts[dim]] = result[dim][i]; + + ncounts[dim]++; + } + } + + return result; +} + +/* + * statext_mcv_load + * Load the MCV list for the indicated pg_statistic_ext tuple. + */ +MCVList * +statext_mcv_load(Oid mvoid) +{ + MCVList *result; + bool isnull; + Datum mcvlist; + HeapTuple htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid)); + + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for statistics object %u", mvoid); + + mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup, + Anum_pg_statistic_ext_data_stxdmcv, &isnull); + + if (isnull) + elog(ERROR, + "requested statistics kind \"%c\" is not yet built for statistics object %u", + STATS_EXT_DEPENDENCIES, mvoid); + + result = statext_mcv_deserialize(DatumGetByteaP(mcvlist)); + + ReleaseSysCache(htup); + + return result; +} + + +/* + * statext_mcv_serialize + * Serialize MCV list into a pg_mcv_list value. + * + * The MCV items may include values of various data types, and it's reasonable + * to expect redundancy (values for a given attribute, repeated for multiple + * MCV list items). So we deduplicate the values into arrays, and then replace + * the values by indexes into those arrays. + * + * The overall structure of the serialized representation looks like this: + * + * +---------------+----------------+---------------------+-------+ + * | header fields | dimension info | deduplicated values | items | + * +---------------+----------------+---------------------+-------+ + * + * Where dimension info stores information about the type of the K-th + * attribute (e.g. typlen, typbyval and length of deduplicated values). + * Deduplicated values store deduplicated values for each attribute. And + * items store the actual MCV list items, with values replaced by indexes into + * the arrays. + * + * When serializing the items, we use uint16 indexes. The number of MCV items + * is limited by the statistics target (which is capped to 10k at the moment). + * We might increase this to 65k and still fit into uint16, so there's a bit of + * slack. Furthermore, this limit is on the number of distinct values per column, + * and we usually have few of those (and various combinations of them for the + * those MCV list). So uint16 seems fine for now. + * + * We don't really expect the serialization to save as much space as for + * histograms, as we are not doing any bucket splits (which is the source + * of high redundancy in histograms). + * + * TODO: Consider packing boolean flags (NULL) for each item into a single char + * (or a longer type) instead of using an array of bool items. + */ +bytea * +statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats) +{ + int i; + int dim; + int ndims = mcvlist->ndimensions; + + SortSupport ssup; + DimensionInfo *info; + + Size total_length; + + /* serialized items (indexes into arrays, etc.) */ + bytea *raw; + char *ptr; + char *endptr PG_USED_FOR_ASSERTS_ONLY; + + /* values per dimension (and number of non-NULL values) */ + Datum **values = (Datum **) palloc0(sizeof(Datum *) * ndims); + int *counts = (int *) palloc0(sizeof(int) * ndims); + + /* + * We'll include some rudimentary information about the attribute types + * (length, by-val flag), so that we don't have to look them up while + * deserializing the MCV list (we already have the type OID in the + * header). This is safe because when changing the type of the attribute + * the statistics gets dropped automatically. We need to store the info + * about the arrays of deduplicated values anyway. + */ + info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims); + + /* sort support data for all attributes included in the MCV list */ + ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims); + + /* collect and deduplicate values for each dimension (attribute) */ + for (dim = 0; dim < ndims; dim++) + { + int ndistinct; + TypeCacheEntry *typentry; + + /* + * Lookup the LT operator (can't get it from stats extra_data, as we + * don't know how to interpret that - scalar vs. array etc.). + */ + typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR); + + /* copy important info about the data type (length, by-value) */ + info[dim].typlen = stats[dim]->attrtype->typlen; + info[dim].typbyval = stats[dim]->attrtype->typbyval; + + /* allocate space for values in the attribute and collect them */ + values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems); + + for (i = 0; i < mcvlist->nitems; i++) + { + /* skip NULL values - we don't need to deduplicate those */ + if (mcvlist->items[i].isnull[dim]) + continue; + + /* append the value at the end */ + values[dim][counts[dim]] = mcvlist->items[i].values[dim]; + counts[dim] += 1; + } + + /* if there are just NULL values in this dimension, we're done */ + if (counts[dim] == 0) + continue; + + /* sort and deduplicate the data */ + ssup[dim].ssup_cxt = CurrentMemoryContext; + ssup[dim].ssup_collation = stats[dim]->attrcollid; + ssup[dim].ssup_nulls_first = false; + + PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]); + + qsort_interruptible(values[dim], counts[dim], sizeof(Datum), + compare_scalars_simple, &ssup[dim]); + + /* + * Walk through the array and eliminate duplicate values, but keep the + * ordering (so that we can do a binary search later). We know there's + * at least one item as (counts[dim] != 0), so we can skip the first + * element. + */ + ndistinct = 1; /* number of distinct values */ + for (i = 1; i < counts[dim]; i++) + { + /* expect sorted array */ + Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0); + + /* if the value is the same as the previous one, we can skip it */ + if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim])) + continue; + + values[dim][ndistinct] = values[dim][i]; + ndistinct += 1; + } + + /* we must not exceed PG_UINT16_MAX, as we use uint16 indexes */ + Assert(ndistinct <= PG_UINT16_MAX); + + /* + * Store additional info about the attribute - number of deduplicated + * values, and also size of the serialized data. For fixed-length data + * types this is trivial to compute, for varwidth types we need to + * actually walk the array and sum the sizes. + */ + info[dim].nvalues = ndistinct; + + if (info[dim].typbyval) /* by-value data types */ + { + info[dim].nbytes = info[dim].nvalues * info[dim].typlen; + + /* + * We copy the data into the MCV item during deserialization, so + * we don't need to allocate any extra space. + */ + info[dim].nbytes_aligned = 0; + } + else if (info[dim].typlen > 0) /* fixed-length by-ref */ + { + /* + * We don't care about alignment in the serialized data, so we + * pack the data as much as possible. But we also track how much + * data will be needed after deserialization, and in that case we + * need to account for alignment of each item. + * + * Note: As the items are fixed-length, we could easily compute + * this during deserialization, but we do it here anyway. + */ + info[dim].nbytes = info[dim].nvalues * info[dim].typlen; + info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen); + } + else if (info[dim].typlen == -1) /* varlena */ + { + info[dim].nbytes = 0; + info[dim].nbytes_aligned = 0; + for (i = 0; i < info[dim].nvalues; i++) + { + Size len; + + /* + * For varlena values, we detoast the values and store the + * length and data separately. We don't bother with alignment + * here, which means that during deserialization we need to + * copy the fields and only access the copies. + */ + values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i])); + + /* serialized length (uint32 length + data) */ + len = VARSIZE_ANY_EXHDR(values[dim][i]); + info[dim].nbytes += sizeof(uint32); /* length */ + info[dim].nbytes += len; /* value (no header) */ + + /* + * During deserialization we'll build regular varlena values + * with full headers, and we need to align them properly. + */ + info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len); + } + } + else if (info[dim].typlen == -2) /* cstring */ + { + info[dim].nbytes = 0; + info[dim].nbytes_aligned = 0; + for (i = 0; i < info[dim].nvalues; i++) + { + Size len; + + /* + * cstring is handled similar to varlena - first we store the + * length as uint32 and then the data. We don't care about + * alignment, which means that during deserialization we need + * to copy the fields and only access the copies. + */ + + /* c-strings include terminator, so +1 byte */ + len = strlen(DatumGetCString(values[dim][i])) + 1; + info[dim].nbytes += sizeof(uint32); /* length */ + info[dim].nbytes += len; /* value */ + + /* space needed for properly aligned deserialized copies */ + info[dim].nbytes_aligned += MAXALIGN(len); + } + } + + /* we know (count>0) so there must be some data */ + Assert(info[dim].nbytes > 0); + } + + /* + * Now we can finally compute how much space we'll actually need for the + * whole serialized MCV list (varlena header, MCV header, dimension info + * for each attribute, deduplicated values and items). + */ + total_length = (3 * sizeof(uint32)) /* magic + type + nitems */ + + sizeof(AttrNumber) /* ndimensions */ + + (ndims * sizeof(Oid)); /* attribute types */ + + /* dimension info */ + total_length += ndims * sizeof(DimensionInfo); + + /* add space for the arrays of deduplicated values */ + for (i = 0; i < ndims; i++) + total_length += info[i].nbytes; + + /* + * And finally account for the items (those are fixed-length, thanks to + * replacing values with uint16 indexes into the deduplicated arrays). + */ + total_length += mcvlist->nitems * ITEM_SIZE(dim); + + /* + * Allocate space for the whole serialized MCV list (we'll skip bytes, so + * we set them to zero to make the result more compressible). + */ + raw = (bytea *) palloc0(VARHDRSZ + total_length); + SET_VARSIZE(raw, VARHDRSZ + total_length); + + ptr = VARDATA(raw); + endptr = ptr + total_length; + + /* copy the MCV list header fields, one by one */ + memcpy(ptr, &mcvlist->magic, sizeof(uint32)); + ptr += sizeof(uint32); + + memcpy(ptr, &mcvlist->type, sizeof(uint32)); + ptr += sizeof(uint32); + + memcpy(ptr, &mcvlist->nitems, sizeof(uint32)); + ptr += sizeof(uint32); + + memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber)); + ptr += sizeof(AttrNumber); + + memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims); + ptr += (sizeof(Oid) * ndims); + + /* store information about the attributes (data amounts, ...) */ + memcpy(ptr, info, sizeof(DimensionInfo) * ndims); + ptr += sizeof(DimensionInfo) * ndims; + + /* Copy the deduplicated values for all attributes to the output. */ + for (dim = 0; dim < ndims; dim++) + { + /* remember the starting point for Asserts later */ + char *start PG_USED_FOR_ASSERTS_ONLY = ptr; + + for (i = 0; i < info[dim].nvalues; i++) + { + Datum value = values[dim][i]; + + if (info[dim].typbyval) /* passed by value */ + { + Datum tmp; + + /* + * For byval types, we need to copy just the significant bytes + * - we can't use memcpy directly, as that assumes + * little-endian behavior. store_att_byval does almost what + * we need, but it requires a properly aligned buffer - the + * output buffer does not guarantee that. So we simply use a + * local Datum variable (which guarantees proper alignment), + * and then copy the value from it. + */ + store_att_byval(&tmp, value, info[dim].typlen); + + memcpy(ptr, &tmp, info[dim].typlen); + ptr += info[dim].typlen; + } + else if (info[dim].typlen > 0) /* passed by reference */ + { + /* no special alignment needed, treated as char array */ + memcpy(ptr, DatumGetPointer(value), info[dim].typlen); + ptr += info[dim].typlen; + } + else if (info[dim].typlen == -1) /* varlena */ + { + uint32 len = VARSIZE_ANY_EXHDR(DatumGetPointer(value)); + + /* copy the length */ + memcpy(ptr, &len, sizeof(uint32)); + ptr += sizeof(uint32); + + /* data from the varlena value (without the header) */ + memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len); + ptr += len; + } + else if (info[dim].typlen == -2) /* cstring */ + { + uint32 len = (uint32) strlen(DatumGetCString(value)) + 1; + + /* copy the length */ + memcpy(ptr, &len, sizeof(uint32)); + ptr += sizeof(uint32); + + /* value */ + memcpy(ptr, DatumGetCString(value), len); + ptr += len; + } + + /* no underflows or overflows */ + Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes)); + } + + /* we should get exactly nbytes of data for this dimension */ + Assert((ptr - start) == info[dim].nbytes); + } + + /* Serialize the items, with uint16 indexes instead of the values. */ + for (i = 0; i < mcvlist->nitems; i++) + { + MCVItem *mcvitem = &mcvlist->items[i]; + + /* don't write beyond the allocated space */ + Assert(ptr <= (endptr - ITEM_SIZE(dim))); + + /* copy NULL and frequency flags into the serialized MCV */ + memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims); + ptr += sizeof(bool) * ndims; + + memcpy(ptr, &mcvitem->frequency, sizeof(double)); + ptr += sizeof(double); + + memcpy(ptr, &mcvitem->base_frequency, sizeof(double)); + ptr += sizeof(double); + + /* store the indexes last */ + for (dim = 0; dim < ndims; dim++) + { + uint16 index = 0; + Datum *value; + + /* do the lookup only for non-NULL values */ + if (!mcvitem->isnull[dim]) + { + value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim], + info[dim].nvalues, sizeof(Datum), + compare_scalars_simple, &ssup[dim]); + + Assert(value != NULL); /* serialization or deduplication + * error */ + + /* compute index within the deduplicated array */ + index = (uint16) (value - values[dim]); + + /* check the index is within expected bounds */ + Assert(index < info[dim].nvalues); + } + + /* copy the index into the serialized MCV */ + memcpy(ptr, &index, sizeof(uint16)); + ptr += sizeof(uint16); + } + + /* make sure we don't overflow the allocated value */ + Assert(ptr <= endptr); + } + + /* at this point we expect to match the total_length exactly */ + Assert(ptr == endptr); + + pfree(values); + pfree(counts); + + return raw; +} + +/* + * statext_mcv_deserialize + * Reads serialized MCV list into MCVList structure. + * + * All the memory needed by the MCV list is allocated as a single chunk, so + * it's possible to simply pfree() it at once. + */ +MCVList * +statext_mcv_deserialize(bytea *data) +{ + int dim, + i; + Size expected_size; + MCVList *mcvlist; + char *raw; + char *ptr; + char *endptr PG_USED_FOR_ASSERTS_ONLY; + + int ndims, + nitems; + DimensionInfo *info = NULL; + + /* local allocation buffer (used only for deserialization) */ + Datum **map = NULL; + + /* MCV list */ + Size mcvlen; + + /* buffer used for the result */ + Size datalen; + char *dataptr; + char *valuesptr; + char *isnullptr; + + if (data == NULL) + return NULL; + + /* + * We can't possibly deserialize a MCV list if there's not even a complete + * header. We need an explicit formula here, because we serialize the + * header fields one by one, so we need to ignore struct alignment. + */ + if (VARSIZE_ANY(data) < MinSizeOfMCVList) + elog(ERROR, "invalid MCV size %zd (expected at least %zu)", + VARSIZE_ANY(data), MinSizeOfMCVList); + + /* read the MCV list header */ + mcvlist = (MCVList *) palloc0(offsetof(MCVList, items)); + + /* pointer to the data part (skip the varlena header) */ + raw = (char *) data; + ptr = VARDATA_ANY(raw); + endptr = (char *) raw + VARSIZE_ANY(data); + + /* get the header and perform further sanity checks */ + memcpy(&mcvlist->magic, ptr, sizeof(uint32)); + ptr += sizeof(uint32); + + memcpy(&mcvlist->type, ptr, sizeof(uint32)); + ptr += sizeof(uint32); + + memcpy(&mcvlist->nitems, ptr, sizeof(uint32)); + ptr += sizeof(uint32); + + memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber)); + ptr += sizeof(AttrNumber); + + if (mcvlist->magic != STATS_MCV_MAGIC) + elog(ERROR, "invalid MCV magic %u (expected %u)", + mcvlist->magic, STATS_MCV_MAGIC); + + if (mcvlist->type != STATS_MCV_TYPE_BASIC) + elog(ERROR, "invalid MCV type %u (expected %u)", + mcvlist->type, STATS_MCV_TYPE_BASIC); + + if (mcvlist->ndimensions == 0) + elog(ERROR, "invalid zero-length dimension array in MCVList"); + else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) || + (mcvlist->ndimensions < 0)) + elog(ERROR, "invalid length (%d) dimension array in MCVList", + mcvlist->ndimensions); + + if (mcvlist->nitems == 0) + elog(ERROR, "invalid zero-length item array in MCVList"); + else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS) + elog(ERROR, "invalid length (%u) item array in MCVList", + mcvlist->nitems); + + nitems = mcvlist->nitems; + ndims = mcvlist->ndimensions; + + /* + * Check amount of data including DimensionInfo for all dimensions and + * also the serialized items (including uint16 indexes). Also, walk + * through the dimension information and add it to the sum. + */ + expected_size = SizeOfMCVList(ndims, nitems); + + /* + * Check that we have at least the dimension and info records, along with + * the items. We don't know the size of the serialized values yet. We need + * to do this check first, before accessing the dimension info. + */ + if (VARSIZE_ANY(data) < expected_size) + elog(ERROR, "invalid MCV size %zd (expected %zu)", + VARSIZE_ANY(data), expected_size); + + /* Now copy the array of type Oids. */ + memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims); + ptr += (sizeof(Oid) * ndims); + + /* Now it's safe to access the dimension info. */ + info = palloc(ndims * sizeof(DimensionInfo)); + + memcpy(info, ptr, ndims * sizeof(DimensionInfo)); + ptr += (ndims * sizeof(DimensionInfo)); + + /* account for the value arrays */ + for (dim = 0; dim < ndims; dim++) + { + /* + * XXX I wonder if we can/should rely on asserts here. Maybe those + * checks should be done every time? + */ + Assert(info[dim].nvalues >= 0); + Assert(info[dim].nbytes >= 0); + + expected_size += info[dim].nbytes; + } + + /* + * Now we know the total expected MCV size, including all the pieces + * (header, dimension info. items and deduplicated data). So do the final + * check on size. + */ + if (VARSIZE_ANY(data) != expected_size) + elog(ERROR, "invalid MCV size %zd (expected %zu)", + VARSIZE_ANY(data), expected_size); + + /* + * We need an array of Datum values for each dimension, so that we can + * easily translate the uint16 indexes later. We also need a top-level + * array of pointers to those per-dimension arrays. + * + * While allocating the arrays for dimensions, compute how much space we + * need for a copy of the by-ref data, as we can't simply point to the + * original values (it might go away). + */ + datalen = 0; /* space for by-ref data */ + map = (Datum **) palloc(ndims * sizeof(Datum *)); + + for (dim = 0; dim < ndims; dim++) + { + map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues); + + /* space needed for a copy of data for by-ref types */ + datalen += info[dim].nbytes_aligned; + } + + /* + * Now resize the MCV list so that the allocation includes all the data. + * + * Allocate space for a copy of the data, as we can't simply reference the + * serialized data - it's not aligned properly, and it may disappear while + * we're still using the MCV list, e.g. due to catcache release. + * + * We do care about alignment here, because we will allocate all the + * pieces at once, but then use pointers to different parts. + */ + mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems)); + + /* arrays of values and isnull flags for all MCV items */ + mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims); + mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims); + + /* we don't quite need to align this, but it makes some asserts easier */ + mcvlen += MAXALIGN(datalen); + + /* now resize the deserialized MCV list, and compute pointers to parts */ + mcvlist = repalloc(mcvlist, mcvlen); + + /* pointer to the beginning of values/isnull arrays */ + valuesptr = (char *) mcvlist + + MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems)); + + isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims)); + + dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims)); + + /* + * Build mapping (index => value) for translating the serialized data into + * the in-memory representation. + */ + for (dim = 0; dim < ndims; dim++) + { + /* remember start position in the input array */ + char *start PG_USED_FOR_ASSERTS_ONLY = ptr; + + if (info[dim].typbyval) + { + /* for by-val types we simply copy data into the mapping */ + for (i = 0; i < info[dim].nvalues; i++) + { + Datum v = 0; + + memcpy(&v, ptr, info[dim].typlen); + ptr += info[dim].typlen; + + map[dim][i] = fetch_att(&v, true, info[dim].typlen); + + /* no under/overflow of input array */ + Assert(ptr <= (start + info[dim].nbytes)); + } + } + else + { + /* for by-ref types we need to also make a copy of the data */ + + /* passed by reference, but fixed length (name, tid, ...) */ + if (info[dim].typlen > 0) + { + for (i = 0; i < info[dim].nvalues; i++) + { + memcpy(dataptr, ptr, info[dim].typlen); + ptr += info[dim].typlen; + + /* just point into the array */ + map[dim][i] = PointerGetDatum(dataptr); + dataptr += MAXALIGN(info[dim].typlen); + } + } + else if (info[dim].typlen == -1) + { + /* varlena */ + for (i = 0; i < info[dim].nvalues; i++) + { + uint32 len; + + /* read the uint32 length */ + memcpy(&len, ptr, sizeof(uint32)); + ptr += sizeof(uint32); + + /* the length is data-only */ + SET_VARSIZE(dataptr, len + VARHDRSZ); + memcpy(VARDATA(dataptr), ptr, len); + ptr += len; + + /* just point into the array */ + map[dim][i] = PointerGetDatum(dataptr); + + /* skip to place of the next deserialized value */ + dataptr += MAXALIGN(len + VARHDRSZ); + } + } + else if (info[dim].typlen == -2) + { + /* cstring */ + for (i = 0; i < info[dim].nvalues; i++) + { + uint32 len; + + memcpy(&len, ptr, sizeof(uint32)); + ptr += sizeof(uint32); + + memcpy(dataptr, ptr, len); + ptr += len; + + /* just point into the array */ + map[dim][i] = PointerGetDatum(dataptr); + dataptr += MAXALIGN(len); + } + } + + /* no under/overflow of input array */ + Assert(ptr <= (start + info[dim].nbytes)); + + /* no overflow of the output mcv value */ + Assert(dataptr <= ((char *) mcvlist + mcvlen)); + } + + /* check we consumed input data for this dimension exactly */ + Assert(ptr == (start + info[dim].nbytes)); + } + + /* we should have also filled the MCV list exactly */ + Assert(dataptr == ((char *) mcvlist + mcvlen)); + + /* deserialize the MCV items and translate the indexes to Datums */ + for (i = 0; i < nitems; i++) + { + MCVItem *item = &mcvlist->items[i]; + + item->values = (Datum *) valuesptr; + valuesptr += MAXALIGN(sizeof(Datum) * ndims); + + item->isnull = (bool *) isnullptr; + isnullptr += MAXALIGN(sizeof(bool) * ndims); + + memcpy(item->isnull, ptr, sizeof(bool) * ndims); + ptr += sizeof(bool) * ndims; + + memcpy(&item->frequency, ptr, sizeof(double)); + ptr += sizeof(double); + + memcpy(&item->base_frequency, ptr, sizeof(double)); + ptr += sizeof(double); + + /* finally translate the indexes (for non-NULL only) */ + for (dim = 0; dim < ndims; dim++) + { + uint16 index; + + memcpy(&index, ptr, sizeof(uint16)); + ptr += sizeof(uint16); + + if (item->isnull[dim]) + continue; + + item->values[dim] = map[dim][index]; + } + + /* check we're not overflowing the input */ + Assert(ptr <= endptr); + } + + /* check that we processed all the data */ + Assert(ptr == endptr); + + /* release the buffers used for mapping */ + for (dim = 0; dim < ndims; dim++) + pfree(map[dim]); + + pfree(map); + + return mcvlist; +} + +/* + * SRF with details about buckets of a histogram: + * + * - item ID (0...nitems) + * - values (string array) + * - nulls only (boolean array) + * - frequency (double precision) + * - base_frequency (double precision) + * + * The input is the OID of the statistics, and there are no rows returned if + * the statistics contains no histogram. + */ +Datum +pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + MCVList *mcvlist; + TupleDesc tupdesc; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(0)); + + funcctx->user_fctx = mcvlist; + + /* total number of tuples to be returned */ + funcctx->max_calls = 0; + if (funcctx->user_fctx != NULL) + funcctx->max_calls = mcvlist->nitems; + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + tupdesc = BlessTupleDesc(tupdesc); + + /* + * generate attribute metadata needed later to produce tuples from raw + * C strings + */ + funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); + + MemoryContextSwitchTo(oldcontext); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + + if (funcctx->call_cntr < funcctx->max_calls) /* do when there is more + * left to send */ + { + Datum values[5]; + bool nulls[5]; + HeapTuple tuple; + Datum result; + ArrayBuildState *astate_values = NULL; + ArrayBuildState *astate_nulls = NULL; + + int i; + MCVList *mcvlist; + MCVItem *item; + + mcvlist = (MCVList *) funcctx->user_fctx; + + Assert(funcctx->call_cntr < mcvlist->nitems); + + item = &mcvlist->items[funcctx->call_cntr]; + + for (i = 0; i < mcvlist->ndimensions; i++) + { + + astate_nulls = accumArrayResult(astate_nulls, + BoolGetDatum(item->isnull[i]), + false, + BOOLOID, + CurrentMemoryContext); + + if (!item->isnull[i]) + { + bool isvarlena; + Oid outfunc; + FmgrInfo fmgrinfo; + Datum val; + text *txt; + + /* lookup output func for the type */ + getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena); + fmgr_info(outfunc, &fmgrinfo); + + val = FunctionCall1(&fmgrinfo, item->values[i]); + txt = cstring_to_text(DatumGetPointer(val)); + + astate_values = accumArrayResult(astate_values, + PointerGetDatum(txt), + false, + TEXTOID, + CurrentMemoryContext); + } + else + astate_values = accumArrayResult(astate_values, + (Datum) 0, + true, + TEXTOID, + CurrentMemoryContext); + } + + values[0] = Int32GetDatum(funcctx->call_cntr); + values[1] = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext)); + values[2] = PointerGetDatum(makeArrayResult(astate_nulls, CurrentMemoryContext)); + values[3] = Float8GetDatum(item->frequency); + values[4] = Float8GetDatum(item->base_frequency); + + /* no NULLs in the tuple */ + memset(nulls, 0, sizeof(nulls)); + + /* build a tuple */ + tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls); + + /* make the tuple into a datum */ + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else /* do when there is no more left */ + { + SRF_RETURN_DONE(funcctx); + } +} + +/* + * pg_mcv_list_in - input routine for type pg_mcv_list. + * + * pg_mcv_list is real enough to be a table column, but it has no operations + * of its own, and disallows input too + */ +Datum +pg_mcv_list_in(PG_FUNCTION_ARGS) +{ + /* + * pg_mcv_list stores the data in binary form and parsing text input is + * not needed, so disallow this. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_mcv_list"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + + +/* + * pg_mcv_list_out - output routine for type pg_mcv_list. + * + * MCV lists are serialized into a bytea value, so we simply call byteaout() + * to serialize the value into text. But it'd be nice to serialize that into + * a meaningful representation (e.g. for inspection by people). + * + * XXX This should probably return something meaningful, similar to what + * pg_dependencies_out does. Not sure how to deal with the deduplicated + * values, though - do we want to expand that or not? + */ +Datum +pg_mcv_list_out(PG_FUNCTION_ARGS) +{ + return byteaout(fcinfo); +} + +/* + * pg_mcv_list_recv - binary input routine for type pg_mcv_list. + */ +Datum +pg_mcv_list_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_mcv_list"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_mcv_list_send - binary output routine for type pg_mcv_list. + * + * MCV lists are serialized in a bytea value (although the type is named + * differently), so let's just send that. + */ +Datum +pg_mcv_list_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} + +/* + * match the attribute/expression to a dimension of the statistic + * + * Returns the zero-based index of the matching statistics dimension. + * Optionally determines the collation. + */ +static int +mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid) +{ + int idx; + + if (IsA(expr, Var)) + { + /* simple Var, so just lookup using varattno */ + Var *var = (Var *) expr; + + if (collid) + *collid = var->varcollid; + + idx = bms_member_index(keys, var->varattno); + + if (idx < 0) + elog(ERROR, "variable not found in statistics object"); + } + else + { + /* expression - lookup in stats expressions */ + ListCell *lc; + + if (collid) + *collid = exprCollation(expr); + + /* expressions are stored after the simple columns */ + idx = bms_num_members(keys); + foreach(lc, exprs) + { + Node *stat_expr = (Node *) lfirst(lc); + + if (equal(expr, stat_expr)) + break; + + idx++; + } + + if (lc == NULL) + elog(ERROR, "expression not found in statistics object"); + } + + return idx; +} + +/* + * mcv_get_match_bitmap + * Evaluate clauses using the MCV list, and update the match bitmap. + * + * A match bitmap keeps match/mismatch status for each MCV item, and we + * update it based on additional clauses. We also use it to skip items + * that can't possibly match (e.g. item marked as "mismatch" can't change + * to "match" when evaluating AND clause list). + * + * The function also returns a flag indicating whether there was an + * equality condition for all attributes, the minimum frequency in the MCV + * list, and a total MCV frequency (sum of frequencies for all items). + * + * XXX Currently the match bitmap uses a bool for each MCV item, which is + * somewhat wasteful as we could do with just a single bit, thus reducing + * the size to ~1/8. It would also allow us to combine bitmaps simply using + * & and |, which should be faster than min/max. The bitmaps are fairly + * small, though (thanks to the cap on the MCV list size). + */ +static bool * +mcv_get_match_bitmap(PlannerInfo *root, List *clauses, + Bitmapset *keys, List *exprs, + MCVList *mcvlist, bool is_or) +{ + int i; + ListCell *l; + bool *matches; + + /* The bitmap may be partially built. */ + Assert(clauses != NIL); + Assert(list_length(clauses) >= 1); + Assert(mcvlist != NULL); + Assert(mcvlist->nitems > 0); + Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS); + + matches = palloc(sizeof(bool) * mcvlist->nitems); + memset(matches, (is_or) ? false : true, + sizeof(bool) * mcvlist->nitems); + + /* + * Loop through the list of clauses, and for each of them evaluate all the + * MCV items not yet eliminated by the preceding clauses. + */ + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + + /* if it's a RestrictInfo, then extract the clause */ + if (IsA(clause, RestrictInfo)) + clause = (Node *) ((RestrictInfo *) clause)->clause; + + /* + * Handle the various types of clauses - OpClause, NullTest and + * AND/OR/NOT + */ + if (is_opclause(clause)) + { + OpExpr *expr = (OpExpr *) clause; + FmgrInfo opproc; + + /* valid only after examine_opclause_args returns true */ + Node *clause_expr; + Const *cst; + bool expronleft; + int idx; + Oid collid; + + fmgr_info(get_opcode(expr->opno), &opproc); + + /* extract the var/expr and const from the expression */ + if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft)) + elog(ERROR, "incompatible clause"); + + /* match the attribute/expression to a dimension of the statistic */ + idx = mcv_match_expression(clause_expr, keys, exprs, &collid); + + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + bool match = true; + MCVItem *item = &mcvlist->items[i]; + + Assert(idx >= 0); + + /* + * When the MCV item or the Const value is NULL we can treat + * this as a mismatch. We must not call the operator because + * of strictness. + */ + if (item->isnull[idx] || cst->constisnull) + { + matches[i] = RESULT_MERGE(matches[i], is_or, false); + continue; + } + + /* + * Skip MCV items that can't change result in the bitmap. Once + * the value gets false for AND-lists, or true for OR-lists, + * we don't need to look at more clauses. + */ + if (RESULT_IS_FINAL(matches[i], is_or)) + continue; + + /* + * First check whether the constant is below the lower + * boundary (in that case we can skip the bucket, because + * there's no overlap). + * + * We don't store collations used to build the statistics, but + * we can use the collation for the attribute itself, as + * stored in varcollid. We do reset the statistics after a + * type change (including collation change), so this is OK. + * For expressions, we use the collation extracted from the + * expression itself. + */ + if (expronleft) + match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + item->values[idx], + cst->constvalue)); + else + match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + cst->constvalue, + item->values[idx])); + + /* update the match bitmap with the result */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); + } + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; + FmgrInfo opproc; + + /* valid only after examine_opclause_args returns true */ + Node *clause_expr; + Const *cst; + bool expronleft; + Oid collid; + int idx; + + /* array evaluation */ + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + + fmgr_info(get_opcode(expr->opno), &opproc); + + /* extract the var/expr and const from the expression */ + if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft)) + elog(ERROR, "incompatible clause"); + + /* We expect Var on left */ + if (!expronleft) + elog(ERROR, "incompatible clause"); + + /* + * Deconstruct the array constant, unless it's NULL (we'll cover + * that case below) + */ + if (!cst->constisnull) + { + arrayval = DatumGetArrayTypeP(cst->constvalue); + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + } + + /* match the attribute/expression to a dimension of the statistic */ + idx = mcv_match_expression(clause_expr, keys, exprs, &collid); + + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + int j; + bool match = (expr->useOr ? false : true); + MCVItem *item = &mcvlist->items[i]; + + /* + * When the MCV item or the Const value is NULL we can treat + * this as a mismatch. We must not call the operator because + * of strictness. + */ + if (item->isnull[idx] || cst->constisnull) + { + matches[i] = RESULT_MERGE(matches[i], is_or, false); + continue; + } + + /* + * Skip MCV items that can't change result in the bitmap. Once + * the value gets false for AND-lists, or true for OR-lists, + * we don't need to look at more clauses. + */ + if (RESULT_IS_FINAL(matches[i], is_or)) + continue; + + for (j = 0; j < num_elems; j++) + { + Datum elem_value = elem_values[j]; + bool elem_isnull = elem_nulls[j]; + bool elem_match; + + /* NULL values always evaluate as not matching. */ + if (elem_isnull) + { + match = RESULT_MERGE(match, expr->useOr, false); + continue; + } + + /* + * Stop evaluating the array elements once we reach a + * matching value that can't change - ALL() is the same as + * AND-list, ANY() is the same as OR-list. + */ + if (RESULT_IS_FINAL(match, expr->useOr)) + break; + + elem_match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + item->values[idx], + elem_value)); + + match = RESULT_MERGE(match, expr->useOr, elem_match); + } + + /* update the match bitmap with the result */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); + } + } + else if (IsA(clause, NullTest)) + { + NullTest *expr = (NullTest *) clause; + Node *clause_expr = (Node *) (expr->arg); + + /* match the attribute/expression to a dimension of the statistic */ + int idx = mcv_match_expression(clause_expr, keys, exprs, NULL); + + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + bool match = false; /* assume mismatch */ + MCVItem *item = &mcvlist->items[i]; + + /* if the clause mismatches the MCV item, update the bitmap */ + switch (expr->nulltesttype) + { + case IS_NULL: + match = (item->isnull[idx]) ? true : match; + break; + + case IS_NOT_NULL: + match = (!item->isnull[idx]) ? true : match; + break; + } + + /* now, update the match bitmap, depending on OR/AND type */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); + } + } + else if (is_orclause(clause) || is_andclause(clause)) + { + /* AND/OR clause, with all subclauses being compatible */ + + int i; + BoolExpr *bool_clause = ((BoolExpr *) clause); + List *bool_clauses = bool_clause->args; + + /* match/mismatch bitmap for each MCV item */ + bool *bool_matches = NULL; + + Assert(bool_clauses != NIL); + Assert(list_length(bool_clauses) >= 2); + + /* build the match bitmap for the OR-clauses */ + bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs, + mcvlist, is_orclause(clause)); + + /* + * Merge the bitmap produced by mcv_get_match_bitmap into the + * current one. We need to consider if we're evaluating AND or OR + * condition when merging the results. + */ + for (i = 0; i < mcvlist->nitems; i++) + matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]); + + pfree(bool_matches); + } + else if (is_notclause(clause)) + { + /* NOT clause, with all subclauses compatible */ + + int i; + BoolExpr *not_clause = ((BoolExpr *) clause); + List *not_args = not_clause->args; + + /* match/mismatch bitmap for each MCV item */ + bool *not_matches = NULL; + + Assert(not_args != NIL); + Assert(list_length(not_args) == 1); + + /* build the match bitmap for the NOT-clause */ + not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs, + mcvlist, false); + + /* + * Merge the bitmap produced by mcv_get_match_bitmap into the + * current one. We're handling a NOT clause, so invert the result + * before merging it into the global bitmap. + */ + for (i = 0; i < mcvlist->nitems; i++) + matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]); + + pfree(not_matches); + } + else if (IsA(clause, Var)) + { + /* Var (has to be a boolean Var, possibly from below NOT) */ + + Var *var = (Var *) (clause); + + /* match the attribute to a dimension of the statistic */ + int idx = bms_member_index(keys, var->varattno); + + Assert(var->vartype == BOOLOID); + + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + MCVItem *item = &mcvlist->items[i]; + bool match = false; + + /* if the item is NULL, it's a mismatch */ + if (!item->isnull[idx] && DatumGetBool(item->values[idx])) + match = true; + + /* update the result bitmap */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); + } + } + else + { + /* Otherwise, it must be a bare boolean-returning expression */ + int idx; + + /* match the expression to a dimension of the statistic */ + idx = mcv_match_expression(clause, keys, exprs, NULL); + + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + bool match; + MCVItem *item = &mcvlist->items[i]; + + /* "match" just means it's bool TRUE */ + match = !item->isnull[idx] && DatumGetBool(item->values[idx]); + + /* now, update the match bitmap, depending on OR/AND type */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); + } + } + } + + return matches; +} + + +/* + * mcv_combine_selectivities + * Combine per-column and multi-column MCV selectivity estimates. + * + * simple_sel is a "simple" selectivity estimate (produced without using any + * extended statistics, essentially assuming independence of columns/clauses). + * + * mcv_sel and mcv_basesel are sums of the frequencies and base frequencies of + * all matching MCV items. The difference (mcv_sel - mcv_basesel) is then + * essentially interpreted as a correction to be added to simple_sel, as + * described below. + * + * mcv_totalsel is the sum of the frequencies of all MCV items (not just the + * matching ones). This is used as an upper bound on the portion of the + * selectivity estimates not covered by the MCV statistics. + * + * Note: While simple and base selectivities are defined in a quite similar + * way, the values are computed differently and are not therefore equal. The + * simple selectivity is computed as a product of per-clause estimates, while + * the base selectivity is computed by adding up base frequencies of matching + * items of the multi-column MCV list. So the values may differ for two main + * reasons - (a) the MCV list may not cover 100% of the data and (b) some of + * the MCV items did not match the estimated clauses. + * + * As both (a) and (b) reduce the base selectivity value, it generally holds + * that (simple_sel >= mcv_basesel). If the MCV list covers all the data, the + * values may be equal. + * + * So, other_sel = (simple_sel - mcv_basesel) is an estimate for the part not + * covered by the MCV list, and (mcv_sel - mcv_basesel) may be seen as a + * correction for the part covered by the MCV list. Those two statements are + * actually equivalent. + */ +Selectivity +mcv_combine_selectivities(Selectivity simple_sel, + Selectivity mcv_sel, + Selectivity mcv_basesel, + Selectivity mcv_totalsel) +{ + Selectivity other_sel; + Selectivity sel; + + /* estimated selectivity of values not covered by MCV matches */ + other_sel = simple_sel - mcv_basesel; + CLAMP_PROBABILITY(other_sel); + + /* this non-MCV selectivity cannot exceed 1 - mcv_totalsel */ + if (other_sel > 1.0 - mcv_totalsel) + other_sel = 1.0 - mcv_totalsel; + + /* overall selectivity is the sum of the MCV and non-MCV parts */ + sel = mcv_sel + other_sel; + CLAMP_PROBABILITY(sel); + + return sel; +} + + +/* + * mcv_clauselist_selectivity + * Use MCV statistics to estimate the selectivity of an implicitly-ANDed + * list of clauses. + * + * This determines which MCV items match every clause in the list and returns + * the sum of the frequencies of those items. + * + * In addition, it returns the sum of the base frequencies of each of those + * items (that is the sum of the selectivities that each item would have if + * the columns were independent of one another), and the total selectivity of + * all the MCV items (not just the matching ones). These are expected to be + * used together with a "simple" selectivity estimate (one based only on + * per-column statistics) to produce an overall selectivity estimate that + * makes use of both per-column and multi-column statistics --- see + * mcv_combine_selectivities(). + */ +Selectivity +mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, + List *clauses, int varRelid, + JoinType jointype, SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Selectivity *basesel, Selectivity *totalsel) +{ + int i; + MCVList *mcv; + Selectivity s = 0.0; + + /* match/mismatch bitmap for each MCV item */ + bool *matches = NULL; + + /* load the MCV list stored in the statistics object */ + mcv = statext_mcv_load(stat->statOid); + + /* build a match bitmap for the clauses */ + matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs, + mcv, false); + + /* sum frequencies for all the matching MCV items */ + *basesel = 0.0; + *totalsel = 0.0; + for (i = 0; i < mcv->nitems; i++) + { + *totalsel += mcv->items[i].frequency; + + if (matches[i] != false) + { + *basesel += mcv->items[i].base_frequency; + s += mcv->items[i].frequency; + } + } + + return s; +} + + +/* + * mcv_clause_selectivity_or + * Use MCV statistics to estimate the selectivity of a clause that + * appears in an ORed list of clauses. + * + * As with mcv_clauselist_selectivity() this determines which MCV items match + * the clause and returns both the sum of the frequencies and the sum of the + * base frequencies of those items, as well as the sum of the frequencies of + * all MCV items (not just the matching ones) so that this information can be + * used by mcv_combine_selectivities() to produce a selectivity estimate that + * makes use of both per-column and multi-column statistics. + * + * Additionally, we return information to help compute the overall selectivity + * of the ORed list of clauses assumed to contain this clause. This function + * is intended to be called for each clause in the ORed list of clauses, + * allowing the overall selectivity to be computed using the following + * algorithm: + * + * Suppose P[n] = P(C[1] OR C[2] OR ... OR C[n]) is the combined selectivity + * of the first n clauses in the list. Then the combined selectivity taking + * into account the next clause C[n+1] can be written as + * + * P[n+1] = P[n] + P(C[n+1]) - P((C[1] OR ... OR C[n]) AND C[n+1]) + * + * The final term above represents the overlap between the clauses examined so + * far and the (n+1)'th clause. To estimate its selectivity, we track the + * match bitmap for the ORed list of clauses examined so far and examine its + * intersection with the match bitmap for the (n+1)'th clause. + * + * We then also return the sums of the MCV item frequencies and base + * frequencies for the match bitmap intersection corresponding to the overlap + * term above, so that they can be combined with a simple selectivity estimate + * for that term. + * + * The parameter "or_matches" is an in/out parameter tracking the match bitmap + * for the clauses examined so far. The caller is expected to set it to NULL + * the first time it calls this function. + */ +Selectivity +mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, + MCVList *mcv, Node *clause, bool **or_matches, + Selectivity *basesel, Selectivity *overlap_mcvsel, + Selectivity *overlap_basesel, Selectivity *totalsel) +{ + Selectivity s = 0.0; + bool *new_matches; + int i; + + /* build the OR-matches bitmap, if not built already */ + if (*or_matches == NULL) + *or_matches = palloc0(sizeof(bool) * mcv->nitems); + + /* build the match bitmap for the new clause */ + new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys, + stat->exprs, mcv, false); + + /* + * Sum the frequencies for all the MCV items matching this clause and also + * those matching the overlap between this clause and any of the preceding + * clauses as described above. + */ + *basesel = 0.0; + *overlap_mcvsel = 0.0; + *overlap_basesel = 0.0; + *totalsel = 0.0; + for (i = 0; i < mcv->nitems; i++) + { + *totalsel += mcv->items[i].frequency; + + if (new_matches[i]) + { + s += mcv->items[i].frequency; + *basesel += mcv->items[i].base_frequency; + + if ((*or_matches)[i]) + { + *overlap_mcvsel += mcv->items[i].frequency; + *overlap_basesel += mcv->items[i].base_frequency; + } + } + + /* update the OR-matches bitmap for the next clause */ + (*or_matches)[i] = (*or_matches)[i] || new_matches[i]; + } + + pfree(new_matches); + + return s; +} |