summaryrefslogtreecommitdiffstats
path: root/src/backend/statistics/dependencies.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/statistics/dependencies.c')
-rw-r--r--src/backend/statistics/dependencies.c1860
1 files changed, 1860 insertions, 0 deletions
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
new file mode 100644
index 0000000..a9187dd
--- /dev/null
+++ b/src/backend/statistics/dependencies.c
@@ -0,0 +1,1860 @@
+/*-------------------------------------------------------------------------
+ *
+ * dependencies.c
+ * POSTGRES functional dependencies
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/statistics/dependencies.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_statistic_ext.h"
+#include "catalog/pg_statistic_ext_data.h"
+#include "lib/stringinfo.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
+#include "nodes/pathnodes.h"
+#include "optimizer/clauses.h"
+#include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
+#include "statistics/extended_stats_internal.h"
+#include "statistics/statistics.h"
+#include "utils/bytea.h"
+#include "utils/fmgroids.h"
+#include "utils/fmgrprotos.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/selfuncs.h"
+#include "utils/syscache.h"
+#include "utils/typcache.h"
+
+/* size of the struct header fields (magic, type, ndeps) */
+#define SizeOfHeader (3 * sizeof(uint32))
+
+/* size of a serialized dependency (degree, natts, atts) */
+#define SizeOfItem(natts) \
+ (sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
+
+/* minimal size of a dependency (with two attributes) */
+#define MinSizeOfItem SizeOfItem(2)
+
+/* minimal size of dependencies, when all deps are minimal */
+#define MinSizeOfItems(ndeps) \
+ (SizeOfHeader + (ndeps) * MinSizeOfItem)
+
+/*
+ * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
+ * k-permutations of n elements, except that the order does not matter for the
+ * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
+ */
+typedef struct DependencyGeneratorData
+{
+ int k; /* size of the dependency */
+ int n; /* number of possible attributes */
+ int current; /* next dependency to return (index) */
+ AttrNumber ndependencies; /* number of dependencies generated */
+ AttrNumber *dependencies; /* array of pre-generated dependencies */
+} DependencyGeneratorData;
+
+typedef DependencyGeneratorData *DependencyGenerator;
+
+static void generate_dependencies_recurse(DependencyGenerator state,
+ int index, AttrNumber start, AttrNumber *current);
+static void generate_dependencies(DependencyGenerator state);
+static DependencyGenerator DependencyGenerator_init(int n, int k);
+static void DependencyGenerator_free(DependencyGenerator state);
+static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
+static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
+static bool dependency_is_fully_matched(MVDependency *dependency,
+ Bitmapset *attnums);
+static bool dependency_is_compatible_clause(Node *clause, Index relid,
+ AttrNumber *attnum);
+static bool dependency_is_compatible_expression(Node *clause, Index relid,
+ List *statlist, Node **expr);
+static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
+ int ndependencies, Bitmapset *attnums);
+static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
+ int varRelid, JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ MVDependency **dependencies,
+ int ndependencies,
+ AttrNumber *list_attnums,
+ Bitmapset **estimatedclauses);
+
+static void
+generate_dependencies_recurse(DependencyGenerator state, int index,
+ AttrNumber start, AttrNumber *current)
+{
+ /*
+ * The generator handles the first (k-1) elements differently from the
+ * last element.
+ */
+ if (index < (state->k - 1))
+ {
+ AttrNumber i;
+
+ /*
+ * The first (k-1) values have to be in ascending order, which we
+ * generate recursively.
+ */
+
+ for (i = start; i < state->n; i++)
+ {
+ current[index] = i;
+ generate_dependencies_recurse(state, (index + 1), (i + 1), current);
+ }
+ }
+ else
+ {
+ int i;
+
+ /*
+ * the last element is the implied value, which does not respect the
+ * ascending order. We just need to check that the value is not in the
+ * first (k-1) elements.
+ */
+
+ for (i = 0; i < state->n; i++)
+ {
+ int j;
+ bool match = false;
+
+ current[index] = i;
+
+ for (j = 0; j < index; j++)
+ {
+ if (current[j] == i)
+ {
+ match = true;
+ break;
+ }
+ }
+
+ /*
+ * If the value is not found in the first part of the dependency,
+ * we're done.
+ */
+ if (!match)
+ {
+ state->dependencies = (AttrNumber *) repalloc(state->dependencies,
+ state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
+ memcpy(&state->dependencies[(state->k * state->ndependencies)],
+ current, state->k * sizeof(AttrNumber));
+ state->ndependencies++;
+ }
+ }
+ }
+}
+
+/* generate all dependencies (k-permutations of n elements) */
+static void
+generate_dependencies(DependencyGenerator state)
+{
+ AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
+
+ generate_dependencies_recurse(state, 0, 0, current);
+
+ pfree(current);
+}
+
+/*
+ * initialize the DependencyGenerator of variations, and prebuild the variations
+ *
+ * This pre-builds all the variations. We could also generate them in
+ * DependencyGenerator_next(), but this seems simpler.
+ */
+static DependencyGenerator
+DependencyGenerator_init(int n, int k)
+{
+ DependencyGenerator state;
+
+ Assert((n >= k) && (k > 0));
+
+ /* allocate the DependencyGenerator state */
+ state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
+ state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
+
+ state->ndependencies = 0;
+ state->current = 0;
+ state->k = k;
+ state->n = n;
+
+ /* now actually pre-generate all the variations */
+ generate_dependencies(state);
+
+ return state;
+}
+
+/* free the DependencyGenerator state */
+static void
+DependencyGenerator_free(DependencyGenerator state)
+{
+ pfree(state->dependencies);
+ pfree(state);
+
+}
+
+/* generate next combination */
+static AttrNumber *
+DependencyGenerator_next(DependencyGenerator state)
+{
+ if (state->current == state->ndependencies)
+ return NULL;
+
+ return &state->dependencies[state->k * state->current++];
+}
+
+
+/*
+ * validates functional dependency on the data
+ *
+ * An actual work horse of detecting functional dependencies. Given a variation
+ * of k attributes, it checks that the first (k-1) are sufficient to determine
+ * the last one.
+ */
+static double
+dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
+{
+ int i,
+ nitems;
+ MultiSortSupport mss;
+ SortItem *items;
+ AttrNumber *attnums_dep;
+
+ /* counters valid within a group */
+ int group_size = 0;
+ int n_violations = 0;
+
+ /* total number of rows supporting (consistent with) the dependency */
+ int n_supporting_rows = 0;
+
+ /* Make sure we have at least two input attributes. */
+ Assert(k >= 2);
+
+ /* sort info for all attributes columns */
+ mss = multi_sort_init(k);
+
+ /*
+ * Translate the array of indexes to regular attnums for the dependency
+ * (we will need this to identify the columns in StatsBuildData).
+ */
+ attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
+ for (i = 0; i < k; i++)
+ attnums_dep[i] = data->attnums[dependency[i]];
+
+ /*
+ * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
+ *
+ * (a) sort the data lexicographically
+ *
+ * (b) split the data into groups by first (k-1) columns
+ *
+ * (c) for each group count different values in the last column
+ *
+ * We use the column data types' default sort operators and collations;
+ * perhaps at some point it'd be worth using column-specific collations?
+ */
+
+ /* prepare the sort function for the dimensions */
+ for (i = 0; i < k; i++)
+ {
+ VacAttrStats *colstat = data->stats[dependency[i]];
+ TypeCacheEntry *type;
+
+ type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
+ if (type->lt_opr == InvalidOid) /* shouldn't happen */
+ elog(ERROR, "cache lookup failed for ordering operator for type %u",
+ colstat->attrtypid);
+
+ /* prepare the sort function for this dimension */
+ multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
+ }
+
+ /*
+ * build an array of SortItem(s) sorted using the multi-sort support
+ *
+ * XXX This relies on all stats entries pointing to the same tuple
+ * descriptor. For now that assumption holds, but it might change in the
+ * future for example if we support statistics on multiple tables.
+ */
+ items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
+
+ /*
+ * Walk through the sorted array, split it into rows according to the
+ * first (k-1) columns. If there's a single value in the last column, we
+ * count the group as 'supporting' the functional dependency. Otherwise we
+ * count it as contradicting.
+ */
+
+ /* start with the first row forming a group */
+ group_size = 1;
+
+ /* loop 1 beyond the end of the array so that we count the final group */
+ for (i = 1; i <= nitems; i++)
+ {
+ /*
+ * Check if the group ended, which may be either because we processed
+ * all the items (i==nitems), or because the i-th item is not equal to
+ * the preceding one.
+ */
+ if (i == nitems ||
+ multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
+ {
+ /*
+ * If no violations were found in the group then track the rows of
+ * the group as supporting the functional dependency.
+ */
+ if (n_violations == 0)
+ n_supporting_rows += group_size;
+
+ /* Reset counters for the new group */
+ n_violations = 0;
+ group_size = 1;
+ continue;
+ }
+ /* first columns match, but the last one does not (so contradicting) */
+ else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
+ n_violations++;
+
+ group_size++;
+ }
+
+ /* Compute the 'degree of validity' as (supporting/total). */
+ return (n_supporting_rows * 1.0 / data->numrows);
+}
+
+/*
+ * detects functional dependencies between groups of columns
+ *
+ * Generates all possible subsets of columns (variations) and computes
+ * the degree of validity for each one. For example when creating statistics
+ * on three columns (a,b,c) there are 9 possible dependencies
+ *
+ * two columns three columns
+ * ----------- -------------
+ * (a) -> b (a,b) -> c
+ * (a) -> c (a,c) -> b
+ * (b) -> a (b,c) -> a
+ * (b) -> c
+ * (c) -> a
+ * (c) -> b
+ */
+MVDependencies *
+statext_dependencies_build(StatsBuildData *data)
+{
+ int i,
+ k;
+
+ /* result */
+ MVDependencies *dependencies = NULL;
+ MemoryContext cxt;
+
+ Assert(data->nattnums >= 2);
+
+ /* tracks memory allocated by dependency_degree calls */
+ cxt = AllocSetContextCreate(CurrentMemoryContext,
+ "dependency_degree cxt",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * We'll try build functional dependencies starting from the smallest ones
+ * covering just 2 columns, to the largest ones, covering all columns
+ * included in the statistics object. We start from the smallest ones
+ * because we want to be able to skip already implied ones.
+ */
+ for (k = 2; k <= data->nattnums; k++)
+ {
+ AttrNumber *dependency; /* array with k elements */
+
+ /* prepare a DependencyGenerator of variation */
+ DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
+
+ /* generate all possible variations of k values (out of n) */
+ while ((dependency = DependencyGenerator_next(DependencyGenerator)))
+ {
+ double degree;
+ MVDependency *d;
+ MemoryContext oldcxt;
+
+ /* release memory used by dependency degree calculation */
+ oldcxt = MemoryContextSwitchTo(cxt);
+
+ /* compute how valid the dependency seems */
+ degree = dependency_degree(data, k, dependency);
+
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextReset(cxt);
+
+ /*
+ * if the dependency seems entirely invalid, don't store it
+ */
+ if (degree == 0.0)
+ continue;
+
+ d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
+ + k * sizeof(AttrNumber));
+
+ /* copy the dependency (and keep the indexes into stxkeys) */
+ d->degree = degree;
+ d->nattributes = k;
+ for (i = 0; i < k; i++)
+ d->attributes[i] = data->attnums[dependency[i]];
+
+ /* initialize the list of dependencies */
+ if (dependencies == NULL)
+ {
+ dependencies
+ = (MVDependencies *) palloc0(sizeof(MVDependencies));
+
+ dependencies->magic = STATS_DEPS_MAGIC;
+ dependencies->type = STATS_DEPS_TYPE_BASIC;
+ dependencies->ndeps = 0;
+ }
+
+ dependencies->ndeps++;
+ dependencies = (MVDependencies *) repalloc(dependencies,
+ offsetof(MVDependencies, deps)
+ + dependencies->ndeps * sizeof(MVDependency *));
+
+ dependencies->deps[dependencies->ndeps - 1] = d;
+ }
+
+ /*
+ * we're done with variations of k elements, so free the
+ * DependencyGenerator
+ */
+ DependencyGenerator_free(DependencyGenerator);
+ }
+
+ MemoryContextDelete(cxt);
+
+ return dependencies;
+}
+
+
+/*
+ * Serialize list of dependencies into a bytea value.
+ */
+bytea *
+statext_dependencies_serialize(MVDependencies *dependencies)
+{
+ int i;
+ bytea *output;
+ char *tmp;
+ Size len;
+
+ /* we need to store ndeps, with a number of attributes for each one */
+ len = VARHDRSZ + SizeOfHeader;
+
+ /* and also include space for the actual attribute numbers and degrees */
+ for (i = 0; i < dependencies->ndeps; i++)
+ len += SizeOfItem(dependencies->deps[i]->nattributes);
+
+ output = (bytea *) palloc0(len);
+ SET_VARSIZE(output, len);
+
+ tmp = VARDATA(output);
+
+ /* Store the base struct values (magic, type, ndeps) */
+ memcpy(tmp, &dependencies->magic, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(tmp, &dependencies->type, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
+ tmp += sizeof(uint32);
+
+ /* store number of attributes and attribute numbers for each dependency */
+ for (i = 0; i < dependencies->ndeps; i++)
+ {
+ MVDependency *d = dependencies->deps[i];
+
+ memcpy(tmp, &d->degree, sizeof(double));
+ tmp += sizeof(double);
+
+ memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
+ tmp += sizeof(AttrNumber);
+
+ memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
+ tmp += sizeof(AttrNumber) * d->nattributes;
+
+ /* protect against overflow */
+ Assert(tmp <= ((char *) output + len));
+ }
+
+ /* make sure we've produced exactly the right amount of data */
+ Assert(tmp == ((char *) output + len));
+
+ return output;
+}
+
+/*
+ * Reads serialized dependencies into MVDependencies structure.
+ */
+MVDependencies *
+statext_dependencies_deserialize(bytea *data)
+{
+ int i;
+ Size min_expected_size;
+ MVDependencies *dependencies;
+ char *tmp;
+
+ if (data == NULL)
+ return NULL;
+
+ if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
+ elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
+ VARSIZE_ANY_EXHDR(data), SizeOfHeader);
+
+ /* read the MVDependencies header */
+ dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
+
+ /* initialize pointer to the data part (skip the varlena header) */
+ tmp = VARDATA_ANY(data);
+
+ /* read the header fields and perform basic sanity checks */
+ memcpy(&dependencies->magic, tmp, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(&dependencies->type, tmp, sizeof(uint32));
+ tmp += sizeof(uint32);
+ memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
+ tmp += sizeof(uint32);
+
+ if (dependencies->magic != STATS_DEPS_MAGIC)
+ elog(ERROR, "invalid dependency magic %d (expected %d)",
+ dependencies->magic, STATS_DEPS_MAGIC);
+
+ if (dependencies->type != STATS_DEPS_TYPE_BASIC)
+ elog(ERROR, "invalid dependency type %d (expected %d)",
+ dependencies->type, STATS_DEPS_TYPE_BASIC);
+
+ if (dependencies->ndeps == 0)
+ elog(ERROR, "invalid zero-length item array in MVDependencies");
+
+ /* what minimum bytea size do we expect for those parameters */
+ min_expected_size = SizeOfItem(dependencies->ndeps);
+
+ if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
+ elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
+ VARSIZE_ANY_EXHDR(data), min_expected_size);
+
+ /* allocate space for the MCV items */
+ dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
+ + (dependencies->ndeps * sizeof(MVDependency *)));
+
+ for (i = 0; i < dependencies->ndeps; i++)
+ {
+ double degree;
+ AttrNumber k;
+ MVDependency *d;
+
+ /* degree of validity */
+ memcpy(&degree, tmp, sizeof(double));
+ tmp += sizeof(double);
+
+ /* number of attributes */
+ memcpy(&k, tmp, sizeof(AttrNumber));
+ tmp += sizeof(AttrNumber);
+
+ /* is the number of attributes valid? */
+ Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
+
+ /* now that we know the number of attributes, allocate the dependency */
+ d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
+ + (k * sizeof(AttrNumber)));
+
+ d->degree = degree;
+ d->nattributes = k;
+
+ /* copy attribute numbers */
+ memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
+ tmp += sizeof(AttrNumber) * d->nattributes;
+
+ dependencies->deps[i] = d;
+
+ /* still within the bytea */
+ Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
+ }
+
+ /* we should have consumed the whole bytea exactly */
+ Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
+
+ return dependencies;
+}
+
+/*
+ * dependency_is_fully_matched
+ * checks that a functional dependency is fully matched given clauses on
+ * attributes (assuming the clauses are suitable equality clauses)
+ */
+static bool
+dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
+{
+ int j;
+
+ /*
+ * Check that the dependency actually is fully covered by clauses. We have
+ * to translate all attribute numbers, as those are referenced
+ */
+ for (j = 0; j < dependency->nattributes; j++)
+ {
+ int attnum = dependency->attributes[j];
+
+ if (!bms_is_member(attnum, attnums))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * statext_dependencies_load
+ * Load the functional dependencies for the indicated pg_statistic_ext tuple
+ */
+MVDependencies *
+statext_dependencies_load(Oid mvoid)
+{
+ MVDependencies *result;
+ bool isnull;
+ Datum deps;
+ HeapTuple htup;
+
+ htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
+
+ deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
+ Anum_pg_statistic_ext_data_stxddependencies, &isnull);
+ if (isnull)
+ elog(ERROR,
+ "requested statistics kind \"%c\" is not yet built for statistics object %u",
+ STATS_EXT_DEPENDENCIES, mvoid);
+
+ result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
+
+ ReleaseSysCache(htup);
+
+ return result;
+}
+
+/*
+ * pg_dependencies_in - input routine for type pg_dependencies.
+ *
+ * pg_dependencies is real enough to be a table column, but it has no operations
+ * of its own, and disallows input too
+ */
+Datum
+pg_dependencies_in(PG_FUNCTION_ARGS)
+{
+ /*
+ * pg_node_list stores the data in binary form and parsing text input is
+ * not needed, so disallow this.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot accept a value of type %s", "pg_dependencies")));
+
+ PG_RETURN_VOID(); /* keep compiler quiet */
+}
+
+/*
+ * pg_dependencies - output routine for type pg_dependencies.
+ */
+Datum
+pg_dependencies_out(PG_FUNCTION_ARGS)
+{
+ bytea *data = PG_GETARG_BYTEA_PP(0);
+ MVDependencies *dependencies = statext_dependencies_deserialize(data);
+ int i,
+ j;
+ StringInfoData str;
+
+ initStringInfo(&str);
+ appendStringInfoChar(&str, '{');
+
+ for (i = 0; i < dependencies->ndeps; i++)
+ {
+ MVDependency *dependency = dependencies->deps[i];
+
+ if (i > 0)
+ appendStringInfoString(&str, ", ");
+
+ appendStringInfoChar(&str, '"');
+ for (j = 0; j < dependency->nattributes; j++)
+ {
+ if (j == dependency->nattributes - 1)
+ appendStringInfoString(&str, " => ");
+ else if (j > 0)
+ appendStringInfoString(&str, ", ");
+
+ appendStringInfo(&str, "%d", dependency->attributes[j]);
+ }
+ appendStringInfo(&str, "\": %f", dependency->degree);
+ }
+
+ appendStringInfoChar(&str, '}');
+
+ PG_RETURN_CSTRING(str.data);
+}
+
+/*
+ * pg_dependencies_recv - binary input routine for type pg_dependencies.
+ */
+Datum
+pg_dependencies_recv(PG_FUNCTION_ARGS)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot accept a value of type %s", "pg_dependencies")));
+
+ PG_RETURN_VOID(); /* keep compiler quiet */
+}
+
+/*
+ * pg_dependencies_send - binary output routine for type pg_dependencies.
+ *
+ * Functional dependencies are serialized in a bytea value (although the type
+ * is named differently), so let's just send that.
+ */
+Datum
+pg_dependencies_send(PG_FUNCTION_ARGS)
+{
+ return byteasend(fcinfo);
+}
+
+/*
+ * dependency_is_compatible_clause
+ * Determines if the clause is compatible with functional dependencies
+ *
+ * Only clauses that have the form of equality to a pseudoconstant, or can be
+ * interpreted that way, are currently accepted. Furthermore the variable
+ * part of the clause must be a simple Var belonging to the specified
+ * relation, whose attribute number we return in *attnum on success.
+ */
+static bool
+dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
+{
+ Var *var;
+ Node *clause_expr;
+
+ if (IsA(clause, RestrictInfo))
+ {
+ RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+ /* Pseudoconstants are not interesting (they couldn't contain a Var) */
+ if (rinfo->pseudoconstant)
+ return false;
+
+ /* Clauses referencing multiple, or no, varnos are incompatible */
+ if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+ return false;
+
+ clause = (Node *) rinfo->clause;
+ }
+
+ if (is_opclause(clause))
+ {
+ /* If it's an opclause, check for Var = Const or Const = Var. */
+ OpExpr *expr = (OpExpr *) clause;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /* Make sure non-selected argument is a pseudoconstant. */
+ if (is_pseudo_constant_clause(lsecond(expr->args)))
+ clause_expr = linitial(expr->args);
+ else if (is_pseudo_constant_clause(linitial(expr->args)))
+ clause_expr = lsecond(expr->args);
+ else
+ return false;
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies.
+ *
+ * This uses the function for estimating selectivity, not the operator
+ * directly (a bit awkward, but well ...).
+ *
+ * XXX this is pretty dubious; probably it'd be better to check btree
+ * or hash opclass membership, so as not to be fooled by custom
+ * selectivity functions, and to be more consistent with decisions
+ * elsewhere in the planner.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (IsA(clause, ScalarArrayOpExpr))
+ {
+ /* If it's an scalar array operator, check for Var IN Const. */
+ ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
+
+ /*
+ * Reject ALL() variant, we only care about ANY/IN.
+ *
+ * XXX Maybe we should check if all the values are the same, and allow
+ * ALL in that case? Doesn't seem very practical, though.
+ */
+ if (!expr->useOr)
+ return false;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /*
+ * We know it's always (Var IN Const), so we assume the var is the
+ * first argument, and pseudoconstant is the second one.
+ */
+ if (!is_pseudo_constant_clause(lsecond(expr->args)))
+ return false;
+
+ clause_expr = linitial(expr->args);
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies. The operator is identified
+ * simply by looking at which function it uses to estimate
+ * selectivity. That's a bit strange, but it's what other similar
+ * places do.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (is_orclause(clause))
+ {
+ BoolExpr *bool_expr = (BoolExpr *) clause;
+ ListCell *lc;
+
+ /* start with no attribute number */
+ *attnum = InvalidAttrNumber;
+
+ foreach(lc, bool_expr->args)
+ {
+ AttrNumber clause_attnum;
+
+ /*
+ * Had we found incompatible clause in the arguments, treat the
+ * whole clause as incompatible.
+ */
+ if (!dependency_is_compatible_clause((Node *) lfirst(lc),
+ relid, &clause_attnum))
+ return false;
+
+ if (*attnum == InvalidAttrNumber)
+ *attnum = clause_attnum;
+
+ /* ensure all the variables are the same (same attnum) */
+ if (*attnum != clause_attnum)
+ return false;
+ }
+
+ /* the Var is already checked by the recursive call */
+ return true;
+ }
+ else if (is_notclause(clause))
+ {
+ /*
+ * "NOT x" can be interpreted as "x = false", so get the argument and
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) get_notclausearg(clause);
+ }
+ else
+ {
+ /*
+ * A boolean expression "x" can be interpreted as "x = true", so
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) clause;
+ }
+
+ /*
+ * We may ignore any RelabelType node above the operand. (There won't be
+ * more than one, since eval_const_expressions has been applied already.)
+ */
+ if (IsA(clause_expr, RelabelType))
+ clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
+
+ /* We only support plain Vars for now */
+ if (!IsA(clause_expr, Var))
+ return false;
+
+ /* OK, we know we have a Var */
+ var = (Var *) clause_expr;
+
+ /* Ensure Var is from the correct relation */
+ if (var->varno != relid)
+ return false;
+
+ /* We also better ensure the Var is from the current level */
+ if (var->varlevelsup != 0)
+ return false;
+
+ /* Also ignore system attributes (we don't allow stats on those) */
+ if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+ return false;
+
+ *attnum = var->varattno;
+ return true;
+}
+
+/*
+ * find_strongest_dependency
+ * find the strongest dependency on the attributes
+ *
+ * When applying functional dependencies, we start with the strongest
+ * dependencies. That is, we select the dependency that:
+ *
+ * (a) has all attributes covered by equality clauses
+ *
+ * (b) has the most attributes
+ *
+ * (c) has the highest degree of validity
+ *
+ * This guarantees that we eliminate the most redundant conditions first
+ * (see the comment in dependencies_clauselist_selectivity).
+ */
+static MVDependency *
+find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
+ Bitmapset *attnums)
+{
+ int i,
+ j;
+ MVDependency *strongest = NULL;
+
+ /* number of attnums in clauses */
+ int nattnums = bms_num_members(attnums);
+
+ /*
+ * Iterate over the MVDependency items and find the strongest one from the
+ * fully-matched dependencies. We do the cheap checks first, before
+ * matching it against the attnums.
+ */
+ for (i = 0; i < ndependencies; i++)
+ {
+ for (j = 0; j < dependencies[i]->ndeps; j++)
+ {
+ MVDependency *dependency = dependencies[i]->deps[j];
+
+ /*
+ * Skip dependencies referencing more attributes than available
+ * clauses, as those can't be fully matched.
+ */
+ if (dependency->nattributes > nattnums)
+ continue;
+
+ if (strongest)
+ {
+ /* skip dependencies on fewer attributes than the strongest. */
+ if (dependency->nattributes < strongest->nattributes)
+ continue;
+
+ /* also skip weaker dependencies when attribute count matches */
+ if (strongest->nattributes == dependency->nattributes &&
+ strongest->degree > dependency->degree)
+ continue;
+ }
+
+ /*
+ * this dependency is stronger, but we must still check that it's
+ * fully matched to these attnums. We perform this check last as
+ * it's slightly more expensive than the previous checks.
+ */
+ if (dependency_is_fully_matched(dependency, attnums))
+ strongest = dependency; /* save new best match */
+ }
+ }
+
+ return strongest;
+}
+
+/*
+ * clauselist_apply_dependencies
+ * Apply the specified functional dependencies to a list of clauses and
+ * return the estimated selectivity of the clauses that are compatible
+ * with any of the given dependencies.
+ *
+ * This will estimate all not-already-estimated clauses that are compatible
+ * with functional dependencies, and which have an attribute mentioned by any
+ * of the given dependencies (either as an implying or implied attribute).
+ *
+ * Given (lists of) clauses on attributes (a,b) and a functional dependency
+ * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
+ * using the formula
+ *
+ * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
+ *
+ * where 'f' is the degree of dependency. This reflects the fact that we
+ * expect a fraction f of all rows to be consistent with the dependency
+ * (a=>b), and so have a selectivity of P(a), while the remaining rows are
+ * treated as independent.
+ *
+ * In practice, we use a slightly modified version of this formula, which uses
+ * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
+ * should obviously not exceed either column's individual selectivity. I.e.,
+ * we actually combine selectivities using the formula
+ *
+ * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
+ *
+ * This can make quite a difference if the specific values matching the
+ * clauses are not consistent with the functional dependency.
+ */
+static Selectivity
+clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
+ int varRelid, JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ MVDependency **dependencies, int ndependencies,
+ AttrNumber *list_attnums,
+ Bitmapset **estimatedclauses)
+{
+ Bitmapset *attnums;
+ int i;
+ int j;
+ int nattrs;
+ Selectivity *attr_sel;
+ int attidx;
+ int listidx;
+ ListCell *l;
+ Selectivity s1;
+
+ /*
+ * Extract the attnums of all implying and implied attributes from all the
+ * given dependencies. Each of these attributes is expected to have at
+ * least 1 not-already-estimated compatible clause that we will estimate
+ * here.
+ */
+ attnums = NULL;
+ for (i = 0; i < ndependencies; i++)
+ {
+ for (j = 0; j < dependencies[i]->nattributes; j++)
+ {
+ AttrNumber attnum = dependencies[i]->attributes[j];
+
+ attnums = bms_add_member(attnums, attnum);
+ }
+ }
+
+ /*
+ * Compute per-column selectivity estimates for each of these attributes,
+ * and mark all the corresponding clauses as estimated.
+ */
+ nattrs = bms_num_members(attnums);
+ attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs);
+
+ attidx = 0;
+ i = -1;
+ while ((i = bms_next_member(attnums, i)) >= 0)
+ {
+ List *attr_clauses = NIL;
+ Selectivity simple_sel;
+
+ listidx = -1;
+ foreach(l, clauses)
+ {
+ Node *clause = (Node *) lfirst(l);
+
+ listidx++;
+ if (list_attnums[listidx] == i)
+ {
+ attr_clauses = lappend(attr_clauses, clause);
+ *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
+ }
+ }
+
+ simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
+ jointype, sjinfo, false);
+ attr_sel[attidx++] = simple_sel;
+ }
+
+ /*
+ * Now combine these selectivities using the dependency information. For
+ * chains of dependencies such as a -> b -> c, the b -> c dependency will
+ * come before the a -> b dependency in the array, so we traverse the
+ * array backwards to ensure such chains are computed in the right order.
+ *
+ * As explained above, pairs of selectivities are combined using the
+ * formula
+ *
+ * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
+ *
+ * to ensure that the combined selectivity is never greater than either
+ * individual selectivity.
+ *
+ * Where multiple dependencies apply (e.g., a -> b -> c), we use
+ * conditional probabilities to compute the overall result as follows:
+ *
+ * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
+ *
+ * so we replace the selectivities of all implied attributes with
+ * conditional probabilities, that are conditional on all their implying
+ * attributes. The selectivities of all other non-implied attributes are
+ * left as they are.
+ */
+ for (i = ndependencies - 1; i >= 0; i--)
+ {
+ MVDependency *dependency = dependencies[i];
+ AttrNumber attnum;
+ Selectivity s2;
+ double f;
+
+ /* Selectivity of all the implying attributes */
+ s1 = 1.0;
+ for (j = 0; j < dependency->nattributes - 1; j++)
+ {
+ attnum = dependency->attributes[j];
+ attidx = bms_member_index(attnums, attnum);
+ s1 *= attr_sel[attidx];
+ }
+
+ /* Original selectivity of the implied attribute */
+ attnum = dependency->attributes[j];
+ attidx = bms_member_index(attnums, attnum);
+ s2 = attr_sel[attidx];
+
+ /*
+ * Replace s2 with the conditional probability s2 given s1, computed
+ * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
+ *
+ * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
+ *
+ * where P(a) = s1, the selectivity of the implying attributes, and
+ * P(b) = s2, the selectivity of the implied attribute.
+ */
+ f = dependency->degree;
+
+ if (s1 <= s2)
+ attr_sel[attidx] = f + (1 - f) * s2;
+ else
+ attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
+ }
+
+ /*
+ * The overall selectivity of all the clauses on all these attributes is
+ * then the product of all the original (non-implied) probabilities and
+ * the new conditional (implied) probabilities.
+ */
+ s1 = 1.0;
+ for (i = 0; i < nattrs; i++)
+ s1 *= attr_sel[i];
+
+ CLAMP_PROBABILITY(s1);
+
+ pfree(attr_sel);
+ bms_free(attnums);
+
+ return s1;
+}
+
+/*
+ * dependency_is_compatible_expression
+ * Determines if the expression is compatible with functional dependencies
+ *
+ * Similar to dependency_is_compatible_clause, but doesn't enforce that the
+ * expression is a simple Var. OTOH we check that there's at least one
+ * statistics object matching the expression.
+ */
+static bool
+dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
+{
+ List *vars;
+ ListCell *lc,
+ *lc2;
+ Node *clause_expr;
+
+ if (IsA(clause, RestrictInfo))
+ {
+ RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+ /* Pseudoconstants are not interesting (they couldn't contain a Var) */
+ if (rinfo->pseudoconstant)
+ return false;
+
+ /* Clauses referencing multiple, or no, varnos are incompatible */
+ if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+ return false;
+
+ clause = (Node *) rinfo->clause;
+ }
+
+ if (is_opclause(clause))
+ {
+ /* If it's an opclause, check for Var = Const or Const = Var. */
+ OpExpr *expr = (OpExpr *) clause;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /* Make sure non-selected argument is a pseudoconstant. */
+ if (is_pseudo_constant_clause(lsecond(expr->args)))
+ clause_expr = linitial(expr->args);
+ else if (is_pseudo_constant_clause(linitial(expr->args)))
+ clause_expr = lsecond(expr->args);
+ else
+ return false;
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies.
+ *
+ * This uses the function for estimating selectivity, not the operator
+ * directly (a bit awkward, but well ...).
+ *
+ * XXX this is pretty dubious; probably it'd be better to check btree
+ * or hash opclass membership, so as not to be fooled by custom
+ * selectivity functions, and to be more consistent with decisions
+ * elsewhere in the planner.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (IsA(clause, ScalarArrayOpExpr))
+ {
+ /* If it's an scalar array operator, check for Var IN Const. */
+ ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
+
+ /*
+ * Reject ALL() variant, we only care about ANY/IN.
+ *
+ * FIXME Maybe we should check if all the values are the same, and
+ * allow ALL in that case? Doesn't seem very practical, though.
+ */
+ if (!expr->useOr)
+ return false;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /*
+ * We know it's always (Var IN Const), so we assume the var is the
+ * first argument, and pseudoconstant is the second one.
+ */
+ if (!is_pseudo_constant_clause(lsecond(expr->args)))
+ return false;
+
+ clause_expr = linitial(expr->args);
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies. The operator is identified
+ * simply by looking at which function it uses to estimate
+ * selectivity. That's a bit strange, but it's what other similar
+ * places do.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (is_orclause(clause))
+ {
+ BoolExpr *bool_expr = (BoolExpr *) clause;
+ ListCell *lc;
+
+ /* start with no expression (we'll use the first match) */
+ *expr = NULL;
+
+ foreach(lc, bool_expr->args)
+ {
+ Node *or_expr = NULL;
+
+ /*
+ * Had we found incompatible expression in the arguments, treat
+ * the whole expression as incompatible.
+ */
+ if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
+ statlist, &or_expr))
+ return false;
+
+ if (*expr == NULL)
+ *expr = or_expr;
+
+ /* ensure all the expressions are the same */
+ if (!equal(or_expr, *expr))
+ return false;
+ }
+
+ /* the expression is already checked by the recursive call */
+ return true;
+ }
+ else if (is_notclause(clause))
+ {
+ /*
+ * "NOT x" can be interpreted as "x = false", so get the argument and
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) get_notclausearg(clause);
+ }
+ else
+ {
+ /*
+ * A boolean expression "x" can be interpreted as "x = true", so
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) clause;
+ }
+
+ /*
+ * We may ignore any RelabelType node above the operand. (There won't be
+ * more than one, since eval_const_expressions has been applied already.)
+ */
+ if (IsA(clause_expr, RelabelType))
+ clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
+
+ vars = pull_var_clause(clause_expr, 0);
+
+ foreach(lc, vars)
+ {
+ Var *var = (Var *) lfirst(lc);
+
+ /* Ensure Var is from the correct relation */
+ if (var->varno != relid)
+ return false;
+
+ /* We also better ensure the Var is from the current level */
+ if (var->varlevelsup != 0)
+ return false;
+
+ /* Also ignore system attributes (we don't allow stats on those) */
+ if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+ return false;
+ }
+
+ /*
+ * Check if we actually have a matching statistics for the expression.
+ *
+ * XXX Maybe this is an overkill. We'll eliminate the expressions later.
+ */
+ foreach(lc, statlist)
+ {
+ StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+
+ /* ignore stats without dependencies */
+ if (info->kind != STATS_EXT_DEPENDENCIES)
+ continue;
+
+ foreach(lc2, info->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc2);
+
+ if (equal(clause_expr, stat_expr))
+ {
+ *expr = stat_expr;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
+ * dependencies_clauselist_selectivity
+ * Return the estimated selectivity of (a subset of) the given clauses
+ * using functional dependency statistics, or 1.0 if no useful functional
+ * dependency statistic exists.
+ *
+ * 'estimatedclauses' is an input/output argument that gets a bit set
+ * corresponding to the (zero-based) list index of each clause that is included
+ * in the estimated selectivity.
+ *
+ * Given equality clauses on attributes (a,b) we find the strongest dependency
+ * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
+ * dependency, we then combine the per-clause selectivities using the formula
+ *
+ * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
+ *
+ * where 'f' is the degree of the dependency. (Actually we use a slightly
+ * modified version of this formula -- see clauselist_apply_dependencies()).
+ *
+ * With clauses on more than two attributes, the dependencies are applied
+ * recursively, starting with the widest/strongest dependencies. For example
+ * P(a,b,c) is first split like this:
+ *
+ * P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
+ *
+ * assuming (a,b=>c) is the strongest dependency.
+ */
+Selectivity
+dependencies_clauselist_selectivity(PlannerInfo *root,
+ List *clauses,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ RelOptInfo *rel,
+ Bitmapset **estimatedclauses)
+{
+ Selectivity s1 = 1.0;
+ ListCell *l;
+ Bitmapset *clauses_attnums = NULL;
+ AttrNumber *list_attnums;
+ int listidx;
+ MVDependencies **func_dependencies;
+ int nfunc_dependencies;
+ int total_ndeps;
+ MVDependency **dependencies;
+ int ndependencies;
+ int i;
+ AttrNumber attnum_offset;
+ RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
+
+ /* unique expressions */
+ Node **unique_exprs;
+ int unique_exprs_cnt;
+
+ /*
+ * When dealing with regular inheritance trees, ignore extended stats
+ * (which were built without data from child rels, and thus do not
+ * represent them). For partitioned tables data there's no data in the
+ * non-leaf relations, so we build stats only for the inheritance tree.
+ * So for partitioned tables we do consider extended stats.
+ */
+ if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE)
+ return 1.0;
+
+ /* check if there's any stats that might be useful for us. */
+ if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
+ return 1.0;
+
+ list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
+ list_length(clauses));
+
+ /*
+ * We allocate space as if every clause was a unique expression, although
+ * that's probably overkill. Some will be simple column references that
+ * we'll translate to attnums, and there might be duplicates. But it's
+ * easier and cheaper to just do one allocation than repalloc later.
+ */
+ unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
+ unique_exprs_cnt = 0;
+
+ /*
+ * Pre-process the clauses list to extract the attnums seen in each item.
+ * We need to determine if there's any clauses which will be useful for
+ * dependency selectivity estimations. Along the way we'll record all of
+ * the attnums for each clause in a list which we'll reference later so we
+ * don't need to repeat the same work again. We'll also keep track of all
+ * attnums seen.
+ *
+ * We also skip clauses that we already estimated using different types of
+ * statistics (we treat them as incompatible).
+ *
+ * To handle expressions, we assign them negative attnums, as if it was a
+ * system attribute (this is fine, as we only allow extended stats on user
+ * attributes). And then we offset everything by the number of
+ * expressions, so that we can store the values in a bitmapset.
+ */
+ listidx = 0;
+ foreach(l, clauses)
+ {
+ Node *clause = (Node *) lfirst(l);
+ AttrNumber attnum;
+ Node *expr = NULL;
+
+ /* ignore clause by default */
+ list_attnums[listidx] = InvalidAttrNumber;
+
+ if (!bms_is_member(listidx, *estimatedclauses))
+ {
+ /*
+ * If it's a simple column reference, just extract the attnum. If
+ * it's an expression, assign a negative attnum as if it was a
+ * system attribute.
+ */
+ if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+ {
+ list_attnums[listidx] = attnum;
+ }
+ else if (dependency_is_compatible_expression(clause, rel->relid,
+ rel->statlist,
+ &expr))
+ {
+ /* special attnum assigned to this expression */
+ attnum = InvalidAttrNumber;
+
+ Assert(expr != NULL);
+
+ /* If the expression is duplicate, use the same attnum. */
+ for (i = 0; i < unique_exprs_cnt; i++)
+ {
+ if (equal(unique_exprs[i], expr))
+ {
+ /* negative attribute number to expression */
+ attnum = -(i + 1);
+ break;
+ }
+ }
+
+ /* not found in the list, so add it */
+ if (attnum == InvalidAttrNumber)
+ {
+ unique_exprs[unique_exprs_cnt++] = expr;
+
+ /* after incrementing the value, to get -1, -2, ... */
+ attnum = (-unique_exprs_cnt);
+ }
+
+ /* remember which attnum was assigned to this clause */
+ list_attnums[listidx] = attnum;
+ }
+ }
+
+ listidx++;
+ }
+
+ Assert(listidx == list_length(clauses));
+
+ /*
+ * How much we need to offset the attnums? If there are no expressions,
+ * then no offset is needed. Otherwise we need to offset enough for the
+ * lowest value (-unique_exprs_cnt) to become 1.
+ */
+ if (unique_exprs_cnt > 0)
+ attnum_offset = (unique_exprs_cnt + 1);
+ else
+ attnum_offset = 0;
+
+ /*
+ * Now that we know how many expressions there are, we can offset the
+ * values just enough to build the bitmapset.
+ */
+ for (i = 0; i < list_length(clauses); i++)
+ {
+ AttrNumber attnum;
+
+ /* ignore incompatible or already estimated clauses */
+ if (list_attnums[i] == InvalidAttrNumber)
+ continue;
+
+ /* make sure the attnum is in the expected range */
+ Assert(list_attnums[i] >= (-unique_exprs_cnt));
+ Assert(list_attnums[i] <= MaxHeapAttributeNumber);
+
+ /* make sure the attnum is positive (valid AttrNumber) */
+ attnum = list_attnums[i] + attnum_offset;
+
+ /*
+ * Either it's a regular attribute, or it's an expression, in which
+ * case we must not have seen it before (expressions are unique).
+ *
+ * XXX Check whether it's a regular attribute has to be done using the
+ * original attnum, while the second check has to use the value with
+ * an offset.
+ */
+ Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
+ !bms_is_member(attnum, clauses_attnums));
+
+ /*
+ * Remember the offset attnum, both for attributes and expressions.
+ * We'll pass list_attnums to clauselist_apply_dependencies, which
+ * uses it to identify clauses in a bitmap. We could also pass the
+ * offset, but this is more convenient.
+ */
+ list_attnums[i] = attnum;
+
+ clauses_attnums = bms_add_member(clauses_attnums, attnum);
+ }
+
+ /*
+ * If there's not at least two distinct attnums and expressions, then
+ * reject the whole list of clauses. We must return 1.0 so the calling
+ * function's selectivity is unaffected.
+ */
+ if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
+ {
+ bms_free(clauses_attnums);
+ pfree(list_attnums);
+ return 1.0;
+ }
+
+ /*
+ * Load all functional dependencies matching at least two parameters. We
+ * can simply consider all dependencies at once, without having to search
+ * for the best statistics object.
+ *
+ * To not waste cycles and memory, we deserialize dependencies only for
+ * statistics that match at least two attributes. The array is allocated
+ * with the assumption that all objects match - we could grow the array to
+ * make it just the right size, but it's likely wasteful anyway thanks to
+ * moving the freed chunks to freelists etc.
+ */
+ func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
+ list_length(rel->statlist));
+ nfunc_dependencies = 0;
+ total_ndeps = 0;
+
+ foreach(l, rel->statlist)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
+ int nmatched;
+ int nexprs;
+ int k;
+ MVDependencies *deps;
+
+ /* skip statistics that are not of the correct type */
+ if (stat->kind != STATS_EXT_DEPENDENCIES)
+ continue;
+
+ /*
+ * Count matching attributes - we have to undo the attnum offsets. The
+ * input attribute numbers are not offset (expressions are not
+ * included in stat->keys, so it's not necessary). But we need to
+ * offset it before checking against clauses_attnums.
+ */
+ nmatched = 0;
+ k = -1;
+ while ((k = bms_next_member(stat->keys, k)) >= 0)
+ {
+ AttrNumber attnum = (AttrNumber) k;
+
+ /* skip expressions */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ continue;
+
+ /* apply the same offset as above */
+ attnum += attnum_offset;
+
+ if (bms_is_member(attnum, clauses_attnums))
+ nmatched++;
+ }
+
+ /* count matching expressions */
+ nexprs = 0;
+ for (i = 0; i < unique_exprs_cnt; i++)
+ {
+ ListCell *lc;
+
+ foreach(lc, stat->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ /* try to match it */
+ if (equal(stat_expr, unique_exprs[i]))
+ nexprs++;
+ }
+ }
+
+ /*
+ * Skip objects matching fewer than two attributes/expressions from
+ * clauses.
+ */
+ if (nmatched + nexprs < 2)
+ continue;
+
+ deps = statext_dependencies_load(stat->statOid);
+
+ /*
+ * The expressions may be represented by different attnums in the
+ * stats, we need to remap them to be consistent with the clauses.
+ * That will make the later steps (e.g. picking the strongest item and
+ * so on) much simpler and cheaper, because it won't need to care
+ * about the offset at all.
+ *
+ * When we're at it, we can ignore dependencies that are not fully
+ * matched by clauses (i.e. referencing attributes or expressions that
+ * are not in the clauses).
+ *
+ * We have to do this for all statistics, as long as there are any
+ * expressions - we need to shift the attnums in all dependencies.
+ *
+ * XXX Maybe we should do this always, because it also eliminates some
+ * of the dependencies early. It might be cheaper than having to walk
+ * the longer list in find_strongest_dependency later, especially as
+ * we need to do that repeatedly?
+ *
+ * XXX We have to do this even when there are no expressions in
+ * clauses, otherwise find_strongest_dependency may fail for stats
+ * with expressions (due to lookup of negative value in bitmap). So we
+ * need to at least filter out those dependencies. Maybe we could do
+ * it in a cheaper way (if there are no expr clauses, we can just
+ * discard all negative attnums without any lookups).
+ */
+ if (unique_exprs_cnt > 0 || stat->exprs != NIL)
+ {
+ int ndeps = 0;
+
+ for (i = 0; i < deps->ndeps; i++)
+ {
+ bool skip = false;
+ MVDependency *dep = deps->deps[i];
+ int j;
+
+ for (j = 0; j < dep->nattributes; j++)
+ {
+ int idx;
+ Node *expr;
+ int k;
+ AttrNumber unique_attnum = InvalidAttrNumber;
+ AttrNumber attnum;
+
+ /* undo the per-statistics offset */
+ attnum = dep->attributes[j];
+
+ /*
+ * For regular attributes we can simply check if it
+ * matches any clause. If there's no matching clause, we
+ * can just ignore it. We need to offset the attnum
+ * though.
+ */
+ if (AttrNumberIsForUserDefinedAttr(attnum))
+ {
+ dep->attributes[j] = attnum + attnum_offset;
+
+ if (!bms_is_member(dep->attributes[j], clauses_attnums))
+ {
+ skip = true;
+ break;
+ }
+
+ continue;
+ }
+
+ /*
+ * the attnum should be a valid system attnum (-1, -2,
+ * ...)
+ */
+ Assert(AttributeNumberIsValid(attnum));
+
+ /*
+ * For expressions, we need to do two translations. First
+ * we have to translate the negative attnum to index in
+ * the list of expressions (in the statistics object).
+ * Then we need to see if there's a matching clause. The
+ * index of the unique expression determines the attnum
+ * (and we offset it).
+ */
+ idx = -(1 + attnum);
+
+ /* Is the expression index is valid? */
+ Assert((idx >= 0) && (idx < list_length(stat->exprs)));
+
+ expr = (Node *) list_nth(stat->exprs, idx);
+
+ /* try to find the expression in the unique list */
+ for (k = 0; k < unique_exprs_cnt; k++)
+ {
+ /*
+ * found a matching unique expression, use the attnum
+ * (derived from index of the unique expression)
+ */
+ if (equal(unique_exprs[k], expr))
+ {
+ unique_attnum = -(k + 1) + attnum_offset;
+ break;
+ }
+ }
+
+ /*
+ * Found no matching expression, so we can simply skip
+ * this dependency, because there's no chance it will be
+ * fully covered.
+ */
+ if (unique_attnum == InvalidAttrNumber)
+ {
+ skip = true;
+ break;
+ }
+
+ /* otherwise remap it to the new attnum */
+ dep->attributes[j] = unique_attnum;
+ }
+
+ /* if found a matching dependency, keep it */
+ if (!skip)
+ {
+ /* maybe we've skipped something earlier, so move it */
+ if (ndeps != i)
+ deps->deps[ndeps] = deps->deps[i];
+
+ ndeps++;
+ }
+ }
+
+ deps->ndeps = ndeps;
+ }
+
+ /*
+ * It's possible we've removed all dependencies, in which case we
+ * don't bother adding it to the list.
+ */
+ if (deps->ndeps > 0)
+ {
+ func_dependencies[nfunc_dependencies] = deps;
+ total_ndeps += deps->ndeps;
+ nfunc_dependencies++;
+ }
+ }
+
+ /* if no matching stats could be found then we've nothing to do */
+ if (nfunc_dependencies == 0)
+ {
+ pfree(func_dependencies);
+ bms_free(clauses_attnums);
+ pfree(list_attnums);
+ pfree(unique_exprs);
+ return 1.0;
+ }
+
+ /*
+ * Work out which dependencies we can apply, starting with the
+ * widest/strongest ones, and proceeding to smaller/weaker ones.
+ */
+ dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
+ total_ndeps);
+ ndependencies = 0;
+
+ while (true)
+ {
+ MVDependency *dependency;
+ AttrNumber attnum;
+
+ /* the widest/strongest dependency, fully matched by clauses */
+ dependency = find_strongest_dependency(func_dependencies,
+ nfunc_dependencies,
+ clauses_attnums);
+ if (!dependency)
+ break;
+
+ dependencies[ndependencies++] = dependency;
+
+ /* Ignore dependencies using this implied attribute in later loops */
+ attnum = dependency->attributes[dependency->nattributes - 1];
+ clauses_attnums = bms_del_member(clauses_attnums, attnum);
+ }
+
+ /*
+ * If we found applicable dependencies, use them to estimate all
+ * compatible clauses on attributes that they refer to.
+ */
+ if (ndependencies != 0)
+ s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
+ sjinfo, dependencies, ndependencies,
+ list_attnums, estimatedclauses);
+
+ /* free deserialized functional dependencies (and then the array) */
+ for (i = 0; i < nfunc_dependencies; i++)
+ pfree(func_dependencies[i]);
+
+ pfree(dependencies);
+ pfree(func_dependencies);
+ bms_free(clauses_attnums);
+ pfree(list_attnums);
+ pfree(unique_exprs);
+
+ return s1;
+}