Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 7947
1 file changed, 7947 insertions, 0 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c new file mode 100644 index 0000000..962dec6 --- /dev/null +++ b/src/backend/utils/adt/selfuncs.c @@ -0,0 +1,7947 @@ +/*------------------------------------------------------------------------- + * + * selfuncs.c + * Selectivity functions and index cost estimation functions for + * standard operators and index access methods. + * + * Selectivity routines are registered in the pg_operator catalog + * in the "oprrest" and "oprjoin" attributes. + * + * Index cost functions are located via the index AM's API struct, + * which is obtained from the handler function registered in pg_am. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/selfuncs.c + * + *------------------------------------------------------------------------- + */ + +/*---------- + * Operator selectivity estimation functions are called to estimate the + * selectivity of WHERE clauses whose top-level operator is their operator. + * We divide the problem into two cases: + * Restriction clause estimation: the clause involves vars of just + * one relation. + * Join clause estimation: the clause involves vars of multiple rels. + * Join selectivity estimation is far more difficult and usually less accurate + * than restriction estimation. + * + * When dealing with the inner scan of a nestloop join, we consider the + * join's joinclauses as restriction clauses for the inner relation, and + * treat vars of the outer relation as parameters (a/k/a constants of unknown + * values). So, restriction estimators need to be able to accept an argument + * telling which relation is to be treated as the variable. + * + * The call convention for a restriction estimator (oprrest function) is + * + * Selectivity oprrest (PlannerInfo *root, + * Oid operator, + * List *args, + * int varRelid); + * + * root: general information about the query (rtable and RelOptInfo lists + * are particularly important for the estimator). + * operator: OID of the specific operator in question. + * args: argument list from the operator clause. + * varRelid: if not zero, the relid (rtable index) of the relation to + * be treated as the variable relation. May be zero if the args list + * is known to contain vars of only one relation. + * + * This is represented at the SQL level (in pg_proc) as + * + * float8 oprrest (internal, oid, internal, int4); + * + * The result is a selectivity, that is, a fraction (0 to 1) of the rows + * of the relation that are expected to produce a TRUE result for the + * given operator. + * + * The call convention for a join estimator (oprjoin function) is similar + * except that varRelid is not needed, and instead join information is + * supplied: + * + * Selectivity oprjoin (PlannerInfo *root, + * Oid operator, + * List *args, + * JoinType jointype, + * SpecialJoinInfo *sjinfo); + * + * float8 oprjoin (internal, oid, internal, int2, internal); + * + * (Before Postgres 8.4, join estimators had only the first four of these + * parameters. That signature is still allowed, but deprecated.) The + * relationship between jointype and sjinfo is explained in the comments for + * clause_selectivity() --- the short version is that jointype is usually + * best ignored in favor of examining sjinfo. 
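 *
 * For illustration only (not part of this file): a C-level restriction
 * estimator following the convention above unpacks its arguments and
 * returns a float8 fraction, much as eqsel_internal() below does. The
 * function name "myoprrest" is hypothetical:
 *
 *		Datum
 *		myoprrest(PG_FUNCTION_ARGS)
 *		{
 *			PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 *			Oid			operator = PG_GETARG_OID(1);
 *			List	   *args = (List *) PG_GETARG_POINTER(2);
 *			int			varRelid = PG_GETARG_INT32(3);
 *
 *			PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 *		}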
+ * + * Join selectivity for regular inner and outer joins is defined as the + * fraction (0 to 1) of the cross product of the relations that is expected + * to produce a TRUE result for the given operator. For both semi and anti + * joins, however, the selectivity is defined as the fraction of the left-hand + * side relation's rows that are expected to have a match (ie, at least one + * row with a TRUE result) in the right-hand side. + * + * For both oprrest and oprjoin functions, the operator's input collation OID + * (if any) is passed using the standard fmgr mechanism, so that the estimator + * function can fetch it with PG_GET_COLLATION(). Note, however, that all + * statistics in pg_statistic are currently built using the relevant column's + * collation. + *---------- + */ + +#include "postgres.h" + +#include <ctype.h> +#include <math.h> + +#include "access/brin.h" +#include "access/brin_page.h" +#include "access/gin.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/visibilitymap.h" +#include "catalog/pg_am.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_statistic_ext.h" +#include "executor/nodeAgg.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/cost.h" +#include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "optimizer/plancat.h" +#include "parser/parse_clause.h" +#include "parser/parsetree.h" +#include "statistics/statistics.h" +#include "storage/bufmgr.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/date.h" +#include "utils/datum.h" +#include "utils/fmgroids.h" +#include "utils/index_selfuncs.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_locale.h" +#include "utils/rel.h" +#include "utils/selfuncs.h" +#include "utils/snapmgr.h" +#include "utils/spccache.h" +#include "utils/syscache.h" +#include "utils/timestamp.h" +#include "utils/typcache.h" + + +/* Hooks for plugins to get control when we ask for stats */ +get_relation_stats_hook_type get_relation_stats_hook = NULL; +get_index_stats_hook_type get_index_stats_hook = NULL; + +static double eqsel_internal(PG_FUNCTION_ARGS, bool negate); +static double eqjoinsel_inner(Oid opfuncoid, Oid collation, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2); +static double eqjoinsel_semi(Oid opfuncoid, Oid collation, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2, + RelOptInfo *inner_rel); +static bool estimate_multivariate_ndistinct(PlannerInfo *root, + RelOptInfo *rel, List **varinfos, double *ndistinct); +static bool convert_to_scalar(Datum value, Oid valuetypid, Oid collid, + double *scaledvalue, + Datum lobound, Datum hibound, Oid boundstypid, + double *scaledlobound, double *scaledhibound); +static double convert_numeric_to_scalar(Datum value, Oid typid, bool *failure); +static void convert_string_to_scalar(char *value, + double *scaledvalue, + char *lobound, + double *scaledlobound, + char *hibound, + double *scaledhibound); 
+static void convert_bytea_to_scalar(Datum value, + double *scaledvalue, + Datum lobound, + double *scaledlobound, + Datum hibound, + double *scaledhibound); +static double convert_one_string_to_scalar(char *value, + int rangelo, int rangehi); +static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen, + int rangelo, int rangehi); +static char *convert_string_datum(Datum value, Oid typid, Oid collid, + bool *failure); +static double convert_timevalue_to_scalar(Datum value, Oid typid, + bool *failure); +static void examine_simple_variable(PlannerInfo *root, Var *var, + VariableStatData *vardata); +static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata, + Oid sortop, Oid collation, + Datum *min, Datum *max); +static void get_stats_slot_range(AttStatsSlot *sslot, + Oid opfuncoid, FmgrInfo *opproc, + Oid collation, int16 typLen, bool typByVal, + Datum *min, Datum *max, bool *p_have_data); +static bool get_actual_variable_range(PlannerInfo *root, + VariableStatData *vardata, + Oid sortop, Oid collation, + Datum *min, Datum *max); +static bool get_actual_variable_endpoint(Relation heapRel, + Relation indexRel, + ScanDirection indexscandir, + ScanKey scankeys, + int16 typLen, + bool typByVal, + TupleTableSlot *tableslot, + MemoryContext outercontext, + Datum *endpointDatum); +static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids); + + +/* + * eqsel - Selectivity of "=" for any data types. + * + * Note: this routine is also used to estimate selectivity for some + * operators that are not "=" but have comparable selectivity behavior, + * such as "~=" (geometric approximate-match). Even for "=", we must + * keep in mind that the left and right datatypes may differ. + */ +Datum +eqsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8((float8) eqsel_internal(fcinfo, false)); +} + +/* + * Common code for eqsel() and neqsel() + */ +static double +eqsel_internal(PG_FUNCTION_ARGS, bool negate) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + Oid collation = PG_GET_COLLATION(); + VariableStatData vardata; + Node *other; + bool varonleft; + double selec; + + /* + * When asked about <>, we do the estimation using the corresponding = + * operator, then convert to <> via "1.0 - eq_selectivity - nullfrac". + */ + if (negate) + { + operator = get_negator(operator); + if (!OidIsValid(operator)) + { + /* Use default selectivity (should we raise an error instead?) */ + return 1.0 - DEFAULT_EQ_SEL; + } + } + + /* + * If expression is not variable = something or something = variable, then + * punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + return negate ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL; + + /* + * We can do a lot better if the something is a constant. (Note: the + * Const might result from estimation rather than being a simple constant + * in the query.) + */ + if (IsA(other, Const)) + selec = var_eq_const(&vardata, operator, collation, + ((Const *) other)->constvalue, + ((Const *) other)->constisnull, + varonleft, negate); + else + selec = var_eq_non_const(&vardata, operator, collation, other, + varonleft, negate); + + ReleaseVariableStats(vardata); + + return selec; +} + +/* + * var_eq_const --- eqsel for var = const case + * + * This is exported so that some other estimation functions can use it. 
+ */ +double +var_eq_const(VariableStatData *vardata, Oid operator, Oid collation, + Datum constval, bool constisnull, + bool varonleft, bool negate) +{ + double selec; + double nullfrac = 0.0; + bool isdefault; + Oid opfuncoid; + + /* + * If the constant is NULL, assume operator is strict and return zero, ie, + * operator will never return TRUE. (It's zero even for a negator op.) + */ + if (constisnull) + return 0.0; + + /* + * Grab the nullfrac for use below. Note we allow use of nullfrac + * regardless of security check. + */ + if (HeapTupleIsValid(vardata->statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + nullfrac = stats->stanullfrac; + } + + /* + * If we matched the var to a unique index or DISTINCT clause, assume + * there is exactly one match regardless of anything else. (This is + * slightly bogus, since the index or clause's equality operator might be + * different from ours, but it's much more likely to be right than + * ignoring the information.) + */ + if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) + { + selec = 1.0 / vardata->rel->tuples; + } + else if (HeapTupleIsValid(vardata->statsTuple) && + statistic_proc_security_check(vardata, + (opfuncoid = get_opcode(operator)))) + { + AttStatsSlot sslot; + bool match = false; + int i; + + /* + * Is the constant "=" to any of the column's most common values? + * (Although the given operator may not really be "=", we will assume + * that seeing whether it returns TRUE is an appropriate test. If you + * don't like this, maybe you shouldn't be using eqsel for your + * operator...) + */ + if (get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)) + { + LOCAL_FCINFO(fcinfo, 2); + FmgrInfo eqproc; + + fmgr_info(opfuncoid, &eqproc); + + /* + * Save a few cycles by setting up the fcinfo struct just once. + * Using FunctionCallInvoke directly also avoids failure if the + * eqproc returns NULL, though really equality functions should + * never do that. + */ + InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + /* be careful to apply operator right way 'round */ + if (varonleft) + fcinfo->args[1].value = constval; + else + fcinfo->args[0].value = constval; + + for (i = 0; i < sslot.nvalues; i++) + { + Datum fresult; + + if (varonleft) + fcinfo->args[0].value = sslot.values[i]; + else + fcinfo->args[1].value = sslot.values[i]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + { + match = true; + break; + } + } + } + else + { + /* no most-common-value info available */ + i = 0; /* keep compiler quiet */ + } + + if (match) + { + /* + * Constant is "=" to this common value. We know selectivity + * exactly (or as exactly as ANALYZE could calculate it, anyway). + */ + selec = sslot.numbers[i]; + } + else + { + /* + * Comparison is against a constant that is neither NULL nor any + * of the common values. Its selectivity cannot be more than + * this: + */ + double sumcommon = 0.0; + double otherdistinct; + + for (i = 0; i < sslot.nnumbers; i++) + sumcommon += sslot.numbers[i]; + selec = 1.0 - sumcommon - nullfrac; + CLAMP_PROBABILITY(selec); + + /* + * and in fact it's probably a good deal less. We approximate that + * all the not-common values share this remaining fraction + * equally, so we divide by the number of other distinct values. 
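 *
 * Illustrative example (numbers invented): with sumcommon = 0.60,
 * nullfrac = 0.05, and 100 estimated distinct values of which 10 are
 * MCVs, the remaining fraction is 1.0 - 0.60 - 0.05 = 0.35, spread
 * over 100 - 10 = 90 values, so selec = 0.35 / 90, about 0.0039.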
+ */ + otherdistinct = get_variable_numdistinct(vardata, &isdefault) - + sslot.nnumbers; + if (otherdistinct > 1) + selec /= otherdistinct; + + /* + * Another cross-check: selectivity shouldn't be estimated as more + * than the least common "most common value". + */ + if (sslot.nnumbers > 0 && selec > sslot.numbers[sslot.nnumbers - 1]) + selec = sslot.numbers[sslot.nnumbers - 1]; + } + + free_attstatsslot(&sslot); + } + else + { + /* + * No ANALYZE stats available, so make a guess using estimated number + * of distinct values and assuming they are equally common. (The guess + * is unlikely to be very good, but we do know a few special cases.) + */ + selec = 1.0 / get_variable_numdistinct(vardata, &isdefault); + } + + /* now adjust if we wanted <> rather than = */ + if (negate) + selec = 1.0 - selec - nullfrac; + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * var_eq_non_const --- eqsel for var = something-other-than-const case + * + * This is exported so that some other estimation functions can use it. + */ +double +var_eq_non_const(VariableStatData *vardata, Oid operator, Oid collation, + Node *other, + bool varonleft, bool negate) +{ + double selec; + double nullfrac = 0.0; + bool isdefault; + + /* + * Grab the nullfrac for use below. + */ + if (HeapTupleIsValid(vardata->statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + nullfrac = stats->stanullfrac; + } + + /* + * If we matched the var to a unique index or DISTINCT clause, assume + * there is exactly one match regardless of anything else. (This is + * slightly bogus, since the index or clause's equality operator might be + * different from ours, but it's much more likely to be right than + * ignoring the information.) + */ + if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) + { + selec = 1.0 / vardata->rel->tuples; + } + else if (HeapTupleIsValid(vardata->statsTuple)) + { + double ndistinct; + AttStatsSlot sslot; + + /* + * Search is for a value that we do not know a priori, but we will + * assume it is not NULL. Estimate the selectivity as non-null + * fraction divided by number of distinct values, so that we get a + * result averaged over all possible values whether common or + * uncommon. (Essentially, we are assuming that the not-yet-known + * comparison value is equally likely to be any of the possible + * values, regardless of their frequency in the table. Is that a good + * idea?) + */ + selec = 1.0 - nullfrac; + ndistinct = get_variable_numdistinct(vardata, &isdefault); + if (ndistinct > 1) + selec /= ndistinct; + + /* + * Cross-check: selectivity should never be estimated as more than the + * most common value's. + */ + if (get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_NUMBERS)) + { + if (sslot.nnumbers > 0 && selec > sslot.numbers[0]) + selec = sslot.numbers[0]; + free_attstatsslot(&sslot); + } + } + else + { + /* + * No ANALYZE stats available, so make a guess using estimated number + * of distinct values and assuming they are equally common. (The guess + * is unlikely to be very good, but we do know a few special cases.) + */ + selec = 1.0 / get_variable_numdistinct(vardata, &isdefault); + } + + /* now adjust if we wanted <> rather than = */ + if (negate) + selec = 1.0 - selec - nullfrac; + + /* result should be in range, but make sure... 
*/ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * neqsel - Selectivity of "!=" for any data types. + * + * This routine is also used for some operators that are not "!=" + * but have comparable selectivity behavior. See above comments + * for eqsel(). + */ +Datum +neqsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8((float8) eqsel_internal(fcinfo, true)); +} + +/* + * scalarineqsel - Selectivity of "<", "<=", ">", ">=" for scalars. + * + * This is the guts of scalarltsel/scalarlesel/scalargtsel/scalargesel. + * The isgt and iseq flags distinguish which of the four cases apply. + * + * The caller has commuted the clause, if necessary, so that we can treat + * the variable as being on the left. The caller must also make sure that + * the other side of the clause is a non-null Const, and dissect that into + * a value and datatype. (This definition simplifies some callers that + * want to estimate against a computed value instead of a Const node.) + * + * This routine works for any datatype (or pair of datatypes) known to + * convert_to_scalar(). If it is applied to some other datatype, + * it will return an approximate estimate based on assuming that the constant + * value falls in the middle of the bin identified by binary search. + */ +static double +scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, bool iseq, + Oid collation, + VariableStatData *vardata, Datum constval, Oid consttype) +{ + Form_pg_statistic stats; + FmgrInfo opproc; + double mcv_selec, + hist_selec, + sumcommon; + double selec; + + if (!HeapTupleIsValid(vardata->statsTuple)) + { + /* + * No stats are available. Typically this means we have to fall back + * on the default estimate; but if the variable is CTID then we can + * make an estimate based on comparing the constant to the table size. + */ + if (vardata->var && IsA(vardata->var, Var) && + ((Var *) vardata->var)->varattno == SelfItemPointerAttributeNumber) + { + ItemPointer itemptr; + double block; + double density; + + /* + * If the relation's empty, we're going to include all of it. + * (This is mostly to avoid divide-by-zero below.) + */ + if (vardata->rel->pages == 0) + return 1.0; + + itemptr = (ItemPointer) DatumGetPointer(constval); + block = ItemPointerGetBlockNumberNoCheck(itemptr); + + /* + * Determine the average number of tuples per page (density). + * + * Since the last page will, on average, be only half full, we can + * estimate it to have half as many tuples as earlier pages. So + * give it half the weight of a regular page. + */ + density = vardata->rel->tuples / (vardata->rel->pages - 0.5); + + /* If target is the last page, use half the density. */ + if (block >= vardata->rel->pages - 1) + density *= 0.5; + + /* + * Using the average tuples per page, calculate how far into the + * page the itemptr is likely to be and adjust block accordingly, + * by adding that fraction of a whole block (but never more than a + * whole block, no matter how high the itemptr's offset is). Here + * we are ignoring the possibility of dead-tuple line pointers, + * which is fairly bogus, but we lack the info to do better. + */ + if (density > 0.0) + { + OffsetNumber offset = ItemPointerGetOffsetNumberNoCheck(itemptr); + + block += Min(offset / density, 1.0); + } + + /* + * Convert relative block number to selectivity. Again, the last + * page has only half weight. + */ + selec = block / (vardata->rel->pages - 0.5); + + /* + * The calculation so far gave us a selectivity for the "<=" case. 
+ * We'll have one fewer tuple for "<" and one additional tuple for + * ">=", the latter of which we'll reverse the selectivity for + * below, so we can simply subtract one tuple for both cases. The + * cases that need this adjustment can be identified by iseq being + * equal to isgt. + */ + if (iseq == isgt && vardata->rel->tuples >= 1.0) + selec -= (1.0 / vardata->rel->tuples); + + /* Finally, reverse the selectivity for the ">", ">=" cases. */ + if (isgt) + selec = 1.0 - selec; + + CLAMP_PROBABILITY(selec); + return selec; + } + + /* no stats available, so default result */ + return DEFAULT_INEQ_SEL; + } + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + + fmgr_info(get_opcode(operator), &opproc); + + /* + * If we have most-common-values info, add up the fractions of the MCV + * entries that satisfy MCV OP CONST. These fractions contribute directly + * to the result selectivity. Also add up the total fraction represented + * by MCV entries. + */ + mcv_selec = mcv_selectivity(vardata, &opproc, collation, constval, true, + &sumcommon); + + /* + * If there is a histogram, determine which bin the constant falls in, and + * compute the resulting contribution to selectivity. + */ + hist_selec = ineq_histogram_selectivity(root, vardata, + operator, &opproc, isgt, iseq, + collation, + constval, consttype); + + /* + * Now merge the results from the MCV and histogram calculations, + * realizing that the histogram covers only the non-null values that are + * not listed in MCV. + */ + selec = 1.0 - stats->stanullfrac - sumcommon; + + if (hist_selec >= 0.0) + selec *= hist_selec; + else + { + /* + * If no histogram but there are values not accounted for by MCV, + * arbitrarily assume half of them will match. + */ + selec *= 0.5; + } + + selec += mcv_selec; + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * mcv_selectivity - Examine the MCV list for selectivity estimates + * + * Determine the fraction of the variable's MCV population that satisfies + * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft. Also + * compute the fraction of the total column population represented by the MCV + * list. This code will work for any boolean-returning predicate operator. + * + * The function result is the MCV selectivity, and the fraction of the + * total population is returned into *sumcommonp. Zeroes are returned + * if there is no MCV list. + */ +double +mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Oid collation, + Datum constval, bool varonleft, + double *sumcommonp) +{ + double mcv_selec, + sumcommon; + AttStatsSlot sslot; + int i; + + mcv_selec = 0.0; + sumcommon = 0.0; + + if (HeapTupleIsValid(vardata->statsTuple) && + statistic_proc_security_check(vardata, opproc->fn_oid) && + get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)) + { + LOCAL_FCINFO(fcinfo, 2); + + /* + * We invoke the opproc "by hand" so that we won't fail on NULL + * results. Such cases won't arise for normal comparison functions, + * but generic_restriction_selectivity could perhaps be used with + * operators that can return NULL. A small side benefit is to not + * need to re-initialize the fcinfo struct from scratch each time. 
+ */ + InitFunctionCallInfoData(*fcinfo, opproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + /* be careful to apply operator right way 'round */ + if (varonleft) + fcinfo->args[1].value = constval; + else + fcinfo->args[0].value = constval; + + for (i = 0; i < sslot.nvalues; i++) + { + Datum fresult; + + if (varonleft) + fcinfo->args[0].value = sslot.values[i]; + else + fcinfo->args[1].value = sslot.values[i]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + mcv_selec += sslot.numbers[i]; + sumcommon += sslot.numbers[i]; + } + free_attstatsslot(&sslot); + } + + *sumcommonp = sumcommon; + return mcv_selec; +} + +/* + * histogram_selectivity - Examine the histogram for selectivity estimates + * + * Determine the fraction of the variable's histogram entries that satisfy + * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft. + * + * This code will work for any boolean-returning predicate operator, whether + * or not it has anything to do with the histogram sort operator. We are + * essentially using the histogram just as a representative sample. However, + * small histograms are unlikely to be all that representative, so the caller + * should be prepared to fall back on some other estimation approach when the + * histogram is missing or very small. It may also be prudent to combine this + * approach with another one when the histogram is small. + * + * If the actual histogram size is not at least min_hist_size, we won't bother + * to do the calculation at all. Also, if the n_skip parameter is > 0, we + * ignore the first and last n_skip histogram elements, on the grounds that + * they are outliers and hence not very representative. Typical values for + * these parameters are 10 and 1. + * + * The function result is the selectivity, or -1 if there is no histogram + * or it's smaller than min_hist_size. + * + * The output parameter *hist_size receives the actual histogram size, + * or zero if no histogram. Callers may use this number to decide how + * much faith to put in the function result. + * + * Note that the result disregards both the most-common-values (if any) and + * null entries. The caller is expected to combine this result with + * statistics for those portions of the column population. It may also be + * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs. + */ +double +histogram_selectivity(VariableStatData *vardata, + FmgrInfo *opproc, Oid collation, + Datum constval, bool varonleft, + int min_hist_size, int n_skip, + int *hist_size) +{ + double result; + AttStatsSlot sslot; + + /* check sanity of parameters */ + Assert(n_skip >= 0); + Assert(min_hist_size > 2 * n_skip); + + if (HeapTupleIsValid(vardata->statsTuple) && + statistic_proc_security_check(vardata, opproc->fn_oid) && + get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_HISTOGRAM, InvalidOid, + ATTSTATSSLOT_VALUES)) + { + *hist_size = sslot.nvalues; + if (sslot.nvalues >= min_hist_size) + { + LOCAL_FCINFO(fcinfo, 2); + int nmatch = 0; + int i; + + /* + * We invoke the opproc "by hand" so that we won't fail on NULL + * results. Such cases won't arise for normal comparison + * functions, but generic_restriction_selectivity could perhaps be + * used with operators that can return NULL. A small side benefit + * is to not need to re-initialize the fcinfo struct from scratch + * each time. 
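 *
 * Illustrative example (numbers invented): with a 100-entry histogram
 * and n_skip = 1, the loop below probes 98 entries; if 30 of them
 * satisfy the predicate, the result is 30 / 98, about 0.306.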
+ */ + InitFunctionCallInfoData(*fcinfo, opproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + /* be careful to apply operator right way 'round */ + if (varonleft) + fcinfo->args[1].value = constval; + else + fcinfo->args[0].value = constval; + + for (i = n_skip; i < sslot.nvalues - n_skip; i++) + { + Datum fresult; + + if (varonleft) + fcinfo->args[0].value = sslot.values[i]; + else + fcinfo->args[1].value = sslot.values[i]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + nmatch++; + } + result = ((double) nmatch) / ((double) (sslot.nvalues - 2 * n_skip)); + } + else + result = -1; + free_attstatsslot(&sslot); + } + else + { + *hist_size = 0; + result = -1; + } + + return result; +} + +/* + * generic_restriction_selectivity - Selectivity for almost anything + * + * This function estimates selectivity for operators that we don't have any + * special knowledge about, but are on data types that we collect standard + * MCV and/or histogram statistics for. (Additional assumptions are that + * the operator is strict and immutable, or at least stable.) + * + * If we have "VAR OP CONST" or "CONST OP VAR", selectivity is estimated by + * applying the operator to each element of the column's MCV and/or histogram + * stats, and merging the results using the assumption that the histogram is + * a reasonable random sample of the column's non-MCV population. Note that + * if the operator's semantics are related to the histogram ordering, this + * might not be such a great assumption; other functions such as + * scalarineqsel() are probably a better match in such cases. + * + * Otherwise, fall back to the default selectivity provided by the caller. + */ +double +generic_restriction_selectivity(PlannerInfo *root, Oid oproid, Oid collation, + List *args, int varRelid, + double default_selectivity) +{ + double selec; + VariableStatData vardata; + Node *other; + bool varonleft; + + /* + * If expression is not variable OP something or something OP variable, + * then punt and return the default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + return default_selectivity; + + /* + * If the something is a NULL constant, assume operator is strict and + * return zero, ie, operator will never return TRUE. + */ + if (IsA(other, Const) && + ((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); + return 0.0; + } + + if (IsA(other, Const)) + { + /* Variable is being compared to a known non-null constant */ + Datum constval = ((Const *) other)->constvalue; + FmgrInfo opproc; + double mcvsum; + double mcvsel; + double nullfrac; + int hist_size; + + fmgr_info(get_opcode(oproid), &opproc); + + /* + * Calculate the selectivity for the column's most common values. + */ + mcvsel = mcv_selectivity(&vardata, &opproc, collation, + constval, varonleft, + &mcvsum); + + /* + * If the histogram is large enough, see what fraction of it matches + * the query, and assume that's representative of the non-MCV + * population. Otherwise use the default selectivity for the non-MCV + * population. 
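 *
 * Illustrative example (numbers invented): with hist_size = 40, a
 * histogram estimate of 0.02, and default_selectivity = 0.005, the
 * blending below yields 0.4 * 0.02 + 0.6 * 0.005 = 0.011 for the
 * non-MCV population.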
+ */ + selec = histogram_selectivity(&vardata, &opproc, collation, + constval, varonleft, + 10, 1, &hist_size); + if (selec < 0) + { + /* Nope, fall back on default */ + selec = default_selectivity; + } + else if (hist_size < 100) + { + /* + * For histogram sizes from 10 to 100, we combine the histogram + * and default selectivities, putting increasingly more trust in + * the histogram for larger sizes. + */ + double hist_weight = hist_size / 100.0; + + selec = selec * hist_weight + + default_selectivity * (1.0 - hist_weight); + } + + /* In any case, don't believe extremely small or large estimates. */ + if (selec < 0.0001) + selec = 0.0001; + else if (selec > 0.9999) + selec = 0.9999; + + /* Don't forget to account for nulls. */ + if (HeapTupleIsValid(vardata.statsTuple)) + nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac; + else + nullfrac = 0.0; + + /* + * Now merge the results from the MCV and histogram calculations, + * realizing that the histogram covers only the non-null values that + * are not listed in MCV. + */ + selec *= 1.0 - nullfrac - mcvsum; + selec += mcvsel; + } + else + { + /* Comparison value is not constant, so we can't do anything */ + selec = default_selectivity; + } + + ReleaseVariableStats(vardata); + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * ineq_histogram_selectivity - Examine the histogram for scalarineqsel + * + * Determine the fraction of the variable's histogram population that + * satisfies the inequality condition, ie, VAR < (or <=, >, >=) CONST. + * The isgt and iseq flags distinguish which of the four cases apply. + * + * While opproc could be looked up from the operator OID, common callers + * also need to call it separately, so we make the caller pass both. + * + * Returns -1 if there is no histogram (valid results will always be >= 0). + * + * Note that the result disregards both the most-common-values (if any) and + * null entries. The caller is expected to combine this result with + * statistics for those portions of the column population. + * + * This is exported so that some other estimation functions can use it. + */ +double +ineq_histogram_selectivity(PlannerInfo *root, + VariableStatData *vardata, + Oid opoid, FmgrInfo *opproc, bool isgt, bool iseq, + Oid collation, + Datum constval, Oid consttype) +{ + double hist_selec; + AttStatsSlot sslot; + + hist_selec = -1.0; + + /* + * Someday, ANALYZE might store more than one histogram per rel/att, + * corresponding to more than one possible sort ordering defined for the + * column type. Right now, we know there is only one, so just grab it and + * see if it matches the query. + * + * Note that we can't use opoid as search argument; the staop appearing in + * pg_statistic will be for the relevant '<' operator, but what we have + * might be some other inequality operator such as '>='. (Even if opoid + * is a '<' operator, it could be cross-type.) Hence we must use + * comparison_ops_are_compatible() to see if the operators match. 
+ */ + if (HeapTupleIsValid(vardata->statsTuple) && + statistic_proc_security_check(vardata, opproc->fn_oid) && + get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_HISTOGRAM, InvalidOid, + ATTSTATSSLOT_VALUES)) + { + if (sslot.nvalues > 1 && + sslot.stacoll == collation && + comparison_ops_are_compatible(sslot.staop, opoid)) + { + /* + * Use binary search to find the desired location, namely the + * right end of the histogram bin containing the comparison value, + * which is the leftmost entry for which the comparison operator + * succeeds (if isgt) or fails (if !isgt). + * + * In this loop, we pay no attention to whether the operator iseq + * or not; that detail will be mopped up below. (We cannot tell, + * anyway, whether the operator thinks the values are equal.) + * + * If the binary search accesses the first or last histogram + * entry, we try to replace that endpoint with the true column min + * or max as found by get_actual_variable_range(). This + * ameliorates misestimates when the min or max is moving as a + * result of changes since the last ANALYZE. Note that this could + * result in effectively including MCVs into the histogram that + * weren't there before, but we don't try to correct for that. + */ + double histfrac; + int lobound = 0; /* first possible slot to search */ + int hibound = sslot.nvalues; /* last+1 slot to search */ + bool have_end = false; + + /* + * If there are only two histogram entries, we'll want up-to-date + * values for both. (If there are more than two, we need at most + * one of them to be updated, so we deal with that within the + * loop.) + */ + if (sslot.nvalues == 2) + have_end = get_actual_variable_range(root, + vardata, + sslot.staop, + collation, + &sslot.values[0], + &sslot.values[1]); + + while (lobound < hibound) + { + int probe = (lobound + hibound) / 2; + bool ltcmp; + + /* + * If we find ourselves about to compare to the first or last + * histogram entry, first try to replace it with the actual + * current min or max (unless we already did so above). + */ + if (probe == 0 && sslot.nvalues > 2) + have_end = get_actual_variable_range(root, + vardata, + sslot.staop, + collation, + &sslot.values[0], + NULL); + else if (probe == sslot.nvalues - 1 && sslot.nvalues > 2) + have_end = get_actual_variable_range(root, + vardata, + sslot.staop, + collation, + NULL, + &sslot.values[probe]); + + ltcmp = DatumGetBool(FunctionCall2Coll(opproc, + collation, + sslot.values[probe], + constval)); + if (isgt) + ltcmp = !ltcmp; + if (ltcmp) + lobound = probe + 1; + else + hibound = probe; + } + + if (lobound <= 0) + { + /* + * Constant is below lower histogram boundary. More + * precisely, we have found that no entry in the histogram + * satisfies the inequality clause (if !isgt) or they all do + * (if isgt). We estimate that that's true of the entire + * table, so set histfrac to 0.0 (which we'll flip to 1.0 + * below, if isgt). + */ + histfrac = 0.0; + } + else if (lobound >= sslot.nvalues) + { + /* + * Inverse case: constant is above upper histogram boundary. + */ + histfrac = 1.0; + } + else + { + /* We have values[i-1] <= constant <= values[i]. */ + int i = lobound; + double eq_selec = 0; + double val, + high, + low; + double binfrac; + + /* + * In the cases where we'll need it below, obtain an estimate + * of the selectivity of "x = constval". We use a calculation + * similar to what var_eq_const() does for a non-MCV constant, + * ie, estimate that all distinct non-MCV values occur equally + * often. 
But multiplication by "1.0 - sumcommon - nullfrac" + * will be done by our caller, so we shouldn't do that here. + * Therefore we can't try to clamp the estimate by reference + * to the least common MCV; the result would be too small. + * + * Note: since this is effectively assuming that constval + * isn't an MCV, it's logically dubious if constval in fact is + * one. But we have to apply *some* correction for equality, + * and anyway we cannot tell if constval is an MCV, since we + * don't have a suitable equality operator at hand. + */ + if (i == 1 || isgt == iseq) + { + double otherdistinct; + bool isdefault; + AttStatsSlot mcvslot; + + /* Get estimated number of distinct values */ + otherdistinct = get_variable_numdistinct(vardata, + &isdefault); + + /* Subtract off the number of known MCVs */ + if (get_attstatsslot(&mcvslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_NUMBERS)) + { + otherdistinct -= mcvslot.nnumbers; + free_attstatsslot(&mcvslot); + } + + /* If result doesn't seem sane, leave eq_selec at 0 */ + if (otherdistinct > 1) + eq_selec = 1.0 / otherdistinct; + } + + /* + * Convert the constant and the two nearest bin boundary + * values to a uniform comparison scale, and do a linear + * interpolation within this bin. + */ + if (convert_to_scalar(constval, consttype, collation, + &val, + sslot.values[i - 1], sslot.values[i], + vardata->vartype, + &low, &high)) + { + if (high <= low) + { + /* cope if bin boundaries appear identical */ + binfrac = 0.5; + } + else if (val <= low) + binfrac = 0.0; + else if (val >= high) + binfrac = 1.0; + else + { + binfrac = (val - low) / (high - low); + + /* + * Watch out for the possibility that we got a NaN or + * Infinity from the division. This can happen + * despite the previous checks, if for example "low" + * is -Infinity. + */ + if (isnan(binfrac) || + binfrac < 0.0 || binfrac > 1.0) + binfrac = 0.5; + } + } + else + { + /* + * Ideally we'd produce an error here, on the grounds that + * the given operator shouldn't have scalarXXsel + * registered as its selectivity func unless we can deal + * with its operand types. But currently, all manner of + * stuff is invoking scalarXXsel, so give a default + * estimate until that can be fixed. + */ + binfrac = 0.5; + } + + /* + * Now, compute the overall selectivity across the values + * represented by the histogram. We have i-1 full bins and + * binfrac partial bin below the constant. + */ + histfrac = (double) (i - 1) + binfrac; + histfrac /= (double) (sslot.nvalues - 1); + + /* + * At this point, histfrac is an estimate of the fraction of + * the population represented by the histogram that satisfies + * "x <= constval". Somewhat remarkably, this statement is + * true regardless of which operator we were doing the probes + * with, so long as convert_to_scalar() delivers reasonable + * results. If the probe constant is equal to some histogram + * entry, we would have considered the bin to the left of that + * entry if probing with "<" or ">=", or the bin to the right + * if probing with "<=" or ">"; but binfrac would have come + * out as 1.0 in the first case and 0.0 in the second, leading + * to the same histfrac in either case. For probe constants + * between histogram entries, we find the same bin and get the + * same estimate with any operator. 
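 *
 * Illustrative example (numbers invented): with 11 histogram entries
 * (10 bins), a constant falling in bin i = 4 with binfrac = 0.25 gives
 * histfrac = (3 + 0.25) / 10 = 0.325, i.e. about 32.5% of the
 * histogrammed population is estimated to satisfy "x <= constval".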
+ * + * The fact that the estimate corresponds to "x <= constval" + * and not "x < constval" is because of the way that ANALYZE + * constructs the histogram: each entry is, effectively, the + * rightmost value in its sample bucket. So selectivity + * values that are exact multiples of 1/(histogram_size-1) + * should be understood as estimates including a histogram + * entry plus everything to its left. + * + * However, that breaks down for the first histogram entry, + * which necessarily is the leftmost value in its sample + * bucket. That means the first histogram bin is slightly + * narrower than the rest, by an amount equal to eq_selec. + * Another way to say that is that we want "x <= leftmost" to + * be estimated as eq_selec not zero. So, if we're dealing + * with the first bin (i==1), rescale to make that true while + * adjusting the rest of that bin linearly. + */ + if (i == 1) + histfrac += eq_selec * (1.0 - binfrac); + + /* + * "x <= constval" is good if we want an estimate for "<=" or + * ">", but if we are estimating for "<" or ">=", we now need + * to decrease the estimate by eq_selec. + */ + if (isgt == iseq) + histfrac -= eq_selec; + } + + /* + * Now the estimate is finished for "<" and "<=" cases. If we are + * estimating for ">" or ">=", flip it. + */ + hist_selec = isgt ? (1.0 - histfrac) : histfrac; + + /* + * The histogram boundaries are only approximate to begin with, + * and may well be out of date anyway. Therefore, don't believe + * extremely small or large selectivity estimates --- unless we + * got actual current endpoint values from the table, in which + * case just do the usual sanity clamp. Somewhat arbitrarily, we + * set the cutoff for other cases at a hundredth of the histogram + * resolution. + */ + if (have_end) + CLAMP_PROBABILITY(hist_selec); + else + { + double cutoff = 0.01 / (double) (sslot.nvalues - 1); + + if (hist_selec < cutoff) + hist_selec = cutoff; + else if (hist_selec > 1.0 - cutoff) + hist_selec = 1.0 - cutoff; + } + } + else if (sslot.nvalues > 1) + { + /* + * If we get here, we have a histogram but it's not sorted the way + * we want. Do a brute-force search to see how many of the + * entries satisfy the comparison condition, and take that + * fraction as our estimate. (This is identical to the inner loop + * of histogram_selectivity; maybe share code?) + */ + LOCAL_FCINFO(fcinfo, 2); + int nmatch = 0; + + InitFunctionCallInfoData(*fcinfo, opproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + fcinfo->args[1].value = constval; + for (int i = 0; i < sslot.nvalues; i++) + { + Datum fresult; + + fcinfo->args[0].value = sslot.values[i]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + nmatch++; + } + hist_selec = ((double) nmatch) / ((double) sslot.nvalues); + + /* + * As above, clamp to a hundredth of the histogram resolution. + * This case is surely even less trustworthy than the normal one, + * so we shouldn't believe exact 0 or 1 selectivity. (Maybe the + * clamp should be more restrictive in this case?) + */ + { + double cutoff = 0.01 / (double) (sslot.nvalues - 1); + + if (hist_selec < cutoff) + hist_selec = cutoff; + else if (hist_selec > 1.0 - cutoff) + hist_selec = 1.0 - cutoff; + } + } + + free_attstatsslot(&sslot); + } + + return hist_selec; +} + +/* + * Common wrapper function for the selectivity estimators that simply + * invoke scalarineqsel(). 
+ */ +static Datum +scalarineqsel_wrapper(PG_FUNCTION_ARGS, bool isgt, bool iseq) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + Oid collation = PG_GET_COLLATION(); + VariableStatData vardata; + Node *other; + bool varonleft; + Datum constval; + Oid consttype; + double selec; + + /* + * If expression is not variable op something or something op variable, + * then punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); + } + + /* + * If the constant is NULL, assume operator is strict and return zero, ie, + * operator will never return TRUE. + */ + if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(0.0); + } + constval = ((Const *) other)->constvalue; + consttype = ((Const *) other)->consttype; + + /* + * Force the var to be on the left to simplify logic in scalarineqsel. + */ + if (!varonleft) + { + operator = get_commutator(operator); + if (!operator) + { + /* Use default selectivity (should we raise an error instead?) */ + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); + } + isgt = !isgt; + } + + /* The rest of the work is done by scalarineqsel(). */ + selec = scalarineqsel(root, operator, isgt, iseq, collation, + &vardata, constval, consttype); + + ReleaseVariableStats(vardata); + + PG_RETURN_FLOAT8((float8) selec); +} + +/* + * scalarltsel - Selectivity of "<" for scalars. + */ +Datum +scalarltsel(PG_FUNCTION_ARGS) +{ + return scalarineqsel_wrapper(fcinfo, false, false); +} + +/* + * scalarlesel - Selectivity of "<=" for scalars. + */ +Datum +scalarlesel(PG_FUNCTION_ARGS) +{ + return scalarineqsel_wrapper(fcinfo, false, true); +} + +/* + * scalargtsel - Selectivity of ">" for scalars. + */ +Datum +scalargtsel(PG_FUNCTION_ARGS) +{ + return scalarineqsel_wrapper(fcinfo, true, false); +} + +/* + * scalargesel - Selectivity of ">=" for scalars. + */ +Datum +scalargesel(PG_FUNCTION_ARGS) +{ + return scalarineqsel_wrapper(fcinfo, true, true); +} + +/* + * boolvarsel - Selectivity of Boolean variable. + * + * This can actually be called on any boolean-valued expression. If it + * involves only Vars of the specified relation, and if there are statistics + * about the Var or expression (the latter is possible if it's indexed) then + * we'll produce a real estimate; otherwise it's just a default. + */ +Selectivity +boolvarsel(PlannerInfo *root, Node *arg, int varRelid) +{ + VariableStatData vardata; + double selec; + + examine_variable(root, arg, varRelid, &vardata); + if (HeapTupleIsValid(vardata.statsTuple)) + { + /* + * A boolean variable V is equivalent to the clause V = 't', so we + * compute the selectivity as if that is what we have. + */ + selec = var_eq_const(&vardata, BooleanEqualOperator, InvalidOid, + BoolGetDatum(true), false, true, false); + } + else + { + /* Otherwise, the default estimate is 0.5 */ + selec = 0.5; + } + ReleaseVariableStats(vardata); + return selec; +} + +/* + * booltestsel - Selectivity of BooleanTest Node. 
+ */ +Selectivity +booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg, + int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo) +{ + VariableStatData vardata; + double selec; + + examine_variable(root, arg, varRelid, &vardata); + + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats; + double freq_null; + AttStatsSlot sslot; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + freq_null = stats->stanullfrac; + + if (get_attstatsslot(&sslot, vardata.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS) + && sslot.nnumbers > 0) + { + double freq_true; + double freq_false; + + /* + * Get first MCV frequency and derive frequency for true. + */ + if (DatumGetBool(sslot.values[0])) + freq_true = sslot.numbers[0]; + else + freq_true = 1.0 - sslot.numbers[0] - freq_null; + + /* + * Next derive frequency for false. Then use these as appropriate + * to derive frequency for each case. + */ + freq_false = 1.0 - freq_true - freq_null; + + switch (booltesttype) + { + case IS_UNKNOWN: + /* select only NULL values */ + selec = freq_null; + break; + case IS_NOT_UNKNOWN: + /* select non-NULL values */ + selec = 1.0 - freq_null; + break; + case IS_TRUE: + /* select only TRUE values */ + selec = freq_true; + break; + case IS_NOT_TRUE: + /* select non-TRUE values */ + selec = 1.0 - freq_true; + break; + case IS_FALSE: + /* select only FALSE values */ + selec = freq_false; + break; + case IS_NOT_FALSE: + /* select non-FALSE values */ + selec = 1.0 - freq_false; + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) booltesttype); + selec = 0.0; /* Keep compiler quiet */ + break; + } + + free_attstatsslot(&sslot); + } + else + { + /* + * No most-common-value info available. Still have null fraction + * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust + * for null fraction and assume a 50-50 split of TRUE and FALSE. + */ + switch (booltesttype) + { + case IS_UNKNOWN: + /* select only NULL values */ + selec = freq_null; + break; + case IS_NOT_UNKNOWN: + /* select non-NULL values */ + selec = 1.0 - freq_null; + break; + case IS_TRUE: + case IS_FALSE: + /* Assume we select half of the non-NULL values */ + selec = (1.0 - freq_null) / 2.0; + break; + case IS_NOT_TRUE: + case IS_NOT_FALSE: + /* Assume we select NULLs plus half of the non-NULLs */ + /* equiv. to freq_null + (1.0 - freq_null) / 2.0 */ + selec = (freq_null + 1.0) / 2.0; + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) booltesttype); + selec = 0.0; /* Keep compiler quiet */ + break; + } + } + } + else + { + /* + * If we can't get variable statistics for the argument, perhaps + * clause_selectivity can do something with it. We ignore the + * possibility of a NULL value when using clause_selectivity, and just + * assume the value is either TRUE or FALSE. + */ + switch (booltesttype) + { + case IS_UNKNOWN: + selec = DEFAULT_UNK_SEL; + break; + case IS_NOT_UNKNOWN: + selec = DEFAULT_NOT_UNK_SEL; + break; + case IS_TRUE: + case IS_NOT_FALSE: + selec = (double) clause_selectivity(root, arg, + varRelid, + jointype, sjinfo); + break; + case IS_FALSE: + case IS_NOT_TRUE: + selec = 1.0 - (double) clause_selectivity(root, arg, + varRelid, + jointype, sjinfo); + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) booltesttype); + selec = 0.0; /* Keep compiler quiet */ + break; + } + } + + ReleaseVariableStats(vardata); + + /* result should be in range, but make sure... 
*/ + CLAMP_PROBABILITY(selec); + + return (Selectivity) selec; +} + +/* + * nulltestsel - Selectivity of NullTest Node. + */ +Selectivity +nulltestsel(PlannerInfo *root, NullTestType nulltesttype, Node *arg, + int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo) +{ + VariableStatData vardata; + double selec; + + examine_variable(root, arg, varRelid, &vardata); + + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats; + double freq_null; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + freq_null = stats->stanullfrac; + + switch (nulltesttype) + { + case IS_NULL: + + /* + * Use freq_null directly. + */ + selec = freq_null; + break; + case IS_NOT_NULL: + + /* + * Select not unknown (not null) values. Calculate from + * freq_null. + */ + selec = 1.0 - freq_null; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) nulltesttype); + return (Selectivity) 0; /* keep compiler quiet */ + } + } + else if (vardata.var && IsA(vardata.var, Var) && + ((Var *) vardata.var)->varattno < 0) + { + /* + * There are no stats for system columns, but we know they are never + * NULL. + */ + selec = (nulltesttype == IS_NULL) ? 0.0 : 1.0; + } + else + { + /* + * No ANALYZE stats available, so make a guess + */ + switch (nulltesttype) + { + case IS_NULL: + selec = DEFAULT_UNK_SEL; + break; + case IS_NOT_NULL: + selec = DEFAULT_NOT_UNK_SEL; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) nulltesttype); + return (Selectivity) 0; /* keep compiler quiet */ + } + } + + ReleaseVariableStats(vardata); + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(selec); + + return (Selectivity) selec; +} + +/* + * strip_array_coercion - strip binary-compatible relabeling from an array expr + * + * For array values, the parser normally generates ArrayCoerceExpr conversions, + * but it seems possible that RelabelType might show up. Also, the planner + * is not currently tense about collapsing stacked ArrayCoerceExpr nodes, + * so we need to be ready to deal with more than one level. + */ +static Node * +strip_array_coercion(Node *node) +{ + for (;;) + { + if (node && IsA(node, ArrayCoerceExpr)) + { + ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; + + /* + * If the per-element expression is just a RelabelType on top of + * CaseTestExpr, then we know it's a binary-compatible relabeling. + */ + if (IsA(acoerce->elemexpr, RelabelType) && + IsA(((RelabelType *) acoerce->elemexpr)->arg, CaseTestExpr)) + node = (Node *) acoerce->arg; + else + break; + } + else if (node && IsA(node, RelabelType)) + { + /* We don't really expect this case, but may as well cope */ + node = (Node *) ((RelabelType *) node)->arg; + } + else + break; + } + return node; +} + +/* + * scalararraysel - Selectivity of ScalarArrayOpExpr Node. 
+ */ +Selectivity +scalararraysel(PlannerInfo *root, + ScalarArrayOpExpr *clause, + bool is_join_clause, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + Oid operator = clause->opno; + bool useOr = clause->useOr; + bool isEquality = false; + bool isInequality = false; + Node *leftop; + Node *rightop; + Oid nominal_element_type; + Oid nominal_element_collation; + TypeCacheEntry *typentry; + RegProcedure oprsel; + FmgrInfo oprselproc; + Selectivity s1; + Selectivity s1disjoint; + + /* First, deconstruct the expression */ + Assert(list_length(clause->args) == 2); + leftop = (Node *) linitial(clause->args); + rightop = (Node *) lsecond(clause->args); + + /* aggressively reduce both sides to constants */ + leftop = estimate_expression_value(root, leftop); + rightop = estimate_expression_value(root, rightop); + + /* get nominal (after relabeling) element type of rightop */ + nominal_element_type = get_base_element_type(exprType(rightop)); + if (!OidIsValid(nominal_element_type)) + return (Selectivity) 0.5; /* probably shouldn't happen */ + /* get nominal collation, too, for generating constants */ + nominal_element_collation = exprCollation(rightop); + + /* look through any binary-compatible relabeling of rightop */ + rightop = strip_array_coercion(rightop); + + /* + * Detect whether the operator is the default equality or inequality + * operator of the array element type. + */ + typentry = lookup_type_cache(nominal_element_type, TYPECACHE_EQ_OPR); + if (OidIsValid(typentry->eq_opr)) + { + if (operator == typentry->eq_opr) + isEquality = true; + else if (get_negator(operator) == typentry->eq_opr) + isInequality = true; + } + + /* + * If it is equality or inequality, we might be able to estimate this as a + * form of array containment; for instance "const = ANY(column)" can be + * treated as "ARRAY[const] <@ column". scalararraysel_containment tries + * that, and returns the selectivity estimate if successful, or -1 if not. + */ + if ((isEquality || isInequality) && !is_join_clause) + { + s1 = scalararraysel_containment(root, leftop, rightop, + nominal_element_type, + isEquality, useOr, varRelid); + if (s1 >= 0.0) + return s1; + } + + /* + * Look up the underlying operator's selectivity estimator. Punt if it + * hasn't got one. + */ + if (is_join_clause) + oprsel = get_oprjoin(operator); + else + oprsel = get_oprrest(operator); + if (!oprsel) + return (Selectivity) 0.5; + fmgr_info(oprsel, &oprselproc); + + /* + * In the array-containment check above, we must only believe that an + * operator is equality or inequality if it is the default btree equality + * operator (or its negator) for the element type, since those are the + * operators that array containment will use. But in what follows, we can + * be a little laxer, and also believe that any operators using eqsel() or + * neqsel() as selectivity estimator act like equality or inequality. + */ + if (oprsel == F_EQSEL || oprsel == F_EQJOINSEL) + isEquality = true; + else if (oprsel == F_NEQSEL || oprsel == F_NEQJOINSEL) + isInequality = true; + + /* + * We consider three cases: + * + * 1. rightop is an Array constant: deconstruct the array, apply the + * operator's selectivity function for each array element, and merge the + * results in the same way that clausesel.c does for AND/OR combinations. + * + * 2. rightop is an ARRAY[] construct: apply the operator's selectivity + * function for each element of the ARRAY[] construct, and merge. + * + * 3. otherwise, make a guess ... 
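 *
 * Illustrative example for case 1 (numbers invented): for "= ANY" over
 * a three-element constant array whose elements each have equality
 * selectivity s2 = 0.01, the independent-OR combination below gives
 * 1 - (1 - 0.01)^3, about 0.0297, while the disjoint-sum estimate is
 * 0.03; the disjoint estimate is used so long as it stays in [0, 1].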
+ */ + if (rightop && IsA(rightop, Const)) + { + Datum arraydatum = ((Const *) rightop)->constvalue; + bool arrayisnull = ((Const *) rightop)->constisnull; + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + int i; + + if (arrayisnull) /* qual can't succeed if null array */ + return (Selectivity) 0.0; + arrayval = DatumGetArrayTypeP(arraydatum); + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + + /* + * For generic operators, we assume the probability of success is + * independent for each array element. But for "= ANY" or "<> ALL", + * if the array elements are distinct (which'd typically be the case) + * then the probabilities are disjoint, and we should just sum them. + * + * If we were being really tense we would try to confirm that the + * elements are all distinct, but that would be expensive and it + * doesn't seem to be worth the cycles; it would amount to penalizing + * well-written queries in favor of poorly-written ones. However, we + * do protect ourselves a little bit by checking whether the + * disjointness assumption leads to an impossible (out of range) + * probability; if so, we fall back to the normal calculation. + */ + s1 = s1disjoint = (useOr ? 0.0 : 1.0); + + for (i = 0; i < num_elems; i++) + { + List *args; + Selectivity s2; + + args = list_make2(leftop, + makeConst(nominal_element_type, + -1, + nominal_element_collation, + elmlen, + elem_values[i], + elem_nulls[i], + elmbyval)); + if (is_join_clause) + s2 = DatumGetFloat8(FunctionCall5Coll(&oprselproc, + clause->inputcollid, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + else + s2 = DatumGetFloat8(FunctionCall4Coll(&oprselproc, + clause->inputcollid, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int32GetDatum(varRelid))); + + if (useOr) + { + s1 = s1 + s2 - s1 * s2; + if (isEquality) + s1disjoint += s2; + } + else + { + s1 = s1 * s2; + if (isInequality) + s1disjoint += s2 - 1.0; + } + } + + /* accept disjoint-probability estimate if in range */ + if ((useOr ? isEquality : isInequality) && + s1disjoint >= 0.0 && s1disjoint <= 1.0) + s1 = s1disjoint; + } + else if (rightop && IsA(rightop, ArrayExpr) && + !((ArrayExpr *) rightop)->multidims) + { + ArrayExpr *arrayexpr = (ArrayExpr *) rightop; + int16 elmlen; + bool elmbyval; + ListCell *l; + + get_typlenbyval(arrayexpr->element_typeid, + &elmlen, &elmbyval); + + /* + * We use the assumption of disjoint probabilities here too, although + * the odds of equal array elements are rather higher if the elements + * are not all constants (which they won't be, else constant folding + * would have reduced the ArrayExpr to a Const). In this path it's + * critical to have the sanity check on the s1disjoint estimate. + */ + s1 = s1disjoint = (useOr ? 0.0 : 1.0); + + foreach(l, arrayexpr->elements) + { + Node *elem = (Node *) lfirst(l); + List *args; + Selectivity s2; + + /* + * Theoretically, if elem isn't of nominal_element_type we should + * insert a RelabelType, but it seems unlikely that any operator + * estimation function would really care ... 
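+ *
+ * As a worked illustration of the merging arithmetic (hypothetical
+ * numbers): for an equality ANY over three elements whose per-element
+ * selectivity s2 is 0.01 each, the independence rule s1 = s1 + s2 - s1*s2
+ * gives 1 - 0.99^3 ~= 0.0297, while the disjoint sum gives exactly 0.03.
+ * With, say, 200 such elements the disjoint sum would be 2.0, which is
+ * out of range, so we'd discard it and keep the independence result.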
+ */ + args = list_make2(leftop, elem); + if (is_join_clause) + s2 = DatumGetFloat8(FunctionCall5Coll(&oprselproc, + clause->inputcollid, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + else + s2 = DatumGetFloat8(FunctionCall4Coll(&oprselproc, + clause->inputcollid, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int32GetDatum(varRelid))); + + if (useOr) + { + s1 = s1 + s2 - s1 * s2; + if (isEquality) + s1disjoint += s2; + } + else + { + s1 = s1 * s2; + if (isInequality) + s1disjoint += s2 - 1.0; + } + } + + /* accept disjoint-probability estimate if in range */ + if ((useOr ? isEquality : isInequality) && + s1disjoint >= 0.0 && s1disjoint <= 1.0) + s1 = s1disjoint; + } + else + { + CaseTestExpr *dummyexpr; + List *args; + Selectivity s2; + int i; + + /* + * We need a dummy rightop to pass to the operator selectivity + * routine. It can be pretty much anything that doesn't look like a + * constant; CaseTestExpr is a convenient choice. + */ + dummyexpr = makeNode(CaseTestExpr); + dummyexpr->typeId = nominal_element_type; + dummyexpr->typeMod = -1; + dummyexpr->collation = clause->inputcollid; + args = list_make2(leftop, dummyexpr); + if (is_join_clause) + s2 = DatumGetFloat8(FunctionCall5Coll(&oprselproc, + clause->inputcollid, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + else + s2 = DatumGetFloat8(FunctionCall4Coll(&oprselproc, + clause->inputcollid, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int32GetDatum(varRelid))); + s1 = useOr ? 0.0 : 1.0; + + /* + * Arbitrarily assume 10 elements in the eventual array value (see + * also estimate_array_length). We don't risk an assumption of + * disjoint probabilities here. + */ + for (i = 0; i < 10; i++) + { + if (useOr) + s1 = s1 + s2 - s1 * s2; + else + s1 = s1 * s2; + } + } + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(s1); + + return s1; +} + +/* + * Estimate number of elements in the array yielded by an expression. + * + * It's important that this agree with scalararraysel. + */ +int +estimate_array_length(Node *arrayexpr) +{ + /* look through any binary-compatible relabeling of arrayexpr */ + arrayexpr = strip_array_coercion(arrayexpr); + + if (arrayexpr && IsA(arrayexpr, Const)) + { + Datum arraydatum = ((Const *) arrayexpr)->constvalue; + bool arrayisnull = ((Const *) arrayexpr)->constisnull; + ArrayType *arrayval; + + if (arrayisnull) + return 0; + arrayval = DatumGetArrayTypeP(arraydatum); + return ArrayGetNItems(ARR_NDIM(arrayval), ARR_DIMS(arrayval)); + } + else if (arrayexpr && IsA(arrayexpr, ArrayExpr) && + !((ArrayExpr *) arrayexpr)->multidims) + { + return list_length(((ArrayExpr *) arrayexpr)->elements); + } + else + { + /* default guess --- see also scalararraysel */ + return 10; + } +} + +/* + * rowcomparesel - Selectivity of RowCompareExpr Node. + * + * We estimate RowCompare selectivity by considering just the first (high + * order) columns, which makes it equivalent to an ordinary OpExpr. While + * this estimate could be refined by considering additional columns, it + * seems unlikely that we could do a lot better without multi-column + * statistics. 
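+ *
+ * For example, "(a, b) < (10, 20)" is estimated here as though it were
+ * simply "a < 10"; the comparison on b is ignored.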
+ */ +Selectivity +rowcomparesel(PlannerInfo *root, + RowCompareExpr *clause, + int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo) +{ + Selectivity s1; + Oid opno = linitial_oid(clause->opnos); + Oid inputcollid = linitial_oid(clause->inputcollids); + List *opargs; + bool is_join_clause; + + /* Build equivalent arg list for single operator */ + opargs = list_make2(linitial(clause->largs), linitial(clause->rargs)); + + /* + * Decide if it's a join clause. This should match clausesel.c's + * treat_as_join_clause(), except that we intentionally consider only the + * leading columns and not the rest of the clause. + */ + if (varRelid != 0) + { + /* + * Caller is forcing restriction mode (eg, because we are examining an + * inner indexscan qual). + */ + is_join_clause = false; + } + else if (sjinfo == NULL) + { + /* + * It must be a restriction clause, since it's being evaluated at a + * scan node. + */ + is_join_clause = false; + } + else + { + /* + * Otherwise, it's a join if there's more than one relation used. + */ + is_join_clause = (NumRelids(root, (Node *) opargs) > 1); + } + + if (is_join_clause) + { + /* Estimate selectivity for a join clause. */ + s1 = join_selectivity(root, opno, + opargs, + inputcollid, + jointype, + sjinfo); + } + else + { + /* Estimate selectivity for a restriction clause. */ + s1 = restriction_selectivity(root, opno, + opargs, + inputcollid, + varRelid); + } + + return s1; +} + +/* + * eqjoinsel - Join selectivity of "=" + */ +Datum +eqjoinsel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + +#ifdef NOT_USED + JoinType jointype = (JoinType) PG_GETARG_INT16(3); +#endif + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); + Oid collation = PG_GET_COLLATION(); + double selec; + double selec_inner; + VariableStatData vardata1; + VariableStatData vardata2; + double nd1; + double nd2; + bool isdefault1; + bool isdefault2; + Oid opfuncoid; + AttStatsSlot sslot1; + AttStatsSlot sslot2; + Form_pg_statistic stats1 = NULL; + Form_pg_statistic stats2 = NULL; + bool have_mcvs1 = false; + bool have_mcvs2 = false; + bool join_is_reversed; + RelOptInfo *inner_rel; + + get_join_variables(root, args, sjinfo, + &vardata1, &vardata2, &join_is_reversed); + + nd1 = get_variable_numdistinct(&vardata1, &isdefault1); + nd2 = get_variable_numdistinct(&vardata2, &isdefault2); + + opfuncoid = get_opcode(operator); + + memset(&sslot1, 0, sizeof(sslot1)); + memset(&sslot2, 0, sizeof(sslot2)); + + if (HeapTupleIsValid(vardata1.statsTuple)) + { + /* note we allow use of nullfrac regardless of security check */ + stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple); + if (statistic_proc_security_check(&vardata1, opfuncoid)) + have_mcvs1 = get_attstatsslot(&sslot1, vardata1.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + if (HeapTupleIsValid(vardata2.statsTuple)) + { + /* note we allow use of nullfrac regardless of security check */ + stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple); + if (statistic_proc_security_check(&vardata2, opfuncoid)) + have_mcvs2 = get_attstatsslot(&sslot2, vardata2.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + /* We need to compute the inner-join selectivity in all cases */ + selec_inner = eqjoinsel_inner(opfuncoid, collation, + &vardata1, &vardata2, + nd1, nd2, + isdefault1, isdefault2, + &sslot1, &sslot2, + 
stats1, stats2, + have_mcvs1, have_mcvs2); + + switch (sjinfo->jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_FULL: + selec = selec_inner; + break; + case JOIN_SEMI: + case JOIN_ANTI: + + /* + * Look up the join's inner relation. min_righthand is sufficient + * information because neither SEMI nor ANTI joins permit any + * reassociation into or out of their RHS, so the righthand will + * always be exactly that set of rels. + */ + inner_rel = find_join_input_rel(root, sjinfo->min_righthand); + + if (!join_is_reversed) + selec = eqjoinsel_semi(opfuncoid, collation, + &vardata1, &vardata2, + nd1, nd2, + isdefault1, isdefault2, + &sslot1, &sslot2, + stats1, stats2, + have_mcvs1, have_mcvs2, + inner_rel); + else + { + Oid commop = get_commutator(operator); + Oid commopfuncoid = OidIsValid(commop) ? get_opcode(commop) : InvalidOid; + + selec = eqjoinsel_semi(commopfuncoid, collation, + &vardata2, &vardata1, + nd2, nd1, + isdefault2, isdefault1, + &sslot2, &sslot1, + stats2, stats1, + have_mcvs2, have_mcvs1, + inner_rel); + } + + /* + * We should never estimate the output of a semijoin to be more + * rows than we estimate for an inner join with the same input + * rels and join condition; it's obviously impossible for that to + * happen. The former estimate is N1 * Ssemi while the latter is + * N1 * N2 * Sinner, so we may clamp Ssemi <= N2 * Sinner. Doing + * this is worthwhile because of the shakier estimation rules we + * use in eqjoinsel_semi, particularly in cases where it has to + * punt entirely. + */ + selec = Min(selec, inner_rel->rows * selec_inner); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", + (int) sjinfo->jointype); + selec = 0; /* keep compiler quiet */ + break; + } + + free_attstatsslot(&sslot1); + free_attstatsslot(&sslot2); + + ReleaseVariableStats(vardata1); + ReleaseVariableStats(vardata2); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); +} + +/* + * eqjoinsel_inner --- eqjoinsel for normal inner join + * + * We also use this for LEFT/FULL outer joins; it's not presently clear + * that it's worth trying to distinguish them here. + */ +static double +eqjoinsel_inner(Oid opfuncoid, Oid collation, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2) +{ + double selec; + + if (have_mcvs1 && have_mcvs2) + { + /* + * We have most-common-value lists for both relations. Run through + * the lists to see which MCVs actually join to each other with the + * given operator. This allows us to determine the exact join + * selectivity for the portion of the relations represented by the MCV + * lists. We still have to estimate for the remaining population, but + * in a skewed distribution this gives us a big leg up in accuracy. + * For motivation see the analysis in Y. Ioannidis and S. + * Christodoulakis, "On the propagation of errors in the size of join + * results", Technical Report 1018, Computer Science Dept., University + * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu). 
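+ *
+ * A small hypothetical example of the exact part of the computation: if
+ * the value 42 has MCV frequency 0.20 in relation 1 and 0.10 in relation
+ * 2, that matched pair contributes 0.20 * 0.10 = 0.02 to the join
+ * selectivity (matchprodfreq below); the remaining non-MCV populations
+ * are then covered by the scaled estimates that follow.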
+ */ + LOCAL_FCINFO(fcinfo, 2); + FmgrInfo eqproc; + bool *hasmatch1; + bool *hasmatch2; + double nullfrac1 = stats1->stanullfrac; + double nullfrac2 = stats2->stanullfrac; + double matchprodfreq, + matchfreq1, + matchfreq2, + unmatchfreq1, + unmatchfreq2, + otherfreq1, + otherfreq2, + totalsel1, + totalsel2; + int i, + nmatches; + + fmgr_info(opfuncoid, &eqproc); + + /* + * Save a few cycles by setting up the fcinfo struct just once. Using + * FunctionCallInvoke directly also avoids failure if the eqproc + * returns NULL, though really equality functions should never do + * that. + */ + InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + + hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); + hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool)); + + /* + * Note we assume that each MCV will match at most one member of the + * other MCV list. If the operator isn't really equality, there could + * be multiple matches --- but we don't look for them, both for speed + * and because the math wouldn't add up... + */ + matchprodfreq = 0.0; + nmatches = 0; + for (i = 0; i < sslot1->nvalues; i++) + { + int j; + + fcinfo->args[0].value = sslot1->values[i]; + + for (j = 0; j < sslot2->nvalues; j++) + { + Datum fresult; + + if (hasmatch2[j]) + continue; + fcinfo->args[1].value = sslot2->values[j]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + { + hasmatch1[i] = hasmatch2[j] = true; + matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j]; + nmatches++; + break; + } + } + } + CLAMP_PROBABILITY(matchprodfreq); + /* Sum up frequencies of matched and unmatched MCVs */ + matchfreq1 = unmatchfreq1 = 0.0; + for (i = 0; i < sslot1->nvalues; i++) + { + if (hasmatch1[i]) + matchfreq1 += sslot1->numbers[i]; + else + unmatchfreq1 += sslot1->numbers[i]; + } + CLAMP_PROBABILITY(matchfreq1); + CLAMP_PROBABILITY(unmatchfreq1); + matchfreq2 = unmatchfreq2 = 0.0; + for (i = 0; i < sslot2->nvalues; i++) + { + if (hasmatch2[i]) + matchfreq2 += sslot2->numbers[i]; + else + unmatchfreq2 += sslot2->numbers[i]; + } + CLAMP_PROBABILITY(matchfreq2); + CLAMP_PROBABILITY(unmatchfreq2); + pfree(hasmatch1); + pfree(hasmatch2); + + /* + * Compute total frequency of non-null values that are not in the MCV + * lists. + */ + otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1; + otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2; + CLAMP_PROBABILITY(otherfreq1); + CLAMP_PROBABILITY(otherfreq2); + + /* + * We can estimate the total selectivity from the point of view of + * relation 1 as: the known selectivity for matched MCVs, plus + * unmatched MCVs that are assumed to match against random members of + * relation 2's non-MCV population, plus non-MCV values that are + * assumed to match against random members of relation 2's unmatched + * MCVs plus non-MCV values. + */ + totalsel1 = matchprodfreq; + if (nd2 > sslot2->nvalues) + totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - sslot2->nvalues); + if (nd2 > nmatches) + totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / + (nd2 - nmatches); + /* Same estimate from the point of view of relation 2. */ + totalsel2 = matchprodfreq; + if (nd1 > sslot1->nvalues) + totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - sslot1->nvalues); + if (nd1 > nmatches) + totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / + (nd1 - nmatches); + + /* + * Use the smaller of the two estimates. 
This can be justified in + * essentially the same terms as given below for the no-stats case: to + * a first approximation, we are estimating from the point of view of + * the relation with smaller nd. + */ + selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2; + } + else + { + /* + * We do not have MCV lists for both sides. Estimate the join + * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This + * is plausible if we assume that the join operator is strict and the + * non-null values are about equally distributed: a given non-null + * tuple of rel1 will join to either zero or N2*(1-nullfrac2)/nd2 rows + * of rel2, so total join rows are at most + * N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join selectivity of + * not more than (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it + * is not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression + * with MIN() is an upper bound. Using the MIN() means we estimate + * from the point of view of the relation with smaller nd (since the + * larger nd is determining the MIN). It is reasonable to assume that + * most tuples in this rel will have join partners, so the bound is + * probably reasonably tight and should be taken as-is. + * + * XXX Can we be smarter if we have an MCV list for just one side? It + * seems that if we assume equal distribution for the other side, we + * end up with the same answer anyway. + */ + double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; + double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0; + + selec = (1.0 - nullfrac1) * (1.0 - nullfrac2); + if (nd1 > nd2) + selec /= nd1; + else + selec /= nd2; + } + + return selec; +} + +/* + * eqjoinsel_semi --- eqjoinsel for semi join + * + * (Also used for anti join, which we are supposed to estimate the same way.) + * Caller has ensured that vardata1 is the LHS variable. + * Unlike eqjoinsel_inner, we have to cope with opfuncoid being InvalidOid. + */ +static double +eqjoinsel_semi(Oid opfuncoid, Oid collation, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2, + RelOptInfo *inner_rel) +{ + double selec; + + /* + * We clamp nd2 to be not more than what we estimate the inner relation's + * size to be. This is intuitively somewhat reasonable since obviously + * there can't be more than that many distinct values coming from the + * inner rel. The reason for the asymmetry (ie, that we don't clamp nd1 + * likewise) is that this is the only pathway by which restriction clauses + * applied to the inner rel will affect the join result size estimate, + * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by + * only the outer rel's size. If we clamped nd1 we'd be double-counting + * the selectivity of outer-rel restrictions. + * + * We can apply this clamping both with respect to the base relation from + * which the join variable comes (if there is just one), and to the + * immediate inner input relation of the current join. + * + * If we clamp, we can treat nd2 as being a non-default estimate; it's not + * great, maybe, but it didn't come out of nowhere either. This is most + * helpful when the inner relation is empty and consequently has no stats. 
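+ *
+ * For instance (hypothetical numbers): if the column's stats say nd2 is
+ * 10000 but restrictions on the inner rel are expected to leave only 500
+ * rows, nd2 gets clamped to 500 below and is thereafter treated as a
+ * non-default estimate.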
+ */ + if (vardata2->rel) + { + if (nd2 >= vardata2->rel->rows) + { + nd2 = vardata2->rel->rows; + isdefault2 = false; + } + } + if (nd2 >= inner_rel->rows) + { + nd2 = inner_rel->rows; + isdefault2 = false; + } + + if (have_mcvs1 && have_mcvs2 && OidIsValid(opfuncoid)) + { + /* + * We have most-common-value lists for both relations. Run through + * the lists to see which MCVs actually join to each other with the + * given operator. This allows us to determine the exact join + * selectivity for the portion of the relations represented by the MCV + * lists. We still have to estimate for the remaining population, but + * in a skewed distribution this gives us a big leg up in accuracy. + */ + LOCAL_FCINFO(fcinfo, 2); + FmgrInfo eqproc; + bool *hasmatch1; + bool *hasmatch2; + double nullfrac1 = stats1->stanullfrac; + double matchfreq1, + uncertainfrac, + uncertain; + int i, + nmatches, + clamped_nvalues2; + + /* + * The clamping above could have resulted in nd2 being less than + * sslot2->nvalues; in which case, we assume that precisely the nd2 + * most common values in the relation will appear in the join input, + * and so compare to only the first nd2 members of the MCV list. Of + * course this is frequently wrong, but it's the best bet we can make. + */ + clamped_nvalues2 = Min(sslot2->nvalues, nd2); + + fmgr_info(opfuncoid, &eqproc); + + /* + * Save a few cycles by setting up the fcinfo struct just once. Using + * FunctionCallInvoke directly also avoids failure if the eqproc + * returns NULL, though really equality functions should never do + * that. + */ + InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + + hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); + hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool)); + + /* + * Note we assume that each MCV will match at most one member of the + * other MCV list. If the operator isn't really equality, there could + * be multiple matches --- but we don't look for them, both for speed + * and because the math wouldn't add up... + */ + nmatches = 0; + for (i = 0; i < sslot1->nvalues; i++) + { + int j; + + fcinfo->args[0].value = sslot1->values[i]; + + for (j = 0; j < clamped_nvalues2; j++) + { + Datum fresult; + + if (hasmatch2[j]) + continue; + fcinfo->args[1].value = sslot2->values[j]; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + if (!fcinfo->isnull && DatumGetBool(fresult)) + { + hasmatch1[i] = hasmatch2[j] = true; + nmatches++; + break; + } + } + } + /* Sum up frequencies of matched MCVs */ + matchfreq1 = 0.0; + for (i = 0; i < sslot1->nvalues; i++) + { + if (hasmatch1[i]) + matchfreq1 += sslot1->numbers[i]; + } + CLAMP_PROBABILITY(matchfreq1); + pfree(hasmatch1); + pfree(hasmatch2); + + /* + * Now we need to estimate the fraction of relation 1 that has at + * least one join partner. We know for certain that the matched MCVs + * do, so that gives us a lower bound, but we're really in the dark + * about everything else. Our crude approach is: if nd1 <= nd2 then + * assume all non-null rel1 rows have join partners, else assume for + * the uncertain rows that a fraction nd2/nd1 have join partners. We + * can discount the known-matched MCVs from the distinct-values counts + * before doing the division. + * + * Crude as the above is, it's completely useless if we don't have + * reliable ndistinct values for both sides. 
Hence, if either nd1 or + * nd2 is default, punt and assume half of the uncertain rows have + * join partners. + */ + if (!isdefault1 && !isdefault2) + { + nd1 -= nmatches; + nd2 -= nmatches; + if (nd1 <= nd2 || nd2 < 0) + uncertainfrac = 1.0; + else + uncertainfrac = nd2 / nd1; + } + else + uncertainfrac = 0.5; + uncertain = 1.0 - matchfreq1 - nullfrac1; + CLAMP_PROBABILITY(uncertain); + selec = matchfreq1 + uncertainfrac * uncertain; + } + else + { + /* + * Without MCV lists for both sides, we can only use the heuristic + * about nd1 vs nd2. + */ + double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; + + if (!isdefault1 && !isdefault2) + { + if (nd1 <= nd2 || nd2 < 0) + selec = 1.0 - nullfrac1; + else + selec = (nd2 / nd1) * (1.0 - nullfrac1); + } + else + selec = 0.5 * (1.0 - nullfrac1); + } + + return selec; +} + +/* + * neqjoinsel - Join selectivity of "!=" + */ +Datum +neqjoinsel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + JoinType jointype = (JoinType) PG_GETARG_INT16(3); + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); + Oid collation = PG_GET_COLLATION(); + float8 result; + + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) + { + /* + * For semi-joins, if there is more than one distinct value in the RHS + * relation then every non-null LHS row must find a row to join since + * it can only be equal to one of them. We'll assume that there is + * always more than one distinct RHS value for the sake of stability, + * though in theory we could have special cases for empty RHS + * (selectivity = 0) and single-distinct-value RHS (selectivity = + * fraction of LHS that has the same value as the single RHS value). + * + * For anti-joins, if we use the same assumption that there is more + * than one distinct key in the RHS relation, then every non-null LHS + * row must be suppressed by the anti-join. + * + * So either way, the selectivity estimate should be 1 - nullfrac. + */ + VariableStatData leftvar; + VariableStatData rightvar; + bool reversed; + HeapTuple statsTuple; + double nullfrac; + + get_join_variables(root, args, sjinfo, &leftvar, &rightvar, &reversed); + statsTuple = reversed ? rightvar.statsTuple : leftvar.statsTuple; + if (HeapTupleIsValid(statsTuple)) + nullfrac = ((Form_pg_statistic) GETSTRUCT(statsTuple))->stanullfrac; + else + nullfrac = 0.0; + ReleaseVariableStats(leftvar); + ReleaseVariableStats(rightvar); + + result = 1.0 - nullfrac; + } + else + { + /* + * We want 1 - eqjoinsel() where the equality operator is the one + * associated with this != operator, that is, its negator. + */ + Oid eqop = get_negator(operator); + + if (eqop) + { + result = + DatumGetFloat8(DirectFunctionCall5Coll(eqjoinsel, + collation, + PointerGetDatum(root), + ObjectIdGetDatum(eqop), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + } + else + { + /* Use default selectivity (should we raise an error instead?) 
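+ * (DEFAULT_EQ_SEL is 0.005, so this fallback works out to a final
+ * result of 1 - 0.005 = 0.995.)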
*/ + result = DEFAULT_EQ_SEL; + } + result = 1.0 - result; + } + + PG_RETURN_FLOAT8(result); +} + +/* + * scalarltjoinsel - Join selectivity of "<" for scalars + */ +Datum +scalarltjoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); +} + +/* + * scalarlejoinsel - Join selectivity of "<=" for scalars + */ +Datum +scalarlejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); +} + +/* + * scalargtjoinsel - Join selectivity of ">" for scalars + */ +Datum +scalargtjoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); +} + +/* + * scalargejoinsel - Join selectivity of ">=" for scalars + */ +Datum +scalargejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); +} + + +/* + * mergejoinscansel - Scan selectivity of merge join. + * + * A merge join will stop as soon as it exhausts either input stream. + * Therefore, if we can estimate the ranges of both input variables, + * we can estimate how much of the input will actually be read. This + * can have a considerable impact on the cost when using indexscans. + * + * Also, we can estimate how much of each input has to be read before the + * first join pair is found, which will affect the join's startup time. + * + * clause should be a clause already known to be mergejoinable. opfamily, + * strategy, and nulls_first specify the sort ordering being used. + * + * The outputs are: + * *leftstart is set to the fraction of the left-hand variable expected + * to be scanned before the first join pair is found (0 to 1). + * *leftend is set to the fraction of the left-hand variable expected + * to be scanned before the join terminates (0 to 1). + * *rightstart, *rightend similarly for the right-hand variable. + */ +void +mergejoinscansel(PlannerInfo *root, Node *clause, + Oid opfamily, int strategy, bool nulls_first, + Selectivity *leftstart, Selectivity *leftend, + Selectivity *rightstart, Selectivity *rightend) +{ + Node *left, + *right; + VariableStatData leftvar, + rightvar; + int op_strategy; + Oid op_lefttype; + Oid op_righttype; + Oid opno, + collation, + lsortop, + rsortop, + lstatop, + rstatop, + ltop, + leop, + revltop, + revleop; + bool isgt; + Datum leftmin, + leftmax, + rightmin, + rightmax; + double selec; + + /* Set default results if we can't figure anything out. */ + /* XXX should default "start" fraction be a bit more than 0? */ + *leftstart = *rightstart = 0.0; + *leftend = *rightend = 1.0; + + /* Deconstruct the merge clause */ + if (!is_opclause(clause)) + return; /* shouldn't happen */ + opno = ((OpExpr *) clause)->opno; + collation = ((OpExpr *) clause)->inputcollid; + left = get_leftop((Expr *) clause); + right = get_rightop((Expr *) clause); + if (!right) + return; /* shouldn't happen */ + + /* Look for stats for the inputs */ + examine_variable(root, left, 0, &leftvar); + examine_variable(root, right, 0, &rightvar); + + /* Extract the operator's declared left/right datatypes */ + get_op_opfamily_properties(opno, opfamily, false, + &op_strategy, + &op_lefttype, + &op_righttype); + Assert(op_strategy == BTEqualStrategyNumber); + + /* + * Look up the various operators we need. If we don't find them all, it + * probably means the opfamily is broken, but we just fail silently. + * + * Note: we expect that pg_statistic histograms will be sorted by the '<' + * operator, regardless of which sort direction we are considering. 
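+ *
+ * (For example, even in the descending-order case below, lstatop and
+ * rstatop are looked up as the '<' members of the opfamily, because that
+ * is the order in which the histogram entries are stored.)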
+ */ + switch (strategy) + { + case BTLessStrategyNumber: + isgt = false; + if (op_lefttype == op_righttype) + { + /* easy case */ + ltop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTLessStrategyNumber); + leop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTLessEqualStrategyNumber); + lsortop = ltop; + rsortop = ltop; + lstatop = lsortop; + rstatop = rsortop; + revltop = ltop; + revleop = leop; + } + else + { + ltop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTLessStrategyNumber); + leop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTLessEqualStrategyNumber); + lsortop = get_opfamily_member(opfamily, + op_lefttype, op_lefttype, + BTLessStrategyNumber); + rsortop = get_opfamily_member(opfamily, + op_righttype, op_righttype, + BTLessStrategyNumber); + lstatop = lsortop; + rstatop = rsortop; + revltop = get_opfamily_member(opfamily, + op_righttype, op_lefttype, + BTLessStrategyNumber); + revleop = get_opfamily_member(opfamily, + op_righttype, op_lefttype, + BTLessEqualStrategyNumber); + } + break; + case BTGreaterStrategyNumber: + /* descending-order case */ + isgt = true; + if (op_lefttype == op_righttype) + { + /* easy case */ + ltop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTGreaterStrategyNumber); + leop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTGreaterEqualStrategyNumber); + lsortop = ltop; + rsortop = ltop; + lstatop = get_opfamily_member(opfamily, + op_lefttype, op_lefttype, + BTLessStrategyNumber); + rstatop = lstatop; + revltop = ltop; + revleop = leop; + } + else + { + ltop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTGreaterStrategyNumber); + leop = get_opfamily_member(opfamily, + op_lefttype, op_righttype, + BTGreaterEqualStrategyNumber); + lsortop = get_opfamily_member(opfamily, + op_lefttype, op_lefttype, + BTGreaterStrategyNumber); + rsortop = get_opfamily_member(opfamily, + op_righttype, op_righttype, + BTGreaterStrategyNumber); + lstatop = get_opfamily_member(opfamily, + op_lefttype, op_lefttype, + BTLessStrategyNumber); + rstatop = get_opfamily_member(opfamily, + op_righttype, op_righttype, + BTLessStrategyNumber); + revltop = get_opfamily_member(opfamily, + op_righttype, op_lefttype, + BTGreaterStrategyNumber); + revleop = get_opfamily_member(opfamily, + op_righttype, op_lefttype, + BTGreaterEqualStrategyNumber); + } + break; + default: + goto fail; /* shouldn't get here */ + } + + if (!OidIsValid(lsortop) || + !OidIsValid(rsortop) || + !OidIsValid(lstatop) || + !OidIsValid(rstatop) || + !OidIsValid(ltop) || + !OidIsValid(leop) || + !OidIsValid(revltop) || + !OidIsValid(revleop)) + goto fail; /* insufficient info in catalogs */ + + /* Try to get ranges of both inputs */ + if (!isgt) + { + if (!get_variable_range(root, &leftvar, lstatop, collation, + &leftmin, &leftmax)) + goto fail; /* no range available from stats */ + if (!get_variable_range(root, &rightvar, rstatop, collation, + &rightmin, &rightmax)) + goto fail; /* no range available from stats */ + } + else + { + /* need to swap the max and min */ + if (!get_variable_range(root, &leftvar, lstatop, collation, + &leftmax, &leftmin)) + goto fail; /* no range available from stats */ + if (!get_variable_range(root, &rightvar, rstatop, collation, + &rightmax, &rightmin)) + goto fail; /* no range available from stats */ + } + + /* + * Now, the fraction of the left variable that will be scanned is the + * fraction that's <= the right-side maximum value. 
But only believe + * non-default estimates, else stick with our 1.0. + */ + selec = scalarineqsel(root, leop, isgt, true, collation, &leftvar, + rightmax, op_righttype); + if (selec != DEFAULT_INEQ_SEL) + *leftend = selec; + + /* And similarly for the right variable. */ + selec = scalarineqsel(root, revleop, isgt, true, collation, &rightvar, + leftmax, op_lefttype); + if (selec != DEFAULT_INEQ_SEL) + *rightend = selec; + + /* + * Only one of the two "end" fractions can really be less than 1.0; + * believe the smaller estimate and reset the other one to exactly 1.0. If + * we get exactly equal estimates (as can easily happen with self-joins), + * believe neither. + */ + if (*leftend > *rightend) + *leftend = 1.0; + else if (*leftend < *rightend) + *rightend = 1.0; + else + *leftend = *rightend = 1.0; + + /* + * Also, the fraction of the left variable that will be scanned before the + * first join pair is found is the fraction that's < the right-side + * minimum value. But only believe non-default estimates, else stick with + * our own default. + */ + selec = scalarineqsel(root, ltop, isgt, false, collation, &leftvar, + rightmin, op_righttype); + if (selec != DEFAULT_INEQ_SEL) + *leftstart = selec; + + /* And similarly for the right variable. */ + selec = scalarineqsel(root, revltop, isgt, false, collation, &rightvar, + leftmin, op_lefttype); + if (selec != DEFAULT_INEQ_SEL) + *rightstart = selec; + + /* + * Only one of the two "start" fractions can really be more than zero; + * believe the larger estimate and reset the other one to exactly 0.0. If + * we get exactly equal estimates (as can easily happen with self-joins), + * believe neither. + */ + if (*leftstart < *rightstart) + *leftstart = 0.0; + else if (*leftstart > *rightstart) + *rightstart = 0.0; + else + *leftstart = *rightstart = 0.0; + + /* + * If the sort order is nulls-first, we're going to have to skip over any + * nulls too. These would not have been counted by scalarineqsel, and we + * can safely add in this fraction regardless of whether we believe + * scalarineqsel's results or not. But be sure to clamp the sum to 1.0! + */ + if (nulls_first) + { + Form_pg_statistic stats; + + if (HeapTupleIsValid(leftvar.statsTuple)) + { + stats = (Form_pg_statistic) GETSTRUCT(leftvar.statsTuple); + *leftstart += stats->stanullfrac; + CLAMP_PROBABILITY(*leftstart); + *leftend += stats->stanullfrac; + CLAMP_PROBABILITY(*leftend); + } + if (HeapTupleIsValid(rightvar.statsTuple)) + { + stats = (Form_pg_statistic) GETSTRUCT(rightvar.statsTuple); + *rightstart += stats->stanullfrac; + CLAMP_PROBABILITY(*rightstart); + *rightend += stats->stanullfrac; + CLAMP_PROBABILITY(*rightend); + } + } + + /* Disbelieve start >= end, just in case that can happen */ + if (*leftstart >= *leftend) + { + *leftstart = 0.0; + *leftend = 1.0; + } + if (*rightstart >= *rightend) + { + *rightstart = 0.0; + *rightend = 1.0; + } + +fail: + ReleaseVariableStats(leftvar); + ReleaseVariableStats(rightvar); +} + + +/* + * matchingsel -- generic matching-operator selectivity support + * + * Use these for any operators that (a) are on data types for which we collect + * standard statistics, and (b) have behavior for which the default estimate + * (twice DEFAULT_EQ_SEL) is sane. Typically that is good for match-like + * operators. 
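+ *
+ * To give a sense of scale (hypothetical numbers): DEFAULT_MATCHING_SEL
+ * is 0.010, so with no better information a matching-style qual over a
+ * 1,000,000-row relation is estimated to pass about 10,000 rows;
+ * generic_restriction_selectivity may refine that when usable statistics
+ * are available.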
+ */ + +Datum +matchingsel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + Oid collation = PG_GET_COLLATION(); + double selec; + + /* Use generic restriction selectivity logic. */ + selec = generic_restriction_selectivity(root, operator, collation, + args, varRelid, + DEFAULT_MATCHING_SEL); + + PG_RETURN_FLOAT8((float8) selec); +} + +Datum +matchingjoinsel(PG_FUNCTION_ARGS) +{ + /* Just punt, for the moment. */ + PG_RETURN_FLOAT8(DEFAULT_MATCHING_SEL); +} + + +/* + * Helper routine for estimate_num_groups: add an item to a list of + * GroupVarInfos, but only if it's not known equal to any of the existing + * entries. + */ +typedef struct +{ + Node *var; /* might be an expression, not just a Var */ + RelOptInfo *rel; /* relation it belongs to */ + double ndistinct; /* # distinct values */ + bool isdefault; /* true if DEFAULT_NUM_DISTINCT was used */ +} GroupVarInfo; + +static List * +add_unique_group_var(PlannerInfo *root, List *varinfos, + Node *var, VariableStatData *vardata) +{ + GroupVarInfo *varinfo; + double ndistinct; + bool isdefault; + ListCell *lc; + + ndistinct = get_variable_numdistinct(vardata, &isdefault); + + foreach(lc, varinfos) + { + varinfo = (GroupVarInfo *) lfirst(lc); + + /* Drop exact duplicates */ + if (equal(var, varinfo->var)) + return varinfos; + + /* + * Drop known-equal vars, but only if they belong to different + * relations (see comments for estimate_num_groups) + */ + if (vardata->rel != varinfo->rel && + exprs_known_equal(root, var, varinfo->var)) + { + if (varinfo->ndistinct <= ndistinct) + { + /* Keep older item, forget new one */ + return varinfos; + } + else + { + /* Delete the older item */ + varinfos = foreach_delete_current(varinfos, lc); + } + } + } + + varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo)); + + varinfo->var = var; + varinfo->rel = vardata->rel; + varinfo->ndistinct = ndistinct; + varinfo->isdefault = isdefault; + varinfos = lappend(varinfos, varinfo); + return varinfos; +} + +/* + * estimate_num_groups - Estimate number of groups in a grouped query + * + * Given a query having a GROUP BY clause, estimate how many groups there + * will be --- ie, the number of distinct combinations of the GROUP BY + * expressions. + * + * This routine is also used to estimate the number of rows emitted by + * a DISTINCT filtering step; that is an isomorphic problem. (Note: + * actually, we only use it for DISTINCT when there's no grouping or + * aggregation ahead of the DISTINCT.) + * + * Inputs: + * root - the query + * groupExprs - list of expressions being grouped by + * input_rows - number of rows estimated to arrive at the group/unique + * filter step + * pgset - NULL, or a List** pointing to a grouping set to filter the + * groupExprs against + * + * Outputs: + * estinfo - When passed as non-NULL, the function will set bits in the + * "flags" field in order to provide callers with additional information + * about the estimation. Currently, we only set the SELFLAG_USED_DEFAULT + * bit if we used any default values in the estimation. + * + * Given the lack of any cross-correlation statistics in the system, it's + * impossible to do anything really trustworthy with GROUP BY conditions + * involving multiple Vars. We should however avoid assuming the worst + * case (all possible cross-product terms actually appear as groups) since + * very often the grouped-by Vars are highly correlated. 
Our current approach + * is as follows: + * 1. Expressions yielding boolean are assumed to contribute two groups, + * independently of their content, and are ignored in the subsequent + * steps. This is mainly because tests like "col IS NULL" break the + * heuristic used in step 2 especially badly. + * 2. Reduce the given expressions to a list of unique Vars used. For + * example, GROUP BY a, a + b is treated the same as GROUP BY a, b. + * It is clearly correct not to count the same Var more than once. + * It is also reasonable to treat f(x) the same as x: f() cannot + * increase the number of distinct values (unless it is volatile, + * which we consider unlikely for grouping), but it probably won't + * reduce the number of distinct values much either. + * As a special case, if a GROUP BY expression can be matched to an + * expressional index for which we have statistics, then we treat the + * whole expression as though it were just a Var. + * 3. If the list contains Vars of different relations that are known equal + * due to equivalence classes, then drop all but one of the Vars from each + * known-equal set, keeping the one with smallest estimated # of values + * (since the extra values of the others can't appear in joined rows). + * Note the reason we only consider Vars of different relations is that + * if we considered ones of the same rel, we'd be double-counting the + * restriction selectivity of the equality in the next step. + * 4. For Vars within a single source rel, we multiply together the numbers + * of values, clamp to the number of rows in the rel (divided by 10 if + * more than one Var), and then multiply by a factor based on the + * selectivity of the restriction clauses for that rel. When there's + * more than one Var, the initial product is probably too high (it's the + * worst case) but clamping to a fraction of the rel's rows seems to be a + * helpful heuristic for not letting the estimate get out of hand. (The + * factor of 10 is derived from pre-Postgres-7.4 practice.) The factor + * we multiply by to adjust for the restriction selectivity assumes that + * the restriction clauses are independent of the grouping, which may not + * be a valid assumption, but it's hard to do better. + * 5. If there are Vars from multiple rels, we repeat step 4 for each such + * rel, and multiply the results together. + * Note that rels not containing grouped Vars are ignored completely, as are + * join clauses. Such rels cannot increase the number of groups, and we + * assume such clauses do not reduce the number either (somewhat bogus, + * but we don't have the info to do better). + */ +double +estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, + List **pgset, EstimationInfo *estinfo) +{ + List *varinfos = NIL; + double srf_multiplier = 1.0; + double numdistinct; + ListCell *l; + int i; + + /* Zero the estinfo output parameter, if non-NULL */ + if (estinfo != NULL) + memset(estinfo, 0, sizeof(EstimationInfo)); + + /* + * We don't ever want to return an estimate of zero groups, as that tends + * to lead to division-by-zero and other unpleasantness. The input_rows + * estimate is usually already at least 1, but clamp it just in case it + * isn't. + */ + input_rows = clamp_row_est(input_rows); + + /* + * If no grouping columns, there's exactly one group. (This can't happen + * for normal cases with GROUP BY or DISTINCT, but it is possible for + * corner cases with set operations.) 
+ */ + if (groupExprs == NIL || (pgset && list_length(*pgset) < 1)) + return 1.0; + + /* + * Count groups derived from boolean grouping expressions. For other + * expressions, find the unique Vars used, treating an expression as a Var + * if we can find stats for it. For each one, record the statistical + * estimate of number of distinct values (total in its table, without + * regard for filtering). + */ + numdistinct = 1.0; + + i = 0; + foreach(l, groupExprs) + { + Node *groupexpr = (Node *) lfirst(l); + double this_srf_multiplier; + VariableStatData vardata; + List *varshere; + ListCell *l2; + + /* is expression in this grouping set? */ + if (pgset && !list_member_int(*pgset, i++)) + continue; + + /* + * Set-returning functions in grouping columns are a bit problematic. + * The code below will effectively ignore their SRF nature and come up + * with a numdistinct estimate as though they were scalar functions. + * We compensate by scaling up the end result by the largest SRF + * rowcount estimate. (This will be an overestimate if the SRF + * produces multiple copies of any output value, but it seems best to + * assume the SRF's outputs are distinct. In any case, it's probably + * pointless to worry too much about this without much better + * estimates for SRF output rowcounts than we have today.) + */ + this_srf_multiplier = expression_returns_set_rows(root, groupexpr); + if (srf_multiplier < this_srf_multiplier) + srf_multiplier = this_srf_multiplier; + + /* Short-circuit for expressions returning boolean */ + if (exprType(groupexpr) == BOOLOID) + { + numdistinct *= 2.0; + continue; + } + + /* + * If examine_variable is able to deduce anything about the GROUP BY + * expression, treat it as a single variable even if it's really more + * complicated. + * + * XXX This has the consequence that if there's a statistics object on + * the expression, we don't split it into individual Vars. This + * affects our selection of statistics in + * estimate_multivariate_ndistinct, because it's probably better to + * use more accurate estimate for each expression and treat them as + * independent, than to combine estimates for the extracted variables + * when we don't know how that relates to the expressions. + */ + examine_variable(root, groupexpr, 0, &vardata); + if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique) + { + varinfos = add_unique_group_var(root, varinfos, + groupexpr, &vardata); + ReleaseVariableStats(vardata); + continue; + } + ReleaseVariableStats(vardata); + + /* + * Else pull out the component Vars. Handle PlaceHolderVars by + * recursing into their arguments (effectively assuming that the + * PlaceHolderVar doesn't change the number of groups, which boils + * down to ignoring the possible addition of nulls to the result set). + */ + varshere = pull_var_clause(groupexpr, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_RECURSE_PLACEHOLDERS); + + /* + * If we find any variable-free GROUP BY item, then either it is a + * constant (and we can ignore it) or it contains a volatile function; + * in the latter case we punt and assume that each input row will + * yield a distinct group. 
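+ *
+ * For example, "GROUP BY 1 + 1" contributes nothing here (it folds to a
+ * constant), whereas "GROUP BY random()" makes us assume that every
+ * input row forms its own group.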
+ */ + if (varshere == NIL) + { + if (contain_volatile_functions(groupexpr)) + return input_rows; + continue; + } + + /* + * Else add variables to varinfos list + */ + foreach(l2, varshere) + { + Node *var = (Node *) lfirst(l2); + + examine_variable(root, var, 0, &vardata); + varinfos = add_unique_group_var(root, varinfos, var, &vardata); + ReleaseVariableStats(vardata); + } + } + + /* + * If now no Vars, we must have an all-constant or all-boolean GROUP BY + * list. + */ + if (varinfos == NIL) + { + /* Apply SRF multiplier as we would do in the long path */ + numdistinct *= srf_multiplier; + /* Round off */ + numdistinct = ceil(numdistinct); + /* Guard against out-of-range answers */ + if (numdistinct > input_rows) + numdistinct = input_rows; + if (numdistinct < 1.0) + numdistinct = 1.0; + return numdistinct; + } + + /* + * Group Vars by relation and estimate total numdistinct. + * + * For each iteration of the outer loop, we process the frontmost Var in + * varinfos, plus all other Vars in the same relation. We remove these + * Vars from the newvarinfos list for the next iteration. This is the + * easiest way to group Vars of same rel together. + */ + do + { + GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos); + RelOptInfo *rel = varinfo1->rel; + double reldistinct = 1; + double relmaxndistinct = reldistinct; + int relvarcount = 0; + List *newvarinfos = NIL; + List *relvarinfos = NIL; + + /* + * Split the list of varinfos in two - one for the current rel, one + * for remaining Vars on other rels. + */ + relvarinfos = lappend(relvarinfos, varinfo1); + for_each_from(l, varinfos, 1) + { + GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l); + + if (varinfo2->rel == varinfo1->rel) + { + /* varinfos on current rel */ + relvarinfos = lappend(relvarinfos, varinfo2); + } + else + { + /* not time to process varinfo2 yet */ + newvarinfos = lappend(newvarinfos, varinfo2); + } + } + + /* + * Get the numdistinct estimate for the Vars of this rel. We + * iteratively search for multivariate n-distinct with maximum number + * of vars; assuming that each var group is independent of the others, + * we multiply them together. Any remaining relvarinfos after no more + * multivariate matches are found are assumed independent too, so + * their individual ndistinct estimates are multiplied also. + * + * While iterating, count how many separate numdistinct values we + * apply. We apply a fudge factor below, but only if we multiplied + * more than one such values. + */ + while (relvarinfos) + { + double mvndistinct; + + if (estimate_multivariate_ndistinct(root, rel, &relvarinfos, + &mvndistinct)) + { + reldistinct *= mvndistinct; + if (relmaxndistinct < mvndistinct) + relmaxndistinct = mvndistinct; + relvarcount++; + } + else + { + foreach(l, relvarinfos) + { + GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l); + + reldistinct *= varinfo2->ndistinct; + if (relmaxndistinct < varinfo2->ndistinct) + relmaxndistinct = varinfo2->ndistinct; + relvarcount++; + + /* + * When varinfo2's isdefault is set then we'd better set + * the SELFLAG_USED_DEFAULT bit in the EstimationInfo. + */ + if (estinfo != NULL && varinfo2->isdefault) + estinfo->flags |= SELFLAG_USED_DEFAULT; + + } + + /* we're done with this relation */ + relvarinfos = NIL; + } + } + + /* + * Sanity check --- don't divide by zero if empty relation. + */ + Assert(IS_SIMPLE_REL(rel)); + if (rel->tuples > 0) + { + /* + * Clamp to size of rel, or size of rel / 10 if multiple Vars. 
The + * fudge factor is because the Vars are probably correlated but we + * don't know by how much. We should never clamp to less than the + * largest ndistinct value for any of the Vars, though, since + * there will surely be at least that many groups. + */ + double clamp = rel->tuples; + + if (relvarcount > 1) + { + clamp *= 0.1; + if (clamp < relmaxndistinct) + { + clamp = relmaxndistinct; + /* for sanity in case some ndistinct is too large: */ + if (clamp > rel->tuples) + clamp = rel->tuples; + } + } + if (reldistinct > clamp) + reldistinct = clamp; + + /* + * Update the estimate based on the restriction selectivity, + * guarding against division by zero when reldistinct is zero. + * Also skip this if we know that we are returning all rows. + */ + if (reldistinct > 0 && rel->rows < rel->tuples) + { + /* + * Given a table containing N rows with n distinct values in a + * uniform distribution, if we select p rows at random then + * the expected number of distinct values selected is + * + * n * (1 - product((N-N/n-i)/(N-i), i=0..p-1)) + * + * = n * (1 - (N-N/n)! / (N-N/n-p)! * (N-p)! / N!) + * + * See "Approximating block accesses in database + * organizations", S. B. Yao, Communications of the ACM, + * Volume 20 Issue 4, April 1977 Pages 260-261. + * + * Alternatively, re-arranging the terms from the factorials, + * this may be written as + * + * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1)) + * + * This form of the formula is more efficient to compute in + * the common case where p is larger than N/n. Additionally, + * as pointed out by Dell'Era, if i << N for all terms in the + * product, it can be approximated by + * + * n * (1 - ((N-p)/N)^(N/n)) + * + * See "Expected distinct values when selecting from a bag + * without replacement", Alberto Dell'Era, + * http://www.adellera.it/investigations/distinct_balls/. + * + * The condition i << N is equivalent to n >> 1, so this is a + * good approximation when the number of distinct values in + * the table is large. It turns out that this formula also + * works well even when n is small. + */ + reldistinct *= + (1 - pow((rel->tuples - rel->rows) / rel->tuples, + rel->tuples / reldistinct)); + } + reldistinct = clamp_row_est(reldistinct); + + /* + * Update estimate of total distinct groups. + */ + numdistinct *= reldistinct; + } + + varinfos = newvarinfos; + } while (varinfos != NIL); + + /* Now we can account for the effects of any SRFs */ + numdistinct *= srf_multiplier; + + /* Round off */ + numdistinct = ceil(numdistinct); + + /* Guard against out-of-range answers */ + if (numdistinct > input_rows) + numdistinct = input_rows; + if (numdistinct < 1.0) + numdistinct = 1.0; + + return numdistinct; +} + +/* + * Estimate hash bucket statistics when the specified expression is used + * as a hash key for the given number of buckets. + * + * This attempts to determine two values: + * + * 1. The frequency of the most common value of the expression (returns + * zero into *mcv_freq if we can't get that). + * + * 2. The "bucketsize fraction", ie, average number of entries in a bucket + * divided by total tuples in relation. + * + * XXX This is really pretty bogus since we're effectively assuming that the + * distribution of hash keys will be the same after applying restriction + * clauses as it was in the underlying relation. However, we are not nearly + * smart enough to figure out how the restrict clauses might change the + * distribution, so this will have to do for now. 
+ * + * We are passed the number of buckets the executor will use for the given + * input relation. If the data were perfectly distributed, with the same + * number of tuples going into each available bucket, then the bucketsize + * fraction would be 1/nbuckets. But this happy state of affairs will occur + * only if (a) there are at least nbuckets distinct data values, and (b) + * we have a not-too-skewed data distribution. Otherwise the buckets will + * be nonuniformly occupied. If the other relation in the join has a key + * distribution similar to this one's, then the most-loaded buckets are + * exactly those that will be probed most often. Therefore, the "average" + * bucket size for costing purposes should really be taken as something close + * to the "worst case" bucket size. We try to estimate this by adjusting the + * fraction if there are too few distinct data values, and then scaling up + * by the ratio of the most common value's frequency to the average frequency. + * + * If no statistics are available, use a default estimate of 0.1. This will + * discourage use of a hash rather strongly if the inner relation is large, + * which is what we want. We do not want to hash unless we know that the + * inner rel is well-dispersed (or the alternatives seem much worse). + * + * The caller should also check that the mcv_freq is not so large that the + * most common value would by itself require an impractically large bucket. + * In a hash join, the executor can split buckets if they get too big, but + * obviously that doesn't help for a bucket that contains many duplicates of + * the same value. + */ +void +estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets, + Selectivity *mcv_freq, + Selectivity *bucketsize_frac) +{ + VariableStatData vardata; + double estfract, + ndistinct, + stanullfrac, + avgfreq; + bool isdefault; + AttStatsSlot sslot; + + examine_variable(root, hashkey, 0, &vardata); + + /* Look up the frequency of the most common value, if available */ + *mcv_freq = 0.0; + + if (HeapTupleIsValid(vardata.statsTuple)) + { + if (get_attstatsslot(&sslot, vardata.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_NUMBERS)) + { + /* + * The first MCV stat is for the most common value. + */ + if (sslot.nnumbers > 0) + *mcv_freq = sslot.numbers[0]; + free_attstatsslot(&sslot); + } + } + + /* Get number of distinct values */ + ndistinct = get_variable_numdistinct(&vardata, &isdefault); + + /* + * If ndistinct isn't real, punt. We normally return 0.1, but if the + * mcv_freq is known to be even higher than that, use it instead. + */ + if (isdefault) + { + *bucketsize_frac = (Selectivity) Max(0.1, *mcv_freq); + ReleaseVariableStats(vardata); + return; + } + + /* Get fraction that are null */ + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + stanullfrac = stats->stanullfrac; + } + else + stanullfrac = 0.0; + + /* Compute avg freq of all distinct data values in raw relation */ + avgfreq = (1.0 - stanullfrac) / ndistinct; + + /* + * Adjust ndistinct to account for restriction clauses. Observe we are + * assuming that the data distribution is affected uniformly by the + * restriction clauses! + * + * XXX Possibly better way, but much more expensive: multiply by + * selectivity of rel's restriction clauses that mention the target Var. 
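+ *
+ * A worked example with made-up numbers: for a 1,000,000-tuple rel with
+ * 50,000 distinct hash key values, restrictions that keep 100,000 rows
+ * scale ndistinct down to 5,000.  With, say, 8192 buckets that gives an
+ * initial estfract of 1/5000 = 0.0002 (since ndistinct < nbuckets), and
+ * if the most common value is five times as frequent as the average
+ * value, the final bucketsize fraction comes out near 0.001.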
+ */ + if (vardata.rel && vardata.rel->tuples > 0) + { + ndistinct *= vardata.rel->rows / vardata.rel->tuples; + ndistinct = clamp_row_est(ndistinct); + } + + /* + * Initial estimate of bucketsize fraction is 1/nbuckets as long as the + * number of buckets is less than the expected number of distinct values; + * otherwise it is 1/ndistinct. + */ + if (ndistinct > nbuckets) + estfract = 1.0 / nbuckets; + else + estfract = 1.0 / ndistinct; + + /* + * Adjust estimated bucketsize upward to account for skewed distribution. + */ + if (avgfreq > 0.0 && *mcv_freq > avgfreq) + estfract *= *mcv_freq / avgfreq; + + /* + * Clamp bucketsize to sane range (the above adjustment could easily + * produce an out-of-range result). We set the lower bound a little above + * zero, since zero isn't a very sane result. + */ + if (estfract < 1.0e-6) + estfract = 1.0e-6; + else if (estfract > 1.0) + estfract = 1.0; + + *bucketsize_frac = (Selectivity) estfract; + + ReleaseVariableStats(vardata); +} + +/* + * estimate_hashagg_tablesize + * estimate the number of bytes that a hash aggregate hashtable will + * require based on the agg_costs, path width and number of groups. + * + * We return the result as "double" to forestall any possible overflow + * problem in the multiplication by dNumGroups. + * + * XXX this may be over-estimating the size now that hashagg knows to omit + * unneeded columns from the hashtable. Also for mixed-mode grouping sets, + * grouping columns not in the hashed set are counted here even though hashagg + * won't store them. Is this a problem? + */ +double +estimate_hashagg_tablesize(PlannerInfo *root, Path *path, + const AggClauseCosts *agg_costs, double dNumGroups) +{ + Size hashentrysize; + + hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos), + path->pathtarget->width, + agg_costs->transitionSpace); + + /* + * Note that this disregards the effect of fill-factor and growth policy + * of the hash table. That's probably ok, given that the default + * fill-factor is relatively high. It'd be hard to meaningfully factor in + * "double-in-size" growth policies here. + */ + return hashentrysize * dNumGroups; +} + + +/*------------------------------------------------------------------------- + * + * Support routines + * + *------------------------------------------------------------------------- + */ + +/* + * Find applicable ndistinct statistics for the given list of VarInfos (which + * must all belong to the given rel), and update *ndistinct to the estimate of + * the MVNDistinctItem that best matches. If a match it found, *varinfos is + * updated to remove the list of matched varinfos. + * + * Varinfos that aren't for simple Vars are ignored. + * + * Return true if we're able to find a match, false otherwise. + */ +static bool +estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, + List **varinfos, double *ndistinct) +{ + ListCell *lc; + int nmatches_vars; + int nmatches_exprs; + Oid statOid = InvalidOid; + MVNDistinct *stats; + StatisticExtInfo *matched_info = NULL; + RangeTblEntry *rte; + + /* bail out immediately if the table has no extended statistics */ + if (!rel->statlist) + return false; + + /* + * When dealing with regular inheritance trees, ignore extended stats + * (which were built without data from child rels, and thus do not + * represent them). For partitioned tables data there's no data in the + * non-leaf relations, so we build stats only for the inheritance tree. + * So for partitioned tables we do consider extended stats. 
+ */ + rte = planner_rt_fetch(rel->relid, root); + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + /* look for the ndistinct statistics object matching the most vars */ + nmatches_vars = 0; /* we require at least two matches */ + nmatches_exprs = 0; + foreach(lc, rel->statlist) + { + ListCell *lc2; + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); + int nshared_vars = 0; + int nshared_exprs = 0; + + /* skip statistics of other kinds */ + if (info->kind != STATS_EXT_NDISTINCT) + continue; + + /* + * Determine how many expressions (and variables in non-matched + * expressions) match. We'll then use these numbers to pick the + * statistics object that best matches the clauses. + */ + foreach(lc2, *varinfos) + { + ListCell *lc3; + GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2); + AttrNumber attnum; + + Assert(varinfo->rel == rel); + + /* simple Var, search in statistics keys directly */ + if (IsA(varinfo->var, Var)) + { + attnum = ((Var *) varinfo->var)->varattno; + + /* + * Ignore system attributes - we don't support statistics on + * them, so can't match them (and it'd fail as the values are + * negative). + */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + continue; + + if (bms_is_member(attnum, info->keys)) + nshared_vars++; + + continue; + } + + /* expression - see if it's in the statistics object */ + foreach(lc3, info->exprs) + { + Node *expr = (Node *) lfirst(lc3); + + if (equal(varinfo->var, expr)) + { + nshared_exprs++; + break; + } + } + } + + if (nshared_vars + nshared_exprs < 2) + continue; + + /* + * Does this statistics object match more columns than the currently + * best object? If so, use this one instead. + * + * XXX This should break ties using name of the object, or something + * like that, to make the outcome stable. + */ + if ((nshared_exprs > nmatches_exprs) || + (((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars))) + { + statOid = info->statOid; + nmatches_vars = nshared_vars; + nmatches_exprs = nshared_exprs; + matched_info = info; + } + } + + /* No match? */ + if (statOid == InvalidOid) + return false; + + Assert(nmatches_vars + nmatches_exprs > 1); + + stats = statext_ndistinct_load(statOid); + + /* + * If we have a match, search it for the specific item that matches (there + * must be one), and construct the output values. + */ + if (stats) + { + int i; + List *newlist = NIL; + MVNDistinctItem *item = NULL; + ListCell *lc2; + Bitmapset *matched = NULL; + AttrNumber attnum_offset; + + /* + * How much we need to offset the attnums? If there are no + * expressions, no offset is needed. Otherwise offset enough to move + * the lowest one (which is equal to number of expressions) to 1. + */ + if (matched_info->exprs) + attnum_offset = (list_length(matched_info->exprs) + 1); + else + attnum_offset = 0; + + /* see what actually matched */ + foreach(lc2, *varinfos) + { + ListCell *lc3; + int idx; + bool found = false; + + GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2); + + /* + * Process a simple Var expression, by matching it to keys + * directly. If there's a matching expression, we'll try matching + * it later. + */ + if (IsA(varinfo->var, Var)) + { + AttrNumber attnum = ((Var *) varinfo->var)->varattno; + + /* + * Ignore expressions on system attributes. Can't rely on the + * bms check for negative values. + */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + continue; + + /* Is the variable covered by the statistics object? 
*/ + if (!bms_is_member(attnum, matched_info->keys)) + continue; + + attnum = attnum + attnum_offset; + + /* ensure sufficient offset */ + Assert(AttrNumberIsForUserDefinedAttr(attnum)); + + matched = bms_add_member(matched, attnum); + + found = true; + } + + /* + * XXX Maybe we should allow searching the expressions even if we + * found an attribute matching the expression? That would handle + * trivial expressions like "(a)" but it seems fairly useless. + */ + if (found) + continue; + + /* expression - see if it's in the statistics object */ + idx = 0; + foreach(lc3, matched_info->exprs) + { + Node *expr = (Node *) lfirst(lc3); + + if (equal(varinfo->var, expr)) + { + AttrNumber attnum = -(idx + 1); + + attnum = attnum + attnum_offset; + + /* ensure sufficient offset */ + Assert(AttrNumberIsForUserDefinedAttr(attnum)); + + matched = bms_add_member(matched, attnum); + + /* there should be just one matching expression */ + break; + } + + idx++; + } + } + + /* Find the specific item that exactly matches the combination */ + for (i = 0; i < stats->nitems; i++) + { + int j; + MVNDistinctItem *tmpitem = &stats->items[i]; + + if (tmpitem->nattributes != bms_num_members(matched)) + continue; + + /* assume it's the right item */ + item = tmpitem; + + /* check that all item attributes/expressions fit the match */ + for (j = 0; j < tmpitem->nattributes; j++) + { + AttrNumber attnum = tmpitem->attributes[j]; + + /* + * Thanks to how we constructed the matched bitmap above, we + * can just offset all attnums the same way. + */ + attnum = attnum + attnum_offset; + + if (!bms_is_member(attnum, matched)) + { + /* nah, it's not this item */ + item = NULL; + break; + } + } + + /* + * If the item has all the matched attributes, we know it's the + * right one - there can't be a better one. matching more. + */ + if (item) + break; + } + + /* + * Make sure we found an item. There has to be one, because ndistinct + * statistics includes all combinations of attributes. + */ + if (!item) + elog(ERROR, "corrupt MVNDistinct entry"); + + /* Form the output varinfo list, keeping only unmatched ones */ + foreach(lc, *varinfos) + { + GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc); + ListCell *lc3; + bool found = false; + + /* + * Let's look at plain variables first, because it's the most + * common case and the check is quite cheap. We can simply get the + * attnum and check (with an offset) matched bitmap. + */ + if (IsA(varinfo->var, Var)) + { + AttrNumber attnum = ((Var *) varinfo->var)->varattno; + + /* + * If it's a system attribute, we're done. We don't support + * extended statistics on system attributes, so it's clearly + * not matched. Just keep the expression and continue. + */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + { + newlist = lappend(newlist, varinfo); + continue; + } + + /* apply the same offset as above */ + attnum += attnum_offset; + + /* if it's not matched, keep the varinfo */ + if (!bms_is_member(attnum, matched)) + newlist = lappend(newlist, varinfo); + + /* The rest of the loop deals with complex expressions. */ + continue; + } + + /* + * Process complex expressions, not just simple Vars. + * + * First, we search for an exact match of an expression. If we + * find one, we can just discard the whole GroupExprInfo, with all + * the variables we extracted from it. + * + * Otherwise we inspect the individual vars, and try matching it + * to variables in the item. 
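+			 *
+			 * Hypothetical example: given CREATE STATISTICS s (ndistinct)
+			 * ON a, (b + c) FROM t and a GROUP BY a, (b + c), d, the
+			 * varinfos for "a" and "(b + c)" are matched (and dropped from
+			 * the output list), while the varinfo for "d" is kept for
+			 * later estimation.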
+ */ + foreach(lc3, matched_info->exprs) + { + Node *expr = (Node *) lfirst(lc3); + + if (equal(varinfo->var, expr)) + { + found = true; + break; + } + } + + /* found exact match, skip */ + if (found) + continue; + + newlist = lappend(newlist, varinfo); + } + + *varinfos = newlist; + *ndistinct = item->ndistinct; + return true; + } + + return false; +} + +/* + * convert_to_scalar + * Convert non-NULL values of the indicated types to the comparison + * scale needed by scalarineqsel(). + * Returns "true" if successful. + * + * XXX this routine is a hack: ideally we should look up the conversion + * subroutines in pg_type. + * + * All numeric datatypes are simply converted to their equivalent + * "double" values. (NUMERIC values that are outside the range of "double" + * are clamped to +/- HUGE_VAL.) + * + * String datatypes are converted by convert_string_to_scalar(), + * which is explained below. The reason why this routine deals with + * three values at a time, not just one, is that we need it for strings. + * + * The bytea datatype is just enough different from strings that it has + * to be treated separately. + * + * The several datatypes representing absolute times are all converted + * to Timestamp, which is actually an int64, and then we promote that to + * a double. Note this will give correct results even for the "special" + * values of Timestamp, since those are chosen to compare correctly; + * see timestamp_cmp. + * + * The several datatypes representing relative times (intervals) are all + * converted to measurements expressed in seconds. + */ +static bool +convert_to_scalar(Datum value, Oid valuetypid, Oid collid, double *scaledvalue, + Datum lobound, Datum hibound, Oid boundstypid, + double *scaledlobound, double *scaledhibound) +{ + bool failure = false; + + /* + * Both the valuetypid and the boundstypid should exactly match the + * declared input type(s) of the operator we are invoked for. However, + * extensions might try to use scalarineqsel as estimator for operators + * with input type(s) we don't handle here; in such cases, we want to + * return false, not fail. In any case, we mustn't assume that valuetypid + * and boundstypid are identical. + * + * XXX The histogram we are interpolating between points of could belong + * to a column that's only binary-compatible with the declared type. In + * essence we are assuming that the semantics of binary-compatible types + * are enough alike that we can use a histogram generated with one type's + * operators to estimate selectivity for the other's. This is outright + * wrong in some cases --- in particular signed versus unsigned + * interpretation could trip us up. But it's useful enough in the + * majority of cases that we do it anyway. Should think about more + * rigorous ways to do it. 
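+ *
+ * One concrete case of the signed-versus-unsigned hazard: oid and int4
+ * are both 32 bits wide, but oid compares as unsigned while int4 compares
+ * as signed, so values above 2^31 sort differently depending on which
+ * interpretation the histogram was built with.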
+ */ + switch (valuetypid) + { + /* + * Built-in numeric types + */ + case BOOLOID: + case INT2OID: + case INT4OID: + case INT8OID: + case FLOAT4OID: + case FLOAT8OID: + case NUMERICOID: + case OIDOID: + case REGPROCOID: + case REGPROCEDUREOID: + case REGOPEROID: + case REGOPERATOROID: + case REGCLASSOID: + case REGTYPEOID: + case REGCOLLATIONOID: + case REGCONFIGOID: + case REGDICTIONARYOID: + case REGROLEOID: + case REGNAMESPACEOID: + *scaledvalue = convert_numeric_to_scalar(value, valuetypid, + &failure); + *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid, + &failure); + *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid, + &failure); + return !failure; + + /* + * Built-in string types + */ + case CHAROID: + case BPCHAROID: + case VARCHAROID: + case TEXTOID: + case NAMEOID: + { + char *valstr = convert_string_datum(value, valuetypid, + collid, &failure); + char *lostr = convert_string_datum(lobound, boundstypid, + collid, &failure); + char *histr = convert_string_datum(hibound, boundstypid, + collid, &failure); + + /* + * Bail out if any of the values is not of string type. We + * might leak converted strings for the other value(s), but + * that's not worth troubling over. + */ + if (failure) + return false; + + convert_string_to_scalar(valstr, scaledvalue, + lostr, scaledlobound, + histr, scaledhibound); + pfree(valstr); + pfree(lostr); + pfree(histr); + return true; + } + + /* + * Built-in bytea type + */ + case BYTEAOID: + { + /* We only support bytea vs bytea comparison */ + if (boundstypid != BYTEAOID) + return false; + convert_bytea_to_scalar(value, scaledvalue, + lobound, scaledlobound, + hibound, scaledhibound); + return true; + } + + /* + * Built-in time types + */ + case TIMESTAMPOID: + case TIMESTAMPTZOID: + case DATEOID: + case INTERVALOID: + case TIMEOID: + case TIMETZOID: + *scaledvalue = convert_timevalue_to_scalar(value, valuetypid, + &failure); + *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid, + &failure); + *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid, + &failure); + return !failure; + + /* + * Built-in network types + */ + case INETOID: + case CIDROID: + case MACADDROID: + case MACADDR8OID: + *scaledvalue = convert_network_to_scalar(value, valuetypid, + &failure); + *scaledlobound = convert_network_to_scalar(lobound, boundstypid, + &failure); + *scaledhibound = convert_network_to_scalar(hibound, boundstypid, + &failure); + return !failure; + } + /* Don't know how to convert */ + *scaledvalue = *scaledlobound = *scaledhibound = 0; + return false; +} + +/* + * Do convert_to_scalar()'s work for any numeric data type. + * + * On failure (e.g., unsupported typid), set *failure to true; + * otherwise, that variable is not changed. 
+ */ +static double +convert_numeric_to_scalar(Datum value, Oid typid, bool *failure) +{ + switch (typid) + { + case BOOLOID: + return (double) DatumGetBool(value); + case INT2OID: + return (double) DatumGetInt16(value); + case INT4OID: + return (double) DatumGetInt32(value); + case INT8OID: + return (double) DatumGetInt64(value); + case FLOAT4OID: + return (double) DatumGetFloat4(value); + case FLOAT8OID: + return (double) DatumGetFloat8(value); + case NUMERICOID: + /* Note: out-of-range values will be clamped to +-HUGE_VAL */ + return (double) + DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow, + value)); + case OIDOID: + case REGPROCOID: + case REGPROCEDUREOID: + case REGOPEROID: + case REGOPERATOROID: + case REGCLASSOID: + case REGTYPEOID: + case REGCOLLATIONOID: + case REGCONFIGOID: + case REGDICTIONARYOID: + case REGROLEOID: + case REGNAMESPACEOID: + /* we can treat OIDs as integers... */ + return (double) DatumGetObjectId(value); + } + + *failure = true; + return 0; +} + +/* + * Do convert_to_scalar()'s work for any character-string data type. + * + * String datatypes are converted to a scale that ranges from 0 to 1, + * where we visualize the bytes of the string as fractional digits. + * + * We do not want the base to be 256, however, since that tends to + * generate inflated selectivity estimates; few databases will have + * occurrences of all 256 possible byte values at each position. + * Instead, use the smallest and largest byte values seen in the bounds + * as the estimated range for each byte, after some fudging to deal with + * the fact that we probably aren't going to see the full range that way. + * + * An additional refinement is that we discard any common prefix of the + * three strings before computing the scaled values. This allows us to + * "zoom in" when we encounter a narrow data range. An example is a phone + * number database where all the values begin with the same area code. + * (Actually, the bounds will be adjacent histogram-bin-boundary values, + * so this is more likely to happen than you might think.) + */ +static void +convert_string_to_scalar(char *value, + double *scaledvalue, + char *lobound, + double *scaledlobound, + char *hibound, + double *scaledhibound) +{ + int rangelo, + rangehi; + char *sptr; + + rangelo = rangehi = (unsigned char) hibound[0]; + for (sptr = lobound; *sptr; sptr++) + { + if (rangelo > (unsigned char) *sptr) + rangelo = (unsigned char) *sptr; + if (rangehi < (unsigned char) *sptr) + rangehi = (unsigned char) *sptr; + } + for (sptr = hibound; *sptr; sptr++) + { + if (rangelo > (unsigned char) *sptr) + rangelo = (unsigned char) *sptr; + if (rangehi < (unsigned char) *sptr) + rangehi = (unsigned char) *sptr; + } + /* If range includes any upper-case ASCII chars, make it include all */ + if (rangelo <= 'Z' && rangehi >= 'A') + { + if (rangelo > 'A') + rangelo = 'A'; + if (rangehi < 'Z') + rangehi = 'Z'; + } + /* Ditto lower-case */ + if (rangelo <= 'z' && rangehi >= 'a') + { + if (rangelo > 'a') + rangelo = 'a'; + if (rangehi < 'z') + rangehi = 'z'; + } + /* Ditto digits */ + if (rangelo <= '9' && rangehi >= '0') + { + if (rangelo > '0') + rangelo = '0'; + if (rangehi < '9') + rangehi = '9'; + } + + /* + * If range includes less than 10 chars, assume we have not got enough + * data, and make it include regular ASCII set. + */ + if (rangehi - rangelo < 9) + { + rangelo = ' '; + rangehi = 127; + } + + /* + * Now strip any common prefix of the three strings. 
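+	 *
+	 * For illustration (invented values): with lobound "555-1234",
+	 * hibound "555-9876" and value "555-4321", the byte range computed
+	 * above runs from '-' to '9', the shared prefix "555-" is stripped
+	 * by the loop below, and the remaining suffixes are mapped into
+	 * [0, 1) by convert_one_string_to_scalar().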
+ */ + while (*lobound) + { + if (*lobound != *hibound || *lobound != *value) + break; + lobound++, hibound++, value++; + } + + /* + * Now we can do the conversions. + */ + *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi); + *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi); + *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi); +} + +static double +convert_one_string_to_scalar(char *value, int rangelo, int rangehi) +{ + int slen = strlen(value); + double num, + denom, + base; + + if (slen <= 0) + return 0.0; /* empty string has scalar value 0 */ + + /* + * There seems little point in considering more than a dozen bytes from + * the string. Since base is at least 10, that will give us nominal + * resolution of at least 12 decimal digits, which is surely far more + * precision than this estimation technique has got anyway (especially in + * non-C locales). Also, even with the maximum possible base of 256, this + * ensures denom cannot grow larger than 256^13 = 2.03e31, which will not + * overflow on any known machine. + */ + if (slen > 12) + slen = 12; + + /* Convert initial characters to fraction */ + base = rangehi - rangelo + 1; + num = 0.0; + denom = base; + while (slen-- > 0) + { + int ch = (unsigned char) *value++; + + if (ch < rangelo) + ch = rangelo - 1; + else if (ch > rangehi) + ch = rangehi + 1; + num += ((double) (ch - rangelo)) / denom; + denom *= base; + } + + return num; +} + +/* + * Convert a string-type Datum into a palloc'd, null-terminated string. + * + * On failure (e.g., unsupported typid), set *failure to true; + * otherwise, that variable is not changed. (We'll return NULL on failure.) + * + * When using a non-C locale, we must pass the string through strxfrm() + * before continuing, so as to generate correct locale-specific results. + */ +static char * +convert_string_datum(Datum value, Oid typid, Oid collid, bool *failure) +{ + char *val; + + switch (typid) + { + case CHAROID: + val = (char *) palloc(2); + val[0] = DatumGetChar(value); + val[1] = '\0'; + break; + case BPCHAROID: + case VARCHAROID: + case TEXTOID: + val = TextDatumGetCString(value); + break; + case NAMEOID: + { + NameData *nm = (NameData *) DatumGetPointer(value); + + val = pstrdup(NameStr(*nm)); + break; + } + default: + *failure = true; + return NULL; + } + + if (!lc_collate_is_c(collid)) + { + char *xfrmstr; + size_t xfrmlen; + size_t xfrmlen2 PG_USED_FOR_ASSERTS_ONLY; + + /* + * XXX: We could guess at a suitable output buffer size and only call + * strxfrm twice if our guess is too small. + * + * XXX: strxfrm doesn't support UTF-8 encoding on Win32, it can return + * bogus data or set an error. This is not really a problem unless it + * crashes since it will only give an estimation error and nothing + * fatal. + */ + xfrmlen = strxfrm(NULL, val, 0); +#ifdef WIN32 + + /* + * On Windows, strxfrm returns INT_MAX when an error occurs. Instead + * of trying to allocate this much memory (and fail), just return the + * original string unmodified as if we were in the C locale. + */ + if (xfrmlen == INT_MAX) + return val; +#endif + xfrmstr = (char *) palloc(xfrmlen + 1); + xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1); + + /* + * Some systems (e.g., glibc) can return a smaller value from the + * second call than the first; thus the Assert must be <= not ==. + */ + Assert(xfrmlen2 <= xfrmlen); + pfree(val); + val = xfrmstr; + } + + return val; +} + +/* + * Do convert_to_scalar()'s work for any bytea data type. 
+ * + * Very similar to convert_string_to_scalar except we can't assume + * null-termination and therefore pass explicit lengths around. + * + * Also, assumptions about likely "normal" ranges of characters have been + * removed - a data range of 0..255 is always used, for now. (Perhaps + * someday we will add information about actual byte data range to + * pg_statistic.) + */ +static void +convert_bytea_to_scalar(Datum value, + double *scaledvalue, + Datum lobound, + double *scaledlobound, + Datum hibound, + double *scaledhibound) +{ + bytea *valuep = DatumGetByteaPP(value); + bytea *loboundp = DatumGetByteaPP(lobound); + bytea *hiboundp = DatumGetByteaPP(hibound); + int rangelo, + rangehi, + valuelen = VARSIZE_ANY_EXHDR(valuep), + loboundlen = VARSIZE_ANY_EXHDR(loboundp), + hiboundlen = VARSIZE_ANY_EXHDR(hiboundp), + i, + minlen; + unsigned char *valstr = (unsigned char *) VARDATA_ANY(valuep); + unsigned char *lostr = (unsigned char *) VARDATA_ANY(loboundp); + unsigned char *histr = (unsigned char *) VARDATA_ANY(hiboundp); + + /* + * Assume bytea data is uniformly distributed across all byte values. + */ + rangelo = 0; + rangehi = 255; + + /* + * Now strip any common prefix of the three strings. + */ + minlen = Min(Min(valuelen, loboundlen), hiboundlen); + for (i = 0; i < minlen; i++) + { + if (*lostr != *histr || *lostr != *valstr) + break; + lostr++, histr++, valstr++; + loboundlen--, hiboundlen--, valuelen--; + } + + /* + * Now we can do the conversions. + */ + *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi); + *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi); + *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi); +} + +static double +convert_one_bytea_to_scalar(unsigned char *value, int valuelen, + int rangelo, int rangehi) +{ + double num, + denom, + base; + + if (valuelen <= 0) + return 0.0; /* empty string has scalar value 0 */ + + /* + * Since base is 256, need not consider more than about 10 chars (even + * this many seems like overkill) + */ + if (valuelen > 10) + valuelen = 10; + + /* Convert initial characters to fraction */ + base = rangehi - rangelo + 1; + num = 0.0; + denom = base; + while (valuelen-- > 0) + { + int ch = *value++; + + if (ch < rangelo) + ch = rangelo - 1; + else if (ch > rangehi) + ch = rangehi + 1; + num += ((double) (ch - rangelo)) / denom; + denom *= base; + } + + return num; +} + +/* + * Do convert_to_scalar()'s work for any timevalue data type. + * + * On failure (e.g., unsupported typid), set *failure to true; + * otherwise, that variable is not changed. + */ +static double +convert_timevalue_to_scalar(Datum value, Oid typid, bool *failure) +{ + switch (typid) + { + case TIMESTAMPOID: + return DatumGetTimestamp(value); + case TIMESTAMPTZOID: + return DatumGetTimestampTz(value); + case DATEOID: + return date2timestamp_no_overflow(DatumGetDateADT(value)); + case INTERVALOID: + { + Interval *interval = DatumGetIntervalP(value); + + /* + * Convert the month part of Interval to days using assumed + * average month length of 365.25/12.0 days. Not too + * accurate, but plenty good enough for our purposes. 
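+				 *
+				 * For example, an interval of '1 mon 2 days 03:00:00'
+				 * comes out as its time part (3 hours, already in
+				 * microseconds) plus 2 * USECS_PER_DAY plus
+				 * 1 * (365.25 / 12.0) * USECS_PER_DAY.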
+ */ + return interval->time + interval->day * (double) USECS_PER_DAY + + interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY); + } + case TIMEOID: + return DatumGetTimeADT(value); + case TIMETZOID: + { + TimeTzADT *timetz = DatumGetTimeTzADTP(value); + + /* use GMT-equivalent time */ + return (double) (timetz->time + (timetz->zone * 1000000.0)); + } + } + + *failure = true; + return 0; +} + + +/* + * get_restriction_variable + * Examine the args of a restriction clause to see if it's of the + * form (variable op pseudoconstant) or (pseudoconstant op variable), + * where "variable" could be either a Var or an expression in vars of a + * single relation. If so, extract information about the variable, + * and also indicate which side it was on and the other argument. + * + * Inputs: + * root: the planner info + * args: clause argument list + * varRelid: see specs for restriction selectivity functions + * + * Outputs: (these are valid only if true is returned) + * *vardata: gets information about variable (see examine_variable) + * *other: gets other clause argument, aggressively reduced to a constant + * *varonleft: set true if variable is on the left, false if on the right + * + * Returns true if a variable is identified, otherwise false. + * + * Note: if there are Vars on both sides of the clause, we must fail, because + * callers are expecting that the other side will act like a pseudoconstant. + */ +bool +get_restriction_variable(PlannerInfo *root, List *args, int varRelid, + VariableStatData *vardata, Node **other, + bool *varonleft) +{ + Node *left, + *right; + VariableStatData rdata; + + /* Fail if not a binary opclause (probably shouldn't happen) */ + if (list_length(args) != 2) + return false; + + left = (Node *) linitial(args); + right = (Node *) lsecond(args); + + /* + * Examine both sides. Note that when varRelid is nonzero, Vars of other + * relations will be treated as pseudoconstants. + */ + examine_variable(root, left, varRelid, vardata); + examine_variable(root, right, varRelid, &rdata); + + /* + * If one side is a variable and the other not, we win. + */ + if (vardata->rel && rdata.rel == NULL) + { + *varonleft = true; + *other = estimate_expression_value(root, rdata.var); + /* Assume we need no ReleaseVariableStats(rdata) here */ + return true; + } + + if (vardata->rel == NULL && rdata.rel) + { + *varonleft = false; + *other = estimate_expression_value(root, vardata->var); + /* Assume we need no ReleaseVariableStats(*vardata) here */ + *vardata = rdata; + return true; + } + + /* Oops, clause has wrong structure (probably var op var) */ + ReleaseVariableStats(*vardata); + ReleaseVariableStats(rdata); + + return false; +} + +/* + * get_join_variables + * Apply examine_variable() to each side of a join clause. + * Also, attempt to identify whether the join clause has the same + * or reversed sense compared to the SpecialJoinInfo. + * + * We consider the join clause "normal" if it is "lhs_var OP rhs_var", + * or "reversed" if it is "rhs_var OP lhs_var". In complicated cases + * where we can't tell for sure, we default to assuming it's normal. 
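+ *
+ * For instance, given "a JOIN b ON b.y = a.x" where the SpecialJoinInfo
+ * has "a" on the left and "b" on the right, the clause is reported as
+ * reversed because its left-hand Var (b.y) comes from the join's
+ * right-hand side.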
+ */ +void +get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo, + VariableStatData *vardata1, VariableStatData *vardata2, + bool *join_is_reversed) +{ + Node *left, + *right; + + if (list_length(args) != 2) + elog(ERROR, "join operator should take two arguments"); + + left = (Node *) linitial(args); + right = (Node *) lsecond(args); + + examine_variable(root, left, 0, vardata1); + examine_variable(root, right, 0, vardata2); + + if (vardata1->rel && + bms_is_subset(vardata1->rel->relids, sjinfo->syn_righthand)) + *join_is_reversed = true; /* var1 is on RHS */ + else if (vardata2->rel && + bms_is_subset(vardata2->rel->relids, sjinfo->syn_lefthand)) + *join_is_reversed = true; /* var2 is on LHS */ + else + *join_is_reversed = false; +} + +/* statext_expressions_load copies the tuple, so just pfree it. */ +static void +ReleaseDummy(HeapTuple tuple) +{ + pfree(tuple); +} + +/* + * examine_variable + * Try to look up statistical data about an expression. + * Fill in a VariableStatData struct to describe the expression. + * + * Inputs: + * root: the planner info + * node: the expression tree to examine + * varRelid: see specs for restriction selectivity functions + * + * Outputs: *vardata is filled as follows: + * var: the input expression (with any binary relabeling stripped, if + * it is or contains a variable; but otherwise the type is preserved) + * rel: RelOptInfo for relation containing variable; NULL if expression + * contains no Vars (NOTE this could point to a RelOptInfo of a + * subquery, not one in the current query). + * statsTuple: the pg_statistic entry for the variable, if one exists; + * otherwise NULL. + * freefunc: pointer to a function to release statsTuple with. + * vartype: exposed type of the expression; this should always match + * the declared input type of the operator we are estimating for. + * atttype, atttypmod: actual type/typmod of the "var" expression. This is + * commonly the same as the exposed type of the variable argument, + * but can be different in binary-compatible-type cases. + * isunique: true if we were able to match the var to a unique index or a + * single-column DISTINCT clause, implying its values are unique for + * this query. (Caution: this should be trusted for statistical + * purposes only, since we do not check indimmediate nor verify that + * the exact same definition of equality applies.) + * acl_ok: true if current user has permission to read the column(s) + * underlying the pg_statistic entry. This is consulted by + * statistic_proc_security_check(). + * + * Caller is responsible for doing ReleaseVariableStats() before exiting. 
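+ *
+ * A typical call pattern (sketch based on callers in this file):
+ *
+ *		VariableStatData vardata;
+ *
+ *		examine_variable(root, node, varRelid, &vardata);
+ *		if (HeapTupleIsValid(vardata.statsTuple))
+ *			... consult the pg_statistic data ...
+ *		ReleaseVariableStats(vardata);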
+ */ +void +examine_variable(PlannerInfo *root, Node *node, int varRelid, + VariableStatData *vardata) +{ + Node *basenode; + Relids varnos; + RelOptInfo *onerel; + + /* Make sure we don't return dangling pointers in vardata */ + MemSet(vardata, 0, sizeof(VariableStatData)); + + /* Save the exposed type of the expression */ + vardata->vartype = exprType(node); + + /* Look inside any binary-compatible relabeling */ + + if (IsA(node, RelabelType)) + basenode = (Node *) ((RelabelType *) node)->arg; + else + basenode = node; + + /* Fast path for a simple Var */ + + if (IsA(basenode, Var) && + (varRelid == 0 || varRelid == ((Var *) basenode)->varno)) + { + Var *var = (Var *) basenode; + + /* Set up result fields other than the stats tuple */ + vardata->var = basenode; /* return Var without relabeling */ + vardata->rel = find_base_rel(root, var->varno); + vardata->atttype = var->vartype; + vardata->atttypmod = var->vartypmod; + vardata->isunique = has_unique_index(vardata->rel, var->varattno); + + /* Try to locate some stats */ + examine_simple_variable(root, var, vardata); + + return; + } + + /* + * Okay, it's a more complicated expression. Determine variable + * membership. Note that when varRelid isn't zero, only vars of that + * relation are considered "real" vars. + */ + varnos = pull_varnos(root, basenode); + + onerel = NULL; + + switch (bms_membership(varnos)) + { + case BMS_EMPTY_SET: + /* No Vars at all ... must be pseudo-constant clause */ + break; + case BMS_SINGLETON: + if (varRelid == 0 || bms_is_member(varRelid, varnos)) + { + onerel = find_base_rel(root, + (varRelid ? varRelid : bms_singleton_member(varnos))); + vardata->rel = onerel; + node = basenode; /* strip any relabeling */ + } + /* else treat it as a constant */ + break; + case BMS_MULTIPLE: + if (varRelid == 0) + { + /* treat it as a variable of a join relation */ + vardata->rel = find_join_rel(root, varnos); + node = basenode; /* strip any relabeling */ + } + else if (bms_is_member(varRelid, varnos)) + { + /* ignore the vars belonging to other relations */ + vardata->rel = find_base_rel(root, varRelid); + node = basenode; /* strip any relabeling */ + /* note: no point in expressional-index search here */ + } + /* else treat it as a constant */ + break; + } + + bms_free(varnos); + + vardata->var = node; + vardata->atttype = exprType(node); + vardata->atttypmod = exprTypmod(node); + + if (onerel) + { + /* + * We have an expression in vars of a single relation. Try to match + * it to expressional index columns, in hopes of finding some + * statistics. + * + * Note that we consider all index columns including INCLUDE columns, + * since there could be stats for such columns. But the test for + * uniqueness needs to be warier. + * + * XXX it's conceivable that there are multiple matches with different + * index opfamilies; if so, we need to pick one that matches the + * operator we are estimating for. FIXME later. + */ + ListCell *ilist; + ListCell *slist; + + foreach(ilist, onerel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist); + ListCell *indexpr_item; + int pos; + + indexpr_item = list_head(index->indexprs); + if (indexpr_item == NULL) + continue; /* no expressions here... 
*/ + + for (pos = 0; pos < index->ncolumns; pos++) + { + if (index->indexkeys[pos] == 0) + { + Node *indexkey; + + if (indexpr_item == NULL) + elog(ERROR, "too few entries in indexprs list"); + indexkey = (Node *) lfirst(indexpr_item); + if (indexkey && IsA(indexkey, RelabelType)) + indexkey = (Node *) ((RelabelType *) indexkey)->arg; + if (equal(node, indexkey)) + { + /* + * Found a match ... is it a unique index? Tests here + * should match has_unique_index(). + */ + if (index->unique && + index->nkeycolumns == 1 && + pos == 0 && + (index->indpred == NIL || index->predOK)) + vardata->isunique = true; + + /* + * Has it got stats? We only consider stats for + * non-partial indexes, since partial indexes probably + * don't reflect whole-relation statistics; the above + * check for uniqueness is the only info we take from + * a partial index. + * + * An index stats hook, however, must make its own + * decisions about what to do with partial indexes. + */ + if (get_index_stats_hook && + (*get_index_stats_hook) (root, index->indexoid, + pos + 1, vardata)) + { + /* + * The hook took control of acquiring a stats + * tuple. If it did supply a tuple, it'd better + * have supplied a freefunc. + */ + if (HeapTupleIsValid(vardata->statsTuple) && + !vardata->freefunc) + elog(ERROR, "no function provided to release variable stats with"); + } + else if (index->indpred == NIL) + { + vardata->statsTuple = + SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(index->indexoid), + Int16GetDatum(pos + 1), + BoolGetDatum(false)); + vardata->freefunc = ReleaseSysCache; + + if (HeapTupleIsValid(vardata->statsTuple)) + { + /* Get index's table for permission check */ + RangeTblEntry *rte; + Oid userid; + + rte = planner_rt_fetch(index->rel->relid, root); + Assert(rte->rtekind == RTE_RELATION); + + /* + * Use checkAsUser if it's set, in case we're + * accessing the table via a view. + */ + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + /* + * For simplicity, we insist on the whole + * table being selectable, rather than trying + * to identify which column(s) the index + * depends on. Also require all rows to be + * selectable --- there must be no + * securityQuals from security barrier views + * or RLS policies. + */ + vardata->acl_ok = + rte->securityQuals == NIL && + (pg_class_aclcheck(rte->relid, userid, + ACL_SELECT) == ACLCHECK_OK); + + /* + * If the user doesn't have permissions to + * access an inheritance child relation, check + * the permissions of the table actually + * mentioned in the query, since most likely + * the user does have that permission. Note + * that whole-table select privilege on the + * parent doesn't quite guarantee that the + * user could read all columns of the child. + * But in practice it's unlikely that any + * interesting security violation could result + * from allowing access to the expression + * index's stats, so we allow it anyway. See + * similar code in examine_simple_variable() + * for additional comments. + */ + if (!vardata->acl_ok && + root->append_rel_array != NULL) + { + AppendRelInfo *appinfo; + Index varno = index->rel->relid; + + appinfo = root->append_rel_array[varno]; + while (appinfo && + planner_rt_fetch(appinfo->parent_relid, + root)->rtekind == RTE_RELATION) + { + varno = appinfo->parent_relid; + appinfo = root->append_rel_array[varno]; + } + if (varno != index->rel->relid) + { + /* Repeat access check on this rel */ + rte = planner_rt_fetch(varno, root); + Assert(rte->rtekind == RTE_RELATION); + + userid = rte->checkAsUser ? 
rte->checkAsUser : GetUserId(); + + vardata->acl_ok = + rte->securityQuals == NIL && + (pg_class_aclcheck(rte->relid, + userid, + ACL_SELECT) == ACLCHECK_OK); + } + } + } + else + { + /* suppress leakproofness checks later */ + vardata->acl_ok = true; + } + } + if (vardata->statsTuple) + break; + } + indexpr_item = lnext(index->indexprs, indexpr_item); + } + } + if (vardata->statsTuple) + break; + } + + /* + * Search extended statistics for one with a matching expression. + * There might be multiple ones, so just grab the first one. In the + * future, we might consider the statistics target (and pick the most + * accurate statistics) and maybe some other parameters. + */ + foreach(slist, onerel->statlist) + { + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(slist); + RangeTblEntry *rte = planner_rt_fetch(onerel->relid, root); + ListCell *expr_item; + int pos; + + /* + * Stop once we've found statistics for the expression (either + * from extended stats, or for an index in the preceding loop). + */ + if (vardata->statsTuple) + break; + + /* + * When dealing with regular inheritance trees, ignore extended + * stats (which were built without data from child rels, and thus + * do not represent them). For partitioned tables data there's no + * data in the non-leaf relations, so we build stats only for the + * inheritance tree. So for partitioned tables we do consider + * extended stats. + */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + break; + + /* skip stats without per-expression stats */ + if (info->kind != STATS_EXT_EXPRESSIONS) + continue; + + pos = 0; + foreach(expr_item, info->exprs) + { + Node *expr = (Node *) lfirst(expr_item); + + Assert(expr); + + /* strip RelabelType before comparing it */ + if (expr && IsA(expr, RelabelType)) + expr = (Node *) ((RelabelType *) expr)->arg; + + /* found a match, see if we can extract pg_statistic row */ + if (equal(node, expr)) + { + HeapTuple t = statext_expressions_load(info->statOid, pos); + + /* Get statistics object's table for permission check */ + RangeTblEntry *rte; + Oid userid; + + vardata->statsTuple = t; + + /* + * XXX Not sure if we should cache the tuple somewhere. + * Now we just create a new copy every time. + */ + vardata->freefunc = ReleaseDummy; + + rte = planner_rt_fetch(onerel->relid, root); + Assert(rte->rtekind == RTE_RELATION); + + /* + * Use checkAsUser if it's set, in case we're accessing + * the table via a view. + */ + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + /* + * For simplicity, we insist on the whole table being + * selectable, rather than trying to identify which + * column(s) the statistics object depends on. Also + * require all rows to be selectable --- there must be no + * securityQuals from security barrier views or RLS + * policies. + */ + vardata->acl_ok = + rte->securityQuals == NIL && + (pg_class_aclcheck(rte->relid, userid, + ACL_SELECT) == ACLCHECK_OK); + + /* + * If the user doesn't have permissions to access an + * inheritance child relation, check the permissions of + * the table actually mentioned in the query, since most + * likely the user does have that permission. Note that + * whole-table select privilege on the parent doesn't + * quite guarantee that the user could read all columns of + * the child. But in practice it's unlikely that any + * interesting security violation could result from + * allowing access to the expression stats, so we allow it + * anyway. See similar code in examine_simple_variable() + * for additional comments. 
+ */ + if (!vardata->acl_ok && + root->append_rel_array != NULL) + { + AppendRelInfo *appinfo; + Index varno = onerel->relid; + + appinfo = root->append_rel_array[varno]; + while (appinfo && + planner_rt_fetch(appinfo->parent_relid, + root)->rtekind == RTE_RELATION) + { + varno = appinfo->parent_relid; + appinfo = root->append_rel_array[varno]; + } + if (varno != onerel->relid) + { + /* Repeat access check on this rel */ + rte = planner_rt_fetch(varno, root); + Assert(rte->rtekind == RTE_RELATION); + + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + vardata->acl_ok = + rte->securityQuals == NIL && + (pg_class_aclcheck(rte->relid, + userid, + ACL_SELECT) == ACLCHECK_OK); + } + } + + break; + } + + pos++; + } + } + } +} + +/* + * examine_simple_variable + * Handle a simple Var for examine_variable + * + * This is split out as a subroutine so that we can recurse to deal with + * Vars referencing subqueries. + * + * We already filled in all the fields of *vardata except for the stats tuple. + */ +static void +examine_simple_variable(PlannerInfo *root, Var *var, + VariableStatData *vardata) +{ + RangeTblEntry *rte = root->simple_rte_array[var->varno]; + + Assert(IsA(rte, RangeTblEntry)); + + if (get_relation_stats_hook && + (*get_relation_stats_hook) (root, rte, var->varattno, vardata)) + { + /* + * The hook took control of acquiring a stats tuple. If it did supply + * a tuple, it'd better have supplied a freefunc. + */ + if (HeapTupleIsValid(vardata->statsTuple) && + !vardata->freefunc) + elog(ERROR, "no function provided to release variable stats with"); + } + else if (rte->rtekind == RTE_RELATION) + { + /* + * Plain table or parent of an inheritance appendrel, so look up the + * column in pg_statistic + */ + vardata->statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(rte->relid), + Int16GetDatum(var->varattno), + BoolGetDatum(rte->inh)); + vardata->freefunc = ReleaseSysCache; + + if (HeapTupleIsValid(vardata->statsTuple)) + { + Oid userid; + + /* + * Check if user has permission to read this column. We require + * all rows to be accessible, so there must be no securityQuals + * from security barrier views or RLS policies. Use checkAsUser + * if it's set, in case we're accessing the table via a view. + */ + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + vardata->acl_ok = + rte->securityQuals == NIL && + ((pg_class_aclcheck(rte->relid, userid, + ACL_SELECT) == ACLCHECK_OK) || + (pg_attribute_aclcheck(rte->relid, var->varattno, userid, + ACL_SELECT) == ACLCHECK_OK)); + + /* + * If the user doesn't have permissions to access an inheritance + * child relation or specifically this attribute, check the + * permissions of the table/column actually mentioned in the + * query, since most likely the user does have that permission + * (else the query will fail at runtime), and if the user can read + * the column there then he can get the values of the child table + * too. To do that, we must find out which of the root parent's + * attributes the child relation's attribute corresponds to. + */ + if (!vardata->acl_ok && var->varattno > 0 && + root->append_rel_array != NULL) + { + AppendRelInfo *appinfo; + Index varno = var->varno; + int varattno = var->varattno; + bool found = false; + + appinfo = root->append_rel_array[varno]; + + /* + * Partitions are mapped to their immediate parent, not the + * root parent, so must be ready to walk up multiple + * AppendRelInfos. 
But stop if we hit a parent that is not + * RTE_RELATION --- that's a flattened UNION ALL subquery, not + * an inheritance parent. + */ + while (appinfo && + planner_rt_fetch(appinfo->parent_relid, + root)->rtekind == RTE_RELATION) + { + int parent_varattno; + + found = false; + if (varattno <= 0 || varattno > appinfo->num_child_cols) + break; /* safety check */ + parent_varattno = appinfo->parent_colnos[varattno - 1]; + if (parent_varattno == 0) + break; /* Var is local to child */ + + varno = appinfo->parent_relid; + varattno = parent_varattno; + found = true; + + /* If the parent is itself a child, continue up. */ + appinfo = root->append_rel_array[varno]; + } + + /* + * In rare cases, the Var may be local to the child table, in + * which case, we've got to live with having no access to this + * column's stats. + */ + if (!found) + return; + + /* Repeat the access check on this parent rel & column */ + rte = planner_rt_fetch(varno, root); + Assert(rte->rtekind == RTE_RELATION); + + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + vardata->acl_ok = + rte->securityQuals == NIL && + ((pg_class_aclcheck(rte->relid, userid, + ACL_SELECT) == ACLCHECK_OK) || + (pg_attribute_aclcheck(rte->relid, varattno, userid, + ACL_SELECT) == ACLCHECK_OK)); + } + } + else + { + /* suppress any possible leakproofness checks later */ + vardata->acl_ok = true; + } + } + else if (rte->rtekind == RTE_SUBQUERY && !rte->inh) + { + /* + * Plain subquery (not one that was converted to an appendrel). + */ + Query *subquery = rte->subquery; + RelOptInfo *rel; + TargetEntry *ste; + + /* + * Punt if it's a whole-row var rather than a plain column reference. + */ + if (var->varattno == InvalidAttrNumber) + return; + + /* + * Punt if subquery uses set operations or GROUP BY, as these will + * mash underlying columns' stats beyond recognition. (Set ops are + * particularly nasty; if we forged ahead, we would return stats + * relevant to only the leftmost subselect...) DISTINCT is also + * problematic, but we check that later because there is a possibility + * of learning something even with it. + */ + if (subquery->setOperations || + subquery->groupClause || + subquery->groupingSets) + return; + + /* + * OK, fetch RelOptInfo for subquery. Note that we don't change the + * rel returned in vardata, since caller expects it to be a rel of the + * caller's query level. Because we might already be recursing, we + * can't use that rel pointer either, but have to look up the Var's + * rel afresh. + */ + rel = find_base_rel(root, var->varno); + + /* If the subquery hasn't been planned yet, we have to punt */ + if (rel->subroot == NULL) + return; + Assert(IsA(rel->subroot, PlannerInfo)); + + /* + * Switch our attention to the subquery as mangled by the planner. It + * was okay to look at the pre-planning version for the tests above, + * but now we need a Var that will refer to the subroot's live + * RelOptInfos. For instance, if any subquery pullup happened during + * planning, Vars in the targetlist might have gotten replaced, and we + * need to see the replacement expressions. 
+ */ + subquery = rel->subroot->parse; + Assert(IsA(subquery, Query)); + + /* Get the subquery output expression referenced by the upper Var */ + ste = get_tle_by_resno(subquery->targetList, var->varattno); + if (ste == NULL || ste->resjunk) + elog(ERROR, "subquery %s does not have attribute %d", + rte->eref->aliasname, var->varattno); + var = (Var *) ste->expr; + + /* + * If subquery uses DISTINCT, we can't make use of any stats for the + * variable ... but, if it's the only DISTINCT column, we are entitled + * to consider it unique. We do the test this way so that it works + * for cases involving DISTINCT ON. + */ + if (subquery->distinctClause) + { + if (list_length(subquery->distinctClause) == 1 && + targetIsInSortList(ste, InvalidOid, subquery->distinctClause)) + vardata->isunique = true; + /* cannot go further */ + return; + } + + /* + * If the sub-query originated from a view with the security_barrier + * attribute, we must not look at the variable's statistics, though it + * seems all right to notice the existence of a DISTINCT clause. So + * stop here. + * + * This is probably a harsher restriction than necessary; it's + * certainly OK for the selectivity estimator (which is a C function, + * and therefore omnipotent anyway) to look at the statistics. But + * many selectivity estimators will happily *invoke the operator + * function* to try to work out a good estimate - and that's not OK. + * So for now, don't dig down for stats. + */ + if (rte->security_barrier) + return; + + /* Can only handle a simple Var of subquery's query level */ + if (var && IsA(var, Var) && + var->varlevelsup == 0) + { + /* + * OK, recurse into the subquery. Note that the original setting + * of vardata->isunique (which will surely be false) is left + * unchanged in this situation. That's what we want, since even + * if the underlying column is unique, the subquery may have + * joined to other tables in a way that creates duplicates. + */ + examine_simple_variable(rel->subroot, var, vardata); + } + } + else + { + /* + * Otherwise, the Var comes from a FUNCTION, VALUES, or CTE RTE. (We + * won't see RTE_JOIN here because join alias Vars have already been + * flattened.) There's not much we can do with function outputs, but + * maybe someday try to be smarter about VALUES and/or CTEs. + */ + } +} + +/* + * Check whether it is permitted to call func_oid passing some of the + * pg_statistic data in vardata. We allow this either if the user has SELECT + * privileges on the table or column underlying the pg_statistic data or if + * the function is marked leak-proof. + */ +bool +statistic_proc_security_check(VariableStatData *vardata, Oid func_oid) +{ + if (vardata->acl_ok) + return true; + + if (!OidIsValid(func_oid)) + return false; + + if (get_func_leakproof(func_oid)) + return true; + + ereport(DEBUG2, + (errmsg_internal("not using statistics because function \"%s\" is not leak-proof", + get_func_name(func_oid)))); + return false; +} + +/* + * get_variable_numdistinct + * Estimate the number of distinct values of a variable. + * + * vardata: results of examine_variable + * *isdefault: set to true if the result is a default rather than based on + * anything meaningful. + * + * NB: be careful to produce a positive integral result, since callers may + * compare the result to exact integer counts, or might divide by it. 
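+ *
+ * Recall the stadistinct convention used below (documented in
+ * pg_statistic.h): a positive value is an absolute number of distinct
+ * values, a negative value is minus the fraction of rows that are
+ * distinct (so -1.0 means all values are distinct), and zero means
+ * "unknown".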
+ */ +double +get_variable_numdistinct(VariableStatData *vardata, bool *isdefault) +{ + double stadistinct; + double stanullfrac = 0.0; + double ntuples; + + *isdefault = false; + + /* + * Determine the stadistinct value to use. There are cases where we can + * get an estimate even without a pg_statistic entry, or can get a better + * value than is in pg_statistic. Grab stanullfrac too if we can find it + * (otherwise, assume no nulls, for lack of any better idea). + */ + if (HeapTupleIsValid(vardata->statsTuple)) + { + /* Use the pg_statistic entry */ + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + stadistinct = stats->stadistinct; + stanullfrac = stats->stanullfrac; + } + else if (vardata->vartype == BOOLOID) + { + /* + * Special-case boolean columns: presumably, two distinct values. + * + * Are there any other datatypes we should wire in special estimates + * for? + */ + stadistinct = 2.0; + } + else if (vardata->rel && vardata->rel->rtekind == RTE_VALUES) + { + /* + * If the Var represents a column of a VALUES RTE, assume it's unique. + * This could of course be very wrong, but it should tend to be true + * in well-written queries. We could consider examining the VALUES' + * contents to get some real statistics; but that only works if the + * entries are all constants, and it would be pretty expensive anyway. + */ + stadistinct = -1.0; /* unique (and all non null) */ + } + else + { + /* + * We don't keep statistics for system columns, but in some cases we + * can infer distinctness anyway. + */ + if (vardata->var && IsA(vardata->var, Var)) + { + switch (((Var *) vardata->var)->varattno) + { + case SelfItemPointerAttributeNumber: + stadistinct = -1.0; /* unique (and all non null) */ + break; + case TableOidAttributeNumber: + stadistinct = 1.0; /* only 1 value */ + break; + default: + stadistinct = 0.0; /* means "unknown" */ + break; + } + } + else + stadistinct = 0.0; /* means "unknown" */ + + /* + * XXX consider using estimate_num_groups on expressions? + */ + } + + /* + * If there is a unique index or DISTINCT clause for the variable, assume + * it is unique no matter what pg_statistic says; the statistics could be + * out of date, or we might have found a partial unique index that proves + * the var is unique for this query. However, we'd better still believe + * the null-fraction statistic. + */ + if (vardata->isunique) + stadistinct = -1.0 * (1.0 - stanullfrac); + + /* + * If we had an absolute estimate, use that. + */ + if (stadistinct > 0.0) + return clamp_row_est(stadistinct); + + /* + * Otherwise we need to get the relation size; punt if not available. + */ + if (vardata->rel == NULL) + { + *isdefault = true; + return DEFAULT_NUM_DISTINCT; + } + ntuples = vardata->rel->tuples; + if (ntuples <= 0.0) + { + *isdefault = true; + return DEFAULT_NUM_DISTINCT; + } + + /* + * If we had a relative estimate, use that. + */ + if (stadistinct < 0.0) + return clamp_row_est(-stadistinct * ntuples); + + /* + * With no data, estimate ndistinct = ntuples if the table is small, else + * use default. We use DEFAULT_NUM_DISTINCT as the cutoff for "small" so + * that the behavior isn't discontinuous. + */ + if (ntuples < DEFAULT_NUM_DISTINCT) + return clamp_row_est(ntuples); + + *isdefault = true; + return DEFAULT_NUM_DISTINCT; +} + +/* + * get_variable_range + * Estimate the minimum and maximum value of the specified variable. + * If successful, store values in *min and *max, and return true. + * If no data available, return false. 
+ * + * sortop is the "<" comparison operator to use. This should generally + * be "<" not ">", as only the former is likely to be found in pg_statistic. + * The collation must be specified too. + */ +static bool +get_variable_range(PlannerInfo *root, VariableStatData *vardata, + Oid sortop, Oid collation, + Datum *min, Datum *max) +{ + Datum tmin = 0; + Datum tmax = 0; + bool have_data = false; + int16 typLen; + bool typByVal; + Oid opfuncoid; + FmgrInfo opproc; + AttStatsSlot sslot; + + /* + * XXX It's very tempting to try to use the actual column min and max, if + * we can get them relatively-cheaply with an index probe. However, since + * this function is called many times during join planning, that could + * have unpleasant effects on planning speed. Need more investigation + * before enabling this. + */ +#ifdef NOT_USED + if (get_actual_variable_range(root, vardata, sortop, collation, min, max)) + return true; +#endif + + if (!HeapTupleIsValid(vardata->statsTuple)) + { + /* no stats available, so default result */ + return false; + } + + /* + * If we can't apply the sortop to the stats data, just fail. In + * principle, if there's a histogram and no MCVs, we could return the + * histogram endpoints without ever applying the sortop ... but it's + * probably not worth trying, because whatever the caller wants to do with + * the endpoints would likely fail the security check too. + */ + if (!statistic_proc_security_check(vardata, + (opfuncoid = get_opcode(sortop)))) + return false; + + opproc.fn_oid = InvalidOid; /* mark this as not looked up yet */ + + get_typlenbyval(vardata->atttype, &typLen, &typByVal); + + /* + * If there is a histogram with the ordering we want, grab the first and + * last values. + */ + if (get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_HISTOGRAM, sortop, + ATTSTATSSLOT_VALUES)) + { + if (sslot.stacoll == collation && sslot.nvalues > 0) + { + tmin = datumCopy(sslot.values[0], typByVal, typLen); + tmax = datumCopy(sslot.values[sslot.nvalues - 1], typByVal, typLen); + have_data = true; + } + free_attstatsslot(&sslot); + } + + /* + * Otherwise, if there is a histogram with some other ordering, scan it + * and get the min and max values according to the ordering we want. This + * of course may not find values that are really extremal according to our + * ordering, but it beats ignoring available data. + */ + if (!have_data && + get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_HISTOGRAM, InvalidOid, + ATTSTATSSLOT_VALUES)) + { + get_stats_slot_range(&sslot, opfuncoid, &opproc, + collation, typLen, typByVal, + &tmin, &tmax, &have_data); + free_attstatsslot(&sslot); + } + + /* + * If we have most-common-values info, look for extreme MCVs. This is + * needed even if we also have a histogram, since the histogram excludes + * the MCVs. However, if we *only* have MCVs and no histogram, we should + * be pretty wary of deciding that that is a full representation of the + * data. Proceed only if the MCVs represent the whole table (to within + * roundoff error). + */ + if (get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + have_data ? 
ATTSTATSSLOT_VALUES : + (ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))) + { + bool use_mcvs = have_data; + + if (!have_data) + { + double sumcommon = 0.0; + double nullfrac; + int i; + + for (i = 0; i < sslot.nnumbers; i++) + sumcommon += sslot.numbers[i]; + nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata->statsTuple))->stanullfrac; + if (sumcommon + nullfrac > 0.99999) + use_mcvs = true; + } + + if (use_mcvs) + get_stats_slot_range(&sslot, opfuncoid, &opproc, + collation, typLen, typByVal, + &tmin, &tmax, &have_data); + free_attstatsslot(&sslot); + } + + *min = tmin; + *max = tmax; + return have_data; +} + +/* + * get_stats_slot_range: scan sslot for min/max values + * + * Subroutine for get_variable_range: update min/max/have_data according + * to what we find in the statistics array. + */ +static void +get_stats_slot_range(AttStatsSlot *sslot, Oid opfuncoid, FmgrInfo *opproc, + Oid collation, int16 typLen, bool typByVal, + Datum *min, Datum *max, bool *p_have_data) +{ + Datum tmin = *min; + Datum tmax = *max; + bool have_data = *p_have_data; + bool found_tmin = false; + bool found_tmax = false; + + /* Look up the comparison function, if we didn't already do so */ + if (opproc->fn_oid != opfuncoid) + fmgr_info(opfuncoid, opproc); + + /* Scan all the slot's values */ + for (int i = 0; i < sslot->nvalues; i++) + { + if (!have_data) + { + tmin = tmax = sslot->values[i]; + found_tmin = found_tmax = true; + *p_have_data = have_data = true; + continue; + } + if (DatumGetBool(FunctionCall2Coll(opproc, + collation, + sslot->values[i], tmin))) + { + tmin = sslot->values[i]; + found_tmin = true; + } + if (DatumGetBool(FunctionCall2Coll(opproc, + collation, + tmax, sslot->values[i]))) + { + tmax = sslot->values[i]; + found_tmax = true; + } + } + + /* + * Copy the slot's values, if we found new extreme values. + */ + if (found_tmin) + *min = datumCopy(tmin, typByVal, typLen); + if (found_tmax) + *max = datumCopy(tmax, typByVal, typLen); +} + + +/* + * get_actual_variable_range + * Attempt to identify the current *actual* minimum and/or maximum + * of the specified variable, by looking for a suitable btree index + * and fetching its low and/or high values. + * If successful, store values in *min and *max, and return true. + * (Either pointer can be NULL if that endpoint isn't needed.) + * If no data available, return false. + * + * sortop is the "<" comparison operator to use. + * collation is the required collation. + */ +static bool +get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, + Oid sortop, Oid collation, + Datum *min, Datum *max) +{ + bool have_data = false; + RelOptInfo *rel = vardata->rel; + RangeTblEntry *rte; + ListCell *lc; + + /* No hope if no relation or it doesn't have indexes */ + if (rel == NULL || rel->indexlist == NIL) + return false; + /* If it has indexes it must be a plain relation */ + rte = root->simple_rte_array[rel->relid]; + Assert(rte->rtekind == RTE_RELATION); + + /* Search through the indexes to see if any match our problem */ + foreach(lc, rel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(lc); + ScanDirection indexscandir; + + /* Ignore non-btree indexes */ + if (index->relam != BTREE_AM_OID) + continue; + + /* + * Ignore partial indexes --- we only want stats that cover the entire + * relation. + */ + if (index->indpred != NIL) + continue; + + /* + * The index list might include hypothetical indexes inserted by a + * get_relation_info hook --- don't try to access them. 
+ */ + if (index->hypothetical) + continue; + + /* + * The first index column must match the desired variable, sortop, and + * collation --- but we can use a descending-order index. + */ + if (collation != index->indexcollations[0]) + continue; /* test first 'cause it's cheapest */ + if (!match_index_to_operand(vardata->var, 0, index)) + continue; + switch (get_op_opfamily_strategy(sortop, index->sortopfamily[0])) + { + case BTLessStrategyNumber: + if (index->reverse_sort[0]) + indexscandir = BackwardScanDirection; + else + indexscandir = ForwardScanDirection; + break; + case BTGreaterStrategyNumber: + if (index->reverse_sort[0]) + indexscandir = ForwardScanDirection; + else + indexscandir = BackwardScanDirection; + break; + default: + /* index doesn't match the sortop */ + continue; + } + + /* + * Found a suitable index to extract data from. Set up some data that + * can be used by both invocations of get_actual_variable_endpoint. + */ + { + MemoryContext tmpcontext; + MemoryContext oldcontext; + Relation heapRel; + Relation indexRel; + TupleTableSlot *slot; + int16 typLen; + bool typByVal; + ScanKeyData scankeys[1]; + + /* Make sure any cruft gets recycled when we're done */ + tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "get_actual_variable_range workspace", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(tmpcontext); + + /* + * Open the table and index so we can read from them. We should + * already have some type of lock on each. + */ + heapRel = table_open(rte->relid, NoLock); + indexRel = index_open(index->indexoid, NoLock); + + /* build some stuff needed for indexscan execution */ + slot = table_slot_create(heapRel, NULL); + get_typlenbyval(vardata->atttype, &typLen, &typByVal); + + /* set up an IS NOT NULL scan key so that we ignore nulls */ + ScanKeyEntryInitialize(&scankeys[0], + SK_ISNULL | SK_SEARCHNOTNULL, + 1, /* index col to scan */ + InvalidStrategy, /* no strategy */ + InvalidOid, /* no strategy subtype */ + InvalidOid, /* no collation */ + InvalidOid, /* no reg proc for this */ + (Datum) 0); /* constant */ + + /* If min is requested ... */ + if (min) + { + have_data = get_actual_variable_endpoint(heapRel, + indexRel, + indexscandir, + scankeys, + typLen, + typByVal, + slot, + oldcontext, + min); + } + else + { + /* If min not requested, assume index is nonempty */ + have_data = true; + } + + /* If max is requested, and we didn't find the index is empty */ + if (max && have_data) + { + /* scan in the opposite direction; all else is the same */ + have_data = get_actual_variable_endpoint(heapRel, + indexRel, + -indexscandir, + scankeys, + typLen, + typByVal, + slot, + oldcontext, + max); + } + + /* Clean everything up */ + ExecDropSingleTupleTableSlot(slot); + + index_close(indexRel, NoLock); + table_close(heapRel, NoLock); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(tmpcontext); + + /* And we're done */ + break; + } + } + + return have_data; +} + +/* + * Get one endpoint datum (min or max depending on indexscandir) from the + * specified index. Return true if successful, false if index is empty. + * On success, endpoint value is stored to *endpointDatum (and copied into + * outercontext). + * + * scankeys is a 1-element scankey array set up to reject nulls. + * typLen/typByVal describe the datatype of the index's first column. + * tableslot is a slot suitable to hold table tuples, in case we need + * to probe the heap. 
+ * (We could compute these values locally, but that would mean computing them + * twice when get_actual_variable_range needs both the min and the max.) + */ +static bool +get_actual_variable_endpoint(Relation heapRel, + Relation indexRel, + ScanDirection indexscandir, + ScanKey scankeys, + int16 typLen, + bool typByVal, + TupleTableSlot *tableslot, + MemoryContext outercontext, + Datum *endpointDatum) +{ + bool have_data = false; + SnapshotData SnapshotNonVacuumable; + IndexScanDesc index_scan; + Buffer vmbuffer = InvalidBuffer; + ItemPointer tid; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + MemoryContext oldcontext; + + /* + * We use the index-only-scan machinery for this. With mostly-static + * tables that's a win because it avoids a heap visit. It's also a win + * for dynamic data, but the reason is less obvious; read on for details. + * + * In principle, we should scan the index with our current active + * snapshot, which is the best approximation we've got to what the query + * will see when executed. But that won't be exact if a new snap is taken + * before running the query, and it can be very expensive if a lot of + * recently-dead or uncommitted rows exist at the beginning or end of the + * index (because we'll laboriously fetch each one and reject it). + * Instead, we use SnapshotNonVacuumable. That will accept recently-dead + * and uncommitted rows as well as normal visible rows. On the other + * hand, it will reject known-dead rows, and thus not give a bogus answer + * when the extreme value has been deleted (unless the deletion was quite + * recent); that case motivates not using SnapshotAny here. + * + * A crucial point here is that SnapshotNonVacuumable, with + * GlobalVisTestFor(heapRel) as horizon, yields the inverse of the + * condition that the indexscan will use to decide that index entries are + * killable (see heap_hot_search_buffer()). Therefore, if the snapshot + * rejects a tuple (or more precisely, all tuples of a HOT chain) and we + * have to continue scanning past it, we know that the indexscan will mark + * that index entry killed. That means that the next + * get_actual_variable_endpoint() call will not have to re-consider that + * index entry. In this way we avoid repetitive work when this function + * is used a lot during planning. + * + * But using SnapshotNonVacuumable creates a hazard of its own. In a + * recently-created index, some index entries may point at "broken" HOT + * chains in which not all the tuple versions contain data matching the + * index entry. The live tuple version(s) certainly do match the index, + * but SnapshotNonVacuumable can accept recently-dead tuple versions that + * don't match. Hence, if we took data from the selected heap tuple, we + * might get a bogus answer that's not close to the index extremal value, + * or could even be NULL. We avoid this hazard because we take the data + * from the index entry not the heap. 
+ */ + InitNonVacuumableSnapshot(SnapshotNonVacuumable, + GlobalVisTestFor(heapRel)); + + index_scan = index_beginscan(heapRel, indexRel, + &SnapshotNonVacuumable, + 1, 0); + /* Set it up for index-only scan */ + index_scan->xs_want_itup = true; + index_rescan(index_scan, scankeys, 1, NULL, 0); + + /* Fetch first/next tuple in specified direction */ + while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) + { + if (!VM_ALL_VISIBLE(heapRel, + ItemPointerGetBlockNumber(tid), + &vmbuffer)) + { + /* Rats, we have to visit the heap to check visibility */ + if (!index_fetch_heap(index_scan, tableslot)) + continue; /* no visible tuple, try next index entry */ + + /* We don't actually need the heap tuple for anything */ + ExecClearTuple(tableslot); + + /* + * We don't care whether there's more than one visible tuple in + * the HOT chain; if any are visible, that's good enough. + */ + } + + /* + * We expect that btree will return data in IndexTuple not HeapTuple + * format. It's not lossy either. + */ + if (!index_scan->xs_itup) + elog(ERROR, "no data returned for index-only scan"); + if (index_scan->xs_recheck) + elog(ERROR, "unexpected recheck indication from btree"); + + /* OK to deconstruct the index tuple */ + index_deform_tuple(index_scan->xs_itup, + index_scan->xs_itupdesc, + values, isnull); + + /* Shouldn't have got a null, but be careful */ + if (isnull[0]) + elog(ERROR, "found unexpected null value in index \"%s\"", + RelationGetRelationName(indexRel)); + + /* Copy the index column value out to caller's context */ + oldcontext = MemoryContextSwitchTo(outercontext); + *endpointDatum = datumCopy(values[0], typByVal, typLen); + MemoryContextSwitchTo(oldcontext); + have_data = true; + break; + } + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + index_endscan(index_scan); + + return have_data; +} + +/* + * find_join_input_rel + * Look up the input relation for a join. + * + * We assume that the input relation's RelOptInfo must have been constructed + * already. + */ +static RelOptInfo * +find_join_input_rel(PlannerInfo *root, Relids relids) +{ + RelOptInfo *rel = NULL; + + switch (bms_membership(relids)) + { + case BMS_EMPTY_SET: + /* should not happen */ + break; + case BMS_SINGLETON: + rel = find_base_rel(root, bms_singleton_member(relids)); + break; + case BMS_MULTIPLE: + rel = find_join_rel(root, relids); + break; + } + + if (rel == NULL) + elog(ERROR, "could not find RelOptInfo for given relids"); + + return rel; +} + + +/*------------------------------------------------------------------------- + * + * Index cost estimation functions + * + *------------------------------------------------------------------------- + */ + +/* + * Extract the actual indexquals (as RestrictInfos) from an IndexClause list + */ +List * +get_quals_from_indexclauses(List *indexclauses) +{ + List *result = NIL; + ListCell *lc; + + foreach(lc, indexclauses) + { + IndexClause *iclause = lfirst_node(IndexClause, lc); + ListCell *lc2; + + foreach(lc2, iclause->indexquals) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc2); + + result = lappend(result, rinfo); + } + } + return result; +} + +/* + * Compute the total evaluation cost of the comparison operands in a list + * of index qual expressions. Since we know these will be evaluated just + * once per scan, there's no need to distinguish startup from per-row cost. + * + * This can be used either on the result of get_quals_from_indexclauses(), + * or directly on an indexorderbys list. 
In both cases, we expect that the + * index key expression is on the left side of binary clauses. + */ +Cost +index_other_operands_eval_cost(PlannerInfo *root, List *indexquals) +{ + Cost qual_arg_cost = 0; + ListCell *lc; + + foreach(lc, indexquals) + { + Expr *clause = (Expr *) lfirst(lc); + Node *other_operand; + QualCost index_qual_cost; + + /* + * Index quals will have RestrictInfos, indexorderbys won't. Look + * through RestrictInfo if present. + */ + if (IsA(clause, RestrictInfo)) + clause = ((RestrictInfo *) clause)->clause; + + if (IsA(clause, OpExpr)) + { + OpExpr *op = (OpExpr *) clause; + + other_operand = (Node *) lsecond(op->args); + } + else if (IsA(clause, RowCompareExpr)) + { + RowCompareExpr *rc = (RowCompareExpr *) clause; + + other_operand = (Node *) rc->rargs; + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + + other_operand = (Node *) lsecond(saop->args); + } + else if (IsA(clause, NullTest)) + { + other_operand = NULL; + } + else + { + elog(ERROR, "unsupported indexqual type: %d", + (int) nodeTag(clause)); + other_operand = NULL; /* keep compiler quiet */ + } + + cost_qual_eval_node(&index_qual_cost, other_operand, root); + qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple; + } + return qual_arg_cost; +} + +void +genericcostestimate(PlannerInfo *root, + IndexPath *path, + double loop_count, + GenericCosts *costs) +{ + IndexOptInfo *index = path->indexinfo; + List *indexQuals = get_quals_from_indexclauses(path->indexclauses); + List *indexOrderBys = path->indexorderbys; + Cost indexStartupCost; + Cost indexTotalCost; + Selectivity indexSelectivity; + double indexCorrelation; + double numIndexPages; + double numIndexTuples; + double spc_random_page_cost; + double num_sa_scans; + double num_outer_scans; + double num_scans; + double qual_op_cost; + double qual_arg_cost; + List *selectivityQuals; + ListCell *l; + + /* + * If the index is partial, AND the index predicate with the explicitly + * given indexquals to produce a more accurate idea of the index + * selectivity. + */ + selectivityQuals = add_predicate_to_index_quals(index, indexQuals); + + /* + * Check for ScalarArrayOpExpr index quals, and estimate the number of + * index scans that will be performed. + */ + num_sa_scans = 1; + foreach(l, indexQuals) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); + + if (IsA(rinfo->clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; + int alength = estimate_array_length(lsecond(saop->args)); + + if (alength > 1) + num_sa_scans *= alength; + } + } + + /* Estimate the fraction of main-table tuples that will be visited */ + indexSelectivity = clauselist_selectivity(root, selectivityQuals, + index->rel->relid, + JOIN_INNER, + NULL); + + /* + * If caller didn't give us an estimate, estimate the number of index + * tuples that will be visited. We do it in this rather peculiar-looking + * way in order to get the right answer for partial indexes. + */ + numIndexTuples = costs->numIndexTuples; + if (numIndexTuples <= 0.0) + { + numIndexTuples = indexSelectivity * index->rel->tuples; + + /* + * The above calculation counts all the tuples visited across all + * scans induced by ScalarArrayOpExpr nodes. We want to consider the + * average per-indexscan number, so adjust. This is a handy place to + * round to integer, too. (If caller supplied tuple estimate, it's + * responsible for handling these considerations.) 
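+ *
+ * As a rough illustration (hypothetical numbers): a qual such as
+ * "x = ANY ('{1,2,3}')" gives num_sa_scans = 3, so if the selectivity-based
+ * figure above came to 300 tuples across all three scans, the per-scan
+ * estimate below becomes rint(300 / 3) = 100.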
+ */ + numIndexTuples = rint(numIndexTuples / num_sa_scans); + } + + /* + * We can bound the number of tuples by the index size in any case. Also, + * always estimate at least one tuple is touched, even when + * indexSelectivity estimate is tiny. + */ + if (numIndexTuples > index->tuples) + numIndexTuples = index->tuples; + if (numIndexTuples < 1.0) + numIndexTuples = 1.0; + + /* + * Estimate the number of index pages that will be retrieved. + * + * We use the simplistic method of taking a pro-rata fraction of the total + * number of index pages. In effect, this counts only leaf pages and not + * any overhead such as index metapage or upper tree levels. + * + * In practice access to upper index levels is often nearly free because + * those tend to stay in cache under load; moreover, the cost involved is + * highly dependent on index type. We therefore ignore such costs here + * and leave it to the caller to add a suitable charge if needed. + */ + if (index->pages > 1 && index->tuples > 1) + numIndexPages = ceil(numIndexTuples * index->pages / index->tuples); + else + numIndexPages = 1.0; + + /* fetch estimated page cost for tablespace containing index */ + get_tablespace_page_costs(index->reltablespace, + &spc_random_page_cost, + NULL); + + /* + * Now compute the disk access costs. + * + * The above calculations are all per-index-scan. However, if we are in a + * nestloop inner scan, we can expect the scan to be repeated (with + * different search keys) for each row of the outer relation. Likewise, + * ScalarArrayOpExpr quals result in multiple index scans. This creates + * the potential for cache effects to reduce the number of disk page + * fetches needed. We want to estimate the average per-scan I/O cost in + * the presence of caching. + * + * We use the Mackert-Lohman formula (see costsize.c for details) to + * estimate the total number of page fetches that occur. While this + * wasn't what it was designed for, it seems a reasonable model anyway. + * Note that we are counting pages not tuples anymore, so we take N = T = + * index size, as if there were one "tuple" per page. + */ + num_outer_scans = loop_count; + num_scans = num_sa_scans * num_outer_scans; + + if (num_scans > 1) + { + double pages_fetched; + + /* total page fetches ignoring cache effects */ + pages_fetched = numIndexPages * num_scans; + + /* use Mackert and Lohman formula to adjust for cache effects */ + pages_fetched = index_pages_fetched(pages_fetched, + index->pages, + (double) index->pages, + root); + + /* + * Now compute the total disk access cost, and then report a pro-rated + * share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr, + * since that's internal to the indexscan.) + */ + indexTotalCost = (pages_fetched * spc_random_page_cost) + / num_outer_scans; + } + else + { + /* + * For a single index scan, we just charge spc_random_page_cost per + * page touched. + */ + indexTotalCost = numIndexPages * spc_random_page_cost; + } + + /* + * CPU cost: any complex expressions in the indexquals will need to be + * evaluated once at the start of the scan to reduce them to runtime keys + * to pass to the index AM (see nodeIndexscan.c). We model the per-tuple + * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per + * indexqual operator. Because we have numIndexTuples as a per-scan + * number, we have to multiply by num_sa_scans to get the correct result + * for ScalarArrayOpExpr cases. Similarly add in costs for any index + * ORDER BY expressions. 
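+ *
+ * As a concrete illustration (made-up figures, default settings of
+ * cpu_index_tuple_cost = 0.005 and cpu_operator_cost = 0.0025): with
+ * numIndexTuples = 100, num_sa_scans = 1 and a single indexqual operator,
+ * the per-tuple CPU charge added below is 100 * 1 * (0.005 + 0.0025) = 0.75.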
+ * + * Note: this neglects the possible costs of rechecking lossy operators. + * Detecting that that might be needed seems more expensive than it's + * worth, though, considering all the other inaccuracies here ... + */ + qual_arg_cost = index_other_operands_eval_cost(root, indexQuals) + + index_other_operands_eval_cost(root, indexOrderBys); + qual_op_cost = cpu_operator_cost * + (list_length(indexQuals) + list_length(indexOrderBys)); + + indexStartupCost = qual_arg_cost; + indexTotalCost += qual_arg_cost; + indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost); + + /* + * Generic assumption about index correlation: there isn't any. + */ + indexCorrelation = 0.0; + + /* + * Return everything to caller. + */ + costs->indexStartupCost = indexStartupCost; + costs->indexTotalCost = indexTotalCost; + costs->indexSelectivity = indexSelectivity; + costs->indexCorrelation = indexCorrelation; + costs->numIndexPages = numIndexPages; + costs->numIndexTuples = numIndexTuples; + costs->spc_random_page_cost = spc_random_page_cost; + costs->num_sa_scans = num_sa_scans; +} + +/* + * If the index is partial, add its predicate to the given qual list. + * + * ANDing the index predicate with the explicitly given indexquals produces + * a more accurate idea of the index's selectivity. However, we need to be + * careful not to insert redundant clauses, because clauselist_selectivity() + * is easily fooled into computing a too-low selectivity estimate. Our + * approach is to add only the predicate clause(s) that cannot be proven to + * be implied by the given indexquals. This successfully handles cases such + * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50". + * There are many other cases where we won't detect redundancy, leading to a + * too-low selectivity estimate, which will bias the system in favor of using + * partial indexes where possible. That is not necessarily bad though. + * + * Note that indexQuals contains RestrictInfo nodes while the indpred + * does not, so the output list will be mixed. This is OK for both + * predicate_implied_by() and clauselist_selectivity(), but might be + * problematic if the result were passed to other things. + */ +List * +add_predicate_to_index_quals(IndexOptInfo *index, List *indexQuals) +{ + List *predExtraQuals = NIL; + ListCell *lc; + + if (index->indpred == NIL) + return indexQuals; + + foreach(lc, index->indpred) + { + Node *predQual = (Node *) lfirst(lc); + List *oneQual = list_make1(predQual); + + if (!predicate_implied_by(oneQual, indexQuals, false)) + predExtraQuals = list_concat(predExtraQuals, oneQual); + } + return list_concat(predExtraQuals, indexQuals); +} + + +void +btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + IndexOptInfo *index = path->indexinfo; + GenericCosts costs; + Oid relid; + AttrNumber colnum; + VariableStatData vardata; + double numIndexTuples; + Cost descentCost; + List *indexBoundQuals; + int indexcol; + bool eqQualHere; + bool found_saop; + bool found_is_null_op; + double num_sa_scans; + ListCell *lc; + + /* + * For a btree scan, only leading '=' quals plus inequality quals for the + * immediately next attribute contribute to index selectivity (these are + * the "boundary quals" that determine the starting and stopping points of + * the index scan). 
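+ * As an illustration (hypothetical index and quals): for an index on
+ * (a, b, c) with WHERE a = 1 AND b > 5 AND c = 10, only "a = 1" and "b > 5"
+ * are boundary quals; "c = 10" is still checked in the index but does not
+ * bound the range that is scanned.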
Additional quals can suppress visits to the heap, so + * it's OK to count them in indexSelectivity, but they should not count + * for estimating numIndexTuples. So we must examine the given indexquals + * to find out which ones count as boundary quals. We rely on the + * knowledge that they are given in index column order. + * + * For a RowCompareExpr, we consider only the first column, just as + * rowcomparesel() does. + * + * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N + * index scans not one, but the ScalarArrayOpExpr's operator can be + * considered to act the same as it normally does. + */ + indexBoundQuals = NIL; + indexcol = 0; + eqQualHere = false; + found_saop = false; + found_is_null_op = false; + num_sa_scans = 1; + foreach(lc, path->indexclauses) + { + IndexClause *iclause = lfirst_node(IndexClause, lc); + ListCell *lc2; + + if (indexcol != iclause->indexcol) + { + /* Beginning of a new column's quals */ + if (!eqQualHere) + break; /* done if no '=' qual for indexcol */ + eqQualHere = false; + indexcol++; + if (indexcol != iclause->indexcol) + break; /* no quals at all for indexcol */ + } + + /* Examine each indexqual associated with this index clause */ + foreach(lc2, iclause->indexquals) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc2); + Expr *clause = rinfo->clause; + Oid clause_op = InvalidOid; + int op_strategy; + + if (IsA(clause, OpExpr)) + { + OpExpr *op = (OpExpr *) clause; + + clause_op = op->opno; + } + else if (IsA(clause, RowCompareExpr)) + { + RowCompareExpr *rc = (RowCompareExpr *) clause; + + clause_op = linitial_oid(rc->opnos); + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + Node *other_operand = (Node *) lsecond(saop->args); + int alength = estimate_array_length(other_operand); + + clause_op = saop->opno; + found_saop = true; + /* count number of SA scans induced by indexBoundQuals only */ + if (alength > 1) + num_sa_scans *= alength; + } + else if (IsA(clause, NullTest)) + { + NullTest *nt = (NullTest *) clause; + + if (nt->nulltesttype == IS_NULL) + { + found_is_null_op = true; + /* IS NULL is like = for selectivity purposes */ + eqQualHere = true; + } + } + else + elog(ERROR, "unsupported indexqual type: %d", + (int) nodeTag(clause)); + + /* check for equality operator */ + if (OidIsValid(clause_op)) + { + op_strategy = get_op_opfamily_strategy(clause_op, + index->opfamily[indexcol]); + Assert(op_strategy != 0); /* not a member of opfamily?? */ + if (op_strategy == BTEqualStrategyNumber) + eqQualHere = true; + } + + indexBoundQuals = lappend(indexBoundQuals, rinfo); + } + } + + /* + * If index is unique and we found an '=' clause for each column, we can + * just assume numIndexTuples = 1 and skip the expensive + * clauselist_selectivity calculations. However, a ScalarArrayOp or + * NullTest invalidates that theory, even though it sets eqQualHere. + */ + if (index->unique && + indexcol == index->nkeycolumns - 1 && + eqQualHere && + !found_saop && + !found_is_null_op) + numIndexTuples = 1.0; + else + { + List *selectivityQuals; + Selectivity btreeSelectivity; + + /* + * If the index is partial, AND the index predicate with the + * index-bound quals to produce a more accurate idea of the number of + * rows covered by the bound conditions. 
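+ *
+ * For instance (hypothetical example): given a partial index with predicate
+ * "WHERE active" and a bound qual "id < 1000", the row count below is based
+ * on the selectivity of (active AND id < 1000), not of "id < 1000" alone.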
+ */ + selectivityQuals = add_predicate_to_index_quals(index, indexBoundQuals); + + btreeSelectivity = clauselist_selectivity(root, selectivityQuals, + index->rel->relid, + JOIN_INNER, + NULL); + numIndexTuples = btreeSelectivity * index->rel->tuples; + + /* + * As in genericcostestimate(), we have to adjust for any + * ScalarArrayOpExpr quals included in indexBoundQuals, and then round + * to integer. + */ + numIndexTuples = rint(numIndexTuples / num_sa_scans); + } + + /* + * Now do generic index cost estimation. + */ + MemSet(&costs, 0, sizeof(costs)); + costs.numIndexTuples = numIndexTuples; + + genericcostestimate(root, path, loop_count, &costs); + + /* + * Add a CPU-cost component to represent the costs of initial btree + * descent. We don't charge any I/O cost for touching upper btree levels, + * since they tend to stay in cache, but we still have to do about log2(N) + * comparisons to descend a btree of N leaf tuples. We charge one + * cpu_operator_cost per comparison. + * + * If there are ScalarArrayOpExprs, charge this once per SA scan. The + * ones after the first one are not startup cost so far as the overall + * plan is concerned, so add them only to "total" cost. + */ + if (index->tuples > 1) /* avoid computing log(0) */ + { + descentCost = ceil(log(index->tuples) / log(2.0)) * cpu_operator_cost; + costs.indexStartupCost += descentCost; + costs.indexTotalCost += costs.num_sa_scans * descentCost; + } + + /* + * Even though we're not charging I/O cost for touching upper btree pages, + * it's still reasonable to charge some CPU cost per page descended + * through. Moreover, if we had no such charge at all, bloated indexes + * would appear to have the same search cost as unbloated ones, at least + * in cases where only a single leaf page is expected to be visited. This + * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page + * touched. The number of such pages is btree tree height plus one (ie, + * we charge for the leaf page too). As above, charge once per SA scan. + */ + descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost; + costs.indexStartupCost += descentCost; + costs.indexTotalCost += costs.num_sa_scans * descentCost; + + /* + * If we can get an estimate of the first column's ordering correlation C + * from pg_statistic, estimate the index correlation as C for a + * single-column index, or C * 0.75 for multiple columns. (The idea here + * is that multiple columns dilute the importance of the first column's + * ordering, but don't negate it entirely. Before 8.0 we divided the + * correlation by the number of columns, but that seems too strong.) + */ + MemSet(&vardata, 0, sizeof(vardata)); + + if (index->indexkeys[0] != 0) + { + /* Simple variable --- look to stats for the underlying table */ + RangeTblEntry *rte = planner_rt_fetch(index->rel->relid, root); + + Assert(rte->rtekind == RTE_RELATION); + relid = rte->relid; + Assert(relid != InvalidOid); + colnum = index->indexkeys[0]; + + if (get_relation_stats_hook && + (*get_relation_stats_hook) (root, rte, colnum, &vardata)) + { + /* + * The hook took control of acquiring a stats tuple. If it did + * supply a tuple, it'd better have supplied a freefunc. 
+ */ + if (HeapTupleIsValid(vardata.statsTuple) && + !vardata.freefunc) + elog(ERROR, "no function provided to release variable stats with"); + } + else + { + vardata.statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(relid), + Int16GetDatum(colnum), + BoolGetDatum(rte->inh)); + vardata.freefunc = ReleaseSysCache; + } + } + else + { + /* Expression --- maybe there are stats for the index itself */ + relid = index->indexoid; + colnum = 1; + + if (get_index_stats_hook && + (*get_index_stats_hook) (root, relid, colnum, &vardata)) + { + /* + * The hook took control of acquiring a stats tuple. If it did + * supply a tuple, it'd better have supplied a freefunc. + */ + if (HeapTupleIsValid(vardata.statsTuple) && + !vardata.freefunc) + elog(ERROR, "no function provided to release variable stats with"); + } + else + { + vardata.statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(relid), + Int16GetDatum(colnum), + BoolGetDatum(false)); + vardata.freefunc = ReleaseSysCache; + } + } + + if (HeapTupleIsValid(vardata.statsTuple)) + { + Oid sortop; + AttStatsSlot sslot; + + sortop = get_opfamily_member(index->opfamily[0], + index->opcintype[0], + index->opcintype[0], + BTLessStrategyNumber); + if (OidIsValid(sortop) && + get_attstatsslot(&sslot, vardata.statsTuple, + STATISTIC_KIND_CORRELATION, sortop, + ATTSTATSSLOT_NUMBERS)) + { + double varCorrelation; + + Assert(sslot.nnumbers == 1); + varCorrelation = sslot.numbers[0]; + + if (index->reverse_sort[0]) + varCorrelation = -varCorrelation; + + if (index->nkeycolumns > 1) + costs.indexCorrelation = varCorrelation * 0.75; + else + costs.indexCorrelation = varCorrelation; + + free_attstatsslot(&sslot); + } + } + + ReleaseVariableStats(vardata); + + *indexStartupCost = costs.indexStartupCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + +void +hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + GenericCosts costs; + + MemSet(&costs, 0, sizeof(costs)); + + genericcostestimate(root, path, loop_count, &costs); + + /* + * A hash index has no descent costs as such, since the index AM can go + * directly to the target bucket after computing the hash value. There + * are a couple of other hash-specific costs that we could conceivably add + * here, though: + * + * Ideally we'd charge spc_random_page_cost for each page in the target + * bucket, not just the numIndexPages pages that genericcostestimate + * thought we'd visit. However in most cases we don't know which bucket + * that will be. There's no point in considering the average bucket size + * because the hash AM makes sure that's always one page. + * + * Likewise, we could consider charging some CPU for each index tuple in + * the bucket, if we knew how many there were. But the per-tuple cost is + * just a hash value comparison, not a general datatype-dependent + * comparison, so any such charge ought to be quite a bit less than + * cpu_operator_cost; which makes it probably not worth worrying about. + * + * A bigger issue is that chance hash-value collisions will result in + * wasted probes into the heap. We don't currently attempt to model this + * cost on the grounds that it's rare, but maybe it's not rare enough. 
+ * (Any fix for this ought to consider the generic lossy-operator problem, + * though; it's not entirely hash-specific.) + */ + + *indexStartupCost = costs.indexStartupCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + +void +gistcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + IndexOptInfo *index = path->indexinfo; + GenericCosts costs; + Cost descentCost; + + MemSet(&costs, 0, sizeof(costs)); + + genericcostestimate(root, path, loop_count, &costs); + + /* + * We model index descent costs similarly to those for btree, but to do + * that we first need an idea of the tree height. We somewhat arbitrarily + * assume that the fanout is 100, meaning the tree height is at most + * log100(index->pages). + * + * Although this computation isn't really expensive enough to require + * caching, we might as well use index->tree_height to cache it. + */ + if (index->tree_height < 0) /* unknown? */ + { + if (index->pages > 1) /* avoid computing log(0) */ + index->tree_height = (int) (log(index->pages) / log(100.0)); + else + index->tree_height = 0; + } + + /* + * Add a CPU-cost component to represent the costs of initial descent. We + * just use log(N) here not log2(N) since the branching factor isn't + * necessarily two anyway. As for btree, charge once per SA scan. + */ + if (index->tuples > 1) /* avoid computing log(0) */ + { + descentCost = ceil(log(index->tuples)) * cpu_operator_cost; + costs.indexStartupCost += descentCost; + costs.indexTotalCost += costs.num_sa_scans * descentCost; + } + + /* + * Likewise add a per-page charge, calculated the same as for btrees. + */ + descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost; + costs.indexStartupCost += descentCost; + costs.indexTotalCost += costs.num_sa_scans * descentCost; + + *indexStartupCost = costs.indexStartupCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + +void +spgcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + IndexOptInfo *index = path->indexinfo; + GenericCosts costs; + Cost descentCost; + + MemSet(&costs, 0, sizeof(costs)); + + genericcostestimate(root, path, loop_count, &costs); + + /* + * We model index descent costs similarly to those for btree, but to do + * that we first need an idea of the tree height. We somewhat arbitrarily + * assume that the fanout is 100, meaning the tree height is at most + * log100(index->pages). + * + * Although this computation isn't really expensive enough to require + * caching, we might as well use index->tree_height to cache it. + */ + if (index->tree_height < 0) /* unknown? */ + { + if (index->pages > 1) /* avoid computing log(0) */ + index->tree_height = (int) (log(index->pages) / log(100.0)); + else + index->tree_height = 0; + } + + /* + * Add a CPU-cost component to represent the costs of initial descent. We + * just use log(N) here not log2(N) since the branching factor isn't + * necessarily two anyway. As for btree, charge once per SA scan. 
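+ *
+ * For example (illustrative numbers): an index with 1,000,000 tuples gives
+ * ceil(ln(1000000)) = 14 comparisons, i.e. a charge of 14 * cpu_operator_cost
+ * (0.035 with the default cpu_operator_cost of 0.0025).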
+ */ + if (index->tuples > 1) /* avoid computing log(0) */ + { + descentCost = ceil(log(index->tuples)) * cpu_operator_cost; + costs.indexStartupCost += descentCost; + costs.indexTotalCost += costs.num_sa_scans * descentCost; + } + + /* + * Likewise add a per-page charge, calculated the same as for btrees. + */ + descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost; + costs.indexStartupCost += descentCost; + costs.indexTotalCost += costs.num_sa_scans * descentCost; + + *indexStartupCost = costs.indexStartupCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + + +/* + * Support routines for gincostestimate + */ + +typedef struct +{ + bool attHasFullScan[INDEX_MAX_KEYS]; + bool attHasNormalScan[INDEX_MAX_KEYS]; + double partialEntries; + double exactEntries; + double searchEntries; + double arrayScans; +} GinQualCounts; + +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN query, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + */ +static bool +gincost_pattern(IndexOptInfo *index, int indexcol, + Oid clause_op, Datum query, + GinQualCounts *counts) +{ + FmgrInfo flinfo; + Oid extractProcOid; + Oid collation; + int strategy_op; + Oid lefttype, + righttype; + int32 nentries = 0; + bool *partial_matches = NULL; + Pointer *extra_data = NULL; + bool *nullFlags = NULL; + int32 searchMode = GIN_SEARCH_MODE_DEFAULT; + int32 i; + + Assert(indexcol < index->nkeycolumns); + + /* + * Get the operator's strategy number and declared input data types within + * the index opfamily. (We don't need the latter, but we use + * get_op_opfamily_properties because it will throw error if it fails to + * find a matching pg_amop entry.) + */ + get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false, + &strategy_op, &lefttype, &righttype); + + /* + * GIN always uses the "default" support functions, which are those with + * lefttype == righttype == the opclass' opcintype (see + * IndexSupportInitialize in relcache.c). + */ + extractProcOid = get_opfamily_proc(index->opfamily[indexcol], + index->opcintype[indexcol], + index->opcintype[indexcol], + GIN_EXTRACTQUERY_PROC); + + if (!OidIsValid(extractProcOid)) + { + /* should not happen; throw same error as index_getprocinfo */ + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + GIN_EXTRACTQUERY_PROC, indexcol + 1, + get_rel_name(index->indexoid)); + } + + /* + * Choose collation to pass to extractProc (should match initGinState). 
+ */ + if (OidIsValid(index->indexcollations[indexcol])) + collation = index->indexcollations[indexcol]; + else + collation = DEFAULT_COLLATION_OID; + + fmgr_info(extractProcOid, &flinfo); + + set_fn_opclass_options(&flinfo, index->opclassoptions[indexcol]); + + FunctionCall7Coll(&flinfo, + collation, + query, + PointerGetDatum(&nentries), + UInt16GetDatum(strategy_op), + PointerGetDatum(&partial_matches), + PointerGetDatum(&extra_data), + PointerGetDatum(&nullFlags), + PointerGetDatum(&searchMode)); + + if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT) + { + /* No match is possible */ + return false; + } + + for (i = 0; i < nentries; i++) + { + /* + * For partial match we haven't any information to estimate number of + * matched entries in index, so, we just estimate it as 100 + */ + if (partial_matches && partial_matches[i]) + counts->partialEntries += 100; + else + counts->exactEntries++; + + counts->searchEntries++; + } + + if (searchMode == GIN_SEARCH_MODE_DEFAULT) + { + counts->attHasNormalScan[indexcol] = true; + } + else if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) + { + /* Treat "include empty" like an exact-match item */ + counts->attHasNormalScan[indexcol] = true; + counts->exactEntries++; + counts->searchEntries++; + } + else + { + /* It's GIN_SEARCH_MODE_ALL */ + counts->attHasFullScan[indexcol] = true; + } + + return true; +} + +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN index clause, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + */ +static bool +gincost_opexpr(PlannerInfo *root, + IndexOptInfo *index, + int indexcol, + OpExpr *clause, + GinQualCounts *counts) +{ + Oid clause_op = clause->opno; + Node *operand = (Node *) lsecond(clause->args); + + /* aggressively reduce to a constant, and look through relabeling */ + operand = estimate_expression_value(root, operand); + + if (IsA(operand, RelabelType)) + operand = (Node *) ((RelabelType *) operand)->arg; + + /* + * It's impossible to call extractQuery method for unknown operand. So + * unless operand is a Const we can't do much; just assume there will be + * one ordinary search entry from the operand at runtime. + */ + if (!IsA(operand, Const)) + { + counts->exactEntries++; + counts->searchEntries++; + return true; + } + + /* If Const is null, there can be no matches */ + if (((Const *) operand)->constisnull) + return false; + + /* Otherwise, apply extractQuery and get the actual term counts */ + return gincost_pattern(index, indexcol, clause_op, + ((Const *) operand)->constvalue, + counts); +} + +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN index clause, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + * + * A ScalarArrayOpExpr will give rise to N separate indexscans at runtime, + * each of which involves one value from the RHS array, plus all the + * non-array quals (if any). To model this, we average the counts across + * the RHS elements, and add the averages to the counts in *counts (which + * correspond to per-indexscan costs). We also multiply counts->arrayScans + * by N, causing gincostestimate to scale up its estimates accordingly. 
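+ *
+ * For example (hypothetical qual): a ScalarArrayOpExpr whose constant array
+ * holds three satisfiable elements is costed as three separate index
+ * searches: counts->arrayScans becomes 3 and the entry counts added are
+ * per-scan averages.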
+ */ +static bool +gincost_scalararrayopexpr(PlannerInfo *root, + IndexOptInfo *index, + int indexcol, + ScalarArrayOpExpr *clause, + double numIndexEntries, + GinQualCounts *counts) +{ + Oid clause_op = clause->opno; + Node *rightop = (Node *) lsecond(clause->args); + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int numElems; + Datum *elemValues; + bool *elemNulls; + GinQualCounts arraycounts; + int numPossible = 0; + int i; + + Assert(clause->useOr); + + /* aggressively reduce to a constant, and look through relabeling */ + rightop = estimate_expression_value(root, rightop); + + if (IsA(rightop, RelabelType)) + rightop = (Node *) ((RelabelType *) rightop)->arg; + + /* + * It's impossible to call extractQuery method for unknown operand. So + * unless operand is a Const we can't do much; just assume there will be + * one ordinary search entry from each array entry at runtime, and fall + * back on a probably-bad estimate of the number of array entries. + */ + if (!IsA(rightop, Const)) + { + counts->exactEntries++; + counts->searchEntries++; + counts->arrayScans *= estimate_array_length(rightop); + return true; + } + + /* If Const is null, there can be no matches */ + if (((Const *) rightop)->constisnull) + return false; + + /* Otherwise, extract the array elements and iterate over them */ + arrayval = DatumGetArrayTypeP(((Const *) rightop)->constvalue); + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elemValues, &elemNulls, &numElems); + + memset(&arraycounts, 0, sizeof(arraycounts)); + + for (i = 0; i < numElems; i++) + { + GinQualCounts elemcounts; + + /* NULL can't match anything, so ignore, as the executor will */ + if (elemNulls[i]) + continue; + + /* Otherwise, apply extractQuery and get the actual term counts */ + memset(&elemcounts, 0, sizeof(elemcounts)); + + if (gincost_pattern(index, indexcol, clause_op, elemValues[i], + &elemcounts)) + { + /* We ignore array elements that are unsatisfiable patterns */ + numPossible++; + + if (elemcounts.attHasFullScan[indexcol] && + !elemcounts.attHasNormalScan[indexcol]) + { + /* + * Full index scan will be required. We treat this as if + * every key in the index had been listed in the query; is + * that reasonable? + */ + elemcounts.partialEntries = 0; + elemcounts.exactEntries = numIndexEntries; + elemcounts.searchEntries = numIndexEntries; + } + arraycounts.partialEntries += elemcounts.partialEntries; + arraycounts.exactEntries += elemcounts.exactEntries; + arraycounts.searchEntries += elemcounts.searchEntries; + } + } + + if (numPossible == 0) + { + /* No satisfiable patterns in the array */ + return false; + } + + /* + * Now add the averages to the global counts. This will give us an + * estimate of the average number of terms searched for in each indexscan, + * including contributions from both array and non-array quals. 
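+ *
+ * E.g. (illustrative): if two satisfiable elements needed 4 and 6 exact
+ * entries respectively, we add (4 + 6) / 2 = 5 to counts->exactEntries and
+ * multiply counts->arrayScans by 2.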
+ */ + counts->partialEntries += arraycounts.partialEntries / numPossible; + counts->exactEntries += arraycounts.exactEntries / numPossible; + counts->searchEntries += arraycounts.searchEntries / numPossible; + + counts->arrayScans *= numPossible; + + return true; +} + +/* + * GIN has search behavior completely different from other index types + */ +void +gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + IndexOptInfo *index = path->indexinfo; + List *indexQuals = get_quals_from_indexclauses(path->indexclauses); + List *selectivityQuals; + double numPages = index->pages, + numTuples = index->tuples; + double numEntryPages, + numDataPages, + numPendingPages, + numEntries; + GinQualCounts counts; + bool matchPossible; + bool fullIndexScan; + double partialScale; + double entryPagesFetched, + dataPagesFetched, + dataPagesFetchedBySel; + double qual_op_cost, + qual_arg_cost, + spc_random_page_cost, + outer_scans; + Relation indexRel; + GinStatsData ginStats; + ListCell *lc; + int i; + + /* + * Obtain statistical information from the meta page, if possible. Else + * set ginStats to zeroes, and we'll cope below. + */ + if (!index->hypothetical) + { + /* Lock should have already been obtained in plancat.c */ + indexRel = index_open(index->indexoid, NoLock); + ginGetStats(indexRel, &ginStats); + index_close(indexRel, NoLock); + } + else + { + memset(&ginStats, 0, sizeof(ginStats)); + } + + /* + * Assuming we got valid (nonzero) stats at all, nPendingPages can be + * trusted, but the other fields are data as of the last VACUUM. We can + * scale them up to account for growth since then, but that method only + * goes so far; in the worst case, the stats might be for a completely + * empty index, and scaling them will produce pretty bogus numbers. + * Somewhat arbitrarily, set the cutoff for doing scaling at 4X growth; if + * it's grown more than that, fall back to estimating things only from the + * assumed-accurate index size. But we'll trust nPendingPages in any case + * so long as it's not clearly insane, ie, more than the index size. + */ + if (ginStats.nPendingPages < numPages) + numPendingPages = ginStats.nPendingPages; + else + numPendingPages = 0; + + if (numPages > 0 && ginStats.nTotalPages <= numPages && + ginStats.nTotalPages > numPages / 4 && + ginStats.nEntryPages > 0 && ginStats.nEntries > 0) + { + /* + * OK, the stats seem close enough to sane to be trusted. But we + * still need to scale them by the ratio numPages / nTotalPages to + * account for growth since the last VACUUM. + */ + double scale = numPages / ginStats.nTotalPages; + + numEntryPages = ceil(ginStats.nEntryPages * scale); + numDataPages = ceil(ginStats.nDataPages * scale); + numEntries = ceil(ginStats.nEntries * scale); + /* ensure we didn't round up too much */ + numEntryPages = Min(numEntryPages, numPages - numPendingPages); + numDataPages = Min(numDataPages, + numPages - numPendingPages - numEntryPages); + } + else + { + /* + * We might get here because it's a hypothetical index, or an index + * created pre-9.1 and never vacuumed since upgrading (in which case + * its stats would read as zeroes), or just because it's grown too + * much since the last VACUUM for us to put our faith in scaling. + * + * Invent some plausible internal statistics based on the index page + * count (and clamp that to at least 10 pages, just in case). 
We + * estimate that 90% of the index is entry pages, and the rest is data + * pages. Estimate 100 entries per entry page; this is rather bogus + * since it'll depend on the size of the keys, but it's more robust + * than trying to predict the number of entries per heap tuple. + */ + numPages = Max(numPages, 10); + numEntryPages = floor((numPages - numPendingPages) * 0.90); + numDataPages = numPages - numPendingPages - numEntryPages; + numEntries = floor(numEntryPages * 100); + } + + /* In an empty index, numEntries could be zero. Avoid divide-by-zero */ + if (numEntries < 1) + numEntries = 1; + + /* + * If the index is partial, AND the index predicate with the index-bound + * quals to produce a more accurate idea of the number of rows covered by + * the bound conditions. + */ + selectivityQuals = add_predicate_to_index_quals(index, indexQuals); + + /* Estimate the fraction of main-table tuples that will be visited */ + *indexSelectivity = clauselist_selectivity(root, selectivityQuals, + index->rel->relid, + JOIN_INNER, + NULL); + + /* fetch estimated page cost for tablespace containing index */ + get_tablespace_page_costs(index->reltablespace, + &spc_random_page_cost, + NULL); + + /* + * Generic assumption about index correlation: there isn't any. + */ + *indexCorrelation = 0.0; + + /* + * Examine quals to estimate number of search entries & partial matches + */ + memset(&counts, 0, sizeof(counts)); + counts.arrayScans = 1; + matchPossible = true; + + foreach(lc, path->indexclauses) + { + IndexClause *iclause = lfirst_node(IndexClause, lc); + ListCell *lc2; + + foreach(lc2, iclause->indexquals) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc2); + Expr *clause = rinfo->clause; + + if (IsA(clause, OpExpr)) + { + matchPossible = gincost_opexpr(root, + index, + iclause->indexcol, + (OpExpr *) clause, + &counts); + if (!matchPossible) + break; + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + matchPossible = gincost_scalararrayopexpr(root, + index, + iclause->indexcol, + (ScalarArrayOpExpr *) clause, + numEntries, + &counts); + if (!matchPossible) + break; + } + else + { + /* shouldn't be anything else for a GIN index */ + elog(ERROR, "unsupported GIN indexqual type: %d", + (int) nodeTag(clause)); + } + } + } + + /* Fall out if there were any provably-unsatisfiable quals */ + if (!matchPossible) + { + *indexStartupCost = 0; + *indexTotalCost = 0; + *indexSelectivity = 0; + return; + } + + /* + * If attribute has a full scan and at the same time doesn't have normal + * scan, then we'll have to scan all non-null entries of that attribute. + * Currently, we don't have per-attribute statistics for GIN. Thus, we + * must assume the whole GIN index has to be scanned in this case. + */ + fullIndexScan = false; + for (i = 0; i < index->nkeycolumns; i++) + { + if (counts.attHasFullScan[i] && !counts.attHasNormalScan[i]) + { + fullIndexScan = true; + break; + } + } + + if (fullIndexScan || indexQuals == NIL) + { + /* + * Full index scan will be required. We treat this as if every key in + * the index had been listed in the query; is that reasonable? + */ + counts.partialEntries = 0; + counts.exactEntries = numEntries; + counts.searchEntries = numEntries; + } + + /* Will we have more than one iteration of a nestloop scan? */ + outer_scans = loop_count; + + /* + * Compute cost to begin scan, first of all, pay attention to pending + * list. + */ + entryPagesFetched = numPendingPages; + + /* + * Estimate number of entry pages read. We need to do + * counts.searchEntries searches. 
Use a power function as it should be, + * but tuples on leaf pages usually is much greater. Here we include all + * searches in entry tree, including search of first entry in partial + * match algorithm + */ + entryPagesFetched += ceil(counts.searchEntries * rint(pow(numEntryPages, 0.15))); + + /* + * Add an estimate of entry pages read by partial match algorithm. It's a + * scan over leaf pages in entry tree. We haven't any useful stats here, + * so estimate it as proportion. Because counts.partialEntries is really + * pretty bogus (see code above), it's possible that it is more than + * numEntries; clamp the proportion to ensure sanity. + */ + partialScale = counts.partialEntries / numEntries; + partialScale = Min(partialScale, 1.0); + + entryPagesFetched += ceil(numEntryPages * partialScale); + + /* + * Partial match algorithm reads all data pages before doing actual scan, + * so it's a startup cost. Again, we haven't any useful stats here, so + * estimate it as proportion. + */ + dataPagesFetched = ceil(numDataPages * partialScale); + + /* + * Calculate cache effects if more than one scan due to nestloops or array + * quals. The result is pro-rated per nestloop scan, but the array qual + * factor shouldn't be pro-rated (compare genericcostestimate). + */ + if (outer_scans > 1 || counts.arrayScans > 1) + { + entryPagesFetched *= outer_scans * counts.arrayScans; + entryPagesFetched = index_pages_fetched(entryPagesFetched, + (BlockNumber) numEntryPages, + numEntryPages, root); + entryPagesFetched /= outer_scans; + dataPagesFetched *= outer_scans * counts.arrayScans; + dataPagesFetched = index_pages_fetched(dataPagesFetched, + (BlockNumber) numDataPages, + numDataPages, root); + dataPagesFetched /= outer_scans; + } + + /* + * Here we use random page cost because logically-close pages could be far + * apart on disk. + */ + *indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost; + + /* + * Now compute the number of data pages fetched during the scan. + * + * We assume every entry to have the same number of items, and that there + * is no overlap between them. (XXX: tsvector and array opclasses collect + * statistics on the frequency of individual keys; it would be nice to use + * those here.) + */ + dataPagesFetched = ceil(numDataPages * counts.exactEntries / numEntries); + + /* + * If there is a lot of overlap among the entries, in particular if one of + * the entries is very frequent, the above calculation can grossly + * under-estimate. As a simple cross-check, calculate a lower bound based + * on the overall selectivity of the quals. At a minimum, we must read + * one item pointer for each matching entry. + * + * The width of each item pointer varies, based on the level of + * compression. We don't have statistics on that, but an average of + * around 3 bytes per item is fairly typical. 
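+ *
+ * Illustrative numbers: with the default BLCKSZ of 8192 that is about 2730
+ * item pointers per page, so a selectivity of 0.01 with numTuples =
+ * 10,000,000 implies reading at least ceil(0.01 * 10000000 / 2730) = 37
+ * data pages.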
+ */ + dataPagesFetchedBySel = ceil(*indexSelectivity * + (numTuples / (BLCKSZ / 3))); + if (dataPagesFetchedBySel > dataPagesFetched) + dataPagesFetched = dataPagesFetchedBySel; + + /* Account for cache effects, the same as above */ + if (outer_scans > 1 || counts.arrayScans > 1) + { + dataPagesFetched *= outer_scans * counts.arrayScans; + dataPagesFetched = index_pages_fetched(dataPagesFetched, + (BlockNumber) numDataPages, + numDataPages, root); + dataPagesFetched /= outer_scans; + } + + /* And apply random_page_cost as the cost per page */ + *indexTotalCost = *indexStartupCost + + dataPagesFetched * spc_random_page_cost; + + /* + * Add on index qual eval costs, much as in genericcostestimate. But we + * can disregard indexorderbys, since GIN doesn't support those. + */ + qual_arg_cost = index_other_operands_eval_cost(root, indexQuals); + qual_op_cost = cpu_operator_cost * list_length(indexQuals); + + *indexStartupCost += qual_arg_cost; + *indexTotalCost += qual_arg_cost; + *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost); + *indexPages = dataPagesFetched; +} + +/* + * BRIN has search behavior completely different from other index types + */ +void +brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + IndexOptInfo *index = path->indexinfo; + List *indexQuals = get_quals_from_indexclauses(path->indexclauses); + double numPages = index->pages; + RelOptInfo *baserel = index->rel; + RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root); + Cost spc_seq_page_cost; + Cost spc_random_page_cost; + double qual_arg_cost; + double qualSelectivity; + BrinStatsData statsData; + double indexRanges; + double minimalRanges; + double estimatedRanges; + double selec; + Relation indexRel; + ListCell *l; + VariableStatData vardata; + + Assert(rte->rtekind == RTE_RELATION); + + /* fetch estimated page cost for the tablespace containing the index */ + get_tablespace_page_costs(index->reltablespace, + &spc_random_page_cost, + &spc_seq_page_cost); + + /* + * Obtain some data from the index itself, if possible. Otherwise invent + * some plausible internal statistics based on the relation page count. + */ + if (!index->hypothetical) + { + /* + * A lock should have already been obtained on the index in plancat.c. + */ + indexRel = index_open(index->indexoid, NoLock); + brinGetStats(indexRel, &statsData); + index_close(indexRel, NoLock); + + /* work out the actual number of ranges in the index */ + indexRanges = Max(ceil((double) baserel->pages / + statsData.pagesPerRange), 1.0); + } + else + { + /* + * Assume default number of pages per range, and estimate the number + * of ranges based on that. + */ + indexRanges = Max(ceil((double) baserel->pages / + BRIN_DEFAULT_PAGES_PER_RANGE), 1.0); + + statsData.pagesPerRange = BRIN_DEFAULT_PAGES_PER_RANGE; + statsData.revmapNumPages = (indexRanges / REVMAP_PAGE_MAXITEMS) + 1; + } + + /* + * Compute index correlation + * + * Because we can use all index quals equally when scanning, we can use + * the largest correlation (in absolute value) among columns used by the + * query. Start at zero, the worst possible case. If we cannot find any + * correlation statistics, we will keep it as 0. 
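+ *
+ * For example (hypothetical stats): if the quals touch two columns whose
+ * correlations are 0.20 and -0.90, we use 0.90.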
+	 */
+	*indexCorrelation = 0;
+
+	foreach(l, path->indexclauses)
+	{
+		IndexClause *iclause = lfirst_node(IndexClause, l);
+		AttrNumber	attnum = index->indexkeys[iclause->indexcol];
+
+		/* attempt to look up stats in relation for this index column */
+		if (attnum != 0)
+		{
+			/* Simple variable -- look to stats for the underlying table */
+			if (get_relation_stats_hook &&
+				(*get_relation_stats_hook) (root, rte, attnum, &vardata))
+			{
+				/*
+				 * The hook took control of acquiring a stats tuple.  If it
+				 * did supply a tuple, it'd better have supplied a freefunc.
+				 */
+				if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc)
+					elog(ERROR,
+						 "no function provided to release variable stats with");
+			}
+			else
+			{
+				vardata.statsTuple =
+					SearchSysCache3(STATRELATTINH,
+									ObjectIdGetDatum(rte->relid),
+									Int16GetDatum(attnum),
+									BoolGetDatum(false));
+				vardata.freefunc = ReleaseSysCache;
+			}
+		}
+		else
+		{
+			/*
+			 * Looks like we've found an expression column in the index.
+			 * Let's see if there are any stats for it.
+			 */
+
+			/* get the attnum from the 0-based index. */
+			attnum = iclause->indexcol + 1;
+
+			if (get_index_stats_hook &&
+				(*get_index_stats_hook) (root, index->indexoid, attnum, &vardata))
+			{
+				/*
+				 * The hook took control of acquiring a stats tuple.  If it
+				 * did supply a tuple, it'd better have supplied a freefunc.
+				 */
+				if (HeapTupleIsValid(vardata.statsTuple) &&
+					!vardata.freefunc)
+					elog(ERROR, "no function provided to release variable stats with");
+			}
+			else
+			{
+				vardata.statsTuple = SearchSysCache3(STATRELATTINH,
+													 ObjectIdGetDatum(index->indexoid),
+													 Int16GetDatum(attnum),
+													 BoolGetDatum(false));
+				vardata.freefunc = ReleaseSysCache;
+			}
+		}
+
+		if (HeapTupleIsValid(vardata.statsTuple))
+		{
+			AttStatsSlot sslot;
+
+			if (get_attstatsslot(&sslot, vardata.statsTuple,
+								 STATISTIC_KIND_CORRELATION, InvalidOid,
+								 ATTSTATSSLOT_NUMBERS))
+			{
+				double		varCorrelation = 0.0;
+
+				if (sslot.nnumbers > 0)
+					varCorrelation = Abs(sslot.numbers[0]);
+
+				if (varCorrelation > *indexCorrelation)
+					*indexCorrelation = varCorrelation;
+
+				free_attstatsslot(&sslot);
+			}
+		}
+
+		ReleaseVariableStats(vardata);
+	}
+
+	qualSelectivity = clauselist_selectivity(root, indexQuals,
+											 baserel->relid,
+											 JOIN_INNER, NULL);
+
+	/*
+	 * Now calculate the minimum possible number of ranges we could match
+	 * if all of the rows were in perfect order in the table's heap.
+	 */
+	minimalRanges = ceil(indexRanges * qualSelectivity);
+
+	/*
+	 * Now estimate the number of ranges that we'll touch by using the
+	 * indexCorrelation from the stats.  Careful not to divide by zero (note
+	 * we're using the absolute value of the correlation).
+	 */
+	if (*indexCorrelation < 1.0e-10)
+		estimatedRanges = indexRanges;
+	else
+		estimatedRanges = Min(minimalRanges / *indexCorrelation, indexRanges);
+
+	/* we expect to visit this portion of the table */
+	selec = estimatedRanges / indexRanges;
+
+	CLAMP_PROBABILITY(selec);
+
+	*indexSelectivity = selec;
+
+	/*
+	 * Compute the index qual costs, much as in genericcostestimate, to add
+	 * to the index costs.  We can disregard indexorderbys, since BRIN
+	 * doesn't support those.
+	 */
+	qual_arg_cost = index_other_operands_eval_cost(root, indexQuals);
+
+	/*
+	 * Compute the startup cost as the cost to read the whole revmap
+	 * sequentially, including the cost to execute the index quals.
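+	 *
+	 * For example, with a 64-page revmap, spc_seq_page_cost = 1.0, and
+	 * loop_count = 1, the assignment below charges 64.0, plus the qual
+	 * argument cost, before the first tuple can be returned.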
+	 */
+	*indexStartupCost =
+		spc_seq_page_cost * statsData.revmapNumPages * loop_count;
+	*indexStartupCost += qual_arg_cost;
+
+	/*
+	 * To read a BRIN index there might be a bit of back and forth over
+	 * regular pages, as the revmap might point to them out of sequential
+	 * order; calculate the total cost as reading the whole index in random
+	 * order.
+	 */
+	*indexTotalCost = *indexStartupCost +
+		spc_random_page_cost * (numPages - statsData.revmapNumPages) * loop_count;
+
+	/*
+	 * Charge a small amount per range tuple that we expect to match.  This
+	 * is meant to reflect the costs of manipulating the bitmap.  The BRIN
+	 * scan will set a bit for each page in the range when we find a
+	 * matching range, so we must multiply the charge by the number of pages
+	 * in the range.
+	 */
+	*indexTotalCost += 0.1 * cpu_operator_cost * estimatedRanges *
+		statsData.pagesPerRange;
+
+	*indexPages = index->pages;
+}
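The BRIN range arithmetic above can be checked with a small standalone sketch; this is not part of the patch, and the helper name brin_selec_sketch and the inputs are invented for illustration. With a 1,000,000-page heap, the default 128 pages per range, quals selecting 1% of the rows, and a best absolute column correlation of 0.5, it reproduces indexRanges = 7813, minimalRanges = 79, estimatedRanges = 158, and hence an index selectivity of about 0.02.

#include <math.h>
#include <stdio.h>

/* Mirrors the BRIN range estimate above; names and inputs are invented. */
static double
brin_selec_sketch(double heapPages, double pagesPerRange,
                  double qualSelectivity, double absCorrelation)
{
    double indexRanges = fmax(ceil(heapPages / pagesPerRange), 1.0);
    double minimalRanges = ceil(indexRanges * qualSelectivity);
    double estimatedRanges;

    if (absCorrelation < 1.0e-10)
        estimatedRanges = indexRanges;      /* no usable stats: assume every range is visited */
    else
        estimatedRanges = fmin(minimalRanges / absCorrelation, indexRanges);

    return estimatedRanges / indexRanges;   /* fraction of the heap we expect to visit */
}

int
main(void)
{
    /* 1,000,000-page heap, 128 pages/range, 1% quals, |correlation| = 0.5 */
    printf("%.4f\n", brin_selec_sketch(1000000.0, 128.0, 0.01, 0.5));
    return 0;
}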