/*------------------------------------------------------------------------- * * network_selfuncs.c * Functions for selectivity estimation of inet/cidr operators * * This module provides estimators for the subnet inclusion and overlap * operators. Estimates are based on null fraction, most common values, * and histogram of inet/cidr columns. * * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * src/backend/utils/adt/network_selfuncs.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include #include "access/htup_details.h" #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" #include "utils/builtins.h" #include "utils/inet.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" /* Default selectivity for the inet overlap operator */ #define DEFAULT_OVERLAP_SEL 0.01 /* Default selectivity for the various inclusion operators */ #define DEFAULT_INCLUSION_SEL 0.005 /* Default selectivity for specified operator */ #define DEFAULT_SEL(operator) \ ((operator) == OID_INET_OVERLAP_OP ? \ DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL) /* Maximum number of items to consider in join selectivity calculations */ #define MAX_CONSIDERED_ELEMS 1024 static Selectivity networkjoinsel_inner(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2); static Selectivity networkjoinsel_semi(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2); static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues); static Selectivity inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue, int opr_codenum); static Selectivity inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues, Oid operator); static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues, Datum *hist_values, int hist_nvalues, int opr_codenum); static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues, Datum *hist2_values, int hist2_nvalues, int opr_codenum); static Selectivity inet_semi_join_sel(Datum lhs_value, bool mcv_exists, Datum *mcv_values, int mcv_nvalues, bool hist_exists, Datum *hist_values, int hist_nvalues, double hist_weight, FmgrInfo *proc, int opr_codenum); static int inet_opr_codenum(Oid operator); static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum); static int inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum); static int inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum); /* * Selectivity estimation for the subnet inclusion/overlap operators */ Datum networksel(PG_FUNCTION_ARGS) { PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); VariableStatData vardata; Node *other; bool varonleft; Selectivity selec, mcv_selec, non_mcv_selec; Datum constvalue; Form_pg_statistic stats; AttStatsSlot hslot; double sumcommon, nullfrac; FmgrInfo proc; /* * If expression is not (variable op something) or (something op * variable), then punt and return a default estimate. */ if (!get_restriction_variable(root, args, varRelid, &vardata, &other, &varonleft)) PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); /* * Can't do anything useful if the something is not a constant, either. */ if (!IsA(other, Const)) { ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); } /* All of the operators handled here are strict. */ if (((Const *) other)->constisnull) { ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(0.0); } constvalue = ((Const *) other)->constvalue; /* Otherwise, we need stats in order to produce a non-default estimate. */ if (!HeapTupleIsValid(vardata.statsTuple)) { ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); } stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); nullfrac = stats->stanullfrac; /* * If we have most-common-values info, add up the fractions of the MCV * entries that satisfy MCV OP CONST. These fractions contribute directly * to the result selectivity. Also add up the total fraction represented * by MCV entries. */ fmgr_info(get_opcode(operator), &proc); mcv_selec = mcv_selectivity(&vardata, &proc, InvalidOid, constvalue, varonleft, &sumcommon); /* * If we have a histogram, use it to estimate the proportion of the * non-MCV population that satisfies the clause. If we don't, apply the * default selectivity to that population. */ if (get_attstatsslot(&hslot, vardata.statsTuple, STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES)) { int opr_codenum = inet_opr_codenum(operator); /* Commute if needed, so we can consider histogram to be on the left */ if (!varonleft) opr_codenum = -opr_codenum; non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues, constvalue, opr_codenum); free_attstatsslot(&hslot); } else non_mcv_selec = DEFAULT_SEL(operator); /* Combine selectivities for MCV and non-MCV populations */ selec = mcv_selec + (1.0 - nullfrac - sumcommon) * non_mcv_selec; /* Result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(selec); } /* * Join selectivity estimation for the subnet inclusion/overlap operators * * This function has the same structure as eqjoinsel() in selfuncs.c. * * Throughout networkjoinsel and its subroutines, we have a performance issue * in that the amount of work to be done is O(N^2) in the length of the MCV * and histogram arrays. To keep the runtime from getting out of hand when * large statistics targets have been set, we arbitrarily limit the number of * values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this * is easy: just consider at most the first N elements. (Since the MCVs are * sorted by decreasing frequency, this correctly gets us the first N MCVs.) * For the histogram arrays, we decimate; that is consider only every k'th * element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS * elements are considered. This should still give us a good random sample of * the non-MCV population. Decimation is done on-the-fly in the loops that * iterate over the histogram arrays. */ Datum networkjoinsel(PG_FUNCTION_ARGS) { PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); #ifdef NOT_USED JoinType jointype = (JoinType) PG_GETARG_INT16(3); #endif SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); double selec; VariableStatData vardata1; VariableStatData vardata2; bool join_is_reversed; get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &join_is_reversed); switch (sjinfo->jointype) { case JOIN_INNER: case JOIN_LEFT: case JOIN_FULL: /* * Selectivity for left/full join is not exactly the same as inner * join, but we neglect the difference, as eqjoinsel does. */ selec = networkjoinsel_inner(operator, &vardata1, &vardata2); break; case JOIN_SEMI: case JOIN_ANTI: /* Here, it's important that we pass the outer var on the left. */ if (!join_is_reversed) selec = networkjoinsel_semi(operator, &vardata1, &vardata2); else selec = networkjoinsel_semi(get_commutator(operator), &vardata2, &vardata1); break; default: /* other values not expected here */ elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype); selec = 0; /* keep compiler quiet */ break; } ReleaseVariableStats(vardata1); ReleaseVariableStats(vardata2); CLAMP_PROBABILITY(selec); PG_RETURN_FLOAT8((float8) selec); } /* * Inner join selectivity estimation for subnet inclusion/overlap operators * * Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram * selectivity for join using the subnet inclusion operators. Unlike the * join selectivity function for the equality operator, eqjoinsel_inner(), * one to one matching of the values is not enough. Network inclusion * operators are likely to match many to many, so we must check all pairs. * (Note: it might be possible to exploit understanding of the histogram's * btree ordering to reduce the work needed, but we don't currently try.) * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner(). */ static Selectivity networkjoinsel_inner(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; double nullfrac1 = 0.0, nullfrac2 = 0.0; Selectivity selec = 0.0, sumcommon1 = 0.0, sumcommon2 = 0.0; bool mcv1_exists = false, mcv2_exists = false, hist1_exists = false, hist2_exists = false; int opr_codenum; int mcv1_length = 0, mcv2_length = 0; AttStatsSlot mcv1_slot; AttStatsSlot mcv2_slot; AttStatsSlot hist1_slot; AttStatsSlot hist2_slot; if (HeapTupleIsValid(vardata1->statsTuple)) { stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); nullfrac1 = stats->stanullfrac; mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple, STATISTIC_KIND_MCV, InvalidOid, ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple, STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES); /* Arbitrarily limit number of MCVs considered */ mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS); if (mcv1_exists) sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length); } else { memset(&mcv1_slot, 0, sizeof(mcv1_slot)); memset(&hist1_slot, 0, sizeof(hist1_slot)); } if (HeapTupleIsValid(vardata2->statsTuple)) { stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); nullfrac2 = stats->stanullfrac; mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple, STATISTIC_KIND_MCV, InvalidOid, ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple, STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES); /* Arbitrarily limit number of MCVs considered */ mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS); if (mcv2_exists) sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length); } else { memset(&mcv2_slot, 0, sizeof(mcv2_slot)); memset(&hist2_slot, 0, sizeof(hist2_slot)); } opr_codenum = inet_opr_codenum(operator); /* * Calculate selectivity for MCV vs MCV matches. */ if (mcv1_exists && mcv2_exists) selec += inet_mcv_join_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length, mcv2_slot.values, mcv2_slot.numbers, mcv2_length, operator); /* * Add in selectivities for MCV vs histogram matches, scaling according to * the fractions of the populations represented by the histograms. Note * that the second case needs to commute the operator. */ if (mcv1_exists && hist2_exists) selec += (1.0 - nullfrac2 - sumcommon2) * inet_mcv_hist_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length, hist2_slot.values, hist2_slot.nvalues, opr_codenum); if (mcv2_exists && hist1_exists) selec += (1.0 - nullfrac1 - sumcommon1) * inet_mcv_hist_sel(mcv2_slot.values, mcv2_slot.numbers, mcv2_length, hist1_slot.values, hist1_slot.nvalues, -opr_codenum); /* * Add in selectivity for histogram vs histogram matches, again scaling * appropriately. */ if (hist1_exists && hist2_exists) selec += (1.0 - nullfrac1 - sumcommon1) * (1.0 - nullfrac2 - sumcommon2) * inet_hist_inclusion_join_sel(hist1_slot.values, hist1_slot.nvalues, hist2_slot.values, hist2_slot.nvalues, opr_codenum); /* * If useful statistics are not available then use the default estimate. * We can apply null fractions if known, though. */ if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists)) selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator); /* Release stats. */ free_attstatsslot(&mcv1_slot); free_attstatsslot(&mcv2_slot); free_attstatsslot(&hist1_slot); free_attstatsslot(&hist2_slot); return selec; } /* * Semi join selectivity estimation for subnet inclusion/overlap operators * * Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs * histogram selectivity for semi/anti join cases. */ static Selectivity networkjoinsel_semi(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; Selectivity selec = 0.0, sumcommon1 = 0.0, sumcommon2 = 0.0; double nullfrac1 = 0.0, nullfrac2 = 0.0, hist2_weight = 0.0; bool mcv1_exists = false, mcv2_exists = false, hist1_exists = false, hist2_exists = false; int opr_codenum; FmgrInfo proc; int i, mcv1_length = 0, mcv2_length = 0; AttStatsSlot mcv1_slot; AttStatsSlot mcv2_slot; AttStatsSlot hist1_slot; AttStatsSlot hist2_slot; if (HeapTupleIsValid(vardata1->statsTuple)) { stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); nullfrac1 = stats->stanullfrac; mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple, STATISTIC_KIND_MCV, InvalidOid, ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple, STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES); /* Arbitrarily limit number of MCVs considered */ mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS); if (mcv1_exists) sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length); } else { memset(&mcv1_slot, 0, sizeof(mcv1_slot)); memset(&hist1_slot, 0, sizeof(hist1_slot)); } if (HeapTupleIsValid(vardata2->statsTuple)) { stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); nullfrac2 = stats->stanullfrac; mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple, STATISTIC_KIND_MCV, InvalidOid, ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple, STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES); /* Arbitrarily limit number of MCVs considered */ mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS); if (mcv2_exists) sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length); } else { memset(&mcv2_slot, 0, sizeof(mcv2_slot)); memset(&hist2_slot, 0, sizeof(hist2_slot)); } opr_codenum = inet_opr_codenum(operator); fmgr_info(get_opcode(operator), &proc); /* Estimate number of input rows represented by RHS histogram. */ if (hist2_exists && vardata2->rel) hist2_weight = (1.0 - nullfrac2 - sumcommon2) * vardata2->rel->rows; /* * Consider each element of the LHS MCV list, matching it to whatever RHS * stats we have. Scale according to the known frequency of the MCV. */ if (mcv1_exists && (mcv2_exists || hist2_exists)) { for (i = 0; i < mcv1_length; i++) { selec += mcv1_slot.numbers[i] * inet_semi_join_sel(mcv1_slot.values[i], mcv2_exists, mcv2_slot.values, mcv2_length, hist2_exists, hist2_slot.values, hist2_slot.nvalues, hist2_weight, &proc, opr_codenum); } } /* * Consider each element of the LHS histogram, except for the first and * last elements, which we exclude on the grounds that they're outliers * and thus not very representative. Scale on the assumption that each * such histogram element represents an equal share of the LHS histogram * population (which is a bit bogus, because the members of its bucket may * not all act the same with respect to the join clause, but it's hard to * do better). * * If there are too many histogram elements, decimate to limit runtime. */ if (hist1_exists && hist1_slot.nvalues > 2 && (mcv2_exists || hist2_exists)) { double hist_selec_sum = 0.0; int k, n; k = (hist1_slot.nvalues - 3) / MAX_CONSIDERED_ELEMS + 1; n = 0; for (i = 1; i < hist1_slot.nvalues - 1; i += k) { hist_selec_sum += inet_semi_join_sel(hist1_slot.values[i], mcv2_exists, mcv2_slot.values, mcv2_length, hist2_exists, hist2_slot.values, hist2_slot.nvalues, hist2_weight, &proc, opr_codenum); n++; } selec += (1.0 - nullfrac1 - sumcommon1) * hist_selec_sum / n; } /* * If useful statistics are not available then use the default estimate. * We can apply null fractions if known, though. */ if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists)) selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator); /* Release stats. */ free_attstatsslot(&mcv1_slot); free_attstatsslot(&mcv2_slot); free_attstatsslot(&hist1_slot); free_attstatsslot(&hist2_slot); return selec; } /* * Compute the fraction of a relation's population that is represented * by the MCV list. */ static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues) { Selectivity sumcommon = 0.0; int i; for (i = 0; i < mcv_nvalues; i++) { sumcommon += mcv_numbers[i]; } return sumcommon; } /* * Inet histogram vs single value selectivity estimation * * Estimate the fraction of the histogram population that satisfies * "value OPR CONST". (The result needs to be scaled to reflect the * proportion of the total population represented by the histogram.) * * The histogram is originally for the inet btree comparison operators. * Only the common bits of the network part and the length of the network part * (masklen) are interesting for the subnet inclusion operators. Fortunately, * btree comparison treats the network part as the major sort key. Even so, * the length of the network part would not really be significant in the * histogram. This would lead to big mistakes for data sets with uneven * masklen distribution. To reduce this problem, comparisons with the left * and the right sides of the buckets are used together. * * Histogram bucket matches are calculated in two forms. If the constant * matches both bucket endpoints the bucket is considered as fully matched. * The second form is to match the bucket partially; we recognize this when * the constant matches just one endpoint, or the two endpoints fall on * opposite sides of the constant. (Note that when the constant matches an * interior histogram element, it gets credit for partial matches to the * buckets on both sides, while a match to a histogram endpoint gets credit * for only one partial match. This is desirable.) * * The divider in the partial bucket match is imagined as the distance * between the decisive bits and the common bits of the addresses. It will * be used as a power of two as it is the natural scale for the IP network * inclusion. This partial bucket match divider calculation is an empirical * formula and subject to change with more experiment. * * For a partial match, we try to calculate dividers for both of the * boundaries. If the address family of a boundary value does not match the * constant or comparison of the length of the network parts is not correct * for the operator, the divider for that boundary will not be taken into * account. If both of the dividers are valid, the greater one will be used * to minimize the mistake in buckets that have disparate masklens. This * calculation is unfair when dividers can be calculated for both of the * boundaries but they are far from each other; but it is not a common * situation as the boundaries are expected to share most of their significant * bits of their masklens. The mistake would be greater, if we would use the * minimum instead of the maximum, and we don't know a sensible way to combine * them. * * For partial match in buckets that have different address families on the * left and right sides, only the boundary with the same address family is * taken into consideration. This can cause more mistakes for these buckets * if the masklens of their boundaries are also disparate. But this can only * happen in one bucket, since only two address families exist. It seems a * better option than not considering these buckets at all. */ static Selectivity inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue, int opr_codenum) { Selectivity match = 0.0; inet *query, *left, *right; int i, k, n; int left_order, right_order, left_divider, right_divider; /* guard against zero-divide below */ if (nvalues <= 1) return 0.0; /* if there are too many histogram elements, decimate to limit runtime */ k = (nvalues - 2) / MAX_CONSIDERED_ELEMS + 1; query = DatumGetInetPP(constvalue); /* "left" is the left boundary value of the current bucket ... */ left = DatumGetInetPP(values[0]); left_order = inet_inclusion_cmp(left, query, opr_codenum); n = 0; for (i = k; i < nvalues; i += k) { /* ... and "right" is the right boundary value */ right = DatumGetInetPP(values[i]); right_order = inet_inclusion_cmp(right, query, opr_codenum); if (left_order == 0 && right_order == 0) { /* The whole bucket matches, since both endpoints do. */ match += 1.0; } else if ((left_order <= 0 && right_order >= 0) || (left_order >= 0 && right_order <= 0)) { /* Partial bucket match. */ left_divider = inet_hist_match_divider(left, query, opr_codenum); right_divider = inet_hist_match_divider(right, query, opr_codenum); if (left_divider >= 0 || right_divider >= 0) match += 1.0 / pow(2.0, Max(left_divider, right_divider)); } /* Shift the variables. */ left = right; left_order = right_order; /* Count the number of buckets considered. */ n++; } return match / n; } /* * Inet MCV vs MCV join selectivity estimation * * We simply add up the fractions of the populations that satisfy the clause. * The result is exact and does not need to be scaled further. */ static Selectivity inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues, Oid operator) { Selectivity selec = 0.0; FmgrInfo proc; int i, j; fmgr_info(get_opcode(operator), &proc); for (i = 0; i < mcv1_nvalues; i++) { for (j = 0; j < mcv2_nvalues; j++) if (DatumGetBool(FunctionCall2(&proc, mcv1_values[i], mcv2_values[j]))) selec += mcv1_numbers[i] * mcv2_numbers[j]; } return selec; } /* * Inet MCV vs histogram join selectivity estimation * * For each MCV on the lefthand side, estimate the fraction of the righthand's * histogram population that satisfies the join clause, and add those up, * scaling by the MCV's frequency. The result still needs to be scaled * according to the fraction of the righthand's population represented by * the histogram. */ static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues, Datum *hist_values, int hist_nvalues, int opr_codenum) { Selectivity selec = 0.0; int i; /* * We'll call inet_hist_value_selec with the histogram on the left, so we * must commute the operator. */ opr_codenum = -opr_codenum; for (i = 0; i < mcv_nvalues; i++) { selec += mcv_numbers[i] * inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i], opr_codenum); } return selec; } /* * Inet histogram vs histogram join selectivity estimation * * Here, we take all values listed in the second histogram (except for the * first and last elements, which are excluded on the grounds of possibly * not being very representative) and treat them as a uniform sample of * the non-MCV population for that relation. For each one, we apply * inet_hist_value_selec to see what fraction of the first histogram * it matches. * * We could alternatively do this the other way around using the operator's * commutator. XXX would it be worthwhile to do it both ways and take the * average? That would at least avoid non-commutative estimation results. */ static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues, Datum *hist2_values, int hist2_nvalues, int opr_codenum) { double match = 0.0; int i, k, n; if (hist2_nvalues <= 2) return 0.0; /* no interior histogram elements */ /* if there are too many histogram elements, decimate to limit runtime */ k = (hist2_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1; n = 0; for (i = 1; i < hist2_nvalues - 1; i += k) { match += inet_hist_value_sel(hist1_values, hist1_nvalues, hist2_values[i], opr_codenum); n++; } return match / n; } /* * Inet semi join selectivity estimation for one value * * The function calculates the probability that there is at least one row * in the RHS table that satisfies the "lhs_value op column" condition. * It is used in semi join estimation to check a sample from the left hand * side table. * * The MCV and histogram from the right hand side table should be provided as * arguments with the lhs_value from the left hand side table for the join. * hist_weight is the total number of rows represented by the histogram. * For example, if the table has 1000 rows, and 10% of the rows are in the MCV * list, and another 10% are NULLs, hist_weight would be 800. * * First, the lhs_value will be matched to the most common values. If it * matches any of them, 1.0 will be returned, because then there is surely * a match. * * Otherwise, the histogram will be used to estimate the number of rows in * the second table that match the condition. If the estimate is greater * than 1.0, 1.0 will be returned, because it means there is a greater chance * that the lhs_value will match more than one row in the table. If it is * between 0.0 and 1.0, it will be returned as the probability. */ static Selectivity inet_semi_join_sel(Datum lhs_value, bool mcv_exists, Datum *mcv_values, int mcv_nvalues, bool hist_exists, Datum *hist_values, int hist_nvalues, double hist_weight, FmgrInfo *proc, int opr_codenum) { if (mcv_exists) { int i; for (i = 0; i < mcv_nvalues; i++) { if (DatumGetBool(FunctionCall2(proc, lhs_value, mcv_values[i]))) return 1.0; } } if (hist_exists && hist_weight > 0) { Selectivity hist_selec; /* Commute operator, since we're passing lhs_value on the right */ hist_selec = inet_hist_value_sel(hist_values, hist_nvalues, lhs_value, -opr_codenum); if (hist_selec > 0) return Min(1.0, hist_weight * hist_selec); } return 0.0; } /* * Assign useful code numbers for the subnet inclusion/overlap operators * * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend * on the exact codes assigned here; but many other places in this file * know that they can negate a code to obtain the code for the commutator * operator. */ static int inet_opr_codenum(Oid operator) { switch (operator) { case OID_INET_SUP_OP: return -2; case OID_INET_SUPEQ_OP: return -1; case OID_INET_OVERLAP_OP: return 0; case OID_INET_SUBEQ_OP: return 1; case OID_INET_SUB_OP: return 2; default: elog(ERROR, "unrecognized operator %u for inet selectivity", operator); } return 0; /* unreached, but keep compiler quiet */ } /* * Comparison function for the subnet inclusion/overlap operators * * If the comparison is okay for the specified inclusion operator, the return * value will be 0. Otherwise the return value will be less than or greater * than 0 as appropriate for the operator. * * Comparison is compatible with the basic comparison function for the inet * type. See network_cmp_internal() in network.c for the original. Basic * comparison operators are implemented with the network_cmp_internal() * function. It is possible to implement the subnet inclusion operators with * this function. * * Comparison is first on the common bits of the network part, then on the * length of the network part (masklen) as in the network_cmp_internal() * function. Only the first part is in this function. The second part is * separated to another function for reusability. The difference between the * second part and the original network_cmp_internal() is that the inclusion * operator is considered while comparing the lengths of the network parts. * See the inet_masklen_inclusion_cmp() function below. */ static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum) { if (ip_family(left) == ip_family(right)) { int order; order = bitncmp(ip_addr(left), ip_addr(right), Min(ip_bits(left), ip_bits(right))); if (order != 0) return order; return inet_masklen_inclusion_cmp(left, right, opr_codenum); } return ip_family(left) - ip_family(right); } /* * Masklen comparison function for the subnet inclusion/overlap operators * * Compares the lengths of the network parts of the inputs. If the comparison * is okay for the specified inclusion operator, the return value will be 0. * Otherwise the return value will be less than or greater than 0 as * appropriate for the operator. */ static int inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum) { int order; order = (int) ip_bits(left) - (int) ip_bits(right); /* * Return 0 if the operator would accept this combination of masklens. * Note that opr_codenum zero (overlaps) will accept all cases. */ if ((order > 0 && opr_codenum >= 0) || (order == 0 && opr_codenum >= -1 && opr_codenum <= 1) || (order < 0 && opr_codenum <= 0)) return 0; /* * Otherwise, return a negative value for sup/supeq (notionally, the RHS * needs to have a larger masklen than it has, which would make it sort * later), or a positive value for sub/subeq (vice versa). */ return opr_codenum; } /* * Inet histogram partial match divider calculation * * First the families and the lengths of the network parts are compared using * the subnet inclusion operator. If those are acceptable for the operator, * the divider will be calculated using the masklens and the common bits of * the addresses. -1 will be returned if it cannot be calculated. * * See commentary for inet_hist_value_sel() for some rationale for this. */ static int inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum) { if (ip_family(boundary) == ip_family(query) && inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == 0) { int min_bits, decisive_bits; min_bits = Min(ip_bits(boundary), ip_bits(query)); /* * Set decisive_bits to the masklen of the one that should contain the * other according to the operator. */ if (opr_codenum < 0) decisive_bits = ip_bits(boundary); else if (opr_codenum > 0) decisive_bits = ip_bits(query); else decisive_bits = min_bits; /* * Now return the number of non-common decisive bits. (This will be * zero if the boundary and query in fact match, else positive.) */ if (min_bits > 0) return decisive_bits - bitncommon(ip_addr(boundary), ip_addr(query), min_bits); return decisive_bits; } return -1; }