diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:19:15 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:19:15 +0000 |
commit | 6eb9c5a5657d1fe77b55cc261450f3538d35a94d (patch) | |
tree | 657d8194422a5daccecfd42d654b8a245ef7b4c8 /src/backend/partitioning | |
parent | Initial commit. (diff) | |
download | postgresql-13-6eb9c5a5657d1fe77b55cc261450f3538d35a94d.tar.xz postgresql-13-6eb9c5a5657d1fe77b55cc261450f3538d35a94d.zip |
Adding upstream version 13.4.upstream/13.4upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/partitioning')
-rw-r--r-- | src/backend/partitioning/Makefile | 20 | ||||
-rw-r--r-- | src/backend/partitioning/partbounds.c | 4817 | ||||
-rw-r--r-- | src/backend/partitioning/partdesc.c | 368 | ||||
-rw-r--r-- | src/backend/partitioning/partprune.c | 3589 |
4 files changed, 8794 insertions, 0 deletions
diff --git a/src/backend/partitioning/Makefile b/src/backend/partitioning/Makefile new file mode 100644 index 0000000..a73fd14 --- /dev/null +++ b/src/backend/partitioning/Makefile @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for backend/partitioning +# +# IDENTIFICATION +# src/backend/partitioning/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/partitioning +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + partbounds.o \ + partdesc.o \ + partprune.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c new file mode 100644 index 0000000..7613ebb --- /dev/null +++ b/src/backend/partitioning/partbounds.c @@ -0,0 +1,4817 @@ +/*------------------------------------------------------------------------- + * + * partbounds.c + * Support routines for manipulating partition bounds + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/partitioning/partbounds.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "access/table.h" +#include "access/tableam.h" +#include "catalog/partition.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_type.h" +#include "commands/tablecmds.h" +#include "common/hashfn.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "nodes/pathnodes.h" +#include "parser/parse_coerce.h" +#include "partitioning/partbounds.h" +#include "partitioning/partdesc.h" +#include "partitioning/partprune.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/partcache.h" +#include "utils/ruleutils.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + +/* + * When qsort'ing partition bounds after reading from the catalog, each bound + * is represented with one of the following structs. + */ + +/* One bound of a hash partition */ +typedef struct PartitionHashBound +{ + int modulus; + int remainder; + int index; +} PartitionHashBound; + +/* One value coming from some (index'th) list partition */ +typedef struct PartitionListValue +{ + int index; + Datum value; +} PartitionListValue; + +/* One bound of a range partition */ +typedef struct PartitionRangeBound +{ + int index; + Datum *datums; /* range bound datums */ + PartitionRangeDatumKind *kind; /* the kind of each datum */ + bool lower; /* this is the lower (vs upper) bound */ +} PartitionRangeBound; + +/* + * Mapping from partitions of a joining relation to partitions of a join + * relation being computed (a.k.a merged partitions) + */ +typedef struct PartitionMap +{ + int nparts; /* number of partitions */ + int *merged_indexes; /* indexes of merged partitions */ + bool *merged; /* flags to indicate whether partitions are + * merged with non-dummy partitions */ + bool did_remapping; /* did we re-map partitions? */ + int *old_indexes; /* old indexes of merged partitions if + * did_remapping */ +} PartitionMap; + +/* Macro for comparing two range bounds */ +#define compare_range_bounds(partnatts, partsupfunc, partcollations, \ + bound1, bound2) \ + (partition_rbound_cmp(partnatts, partsupfunc, partcollations, \ + (bound1)->datums, (bound1)->kind, (bound1)->lower, \ + bound2)) + +static int32 qsort_partition_hbound_cmp(const void *a, const void *b); +static int32 qsort_partition_list_value_cmp(const void *a, const void *b, + void *arg); +static int32 qsort_partition_rbound_cmp(const void *a, const void *b, + void *arg); +static PartitionBoundInfo create_hash_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); +static PartitionBoundInfo create_list_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); +static PartitionBoundInfo create_range_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); +static PartitionBoundInfo merge_list_bounds(FmgrInfo *partsupfunc, + Oid *collations, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + JoinType jointype, + List **outer_parts, + List **inner_parts); +static PartitionBoundInfo merge_range_bounds(int partnatts, + FmgrInfo *partsupfuncs, + Oid *partcollations, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + JoinType jointype, + List **outer_parts, + List **inner_parts); +static void init_partition_map(RelOptInfo *rel, PartitionMap *map); +static void free_partition_map(PartitionMap *map); +static bool is_dummy_partition(RelOptInfo *rel, int part_index); +static int merge_matching_partitions(PartitionMap *outer_map, + PartitionMap *inner_map, + int outer_part, + int inner_part, + int *next_index); +static int process_outer_partition(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_default, + bool inner_has_default, + int outer_index, + int inner_default, + JoinType jointype, + int *next_index, + int *default_index); +static int process_inner_partition(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_default, + bool inner_has_default, + int inner_index, + int outer_default, + JoinType jointype, + int *next_index, + int *default_index); +static void merge_null_partitions(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_null, + bool inner_has_null, + int outer_null, + int inner_null, + JoinType jointype, + int *next_index, + int *null_index); +static void merge_default_partitions(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_default, + bool inner_has_default, + int outer_default, + int inner_default, + JoinType jointype, + int *next_index, + int *default_index); +static int merge_partition_with_dummy(PartitionMap *map, int index, + int *next_index); +static void fix_merged_indexes(PartitionMap *outer_map, + PartitionMap *inner_map, + int nmerged, List *merged_indexes); +static void generate_matching_part_pairs(RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + PartitionMap *outer_map, + PartitionMap *inner_map, + int nmerged, + List **outer_parts, + List **inner_parts); +static PartitionBoundInfo build_merged_partition_bounds(char strategy, + List *merged_datums, + List *merged_kinds, + List *merged_indexes, + int null_index, + int default_index); +static int get_range_partition(RelOptInfo *rel, + PartitionBoundInfo bi, + int *lb_pos, + PartitionRangeBound *lb, + PartitionRangeBound *ub); +static int get_range_partition_internal(PartitionBoundInfo bi, + int *lb_pos, + PartitionRangeBound *lb, + PartitionRangeBound *ub); +static bool compare_range_partitions(int partnatts, FmgrInfo *partsupfuncs, + Oid *partcollations, + PartitionRangeBound *outer_lb, + PartitionRangeBound *outer_ub, + PartitionRangeBound *inner_lb, + PartitionRangeBound *inner_ub, + int *lb_cmpval, int *ub_cmpval); +static void get_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs, + Oid *partcollations, JoinType jointype, + PartitionRangeBound *outer_lb, + PartitionRangeBound *outer_ub, + PartitionRangeBound *inner_lb, + PartitionRangeBound *inner_ub, + int lb_cmpval, int ub_cmpval, + PartitionRangeBound *merged_lb, + PartitionRangeBound *merged_ub); +static void add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs, + Oid *partcollations, + PartitionRangeBound *merged_lb, + PartitionRangeBound *merged_ub, + int merged_index, + List **merged_datums, + List **merged_kinds, + List **merged_indexes); +static PartitionRangeBound *make_one_partition_rbound(PartitionKey key, int index, + List *datums, bool lower); +static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2, + int remainder2); +static int32 partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, Datum *datums1, + PartitionRangeDatumKind *kind1, bool lower1, + PartitionRangeBound *b2); +static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, + PartitionBoundInfo boundinfo, + PartitionRangeBound *probe, bool *is_equal); +static Expr *make_partition_op_expr(PartitionKey key, int keynum, + uint16 strategy, Expr *arg1, Expr *arg2); +static Oid get_partition_operator(PartitionKey key, int col, + StrategyNumber strategy, bool *need_relabel); +static List *get_qual_for_hash(Relation parent, PartitionBoundSpec *spec); +static List *get_qual_for_list(Relation parent, PartitionBoundSpec *spec); +static List *get_qual_for_range(Relation parent, PartitionBoundSpec *spec, + bool for_default); +static void get_range_key_properties(PartitionKey key, int keynum, + PartitionRangeDatum *ldatum, + PartitionRangeDatum *udatum, + ListCell **partexprs_item, + Expr **keyCol, + Const **lower_val, Const **upper_val); +static List *get_range_nulltest(PartitionKey key); + +/* + * get_qual_from_partbound + * Given a parser node for partition bound, return the list of executable + * expressions as partition constraint + */ +List * +get_qual_from_partbound(Relation rel, Relation parent, + PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + List *my_qual = NIL; + + Assert(key != NULL); + + switch (key->strategy) + { + case PARTITION_STRATEGY_HASH: + Assert(spec->strategy == PARTITION_STRATEGY_HASH); + my_qual = get_qual_for_hash(parent, spec); + break; + + case PARTITION_STRATEGY_LIST: + Assert(spec->strategy == PARTITION_STRATEGY_LIST); + my_qual = get_qual_for_list(parent, spec); + break; + + case PARTITION_STRATEGY_RANGE: + Assert(spec->strategy == PARTITION_STRATEGY_RANGE); + my_qual = get_qual_for_range(parent, spec, false); + break; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + return my_qual; +} + +/* + * partition_bounds_create + * Build a PartitionBoundInfo struct from a list of PartitionBoundSpec + * nodes + * + * This function creates a PartitionBoundInfo and fills the values of its + * various members based on the input list. Importantly, 'datums' array will + * contain Datum representation of individual bounds (possibly after + * de-duplication as in case of range bounds), sorted in a canonical order + * defined by qsort_partition_* functions of respective partitioning methods. + * 'indexes' array will contain as many elements as there are bounds (specific + * exceptions to this rule are listed in the function body), which represent + * the 0-based canonical positions of partitions. + * + * Upon return from this function, *mapping is set to an array of + * list_length(boundspecs) elements, each of which maps the original index of + * a partition to its canonical index. + * + * Note: The objects returned by this function are wholly allocated in the + * current memory context. + */ +PartitionBoundInfo +partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) +{ + int i; + + Assert(nparts > 0); + + /* + * For each partitioning method, we first convert the partition bounds + * from their parser node representation to the internal representation, + * along with any additional preprocessing (such as de-duplicating range + * bounds). Resulting bound datums are then added to the 'datums' array + * in PartitionBoundInfo. For each datum added, an integer indicating the + * canonical partition index is added to the 'indexes' array. + * + * For each bound, we remember its partition's position (0-based) in the + * original list to later map it to the canonical index. + */ + + /* + * Initialize mapping array with invalid values, this is filled within + * each sub-routine below depending on the bound type. + */ + *mapping = (int *) palloc(sizeof(int) * nparts); + for (i = 0; i < nparts; i++) + (*mapping)[i] = -1; + + switch (key->strategy) + { + case PARTITION_STRATEGY_HASH: + return create_hash_bounds(boundspecs, nparts, key, mapping); + + case PARTITION_STRATEGY_LIST: + return create_list_bounds(boundspecs, nparts, key, mapping); + + case PARTITION_STRATEGY_RANGE: + return create_range_bounds(boundspecs, nparts, key, mapping); + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + break; + } + + Assert(false); + return NULL; /* keep compiler quiet */ +} + +/* + * create_hash_bounds + * Create a PartitionBoundInfo for a hash partitioned table + */ +static PartitionBoundInfo +create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) +{ + PartitionBoundInfo boundinfo; + PartitionHashBound **hbounds = NULL; + int i; + int ndatums = 0; + int greatest_modulus; + + boundinfo = (PartitionBoundInfoData *) + palloc0(sizeof(PartitionBoundInfoData)); + boundinfo->strategy = key->strategy; + /* No special hash partitions. */ + boundinfo->null_index = -1; + boundinfo->default_index = -1; + + ndatums = nparts; + hbounds = (PartitionHashBound **) + palloc(nparts * sizeof(PartitionHashBound *)); + + /* Convert from node to the internal representation */ + for (i = 0; i < nparts; i++) + { + PartitionBoundSpec *spec = boundspecs[i]; + + if (spec->strategy != PARTITION_STRATEGY_HASH) + elog(ERROR, "invalid strategy in partition bound spec"); + + hbounds[i] = (PartitionHashBound *) palloc(sizeof(PartitionHashBound)); + hbounds[i]->modulus = spec->modulus; + hbounds[i]->remainder = spec->remainder; + hbounds[i]->index = i; + } + + /* Sort all the bounds in ascending order */ + qsort(hbounds, nparts, sizeof(PartitionHashBound *), + qsort_partition_hbound_cmp); + + /* After sorting, moduli are now stored in ascending order. */ + greatest_modulus = hbounds[ndatums - 1]->modulus; + + boundinfo->ndatums = ndatums; + boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); + boundinfo->nindexes = greatest_modulus; + boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int)); + for (i = 0; i < greatest_modulus; i++) + boundinfo->indexes[i] = -1; + + /* + * For hash partitioning, there are as many datums (modulus and remainder + * pairs) as there are partitions. Indexes are simply values ranging from + * 0 to (nparts - 1). + */ + for (i = 0; i < nparts; i++) + { + int modulus = hbounds[i]->modulus; + int remainder = hbounds[i]->remainder; + + boundinfo->datums[i] = (Datum *) palloc(2 * sizeof(Datum)); + boundinfo->datums[i][0] = Int32GetDatum(modulus); + boundinfo->datums[i][1] = Int32GetDatum(remainder); + + while (remainder < greatest_modulus) + { + /* overlap? */ + Assert(boundinfo->indexes[remainder] == -1); + boundinfo->indexes[remainder] = i; + remainder += modulus; + } + + (*mapping)[hbounds[i]->index] = i; + pfree(hbounds[i]); + } + pfree(hbounds); + + return boundinfo; +} + +/* + * create_list_bounds + * Create a PartitionBoundInfo for a list partitioned table + */ +static PartitionBoundInfo +create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) +{ + PartitionBoundInfo boundinfo; + PartitionListValue **all_values = NULL; + ListCell *cell; + int i = 0; + int ndatums = 0; + int next_index = 0; + int default_index = -1; + int null_index = -1; + List *non_null_values = NIL; + + boundinfo = (PartitionBoundInfoData *) + palloc0(sizeof(PartitionBoundInfoData)); + boundinfo->strategy = key->strategy; + /* Will be set correctly below. */ + boundinfo->null_index = -1; + boundinfo->default_index = -1; + + /* Create a unified list of non-null values across all partitions. */ + for (i = 0; i < nparts; i++) + { + PartitionBoundSpec *spec = boundspecs[i]; + ListCell *c; + + if (spec->strategy != PARTITION_STRATEGY_LIST) + elog(ERROR, "invalid strategy in partition bound spec"); + + /* + * Note the index of the partition bound spec for the default + * partition. There's no datum to add to the list on non-null datums + * for this partition. + */ + if (spec->is_default) + { + default_index = i; + continue; + } + + foreach(c, spec->listdatums) + { + Const *val = castNode(Const, lfirst(c)); + PartitionListValue *list_value = NULL; + + if (!val->constisnull) + { + list_value = (PartitionListValue *) + palloc0(sizeof(PartitionListValue)); + list_value->index = i; + list_value->value = val->constvalue; + } + else + { + /* + * Never put a null into the values array; save the index of + * the partition that stores nulls, instead. + */ + if (null_index != -1) + elog(ERROR, "found null more than once"); + null_index = i; + } + + if (list_value) + non_null_values = lappend(non_null_values, list_value); + } + } + + ndatums = list_length(non_null_values); + + /* + * Collect all list values in one array. Alongside the value, we also save + * the index of partition the value comes from. + */ + all_values = (PartitionListValue **) + palloc(ndatums * sizeof(PartitionListValue *)); + i = 0; + foreach(cell, non_null_values) + { + PartitionListValue *src = lfirst(cell); + + all_values[i] = (PartitionListValue *) + palloc(sizeof(PartitionListValue)); + all_values[i]->value = src->value; + all_values[i]->index = src->index; + i++; + } + + qsort_arg(all_values, ndatums, sizeof(PartitionListValue *), + qsort_partition_list_value_cmp, (void *) key); + + boundinfo->ndatums = ndatums; + boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); + boundinfo->nindexes = ndatums; + boundinfo->indexes = (int *) palloc(ndatums * sizeof(int)); + + /* + * Copy values. Canonical indexes are values ranging from 0 to (nparts - + * 1) assigned to each partition such that all datums of a given partition + * receive the same value. The value for a given partition is the index of + * that partition's smallest datum in the all_values[] array. + */ + for (i = 0; i < ndatums; i++) + { + int orig_index = all_values[i]->index; + + boundinfo->datums[i] = (Datum *) palloc(sizeof(Datum)); + boundinfo->datums[i][0] = datumCopy(all_values[i]->value, + key->parttypbyval[0], + key->parttyplen[0]); + + /* If the old index has no mapping, assign one */ + if ((*mapping)[orig_index] == -1) + (*mapping)[orig_index] = next_index++; + + boundinfo->indexes[i] = (*mapping)[orig_index]; + } + + /* + * Set the canonical value for null_index, if any. + * + * It is possible that the null-accepting partition has not been assigned + * an index yet, which could happen if such partition accepts only null + * and hence not handled in the above loop which only looked at non-null + * values. + */ + if (null_index != -1) + { + Assert(null_index >= 0); + if ((*mapping)[null_index] == -1) + (*mapping)[null_index] = next_index++; + boundinfo->null_index = (*mapping)[null_index]; + } + + /* Set the canonical value for default_index, if any. */ + if (default_index != -1) + { + /* + * The default partition accepts any value not specified in the lists + * of other partitions, hence it should not get mapped index while + * assigning those for non-null datums. + */ + Assert(default_index >= 0); + Assert((*mapping)[default_index] == -1); + (*mapping)[default_index] = next_index++; + boundinfo->default_index = (*mapping)[default_index]; + } + + /* All partitions must now have been assigned canonical indexes. */ + Assert(next_index == nparts); + return boundinfo; +} + +/* + * create_range_bounds + * Create a PartitionBoundInfo for a range partitioned table + */ +static PartitionBoundInfo +create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) +{ + PartitionBoundInfo boundinfo; + PartitionRangeBound **rbounds = NULL; + PartitionRangeBound **all_bounds, + *prev; + int i, + k; + int ndatums = 0; + int default_index = -1; + int next_index = 0; + + boundinfo = (PartitionBoundInfoData *) + palloc0(sizeof(PartitionBoundInfoData)); + boundinfo->strategy = key->strategy; + /* There is no special null-accepting range partition. */ + boundinfo->null_index = -1; + /* Will be set correctly below. */ + boundinfo->default_index = -1; + + all_bounds = (PartitionRangeBound **) + palloc0(2 * nparts * sizeof(PartitionRangeBound *)); + + /* Create a unified list of range bounds across all the partitions. */ + ndatums = 0; + for (i = 0; i < nparts; i++) + { + PartitionBoundSpec *spec = boundspecs[i]; + PartitionRangeBound *lower, + *upper; + + if (spec->strategy != PARTITION_STRATEGY_RANGE) + elog(ERROR, "invalid strategy in partition bound spec"); + + /* + * Note the index of the partition bound spec for the default + * partition. There's no datum to add to the all_bounds array for + * this partition. + */ + if (spec->is_default) + { + default_index = i; + continue; + } + + lower = make_one_partition_rbound(key, i, spec->lowerdatums, true); + upper = make_one_partition_rbound(key, i, spec->upperdatums, false); + all_bounds[ndatums++] = lower; + all_bounds[ndatums++] = upper; + } + + Assert(ndatums == nparts * 2 || + (default_index != -1 && ndatums == (nparts - 1) * 2)); + + /* Sort all the bounds in ascending order */ + qsort_arg(all_bounds, ndatums, + sizeof(PartitionRangeBound *), + qsort_partition_rbound_cmp, + (void *) key); + + /* Save distinct bounds from all_bounds into rbounds. */ + rbounds = (PartitionRangeBound **) + palloc(ndatums * sizeof(PartitionRangeBound *)); + k = 0; + prev = NULL; + for (i = 0; i < ndatums; i++) + { + PartitionRangeBound *cur = all_bounds[i]; + bool is_distinct = false; + int j; + + /* Is the current bound distinct from the previous one? */ + for (j = 0; j < key->partnatts; j++) + { + Datum cmpval; + + if (prev == NULL || cur->kind[j] != prev->kind[j]) + { + is_distinct = true; + break; + } + + /* + * If the bounds are both MINVALUE or MAXVALUE, stop now and treat + * them as equal, since any values after this point must be + * ignored. + */ + if (cur->kind[j] != PARTITION_RANGE_DATUM_VALUE) + break; + + cmpval = FunctionCall2Coll(&key->partsupfunc[j], + key->partcollation[j], + cur->datums[j], + prev->datums[j]); + if (DatumGetInt32(cmpval) != 0) + { + is_distinct = true; + break; + } + } + + /* + * Only if the bound is distinct save it into a temporary array, i.e, + * rbounds which is later copied into boundinfo datums array. + */ + if (is_distinct) + rbounds[k++] = all_bounds[i]; + + prev = cur; + } + + /* Update ndatums to hold the count of distinct datums. */ + ndatums = k; + + /* + * Add datums to boundinfo. Canonical indexes are values ranging from 0 + * to nparts - 1, assigned in that order to each partition's upper bound. + * For 'datums' elements that are lower bounds, there is -1 in the + * 'indexes' array to signify that no partition exists for the values less + * than such a bound and greater than or equal to the previous upper + * bound. + */ + boundinfo->ndatums = ndatums; + boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); + boundinfo->kind = (PartitionRangeDatumKind **) + palloc(ndatums * + sizeof(PartitionRangeDatumKind *)); + + /* + * For range partitioning, an additional value of -1 is stored as the last + * element of the indexes[] array. + */ + boundinfo->nindexes = ndatums + 1; + boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int)); + + for (i = 0; i < ndatums; i++) + { + int j; + + boundinfo->datums[i] = (Datum *) palloc(key->partnatts * + sizeof(Datum)); + boundinfo->kind[i] = (PartitionRangeDatumKind *) + palloc(key->partnatts * + sizeof(PartitionRangeDatumKind)); + for (j = 0; j < key->partnatts; j++) + { + if (rbounds[i]->kind[j] == PARTITION_RANGE_DATUM_VALUE) + boundinfo->datums[i][j] = + datumCopy(rbounds[i]->datums[j], + key->parttypbyval[j], + key->parttyplen[j]); + boundinfo->kind[i][j] = rbounds[i]->kind[j]; + } + + /* + * There is no mapping for invalid indexes. + * + * Any lower bounds in the rbounds array have invalid indexes + * assigned, because the values between the previous bound (if there + * is one) and this (lower) bound are not part of the range of any + * existing partition. + */ + if (rbounds[i]->lower) + boundinfo->indexes[i] = -1; + else + { + int orig_index = rbounds[i]->index; + + /* If the old index has no mapping, assign one */ + if ((*mapping)[orig_index] == -1) + (*mapping)[orig_index] = next_index++; + + boundinfo->indexes[i] = (*mapping)[orig_index]; + } + } + + /* Set the canonical value for default_index, if any. */ + if (default_index != -1) + { + Assert(default_index >= 0 && (*mapping)[default_index] == -1); + (*mapping)[default_index] = next_index++; + boundinfo->default_index = (*mapping)[default_index]; + } + + /* The extra -1 element. */ + Assert(i == ndatums); + boundinfo->indexes[i] = -1; + + /* All partitions must now have been assigned canonical indexes. */ + Assert(next_index == nparts); + return boundinfo; +} + +/* + * Are two partition bound collections logically equal? + * + * Used in the keep logic of relcache.c (ie, in RelationClearRelation()). + * This is also useful when b1 and b2 are bound collections of two separate + * relations, respectively, because PartitionBoundInfo is a canonical + * representation of partition bounds. + */ +bool +partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, + PartitionBoundInfo b1, PartitionBoundInfo b2) +{ + int i; + + if (b1->strategy != b2->strategy) + return false; + + if (b1->ndatums != b2->ndatums) + return false; + + if (b1->nindexes != b2->nindexes) + return false; + + if (b1->null_index != b2->null_index) + return false; + + if (b1->default_index != b2->default_index) + return false; + + /* For all partition strategies, the indexes[] arrays have to match */ + for (i = 0; i < b1->nindexes; i++) + { + if (b1->indexes[i] != b2->indexes[i]) + return false; + } + + /* Finally, compare the datums[] arrays */ + if (b1->strategy == PARTITION_STRATEGY_HASH) + { + /* + * We arrange the partitions in the ascending order of their moduli + * and remainders. Also every modulus is factor of next larger + * modulus. Therefore we can safely store index of a given partition + * in indexes array at remainder of that partition. Also entries at + * (remainder + N * modulus) positions in indexes array are all same + * for (modulus, remainder) specification for any partition. Thus the + * datums arrays from the given bounds are the same, if and only if + * their indexes arrays are the same. So, it suffices to compare the + * indexes arrays. + * + * Nonetheless make sure that the bounds are indeed the same when the + * indexes match. Hash partition bound stores modulus and remainder + * at b1->datums[i][0] and b1->datums[i][1] position respectively. + */ +#ifdef USE_ASSERT_CHECKING + for (i = 0; i < b1->ndatums; i++) + Assert((b1->datums[i][0] == b2->datums[i][0] && + b1->datums[i][1] == b2->datums[i][1])); +#endif + } + else + { + for (i = 0; i < b1->ndatums; i++) + { + int j; + + for (j = 0; j < partnatts; j++) + { + /* For range partitions, the bounds might not be finite. */ + if (b1->kind != NULL) + { + /* The different kinds of bound all differ from each other */ + if (b1->kind[i][j] != b2->kind[i][j]) + return false; + + /* + * Non-finite bounds are equal without further + * examination. + */ + if (b1->kind[i][j] != PARTITION_RANGE_DATUM_VALUE) + continue; + } + + /* + * Compare the actual values. Note that it would be both + * incorrect and unsafe to invoke the comparison operator + * derived from the partitioning specification here. It would + * be incorrect because we want the relcache entry to be + * updated for ANY change to the partition bounds, not just + * those that the partitioning operator thinks are + * significant. It would be unsafe because we might reach + * this code in the context of an aborted transaction, and an + * arbitrary partitioning operator might not be safe in that + * context. datumIsEqual() should be simple enough to be + * safe. + */ + if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j], + parttypbyval[j], parttyplen[j])) + return false; + } + } + } + return true; +} + +/* + * Return a copy of given PartitionBoundInfo structure. The data types of bounds + * are described by given partition key specification. + * + * Note: it's important that this function and its callees not do any catalog + * access, nor anything else that would result in allocating memory other than + * the returned data structure. Since this is called in a long-lived context, + * that would result in unwanted memory leaks. + */ +PartitionBoundInfo +partition_bounds_copy(PartitionBoundInfo src, + PartitionKey key) +{ + PartitionBoundInfo dest; + int i; + int ndatums; + int nindexes; + int partnatts; + bool hash_part; + int natts; + + dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); + + dest->strategy = src->strategy; + ndatums = dest->ndatums = src->ndatums; + nindexes = dest->nindexes = src->nindexes; + partnatts = key->partnatts; + + /* List partitioned tables have only a single partition key. */ + Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1); + + dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); + + if (src->kind != NULL) + { + dest->kind = (PartitionRangeDatumKind **) palloc(ndatums * + sizeof(PartitionRangeDatumKind *)); + for (i = 0; i < ndatums; i++) + { + dest->kind[i] = (PartitionRangeDatumKind *) palloc(partnatts * + sizeof(PartitionRangeDatumKind)); + + memcpy(dest->kind[i], src->kind[i], + sizeof(PartitionRangeDatumKind) * key->partnatts); + } + } + else + dest->kind = NULL; + + /* + * For hash partitioning, datums array will have two elements - modulus + * and remainder. + */ + hash_part = (key->strategy == PARTITION_STRATEGY_HASH); + natts = hash_part ? 2 : partnatts; + + for (i = 0; i < ndatums; i++) + { + int j; + + dest->datums[i] = (Datum *) palloc(sizeof(Datum) * natts); + + for (j = 0; j < natts; j++) + { + bool byval; + int typlen; + + if (hash_part) + { + typlen = sizeof(int32); /* Always int4 */ + byval = true; /* int4 is pass-by-value */ + } + else + { + byval = key->parttypbyval[j]; + typlen = key->parttyplen[j]; + } + + if (dest->kind == NULL || + dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) + dest->datums[i][j] = datumCopy(src->datums[i][j], + byval, typlen); + } + } + + dest->indexes = (int *) palloc(sizeof(int) * nindexes); + memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes); + + dest->null_index = src->null_index; + dest->default_index = src->default_index; + + return dest; +} + +/* + * partition_bounds_merge + * Check to see whether every partition of 'outer_rel' matches/overlaps + * one partition of 'inner_rel' at most, and vice versa; and if so, build + * and return the partition bounds for a join relation between the rels, + * generating two lists of the matching/overlapping partitions, which are + * returned to *outer_parts and *inner_parts respectively. + * + * The lists contain the same number of partitions, and the partitions at the + * same positions in the lists indicate join pairs used for partitioned join. + * If a partition on one side matches/overlaps multiple partitions on the other + * side, this function returns NULL, setting *outer_parts and *inner_parts to + * NIL. + */ +PartitionBoundInfo +partition_bounds_merge(int partnatts, + FmgrInfo *partsupfunc, Oid *partcollation, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + JoinType jointype, + List **outer_parts, List **inner_parts) +{ + /* + * Currently, this function is called only from try_partitionwise_join(), + * so the join type should be INNER, LEFT, FULL, SEMI, or ANTI. + */ + Assert(jointype == JOIN_INNER || jointype == JOIN_LEFT || + jointype == JOIN_FULL || jointype == JOIN_SEMI || + jointype == JOIN_ANTI); + + /* The partitioning strategies should be the same. */ + Assert(outer_rel->boundinfo->strategy == inner_rel->boundinfo->strategy); + + *outer_parts = *inner_parts = NIL; + switch (outer_rel->boundinfo->strategy) + { + case PARTITION_STRATEGY_HASH: + + /* + * For hash partitioned tables, we currently support partitioned + * join only when they have exactly the same partition bounds. + * + * XXX: it might be possible to relax the restriction to support + * cases where hash partitioned tables have missing partitions + * and/or different moduli, but it's not clear if it would be + * useful to support the former case since it's unusual to have + * missing partitions. On the other hand, it would be useful to + * support the latter case, but in that case, there is a high + * probability that a partition on one side will match multiple + * partitions on the other side, which is the scenario the current + * implementation of partitioned join can't handle. + */ + return NULL; + + case PARTITION_STRATEGY_LIST: + return merge_list_bounds(partsupfunc, + partcollation, + outer_rel, + inner_rel, + jointype, + outer_parts, + inner_parts); + + case PARTITION_STRATEGY_RANGE: + return merge_range_bounds(partnatts, + partsupfunc, + partcollation, + outer_rel, + inner_rel, + jointype, + outer_parts, + inner_parts); + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) outer_rel->boundinfo->strategy); + return NULL; /* keep compiler quiet */ + } +} + +/* + * merge_list_bounds + * Create the partition bounds for a join relation between list + * partitioned tables, if possible + * + * In this function we try to find sets of matching partitions from both sides + * by comparing list values stored in their partition bounds. Since the list + * values appear in the ascending order, an algorithm similar to merge join is + * used for that. If a partition on one side doesn't have a matching + * partition on the other side, the algorithm tries to match it with the + * default partition on the other side if any; if not, the algorithm tries to + * match it with a dummy partition on the other side if it's on the + * non-nullable side of an outer join. Also, if both sides have the default + * partitions, the algorithm tries to match them with each other. We give up + * if the algorithm finds a partition matching multiple partitions on the + * other side, which is the scenario the current implementation of partitioned + * join can't handle. + */ +static PartitionBoundInfo +merge_list_bounds(FmgrInfo *partsupfunc, Oid *partcollation, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + JoinType jointype, + List **outer_parts, List **inner_parts) +{ + PartitionBoundInfo merged_bounds = NULL; + PartitionBoundInfo outer_bi = outer_rel->boundinfo; + PartitionBoundInfo inner_bi = inner_rel->boundinfo; + bool outer_has_default = partition_bound_has_default(outer_bi); + bool inner_has_default = partition_bound_has_default(inner_bi); + int outer_default = outer_bi->default_index; + int inner_default = inner_bi->default_index; + bool outer_has_null = partition_bound_accepts_nulls(outer_bi); + bool inner_has_null = partition_bound_accepts_nulls(inner_bi); + PartitionMap outer_map; + PartitionMap inner_map; + int outer_pos; + int inner_pos; + int next_index = 0; + int null_index = -1; + int default_index = -1; + List *merged_datums = NIL; + List *merged_indexes = NIL; + + Assert(*outer_parts == NIL); + Assert(*inner_parts == NIL); + Assert(outer_bi->strategy == inner_bi->strategy && + outer_bi->strategy == PARTITION_STRATEGY_LIST); + /* List partitioning doesn't require kinds. */ + Assert(!outer_bi->kind && !inner_bi->kind); + + init_partition_map(outer_rel, &outer_map); + init_partition_map(inner_rel, &inner_map); + + /* + * If the default partitions (if any) have been proven empty, deem them + * non-existent. + */ + if (outer_has_default && is_dummy_partition(outer_rel, outer_default)) + outer_has_default = false; + if (inner_has_default && is_dummy_partition(inner_rel, inner_default)) + inner_has_default = false; + + /* + * Merge partitions from both sides. In each iteration we compare a pair + * of list values, one from each side, and decide whether the + * corresponding partitions match or not. If the two values match + * exactly, move to the next pair of list values, otherwise move to the + * next list value on the side with a smaller list value. + */ + outer_pos = inner_pos = 0; + while (outer_pos < outer_bi->ndatums || inner_pos < inner_bi->ndatums) + { + int outer_index = -1; + int inner_index = -1; + Datum *outer_datums; + Datum *inner_datums; + int cmpval; + Datum *merged_datum = NULL; + int merged_index = -1; + + if (outer_pos < outer_bi->ndatums) + { + /* + * If the partition on the outer side has been proven empty, + * ignore it and move to the next datum on the outer side. + */ + outer_index = outer_bi->indexes[outer_pos]; + if (is_dummy_partition(outer_rel, outer_index)) + { + outer_pos++; + continue; + } + } + if (inner_pos < inner_bi->ndatums) + { + /* + * If the partition on the inner side has been proven empty, + * ignore it and move to the next datum on the inner side. + */ + inner_index = inner_bi->indexes[inner_pos]; + if (is_dummy_partition(inner_rel, inner_index)) + { + inner_pos++; + continue; + } + } + + /* Get the list values. */ + outer_datums = outer_pos < outer_bi->ndatums ? + outer_bi->datums[outer_pos] : NULL; + inner_datums = inner_pos < inner_bi->ndatums ? + inner_bi->datums[inner_pos] : NULL; + + /* + * We run this loop till both sides finish. This allows us to avoid + * duplicating code to handle the remaining values on the side which + * finishes later. For that we set the comparison parameter cmpval in + * such a way that it appears as if the side which finishes earlier + * has an extra value higher than any other value on the unfinished + * side. That way we advance the values on the unfinished side till + * all of its values are exhausted. + */ + if (outer_pos >= outer_bi->ndatums) + cmpval = 1; + else if (inner_pos >= inner_bi->ndatums) + cmpval = -1; + else + { + Assert(outer_datums != NULL && inner_datums != NULL); + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[0], + partcollation[0], + outer_datums[0], + inner_datums[0])); + } + + if (cmpval == 0) + { + /* Two list values match exactly. */ + Assert(outer_pos < outer_bi->ndatums); + Assert(inner_pos < inner_bi->ndatums); + Assert(outer_index >= 0); + Assert(inner_index >= 0); + + /* + * Try merging both partitions. If successful, add the list value + * and index of the merged partition below. + */ + merged_index = merge_matching_partitions(&outer_map, &inner_map, + outer_index, inner_index, + &next_index); + if (merged_index == -1) + goto cleanup; + + merged_datum = outer_datums; + + /* Move to the next pair of list values. */ + outer_pos++; + inner_pos++; + } + else if (cmpval < 0) + { + /* A list value missing from the inner side. */ + Assert(outer_pos < outer_bi->ndatums); + + /* + * If the inner side has the default partition, or this is an + * outer join, try to assign a merged partition to the outer + * partition (see process_outer_partition()). Otherwise, the + * outer partition will not contribute to the result. + */ + if (inner_has_default || IS_OUTER_JOIN(jointype)) + { + /* Get the outer partition. */ + outer_index = outer_bi->indexes[outer_pos]; + Assert(outer_index >= 0); + merged_index = process_outer_partition(&outer_map, + &inner_map, + outer_has_default, + inner_has_default, + outer_index, + inner_default, + jointype, + &next_index, + &default_index); + if (merged_index == -1) + goto cleanup; + merged_datum = outer_datums; + } + + /* Move to the next list value on the outer side. */ + outer_pos++; + } + else + { + /* A list value missing from the outer side. */ + Assert(cmpval > 0); + Assert(inner_pos < inner_bi->ndatums); + + /* + * If the outer side has the default partition, or this is a FULL + * join, try to assign a merged partition to the inner partition + * (see process_inner_partition()). Otherwise, the inner + * partition will not contribute to the result. + */ + if (outer_has_default || jointype == JOIN_FULL) + { + /* Get the inner partition. */ + inner_index = inner_bi->indexes[inner_pos]; + Assert(inner_index >= 0); + merged_index = process_inner_partition(&outer_map, + &inner_map, + outer_has_default, + inner_has_default, + inner_index, + outer_default, + jointype, + &next_index, + &default_index); + if (merged_index == -1) + goto cleanup; + merged_datum = inner_datums; + } + + /* Move to the next list value on the inner side. */ + inner_pos++; + } + + /* + * If we assigned a merged partition, add the list value and index of + * the merged partition if appropriate. + */ + if (merged_index >= 0 && merged_index != default_index) + { + merged_datums = lappend(merged_datums, merged_datum); + merged_indexes = lappend_int(merged_indexes, merged_index); + } + } + + /* + * If the NULL partitions (if any) have been proven empty, deem them + * non-existent. + */ + if (outer_has_null && + is_dummy_partition(outer_rel, outer_bi->null_index)) + outer_has_null = false; + if (inner_has_null && + is_dummy_partition(inner_rel, inner_bi->null_index)) + inner_has_null = false; + + /* Merge the NULL partitions if any. */ + if (outer_has_null || inner_has_null) + merge_null_partitions(&outer_map, &inner_map, + outer_has_null, inner_has_null, + outer_bi->null_index, inner_bi->null_index, + jointype, &next_index, &null_index); + else + Assert(null_index == -1); + + /* Merge the default partitions if any. */ + if (outer_has_default || inner_has_default) + merge_default_partitions(&outer_map, &inner_map, + outer_has_default, inner_has_default, + outer_default, inner_default, + jointype, &next_index, &default_index); + else + Assert(default_index == -1); + + /* If we have merged partitions, create the partition bounds. */ + if (next_index > 0) + { + /* Fix the merged_indexes list if necessary. */ + if (outer_map.did_remapping || inner_map.did_remapping) + { + Assert(jointype == JOIN_FULL); + fix_merged_indexes(&outer_map, &inner_map, + next_index, merged_indexes); + } + + /* Use maps to match partitions from inputs. */ + generate_matching_part_pairs(outer_rel, inner_rel, + &outer_map, &inner_map, + next_index, + outer_parts, inner_parts); + Assert(*outer_parts != NIL); + Assert(*inner_parts != NIL); + Assert(list_length(*outer_parts) == list_length(*inner_parts)); + Assert(list_length(*outer_parts) <= next_index); + + /* Make a PartitionBoundInfo struct to return. */ + merged_bounds = build_merged_partition_bounds(outer_bi->strategy, + merged_datums, + NIL, + merged_indexes, + null_index, + default_index); + Assert(merged_bounds); + } + +cleanup: + /* Free local memory before returning. */ + list_free(merged_datums); + list_free(merged_indexes); + free_partition_map(&outer_map); + free_partition_map(&inner_map); + + return merged_bounds; +} + +/* + * merge_range_bounds + * Create the partition bounds for a join relation between range + * partitioned tables, if possible + * + * In this function we try to find sets of overlapping partitions from both + * sides by comparing ranges stored in their partition bounds. Since the + * ranges appear in the ascending order, an algorithm similar to merge join is + * used for that. If a partition on one side doesn't have an overlapping + * partition on the other side, the algorithm tries to match it with the + * default partition on the other side if any; if not, the algorithm tries to + * match it with a dummy partition on the other side if it's on the + * non-nullable side of an outer join. Also, if both sides have the default + * partitions, the algorithm tries to match them with each other. We give up + * if the algorithm finds a partition overlapping multiple partitions on the + * other side, which is the scenario the current implementation of partitioned + * join can't handle. + */ +static PartitionBoundInfo +merge_range_bounds(int partnatts, FmgrInfo *partsupfuncs, + Oid *partcollations, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + JoinType jointype, + List **outer_parts, List **inner_parts) +{ + PartitionBoundInfo merged_bounds = NULL; + PartitionBoundInfo outer_bi = outer_rel->boundinfo; + PartitionBoundInfo inner_bi = inner_rel->boundinfo; + bool outer_has_default = partition_bound_has_default(outer_bi); + bool inner_has_default = partition_bound_has_default(inner_bi); + int outer_default = outer_bi->default_index; + int inner_default = inner_bi->default_index; + PartitionMap outer_map; + PartitionMap inner_map; + int outer_index; + int inner_index; + int outer_lb_pos; + int inner_lb_pos; + PartitionRangeBound outer_lb; + PartitionRangeBound outer_ub; + PartitionRangeBound inner_lb; + PartitionRangeBound inner_ub; + int next_index = 0; + int default_index = -1; + List *merged_datums = NIL; + List *merged_kinds = NIL; + List *merged_indexes = NIL; + + Assert(*outer_parts == NIL); + Assert(*inner_parts == NIL); + Assert(outer_bi->strategy == inner_bi->strategy && + outer_bi->strategy == PARTITION_STRATEGY_RANGE); + + init_partition_map(outer_rel, &outer_map); + init_partition_map(inner_rel, &inner_map); + + /* + * If the default partitions (if any) have been proven empty, deem them + * non-existent. + */ + if (outer_has_default && is_dummy_partition(outer_rel, outer_default)) + outer_has_default = false; + if (inner_has_default && is_dummy_partition(inner_rel, inner_default)) + inner_has_default = false; + + /* + * Merge partitions from both sides. In each iteration we compare a pair + * of ranges, one from each side, and decide whether the corresponding + * partitions match or not. If the two ranges overlap, move to the next + * pair of ranges, otherwise move to the next range on the side with a + * lower range. outer_lb_pos/inner_lb_pos keep track of the positions of + * lower bounds in the datums arrays in the outer/inner + * PartitionBoundInfos respectively. + */ + outer_lb_pos = inner_lb_pos = 0; + outer_index = get_range_partition(outer_rel, outer_bi, &outer_lb_pos, + &outer_lb, &outer_ub); + inner_index = get_range_partition(inner_rel, inner_bi, &inner_lb_pos, + &inner_lb, &inner_ub); + while (outer_index >= 0 || inner_index >= 0) + { + bool overlap; + int ub_cmpval; + int lb_cmpval; + PartitionRangeBound merged_lb = {-1, NULL, NULL, true}; + PartitionRangeBound merged_ub = {-1, NULL, NULL, false}; + int merged_index = -1; + + /* + * We run this loop till both sides finish. This allows us to avoid + * duplicating code to handle the remaining ranges on the side which + * finishes later. For that we set the comparison parameter cmpval in + * such a way that it appears as if the side which finishes earlier + * has an extra range higher than any other range on the unfinished + * side. That way we advance the ranges on the unfinished side till + * all of its ranges are exhausted. + */ + if (outer_index == -1) + { + overlap = false; + lb_cmpval = 1; + ub_cmpval = 1; + } + else if (inner_index == -1) + { + overlap = false; + lb_cmpval = -1; + ub_cmpval = -1; + } + else + overlap = compare_range_partitions(partnatts, partsupfuncs, + partcollations, + &outer_lb, &outer_ub, + &inner_lb, &inner_ub, + &lb_cmpval, &ub_cmpval); + + if (overlap) + { + /* Two ranges overlap; form a join pair. */ + + PartitionRangeBound save_outer_ub; + PartitionRangeBound save_inner_ub; + + /* Both partitions should not have been merged yet. */ + Assert(outer_index >= 0); + Assert(outer_map.merged_indexes[outer_index] == -1 && + outer_map.merged[outer_index] == false); + Assert(inner_index >= 0); + Assert(inner_map.merged_indexes[inner_index] == -1 && + inner_map.merged[inner_index] == false); + + /* + * Get the index of the merged partition. Both partitions aren't + * merged yet, so the partitions should be merged successfully. + */ + merged_index = merge_matching_partitions(&outer_map, &inner_map, + outer_index, inner_index, + &next_index); + Assert(merged_index >= 0); + + /* Get the range bounds of the merged partition. */ + get_merged_range_bounds(partnatts, partsupfuncs, + partcollations, jointype, + &outer_lb, &outer_ub, + &inner_lb, &inner_ub, + lb_cmpval, ub_cmpval, + &merged_lb, &merged_ub); + + /* Save the upper bounds of both partitions for use below. */ + save_outer_ub = outer_ub; + save_inner_ub = inner_ub; + + /* Move to the next pair of ranges. */ + outer_index = get_range_partition(outer_rel, outer_bi, &outer_lb_pos, + &outer_lb, &outer_ub); + inner_index = get_range_partition(inner_rel, inner_bi, &inner_lb_pos, + &inner_lb, &inner_ub); + + /* + * If the range of a partition on one side overlaps the range of + * the next partition on the other side, that will cause the + * partition on one side to match at least two partitions on the + * other side, which is the case that we currently don't support + * partitioned join for; give up. + */ + if (ub_cmpval > 0 && inner_index >= 0 && + compare_range_bounds(partnatts, partsupfuncs, partcollations, + &save_outer_ub, &inner_lb) > 0) + goto cleanup; + if (ub_cmpval < 0 && outer_index >= 0 && + compare_range_bounds(partnatts, partsupfuncs, partcollations, + &outer_lb, &save_inner_ub) < 0) + goto cleanup; + + /* + * A row from a non-overlapping portion (if any) of a partition on + * one side might find its join partner in the default partition + * (if any) on the other side, causing the same situation as + * above; give up in that case. + */ + if ((outer_has_default && (lb_cmpval > 0 || ub_cmpval < 0)) || + (inner_has_default && (lb_cmpval < 0 || ub_cmpval > 0))) + goto cleanup; + } + else if (ub_cmpval < 0) + { + /* A non-overlapping outer range. */ + + /* The outer partition should not have been merged yet. */ + Assert(outer_index >= 0); + Assert(outer_map.merged_indexes[outer_index] == -1 && + outer_map.merged[outer_index] == false); + + /* + * If the inner side has the default partition, or this is an + * outer join, try to assign a merged partition to the outer + * partition (see process_outer_partition()). Otherwise, the + * outer partition will not contribute to the result. + */ + if (inner_has_default || IS_OUTER_JOIN(jointype)) + { + merged_index = process_outer_partition(&outer_map, + &inner_map, + outer_has_default, + inner_has_default, + outer_index, + inner_default, + jointype, + &next_index, + &default_index); + if (merged_index == -1) + goto cleanup; + merged_lb = outer_lb; + merged_ub = outer_ub; + } + + /* Move to the next range on the outer side. */ + outer_index = get_range_partition(outer_rel, outer_bi, &outer_lb_pos, + &outer_lb, &outer_ub); + } + else + { + /* A non-overlapping inner range. */ + Assert(ub_cmpval > 0); + + /* The inner partition should not have been merged yet. */ + Assert(inner_index >= 0); + Assert(inner_map.merged_indexes[inner_index] == -1 && + inner_map.merged[inner_index] == false); + + /* + * If the outer side has the default partition, or this is a FULL + * join, try to assign a merged partition to the inner partition + * (see process_inner_partition()). Otherwise, the inner + * partition will not contribute to the result. + */ + if (outer_has_default || jointype == JOIN_FULL) + { + merged_index = process_inner_partition(&outer_map, + &inner_map, + outer_has_default, + inner_has_default, + inner_index, + outer_default, + jointype, + &next_index, + &default_index); + if (merged_index == -1) + goto cleanup; + merged_lb = inner_lb; + merged_ub = inner_ub; + } + + /* Move to the next range on the inner side. */ + inner_index = get_range_partition(inner_rel, inner_bi, &inner_lb_pos, + &inner_lb, &inner_ub); + } + + /* + * If we assigned a merged partition, add the range bounds and index + * of the merged partition if appropriate. + */ + if (merged_index >= 0 && merged_index != default_index) + add_merged_range_bounds(partnatts, partsupfuncs, partcollations, + &merged_lb, &merged_ub, merged_index, + &merged_datums, &merged_kinds, + &merged_indexes); + } + + /* Merge the default partitions if any. */ + if (outer_has_default || inner_has_default) + merge_default_partitions(&outer_map, &inner_map, + outer_has_default, inner_has_default, + outer_default, inner_default, + jointype, &next_index, &default_index); + else + Assert(default_index == -1); + + /* If we have merged partitions, create the partition bounds. */ + if (next_index > 0) + { + /* + * Unlike the case of list partitioning, we wouldn't have re-merged + * partitions, so did_remapping should be left alone. + */ + Assert(!outer_map.did_remapping); + Assert(!inner_map.did_remapping); + + /* Use maps to match partitions from inputs. */ + generate_matching_part_pairs(outer_rel, inner_rel, + &outer_map, &inner_map, + next_index, + outer_parts, inner_parts); + Assert(*outer_parts != NIL); + Assert(*inner_parts != NIL); + Assert(list_length(*outer_parts) == list_length(*inner_parts)); + Assert(list_length(*outer_parts) == next_index); + + /* Make a PartitionBoundInfo struct to return. */ + merged_bounds = build_merged_partition_bounds(outer_bi->strategy, + merged_datums, + merged_kinds, + merged_indexes, + -1, + default_index); + Assert(merged_bounds); + } + +cleanup: + /* Free local memory before returning. */ + list_free(merged_datums); + list_free(merged_kinds); + list_free(merged_indexes); + free_partition_map(&outer_map); + free_partition_map(&inner_map); + + return merged_bounds; +} + +/* + * init_partition_map + * Initialize a PartitionMap struct for given relation + */ +static void +init_partition_map(RelOptInfo *rel, PartitionMap *map) +{ + int nparts = rel->nparts; + int i; + + map->nparts = nparts; + map->merged_indexes = (int *) palloc(sizeof(int) * nparts); + map->merged = (bool *) palloc(sizeof(bool) * nparts); + map->did_remapping = false; + map->old_indexes = (int *) palloc(sizeof(int) * nparts); + for (i = 0; i < nparts; i++) + { + map->merged_indexes[i] = map->old_indexes[i] = -1; + map->merged[i] = false; + } +} + +/* + * free_partition_map + */ +static void +free_partition_map(PartitionMap *map) +{ + pfree(map->merged_indexes); + pfree(map->merged); + pfree(map->old_indexes); +} + +/* + * is_dummy_partition --- has partition been proven empty? + */ +static bool +is_dummy_partition(RelOptInfo *rel, int part_index) +{ + RelOptInfo *part_rel; + + Assert(part_index >= 0); + part_rel = rel->part_rels[part_index]; + if (part_rel == NULL || IS_DUMMY_REL(part_rel)) + return true; + return false; +} + +/* + * merge_matching_partitions + * Try to merge given outer/inner partitions, and return the index of a + * merged partition produced from them if successful, -1 otherwise + * + * If the merged partition is newly created, *next_index is incremented. + */ +static int +merge_matching_partitions(PartitionMap *outer_map, PartitionMap *inner_map, + int outer_index, int inner_index, int *next_index) +{ + int outer_merged_index; + int inner_merged_index; + bool outer_merged; + bool inner_merged; + + Assert(outer_index >= 0 && outer_index < outer_map->nparts); + outer_merged_index = outer_map->merged_indexes[outer_index]; + outer_merged = outer_map->merged[outer_index]; + Assert(inner_index >= 0 && inner_index < inner_map->nparts); + inner_merged_index = inner_map->merged_indexes[inner_index]; + inner_merged = inner_map->merged[inner_index]; + + /* + * Handle cases where we have already assigned a merged partition to each + * of the given partitions. + */ + if (outer_merged_index >= 0 && inner_merged_index >= 0) + { + /* + * If the mereged partitions are the same, no need to do anything; + * return the index of the merged partitions. Otherwise, if each of + * the given partitions has been merged with a dummy partition on the + * other side, re-map them to either of the two merged partitions. + * Otherwise, they can't be merged, so return -1. + */ + if (outer_merged_index == inner_merged_index) + { + Assert(outer_merged); + Assert(inner_merged); + return outer_merged_index; + } + if (!outer_merged && !inner_merged) + { + /* + * This can only happen for a list-partitioning case. We re-map + * them to the merged partition with the smaller of the two merged + * indexes to preserve the property that the canonical order of + * list partitions is determined by the indexes assigned to the + * smallest list value of each partition. + */ + if (outer_merged_index < inner_merged_index) + { + outer_map->merged[outer_index] = true; + inner_map->merged_indexes[inner_index] = outer_merged_index; + inner_map->merged[inner_index] = true; + inner_map->did_remapping = true; + inner_map->old_indexes[inner_index] = inner_merged_index; + return outer_merged_index; + } + else + { + inner_map->merged[inner_index] = true; + outer_map->merged_indexes[outer_index] = inner_merged_index; + outer_map->merged[outer_index] = true; + outer_map->did_remapping = true; + outer_map->old_indexes[outer_index] = outer_merged_index; + return inner_merged_index; + } + } + return -1; + } + + /* At least one of the given partitions should not have yet been merged. */ + Assert(outer_merged_index == -1 || inner_merged_index == -1); + + /* + * If neither of them has been merged, merge them. Otherwise, if one has + * been merged with a dummy partition on the other side (and the other + * hasn't yet been merged with anything), re-merge them. Otherwise, they + * can't be merged, so return -1. + */ + if (outer_merged_index == -1 && inner_merged_index == -1) + { + int merged_index = *next_index; + + Assert(!outer_merged); + Assert(!inner_merged); + outer_map->merged_indexes[outer_index] = merged_index; + outer_map->merged[outer_index] = true; + inner_map->merged_indexes[inner_index] = merged_index; + inner_map->merged[inner_index] = true; + *next_index = *next_index + 1; + return merged_index; + } + if (outer_merged_index >= 0 && !outer_map->merged[outer_index]) + { + Assert(inner_merged_index == -1); + Assert(!inner_merged); + inner_map->merged_indexes[inner_index] = outer_merged_index; + inner_map->merged[inner_index] = true; + outer_map->merged[outer_index] = true; + return outer_merged_index; + } + if (inner_merged_index >= 0 && !inner_map->merged[inner_index]) + { + Assert(outer_merged_index == -1); + Assert(!outer_merged); + outer_map->merged_indexes[outer_index] = inner_merged_index; + outer_map->merged[outer_index] = true; + inner_map->merged[inner_index] = true; + return inner_merged_index; + } + return -1; +} + +/* + * process_outer_partition + * Try to assign given outer partition a merged partition, and return the + * index of the merged partition if successful, -1 otherwise + * + * If the partition is newly created, *next_index is incremented. Also, if it + * is the default partition of the join relation, *default_index is set to the + * index if not already done. + */ +static int +process_outer_partition(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_default, + bool inner_has_default, + int outer_index, + int inner_default, + JoinType jointype, + int *next_index, + int *default_index) +{ + int merged_index = -1; + + Assert(outer_index >= 0); + + /* + * If the inner side has the default partition, a row from the outer + * partition might find its join partner in the default partition; try + * merging the outer partition with the default partition. Otherwise, + * this should be an outer join, in which case the outer partition has to + * be scanned all the way anyway; merge the outer partition with a dummy + * partition on the other side. + */ + if (inner_has_default) + { + Assert(inner_default >= 0); + + /* + * If the outer side has the default partition as well, the default + * partition on the inner side will have two matching partitions on + * the other side: the outer partition and the default partition on + * the outer side. Partitionwise join doesn't handle this scenario + * yet. + */ + if (outer_has_default) + return -1; + + merged_index = merge_matching_partitions(outer_map, inner_map, + outer_index, inner_default, + next_index); + if (merged_index == -1) + return -1; + + /* + * If this is a FULL join, the default partition on the inner side has + * to be scanned all the way anyway, so the resulting partition will + * contain all key values from the default partition, which any other + * partition of the join relation will not contain. Thus the + * resulting partition will act as the default partition of the join + * relation; record the index in *default_index if not already done. + */ + if (jointype == JOIN_FULL) + { + if (*default_index == -1) + *default_index = merged_index; + else + Assert(*default_index == merged_index); + } + } + else + { + Assert(IS_OUTER_JOIN(jointype)); + Assert(jointype != JOIN_RIGHT); + + /* If we have already assigned a partition, no need to do anything. */ + merged_index = outer_map->merged_indexes[outer_index]; + if (merged_index == -1) + merged_index = merge_partition_with_dummy(outer_map, outer_index, + next_index); + } + return merged_index; +} + +/* + * process_inner_partition + * Try to assign given inner partition a merged partition, and return the + * index of the merged partition if successful, -1 otherwise + * + * If the partition is newly created, *next_index is incremented. Also, if it + * is the default partition of the join relation, *default_index is set to the + * index if not already done. + */ +static int +process_inner_partition(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_default, + bool inner_has_default, + int inner_index, + int outer_default, + JoinType jointype, + int *next_index, + int *default_index) +{ + int merged_index = -1; + + Assert(inner_index >= 0); + + /* + * If the outer side has the default partition, a row from the inner + * partition might find its join partner in the default partition; try + * merging the inner partition with the default partition. Otherwise, + * this should be a FULL join, in which case the inner partition has to be + * scanned all the way anyway; merge the inner partition with a dummy + * partition on the other side. + */ + if (outer_has_default) + { + Assert(outer_default >= 0); + + /* + * If the inner side has the default partition as well, the default + * partition on the outer side will have two matching partitions on + * the other side: the inner partition and the default partition on + * the inner side. Partitionwise join doesn't handle this scenario + * yet. + */ + if (inner_has_default) + return -1; + + merged_index = merge_matching_partitions(outer_map, inner_map, + outer_default, inner_index, + next_index); + if (merged_index == -1) + return -1; + + /* + * If this is an outer join, the default partition on the outer side + * has to be scanned all the way anyway, so the resulting partition + * will contain all key values from the default partition, which any + * other partition of the join relation will not contain. Thus the + * resulting partition will act as the default partition of the join + * relation; record the index in *default_index if not already done. + */ + if (IS_OUTER_JOIN(jointype)) + { + Assert(jointype != JOIN_RIGHT); + if (*default_index == -1) + *default_index = merged_index; + else + Assert(*default_index == merged_index); + } + } + else + { + Assert(jointype == JOIN_FULL); + + /* If we have already assigned a partition, no need to do anything. */ + merged_index = inner_map->merged_indexes[inner_index]; + if (merged_index == -1) + merged_index = merge_partition_with_dummy(inner_map, inner_index, + next_index); + } + return merged_index; +} + +/* + * merge_null_partitions + * Merge the NULL partitions from a join's outer and inner sides. + * + * If the merged partition produced from them is the NULL partition of the join + * relation, *null_index is set to the index of the merged partition. + * + * Note: We assume here that the join clause for a partitioned join is strict + * because have_partkey_equi_join() requires that the corresponding operator + * be mergejoinable, and we currently assume that mergejoinable operators are + * strict (see MJEvalOuterValues()/MJEvalInnerValues()). + */ +static void +merge_null_partitions(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_null, + bool inner_has_null, + int outer_null, + int inner_null, + JoinType jointype, + int *next_index, + int *null_index) +{ + bool consider_outer_null = false; + bool consider_inner_null = false; + + Assert(outer_has_null || inner_has_null); + Assert(*null_index == -1); + + /* + * Check whether the NULL partitions have already been merged and if so, + * set the consider_outer_null/consider_inner_null flags. + */ + if (outer_has_null) + { + Assert(outer_null >= 0 && outer_null < outer_map->nparts); + if (outer_map->merged_indexes[outer_null] == -1) + consider_outer_null = true; + } + if (inner_has_null) + { + Assert(inner_null >= 0 && inner_null < inner_map->nparts); + if (inner_map->merged_indexes[inner_null] == -1) + consider_inner_null = true; + } + + /* If both flags are set false, we don't need to do anything. */ + if (!consider_outer_null && !consider_inner_null) + return; + + if (consider_outer_null && !consider_inner_null) + { + Assert(outer_has_null); + + /* + * If this is an outer join, the NULL partition on the outer side has + * to be scanned all the way anyway; merge the NULL partition with a + * dummy partition on the other side. In that case + * consider_outer_null means that the NULL partition only contains + * NULL values as the key values, so the merged partition will do so; + * treat it as the NULL partition of the join relation. + */ + if (IS_OUTER_JOIN(jointype)) + { + Assert(jointype != JOIN_RIGHT); + *null_index = merge_partition_with_dummy(outer_map, outer_null, + next_index); + } + } + else if (!consider_outer_null && consider_inner_null) + { + Assert(inner_has_null); + + /* + * If this is a FULL join, the NULL partition on the inner side has to + * be scanned all the way anyway; merge the NULL partition with a + * dummy partition on the other side. In that case + * consider_inner_null means that the NULL partition only contains + * NULL values as the key values, so the merged partition will do so; + * treat it as the NULL partition of the join relation. + */ + if (jointype == JOIN_FULL) + *null_index = merge_partition_with_dummy(inner_map, inner_null, + next_index); + } + else + { + Assert(consider_outer_null && consider_inner_null); + Assert(outer_has_null); + Assert(inner_has_null); + + /* + * If this is an outer join, the NULL partition on the outer side (and + * that on the inner side if this is a FULL join) have to be scanned + * all the way anyway, so merge them. Note that each of the NULL + * partitions isn't merged yet, so they should be merged successfully. + * Like the above, each of the NULL partitions only contains NULL + * values as the key values, so the merged partition will do so; treat + * it as the NULL partition of the join relation. + * + * Note: if this an INNER/SEMI join, the join clause will never be + * satisfied by two NULL values (see comments above), so both the NULL + * partitions can be eliminated. + */ + if (IS_OUTER_JOIN(jointype)) + { + Assert(jointype != JOIN_RIGHT); + *null_index = merge_matching_partitions(outer_map, inner_map, + outer_null, inner_null, + next_index); + Assert(*null_index >= 0); + } + } +} + +/* + * merge_default_partitions + * Merge the default partitions from a join's outer and inner sides. + * + * If the merged partition produced from them is the default partition of the + * join relation, *default_index is set to the index of the merged partition. + */ +static void +merge_default_partitions(PartitionMap *outer_map, + PartitionMap *inner_map, + bool outer_has_default, + bool inner_has_default, + int outer_default, + int inner_default, + JoinType jointype, + int *next_index, + int *default_index) +{ + int outer_merged_index = -1; + int inner_merged_index = -1; + + Assert(outer_has_default || inner_has_default); + + /* Get the merged partition indexes for the default partitions. */ + if (outer_has_default) + { + Assert(outer_default >= 0 && outer_default < outer_map->nparts); + outer_merged_index = outer_map->merged_indexes[outer_default]; + } + if (inner_has_default) + { + Assert(inner_default >= 0 && inner_default < inner_map->nparts); + inner_merged_index = inner_map->merged_indexes[inner_default]; + } + + if (outer_has_default && !inner_has_default) + { + /* + * If this is an outer join, the default partition on the outer side + * has to be scanned all the way anyway; if we have not yet assigned a + * partition, merge the default partition with a dummy partition on + * the other side. The merged partition will act as the default + * partition of the join relation (see comments in + * process_inner_partition()). + */ + if (IS_OUTER_JOIN(jointype)) + { + Assert(jointype != JOIN_RIGHT); + if (outer_merged_index == -1) + { + Assert(*default_index == -1); + *default_index = merge_partition_with_dummy(outer_map, + outer_default, + next_index); + } + else + Assert(*default_index == outer_merged_index); + } + else + Assert(*default_index == -1); + } + else if (!outer_has_default && inner_has_default) + { + /* + * If this is a FULL join, the default partition on the inner side has + * to be scanned all the way anyway; if we have not yet assigned a + * partition, merge the default partition with a dummy partition on + * the other side. The merged partition will act as the default + * partition of the join relation (see comments in + * process_outer_partition()). + */ + if (jointype == JOIN_FULL) + { + if (inner_merged_index == -1) + { + Assert(*default_index == -1); + *default_index = merge_partition_with_dummy(inner_map, + inner_default, + next_index); + } + else + Assert(*default_index == inner_merged_index); + } + else + Assert(*default_index == -1); + } + else + { + Assert(outer_has_default && inner_has_default); + + /* + * The default partitions have to be joined with each other, so merge + * them. Note that each of the default partitions isn't merged yet + * (see, process_outer_partition()/process_innerer_partition()), so + * they should be merged successfully. The merged partition will act + * as the default partition of the join relation. + */ + Assert(outer_merged_index == -1); + Assert(inner_merged_index == -1); + Assert(*default_index == -1); + *default_index = merge_matching_partitions(outer_map, + inner_map, + outer_default, + inner_default, + next_index); + Assert(*default_index >= 0); + } +} + +/* + * merge_partition_with_dummy + * Assign given partition a new partition of a join relation + * + * Note: The caller assumes that the given partition doesn't have a non-dummy + * matching partition on the other side, but if the given partition finds the + * matching partition later, we will adjust the assignment. + */ +static int +merge_partition_with_dummy(PartitionMap *map, int index, int *next_index) +{ + int merged_index = *next_index; + + Assert(index >= 0 && index < map->nparts); + Assert(map->merged_indexes[index] == -1); + Assert(!map->merged[index]); + map->merged_indexes[index] = merged_index; + /* Leave the merged flag alone! */ + *next_index = *next_index + 1; + return merged_index; +} + +/* + * fix_merged_indexes + * Adjust merged indexes of re-merged partitions + */ +static void +fix_merged_indexes(PartitionMap *outer_map, PartitionMap *inner_map, + int nmerged, List *merged_indexes) +{ + int *new_indexes; + int merged_index; + int i; + ListCell *lc; + + Assert(nmerged > 0); + + new_indexes = (int *) palloc(sizeof(int) * nmerged); + for (i = 0; i < nmerged; i++) + new_indexes[i] = -1; + + /* Build the mapping of old merged indexes to new merged indexes. */ + if (outer_map->did_remapping) + { + for (i = 0; i < outer_map->nparts; i++) + { + merged_index = outer_map->old_indexes[i]; + if (merged_index >= 0) + new_indexes[merged_index] = outer_map->merged_indexes[i]; + } + } + if (inner_map->did_remapping) + { + for (i = 0; i < inner_map->nparts; i++) + { + merged_index = inner_map->old_indexes[i]; + if (merged_index >= 0) + new_indexes[merged_index] = inner_map->merged_indexes[i]; + } + } + + /* Fix the merged_indexes list using the mapping. */ + foreach(lc, merged_indexes) + { + merged_index = lfirst_int(lc); + Assert(merged_index >= 0); + if (new_indexes[merged_index] >= 0) + lfirst_int(lc) = new_indexes[merged_index]; + } + + pfree(new_indexes); +} + +/* + * generate_matching_part_pairs + * Generate a pair of lists of partitions that produce merged partitions + * + * The lists of partitions are built in the order of merged partition indexes, + * and returned in *outer_parts and *inner_parts. + */ +static void +generate_matching_part_pairs(RelOptInfo *outer_rel, RelOptInfo *inner_rel, + PartitionMap *outer_map, PartitionMap *inner_map, + int nmerged, + List **outer_parts, List **inner_parts) +{ + int outer_nparts = outer_map->nparts; + int inner_nparts = inner_map->nparts; + int *outer_indexes; + int *inner_indexes; + int max_nparts; + int i; + + Assert(nmerged > 0); + Assert(*outer_parts == NIL); + Assert(*inner_parts == NIL); + + outer_indexes = (int *) palloc(sizeof(int) * nmerged); + inner_indexes = (int *) palloc(sizeof(int) * nmerged); + for (i = 0; i < nmerged; i++) + outer_indexes[i] = inner_indexes[i] = -1; + + /* Set pairs of matching partitions. */ + Assert(outer_nparts == outer_rel->nparts); + Assert(inner_nparts == inner_rel->nparts); + max_nparts = Max(outer_nparts, inner_nparts); + for (i = 0; i < max_nparts; i++) + { + if (i < outer_nparts) + { + int merged_index = outer_map->merged_indexes[i]; + + if (merged_index >= 0) + { + Assert(merged_index < nmerged); + outer_indexes[merged_index] = i; + } + } + if (i < inner_nparts) + { + int merged_index = inner_map->merged_indexes[i]; + + if (merged_index >= 0) + { + Assert(merged_index < nmerged); + inner_indexes[merged_index] = i; + } + } + } + + /* Build the list pairs. */ + for (i = 0; i < nmerged; i++) + { + int outer_index = outer_indexes[i]; + int inner_index = inner_indexes[i]; + + /* + * If both partitions are dummy, it means the merged partition that + * had been assigned to the outer/inner partition was removed when + * re-merging the outer/inner partition in + * merge_matching_partitions(); ignore the merged partition. + */ + if (outer_index == -1 && inner_index == -1) + continue; + + *outer_parts = lappend(*outer_parts, outer_index >= 0 ? + outer_rel->part_rels[outer_index] : NULL); + *inner_parts = lappend(*inner_parts, inner_index >= 0 ? + inner_rel->part_rels[inner_index] : NULL); + } + + pfree(outer_indexes); + pfree(inner_indexes); +} + +/* + * build_merged_partition_bounds + * Create a PartitionBoundInfo struct from merged partition bounds + */ +static PartitionBoundInfo +build_merged_partition_bounds(char strategy, List *merged_datums, + List *merged_kinds, List *merged_indexes, + int null_index, int default_index) +{ + PartitionBoundInfo merged_bounds; + int ndatums = list_length(merged_datums); + int pos; + ListCell *lc; + + merged_bounds = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); + merged_bounds->strategy = strategy; + merged_bounds->ndatums = ndatums; + + merged_bounds->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); + pos = 0; + foreach(lc, merged_datums) + merged_bounds->datums[pos++] = (Datum *) lfirst(lc); + + if (strategy == PARTITION_STRATEGY_RANGE) + { + Assert(list_length(merged_kinds) == ndatums); + merged_bounds->kind = (PartitionRangeDatumKind **) + palloc(sizeof(PartitionRangeDatumKind *) * ndatums); + pos = 0; + foreach(lc, merged_kinds) + merged_bounds->kind[pos++] = (PartitionRangeDatumKind *) lfirst(lc); + + /* There are ndatums+1 indexes in the case of range partitioning. */ + merged_indexes = lappend_int(merged_indexes, -1); + ndatums++; + } + else + { + Assert(strategy == PARTITION_STRATEGY_LIST); + Assert(merged_kinds == NIL); + merged_bounds->kind = NULL; + } + + Assert(list_length(merged_indexes) == ndatums); + merged_bounds->nindexes = ndatums; + merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums); + pos = 0; + foreach(lc, merged_indexes) + merged_bounds->indexes[pos++] = lfirst_int(lc); + + merged_bounds->null_index = null_index; + merged_bounds->default_index = default_index; + + return merged_bounds; +} + +/* + * get_range_partition + * Get the next non-dummy partition of a range-partitioned relation, + * returning the index of that partition + * + * *lb and *ub are set to the lower and upper bounds of that partition + * respectively, and *lb_pos is advanced to the next lower bound, if any. + */ +static int +get_range_partition(RelOptInfo *rel, + PartitionBoundInfo bi, + int *lb_pos, + PartitionRangeBound *lb, + PartitionRangeBound *ub) +{ + int part_index; + + Assert(bi->strategy == PARTITION_STRATEGY_RANGE); + + do + { + part_index = get_range_partition_internal(bi, lb_pos, lb, ub); + if (part_index == -1) + return -1; + } while (is_dummy_partition(rel, part_index)); + + return part_index; +} + +static int +get_range_partition_internal(PartitionBoundInfo bi, + int *lb_pos, + PartitionRangeBound *lb, + PartitionRangeBound *ub) +{ + /* Return the index as -1 if we've exhausted all lower bounds. */ + if (*lb_pos >= bi->ndatums) + return -1; + + /* A lower bound should have at least one more bound after it. */ + Assert(*lb_pos + 1 < bi->ndatums); + + /* Set the lower bound. */ + lb->index = bi->indexes[*lb_pos]; + lb->datums = bi->datums[*lb_pos]; + lb->kind = bi->kind[*lb_pos]; + lb->lower = true; + /* Set the upper bound. */ + ub->index = bi->indexes[*lb_pos + 1]; + ub->datums = bi->datums[*lb_pos + 1]; + ub->kind = bi->kind[*lb_pos + 1]; + ub->lower = false; + + /* The index assigned to an upper bound should be valid. */ + Assert(ub->index >= 0); + + /* + * Advance the position to the next lower bound. If there are no bounds + * left beyond the upper bound, we have reached the last lower bound. + */ + if (*lb_pos + 2 >= bi->ndatums) + *lb_pos = bi->ndatums; + else + { + /* + * If the index assigned to the bound next to the upper bound isn't + * valid, that is the next lower bound; else, the upper bound is also + * the lower bound of the next range partition. + */ + if (bi->indexes[*lb_pos + 2] < 0) + *lb_pos = *lb_pos + 2; + else + *lb_pos = *lb_pos + 1; + } + + return ub->index; +} + +/* + * compare_range_partitions + * Compare the bounds of two range partitions, and return true if the + * two partitions overlap, false otherwise + * + * *lb_cmpval is set to -1, 0, or 1 if the outer partition's lower bound is + * lower than, equal to, or higher than the inner partition's lower bound + * respectively. Likewise, *ub_cmpval is set to -1, 0, or 1 if the outer + * partition's upper bound is lower than, equal to, or higher than the inner + * partition's upper bound respectively. + */ +static bool +compare_range_partitions(int partnatts, FmgrInfo *partsupfuncs, + Oid *partcollations, + PartitionRangeBound *outer_lb, + PartitionRangeBound *outer_ub, + PartitionRangeBound *inner_lb, + PartitionRangeBound *inner_ub, + int *lb_cmpval, int *ub_cmpval) +{ + /* + * Check if the outer partition's upper bound is lower than the inner + * partition's lower bound; if so the partitions aren't overlapping. + */ + if (compare_range_bounds(partnatts, partsupfuncs, partcollations, + outer_ub, inner_lb) < 0) + { + *lb_cmpval = -1; + *ub_cmpval = -1; + return false; + } + + /* + * Check if the outer partition's lower bound is higher than the inner + * partition's upper bound; if so the partitions aren't overlapping. + */ + if (compare_range_bounds(partnatts, partsupfuncs, partcollations, + outer_lb, inner_ub) > 0) + { + *lb_cmpval = 1; + *ub_cmpval = 1; + return false; + } + + /* All other cases indicate overlapping partitions. */ + *lb_cmpval = compare_range_bounds(partnatts, partsupfuncs, partcollations, + outer_lb, inner_lb); + *ub_cmpval = compare_range_bounds(partnatts, partsupfuncs, partcollations, + outer_ub, inner_ub); + return true; +} + +/* + * get_merged_range_bounds + * Given the bounds of range partitions to be joined, determine the bounds + * of a merged partition produced from the range partitions + * + * *merged_lb and *merged_ub are set to the lower and upper bounds of the + * merged partition. + */ +static void +get_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs, + Oid *partcollations, JoinType jointype, + PartitionRangeBound *outer_lb, + PartitionRangeBound *outer_ub, + PartitionRangeBound *inner_lb, + PartitionRangeBound *inner_ub, + int lb_cmpval, int ub_cmpval, + PartitionRangeBound *merged_lb, + PartitionRangeBound *merged_ub) +{ + Assert(compare_range_bounds(partnatts, partsupfuncs, partcollations, + outer_lb, inner_lb) == lb_cmpval); + Assert(compare_range_bounds(partnatts, partsupfuncs, partcollations, + outer_ub, inner_ub) == ub_cmpval); + + switch (jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + + /* + * An INNER/SEMI join will have the rows that fit both sides, so + * the lower bound of the merged partition will be the higher of + * the two lower bounds, and the upper bound of the merged + * partition will be the lower of the two upper bounds. + */ + *merged_lb = (lb_cmpval > 0) ? *outer_lb : *inner_lb; + *merged_ub = (ub_cmpval < 0) ? *outer_ub : *inner_ub; + break; + + case JOIN_LEFT: + case JOIN_ANTI: + + /* + * A LEFT/ANTI join will have all the rows from the outer side, so + * the bounds of the merged partition will be the same as the + * outer bounds. + */ + *merged_lb = *outer_lb; + *merged_ub = *outer_ub; + break; + + case JOIN_FULL: + + /* + * A FULL join will have all the rows from both sides, so the + * lower bound of the merged partition will be the lower of the + * two lower bounds, and the upper bound of the merged partition + * will be the higher of the two upper bounds. + */ + *merged_lb = (lb_cmpval < 0) ? *outer_lb : *inner_lb; + *merged_ub = (ub_cmpval > 0) ? *outer_ub : *inner_ub; + break; + + default: + elog(ERROR, "unrecognized join type: %d", (int) jointype); + } +} + +/* + * add_merged_range_bounds + * Add the bounds of a merged partition to the lists of range bounds + */ +static void +add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs, + Oid *partcollations, + PartitionRangeBound *merged_lb, + PartitionRangeBound *merged_ub, + int merged_index, + List **merged_datums, + List **merged_kinds, + List **merged_indexes) +{ + int cmpval; + + if (!*merged_datums) + { + /* First merged partition */ + Assert(!*merged_kinds); + Assert(!*merged_indexes); + cmpval = 1; + } + else + { + PartitionRangeBound prev_ub; + + Assert(*merged_datums); + Assert(*merged_kinds); + Assert(*merged_indexes); + + /* Get the last upper bound. */ + prev_ub.index = llast_int(*merged_indexes); + prev_ub.datums = (Datum *) llast(*merged_datums); + prev_ub.kind = (PartitionRangeDatumKind *) llast(*merged_kinds); + prev_ub.lower = false; + + /* + * We pass to partition_rbound_cmp() lower1 as false to prevent it + * from considering the last upper bound to be smaller than the lower + * bound of the merged partition when the values of the two range + * bounds compare equal. + */ + cmpval = partition_rbound_cmp(partnatts, partsupfuncs, partcollations, + merged_lb->datums, merged_lb->kind, + false, &prev_ub); + Assert(cmpval >= 0); + } + + /* + * If the lower bound is higher than the last upper bound, add the lower + * bound with the index as -1 indicating that that is a lower bound; else, + * the last upper bound will be reused as the lower bound of the merged + * partition, so skip this. + */ + if (cmpval > 0) + { + *merged_datums = lappend(*merged_datums, merged_lb->datums); + *merged_kinds = lappend(*merged_kinds, merged_lb->kind); + *merged_indexes = lappend_int(*merged_indexes, -1); + } + + /* Add the upper bound and index of the merged partition. */ + *merged_datums = lappend(*merged_datums, merged_ub->datums); + *merged_kinds = lappend(*merged_kinds, merged_ub->kind); + *merged_indexes = lappend_int(*merged_indexes, merged_index); +} + +/* + * partitions_are_ordered + * Determine whether the partitions described by 'boundinfo' are ordered, + * that is partitions appearing earlier in the PartitionDesc sequence + * contain partition keys strictly less than those appearing later. + * Also, if NULL values are possible, they must come in the last + * partition defined in the PartitionDesc. + * + * If out of order, or there is insufficient info to know the order, + * then we return false. + */ +bool +partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts) +{ + Assert(boundinfo != NULL); + + switch (boundinfo->strategy) + { + case PARTITION_STRATEGY_RANGE: + + /* + * RANGE-type partitioning guarantees that the partitions can be + * scanned in the order that they're defined in the PartitionDesc + * to provide sequential, non-overlapping ranges of tuples. + * However, if a DEFAULT partition exists then it doesn't work, as + * that could contain tuples from either below or above the + * defined range, or tuples belonging to gaps between partitions. + */ + if (!partition_bound_has_default(boundinfo)) + return true; + break; + + case PARTITION_STRATEGY_LIST: + + /* + * LIST partitioning can also guarantee ordering, but only if the + * partitions don't accept interleaved values. We could likely + * check for this by looping over the PartitionBound's indexes + * array to check that the indexes are in order. For now, let's + * just keep it simple and just accept LIST partitioning when + * there's no DEFAULT partition, exactly one value per partition, + * and optionally a NULL partition that does not accept any other + * values. Such a NULL partition will come last in the + * PartitionDesc, and the other partitions will be properly + * ordered. This is a cheap test to make as it does not require + * any per-partition processing. Maybe we'd like to handle more + * complex cases in the future. + */ + if (partition_bound_has_default(boundinfo)) + return false; + + if (boundinfo->ndatums + partition_bound_accepts_nulls(boundinfo) + == nparts) + return true; + break; + + default: + /* HASH, or some other strategy */ + break; + } + + return false; +} + +/* + * check_new_partition_bound + * + * Checks if the new partition's bound overlaps any of the existing partitions + * of parent. Also performs additional checks as necessary per strategy. + */ +void +check_new_partition_bound(char *relname, Relation parent, + PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + PartitionDesc partdesc = RelationGetPartitionDesc(parent); + PartitionBoundInfo boundinfo = partdesc->boundinfo; + ParseState *pstate = make_parsestate(NULL); + int with = -1; + bool overlap = false; + + if (spec->is_default) + { + /* + * The default partition bound never conflicts with any other + * partition's; if that's what we're attaching, the only possible + * problem is that one already exists, so check for that and we're + * done. + */ + if (boundinfo == NULL || !partition_bound_has_default(boundinfo)) + return; + + /* Default partition already exists, error out. */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("partition \"%s\" conflicts with existing default partition \"%s\"", + relname, get_rel_name(partdesc->oids[boundinfo->default_index])), + parser_errposition(pstate, spec->location))); + } + + switch (key->strategy) + { + case PARTITION_STRATEGY_HASH: + { + Assert(spec->strategy == PARTITION_STRATEGY_HASH); + Assert(spec->remainder >= 0 && spec->remainder < spec->modulus); + + if (partdesc->nparts > 0) + { + Datum **datums = boundinfo->datums; + int ndatums = boundinfo->ndatums; + int greatest_modulus; + int remainder; + int offset; + bool valid_modulus = true; + int prev_modulus, /* Previous largest modulus */ + next_modulus; /* Next largest modulus */ + + /* + * Check rule that every modulus must be a factor of the + * next larger modulus. For example, if you have a bunch + * of partitions that all have modulus 5, you can add a + * new partition with modulus 10 or a new partition with + * modulus 15, but you cannot add both a partition with + * modulus 10 and a partition with modulus 15, because 10 + * is not a factor of 15. + * + * Get the greatest (modulus, remainder) pair contained in + * boundinfo->datums that is less than or equal to the + * (spec->modulus, spec->remainder) pair. + */ + offset = partition_hash_bsearch(boundinfo, + spec->modulus, + spec->remainder); + if (offset < 0) + { + next_modulus = DatumGetInt32(datums[0][0]); + valid_modulus = (next_modulus % spec->modulus) == 0; + } + else + { + prev_modulus = DatumGetInt32(datums[offset][0]); + valid_modulus = (spec->modulus % prev_modulus) == 0; + + if (valid_modulus && (offset + 1) < ndatums) + { + next_modulus = DatumGetInt32(datums[offset + 1][0]); + valid_modulus = (next_modulus % spec->modulus) == 0; + } + } + + if (!valid_modulus) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("every hash partition modulus must be a factor of the next larger modulus"))); + + greatest_modulus = boundinfo->nindexes; + remainder = spec->remainder; + + /* + * Normally, the lowest remainder that could conflict with + * the new partition is equal to the remainder specified + * for the new partition, but when the new partition has a + * modulus higher than any used so far, we need to adjust. + */ + if (remainder >= greatest_modulus) + remainder = remainder % greatest_modulus; + + /* Check every potentially-conflicting remainder. */ + do + { + if (boundinfo->indexes[remainder] != -1) + { + overlap = true; + with = boundinfo->indexes[remainder]; + break; + } + remainder += spec->modulus; + } while (remainder < greatest_modulus); + } + + break; + } + + case PARTITION_STRATEGY_LIST: + { + Assert(spec->strategy == PARTITION_STRATEGY_LIST); + + if (partdesc->nparts > 0) + { + ListCell *cell; + + Assert(boundinfo && + boundinfo->strategy == PARTITION_STRATEGY_LIST && + (boundinfo->ndatums > 0 || + partition_bound_accepts_nulls(boundinfo) || + partition_bound_has_default(boundinfo))); + + foreach(cell, spec->listdatums) + { + Const *val = castNode(Const, lfirst(cell)); + + if (!val->constisnull) + { + int offset; + bool equal; + + offset = partition_list_bsearch(&key->partsupfunc[0], + key->partcollation, + boundinfo, + val->constvalue, + &equal); + if (offset >= 0 && equal) + { + overlap = true; + with = boundinfo->indexes[offset]; + break; + } + } + else if (partition_bound_accepts_nulls(boundinfo)) + { + overlap = true; + with = boundinfo->null_index; + break; + } + } + } + + break; + } + + case PARTITION_STRATEGY_RANGE: + { + PartitionRangeBound *lower, + *upper; + + Assert(spec->strategy == PARTITION_STRATEGY_RANGE); + lower = make_one_partition_rbound(key, -1, spec->lowerdatums, true); + upper = make_one_partition_rbound(key, -1, spec->upperdatums, false); + + /* + * First check if the resulting range would be empty with + * specified lower and upper bounds + */ + if (partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, lower->datums, + lower->kind, true, upper) >= 0) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("empty range bound specified for partition \"%s\"", + relname), + errdetail("Specified lower bound %s is greater than or equal to upper bound %s.", + get_range_partbound_string(spec->lowerdatums), + get_range_partbound_string(spec->upperdatums)), + parser_errposition(pstate, spec->location))); + } + + if (partdesc->nparts > 0) + { + int offset; + bool equal; + + Assert(boundinfo && + boundinfo->strategy == PARTITION_STRATEGY_RANGE && + (boundinfo->ndatums > 0 || + partition_bound_has_default(boundinfo))); + + /* + * Test whether the new lower bound (which is treated + * inclusively as part of the new partition) lies inside + * an existing partition, or in a gap. + * + * If it's inside an existing partition, the bound at + * offset + 1 will be the upper bound of that partition, + * and its index will be >= 0. + * + * If it's in a gap, the bound at offset + 1 will be the + * lower bound of the next partition, and its index will + * be -1. This is also true if there is no next partition, + * since the index array is initialised with an extra -1 + * at the end. + */ + offset = partition_range_bsearch(key->partnatts, + key->partsupfunc, + key->partcollation, + boundinfo, lower, + &equal); + + if (boundinfo->indexes[offset + 1] < 0) + { + /* + * Check that the new partition will fit in the gap. + * For it to fit, the new upper bound must be less + * than or equal to the lower bound of the next + * partition, if there is one. + */ + if (offset + 1 < boundinfo->ndatums) + { + int32 cmpval; + Datum *datums; + PartitionRangeDatumKind *kind; + bool is_lower; + + datums = boundinfo->datums[offset + 1]; + kind = boundinfo->kind[offset + 1]; + is_lower = (boundinfo->indexes[offset + 1] == -1); + + cmpval = partition_rbound_cmp(key->partnatts, + key->partsupfunc, + key->partcollation, + datums, kind, + is_lower, upper); + if (cmpval < 0) + { + /* + * The new partition overlaps with the + * existing partition between offset + 1 and + * offset + 2. + */ + overlap = true; + with = boundinfo->indexes[offset + 2]; + } + } + } + else + { + /* + * The new partition overlaps with the existing + * partition between offset and offset + 1. + */ + overlap = true; + with = boundinfo->indexes[offset + 1]; + } + } + + break; + } + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + if (overlap) + { + Assert(with >= 0); + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("partition \"%s\" would overlap partition \"%s\"", + relname, get_rel_name(partdesc->oids[with])), + parser_errposition(pstate, spec->location))); + } +} + +/* + * check_default_partition_contents + * + * This function checks if there exists a row in the default partition that + * would properly belong to the new partition being added. If it finds one, + * it throws an error. + */ +void +check_default_partition_contents(Relation parent, Relation default_rel, + PartitionBoundSpec *new_spec) +{ + List *new_part_constraints; + List *def_part_constraints; + List *all_parts; + ListCell *lc; + + new_part_constraints = (new_spec->strategy == PARTITION_STRATEGY_LIST) + ? get_qual_for_list(parent, new_spec) + : get_qual_for_range(parent, new_spec, false); + def_part_constraints = + get_proposed_default_constraint(new_part_constraints); + + /* + * Map the Vars in the constraint expression from parent's attnos to + * default_rel's. + */ + def_part_constraints = + map_partition_varattnos(def_part_constraints, 1, default_rel, + parent); + + /* + * If the existing constraints on the default partition imply that it will + * not contain any row that would belong to the new partition, we can + * avoid scanning the default partition. + */ + if (PartConstraintImpliedByRelConstraint(default_rel, def_part_constraints)) + { + ereport(DEBUG1, + (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", + RelationGetRelationName(default_rel)))); + return; + } + + /* + * Scan the default partition and its subpartitions, and check for rows + * that do not satisfy the revised partition constraints. + */ + if (default_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + all_parts = find_all_inheritors(RelationGetRelid(default_rel), + AccessExclusiveLock, NULL); + else + all_parts = list_make1_oid(RelationGetRelid(default_rel)); + + foreach(lc, all_parts) + { + Oid part_relid = lfirst_oid(lc); + Relation part_rel; + Expr *partition_constraint; + EState *estate; + ExprState *partqualstate = NULL; + Snapshot snapshot; + ExprContext *econtext; + TableScanDesc scan; + MemoryContext oldCxt; + TupleTableSlot *tupslot; + + /* Lock already taken above. */ + if (part_relid != RelationGetRelid(default_rel)) + { + part_rel = table_open(part_relid, NoLock); + + /* + * Map the Vars in the constraint expression from default_rel's + * the sub-partition's. + */ + partition_constraint = make_ands_explicit(def_part_constraints); + partition_constraint = (Expr *) + map_partition_varattnos((List *) partition_constraint, 1, + part_rel, default_rel); + + /* + * If the partition constraints on default partition child imply + * that it will not contain any row that would belong to the new + * partition, we can avoid scanning the child table. + */ + if (PartConstraintImpliedByRelConstraint(part_rel, + def_part_constraints)) + { + ereport(DEBUG1, + (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", + RelationGetRelationName(part_rel)))); + + table_close(part_rel, NoLock); + continue; + } + } + else + { + part_rel = default_rel; + partition_constraint = make_ands_explicit(def_part_constraints); + } + + /* + * Only RELKIND_RELATION relations (i.e. leaf partitions) need to be + * scanned. + */ + if (part_rel->rd_rel->relkind != RELKIND_RELATION) + { + if (part_rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(WARNING, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("skipped scanning foreign table \"%s\" which is a partition of default partition \"%s\"", + RelationGetRelationName(part_rel), + RelationGetRelationName(default_rel)))); + + if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) + table_close(part_rel, NoLock); + + continue; + } + + estate = CreateExecutorState(); + + /* Build expression execution states for partition check quals */ + partqualstate = ExecPrepareExpr(partition_constraint, estate); + + econtext = GetPerTupleExprContext(estate); + snapshot = RegisterSnapshot(GetLatestSnapshot()); + tupslot = table_slot_create(part_rel, &estate->es_tupleTable); + scan = table_beginscan(part_rel, snapshot, 0, NULL); + + /* + * Switch to per-tuple memory context and reset it for each tuple + * produced, so we don't leak memory. + */ + oldCxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + while (table_scan_getnextslot(scan, ForwardScanDirection, tupslot)) + { + econtext->ecxt_scantuple = tupslot; + + if (!ExecCheck(partqualstate, econtext)) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("updated partition constraint for default partition \"%s\" would be violated by some row", + RelationGetRelationName(default_rel)), + errtable(default_rel))); + + ResetExprContext(econtext); + CHECK_FOR_INTERRUPTS(); + } + + MemoryContextSwitchTo(oldCxt); + table_endscan(scan); + UnregisterSnapshot(snapshot); + ExecDropSingleTupleTableSlot(tupslot); + FreeExecutorState(estate); + + if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) + table_close(part_rel, NoLock); /* keep the lock until commit */ + } +} + +/* + * get_hash_partition_greatest_modulus + * + * Returns the greatest modulus of the hash partition bound. + * This is no longer used in the core code, but we keep it around + * in case external modules are using it. + */ +int +get_hash_partition_greatest_modulus(PartitionBoundInfo bound) +{ + Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH); + return bound->nindexes; +} + +/* + * make_one_partition_rbound + * + * Return a PartitionRangeBound given a list of PartitionRangeDatum elements + * and a flag telling whether the bound is lower or not. Made into a function + * because there are multiple sites that want to use this facility. + */ +static PartitionRangeBound * +make_one_partition_rbound(PartitionKey key, int index, List *datums, bool lower) +{ + PartitionRangeBound *bound; + ListCell *lc; + int i; + + Assert(datums != NIL); + + bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound)); + bound->index = index; + bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum)); + bound->kind = (PartitionRangeDatumKind *) palloc0(key->partnatts * + sizeof(PartitionRangeDatumKind)); + bound->lower = lower; + + i = 0; + foreach(lc, datums) + { + PartitionRangeDatum *datum = castNode(PartitionRangeDatum, lfirst(lc)); + + /* What's contained in this range datum? */ + bound->kind[i] = datum->kind; + + if (datum->kind == PARTITION_RANGE_DATUM_VALUE) + { + Const *val = castNode(Const, datum->value); + + if (val->constisnull) + elog(ERROR, "invalid range bound datum"); + bound->datums[i] = val->constvalue; + } + + i++; + } + + return bound; +} + +/* + * partition_rbound_cmp + * + * Return for two range bounds whether the 1st one (specified in datums1, + * kind1, and lower1) is <, =, or > the bound specified in *b2. + * + * partnatts, partsupfunc and partcollation give the number of attributes in the + * bounds to be compared, comparison function to be used and the collations of + * attributes, respectively. + * + * Note that if the values of the two range bounds compare equal, then we take + * into account whether they are upper or lower bounds, and an upper bound is + * considered to be smaller than a lower bound. This is important to the way + * that RelationBuildPartitionDesc() builds the PartitionBoundInfoData + * structure, which only stores the upper bound of a common boundary between + * two contiguous partitions. + */ +static int32 +partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, + Datum *datums1, PartitionRangeDatumKind *kind1, + bool lower1, PartitionRangeBound *b2) +{ + int32 cmpval = 0; /* placate compiler */ + int i; + Datum *datums2 = b2->datums; + PartitionRangeDatumKind *kind2 = b2->kind; + bool lower2 = b2->lower; + + for (i = 0; i < partnatts; i++) + { + /* + * First, handle cases where the column is unbounded, which should not + * invoke the comparison procedure, and should not consider any later + * columns. Note that the PartitionRangeDatumKind enum elements + * compare the same way as the values they represent. + */ + if (kind1[i] < kind2[i]) + return -1; + else if (kind1[i] > kind2[i]) + return 1; + else if (kind1[i] != PARTITION_RANGE_DATUM_VALUE) + + /* + * The column bounds are both MINVALUE or both MAXVALUE. No later + * columns should be considered, but we still need to compare + * whether they are upper or lower bounds. + */ + break; + + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i], + partcollation[i], + datums1[i], + datums2[i])); + if (cmpval != 0) + break; + } + + /* + * If the comparison is anything other than equal, we're done. If they + * compare equal though, we still have to consider whether the boundaries + * are inclusive or exclusive. Exclusive one is considered smaller of the + * two. + */ + if (cmpval == 0 && lower1 != lower2) + cmpval = lower1 ? 1 : -1; + + return cmpval; +} + +/* + * partition_rbound_datum_cmp + * + * Return whether range bound (specified in rb_datums and rb_kind) + * is <, =, or > partition key of tuple (tuple_datums) + * + * n_tuple_datums, partsupfunc and partcollation give number of attributes in + * the bounds to be compared, comparison function to be used and the collations + * of attributes resp. + * + */ +int32 +partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, + Datum *rb_datums, PartitionRangeDatumKind *rb_kind, + Datum *tuple_datums, int n_tuple_datums) +{ + int i; + int32 cmpval = -1; + + for (i = 0; i < n_tuple_datums; i++) + { + if (rb_kind[i] == PARTITION_RANGE_DATUM_MINVALUE) + return -1; + else if (rb_kind[i] == PARTITION_RANGE_DATUM_MAXVALUE) + return 1; + + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i], + partcollation[i], + rb_datums[i], + tuple_datums[i])); + if (cmpval != 0) + break; + } + + return cmpval; +} + +/* + * partition_hbound_cmp + * + * Compares modulus first, then remainder if modulus is equal. + */ +static int32 +partition_hbound_cmp(int modulus1, int remainder1, int modulus2, int remainder2) +{ + if (modulus1 < modulus2) + return -1; + if (modulus1 > modulus2) + return 1; + if (modulus1 == modulus2 && remainder1 != remainder2) + return (remainder1 > remainder2) ? 1 : -1; + return 0; +} + +/* + * partition_list_bsearch + * Returns the index of the greatest bound datum that is less than equal + * to the given value or -1 if all of the bound datums are greater + * + * *is_equal is set to true if the bound datum at the returned index is equal + * to the input value. + */ +int +partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, + PartitionBoundInfo boundinfo, + Datum value, bool *is_equal) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval; + + mid = (lo + hi + 1) / 2; + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[0], + partcollation[0], + boundinfo->datums[mid][0], + value)); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + if (*is_equal) + break; + } + else + hi = mid - 1; + } + + return lo; +} + +/* + * partition_range_bsearch + * Returns the index of the greatest range bound that is less than or + * equal to the given range bound or -1 if all of the range bounds are + * greater + * + * *is_equal is set to true if the range bound at the returned index is equal + * to the input range bound + */ +static int +partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, + PartitionBoundInfo boundinfo, + PartitionRangeBound *probe, bool *is_equal) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval; + + mid = (lo + hi + 1) / 2; + cmpval = partition_rbound_cmp(partnatts, partsupfunc, + partcollation, + boundinfo->datums[mid], + boundinfo->kind[mid], + (boundinfo->indexes[mid] == -1), + probe); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + + if (*is_equal) + break; + } + else + hi = mid - 1; + } + + return lo; +} + +/* + * partition_range_bsearch + * Returns the index of the greatest range bound that is less than or + * equal to the given tuple or -1 if all of the range bounds are greater + * + * *is_equal is set to true if the range bound at the returned index is equal + * to the input tuple. + */ +int +partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, + PartitionBoundInfo boundinfo, + int nvalues, Datum *values, bool *is_equal) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval; + + mid = (lo + hi + 1) / 2; + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[mid], + boundinfo->kind[mid], + values, + nvalues); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + + if (*is_equal) + break; + } + else + hi = mid - 1; + } + + return lo; +} + +/* + * partition_hash_bsearch + * Returns the index of the greatest (modulus, remainder) pair that is + * less than or equal to the given (modulus, remainder) pair or -1 if + * all of them are greater + */ +int +partition_hash_bsearch(PartitionBoundInfo boundinfo, + int modulus, int remainder) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval, + bound_modulus, + bound_remainder; + + mid = (lo + hi + 1) / 2; + bound_modulus = DatumGetInt32(boundinfo->datums[mid][0]); + bound_remainder = DatumGetInt32(boundinfo->datums[mid][1]); + cmpval = partition_hbound_cmp(bound_modulus, bound_remainder, + modulus, remainder); + if (cmpval <= 0) + { + lo = mid; + + if (cmpval == 0) + break; + } + else + hi = mid - 1; + } + + return lo; +} + +/* + * qsort_partition_hbound_cmp + * + * Hash bounds are sorted by modulus, then by remainder. + */ +static int32 +qsort_partition_hbound_cmp(const void *a, const void *b) +{ + PartitionHashBound *h1 = (*(PartitionHashBound *const *) a); + PartitionHashBound *h2 = (*(PartitionHashBound *const *) b); + + return partition_hbound_cmp(h1->modulus, h1->remainder, + h2->modulus, h2->remainder); +} + +/* + * qsort_partition_list_value_cmp + * + * Compare two list partition bound datums. + */ +static int32 +qsort_partition_list_value_cmp(const void *a, const void *b, void *arg) +{ + Datum val1 = (*(PartitionListValue *const *) a)->value, + val2 = (*(PartitionListValue *const *) b)->value; + PartitionKey key = (PartitionKey) arg; + + return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], + key->partcollation[0], + val1, val2)); +} + +/* + * qsort_partition_rbound_cmp + * + * Used when sorting range bounds across all range partitions. + */ +static int32 +qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) +{ + PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a); + PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b); + PartitionKey key = (PartitionKey) arg; + + return partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, b1->datums, b1->kind, + b1->lower, b2); +} + +/* + * get_partition_operator + * + * Return oid of the operator of the given strategy for the given partition + * key column. It is assumed that the partitioning key is of the same type as + * the chosen partitioning opclass, or at least binary-compatible. In the + * latter case, *need_relabel is set to true if the opclass is not of a + * polymorphic type (indicating a RelabelType node needed on top), otherwise + * false. + */ +static Oid +get_partition_operator(PartitionKey key, int col, StrategyNumber strategy, + bool *need_relabel) +{ + Oid operoid; + + /* + * Get the operator in the partitioning opfamily using the opclass' + * declared input type as both left- and righttype. + */ + operoid = get_opfamily_member(key->partopfamily[col], + key->partopcintype[col], + key->partopcintype[col], + strategy); + if (!OidIsValid(operoid)) + elog(ERROR, "missing operator %d(%u,%u) in partition opfamily %u", + strategy, key->partopcintype[col], key->partopcintype[col], + key->partopfamily[col]); + + /* + * If the partition key column is not of the same type as the operator + * class and not polymorphic, tell caller to wrap the non-Const expression + * in a RelabelType. This matches what parse_coerce.c does. + */ + *need_relabel = (key->parttypid[col] != key->partopcintype[col] && + key->partopcintype[col] != RECORDOID && + !IsPolymorphicType(key->partopcintype[col])); + + return operoid; +} + +/* + * make_partition_op_expr + * Returns an Expr for the given partition key column with arg1 and + * arg2 as its leftop and rightop, respectively + */ +static Expr * +make_partition_op_expr(PartitionKey key, int keynum, + uint16 strategy, Expr *arg1, Expr *arg2) +{ + Oid operoid; + bool need_relabel = false; + Expr *result = NULL; + + /* Get the correct btree operator for this partitioning column */ + operoid = get_partition_operator(key, keynum, strategy, &need_relabel); + + /* + * Chosen operator may be such that the non-Const operand needs to be + * coerced, so apply the same; see the comment in + * get_partition_operator(). + */ + if (!IsA(arg1, Const) && + (need_relabel || + key->partcollation[keynum] != key->parttypcoll[keynum])) + arg1 = (Expr *) makeRelabelType(arg1, + key->partopcintype[keynum], + -1, + key->partcollation[keynum], + COERCE_EXPLICIT_CAST); + + /* Generate the actual expression */ + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + { + List *elems = (List *) arg2; + int nelems = list_length(elems); + + Assert(nelems >= 1); + Assert(keynum == 0); + + if (nelems > 1 && + !type_is_array(key->parttypid[keynum])) + { + ArrayExpr *arrexpr; + ScalarArrayOpExpr *saopexpr; + + /* Construct an ArrayExpr for the right-hand inputs */ + arrexpr = makeNode(ArrayExpr); + arrexpr->array_typeid = + get_array_type(key->parttypid[keynum]); + arrexpr->array_collid = key->parttypcoll[keynum]; + arrexpr->element_typeid = key->parttypid[keynum]; + arrexpr->elements = elems; + arrexpr->multidims = false; + arrexpr->location = -1; + + /* Build leftop = ANY (rightop) */ + saopexpr = makeNode(ScalarArrayOpExpr); + saopexpr->opno = operoid; + saopexpr->opfuncid = get_opcode(operoid); + saopexpr->useOr = true; + saopexpr->inputcollid = key->partcollation[keynum]; + saopexpr->args = list_make2(arg1, arrexpr); + saopexpr->location = -1; + + result = (Expr *) saopexpr; + } + else + { + List *elemops = NIL; + ListCell *lc; + + foreach(lc, elems) + { + Expr *elem = lfirst(lc), + *elemop; + + elemop = make_opclause(operoid, + BOOLOID, + false, + arg1, elem, + InvalidOid, + key->partcollation[keynum]); + elemops = lappend(elemops, elemop); + } + + result = nelems > 1 ? makeBoolExpr(OR_EXPR, elemops, -1) : linitial(elemops); + } + break; + } + + case PARTITION_STRATEGY_RANGE: + result = make_opclause(operoid, + BOOLOID, + false, + arg1, arg2, + InvalidOid, + key->partcollation[keynum]); + break; + + default: + elog(ERROR, "invalid partitioning strategy"); + break; + } + + return result; +} + +/* + * get_qual_for_hash + * + * Returns a CHECK constraint expression to use as a hash partition's + * constraint, given the parent relation and partition bound structure. + * + * The partition constraint for a hash partition is always a call to the + * built-in function satisfies_hash_partition(). + */ +static List * +get_qual_for_hash(Relation parent, PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + FuncExpr *fexpr; + Node *relidConst; + Node *modulusConst; + Node *remainderConst; + List *args; + ListCell *partexprs_item; + int i; + + /* Fixed arguments. */ + relidConst = (Node *) makeConst(OIDOID, + -1, + InvalidOid, + sizeof(Oid), + ObjectIdGetDatum(RelationGetRelid(parent)), + false, + true); + + modulusConst = (Node *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + Int32GetDatum(spec->modulus), + false, + true); + + remainderConst = (Node *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + Int32GetDatum(spec->remainder), + false, + true); + + args = list_make3(relidConst, modulusConst, remainderConst); + partexprs_item = list_head(key->partexprs); + + /* Add an argument for each key column. */ + for (i = 0; i < key->partnatts; i++) + { + Node *keyCol; + + /* Left operand */ + if (key->partattrs[i] != 0) + { + keyCol = (Node *) makeVar(1, + key->partattrs[i], + key->parttypid[i], + key->parttypmod[i], + key->parttypcoll[i], + 0); + } + else + { + keyCol = (Node *) copyObject(lfirst(partexprs_item)); + partexprs_item = lnext(key->partexprs, partexprs_item); + } + + args = lappend(args, keyCol); + } + + fexpr = makeFuncExpr(F_SATISFIES_HASH_PARTITION, + BOOLOID, + args, + InvalidOid, + InvalidOid, + COERCE_EXPLICIT_CALL); + + return list_make1(fexpr); +} + +/* + * get_qual_for_list + * + * Returns an implicit-AND list of expressions to use as a list partition's + * constraint, given the parent relation and partition bound structure. + * + * The function returns NIL for a default partition when it's the only + * partition since in that case there is no constraint. + */ +static List * +get_qual_for_list(Relation parent, PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + List *result; + Expr *keyCol; + Expr *opexpr; + NullTest *nulltest; + ListCell *cell; + List *elems = NIL; + bool list_has_null = false; + + /* + * Only single-column list partitioning is supported, so we are worried + * only about the partition key with index 0. + */ + Assert(key->partnatts == 1); + + /* Construct Var or expression representing the partition column */ + if (key->partattrs[0] != 0) + keyCol = (Expr *) makeVar(1, + key->partattrs[0], + key->parttypid[0], + key->parttypmod[0], + key->parttypcoll[0], + 0); + else + keyCol = (Expr *) copyObject(linitial(key->partexprs)); + + /* + * For default list partition, collect datums for all the partitions. The + * default partition constraint should check that the partition key is + * equal to none of those. + */ + if (spec->is_default) + { + int i; + int ndatums = 0; + PartitionDesc pdesc = RelationGetPartitionDesc(parent); + PartitionBoundInfo boundinfo = pdesc->boundinfo; + + if (boundinfo) + { + ndatums = boundinfo->ndatums; + + if (partition_bound_accepts_nulls(boundinfo)) + list_has_null = true; + } + + /* + * If default is the only partition, there need not be any partition + * constraint on it. + */ + if (ndatums == 0 && !list_has_null) + return NIL; + + for (i = 0; i < ndatums; i++) + { + Const *val; + + /* + * Construct Const from known-not-null datum. We must be careful + * to copy the value, because our result has to be able to outlive + * the relcache entry we're copying from. + */ + val = makeConst(key->parttypid[0], + key->parttypmod[0], + key->parttypcoll[0], + key->parttyplen[0], + datumCopy(*boundinfo->datums[i], + key->parttypbyval[0], + key->parttyplen[0]), + false, /* isnull */ + key->parttypbyval[0]); + + elems = lappend(elems, val); + } + } + else + { + /* + * Create list of Consts for the allowed values, excluding any nulls. + */ + foreach(cell, spec->listdatums) + { + Const *val = castNode(Const, lfirst(cell)); + + if (val->constisnull) + list_has_null = true; + else + elems = lappend(elems, copyObject(val)); + } + } + + if (elems) + { + /* + * Generate the operator expression from the non-null partition + * values. + */ + opexpr = make_partition_op_expr(key, 0, BTEqualStrategyNumber, + keyCol, (Expr *) elems); + } + else + { + /* + * If there are no partition values, we don't need an operator + * expression. + */ + opexpr = NULL; + } + + if (!list_has_null) + { + /* + * Gin up a "col IS NOT NULL" test that will be AND'd with the main + * expression. This might seem redundant, but the partition routing + * machinery needs it. + */ + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NOT_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + + result = opexpr ? list_make2(nulltest, opexpr) : list_make1(nulltest); + } + else + { + /* + * Gin up a "col IS NULL" test that will be OR'd with the main + * expression. + */ + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + + if (opexpr) + { + Expr *or; + + or = makeBoolExpr(OR_EXPR, list_make2(nulltest, opexpr), -1); + result = list_make1(or); + } + else + result = list_make1(nulltest); + } + + /* + * Note that, in general, applying NOT to a constraint expression doesn't + * necessarily invert the set of rows it accepts, because NOT (NULL) is + * NULL. However, the partition constraints we construct here never + * evaluate to NULL, so applying NOT works as intended. + */ + if (spec->is_default) + { + result = list_make1(make_ands_explicit(result)); + result = list_make1(makeBoolExpr(NOT_EXPR, result, -1)); + } + + return result; +} + +/* + * get_qual_for_range + * + * Returns an implicit-AND list of expressions to use as a range partition's + * constraint, given the parent relation and partition bound structure. + * + * For a multi-column range partition key, say (a, b, c), with (al, bl, cl) + * as the lower bound tuple and (au, bu, cu) as the upper bound tuple, we + * generate an expression tree of the following form: + * + * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL) + * AND + * (a > al OR (a = al AND b > bl) OR (a = al AND b = bl AND c >= cl)) + * AND + * (a < au OR (a = au AND b < bu) OR (a = au AND b = bu AND c < cu)) + * + * It is often the case that a prefix of lower and upper bound tuples contains + * the same values, for example, (al = au), in which case, we will emit an + * expression tree of the following form: + * + * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL) + * AND + * (a = al) + * AND + * (b > bl OR (b = bl AND c >= cl)) + * AND + * (b < bu OR (b = bu AND c < cu)) + * + * If a bound datum is either MINVALUE or MAXVALUE, these expressions are + * simplified using the fact that any value is greater than MINVALUE and less + * than MAXVALUE. So, for example, if cu = MAXVALUE, c < cu is automatically + * true, and we need not emit any expression for it, and the last line becomes + * + * (b < bu) OR (b = bu), which is simplified to (b <= bu) + * + * In most common cases with only one partition column, say a, the following + * expression tree will be generated: a IS NOT NULL AND a >= al AND a < au + * + * For default partition, it returns the negation of the constraints of all + * the other partitions. + * + * External callers should pass for_default as false; we set it to true only + * when recursing. + */ +static List * +get_qual_for_range(Relation parent, PartitionBoundSpec *spec, + bool for_default) +{ + List *result = NIL; + ListCell *cell1, + *cell2, + *partexprs_item, + *partexprs_item_saved; + int i, + j; + PartitionRangeDatum *ldatum, + *udatum; + PartitionKey key = RelationGetPartitionKey(parent); + Expr *keyCol; + Const *lower_val, + *upper_val; + List *lower_or_arms, + *upper_or_arms; + int num_or_arms, + current_or_arm; + ListCell *lower_or_start_datum, + *upper_or_start_datum; + bool need_next_lower_arm, + need_next_upper_arm; + + if (spec->is_default) + { + List *or_expr_args = NIL; + PartitionDesc pdesc = RelationGetPartitionDesc(parent); + Oid *inhoids = pdesc->oids; + int nparts = pdesc->nparts, + i; + + for (i = 0; i < nparts; i++) + { + Oid inhrelid = inhoids[i]; + HeapTuple tuple; + Datum datum; + bool isnull; + PartitionBoundSpec *bspec; + + tuple = SearchSysCache1(RELOID, inhrelid); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", inhrelid); + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + if (isnull) + elog(ERROR, "null relpartbound for relation %u", inhrelid); + + bspec = (PartitionBoundSpec *) + stringToNode(TextDatumGetCString(datum)); + if (!IsA(bspec, PartitionBoundSpec)) + elog(ERROR, "expected PartitionBoundSpec"); + + if (!bspec->is_default) + { + List *part_qual; + + part_qual = get_qual_for_range(parent, bspec, true); + + /* + * AND the constraints of the partition and add to + * or_expr_args + */ + or_expr_args = lappend(or_expr_args, list_length(part_qual) > 1 + ? makeBoolExpr(AND_EXPR, part_qual, -1) + : linitial(part_qual)); + } + ReleaseSysCache(tuple); + } + + if (or_expr_args != NIL) + { + Expr *other_parts_constr; + + /* + * Combine the constraints obtained for non-default partitions + * using OR. As requested, each of the OR's args doesn't include + * the NOT NULL test for partition keys (which is to avoid its + * useless repetition). Add the same now. + */ + other_parts_constr = + makeBoolExpr(AND_EXPR, + lappend(get_range_nulltest(key), + list_length(or_expr_args) > 1 + ? makeBoolExpr(OR_EXPR, or_expr_args, + -1) + : linitial(or_expr_args)), + -1); + + /* + * Finally, the default partition contains everything *NOT* + * contained in the non-default partitions. + */ + result = list_make1(makeBoolExpr(NOT_EXPR, + list_make1(other_parts_constr), -1)); + } + + return result; + } + + lower_or_start_datum = list_head(spec->lowerdatums); + upper_or_start_datum = list_head(spec->upperdatums); + num_or_arms = key->partnatts; + + /* + * If it is the recursive call for default, we skip the get_range_nulltest + * to avoid accumulating the NullTest on the same keys for each partition. + */ + if (!for_default) + result = get_range_nulltest(key); + + /* + * Iterate over the key columns and check if the corresponding lower and + * upper datums are equal using the btree equality operator for the + * column's type. If equal, we emit single keyCol = common_value + * expression. Starting from the first column for which the corresponding + * lower and upper bound datums are not equal, we generate OR expressions + * as shown in the function's header comment. + */ + i = 0; + partexprs_item = list_head(key->partexprs); + partexprs_item_saved = partexprs_item; /* placate compiler */ + forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums) + { + EState *estate; + MemoryContext oldcxt; + Expr *test_expr; + ExprState *test_exprstate; + Datum test_result; + bool isNull; + + ldatum = castNode(PartitionRangeDatum, lfirst(cell1)); + udatum = castNode(PartitionRangeDatum, lfirst(cell2)); + + /* + * Since get_range_key_properties() modifies partexprs_item, and we + * might need to start over from the previous expression in the later + * part of this function, save away the current value. + */ + partexprs_item_saved = partexprs_item; + + get_range_key_properties(key, i, ldatum, udatum, + &partexprs_item, + &keyCol, + &lower_val, &upper_val); + + /* + * If either value is NULL, the corresponding partition bound is + * either MINVALUE or MAXVALUE, and we treat them as unequal, because + * even if they're the same, there is no common value to equate the + * key column with. + */ + if (!lower_val || !upper_val) + break; + + /* Create the test expression */ + estate = CreateExecutorState(); + oldcxt = MemoryContextSwitchTo(estate->es_query_cxt); + test_expr = make_partition_op_expr(key, i, BTEqualStrategyNumber, + (Expr *) lower_val, + (Expr *) upper_val); + fix_opfuncids((Node *) test_expr); + test_exprstate = ExecInitExpr(test_expr, NULL); + test_result = ExecEvalExprSwitchContext(test_exprstate, + GetPerTupleExprContext(estate), + &isNull); + MemoryContextSwitchTo(oldcxt); + FreeExecutorState(estate); + + /* If not equal, go generate the OR expressions */ + if (!DatumGetBool(test_result)) + break; + + /* + * The bounds for the last key column can't be equal, because such a + * range partition would never be allowed to be defined (it would have + * an empty range otherwise). + */ + if (i == key->partnatts - 1) + elog(ERROR, "invalid range bound specification"); + + /* Equal, so generate keyCol = lower_val expression */ + result = lappend(result, + make_partition_op_expr(key, i, BTEqualStrategyNumber, + keyCol, (Expr *) lower_val)); + + i++; + } + + /* First pair of lower_val and upper_val that are not equal. */ + lower_or_start_datum = cell1; + upper_or_start_datum = cell2; + + /* OR will have as many arms as there are key columns left. */ + num_or_arms = key->partnatts - i; + current_or_arm = 0; + lower_or_arms = upper_or_arms = NIL; + need_next_lower_arm = need_next_upper_arm = true; + while (current_or_arm < num_or_arms) + { + List *lower_or_arm_args = NIL, + *upper_or_arm_args = NIL; + + /* Restart scan of columns from the i'th one */ + j = i; + partexprs_item = partexprs_item_saved; + + for_both_cell(cell1, spec->lowerdatums, lower_or_start_datum, + cell2, spec->upperdatums, upper_or_start_datum) + { + PartitionRangeDatum *ldatum_next = NULL, + *udatum_next = NULL; + + ldatum = castNode(PartitionRangeDatum, lfirst(cell1)); + if (lnext(spec->lowerdatums, cell1)) + ldatum_next = castNode(PartitionRangeDatum, + lfirst(lnext(spec->lowerdatums, cell1))); + udatum = castNode(PartitionRangeDatum, lfirst(cell2)); + if (lnext(spec->upperdatums, cell2)) + udatum_next = castNode(PartitionRangeDatum, + lfirst(lnext(spec->upperdatums, cell2))); + get_range_key_properties(key, j, ldatum, udatum, + &partexprs_item, + &keyCol, + &lower_val, &upper_val); + + if (need_next_lower_arm && lower_val) + { + uint16 strategy; + + /* + * For the non-last columns of this arm, use the EQ operator. + * For the last column of this arm, use GT, unless this is the + * last column of the whole bound check, or the next bound + * datum is MINVALUE, in which case use GE. + */ + if (j - i < current_or_arm) + strategy = BTEqualStrategyNumber; + else if (j == key->partnatts - 1 || + (ldatum_next && + ldatum_next->kind == PARTITION_RANGE_DATUM_MINVALUE)) + strategy = BTGreaterEqualStrategyNumber; + else + strategy = BTGreaterStrategyNumber; + + lower_or_arm_args = lappend(lower_or_arm_args, + make_partition_op_expr(key, j, + strategy, + keyCol, + (Expr *) lower_val)); + } + + if (need_next_upper_arm && upper_val) + { + uint16 strategy; + + /* + * For the non-last columns of this arm, use the EQ operator. + * For the last column of this arm, use LT, unless the next + * bound datum is MAXVALUE, in which case use LE. + */ + if (j - i < current_or_arm) + strategy = BTEqualStrategyNumber; + else if (udatum_next && + udatum_next->kind == PARTITION_RANGE_DATUM_MAXVALUE) + strategy = BTLessEqualStrategyNumber; + else + strategy = BTLessStrategyNumber; + + upper_or_arm_args = lappend(upper_or_arm_args, + make_partition_op_expr(key, j, + strategy, + keyCol, + (Expr *) upper_val)); + } + + /* + * Did we generate enough of OR's arguments? First arm considers + * the first of the remaining columns, second arm considers first + * two of the remaining columns, and so on. + */ + ++j; + if (j - i > current_or_arm) + { + /* + * We must not emit any more arms if the new column that will + * be considered is unbounded, or this one was. + */ + if (!lower_val || !ldatum_next || + ldatum_next->kind != PARTITION_RANGE_DATUM_VALUE) + need_next_lower_arm = false; + if (!upper_val || !udatum_next || + udatum_next->kind != PARTITION_RANGE_DATUM_VALUE) + need_next_upper_arm = false; + break; + } + } + + if (lower_or_arm_args != NIL) + lower_or_arms = lappend(lower_or_arms, + list_length(lower_or_arm_args) > 1 + ? makeBoolExpr(AND_EXPR, lower_or_arm_args, -1) + : linitial(lower_or_arm_args)); + + if (upper_or_arm_args != NIL) + upper_or_arms = lappend(upper_or_arms, + list_length(upper_or_arm_args) > 1 + ? makeBoolExpr(AND_EXPR, upper_or_arm_args, -1) + : linitial(upper_or_arm_args)); + + /* If no work to do in the next iteration, break away. */ + if (!need_next_lower_arm && !need_next_upper_arm) + break; + + ++current_or_arm; + } + + /* + * Generate the OR expressions for each of lower and upper bounds (if + * required), and append to the list of implicitly ANDed list of + * expressions. + */ + if (lower_or_arms != NIL) + result = lappend(result, + list_length(lower_or_arms) > 1 + ? makeBoolExpr(OR_EXPR, lower_or_arms, -1) + : linitial(lower_or_arms)); + if (upper_or_arms != NIL) + result = lappend(result, + list_length(upper_or_arms) > 1 + ? makeBoolExpr(OR_EXPR, upper_or_arms, -1) + : linitial(upper_or_arms)); + + /* + * As noted above, for non-default, we return list with constant TRUE. If + * the result is NIL during the recursive call for default, it implies + * this is the only other partition which can hold every value of the key + * except NULL. Hence we return the NullTest result skipped earlier. + */ + if (result == NIL) + result = for_default + ? get_range_nulltest(key) + : list_make1(makeBoolConst(true, false)); + + return result; +} + +/* + * get_range_key_properties + * Returns range partition key information for a given column + * + * This is a subroutine for get_qual_for_range, and its API is pretty + * specialized to that caller. + * + * Constructs an Expr for the key column (returned in *keyCol) and Consts + * for the lower and upper range limits (returned in *lower_val and + * *upper_val). For MINVALUE/MAXVALUE limits, NULL is returned instead of + * a Const. All of these structures are freshly palloc'd. + * + * *partexprs_item points to the cell containing the next expression in + * the key->partexprs list, or NULL. It may be advanced upon return. + */ +static void +get_range_key_properties(PartitionKey key, int keynum, + PartitionRangeDatum *ldatum, + PartitionRangeDatum *udatum, + ListCell **partexprs_item, + Expr **keyCol, + Const **lower_val, Const **upper_val) +{ + /* Get partition key expression for this column */ + if (key->partattrs[keynum] != 0) + { + *keyCol = (Expr *) makeVar(1, + key->partattrs[keynum], + key->parttypid[keynum], + key->parttypmod[keynum], + key->parttypcoll[keynum], + 0); + } + else + { + if (*partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + *keyCol = copyObject(lfirst(*partexprs_item)); + *partexprs_item = lnext(key->partexprs, *partexprs_item); + } + + /* Get appropriate Const nodes for the bounds */ + if (ldatum->kind == PARTITION_RANGE_DATUM_VALUE) + *lower_val = castNode(Const, copyObject(ldatum->value)); + else + *lower_val = NULL; + + if (udatum->kind == PARTITION_RANGE_DATUM_VALUE) + *upper_val = castNode(Const, copyObject(udatum->value)); + else + *upper_val = NULL; +} + +/* + * get_range_nulltest + * + * A non-default range partition table does not currently allow partition + * keys to be null, so emit an IS NOT NULL expression for each key column. + */ +static List * +get_range_nulltest(PartitionKey key) +{ + List *result = NIL; + NullTest *nulltest; + ListCell *partexprs_item; + int i; + + partexprs_item = list_head(key->partexprs); + for (i = 0; i < key->partnatts; i++) + { + Expr *keyCol; + + if (key->partattrs[i] != 0) + { + keyCol = (Expr *) makeVar(1, + key->partattrs[i], + key->parttypid[i], + key->parttypmod[i], + key->parttypcoll[i], + 0); + } + else + { + if (partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + keyCol = copyObject(lfirst(partexprs_item)); + partexprs_item = lnext(key->partexprs, partexprs_item); + } + + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NOT_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + result = lappend(result, nulltest); + } + + return result; +} + +/* + * compute_partition_hash_value + * + * Compute the hash value for given partition key values. + */ +uint64 +compute_partition_hash_value(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, + Datum *values, bool *isnull) +{ + int i; + uint64 rowHash = 0; + Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); + + for (i = 0; i < partnatts; i++) + { + /* Nulls are just ignored */ + if (!isnull[i]) + { + Datum hash; + + Assert(OidIsValid(partsupfunc[i].fn_oid)); + + /* + * Compute hash for each datum value by calling respective + * datatype-specific hash functions of each partition key + * attribute. + */ + hash = FunctionCall2Coll(&partsupfunc[i], partcollation[i], + values[i], seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } + + return rowHash; +} + +/* + * satisfies_hash_partition + * + * This is an SQL-callable function for use in hash partition constraints. + * The first three arguments are the parent table OID, modulus, and remainder. + * The remaining arguments are the value of the partitioning columns (or + * expressions); these are hashed and the results are combined into a single + * hash value by calling hash_combine64. + * + * Returns true if remainder produced when this computed single hash value is + * divided by the given modulus is equal to given remainder, otherwise false. + * NB: it's important that this never return null, as the constraint machinery + * would consider that to be a "pass". + * + * See get_qual_for_hash() for usage. + */ +Datum +satisfies_hash_partition(PG_FUNCTION_ARGS) +{ + typedef struct ColumnsHashData + { + Oid relid; + int nkeys; + Oid variadic_type; + int16 variadic_typlen; + bool variadic_typbyval; + char variadic_typalign; + Oid partcollid[PARTITION_MAX_KEYS]; + FmgrInfo partsupfunc[FLEXIBLE_ARRAY_MEMBER]; + } ColumnsHashData; + Oid parentId; + int modulus; + int remainder; + Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); + ColumnsHashData *my_extra; + uint64 rowHash = 0; + + /* Return false if the parent OID, modulus, or remainder is NULL. */ + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_BOOL(false); + parentId = PG_GETARG_OID(0); + modulus = PG_GETARG_INT32(1); + remainder = PG_GETARG_INT32(2); + + /* Sanity check modulus and remainder. */ + if (modulus <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("modulus for hash partition must be an integer value greater than zero"))); + if (remainder < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("remainder for hash partition must be an integer value greater than or equal to zero"))); + if (remainder >= modulus) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("remainder for hash partition must be less than modulus"))); + + /* + * Cache hash function information. + */ + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + if (my_extra == NULL || my_extra->relid != parentId) + { + Relation parent; + PartitionKey key; + int j; + + /* Open parent relation and fetch partition key info */ + parent = relation_open(parentId, AccessShareLock); + key = RelationGetPartitionKey(parent); + + /* Reject parent table that is not hash-partitioned. */ + if (key == NULL || key->strategy != PARTITION_STRATEGY_HASH) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"%s\" is not a hash partitioned table", + get_rel_name(parentId)))); + + if (!get_fn_expr_variadic(fcinfo->flinfo)) + { + int nargs = PG_NARGS() - 3; + + /* complain if wrong number of column values */ + if (key->partnatts != nargs) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)", + key->partnatts, nargs))); + + /* allocate space for our cache */ + fcinfo->flinfo->fn_extra = + MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, + offsetof(ColumnsHashData, partsupfunc) + + sizeof(FmgrInfo) * nargs); + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + my_extra->relid = parentId; + my_extra->nkeys = key->partnatts; + memcpy(my_extra->partcollid, key->partcollation, + key->partnatts * sizeof(Oid)); + + /* check argument types and save fmgr_infos */ + for (j = 0; j < key->partnatts; ++j) + { + Oid argtype = get_fn_expr_argtype(fcinfo->flinfo, j + 3); + + if (argtype != key->parttypid[j] && !IsBinaryCoercible(argtype, key->parttypid[j])) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("column %d of the partition key has type \"%s\", but supplied value is of type \"%s\"", + j + 1, format_type_be(key->parttypid[j]), format_type_be(argtype)))); + + fmgr_info_copy(&my_extra->partsupfunc[j], + &key->partsupfunc[j], + fcinfo->flinfo->fn_mcxt); + } + } + else + { + ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3); + + /* allocate space for our cache -- just one FmgrInfo in this case */ + fcinfo->flinfo->fn_extra = + MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, + offsetof(ColumnsHashData, partsupfunc) + + sizeof(FmgrInfo)); + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + my_extra->relid = parentId; + my_extra->nkeys = key->partnatts; + my_extra->variadic_type = ARR_ELEMTYPE(variadic_array); + get_typlenbyvalalign(my_extra->variadic_type, + &my_extra->variadic_typlen, + &my_extra->variadic_typbyval, + &my_extra->variadic_typalign); + my_extra->partcollid[0] = key->partcollation[0]; + + /* check argument types */ + for (j = 0; j < key->partnatts; ++j) + if (key->parttypid[j] != my_extra->variadic_type) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("column %d of the partition key has type \"%s\", but supplied value is of type \"%s\"", + j + 1, + format_type_be(key->parttypid[j]), + format_type_be(my_extra->variadic_type)))); + + fmgr_info_copy(&my_extra->partsupfunc[0], + &key->partsupfunc[0], + fcinfo->flinfo->fn_mcxt); + } + + /* Hold lock until commit */ + relation_close(parent, NoLock); + } + + if (!OidIsValid(my_extra->variadic_type)) + { + int nkeys = my_extra->nkeys; + int i; + + /* + * For a non-variadic call, neither the number of arguments nor their + * types can change across calls, so avoid the expense of rechecking + * here. + */ + + for (i = 0; i < nkeys; i++) + { + Datum hash; + + /* keys start from fourth argument of function. */ + int argno = i + 3; + + if (PG_ARGISNULL(argno)) + continue; + + hash = FunctionCall2Coll(&my_extra->partsupfunc[i], + my_extra->partcollid[i], + PG_GETARG_DATUM(argno), + seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } + else + { + ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3); + int i; + int nelems; + Datum *datum; + bool *isnull; + + deconstruct_array(variadic_array, + my_extra->variadic_type, + my_extra->variadic_typlen, + my_extra->variadic_typbyval, + my_extra->variadic_typalign, + &datum, &isnull, &nelems); + + /* complain if wrong number of column values */ + if (nelems != my_extra->nkeys) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)", + my_extra->nkeys, nelems))); + + for (i = 0; i < nelems; i++) + { + Datum hash; + + if (isnull[i]) + continue; + + hash = FunctionCall2Coll(&my_extra->partsupfunc[0], + my_extra->partcollid[0], + datum[i], + seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } + + PG_RETURN_BOOL(rowHash % modulus == remainder); +} diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c new file mode 100644 index 0000000..0f124a5 --- /dev/null +++ b/src/backend/partitioning/partdesc.c @@ -0,0 +1,368 @@ +/*------------------------------------------------------------------------- + * + * partdesc.c + * Support routines for manipulating partition descriptors + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/partitioning/partdesc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/htup_details.h" +#include "access/table.h" +#include "catalog/indexing.h" +#include "catalog/partition.h" +#include "catalog/pg_inherits.h" +#include "partitioning/partbounds.h" +#include "partitioning/partdesc.h" +#include "storage/bufmgr.h" +#include "storage/sinval.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/hsearch.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/partcache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +typedef struct PartitionDirectoryData +{ + MemoryContext pdir_mcxt; + HTAB *pdir_hash; +} PartitionDirectoryData; + +typedef struct PartitionDirectoryEntry +{ + Oid reloid; + Relation rel; + PartitionDesc pd; +} PartitionDirectoryEntry; + +static void RelationBuildPartitionDesc(Relation rel); + + +/* + * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned + * + * Note: we arrange for partition descriptors to not get freed until the + * relcache entry's refcount goes to zero (see hacks in RelationClose, + * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even + * though we hand back a direct pointer into the relcache entry, it's safe + * for callers to continue to use that pointer as long as (a) they hold the + * relation open, and (b) they hold a relation lock strong enough to ensure + * that the data doesn't become stale. + */ +PartitionDesc +RelationGetPartitionDesc(Relation rel) +{ + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + return NULL; + + if (unlikely(rel->rd_partdesc == NULL)) + RelationBuildPartitionDesc(rel); + + return rel->rd_partdesc; +} + +/* + * RelationBuildPartitionDesc + * Form rel's partition descriptor, and store in relcache entry + * + * Partition descriptor is a complex structure; to avoid complicated logic to + * free individual elements whenever the relcache entry is flushed, we give it + * its own memory context, a child of CacheMemoryContext, which can easily be + * deleted on its own. To avoid leaking memory in that context in case of an + * error partway through this function, the context is initially created as a + * child of CurTransactionContext and only re-parented to CacheMemoryContext + * at the end, when no further errors are possible. Also, we don't make this + * context the current context except in very brief code sections, out of fear + * that some of our callees allocate memory on their own which would be leaked + * permanently. + */ +static void +RelationBuildPartitionDesc(Relation rel) +{ + PartitionDesc partdesc; + PartitionBoundInfo boundinfo = NULL; + List *inhoids; + PartitionBoundSpec **boundspecs = NULL; + Oid *oids = NULL; + bool *is_leaf = NULL; + ListCell *cell; + int i, + nparts; + PartitionKey key = RelationGetPartitionKey(rel); + MemoryContext new_pdcxt; + MemoryContext oldcxt; + int *mapping; + + /* + * Get partition oids from pg_inherits. This uses a single snapshot to + * fetch the list of children, so while more children may be getting added + * concurrently, whatever this function returns will be accurate as of + * some well-defined point in time. + */ + inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); + nparts = list_length(inhoids); + + /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */ + if (nparts > 0) + { + oids = (Oid *) palloc(nparts * sizeof(Oid)); + is_leaf = (bool *) palloc(nparts * sizeof(bool)); + boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + } + + /* Collect bound spec nodes for each partition. */ + i = 0; + foreach(cell, inhoids) + { + Oid inhrelid = lfirst_oid(cell); + HeapTuple tuple; + PartitionBoundSpec *boundspec = NULL; + + /* Try fetching the tuple from the catcache, for speed. */ + tuple = SearchSysCache1(RELOID, inhrelid); + if (HeapTupleIsValid(tuple)) + { + Datum datum; + bool isnull; + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + if (!isnull) + boundspec = stringToNode(TextDatumGetCString(datum)); + ReleaseSysCache(tuple); + } + + /* + * The system cache may be out of date; if so, we may find no pg_class + * tuple or an old one where relpartbound is NULL. In that case, try + * the table directly. We can't just AcceptInvalidationMessages() and + * retry the system cache lookup because it's possible that a + * concurrent ATTACH PARTITION operation has removed itself from the + * ProcArray but not yet added invalidation messages to the shared + * queue; InvalidateSystemCaches() would work, but seems excessive. + * + * Note that this algorithm assumes that PartitionBoundSpec we manage + * to fetch is the right one -- so this is only good enough for + * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or + * some hypothetical operation that changes the partition bounds. + */ + if (boundspec == NULL) + { + Relation pg_class; + SysScanDesc scan; + ScanKeyData key[1]; + Datum datum; + bool isnull; + + pg_class = table_open(RelationRelationId, AccessShareLock); + ScanKeyInit(&key[0], + Anum_pg_class_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(inhrelid)); + scan = systable_beginscan(pg_class, ClassOidIndexId, true, + NULL, 1, key); + tuple = systable_getnext(scan); + datum = heap_getattr(tuple, Anum_pg_class_relpartbound, + RelationGetDescr(pg_class), &isnull); + if (!isnull) + boundspec = stringToNode(TextDatumGetCString(datum)); + systable_endscan(scan); + table_close(pg_class, AccessShareLock); + } + + /* Sanity checks. */ + if (!boundspec) + elog(ERROR, "missing relpartbound for relation %u", inhrelid); + if (!IsA(boundspec, PartitionBoundSpec)) + elog(ERROR, "invalid relpartbound for relation %u", inhrelid); + + /* + * If the PartitionBoundSpec says this is the default partition, its + * OID should match pg_partitioned_table.partdefid; if not, the + * catalog is corrupt. + */ + if (boundspec->is_default) + { + Oid partdefid; + + partdefid = get_default_partition_oid(RelationGetRelid(rel)); + if (partdefid != inhrelid) + elog(ERROR, "expected partdefid %u, but got %u", + inhrelid, partdefid); + } + + /* Save results. */ + oids[i] = inhrelid; + is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE); + boundspecs[i] = boundspec; + ++i; + } + + /* + * Create PartitionBoundInfo and mapping, working in the caller's context. + * This could fail, but we haven't done any damage if so. + */ + if (nparts > 0) + boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping); + + /* + * Now build the actual relcache partition descriptor, copying all the + * data into a new, small context. As per above comment, we don't make + * this a long-lived context until it's finished. + */ + new_pdcxt = AllocSetContextCreate(CurTransactionContext, + "partition descriptor", + ALLOCSET_SMALL_SIZES); + MemoryContextCopyAndSetIdentifier(new_pdcxt, + RelationGetRelationName(rel)); + + partdesc = (PartitionDescData *) + MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData)); + partdesc->nparts = nparts; + /* If there are no partitions, the rest of the partdesc can stay zero */ + if (nparts > 0) + { + oldcxt = MemoryContextSwitchTo(new_pdcxt); + partdesc->boundinfo = partition_bounds_copy(boundinfo, key); + partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid)); + partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool)); + + /* + * Assign OIDs from the original array into mapped indexes of the + * result array. The order of OIDs in the former is defined by the + * catalog scan that retrieved them, whereas that in the latter is + * defined by canonicalized representation of the partition bounds. + * Also save leaf-ness of each partition. + */ + for (i = 0; i < nparts; i++) + { + int index = mapping[i]; + + partdesc->oids[index] = oids[i]; + partdesc->is_leaf[index] = is_leaf[i]; + } + MemoryContextSwitchTo(oldcxt); + } + + /* + * We have a fully valid partdesc ready to store into the relcache. + * Reparent it so it has the right lifespan. + */ + MemoryContextSetParent(new_pdcxt, CacheMemoryContext); + + /* + * But first, a kluge: if there's an old rd_pdcxt, it contains an old + * partition descriptor that may still be referenced somewhere. Preserve + * it, while not leaking it, by reattaching it as a child context of the + * new rd_pdcxt. Eventually it will get dropped by either RelationClose + * or RelationClearRelation. + */ + if (rel->rd_pdcxt != NULL) + MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt); + rel->rd_pdcxt = new_pdcxt; + rel->rd_partdesc = partdesc; +} + +/* + * CreatePartitionDirectory + * Create a new partition directory object. + */ +PartitionDirectory +CreatePartitionDirectory(MemoryContext mcxt) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(mcxt); + PartitionDirectory pdir; + HASHCTL ctl; + + MemSet(&ctl, 0, sizeof(HASHCTL)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(PartitionDirectoryEntry); + ctl.hcxt = mcxt; + + pdir = palloc(sizeof(PartitionDirectoryData)); + pdir->pdir_mcxt = mcxt; + pdir->pdir_hash = hash_create("partition directory", 256, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + MemoryContextSwitchTo(oldcontext); + return pdir; +} + +/* + * PartitionDirectoryLookup + * Look up the partition descriptor for a relation in the directory. + * + * The purpose of this function is to ensure that we get the same + * PartitionDesc for each relation every time we look it up. In the + * face of concurrent DDL, different PartitionDescs may be constructed with + * different views of the catalog state, but any single particular OID + * will always get the same PartitionDesc for as long as the same + * PartitionDirectory is used. + */ +PartitionDesc +PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel) +{ + PartitionDirectoryEntry *pde; + Oid relid = RelationGetRelid(rel); + bool found; + + pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found); + if (!found) + { + /* + * We must keep a reference count on the relation so that the + * PartitionDesc to which we are pointing can't get destroyed. + */ + RelationIncrementReferenceCount(rel); + pde->rel = rel; + pde->pd = RelationGetPartitionDesc(rel); + Assert(pde->pd != NULL); + } + return pde->pd; +} + +/* + * DestroyPartitionDirectory + * Destroy a partition directory. + * + * Release the reference counts we're holding. + */ +void +DestroyPartitionDirectory(PartitionDirectory pdir) +{ + HASH_SEQ_STATUS status; + PartitionDirectoryEntry *pde; + + hash_seq_init(&status, pdir->pdir_hash); + while ((pde = hash_seq_search(&status)) != NULL) + RelationDecrementReferenceCount(pde->rel); +} + +/* + * get_default_oid_from_partdesc + * + * Given a partition descriptor, return the OID of the default partition, if + * one exists; else, return InvalidOid. + */ +Oid +get_default_oid_from_partdesc(PartitionDesc partdesc) +{ + if (partdesc && partdesc->boundinfo && + partition_bound_has_default(partdesc->boundinfo)) + return partdesc->oids[partdesc->boundinfo->default_index]; + + return InvalidOid; +} diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c new file mode 100644 index 0000000..988215b --- /dev/null +++ b/src/backend/partitioning/partprune.c @@ -0,0 +1,3589 @@ +/*------------------------------------------------------------------------- + * + * partprune.c + * Support for partition pruning during query planning and execution + * + * This module implements partition pruning using the information contained in + * a table's partition descriptor, query clauses, and run-time parameters. + * + * During planning, clauses that can be matched to the table's partition key + * are turned into a set of "pruning steps", which are then executed to + * identify a set of partitions (as indexes in the RelOptInfo->part_rels + * array) that satisfy the constraints in the step. Partitions not in the set + * are said to have been pruned. + * + * A base pruning step may involve expressions whose values are only known + * during execution, such as Params, in which case pruning cannot occur + * entirely during planning. In that case, such steps are included alongside + * the plan, so that they can be used by the executor for further pruning. + * + * There are two kinds of pruning steps. A "base" pruning step represents + * tests on partition key column(s), typically comparisons to expressions. + * A "combine" pruning step represents a Boolean connector (AND/OR), and + * combines the outputs of some previous steps using the appropriate + * combination method. + * + * See gen_partprune_steps_internal() for more details on step generation. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/partitioning/partprune.c + * + *------------------------------------------------------------------------- +*/ +#include "postgres.h" + +#include "access/hash.h" +#include "access/nbtree.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/appendinfo.h" +#include "optimizer/cost.h" +#include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" +#include "parser/parsetree.h" +#include "partitioning/partbounds.h" +#include "partitioning/partprune.h" +#include "rewrite/rewriteManip.h" +#include "utils/array.h" +#include "utils/lsyscache.h" + + +/* + * Information about a clause matched with a partition key. + */ +typedef struct PartClauseInfo +{ + int keyno; /* Partition key number (0 to partnatts - 1) */ + Oid opno; /* operator used to compare partkey to expr */ + bool op_is_ne; /* is clause's original operator <> ? */ + Expr *expr; /* expr the partition key is compared to */ + Oid cmpfn; /* Oid of function to compare 'expr' to the + * partition key */ + int op_strategy; /* btree strategy identifying the operator */ +} PartClauseInfo; + +/* + * PartClauseMatchStatus + * Describes the result of match_clause_to_partition_key() + */ +typedef enum PartClauseMatchStatus +{ + PARTCLAUSE_NOMATCH, + PARTCLAUSE_MATCH_CLAUSE, + PARTCLAUSE_MATCH_NULLNESS, + PARTCLAUSE_MATCH_STEPS, + PARTCLAUSE_MATCH_CONTRADICT, + PARTCLAUSE_UNSUPPORTED +} PartClauseMatchStatus; + +/* + * PartClauseTarget + * Identifies which qual clauses we can use for generating pruning steps + */ +typedef enum PartClauseTarget +{ + PARTTARGET_PLANNER, /* want to prune during planning */ + PARTTARGET_INITIAL, /* want to prune during executor startup */ + PARTTARGET_EXEC /* want to prune during each plan node scan */ +} PartClauseTarget; + +/* + * GeneratePruningStepsContext + * Information about the current state of generation of "pruning steps" + * for a given set of clauses + * + * gen_partprune_steps() initializes and returns an instance of this struct. + * + * Note that has_mutable_op, has_mutable_arg, and has_exec_param are set if + * we found any potentially-useful-for-pruning clause having those properties, + * whether or not we actually used the clause in the steps list. This + * definition allows us to skip the PARTTARGET_EXEC pass in some cases. + */ +typedef struct GeneratePruningStepsContext +{ + /* Copies of input arguments for gen_partprune_steps: */ + RelOptInfo *rel; /* the partitioned relation */ + PartClauseTarget target; /* use-case we're generating steps for */ + /* Result data: */ + List *steps; /* list of PartitionPruneSteps */ + bool has_mutable_op; /* clauses include any stable operators */ + bool has_mutable_arg; /* clauses include any mutable comparison + * values, *other than* exec params */ + bool has_exec_param; /* clauses include any PARAM_EXEC params */ + bool contradictory; /* clauses were proven self-contradictory */ + /* Working state: */ + int next_step_id; +} GeneratePruningStepsContext; + +/* The result of performing one PartitionPruneStep */ +typedef struct PruneStepResult +{ + /* + * The offsets of bounds (in a table's boundinfo) whose partition is + * selected by the pruning step. + */ + Bitmapset *bound_offsets; + + bool scan_default; /* Scan the default partition? */ + bool scan_null; /* Scan the partition for NULL values? */ +} PruneStepResult; + + +static List *make_partitionedrel_pruneinfo(PlannerInfo *root, + RelOptInfo *parentrel, + int *relid_subplan_map, + List *partitioned_rels, List *prunequal, + Bitmapset **matchedsubplans); +static void gen_partprune_steps(RelOptInfo *rel, List *clauses, + PartClauseTarget target, + GeneratePruningStepsContext *context); +static List *gen_partprune_steps_internal(GeneratePruningStepsContext *context, + List *clauses); +static PartitionPruneStep *gen_prune_step_op(GeneratePruningStepsContext *context, + StrategyNumber opstrategy, bool op_is_ne, + List *exprs, List *cmpfns, Bitmapset *nullkeys); +static PartitionPruneStep *gen_prune_step_combine(GeneratePruningStepsContext *context, + List *source_stepids, + PartitionPruneCombineOp combineOp); +static PartitionPruneStep *gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, + List **keyclauses, Bitmapset *nullkeys); +static PartClauseMatchStatus match_clause_to_partition_key(GeneratePruningStepsContext *context, + Expr *clause, Expr *partkey, int partkeyidx, + bool *clause_is_not_null, + PartClauseInfo **pc, List **clause_steps); +static List *get_steps_using_prefix(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix); +static List *get_steps_using_prefix_recurse(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix, + ListCell *start, + List *step_exprs, + List *step_cmpfns); +static PruneStepResult *get_matching_hash_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *get_matching_list_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum value, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *get_matching_range_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static Bitmapset *pull_exec_paramids(Expr *expr); +static bool pull_exec_paramids_walker(Node *node, Bitmapset **context); +static Bitmapset *get_partkey_exec_paramids(List *steps); +static PruneStepResult *perform_pruning_base_step(PartitionPruneContext *context, + PartitionPruneStepOp *opstep); +static PruneStepResult *perform_pruning_combine_step(PartitionPruneContext *context, + PartitionPruneStepCombine *cstep, + PruneStepResult **step_results); +static PartClauseMatchStatus match_boolean_partition_clause(Oid partopfamily, + Expr *clause, + Expr *partkey, + Expr **outconst); +static void partkey_datum_from_expr(PartitionPruneContext *context, + Expr *expr, int stateidx, + Datum *value, bool *isnull); + + +/* + * make_partition_pruneinfo + * Builds a PartitionPruneInfo which can be used in the executor to allow + * additional partition pruning to take place. Returns NULL when + * partition pruning would be useless. + * + * 'parentrel' is the RelOptInfo for an appendrel, and 'subpaths' is the list + * of scan paths for its child rels. + * + * 'partitioned_rels' is a List containing Lists of relids of partitioned + * tables (a/k/a non-leaf partitions) that are parents of some of the child + * rels. Here we attempt to populate the PartitionPruneInfo by adding a + * 'prune_infos' item for each sublist in the 'partitioned_rels' list. + * However, some of the sets of partitioned relations may not require any + * run-time pruning. In these cases we'll simply not include a 'prune_infos' + * item for that set and instead we'll add all the subplans which belong to + * that set into the PartitionPruneInfo's 'other_subplans' field. Callers + * will likely never want to prune subplans which are mentioned in this field. + * + * 'prunequal' is a list of potential pruning quals. + */ +PartitionPruneInfo * +make_partition_pruneinfo(PlannerInfo *root, RelOptInfo *parentrel, + List *subpaths, List *partitioned_rels, + List *prunequal) +{ + PartitionPruneInfo *pruneinfo; + Bitmapset *allmatchedsubplans = NULL; + int *relid_subplan_map; + ListCell *lc; + List *prunerelinfos; + int i; + + /* + * Construct a temporary array to map from planner relids to subplan + * indexes. For convenience, we use 1-based indexes here, so that zero + * can represent an un-filled array entry. + */ + relid_subplan_map = palloc0(sizeof(int) * root->simple_rel_array_size); + + /* + * relid_subplan_map maps relid of a leaf partition to the index in + * 'subpaths' of the scan plan for that partition. + */ + i = 1; + foreach(lc, subpaths) + { + Path *path = (Path *) lfirst(lc); + RelOptInfo *pathrel = path->parent; + + Assert(IS_SIMPLE_REL(pathrel)); + Assert(pathrel->relid < root->simple_rel_array_size); + /* No duplicates please */ + Assert(relid_subplan_map[pathrel->relid] == 0); + + relid_subplan_map[pathrel->relid] = i++; + } + + /* We now build a PartitionedRelPruneInfo for each partitioned rel. */ + prunerelinfos = NIL; + foreach(lc, partitioned_rels) + { + List *rels = (List *) lfirst(lc); + List *pinfolist; + Bitmapset *matchedsubplans = NULL; + + pinfolist = make_partitionedrel_pruneinfo(root, parentrel, + relid_subplan_map, + rels, prunequal, + &matchedsubplans); + + /* When pruning is possible, record the matched subplans */ + if (pinfolist != NIL) + { + prunerelinfos = lappend(prunerelinfos, pinfolist); + allmatchedsubplans = bms_join(matchedsubplans, + allmatchedsubplans); + } + } + + pfree(relid_subplan_map); + + /* + * If none of the partition hierarchies had any useful run-time pruning + * quals, then we can just not bother with run-time pruning. + */ + if (prunerelinfos == NIL) + return NULL; + + /* Else build the result data structure */ + pruneinfo = makeNode(PartitionPruneInfo); + pruneinfo->prune_infos = prunerelinfos; + + /* + * Some subplans may not belong to any of the listed partitioned rels. + * This can happen for UNION ALL queries which include a non-partitioned + * table, or when some of the hierarchies aren't run-time prunable. Build + * a bitmapset of the indexes of all such subplans, so that the executor + * can identify which subplans should never be pruned. + */ + if (bms_num_members(allmatchedsubplans) < list_length(subpaths)) + { + Bitmapset *other_subplans; + + /* Create the complement of allmatchedsubplans */ + other_subplans = bms_add_range(NULL, 0, list_length(subpaths) - 1); + other_subplans = bms_del_members(other_subplans, allmatchedsubplans); + + pruneinfo->other_subplans = other_subplans; + } + else + pruneinfo->other_subplans = NULL; + + return pruneinfo; +} + +/* + * make_partitionedrel_pruneinfo + * Build a List of PartitionedRelPruneInfos, one for each partitioned + * rel. These can be used in the executor to allow additional partition + * pruning to take place. + * + * Here we generate partition pruning steps for 'prunequal' and also build a + * data structure which allows mapping of partition indexes into 'subpaths' + * indexes. + * + * If no non-Const expressions are being compared to the partition key in any + * of the 'partitioned_rels', then we return NIL to indicate no run-time + * pruning should be performed. Run-time pruning would be useless since the + * pruning done during planning will have pruned everything that can be. + * + * On non-NIL return, 'matchedsubplans' is set to the subplan indexes which + * were matched to this partition hierarchy. + */ +static List * +make_partitionedrel_pruneinfo(PlannerInfo *root, RelOptInfo *parentrel, + int *relid_subplan_map, + List *partitioned_rels, List *prunequal, + Bitmapset **matchedsubplans) +{ + RelOptInfo *targetpart = NULL; + List *pinfolist = NIL; + bool doruntimeprune = false; + int *relid_subpart_map; + Bitmapset *subplansfound = NULL; + ListCell *lc; + int i; + + /* + * Examine each partitioned rel, constructing a temporary array to map + * from planner relids to index of the partitioned rel, and building a + * PartitionedRelPruneInfo for each partitioned rel. + * + * In this phase we discover whether runtime pruning is needed at all; if + * not, we can avoid doing further work. + */ + relid_subpart_map = palloc0(sizeof(int) * root->simple_rel_array_size); + + i = 1; + foreach(lc, partitioned_rels) + { + Index rti = lfirst_int(lc); + RelOptInfo *subpart = find_base_rel(root, rti); + PartitionedRelPruneInfo *pinfo; + List *partprunequal; + List *initial_pruning_steps; + List *exec_pruning_steps; + Bitmapset *execparamids; + GeneratePruningStepsContext context; + + /* + * Fill the mapping array. + * + * relid_subpart_map maps relid of a non-leaf partition to the index + * in 'partitioned_rels' of that rel (which will also be the index in + * the returned PartitionedRelPruneInfo list of the info for that + * partition). We use 1-based indexes here, so that zero can + * represent an un-filled array entry. + */ + Assert(rti < root->simple_rel_array_size); + /* No duplicates please */ + Assert(relid_subpart_map[rti] == 0); + relid_subpart_map[rti] = i++; + + /* + * Translate pruning qual, if necessary, for this partition. + * + * The first item in the list is the target partitioned relation. + */ + if (!targetpart) + { + targetpart = subpart; + + /* + * The prunequal is presented to us as a qual for 'parentrel'. + * Frequently this rel is the same as targetpart, so we can skip + * an adjust_appendrel_attrs step. But it might not be, and then + * we have to translate. We update the prunequal parameter here, + * because in later iterations of the loop for child partitions, + * we want to translate from parent to child variables. + */ + if (!bms_equal(parentrel->relids, subpart->relids)) + { + int nappinfos; + AppendRelInfo **appinfos = find_appinfos_by_relids(root, + subpart->relids, + &nappinfos); + + prunequal = (List *) adjust_appendrel_attrs(root, (Node *) + prunequal, + nappinfos, + appinfos); + + pfree(appinfos); + } + + partprunequal = prunequal; + } + else + { + /* + * For sub-partitioned tables the columns may not be in the same + * order as the parent, so we must translate the prunequal to make + * it compatible with this relation. + */ + partprunequal = (List *) + adjust_appendrel_attrs_multilevel(root, + (Node *) prunequal, + subpart->relids, + targetpart->relids); + } + + /* + * Convert pruning qual to pruning steps. We may need to do this + * twice, once to obtain executor startup pruning steps, and once for + * executor per-scan pruning steps. This first pass creates startup + * pruning steps and detects whether there's any possibly-useful quals + * that would require per-scan pruning. + */ + gen_partprune_steps(subpart, partprunequal, PARTTARGET_INITIAL, + &context); + + if (context.contradictory) + { + /* + * This shouldn't happen as the planner should have detected this + * earlier. However, we do use additional quals from parameterized + * paths here. These do only compare Params to the partition key, + * so this shouldn't cause the discovery of any new qual + * contradictions that were not previously discovered as the Param + * values are unknown during planning. Anyway, we'd better do + * something sane here, so let's just disable run-time pruning. + */ + return NIL; + } + + /* + * If no mutable operators or expressions appear in usable pruning + * clauses, then there's no point in running startup pruning, because + * plan-time pruning should have pruned everything prunable. + */ + if (context.has_mutable_op || context.has_mutable_arg) + initial_pruning_steps = context.steps; + else + initial_pruning_steps = NIL; + + /* + * If no exec Params appear in potentially-usable pruning clauses, + * then there's no point in even thinking about per-scan pruning. + */ + if (context.has_exec_param) + { + /* ... OK, we'd better think about it */ + gen_partprune_steps(subpart, partprunequal, PARTTARGET_EXEC, + &context); + + if (context.contradictory) + { + /* As above, skip run-time pruning if anything fishy happens */ + return NIL; + } + + exec_pruning_steps = context.steps; + + /* + * Detect which exec Params actually got used; the fact that some + * were in available clauses doesn't mean we actually used them. + * Skip per-scan pruning if there are none. + */ + execparamids = get_partkey_exec_paramids(exec_pruning_steps); + + if (bms_is_empty(execparamids)) + exec_pruning_steps = NIL; + } + else + { + /* No exec Params anywhere, so forget about scan-time pruning */ + exec_pruning_steps = NIL; + execparamids = NULL; + } + + if (initial_pruning_steps || exec_pruning_steps) + doruntimeprune = true; + + /* Begin constructing the PartitionedRelPruneInfo for this rel */ + pinfo = makeNode(PartitionedRelPruneInfo); + pinfo->rtindex = rti; + pinfo->initial_pruning_steps = initial_pruning_steps; + pinfo->exec_pruning_steps = exec_pruning_steps; + pinfo->execparamids = execparamids; + /* Remaining fields will be filled in the next loop */ + + pinfolist = lappend(pinfolist, pinfo); + } + + if (!doruntimeprune) + { + /* No run-time pruning required. */ + pfree(relid_subpart_map); + return NIL; + } + + /* + * Run-time pruning will be required, so initialize other information. + * That includes two maps -- one needed to convert partition indexes of + * leaf partitions to the indexes of their subplans in the subplan list, + * another needed to convert partition indexes of sub-partitioned + * partitions to the indexes of their PartitionedRelPruneInfo in the + * PartitionedRelPruneInfo list. + */ + foreach(lc, pinfolist) + { + PartitionedRelPruneInfo *pinfo = lfirst(lc); + RelOptInfo *subpart = find_base_rel(root, pinfo->rtindex); + Bitmapset *present_parts; + int nparts = subpart->nparts; + int *subplan_map; + int *subpart_map; + Oid *relid_map; + + /* + * Construct the subplan and subpart maps for this partitioning level. + * Here we convert to zero-based indexes, with -1 for empty entries. + * Also construct a Bitmapset of all partitions that are present (that + * is, not pruned already). + */ + subplan_map = (int *) palloc(nparts * sizeof(int)); + memset(subplan_map, -1, nparts * sizeof(int)); + subpart_map = (int *) palloc(nparts * sizeof(int)); + memset(subpart_map, -1, nparts * sizeof(int)); + relid_map = (Oid *) palloc0(nparts * sizeof(Oid)); + present_parts = NULL; + + for (i = 0; i < nparts; i++) + { + RelOptInfo *partrel = subpart->part_rels[i]; + int subplanidx; + int subpartidx; + + /* Skip processing pruned partitions. */ + if (partrel == NULL) + continue; + + subplan_map[i] = subplanidx = relid_subplan_map[partrel->relid] - 1; + subpart_map[i] = subpartidx = relid_subpart_map[partrel->relid] - 1; + relid_map[i] = planner_rt_fetch(partrel->relid, root)->relid; + if (subplanidx >= 0) + { + present_parts = bms_add_member(present_parts, i); + + /* Record finding this subplan */ + subplansfound = bms_add_member(subplansfound, subplanidx); + } + else if (subpartidx >= 0) + present_parts = bms_add_member(present_parts, i); + } + + /* Record the maps and other information. */ + pinfo->present_parts = present_parts; + pinfo->nparts = nparts; + pinfo->subplan_map = subplan_map; + pinfo->subpart_map = subpart_map; + pinfo->relid_map = relid_map; + } + + pfree(relid_subpart_map); + + *matchedsubplans = subplansfound; + + return pinfolist; +} + +/* + * gen_partprune_steps + * Process 'clauses' (typically a rel's baserestrictinfo list of clauses) + * and create a list of "partition pruning steps". + * + * 'target' tells whether to generate pruning steps for planning (use + * immutable clauses only), or for executor startup (use any allowable + * clause except ones containing PARAM_EXEC Params), or for executor + * per-scan pruning (use any allowable clause). + * + * 'context' is an output argument that receives the steps list as well as + * some subsidiary flags; see the GeneratePruningStepsContext typedef. + */ +static void +gen_partprune_steps(RelOptInfo *rel, List *clauses, PartClauseTarget target, + GeneratePruningStepsContext *context) +{ + /* Initialize all output values to zero/false/NULL */ + memset(context, 0, sizeof(GeneratePruningStepsContext)); + context->rel = rel; + context->target = target; + + /* + * If this partitioned table is in turn a partition, and it shares any + * partition keys with its parent, then it's possible that the hierarchy + * allows the parent a narrower range of values than some of its + * partitions (particularly the default one). This is normally not + * useful, but it can be to prune the default partition. + */ + if (partition_bound_has_default(rel->boundinfo) && rel->partition_qual) + { + /* Make a copy to avoid modifying the passed-in List */ + clauses = list_concat_copy(clauses, rel->partition_qual); + } + + /* Down into the rabbit-hole. */ + (void) gen_partprune_steps_internal(context, clauses); +} + +/* + * prune_append_rel_partitions + * Process rel's baserestrictinfo and make use of quals which can be + * evaluated during query planning in order to determine the minimum set + * of partitions which must be scanned to satisfy these quals. Returns + * the matching partitions in the form of a Bitmapset containing the + * partitions' indexes in the rel's part_rels array. + * + * Callers must ensure that 'rel' is a partitioned table. + */ +Bitmapset * +prune_append_rel_partitions(RelOptInfo *rel) +{ + List *clauses = rel->baserestrictinfo; + List *pruning_steps; + GeneratePruningStepsContext gcontext; + PartitionPruneContext context; + + Assert(rel->part_scheme != NULL); + + /* If there are no partitions, return the empty set */ + if (rel->nparts == 0) + return NULL; + + /* + * If pruning is disabled or if there are no clauses to prune with, return + * all partitions. + */ + if (!enable_partition_pruning || clauses == NIL) + return bms_add_range(NULL, 0, rel->nparts - 1); + + /* + * Process clauses to extract pruning steps that are usable at plan time. + * If the clauses are found to be contradictory, we can return the empty + * set. + */ + gen_partprune_steps(rel, clauses, PARTTARGET_PLANNER, + &gcontext); + if (gcontext.contradictory) + return NULL; + pruning_steps = gcontext.steps; + + /* If there's nothing usable, return all partitions */ + if (pruning_steps == NIL) + return bms_add_range(NULL, 0, rel->nparts - 1); + + /* Set up PartitionPruneContext */ + context.strategy = rel->part_scheme->strategy; + context.partnatts = rel->part_scheme->partnatts; + context.nparts = rel->nparts; + context.boundinfo = rel->boundinfo; + context.partcollation = rel->part_scheme->partcollation; + context.partsupfunc = rel->part_scheme->partsupfunc; + context.stepcmpfuncs = (FmgrInfo *) palloc0(sizeof(FmgrInfo) * + context.partnatts * + list_length(pruning_steps)); + context.ppccontext = CurrentMemoryContext; + + /* These are not valid when being called from the planner */ + context.planstate = NULL; + context.exprstates = NULL; + + /* Actual pruning happens here. */ + return get_matching_partitions(&context, pruning_steps); +} + +/* + * get_matching_partitions + * Determine partitions that survive partition pruning + * + * Note: context->planstate must be set to a valid PlanState when the + * pruning_steps were generated with a target other than PARTTARGET_PLANNER. + * + * Returns a Bitmapset of the RelOptInfo->part_rels indexes of the surviving + * partitions. + */ +Bitmapset * +get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) +{ + Bitmapset *result; + int num_steps = list_length(pruning_steps), + i; + PruneStepResult **results, + *final_result; + ListCell *lc; + bool scan_default; + + /* If there are no pruning steps then all partitions match. */ + if (num_steps == 0) + { + Assert(context->nparts > 0); + return bms_add_range(NULL, 0, context->nparts - 1); + } + + /* + * Allocate space for individual pruning steps to store its result. Each + * slot will hold a PruneStepResult after performing a given pruning step. + * Later steps may use the result of one or more earlier steps. The + * result of applying all pruning steps is the value contained in the slot + * of the last pruning step. + */ + results = (PruneStepResult **) + palloc0(num_steps * sizeof(PruneStepResult *)); + foreach(lc, pruning_steps) + { + PartitionPruneStep *step = lfirst(lc); + + switch (nodeTag(step)) + { + case T_PartitionPruneStepOp: + results[step->step_id] = + perform_pruning_base_step(context, + (PartitionPruneStepOp *) step); + break; + + case T_PartitionPruneStepCombine: + results[step->step_id] = + perform_pruning_combine_step(context, + (PartitionPruneStepCombine *) step, + results); + break; + + default: + elog(ERROR, "invalid pruning step type: %d", + (int) nodeTag(step)); + } + } + + /* + * At this point we know the offsets of all the datums whose corresponding + * partitions need to be in the result, including special null-accepting + * and default partitions. Collect the actual partition indexes now. + */ + final_result = results[num_steps - 1]; + Assert(final_result != NULL); + i = -1; + result = NULL; + scan_default = final_result->scan_default; + while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0) + { + int partindex; + + Assert(i < context->boundinfo->nindexes); + partindex = context->boundinfo->indexes[i]; + + if (partindex < 0) + { + /* + * In range partitioning cases, if a partition index is -1 it + * means that the bound at the offset is the upper bound for a + * range not covered by any partition (other than a possible + * default partition). In hash partitioning, the same means no + * partition has been defined for the corresponding remainder + * value. + * + * In either case, the value is still part of the queried range of + * values, so mark to scan the default partition if one exists. + */ + scan_default |= partition_bound_has_default(context->boundinfo); + continue; + } + + result = bms_add_member(result, partindex); + } + + /* Add the null and/or default partition if needed and present. */ + if (final_result->scan_null) + { + Assert(context->strategy == PARTITION_STRATEGY_LIST); + Assert(partition_bound_accepts_nulls(context->boundinfo)); + result = bms_add_member(result, context->boundinfo->null_index); + } + if (scan_default) + { + Assert(context->strategy == PARTITION_STRATEGY_LIST || + context->strategy == PARTITION_STRATEGY_RANGE); + Assert(partition_bound_has_default(context->boundinfo)); + result = bms_add_member(result, context->boundinfo->default_index); + } + + return result; +} + +/* + * gen_partprune_steps_internal + * Processes 'clauses' to generate partition pruning steps. + * + * From OpExpr clauses that are mutually AND'd, we find combinations of those + * that match to the partition key columns and for every such combination, + * we emit a PartitionPruneStepOp containing a vector of expressions whose + * values are used as a look up key to search partitions by comparing the + * values with partition bounds. Relevant details of the operator and a + * vector of (possibly cross-type) comparison functions is also included with + * each step. + * + * For BoolExpr clauses, we recursively generate steps for each argument, and + * return a PartitionPruneStepCombine of their results. + * + * The return value is a list of the steps generated, which are also added to + * the context's steps list. Each step is assigned a step identifier, unique + * even across recursive calls. + * + * If we find clauses that are mutually contradictory, or contradictory with + * the partitioning constraint, or a pseudoconstant clause that contains + * false, we set context->contradictory to true and return NIL (that is, no + * pruning steps). Caller should consider all partitions as pruned in that + * case. + */ +static List * +gen_partprune_steps_internal(GeneratePruningStepsContext *context, + List *clauses) +{ + PartitionScheme part_scheme = context->rel->part_scheme; + List *keyclauses[PARTITION_MAX_KEYS]; + Bitmapset *nullkeys = NULL, + *notnullkeys = NULL; + bool generate_opsteps = false; + List *result = NIL; + ListCell *lc; + + /* + * If this partitioned relation has a default partition and is itself a + * partition (as evidenced by partition_qual being not NIL), we first + * check if the clauses contradict the partition constraint. If they do, + * there's no need to generate any steps as it'd already be proven that no + * partitions need to be scanned. + * + * This is a measure of last resort only to be used because the default + * partition cannot be pruned using the steps generated from clauses that + * contradict the parent's partition constraint; regular pruning, which is + * cheaper, is sufficient when no default partition exists. + */ + if (partition_bound_has_default(context->rel->boundinfo) && + predicate_refuted_by(context->rel->partition_qual, clauses, false)) + { + context->contradictory = true; + return NIL; + } + + memset(keyclauses, 0, sizeof(keyclauses)); + foreach(lc, clauses) + { + Expr *clause = (Expr *) lfirst(lc); + int i; + + /* Look through RestrictInfo, if any */ + if (IsA(clause, RestrictInfo)) + clause = ((RestrictInfo *) clause)->clause; + + /* Constant-false-or-null is contradictory */ + if (IsA(clause, Const) && + (((Const *) clause)->constisnull || + !DatumGetBool(((Const *) clause)->constvalue))) + { + context->contradictory = true; + return NIL; + } + + /* Get the BoolExpr's out of the way. */ + if (IsA(clause, BoolExpr)) + { + /* + * Generate steps for arguments. + * + * While steps generated for the arguments themselves will be + * added to context->steps during recursion and will be evaluated + * independently, collect their step IDs to be stored in the + * combine step we'll be creating. + */ + if (is_orclause(clause)) + { + List *arg_stepids = NIL; + bool all_args_contradictory = true; + ListCell *lc1; + + /* + * We can share the outer context area with the recursive + * call, but contradictory had better not be true yet. + */ + Assert(!context->contradictory); + + /* + * Get pruning step for each arg. If we get contradictory for + * all args, it means the OR expression is false as a whole. + */ + foreach(lc1, ((BoolExpr *) clause)->args) + { + Expr *arg = lfirst(lc1); + bool arg_contradictory; + List *argsteps; + + argsteps = gen_partprune_steps_internal(context, + list_make1(arg)); + arg_contradictory = context->contradictory; + /* Keep context->contradictory clear till we're done */ + context->contradictory = false; + + if (arg_contradictory) + { + /* Just ignore self-contradictory arguments. */ + continue; + } + else + all_args_contradictory = false; + + if (argsteps != NIL) + { + PartitionPruneStep *step; + + Assert(list_length(argsteps) == 1); + step = (PartitionPruneStep *) linitial(argsteps); + arg_stepids = lappend_int(arg_stepids, step->step_id); + } + else + { + PartitionPruneStep *orstep; + + /* + * The arg didn't contain a clause matching this + * partition key. We cannot prune using such an arg. + * To indicate that to the pruning code, we must + * construct a dummy PartitionPruneStepCombine whose + * source_stepids is set to an empty List. + */ + orstep = gen_prune_step_combine(context, NIL, + PARTPRUNE_COMBINE_UNION); + arg_stepids = lappend_int(arg_stepids, orstep->step_id); + } + } + + /* If all the OR arms are contradictory, we can stop */ + if (all_args_contradictory) + { + context->contradictory = true; + return NIL; + } + + if (arg_stepids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, arg_stepids, + PARTPRUNE_COMBINE_UNION); + result = lappend(result, step); + } + continue; + } + else if (is_andclause(clause)) + { + List *args = ((BoolExpr *) clause)->args; + List *argsteps, + *arg_stepids = NIL; + ListCell *lc1; + + /* + * args may itself contain clauses of arbitrary type, so just + * recurse and later combine the component partitions sets + * using a combine step. + */ + argsteps = gen_partprune_steps_internal(context, args); + + /* If any AND arm is contradictory, we can stop immediately */ + if (context->contradictory) + return NIL; + + foreach(lc1, argsteps) + { + PartitionPruneStep *step = lfirst(lc1); + + arg_stepids = lappend_int(arg_stepids, step->step_id); + } + + if (arg_stepids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, arg_stepids, + PARTPRUNE_COMBINE_INTERSECT); + result = lappend(result, step); + } + continue; + } + + /* + * Fall-through for a NOT clause, which if it's a Boolean clause, + * will be handled in match_clause_to_partition_key(). We + * currently don't perform any pruning for more complex NOT + * clauses. + */ + } + + /* + * See if we can match this clause to any of the partition keys. + */ + for (i = 0; i < part_scheme->partnatts; i++) + { + Expr *partkey = linitial(context->rel->partexprs[i]); + bool clause_is_not_null = false; + PartClauseInfo *pc = NULL; + List *clause_steps = NIL; + + switch (match_clause_to_partition_key(context, + clause, partkey, i, + &clause_is_not_null, + &pc, &clause_steps)) + { + case PARTCLAUSE_MATCH_CLAUSE: + Assert(pc != NULL); + + /* + * Since we only allow strict operators, check for any + * contradicting IS NULL. + */ + if (bms_is_member(i, nullkeys)) + { + context->contradictory = true; + return NIL; + } + generate_opsteps = true; + keyclauses[i] = lappend(keyclauses[i], pc); + break; + + case PARTCLAUSE_MATCH_NULLNESS: + if (!clause_is_not_null) + { + /* + * check for conflicting IS NOT NULL as well as + * contradicting strict clauses + */ + if (bms_is_member(i, notnullkeys) || + keyclauses[i] != NIL) + { + context->contradictory = true; + return NIL; + } + nullkeys = bms_add_member(nullkeys, i); + } + else + { + /* check for conflicting IS NULL */ + if (bms_is_member(i, nullkeys)) + { + context->contradictory = true; + return NIL; + } + notnullkeys = bms_add_member(notnullkeys, i); + } + break; + + case PARTCLAUSE_MATCH_STEPS: + Assert(clause_steps != NIL); + result = list_concat(result, clause_steps); + break; + + case PARTCLAUSE_MATCH_CONTRADICT: + /* We've nothing more to do if a contradiction was found. */ + context->contradictory = true; + return NIL; + + case PARTCLAUSE_NOMATCH: + + /* + * Clause didn't match this key, but it might match the + * next one. + */ + continue; + + case PARTCLAUSE_UNSUPPORTED: + /* This clause cannot be used for pruning. */ + break; + } + + /* done; go check the next clause. */ + break; + } + } + + /*----------- + * Now generate some (more) pruning steps. We have three strategies: + * + * 1) Generate pruning steps based on IS NULL clauses: + * a) For list partitioning, null partition keys can only be found in + * the designated null-accepting partition, so if there are IS NULL + * clauses containing partition keys we should generate a pruning + * step that gets rid of all partitions but that one. We can + * disregard any OpExpr we may have found. + * b) For range partitioning, only the default partition can contain + * NULL values, so the same rationale applies. + * c) For hash partitioning, we only apply this strategy if we have + * IS NULL clauses for all the keys. Strategy 2 below will take + * care of the case where some keys have OpExprs and others have + * IS NULL clauses. + * + * 2) If not, generate steps based on OpExprs we have (if any). + * + * 3) If this doesn't work either, we may be able to generate steps to + * prune just the null-accepting partition (if one exists), if we have + * IS NOT NULL clauses for all partition keys. + */ + if (!bms_is_empty(nullkeys) && + (part_scheme->strategy == PARTITION_STRATEGY_LIST || + part_scheme->strategy == PARTITION_STRATEGY_RANGE || + (part_scheme->strategy == PARTITION_STRATEGY_HASH && + bms_num_members(nullkeys) == part_scheme->partnatts))) + { + PartitionPruneStep *step; + + /* Strategy 1 */ + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, nullkeys); + result = lappend(result, step); + } + else if (generate_opsteps) + { + PartitionPruneStep *step; + + /* Strategy 2 */ + step = gen_prune_steps_from_opexps(context, keyclauses, nullkeys); + if (step != NULL) + result = lappend(result, step); + } + else if (bms_num_members(notnullkeys) == part_scheme->partnatts) + { + PartitionPruneStep *step; + + /* Strategy 3 */ + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, NULL); + result = lappend(result, step); + } + + /* + * Finally, results from all entries appearing in result should be + * combined using an INTERSECT combine step, if more than one. + */ + if (list_length(result) > 1) + { + List *step_ids = NIL; + + foreach(lc, result) + { + PartitionPruneStep *step = lfirst(lc); + + step_ids = lappend_int(step_ids, step->step_id); + } + + if (step_ids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, step_ids, + PARTPRUNE_COMBINE_INTERSECT); + result = lappend(result, step); + } + } + + return result; +} + +/* + * gen_prune_step_op + * Generate a pruning step for a specific operator + * + * The step is assigned a unique step identifier and added to context's 'steps' + * list. + */ +static PartitionPruneStep * +gen_prune_step_op(GeneratePruningStepsContext *context, + StrategyNumber opstrategy, bool op_is_ne, + List *exprs, List *cmpfns, + Bitmapset *nullkeys) +{ + PartitionPruneStepOp *opstep = makeNode(PartitionPruneStepOp); + + opstep->step.step_id = context->next_step_id++; + + /* + * For clauses that contain an <> operator, set opstrategy to + * InvalidStrategy to signal get_matching_list_bounds to do the right + * thing. + */ + opstep->opstrategy = op_is_ne ? InvalidStrategy : opstrategy; + Assert(list_length(exprs) == list_length(cmpfns)); + opstep->exprs = exprs; + opstep->cmpfns = cmpfns; + opstep->nullkeys = nullkeys; + + context->steps = lappend(context->steps, opstep); + + return (PartitionPruneStep *) opstep; +} + +/* + * gen_prune_step_combine + * Generate a pruning step for a combination of several other steps + * + * The step is assigned a unique step identifier and added to context's + * 'steps' list. + */ +static PartitionPruneStep * +gen_prune_step_combine(GeneratePruningStepsContext *context, + List *source_stepids, + PartitionPruneCombineOp combineOp) +{ + PartitionPruneStepCombine *cstep = makeNode(PartitionPruneStepCombine); + + cstep->step.step_id = context->next_step_id++; + cstep->combineOp = combineOp; + cstep->source_stepids = source_stepids; + + context->steps = lappend(context->steps, cstep); + + return (PartitionPruneStep *) cstep; +} + +/* + * gen_prune_steps_from_opexps + * Generate pruning steps based on clauses for partition keys + * + * 'keyclauses' contains one list of clauses per partition key. We check here + * if we have found clauses for a valid subset of the partition key. In some + * cases, (depending on the type of partitioning being used) if we didn't + * find clauses for a given key, we discard clauses that may have been + * found for any subsequent keys; see specific notes below. + */ +static PartitionPruneStep * +gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, + List **keyclauses, Bitmapset *nullkeys) +{ + PartitionScheme part_scheme = context->rel->part_scheme; + List *opsteps = NIL; + List *btree_clauses[BTMaxStrategyNumber + 1], + *hash_clauses[HTMaxStrategyNumber + 1]; + int i; + ListCell *lc; + + memset(btree_clauses, 0, sizeof(btree_clauses)); + memset(hash_clauses, 0, sizeof(hash_clauses)); + for (i = 0; i < part_scheme->partnatts; i++) + { + List *clauselist = keyclauses[i]; + bool consider_next_key = true; + + /* + * For range partitioning, if we have no clauses for the current key, + * we can't consider any later keys either, so we can stop here. + */ + if (part_scheme->strategy == PARTITION_STRATEGY_RANGE && + clauselist == NIL) + break; + + /* + * For hash partitioning, if a column doesn't have the necessary + * equality clause, there should be an IS NULL clause, otherwise + * pruning is not possible. + */ + if (part_scheme->strategy == PARTITION_STRATEGY_HASH && + clauselist == NIL && !bms_is_member(i, nullkeys)) + return NULL; + + foreach(lc, clauselist) + { + PartClauseInfo *pc = (PartClauseInfo *) lfirst(lc); + Oid lefttype, + righttype; + + /* Look up the operator's btree/hash strategy number. */ + if (pc->op_strategy == InvalidStrategy) + get_op_opfamily_properties(pc->opno, + part_scheme->partopfamily[i], + false, + &pc->op_strategy, + &lefttype, + &righttype); + + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + btree_clauses[pc->op_strategy] = + lappend(btree_clauses[pc->op_strategy], pc); + + /* + * We can't consider subsequent partition keys if the + * clause for the current key contains a non-inclusive + * operator. + */ + if (pc->op_strategy == BTLessStrategyNumber || + pc->op_strategy == BTGreaterStrategyNumber) + consider_next_key = false; + break; + + case PARTITION_STRATEGY_HASH: + if (pc->op_strategy != HTEqualStrategyNumber) + elog(ERROR, "invalid clause for hash partitioning"); + hash_clauses[pc->op_strategy] = + lappend(hash_clauses[pc->op_strategy], pc); + break; + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + } + + /* + * If we've decided that clauses for subsequent partition keys + * wouldn't be useful for pruning, don't search any further. + */ + if (!consider_next_key) + break; + } + + /* + * Now, we have divided clauses according to their operator strategies. + * Check for each strategy if we can generate pruning step(s) by + * collecting a list of expressions whose values will constitute a vector + * that can be used as a lookup key by a partition bound searching + * function. + */ + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + { + List *eq_clauses = btree_clauses[BTEqualStrategyNumber]; + List *le_clauses = btree_clauses[BTLessEqualStrategyNumber]; + List *ge_clauses = btree_clauses[BTGreaterEqualStrategyNumber]; + int strat; + + /* + * For each clause under consideration for a given strategy, + * we collect expressions from clauses for earlier keys, whose + * operator strategy is inclusive, into a list called + * 'prefix'. By appending the clause's own expression to the + * 'prefix', we'll generate one step using the so generated + * vector and assign the current strategy to it. Actually, + * 'prefix' might contain multiple clauses for the same key, + * in which case, we must generate steps for various + * combinations of expressions of different keys, which + * get_steps_using_prefix takes care of for us. + */ + for (strat = 1; strat <= BTMaxStrategyNumber; strat++) + { + foreach(lc, btree_clauses[strat]) + { + PartClauseInfo *pc = lfirst(lc); + ListCell *eq_start; + ListCell *le_start; + ListCell *ge_start; + ListCell *lc1; + List *prefix = NIL; + List *pc_steps; + bool prefix_valid = true; + bool pk_has_clauses; + int keyno; + + /* + * If this is a clause for the first partition key, + * there are no preceding expressions; generate a + * pruning step without a prefix. + * + * Note that we pass NULL for step_nullkeys, because + * we don't search list/range partition bounds where + * some keys are NULL. + */ + if (pc->keyno == 0) + { + Assert(pc->op_strategy == strat); + pc_steps = get_steps_using_prefix(context, strat, + pc->op_is_ne, + pc->expr, + pc->cmpfn, + 0, + NULL, + NIL); + opsteps = list_concat(opsteps, pc_steps); + continue; + } + + eq_start = list_head(eq_clauses); + le_start = list_head(le_clauses); + ge_start = list_head(ge_clauses); + + /* + * We arrange clauses into prefix in ascending order + * of their partition key numbers. + */ + for (keyno = 0; keyno < pc->keyno; keyno++) + { + pk_has_clauses = false; + + /* + * Expressions from = clauses can always be in the + * prefix, provided they're from an earlier key. + */ + for_each_cell(lc1, eq_clauses, eq_start) + { + PartClauseInfo *eqpc = lfirst(lc1); + + if (eqpc->keyno == keyno) + { + prefix = lappend(prefix, eqpc); + pk_has_clauses = true; + } + else + { + Assert(eqpc->keyno > keyno); + break; + } + } + eq_start = lc1; + + /* + * If we're generating steps for </<= strategy, we + * can add other <= clauses to the prefix, + * provided they're from an earlier key. + */ + if (strat == BTLessStrategyNumber || + strat == BTLessEqualStrategyNumber) + { + for_each_cell(lc1, le_clauses, le_start) + { + PartClauseInfo *lepc = lfirst(lc1); + + if (lepc->keyno == keyno) + { + prefix = lappend(prefix, lepc); + pk_has_clauses = true; + } + else + { + Assert(lepc->keyno > keyno); + break; + } + } + le_start = lc1; + } + + /* + * If we're generating steps for >/>= strategy, we + * can add other >= clauses to the prefix, + * provided they're from an earlier key. + */ + if (strat == BTGreaterStrategyNumber || + strat == BTGreaterEqualStrategyNumber) + { + for_each_cell(lc1, ge_clauses, ge_start) + { + PartClauseInfo *gepc = lfirst(lc1); + + if (gepc->keyno == keyno) + { + prefix = lappend(prefix, gepc); + pk_has_clauses = true; + } + else + { + Assert(gepc->keyno > keyno); + break; + } + } + ge_start = lc1; + } + + /* + * If this key has no clauses, prefix is not valid + * anymore. + */ + if (!pk_has_clauses) + { + prefix_valid = false; + break; + } + } + + /* + * If prefix_valid, generate PartitionPruneStepOps. + * Otherwise, we would not find clauses for a valid + * subset of the partition keys anymore for the + * strategy; give up on generating partition pruning + * steps further for the strategy. + * + * As mentioned above, if 'prefix' contains multiple + * expressions for the same key, the following will + * generate multiple steps, one for each combination + * of the expressions for different keys. + * + * Note that we pass NULL for step_nullkeys, because + * we don't search list/range partition bounds where + * some keys are NULL. + */ + if (prefix_valid) + { + Assert(pc->op_strategy == strat); + pc_steps = get_steps_using_prefix(context, strat, + pc->op_is_ne, + pc->expr, + pc->cmpfn, + pc->keyno, + NULL, + prefix); + opsteps = list_concat(opsteps, pc_steps); + } + else + break; + } + } + break; + } + + case PARTITION_STRATEGY_HASH: + { + List *eq_clauses = hash_clauses[HTEqualStrategyNumber]; + + /* For hash partitioning, we have just the = strategy. */ + if (eq_clauses != NIL) + { + PartClauseInfo *pc; + List *pc_steps; + List *prefix = NIL; + int last_keyno; + ListCell *lc1; + + /* + * Locate the clause for the greatest column. This may + * not belong to the last partition key, but it is the + * clause belonging to the last partition key we found a + * clause for above. + */ + pc = llast(eq_clauses); + + /* + * There might be multiple clauses which matched to that + * partition key; find the first such clause. While at + * it, add all the clauses before that one to 'prefix'. + */ + last_keyno = pc->keyno; + foreach(lc, eq_clauses) + { + pc = lfirst(lc); + if (pc->keyno == last_keyno) + break; + prefix = lappend(prefix, pc); + } + + /* + * For each clause for the "last" column, after appending + * the clause's own expression to the 'prefix', we'll + * generate one step using the so generated vector and + * assign = as its strategy. Actually, 'prefix' might + * contain multiple clauses for the same key, in which + * case, we must generate steps for various combinations + * of expressions of different keys, which + * get_steps_using_prefix will take care of for us. + */ + for_each_cell(lc1, eq_clauses, lc) + { + pc = lfirst(lc1); + + /* + * Note that we pass nullkeys for step_nullkeys, + * because we need to tell hash partition bound search + * function which of the keys we found IS NULL clauses + * for. + */ + Assert(pc->op_strategy == HTEqualStrategyNumber); + pc_steps = + get_steps_using_prefix(context, + HTEqualStrategyNumber, + false, + pc->expr, + pc->cmpfn, + pc->keyno, + nullkeys, + prefix); + opsteps = list_concat(opsteps, pc_steps); + } + } + break; + } + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + + /* Lastly, add a combine step to mutually AND these op steps, if needed */ + if (list_length(opsteps) > 1) + { + List *opstep_ids = NIL; + + foreach(lc, opsteps) + { + PartitionPruneStep *step = lfirst(lc); + + opstep_ids = lappend_int(opstep_ids, step->step_id); + } + + if (opstep_ids != NIL) + return gen_prune_step_combine(context, opstep_ids, + PARTPRUNE_COMBINE_INTERSECT); + return NULL; + } + else if (opsteps != NIL) + return linitial(opsteps); + + return NULL; +} + +/* + * If the partition key has a collation, then the clause must have the same + * input collation. If the partition key is non-collatable, we assume the + * collation doesn't matter, because while collation wasn't considered when + * performing partitioning, the clause still may have a collation assigned + * due to the other input being of a collatable type. + * + * See also IndexCollMatchesExprColl. + */ +#define PartCollMatchesExprColl(partcoll, exprcoll) \ + ((partcoll) == InvalidOid || (partcoll) == (exprcoll)) + +/* + * match_clause_to_partition_key + * Attempt to match the given 'clause' with the specified partition key. + * + * Return value is: + * * PARTCLAUSE_NOMATCH if the clause doesn't match this partition key (but + * caller should keep trying, because it might match a subsequent key). + * Output arguments: none set. + * + * * PARTCLAUSE_MATCH_CLAUSE if there is a match. + * Output arguments: *pc is set to a PartClauseInfo constructed for the + * matched clause. + * + * * PARTCLAUSE_MATCH_NULLNESS if there is a match, and the matched clause was + * either a "a IS NULL" or "a IS NOT NULL" clause. + * Output arguments: *clause_is_not_null is set to false in the former case + * true otherwise. + * + * * PARTCLAUSE_MATCH_STEPS if there is a match. + * Output arguments: *clause_steps is set to a list of PartitionPruneStep + * generated for the clause. + * + * * PARTCLAUSE_MATCH_CONTRADICT if the clause is self-contradictory, ie + * it provably returns FALSE or NULL. + * Output arguments: none set. + * + * * PARTCLAUSE_UNSUPPORTED if the clause doesn't match this partition key + * and couldn't possibly match any other one either, due to its form or + * properties (such as containing a volatile function). + * Output arguments: none set. + */ +static PartClauseMatchStatus +match_clause_to_partition_key(GeneratePruningStepsContext *context, + Expr *clause, Expr *partkey, int partkeyidx, + bool *clause_is_not_null, PartClauseInfo **pc, + List **clause_steps) +{ + PartClauseMatchStatus boolmatchstatus; + PartitionScheme part_scheme = context->rel->part_scheme; + Oid partopfamily = part_scheme->partopfamily[partkeyidx], + partcoll = part_scheme->partcollation[partkeyidx]; + Expr *expr; + + /* + * Recognize specially shaped clauses that match a Boolean partition key. + */ + boolmatchstatus = match_boolean_partition_clause(partopfamily, clause, + partkey, &expr); + + if (boolmatchstatus == PARTCLAUSE_MATCH_CLAUSE) + { + PartClauseInfo *partclause; + + partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause->keyno = partkeyidx; + /* Do pruning with the Boolean equality operator. */ + partclause->opno = BooleanEqualOperator; + partclause->op_is_ne = false; + partclause->expr = expr; + /* We know that expr is of Boolean type. */ + partclause->cmpfn = part_scheme->partsupfunc[partkeyidx].fn_oid; + partclause->op_strategy = InvalidStrategy; + + *pc = partclause; + + return PARTCLAUSE_MATCH_CLAUSE; + } + else if (IsA(clause, OpExpr) && + list_length(((OpExpr *) clause)->args) == 2) + { + OpExpr *opclause = (OpExpr *) clause; + Expr *leftop, + *rightop; + Oid opno, + op_lefttype, + op_righttype, + negator = InvalidOid; + Oid cmpfn; + int op_strategy; + bool is_opne_listp = false; + PartClauseInfo *partclause; + + leftop = (Expr *) get_leftop(clause); + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + rightop = (Expr *) get_rightop(clause); + if (IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + opno = opclause->opno; + + /* check if the clause matches this partition key */ + if (equal(leftop, partkey)) + expr = rightop; + else if (equal(rightop, partkey)) + { + /* + * It's only useful if we can commute the operator to put the + * partkey on the left. If we can't, the clause can be deemed + * UNSUPPORTED. Even if its leftop matches some later partkey, we + * now know it has Vars on the right, so it's no use. + */ + opno = get_commutator(opno); + if (!OidIsValid(opno)) + return PARTCLAUSE_UNSUPPORTED; + expr = leftop; + } + else + /* clause does not match this partition key, but perhaps next. */ + return PARTCLAUSE_NOMATCH; + + /* + * Partition key match also requires collation match. There may be + * multiple partkeys with the same expression but different + * collations, so failure is NOMATCH. + */ + if (!PartCollMatchesExprColl(partcoll, opclause->inputcollid)) + return PARTCLAUSE_NOMATCH; + + /* + * See if the operator is relevant to the partitioning opfamily. + * + * Normally we only care about operators that are listed as being part + * of the partitioning operator family. But there is one exception: + * the not-equals operators are not listed in any operator family + * whatsoever, but their negators (equality) are. We can use one of + * those if we find it, but only for list partitioning. + * + * Note: we report NOMATCH on failure, in case a later partkey has the + * same expression but different opfamily. That's unlikely, but not + * much more so than duplicate expressions with different collations. + */ + if (op_in_opfamily(opno, partopfamily)) + { + get_op_opfamily_properties(opno, partopfamily, false, + &op_strategy, &op_lefttype, + &op_righttype); + } + else + { + if (part_scheme->strategy != PARTITION_STRATEGY_LIST) + return PARTCLAUSE_NOMATCH; + + /* See if the negator is equality */ + negator = get_negator(opno); + if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily)) + { + get_op_opfamily_properties(negator, partopfamily, false, + &op_strategy, &op_lefttype, + &op_righttype); + if (op_strategy == BTEqualStrategyNumber) + is_opne_listp = true; /* bingo */ + } + + /* Nope, it's not <> either. */ + if (!is_opne_listp) + return PARTCLAUSE_NOMATCH; + } + + /* + * Only allow strict operators. This will guarantee nulls are + * filtered. (This test is likely useless, since btree and hash + * comparison operators are generally strict.) + */ + if (!op_strict(opno)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * OK, we have a match to the partition key and a suitable operator. + * Examine the other argument to see if it's usable for pruning. + * + * In most of these cases, we can return UNSUPPORTED because the same + * failure would occur no matter which partkey it's matched to. (In + * particular, now that we've successfully matched one side of the + * opclause to a partkey, there is no chance that matching the other + * side to another partkey will produce a usable result, since that'd + * mean there are Vars on both sides.) + * + * Also, if we reject an argument for a target-dependent reason, set + * appropriate fields of *context to report that. We postpone these + * tests until after matching the partkey and the operator, so as to + * reduce the odds of setting the context fields for clauses that do + * not end up contributing to pruning steps. + * + * First, check for non-Const argument. (We assume that any immutable + * subexpression will have been folded to a Const already.) + */ + if (!IsA(expr, Const)) + { + Bitmapset *paramids; + + /* + * When pruning in the planner, we only support pruning using + * comparisons to constants. We cannot prune on the basis of + * anything that's not immutable. (Note that has_mutable_arg and + * has_exec_param do not get set for this target value.) + */ + if (context->target == PARTTARGET_PLANNER) + return PARTCLAUSE_UNSUPPORTED; + + /* + * We can never prune using an expression that contains Vars. + */ + if (contain_var_clause((Node *) expr)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * And we must reject anything containing a volatile function. + * Stable functions are OK though. + */ + if (contain_volatile_functions((Node *) expr)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * See if there are any exec Params. If so, we can only use this + * expression during per-scan pruning. + */ + paramids = pull_exec_paramids(expr); + if (!bms_is_empty(paramids)) + { + context->has_exec_param = true; + if (context->target != PARTTARGET_EXEC) + return PARTCLAUSE_UNSUPPORTED; + } + else + { + /* It's potentially usable, but mutable */ + context->has_mutable_arg = true; + } + } + + /* + * Check whether the comparison operator itself is immutable. (We + * assume anything that's in a btree or hash opclass is at least + * stable, but we need to check for immutability.) + */ + if (op_volatile(opno) != PROVOLATILE_IMMUTABLE) + { + context->has_mutable_op = true; + + /* + * When pruning in the planner, we cannot prune with mutable + * operators. + */ + if (context->target == PARTTARGET_PLANNER) + return PARTCLAUSE_UNSUPPORTED; + } + + /* + * Now find the procedure to use, based on the types. If the clause's + * other argument is of the same type as the partitioning opclass's + * declared input type, we can use the procedure cached in + * PartitionKey. If not, search for a cross-type one in the same + * opfamily; if one doesn't exist, report no match. + */ + if (op_righttype == part_scheme->partopcintype[partkeyidx]) + cmpfn = part_scheme->partsupfunc[partkeyidx].fn_oid; + else + { + switch (part_scheme->strategy) + { + /* + * For range and list partitioning, we need the ordering + * procedure with lefttype being the partition key's type, + * and righttype the clause's operator's right type. + */ + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + cmpfn = + get_opfamily_proc(part_scheme->partopfamily[partkeyidx], + part_scheme->partopcintype[partkeyidx], + op_righttype, BTORDER_PROC); + break; + + /* + * For hash partitioning, we need the hashing procedure + * for the clause's type. + */ + case PARTITION_STRATEGY_HASH: + cmpfn = + get_opfamily_proc(part_scheme->partopfamily[partkeyidx], + op_righttype, op_righttype, + HASHEXTENDED_PROC); + break; + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + cmpfn = InvalidOid; /* keep compiler quiet */ + break; + } + + if (!OidIsValid(cmpfn)) + return PARTCLAUSE_NOMATCH; + } + + /* + * Build the clause, passing the negator if applicable. + */ + partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause->keyno = partkeyidx; + if (is_opne_listp) + { + Assert(OidIsValid(negator)); + partclause->opno = negator; + partclause->op_is_ne = true; + partclause->op_strategy = InvalidStrategy; + } + else + { + partclause->opno = opno; + partclause->op_is_ne = false; + partclause->op_strategy = op_strategy; + } + partclause->expr = expr; + partclause->cmpfn = cmpfn; + + *pc = partclause; + + return PARTCLAUSE_MATCH_CLAUSE; + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + Oid saop_op = saop->opno; + Oid saop_coll = saop->inputcollid; + Expr *leftop = (Expr *) linitial(saop->args), + *rightop = (Expr *) lsecond(saop->args); + List *elem_exprs, + *elem_clauses; + ListCell *lc1; + + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + /* check if the LHS matches this partition key */ + if (!equal(leftop, partkey) || + !PartCollMatchesExprColl(partcoll, saop->inputcollid)) + return PARTCLAUSE_NOMATCH; + + /* + * See if the operator is relevant to the partitioning opfamily. + * + * In case of NOT IN (..), we get a '<>', which we handle if list + * partitioning is in use and we're able to confirm that it's negator + * is a btree equality operator belonging to the partitioning operator + * family. As above, report NOMATCH for non-matching operator. + */ + if (!op_in_opfamily(saop_op, partopfamily)) + { + Oid negator; + + if (part_scheme->strategy != PARTITION_STRATEGY_LIST) + return PARTCLAUSE_NOMATCH; + + negator = get_negator(saop_op); + if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily)) + { + int strategy; + Oid lefttype, + righttype; + + get_op_opfamily_properties(negator, partopfamily, + false, &strategy, + &lefttype, &righttype); + if (strategy != BTEqualStrategyNumber) + return PARTCLAUSE_NOMATCH; + } + else + return PARTCLAUSE_NOMATCH; /* no useful negator */ + } + + /* + * Only allow strict operators. This will guarantee nulls are + * filtered. (This test is likely useless, since btree and hash + * comparison operators are generally strict.) + */ + if (!op_strict(saop_op)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * OK, we have a match to the partition key and a suitable operator. + * Examine the array argument to see if it's usable for pruning. This + * is identical to the logic for a plain OpExpr. + */ + if (!IsA(rightop, Const)) + { + Bitmapset *paramids; + + /* + * When pruning in the planner, we only support pruning using + * comparisons to constants. We cannot prune on the basis of + * anything that's not immutable. (Note that has_mutable_arg and + * has_exec_param do not get set for this target value.) + */ + if (context->target == PARTTARGET_PLANNER) + return PARTCLAUSE_UNSUPPORTED; + + /* + * We can never prune using an expression that contains Vars. + */ + if (contain_var_clause((Node *) rightop)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * And we must reject anything containing a volatile function. + * Stable functions are OK though. + */ + if (contain_volatile_functions((Node *) rightop)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * See if there are any exec Params. If so, we can only use this + * expression during per-scan pruning. + */ + paramids = pull_exec_paramids(rightop); + if (!bms_is_empty(paramids)) + { + context->has_exec_param = true; + if (context->target != PARTTARGET_EXEC) + return PARTCLAUSE_UNSUPPORTED; + } + else + { + /* It's potentially usable, but mutable */ + context->has_mutable_arg = true; + } + } + + /* + * Check whether the comparison operator itself is immutable. (We + * assume anything that's in a btree or hash opclass is at least + * stable, but we need to check for immutability.) + */ + if (op_volatile(saop_op) != PROVOLATILE_IMMUTABLE) + { + context->has_mutable_op = true; + + /* + * When pruning in the planner, we cannot prune with mutable + * operators. + */ + if (context->target == PARTTARGET_PLANNER) + return PARTCLAUSE_UNSUPPORTED; + } + + /* + * Examine the contents of the array argument. + */ + elem_exprs = NIL; + if (IsA(rightop, Const)) + { + /* + * For a constant array, convert the elements to a list of Const + * nodes, one for each array element (excepting nulls). + */ + Const *arr = (Const *) rightop; + ArrayType *arrval; + int16 elemlen; + bool elembyval; + char elemalign; + Datum *elem_values; + bool *elem_nulls; + int num_elems, + i; + + /* If the array itself is null, the saop returns null */ + if (arr->constisnull) + return PARTCLAUSE_MATCH_CONTRADICT; + + arrval = DatumGetArrayTypeP(arr->constvalue); + get_typlenbyvalalign(ARR_ELEMTYPE(arrval), + &elemlen, &elembyval, &elemalign); + deconstruct_array(arrval, + ARR_ELEMTYPE(arrval), + elemlen, elembyval, elemalign, + &elem_values, &elem_nulls, + &num_elems); + for (i = 0; i < num_elems; i++) + { + Const *elem_expr; + + /* + * A null array element must lead to a null comparison result, + * since saop_op is known strict. We can ignore it in the + * useOr case, but otherwise it implies self-contradiction. + */ + if (elem_nulls[i]) + { + if (saop->useOr) + continue; + return PARTCLAUSE_MATCH_CONTRADICT; + } + + elem_expr = makeConst(ARR_ELEMTYPE(arrval), -1, + arr->constcollid, elemlen, + elem_values[i], false, elembyval); + elem_exprs = lappend(elem_exprs, elem_expr); + } + } + else if (IsA(rightop, ArrayExpr)) + { + ArrayExpr *arrexpr = castNode(ArrayExpr, rightop); + + /* + * For a nested ArrayExpr, we don't know how to get the actual + * scalar values out into a flat list, so we give up doing + * anything with this ScalarArrayOpExpr. + */ + if (arrexpr->multidims) + return PARTCLAUSE_UNSUPPORTED; + + /* + * Otherwise, we can just use the list of element values. + */ + elem_exprs = arrexpr->elements; + } + else + { + /* Give up on any other clause types. */ + return PARTCLAUSE_UNSUPPORTED; + } + + /* + * Now generate a list of clauses, one for each array element, of the + * form leftop saop_op elem_expr + */ + elem_clauses = NIL; + foreach(lc1, elem_exprs) + { + Expr *rightop = (Expr *) lfirst(lc1), + *elem_clause; + + elem_clause = make_opclause(saop_op, BOOLOID, false, + leftop, rightop, + InvalidOid, saop_coll); + elem_clauses = lappend(elem_clauses, elem_clause); + } + + /* + * If we have an ANY clause and multiple elements, now turn the list + * of clauses into an OR expression. + */ + if (saop->useOr && list_length(elem_clauses) > 1) + elem_clauses = list_make1(makeBoolExpr(OR_EXPR, elem_clauses, -1)); + + /* Finally, generate steps */ + *clause_steps = gen_partprune_steps_internal(context, elem_clauses); + if (context->contradictory) + return PARTCLAUSE_MATCH_CONTRADICT; + else if (*clause_steps == NIL) + return PARTCLAUSE_UNSUPPORTED; /* step generation failed */ + return PARTCLAUSE_MATCH_STEPS; + } + else if (IsA(clause, NullTest)) + { + NullTest *nulltest = (NullTest *) clause; + Expr *arg = nulltest->arg; + + if (IsA(arg, RelabelType)) + arg = ((RelabelType *) arg)->arg; + + /* Does arg match with this partition key column? */ + if (!equal(arg, partkey)) + return PARTCLAUSE_NOMATCH; + + *clause_is_not_null = (nulltest->nulltesttype == IS_NOT_NULL); + + return PARTCLAUSE_MATCH_NULLNESS; + } + + /* + * If we get here then the return value depends on the result of the + * match_boolean_partition_clause call above. If the call returned + * PARTCLAUSE_UNSUPPORTED then we're either not dealing with a bool qual + * or the bool qual is not suitable for pruning. Since the qual didn't + * match up to any of the other qual types supported here, then trying to + * match it against any other partition key is a waste of time, so just + * return PARTCLAUSE_UNSUPPORTED. If the qual just couldn't be matched to + * this partition key, then it may match another, so return + * PARTCLAUSE_NOMATCH. The only other value that + * match_boolean_partition_clause can return is PARTCLAUSE_MATCH_CLAUSE, + * and since that value was already dealt with above, then we can just + * return boolmatchstatus. + */ + return boolmatchstatus; +} + +/* + * get_steps_using_prefix + * Generate list of PartitionPruneStepOp steps each consisting of given + * opstrategy + * + * To generate steps, step_lastexpr and step_lastcmpfn are appended to + * expressions and cmpfns, respectively, extracted from the clauses in + * 'prefix'. Actually, since 'prefix' may contain multiple clauses for the + * same partition key column, we must generate steps for various combinations + * of the clauses of different keys. + * + * For list/range partitioning, callers must ensure that step_nullkeys is + * NULL, and that prefix contains at least one clause for each of the + * partition keys earlier than one specified in step_lastkeyno if it's + * greater than zero. For hash partitioning, step_nullkeys is allowed to be + * non-NULL, but they must ensure that prefix contains at least one clause + * for each of the partition keys other than those specified in step_nullkeys + * and step_lastkeyno. + * + * For both cases, callers must also ensure that clauses in prefix are sorted + * in ascending order of their partition key numbers. + */ +static List * +get_steps_using_prefix(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix) +{ + Assert(step_nullkeys == NULL || + context->rel->part_scheme->strategy == PARTITION_STRATEGY_HASH); + + /* Quick exit if there are no values to prefix with. */ + if (list_length(prefix) == 0) + { + PartitionPruneStep *step; + + step = gen_prune_step_op(context, + step_opstrategy, + step_op_is_ne, + list_make1(step_lastexpr), + list_make1_oid(step_lastcmpfn), + step_nullkeys); + return list_make1(step); + } + + /* Recurse to generate steps for various combinations. */ + return get_steps_using_prefix_recurse(context, + step_opstrategy, + step_op_is_ne, + step_lastexpr, + step_lastcmpfn, + step_lastkeyno, + step_nullkeys, + prefix, + list_head(prefix), + NIL, NIL); +} + +/* + * get_steps_using_prefix_recurse + * Recursively generate combinations of clauses for different partition + * keys and start generating steps upon reaching clauses for the greatest + * column that is less than the one for which we're currently generating + * steps (that is, step_lastkeyno) + * + * 'prefix' is the list of PartClauseInfos. + * 'start' is where we should start iterating for the current invocation. + * 'step_exprs' and 'step_cmpfns' each contains the expressions and cmpfns + * we've generated so far from the clauses for the previous part keys. + */ +static List * +get_steps_using_prefix_recurse(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix, + ListCell *start, + List *step_exprs, + List *step_cmpfns) +{ + List *result = NIL; + ListCell *lc; + int cur_keyno; + + /* Actually, recursion would be limited by PARTITION_MAX_KEYS. */ + check_stack_depth(); + + /* Check if we need to recurse. */ + Assert(start != NULL); + cur_keyno = ((PartClauseInfo *) lfirst(start))->keyno; + if (cur_keyno < step_lastkeyno - 1) + { + PartClauseInfo *pc; + ListCell *next_start; + + /* + * For each clause with cur_keyno, add its expr and cmpfn to + * step_exprs and step_cmpfns, respectively, and recurse after setting + * next_start to the ListCell of the first clause for the next + * partition key. + */ + for_each_cell(lc, prefix, start) + { + pc = lfirst(lc); + + if (pc->keyno > cur_keyno) + break; + } + next_start = lc; + + for_each_cell(lc, prefix, start) + { + List *moresteps; + List *step_exprs1, + *step_cmpfns1; + + pc = lfirst(lc); + if (pc->keyno == cur_keyno) + { + /* Leave the original step_exprs unmodified. */ + step_exprs1 = list_copy(step_exprs); + step_exprs1 = lappend(step_exprs1, pc->expr); + + /* Leave the original step_cmpfns unmodified. */ + step_cmpfns1 = list_copy(step_cmpfns); + step_cmpfns1 = lappend_oid(step_cmpfns1, pc->cmpfn); + } + else + { + Assert(pc->keyno > cur_keyno); + break; + } + + moresteps = get_steps_using_prefix_recurse(context, + step_opstrategy, + step_op_is_ne, + step_lastexpr, + step_lastcmpfn, + step_lastkeyno, + step_nullkeys, + prefix, + next_start, + step_exprs1, + step_cmpfns1); + result = list_concat(result, moresteps); + + list_free(step_exprs1); + list_free(step_cmpfns1); + } + } + else + { + /* + * End the current recursion cycle and start generating steps, one for + * each clause with cur_keyno, which is all clauses from here onward + * till the end of the list. Note that for hash partitioning, + * step_nullkeys is allowed to be non-empty, in which case step_exprs + * would only contain expressions for the earlier partition keys that + * are not specified in step_nullkeys. + */ + Assert(list_length(step_exprs) == cur_keyno || + !bms_is_empty(step_nullkeys)); + /* + * Note also that for hash partitioning, each partition key should + * have either equality clauses or an IS NULL clause, so if a + * partition key doesn't have an expression, it would be specified + * in step_nullkeys. + */ + Assert(context->rel->part_scheme->strategy + != PARTITION_STRATEGY_HASH || + list_length(step_exprs) + 2 + bms_num_members(step_nullkeys) == + context->rel->part_scheme->partnatts); + for_each_cell(lc, prefix, start) + { + PartClauseInfo *pc = lfirst(lc); + PartitionPruneStep *step; + List *step_exprs1, + *step_cmpfns1; + + Assert(pc->keyno == cur_keyno); + + /* Leave the original step_exprs unmodified. */ + step_exprs1 = list_copy(step_exprs); + step_exprs1 = lappend(step_exprs1, pc->expr); + step_exprs1 = lappend(step_exprs1, step_lastexpr); + + /* Leave the original step_cmpfns unmodified. */ + step_cmpfns1 = list_copy(step_cmpfns); + step_cmpfns1 = lappend_oid(step_cmpfns1, pc->cmpfn); + step_cmpfns1 = lappend_oid(step_cmpfns1, step_lastcmpfn); + + step = gen_prune_step_op(context, + step_opstrategy, step_op_is_ne, + step_exprs1, step_cmpfns1, + step_nullkeys); + result = lappend(result, step); + } + } + + return result; +} + +/* + * get_matching_hash_bounds + * Determine offset of the hash bound matching the specified values, + * considering that all the non-null values come from clauses containing + * a compatible hash equality operator and any keys that are null come + * from an IS NULL clause. + * + * Generally this function will return a single matching bound offset, + * although if a partition has not been setup for a given modulus then we may + * return no matches. If the number of clauses found don't cover the entire + * partition key, then we'll need to return all offsets. + * + * 'opstrategy' if non-zero must be HTEqualStrategyNumber. + * + * 'values' contains Datums indexed by the partition key to use for pruning. + * + * 'nvalues', the number of Datums in the 'values' array. + * + * 'partsupfunc' contains partition hashing functions that can produce correct + * hash for the type of the values contained in 'values'. + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_hash_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + int *partindices = boundinfo->indexes; + int partnatts = context->partnatts; + bool isnull[PARTITION_MAX_KEYS]; + int i; + uint64 rowHash; + int greatest_modulus; + Oid *partcollation = context->partcollation; + + Assert(context->strategy == PARTITION_STRATEGY_HASH); + + /* + * For hash partitioning we can only perform pruning based on equality + * clauses to the partition key or IS NULL clauses. We also can only + * prune if we got values for all keys. + */ + if (nvalues + bms_num_members(nullkeys) == partnatts) + { + /* + * If there are any values, they must have come from clauses + * containing an equality operator compatible with hash partitioning. + */ + Assert(opstrategy == HTEqualStrategyNumber || nvalues == 0); + + for (i = 0; i < partnatts; i++) + isnull[i] = bms_is_member(i, nullkeys); + + rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation, + values, isnull); + + greatest_modulus = boundinfo->nindexes; + if (partindices[rowHash % greatest_modulus] >= 0) + result->bound_offsets = + bms_make_singleton(rowHash % greatest_modulus); + } + else + { + /* Report all valid offsets into the boundinfo->indexes array. */ + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->nindexes - 1); + } + + /* + * There is neither a special hash null partition or the default hash + * partition. + */ + result->scan_null = result->scan_default = false; + + return result; +} + +/* + * get_matching_list_bounds + * Determine the offsets of list bounds matching the specified value, + * according to the semantics of the given operator strategy + * + * scan_default will be set in the returned struct, if the default partition + * needs to be scanned, provided one exists at all. scan_null will be set if + * the special null-accepting partition needs to be scanned. + * + * 'opstrategy' if non-zero must be a btree strategy number. + * + * 'value' contains the value to use for pruning. + * + * 'nvalues', if non-zero, should be exactly 1, because of list partitioning. + * + * 'partsupfunc' contains the list partitioning comparison function to be used + * to perform partition_list_bsearch + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_list_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum value, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + int off, + minoff, + maxoff; + bool is_equal; + bool inclusive = false; + Oid *partcollation = context->partcollation; + + Assert(context->strategy == PARTITION_STRATEGY_LIST); + Assert(context->partnatts == 1); + + result->scan_null = result->scan_default = false; + + if (!bms_is_empty(nullkeys)) + { + /* + * Nulls may exist in only one partition - the partition whose + * accepted set of values includes null or the default partition if + * the former doesn't exist. + */ + if (partition_bound_accepts_nulls(boundinfo)) + result->scan_null = true; + else + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + /* + * If there are no datums to compare keys with, but there are partitions, + * just return the default partition if one exists. + */ + if (boundinfo->ndatums == 0) + { + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + minoff = 0; + maxoff = boundinfo->ndatums - 1; + + /* + * If there are no values to compare with the datums in boundinfo, it + * means the caller asked for partitions for all non-null datums. Add + * indexes of *all* partitions, including the default if any. + */ + if (nvalues == 0) + { + Assert(boundinfo->ndatums > 0); + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + /* Special case handling of values coming from a <> operator clause. */ + if (opstrategy == InvalidStrategy) + { + /* + * First match to all bounds. We'll remove any matching datums below. + */ + Assert(boundinfo->ndatums > 0); + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + + off = partition_list_bsearch(partsupfunc, partcollation, boundinfo, + value, &is_equal); + if (off >= 0 && is_equal) + { + + /* We have a match. Remove from the result. */ + Assert(boundinfo->indexes[off] >= 0); + result->bound_offsets = bms_del_member(result->bound_offsets, + off); + } + + /* Always include the default partition if any. */ + result->scan_default = partition_bound_has_default(boundinfo); + + return result; + } + + /* + * With range queries, always include the default list partition, because + * list partitions divide the key space in a discontinuous manner, not all + * values in the given range will have a partition assigned. This may not + * technically be true for some data types (e.g. integer types), however, + * we currently lack any sort of infrastructure to provide us with proofs + * that would allow us to do anything smarter here. + */ + if (opstrategy != BTEqualStrategyNumber) + result->scan_default = partition_bound_has_default(boundinfo); + + switch (opstrategy) + { + case BTEqualStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0 && is_equal) + { + Assert(boundinfo->indexes[off] >= 0); + result->bound_offsets = bms_make_singleton(off); + } + else + result->scan_default = partition_bound_has_default(boundinfo); + return result; + + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0) + { + /* We don't want the matched datum to be in the result. */ + if (!is_equal || !inclusive) + off++; + } + else + { + /* + * This case means all partition bounds are greater, which in + * turn means that all partitions satisfy this key. + */ + off = 0; + } + + /* + * off is greater than the numbers of datums we have partitions + * for. The only possible partition that could contain a match is + * the default partition, but we must've set context->scan_default + * above anyway if one exists. + */ + if (off > boundinfo->ndatums - 1) + return result; + + minoff = off; + break; + + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0 && is_equal && !inclusive) + off--; + + /* + * off is smaller than the datums of all non-default partitions. + * The only possible partition that could contain a match is the + * default partition, but we must've set context->scan_default + * above anyway if one exists. + */ + if (off < 0) + return result; + + maxoff = off; + break; + + default: + elog(ERROR, "invalid strategy number %d", opstrategy); + break; + } + + Assert(minoff >= 0 && maxoff >= 0); + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + return result; +} + + +/* + * get_matching_range_bounds + * Determine the offsets of range bounds matching the specified values, + * according to the semantics of the given operator strategy + * + * Each datum whose offset is in result is to be treated as the upper bound of + * the partition that will contain the desired values. + * + * scan_default is set in the returned struct if a default partition exists + * and we're absolutely certain that it needs to be scanned. We do *not* set + * it just because values match portions of the key space uncovered by + * partitions other than default (space which we normally assume to belong to + * the default partition): the final set of bounds obtained after combining + * multiple pruning steps might exclude it, so we infer its inclusion + * elsewhere. + * + * 'opstrategy' if non-zero must be a btree strategy number. + * + * 'values' contains Datums indexed by the partition key to use for pruning. + * + * 'nvalues', number of Datums in 'values' array. Must be <= context->partnatts. + * + * 'partsupfunc' contains the range partitioning comparison functions to be + * used to perform partition_range_datum_bsearch or partition_rbound_datum_cmp + * using. + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_range_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + Oid *partcollation = context->partcollation; + int partnatts = context->partnatts; + int *partindices = boundinfo->indexes; + int off, + minoff, + maxoff; + bool is_equal; + bool inclusive = false; + + Assert(context->strategy == PARTITION_STRATEGY_RANGE); + Assert(nvalues <= partnatts); + + result->scan_null = result->scan_default = false; + + /* + * If there are no datums to compare keys with, or if we got an IS NULL + * clause just return the default partition, if it exists. + */ + if (boundinfo->ndatums == 0 || !bms_is_empty(nullkeys)) + { + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + minoff = 0; + maxoff = boundinfo->ndatums; + + /* + * If there are no values to compare with the datums in boundinfo, it + * means the caller asked for partitions for all non-null datums. Add + * indexes of *all* partitions, including the default partition if one + * exists. + */ + if (nvalues == 0) + { + /* ignore key space not covered by any partitions */ + if (partindices[minoff] < 0) + minoff++; + if (partindices[maxoff] < 0) + maxoff--; + + result->scan_default = partition_bound_has_default(boundinfo); + Assert(partindices[minoff] >= 0 && + partindices[maxoff] >= 0); + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + + return result; + } + + /* + * If the query does not constrain all key columns, we'll need to scan the + * default partition, if any. + */ + if (nvalues < partnatts) + result->scan_default = partition_bound_has_default(boundinfo); + + switch (opstrategy) + { + case BTEqualStrategyNumber: + /* Look for the smallest bound that is = lookup value. */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + + if (off >= 0 && is_equal) + { + if (nvalues == partnatts) + { + /* There can only be zero or one matching partition. */ + result->bound_offsets = bms_make_singleton(off + 1); + return result; + } + else + { + int saved_off = off; + + /* + * Since the lookup value contains only a prefix of keys, + * we must find other bounds that may also match the + * prefix. partition_range_datum_bsearch() returns the + * offset of one of them, find others by checking adjacent + * bounds. + */ + + /* + * First find greatest bound that's smaller than the + * lookup value. + */ + while (off >= 1) + { + int32 cmpval; + + cmpval = + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off - 1], + boundinfo->kind[off - 1], + values, nvalues); + if (cmpval != 0) + break; + off--; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + /* + * We can treat 'off' as the offset of the smallest bound + * to be included in the result, if we know it is the + * upper bound of the partition in which the lookup value + * could possibly exist. One case it couldn't is if the + * bound, or precisely the matched portion of its prefix, + * is not inclusive. + */ + if (boundinfo->kind[off][nvalues] == + PARTITION_RANGE_DATUM_MINVALUE) + off++; + + minoff = off; + + /* + * Now find smallest bound that's greater than the lookup + * value. + */ + off = saved_off; + while (off < boundinfo->ndatums - 1) + { + int32 cmpval; + + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off + 1], + boundinfo->kind[off + 1], + values, nvalues); + if (cmpval != 0) + break; + off++; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + /* + * off + 1, then would be the offset of the greatest bound + * to be included in the result. + */ + maxoff = off + 1; + } + + Assert(minoff >= 0 && maxoff >= 0); + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + } + else + { + /* + * The lookup value falls in the range between some bounds in + * boundinfo. 'off' would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the only + * partition that may contain the lookup value. If 'off' is + * -1 indicating that all bounds are greater, then we simply + * end up adding the first bound's offset, that is, 0. + */ + result->bound_offsets = bms_make_singleton(off + 1); + } + + return result; + + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + + /* + * Look for the smallest bound that is > or >= lookup value and + * set minoff to its offset. + */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + if (off < 0) + { + /* + * All bounds are greater than the lookup value, so include + * all of them in the result. + */ + minoff = 0; + } + else + { + if (is_equal && nvalues < partnatts) + { + /* + * Since the lookup value contains only a prefix of keys, + * we must find other bounds that may also match the + * prefix. partition_range_datum_bsearch() returns the + * offset of one of them, find others by checking adjacent + * bounds. + * + * Based on whether the lookup values are inclusive or + * not, we must either include the indexes of all such + * bounds in the result (that is, set minoff to the index + * of smallest such bound) or find the smallest one that's + * greater than the lookup values and set minoff to that. + */ + while (off >= 1 && off < boundinfo->ndatums - 1) + { + int32 cmpval; + int nextoff; + + nextoff = inclusive ? off - 1 : off + 1; + cmpval = + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[nextoff], + boundinfo->kind[nextoff], + values, nvalues); + if (cmpval != 0) + break; + + off = nextoff; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + minoff = inclusive ? off : off + 1; + } + else + { + + /* + * lookup value falls in the range between some bounds in + * boundinfo. off would be the offset of the greatest + * bound that is <= lookup value, so add off + 1 to the + * result instead as the offset of the upper bound of the + * smallest partition that may contain the lookup value. + */ + minoff = off + 1; + } + } + break; + + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + + /* + * Look for the greatest bound that is < or <= lookup value and + * set maxoff to its offset. + */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + if (off >= 0) + { + /* + * See the comment above. + */ + if (is_equal && nvalues < partnatts) + { + while (off >= 1 && off < boundinfo->ndatums - 1) + { + int32 cmpval; + int nextoff; + + nextoff = inclusive ? off + 1 : off - 1; + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[nextoff], + boundinfo->kind[nextoff], + values, nvalues); + if (cmpval != 0) + break; + + off = nextoff; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + maxoff = inclusive ? off + 1 : off; + } + + /* + * The lookup value falls in the range between some bounds in + * boundinfo. 'off' would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the greatest + * partition that may contain lookup value. If the lookup + * value had exactly matched the bound, but it isn't + * inclusive, no need add the adjacent partition. + */ + else if (!is_equal || inclusive) + maxoff = off + 1; + else + maxoff = off; + } + else + { + /* + * 'off' is -1 indicating that all bounds are greater, so just + * set the first bound's offset as maxoff. + */ + maxoff = off + 1; + } + break; + + default: + elog(ERROR, "invalid strategy number %d", opstrategy); + break; + } + + Assert(minoff >= 0 && minoff <= boundinfo->ndatums); + Assert(maxoff >= 0 && maxoff <= boundinfo->ndatums); + + /* + * If the smallest partition to return has MINVALUE (negative infinity) as + * its lower bound, increment it to point to the next finite bound + * (supposedly its upper bound), so that we don't advertently end up + * scanning the default partition. + */ + if (minoff < boundinfo->ndatums && partindices[minoff] < 0) + { + int lastkey = nvalues - 1; + + if (boundinfo->kind[minoff][lastkey] == + PARTITION_RANGE_DATUM_MINVALUE) + { + minoff++; + Assert(boundinfo->indexes[minoff] >= 0); + } + } + + /* + * If the previous greatest partition has MAXVALUE (positive infinity) as + * its upper bound (something only possible to do with multi-column range + * partitioning), we scan switch to it as the greatest partition to + * return. Again, so that we don't advertently end up scanning the + * default partition. + */ + if (maxoff >= 1 && partindices[maxoff] < 0) + { + int lastkey = nvalues - 1; + + if (boundinfo->kind[maxoff - 1][lastkey] == + PARTITION_RANGE_DATUM_MAXVALUE) + { + maxoff--; + Assert(boundinfo->indexes[maxoff] >= 0); + } + } + + Assert(minoff >= 0 && maxoff >= 0); + if (minoff <= maxoff) + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + + return result; +} + +/* + * pull_exec_paramids + * Returns a Bitmapset containing the paramids of all Params with + * paramkind = PARAM_EXEC in 'expr'. + */ +static Bitmapset * +pull_exec_paramids(Expr *expr) +{ + Bitmapset *result = NULL; + + (void) pull_exec_paramids_walker((Node *) expr, &result); + + return result; +} + +static bool +pull_exec_paramids_walker(Node *node, Bitmapset **context) +{ + if (node == NULL) + return false; + if (IsA(node, Param)) + { + Param *param = (Param *) node; + + if (param->paramkind == PARAM_EXEC) + *context = bms_add_member(*context, param->paramid); + return false; + } + return expression_tree_walker(node, pull_exec_paramids_walker, + (void *) context); +} + +/* + * get_partkey_exec_paramids + * Loop through given pruning steps and find out which exec Params + * are used. + * + * Returns a Bitmapset of Param IDs. + */ +static Bitmapset * +get_partkey_exec_paramids(List *steps) +{ + Bitmapset *execparamids = NULL; + ListCell *lc; + + foreach(lc, steps) + { + PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc); + ListCell *lc2; + + if (!IsA(step, PartitionPruneStepOp)) + continue; + + foreach(lc2, step->exprs) + { + Expr *expr = lfirst(lc2); + + /* We can be quick for plain Consts */ + if (!IsA(expr, Const)) + execparamids = bms_join(execparamids, + pull_exec_paramids(expr)); + } + } + + return execparamids; +} + +/* + * perform_pruning_base_step + * Determines the indexes of datums that satisfy conditions specified in + * 'opstep'. + * + * Result also contains whether special null-accepting and/or default + * partition need to be scanned. + */ +static PruneStepResult * +perform_pruning_base_step(PartitionPruneContext *context, + PartitionPruneStepOp *opstep) +{ + ListCell *lc1, + *lc2; + int keyno, + nvalues; + Datum values[PARTITION_MAX_KEYS]; + FmgrInfo *partsupfunc; + int stateidx; + + /* + * There better be the same number of expressions and compare functions. + */ + Assert(list_length(opstep->exprs) == list_length(opstep->cmpfns)); + + nvalues = 0; + lc1 = list_head(opstep->exprs); + lc2 = list_head(opstep->cmpfns); + + /* + * Generate the partition lookup key that will be used by one of the + * get_matching_*_bounds functions called below. + */ + for (keyno = 0; keyno < context->partnatts; keyno++) + { + /* + * For hash partitioning, it is possible that values of some keys are + * not provided in operator clauses, but instead the planner found + * that they appeared in a IS NULL clause. + */ + if (bms_is_member(keyno, opstep->nullkeys)) + continue; + + /* + * For range partitioning, we must only perform pruning with values + * for either all partition keys or a prefix thereof. + */ + if (keyno > nvalues && context->strategy == PARTITION_STRATEGY_RANGE) + break; + + if (lc1 != NULL) + { + Expr *expr; + Datum datum; + bool isnull; + Oid cmpfn; + + expr = lfirst(lc1); + stateidx = PruneCxtStateIdx(context->partnatts, + opstep->step.step_id, keyno); + partkey_datum_from_expr(context, expr, stateidx, + &datum, &isnull); + + /* + * Since we only allow strict operators in pruning steps, any + * null-valued comparison value must cause the comparison to fail, + * so that no partitions could match. + */ + if (isnull) + { + PruneStepResult *result; + + result = (PruneStepResult *) palloc(sizeof(PruneStepResult)); + result->bound_offsets = NULL; + result->scan_default = false; + result->scan_null = false; + + return result; + } + + /* Set up the stepcmpfuncs entry, unless we already did */ + cmpfn = lfirst_oid(lc2); + Assert(OidIsValid(cmpfn)); + if (cmpfn != context->stepcmpfuncs[stateidx].fn_oid) + { + /* + * If the needed support function is the same one cached in + * the relation's partition key, copy the cached FmgrInfo. + * Otherwise (i.e., when we have a cross-type comparison), an + * actual lookup is required. + */ + if (cmpfn == context->partsupfunc[keyno].fn_oid) + fmgr_info_copy(&context->stepcmpfuncs[stateidx], + &context->partsupfunc[keyno], + context->ppccontext); + else + fmgr_info_cxt(cmpfn, &context->stepcmpfuncs[stateidx], + context->ppccontext); + } + + values[keyno] = datum; + nvalues++; + + lc1 = lnext(opstep->exprs, lc1); + lc2 = lnext(opstep->cmpfns, lc2); + } + } + + /* + * Point partsupfunc to the entry for the 0th key of this step; the + * additional support functions, if any, follow consecutively. + */ + stateidx = PruneCxtStateIdx(context->partnatts, opstep->step.step_id, 0); + partsupfunc = &context->stepcmpfuncs[stateidx]; + + switch (context->strategy) + { + case PARTITION_STRATEGY_HASH: + return get_matching_hash_bounds(context, + opstep->opstrategy, + values, nvalues, + partsupfunc, + opstep->nullkeys); + + case PARTITION_STRATEGY_LIST: + return get_matching_list_bounds(context, + opstep->opstrategy, + values[0], nvalues, + &partsupfunc[0], + opstep->nullkeys); + + case PARTITION_STRATEGY_RANGE: + return get_matching_range_bounds(context, + opstep->opstrategy, + values, nvalues, + partsupfunc, + opstep->nullkeys); + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) context->strategy); + break; + } + + return NULL; +} + +/* + * perform_pruning_combine_step + * Determines the indexes of datums obtained by combining those given + * by the steps identified by cstep->source_stepids using the specified + * combination method + * + * Since cstep may refer to the result of earlier steps, we also receive + * step_results here. + */ +static PruneStepResult * +perform_pruning_combine_step(PartitionPruneContext *context, + PartitionPruneStepCombine *cstep, + PruneStepResult **step_results) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + bool firststep; + ListCell *lc1; + + /* + * A combine step without any source steps is an indication to not perform + * any partition pruning. Return all datum indexes in that case. + */ + if (cstep->source_stepids == NIL) + { + PartitionBoundInfo boundinfo = context->boundinfo; + + result->bound_offsets = + bms_add_range(NULL, 0, boundinfo->nindexes - 1); + result->scan_default = partition_bound_has_default(boundinfo); + result->scan_null = partition_bound_accepts_nulls(boundinfo); + return result; + } + + switch (cstep->combineOp) + { + case PARTPRUNE_COMBINE_UNION: + foreach(lc1, cstep->source_stepids) + { + int step_id = lfirst_int(lc1); + PruneStepResult *step_result; + + /* + * step_results[step_id] must contain a valid result, which is + * confirmed by the fact that cstep's step_id is greater than + * step_id and the fact that results of the individual steps + * are evaluated in sequence of their step_ids. + */ + if (step_id >= cstep->step.step_id) + elog(ERROR, "invalid pruning combine step argument"); + step_result = step_results[step_id]; + Assert(step_result != NULL); + + /* Record any additional datum indexes from this step */ + result->bound_offsets = bms_add_members(result->bound_offsets, + step_result->bound_offsets); + + /* Update whether to scan null and default partitions. */ + if (!result->scan_null) + result->scan_null = step_result->scan_null; + if (!result->scan_default) + result->scan_default = step_result->scan_default; + } + break; + + case PARTPRUNE_COMBINE_INTERSECT: + firststep = true; + foreach(lc1, cstep->source_stepids) + { + int step_id = lfirst_int(lc1); + PruneStepResult *step_result; + + if (step_id >= cstep->step.step_id) + elog(ERROR, "invalid pruning combine step argument"); + step_result = step_results[step_id]; + Assert(step_result != NULL); + + if (firststep) + { + /* Copy step's result the first time. */ + result->bound_offsets = + bms_copy(step_result->bound_offsets); + result->scan_null = step_result->scan_null; + result->scan_default = step_result->scan_default; + firststep = false; + } + else + { + /* Record datum indexes common to both steps */ + result->bound_offsets = + bms_int_members(result->bound_offsets, + step_result->bound_offsets); + + /* Update whether to scan null and default partitions. */ + if (result->scan_null) + result->scan_null = step_result->scan_null; + if (result->scan_default) + result->scan_default = step_result->scan_default; + } + } + break; + } + + return result; +} + +/* + * match_boolean_partition_clause + * + * If we're able to match the clause to the partition key as specially-shaped + * boolean clause, set *outconst to a Const containing a true or false value + * and return PARTCLAUSE_MATCH_CLAUSE. Returns PARTCLAUSE_UNSUPPORTED if the + * clause is not a boolean clause or if the boolean clause is unsuitable for + * partition pruning. Returns PARTCLAUSE_NOMATCH if it's a bool quals but + * just does not match this partition key. *outconst is set to NULL in the + * latter two cases. + */ +static PartClauseMatchStatus +match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, + Expr **outconst) +{ + Expr *leftop; + + *outconst = NULL; + + if (!IsBooleanOpfamily(partopfamily)) + return PARTCLAUSE_UNSUPPORTED; + + if (IsA(clause, BooleanTest)) + { + BooleanTest *btest = (BooleanTest *) clause; + + /* Only IS [NOT] TRUE/FALSE are any good to us */ + if (btest->booltesttype == IS_UNKNOWN || + btest->booltesttype == IS_NOT_UNKNOWN) + return PARTCLAUSE_UNSUPPORTED; + + leftop = btest->arg; + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + if (equal(leftop, partkey)) + *outconst = (btest->booltesttype == IS_TRUE || + btest->booltesttype == IS_NOT_FALSE) + ? (Expr *) makeBoolConst(true, false) + : (Expr *) makeBoolConst(false, false); + + if (*outconst) + return PARTCLAUSE_MATCH_CLAUSE; + } + else + { + bool is_not_clause = is_notclause(clause); + + leftop = is_not_clause ? get_notclausearg(clause) : clause; + + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + /* Compare to the partition key, and make up a clause ... */ + if (equal(leftop, partkey)) + *outconst = is_not_clause ? + (Expr *) makeBoolConst(false, false) : + (Expr *) makeBoolConst(true, false); + else if (equal(negate_clause((Node *) leftop), partkey)) + *outconst = (Expr *) makeBoolConst(false, false); + + if (*outconst) + return PARTCLAUSE_MATCH_CLAUSE; + } + + return PARTCLAUSE_NOMATCH; +} + +/* + * partkey_datum_from_expr + * Evaluate expression for potential partition pruning + * + * Evaluate 'expr'; set *value and *isnull to the resulting Datum and nullflag. + * + * If expr isn't a Const, its ExprState is in stateidx of the context + * exprstate array. + * + * Note that the evaluated result may be in the per-tuple memory context of + * context->planstate->ps_ExprContext, and we may have leaked other memory + * there too. This memory must be recovered by resetting that ExprContext + * after we're done with the pruning operation (see execPartition.c). + */ +static void +partkey_datum_from_expr(PartitionPruneContext *context, + Expr *expr, int stateidx, + Datum *value, bool *isnull) +{ + if (IsA(expr, Const)) + { + /* We can always determine the value of a constant */ + Const *con = (Const *) expr; + + *value = con->constvalue; + *isnull = con->constisnull; + } + else + { + ExprState *exprstate; + ExprContext *ectx; + + /* + * We should never see a non-Const in a step unless we're running in + * the executor. + */ + Assert(context->planstate != NULL); + + exprstate = context->exprstates[stateidx]; + ectx = context->planstate->ps_ExprContext; + *value = ExecEvalExprSwitchContext(exprstate, ectx, isnull); + } +} |