author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:17:33 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:17:33 +0000
commit    5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree      739caf8c461053357daa9f162bef34516c7bf452 /src/backend/partitioning
parent    Initial commit. (diff)
Adding upstream version 15.5. (upstream/15.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/partitioning')
-rw-r--r--  src/backend/partitioning/Makefile     |   20
-rw-r--r--  src/backend/partitioning/partbounds.c | 5001
-rw-r--r--  src/backend/partitioning/partdesc.c   |  462
-rw-r--r--  src/backend/partitioning/partprune.c  | 3726
4 files changed, 9209 insertions, 0 deletions
diff --git a/src/backend/partitioning/Makefile b/src/backend/partitioning/Makefile
new file mode 100644
index 0000000..a73fd14
--- /dev/null
+++ b/src/backend/partitioning/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for backend/partitioning
+#
+# IDENTIFICATION
+# src/backend/partitioning/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/partitioning
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ partbounds.o \
+ partdesc.o \
+ partprune.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c
new file mode 100644
index 0000000..091d6e8
--- /dev/null
+++ b/src/backend/partitioning/partbounds.c
@@ -0,0 +1,5001 @@
+/*-------------------------------------------------------------------------
+ *
+ * partbounds.c
+ * Support routines for manipulating partition bounds
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/partitioning/partbounds.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/relation.h"
+#include "access/table.h"
+#include "access/tableam.h"
+#include "catalog/partition.h"
+#include "catalog/pg_inherits.h"
+#include "catalog/pg_type.h"
+#include "commands/tablecmds.h"
+#include "common/hashfn.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/pathnodes.h"
+#include "parser/parse_coerce.h"
+#include "partitioning/partbounds.h"
+#include "partitioning/partdesc.h"
+#include "partitioning/partprune.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/partcache.h"
+#include "utils/ruleutils.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+
+/*
+ * When qsort'ing partition bounds after reading from the catalog, each bound
+ * is represented with one of the following structs.
+ */
+
+/* One bound of a hash partition */
+typedef struct PartitionHashBound
+{
+ int modulus;
+ int remainder;
+ int index;
+} PartitionHashBound;
+
+/* One value coming from some (index'th) list partition */
+typedef struct PartitionListValue
+{
+ int index;
+ Datum value;
+} PartitionListValue;
+
+/* One bound of a range partition */
+typedef struct PartitionRangeBound
+{
+ int index;
+ Datum *datums; /* range bound datums */
+ PartitionRangeDatumKind *kind; /* the kind of each datum */
+ bool lower; /* this is the lower (vs upper) bound */
+} PartitionRangeBound;
+
+/*
+ * Mapping from partitions of a joining relation to partitions of a join
+ * relation being computed (a.k.a. merged partitions)
+ */
+typedef struct PartitionMap
+{
+ int nparts; /* number of partitions */
+ int *merged_indexes; /* indexes of merged partitions */
+ bool *merged; /* flags to indicate whether partitions are
+ * merged with non-dummy partitions */
+ bool did_remapping; /* did we re-map partitions? */
+ int *old_indexes; /* old indexes of merged partitions if
+ * did_remapping */
+} PartitionMap;
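+
+/*
+ * As an illustration of how the map is used: if the partition at position 2
+ * of a joining relation has been merged into the merged partition numbered
+ * 0, then merged_indexes[2] == 0, and merged[2] is true only if it was
+ * merged with a non-dummy partition on the other side.  Positions that have
+ * not been merged at all keep merged_indexes[] == -1.
+ */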
+
+/* Macro for comparing two range bounds */
+#define compare_range_bounds(partnatts, partsupfunc, partcollations, \
+ bound1, bound2) \
+ (partition_rbound_cmp(partnatts, partsupfunc, partcollations, \
+ (bound1)->datums, (bound1)->kind, (bound1)->lower, \
+ bound2))
+
+static int32 qsort_partition_hbound_cmp(const void *a, const void *b);
+static int32 qsort_partition_list_value_cmp(const void *a, const void *b,
+ void *arg);
+static int32 qsort_partition_rbound_cmp(const void *a, const void *b,
+ void *arg);
+static PartitionBoundInfo create_hash_bounds(PartitionBoundSpec **boundspecs,
+ int nparts, PartitionKey key, int **mapping);
+static PartitionBoundInfo create_list_bounds(PartitionBoundSpec **boundspecs,
+ int nparts, PartitionKey key, int **mapping);
+static PartitionBoundInfo create_range_bounds(PartitionBoundSpec **boundspecs,
+ int nparts, PartitionKey key, int **mapping);
+static PartitionBoundInfo merge_list_bounds(FmgrInfo *partsupfunc,
+ Oid *collations,
+ RelOptInfo *outer_rel,
+ RelOptInfo *inner_rel,
+ JoinType jointype,
+ List **outer_parts,
+ List **inner_parts);
+static PartitionBoundInfo merge_range_bounds(int partnatts,
+ FmgrInfo *partsupfuncs,
+ Oid *partcollations,
+ RelOptInfo *outer_rel,
+ RelOptInfo *inner_rel,
+ JoinType jointype,
+ List **outer_parts,
+ List **inner_parts);
+static void init_partition_map(RelOptInfo *rel, PartitionMap *map);
+static void free_partition_map(PartitionMap *map);
+static bool is_dummy_partition(RelOptInfo *rel, int part_index);
+static int merge_matching_partitions(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ int outer_part,
+ int inner_part,
+ int *next_index);
+static int process_outer_partition(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_default,
+ bool inner_has_default,
+ int outer_index,
+ int inner_default,
+ JoinType jointype,
+ int *next_index,
+ int *default_index);
+static int process_inner_partition(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_default,
+ bool inner_has_default,
+ int inner_index,
+ int outer_default,
+ JoinType jointype,
+ int *next_index,
+ int *default_index);
+static void merge_null_partitions(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_null,
+ bool inner_has_null,
+ int outer_null,
+ int inner_null,
+ JoinType jointype,
+ int *next_index,
+ int *null_index);
+static void merge_default_partitions(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_default,
+ bool inner_has_default,
+ int outer_default,
+ int inner_default,
+ JoinType jointype,
+ int *next_index,
+ int *default_index);
+static int merge_partition_with_dummy(PartitionMap *map, int index,
+ int *next_index);
+static void fix_merged_indexes(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ int nmerged, List *merged_indexes);
+static void generate_matching_part_pairs(RelOptInfo *outer_rel,
+ RelOptInfo *inner_rel,
+ PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ int nmerged,
+ List **outer_parts,
+ List **inner_parts);
+static PartitionBoundInfo build_merged_partition_bounds(char strategy,
+ List *merged_datums,
+ List *merged_kinds,
+ List *merged_indexes,
+ int null_index,
+ int default_index);
+static int get_range_partition(RelOptInfo *rel,
+ PartitionBoundInfo bi,
+ int *lb_pos,
+ PartitionRangeBound *lb,
+ PartitionRangeBound *ub);
+static int get_range_partition_internal(PartitionBoundInfo bi,
+ int *lb_pos,
+ PartitionRangeBound *lb,
+ PartitionRangeBound *ub);
+static bool compare_range_partitions(int partnatts, FmgrInfo *partsupfuncs,
+ Oid *partcollations,
+ PartitionRangeBound *outer_lb,
+ PartitionRangeBound *outer_ub,
+ PartitionRangeBound *inner_lb,
+ PartitionRangeBound *inner_ub,
+ int *lb_cmpval, int *ub_cmpval);
+static void get_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs,
+ Oid *partcollations, JoinType jointype,
+ PartitionRangeBound *outer_lb,
+ PartitionRangeBound *outer_ub,
+ PartitionRangeBound *inner_lb,
+ PartitionRangeBound *inner_ub,
+ int lb_cmpval, int ub_cmpval,
+ PartitionRangeBound *merged_lb,
+ PartitionRangeBound *merged_ub);
+static void add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs,
+ Oid *partcollations,
+ PartitionRangeBound *merged_lb,
+ PartitionRangeBound *merged_ub,
+ int merged_index,
+ List **merged_datums,
+ List **merged_kinds,
+ List **merged_indexes);
+static PartitionRangeBound *make_one_partition_rbound(PartitionKey key, int index,
+ List *datums, bool lower);
+static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2,
+ int remainder2);
+static int32 partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc,
+ Oid *partcollation, Datum *datums1,
+ PartitionRangeDatumKind *kind1, bool lower1,
+ PartitionRangeBound *b2);
+static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
+ Oid *partcollation,
+ PartitionBoundInfo boundinfo,
+ PartitionRangeBound *probe, int32 *cmpval);
+static Expr *make_partition_op_expr(PartitionKey key, int keynum,
+ uint16 strategy, Expr *arg1, Expr *arg2);
+static Oid get_partition_operator(PartitionKey key, int col,
+ StrategyNumber strategy, bool *need_relabel);
+static List *get_qual_for_hash(Relation parent, PartitionBoundSpec *spec);
+static List *get_qual_for_list(Relation parent, PartitionBoundSpec *spec);
+static List *get_qual_for_range(Relation parent, PartitionBoundSpec *spec,
+ bool for_default);
+static void get_range_key_properties(PartitionKey key, int keynum,
+ PartitionRangeDatum *ldatum,
+ PartitionRangeDatum *udatum,
+ ListCell **partexprs_item,
+ Expr **keyCol,
+ Const **lower_val, Const **upper_val);
+static List *get_range_nulltest(PartitionKey key);
+
+/*
+ * get_qual_from_partbound
+ * Given a parser node for a partition bound, return the list of executable
+ * expressions that make up the partition constraint
+ */
+List *
+get_qual_from_partbound(Relation parent, PartitionBoundSpec *spec)
+{
+ PartitionKey key = RelationGetPartitionKey(parent);
+ List *my_qual = NIL;
+
+ Assert(key != NULL);
+
+ switch (key->strategy)
+ {
+ case PARTITION_STRATEGY_HASH:
+ Assert(spec->strategy == PARTITION_STRATEGY_HASH);
+ my_qual = get_qual_for_hash(parent, spec);
+ break;
+
+ case PARTITION_STRATEGY_LIST:
+ Assert(spec->strategy == PARTITION_STRATEGY_LIST);
+ my_qual = get_qual_for_list(parent, spec);
+ break;
+
+ case PARTITION_STRATEGY_RANGE:
+ Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
+ my_qual = get_qual_for_range(parent, spec, false);
+ break;
+
+ default:
+ elog(ERROR, "unexpected partition strategy: %d",
+ (int) key->strategy);
+ }
+
+ return my_qual;
+}
+
+/*
+ * partition_bounds_create
+ * Build a PartitionBoundInfo struct from a list of PartitionBoundSpec
+ * nodes
+ *
+ * This function creates a PartitionBoundInfo and fills the values of its
+ * various members based on the input list. Importantly, the 'datums' array
+ * will contain the Datum representation of the individual bounds (possibly
+ * after de-duplication, as in the case of range bounds), sorted in a
+ * canonical order defined by the qsort_partition_* functions of the
+ * respective partitioning methods. The 'indexes' array will contain as many
+ * elements as there are bounds (specific exceptions to this rule are listed
+ * in the function body); these represent the 0-based canonical positions of
+ * the partitions.
+ *
+ * Upon return from this function, *mapping is set to an array of
+ * list_length(boundspecs) elements, each of which maps the original index of
+ * a partition to its canonical index.
+ *
+ * Note: The objects returned by this function are wholly allocated in the
+ * current memory context.
+ */
+PartitionBoundInfo
+partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts,
+ PartitionKey key, int **mapping)
+{
+ int i;
+
+ Assert(nparts > 0);
+
+ /*
+ * For each partitioning method, we first convert the partition bounds
+ * from their parser node representation to the internal representation,
+ * along with any additional preprocessing (such as de-duplicating range
+ * bounds). Resulting bound datums are then added to the 'datums' array
+ * in PartitionBoundInfo. For each datum added, an integer indicating the
+ * canonical partition index is added to the 'indexes' array.
+ *
+ * For each bound, we remember its partition's position (0-based) in the
+ * original list to later map it to the canonical index.
+ */
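+
+ /*
+ * As an illustration: suppose a list-partitioned table whose bound specs
+ * arrive as p0 FOR VALUES IN (5) and p1 FOR VALUES IN (1, 2). The
+ * canonical datum order is 1, 2, 5, so p1 is assigned canonical index 0
+ * and p0 canonical index 1, and *mapping ends up as {1, 0}.
+ */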
+
+ /*
+ * Initialize the mapping array with invalid values; it is filled within
+ * each sub-routine below, depending on the bound type.
+ */
+ *mapping = (int *) palloc(sizeof(int) * nparts);
+ for (i = 0; i < nparts; i++)
+ (*mapping)[i] = -1;
+
+ switch (key->strategy)
+ {
+ case PARTITION_STRATEGY_HASH:
+ return create_hash_bounds(boundspecs, nparts, key, mapping);
+
+ case PARTITION_STRATEGY_LIST:
+ return create_list_bounds(boundspecs, nparts, key, mapping);
+
+ case PARTITION_STRATEGY_RANGE:
+ return create_range_bounds(boundspecs, nparts, key, mapping);
+
+ default:
+ elog(ERROR, "unexpected partition strategy: %d",
+ (int) key->strategy);
+ break;
+ }
+
+ Assert(false);
+ return NULL; /* keep compiler quiet */
+}
+
+/*
+ * create_hash_bounds
+ * Create a PartitionBoundInfo for a hash partitioned table
+ */
+static PartitionBoundInfo
+create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
+ PartitionKey key, int **mapping)
+{
+ PartitionBoundInfo boundinfo;
+ PartitionHashBound *hbounds;
+ int i;
+ int greatest_modulus;
+ Datum *boundDatums;
+
+ boundinfo = (PartitionBoundInfoData *)
+ palloc0(sizeof(PartitionBoundInfoData));
+ boundinfo->strategy = key->strategy;
+ /* No special hash partitions. */
+ boundinfo->null_index = -1;
+ boundinfo->default_index = -1;
+
+ hbounds = (PartitionHashBound *)
+ palloc(nparts * sizeof(PartitionHashBound));
+
+ /* Convert from node to the internal representation */
+ for (i = 0; i < nparts; i++)
+ {
+ PartitionBoundSpec *spec = boundspecs[i];
+
+ if (spec->strategy != PARTITION_STRATEGY_HASH)
+ elog(ERROR, "invalid strategy in partition bound spec");
+
+ hbounds[i].modulus = spec->modulus;
+ hbounds[i].remainder = spec->remainder;
+ hbounds[i].index = i;
+ }
+
+ /* Sort all the bounds in ascending order */
+ qsort(hbounds, nparts, sizeof(PartitionHashBound),
+ qsort_partition_hbound_cmp);
+
+ /* After sorting, moduli are now stored in ascending order. */
+ greatest_modulus = hbounds[nparts - 1].modulus;
+
+ boundinfo->ndatums = nparts;
+ boundinfo->datums = (Datum **) palloc0(nparts * sizeof(Datum *));
+ boundinfo->kind = NULL;
+ boundinfo->interleaved_parts = NULL;
+ boundinfo->nindexes = greatest_modulus;
+ boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
+ for (i = 0; i < greatest_modulus; i++)
+ boundinfo->indexes[i] = -1;
+
+ /*
+ * To avoid allocating a series of small datum arrays in the loop below,
+ * we allocate a single array here and assign a portion of it to each
+ * partition.
+ */
+ boundDatums = (Datum *) palloc(nparts * 2 * sizeof(Datum));
+
+ /*
+ * For hash partitioning, there are as many datums (modulus and remainder
+ * pairs) as there are partitions. Indexes are simply values ranging from
+ * 0 to (nparts - 1).
+ */
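+
+ /*
+ * Illustration: suppose three partitions with (modulus, remainder)
+ * pairs (2, 0), (4, 1) and (4, 3). greatest_modulus is 4, and the loop
+ * fills indexes[] as {0, 1, 0, 2}: the (2, 0) partition (canonical
+ * index 0) claims slots 0 and 2, while the other two claim slots 1
+ * and 3.
+ */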
+ for (i = 0; i < nparts; i++)
+ {
+ int modulus = hbounds[i].modulus;
+ int remainder = hbounds[i].remainder;
+
+ boundinfo->datums[i] = &boundDatums[i * 2];
+ boundinfo->datums[i][0] = Int32GetDatum(modulus);
+ boundinfo->datums[i][1] = Int32GetDatum(remainder);
+
+ while (remainder < greatest_modulus)
+ {
+ /* overlap? */
+ Assert(boundinfo->indexes[remainder] == -1);
+ boundinfo->indexes[remainder] = i;
+ remainder += modulus;
+ }
+
+ (*mapping)[hbounds[i].index] = i;
+ }
+ pfree(hbounds);
+
+ return boundinfo;
+}
+
+/*
+ * get_non_null_list_datum_count
+ * Counts the number of non-null Datums in each partition.
+ */
+static int
+get_non_null_list_datum_count(PartitionBoundSpec **boundspecs, int nparts)
+{
+ int i;
+ int count = 0;
+
+ for (i = 0; i < nparts; i++)
+ {
+ ListCell *lc;
+
+ foreach(lc, boundspecs[i]->listdatums)
+ {
+ Const *val = lfirst_node(Const, lc);
+
+ if (!val->constisnull)
+ count++;
+ }
+ }
+
+ return count;
+}
+
+/*
+ * create_list_bounds
+ * Create a PartitionBoundInfo for a list partitioned table
+ */
+static PartitionBoundInfo
+create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
+ PartitionKey key, int **mapping)
+{
+ PartitionBoundInfo boundinfo;
+ PartitionListValue *all_values;
+ int i;
+ int j;
+ int ndatums;
+ int next_index = 0;
+ int default_index = -1;
+ int null_index = -1;
+ Datum *boundDatums;
+
+ boundinfo = (PartitionBoundInfoData *)
+ palloc0(sizeof(PartitionBoundInfoData));
+ boundinfo->strategy = key->strategy;
+ /* Will be set correctly below. */
+ boundinfo->null_index = -1;
+ boundinfo->default_index = -1;
+
+ ndatums = get_non_null_list_datum_count(boundspecs, nparts);
+ all_values = (PartitionListValue *)
+ palloc(ndatums * sizeof(PartitionListValue));
+
+ /* Create a unified list of non-null values across all partitions. */
+ for (j = 0, i = 0; i < nparts; i++)
+ {
+ PartitionBoundSpec *spec = boundspecs[i];
+ ListCell *c;
+
+ if (spec->strategy != PARTITION_STRATEGY_LIST)
+ elog(ERROR, "invalid strategy in partition bound spec");
+
+ /*
+ * Note the index of the partition bound spec for the default
+ * partition. There's no datum to add to the list of non-null datums
+ * for this partition.
+ */
+ if (spec->is_default)
+ {
+ default_index = i;
+ continue;
+ }
+
+ foreach(c, spec->listdatums)
+ {
+ Const *val = lfirst_node(Const, c);
+
+ if (!val->constisnull)
+ {
+ all_values[j].index = i;
+ all_values[j].value = val->constvalue;
+ j++;
+ }
+ else
+ {
+ /*
+ * Never put a null into the values array; save the index of
+ * the partition that stores nulls, instead.
+ */
+ if (null_index != -1)
+ elog(ERROR, "found null more than once");
+ null_index = i;
+ }
+ }
+ }
+
+ /* ensure we found a Datum for every slot in the all_values array */
+ Assert(j == ndatums);
+
+ qsort_arg(all_values, ndatums, sizeof(PartitionListValue),
+ qsort_partition_list_value_cmp, (void *) key);
+
+ boundinfo->ndatums = ndatums;
+ boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+ boundinfo->kind = NULL;
+ boundinfo->interleaved_parts = NULL;
+ boundinfo->nindexes = ndatums;
+ boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
+
+ /*
+ * To avoid allocating a series of small datum arrays in the loop below,
+ * we allocate a single array here and assign a portion of it to each
+ * datum.
+ */
+ boundDatums = (Datum *) palloc(ndatums * sizeof(Datum));
+
+ /*
+ * Copy values. Canonical indexes are values ranging from 0 to (nparts -
+ * 1) assigned to each partition such that all datums of a given partition
+ * receive the same value. Partitions are numbered in the order in which
+ * their smallest datums appear in the all_values[] array.
+ */
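+
+ /*
+ * Illustration: if the sorted all_values[] holds the values 1 and 2
+ * from original partition 1 and the value 5 from original partition 0,
+ * the loop assigns (*mapping)[1] = 0 and (*mapping)[0] = 1, and
+ * boundinfo->indexes[] becomes {0, 0, 1}.
+ */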
+ for (i = 0; i < ndatums; i++)
+ {
+ int orig_index = all_values[i].index;
+
+ boundinfo->datums[i] = &boundDatums[i];
+ boundinfo->datums[i][0] = datumCopy(all_values[i].value,
+ key->parttypbyval[0],
+ key->parttyplen[0]);
+
+ /* If the old index has no mapping, assign one */
+ if ((*mapping)[orig_index] == -1)
+ (*mapping)[orig_index] = next_index++;
+
+ boundinfo->indexes[i] = (*mapping)[orig_index];
+ }
+
+ pfree(all_values);
+
+ /*
+ * Set the canonical value for null_index, if any.
+ *
+ * It is possible that the null-accepting partition has not been assigned
+ * an index yet, which can happen if that partition accepts only NULL
+ * and hence was not handled in the above loop, which looked only at
+ * non-null values.
+ */
+ if (null_index != -1)
+ {
+ Assert(null_index >= 0);
+ if ((*mapping)[null_index] == -1)
+ (*mapping)[null_index] = next_index++;
+ boundinfo->null_index = (*mapping)[null_index];
+ }
+
+ /* Set the canonical value for default_index, if any. */
+ if (default_index != -1)
+ {
+ /*
+ * The default partition accepts any value not specified in the lists
+ * of other partitions, hence it cannot have been assigned a mapped
+ * index while those were being assigned for non-null datums.
+ */
+ Assert(default_index >= 0);
+ Assert((*mapping)[default_index] == -1);
+ (*mapping)[default_index] = next_index++;
+ boundinfo->default_index = (*mapping)[default_index];
+ }
+
+ /*
+ * Calculate interleaved partitions. Here we look for partitions which
+ * might be interleaved with other partitions and set a bit in
+ * interleaved_parts for each such partition found.
+ */
+
+ /*
+ * There must be multiple partitions to have any interleaved partitions;
+ * otherwise there's nothing to interleave with.
+ */
+ if (nparts > 1)
+ {
+ /*
+ * Short-circuit check to see if only 1 Datum is allowed per
+ * partition. When this is true there's no need to do the more
+ * expensive checks to look for interleaved values.
+ */
+ if (boundinfo->ndatums +
+ partition_bound_accepts_nulls(boundinfo) +
+ partition_bound_has_default(boundinfo) != nparts)
+ {
+ int last_index = -1;
+
+ /*
+ * Since the indexes array is sorted in Datum order, any
+ * interleaving will show up as partition indexes that are not in
+ * ascending order. Here we check for that and record all
+ * partitions that are out of order.
+ */
+ for (i = 0; i < boundinfo->nindexes; i++)
+ {
+ int index = boundinfo->indexes[i];
+
+ if (index < last_index)
+ boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
+ index);
+
+ /*
+ * Otherwise, if the null_index exists in the indexes array,
+ * then the NULL partition must also allow some other Datum,
+ * therefore it's "interleaved".
+ */
+ else if (partition_bound_accepts_nulls(boundinfo) &&
+ index == boundinfo->null_index)
+ boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
+ index);
+
+ last_index = index;
+ }
+ }
+
+ /*
+ * The DEFAULT partition is the "catch-all" partition that can contain
+ * anything that does not belong to any other partition. If there are
+ * any other partitions then the DEFAULT partition must be marked as
+ * interleaved.
+ */
+ if (partition_bound_has_default(boundinfo))
+ boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
+ boundinfo->default_index);
+ }
+
+
+ /* All partitions must now have been assigned canonical indexes. */
+ Assert(next_index == nparts);
+ return boundinfo;
+}
+
+/*
+ * create_range_bounds
+ * Create a PartitionBoundInfo for a range partitioned table
+ */
+static PartitionBoundInfo
+create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
+ PartitionKey key, int **mapping)
+{
+ PartitionBoundInfo boundinfo;
+ PartitionRangeBound **rbounds = NULL;
+ PartitionRangeBound **all_bounds,
+ *prev;
+ int i,
+ k,
+ partnatts;
+ int ndatums = 0;
+ int default_index = -1;
+ int next_index = 0;
+ Datum *boundDatums;
+ PartitionRangeDatumKind *boundKinds;
+
+ boundinfo = (PartitionBoundInfoData *)
+ palloc0(sizeof(PartitionBoundInfoData));
+ boundinfo->strategy = key->strategy;
+ /* There is no special null-accepting range partition. */
+ boundinfo->null_index = -1;
+ /* Will be set correctly below. */
+ boundinfo->default_index = -1;
+
+ all_bounds = (PartitionRangeBound **)
+ palloc0(2 * nparts * sizeof(PartitionRangeBound *));
+
+ /* Create a unified list of range bounds across all the partitions. */
+ ndatums = 0;
+ for (i = 0; i < nparts; i++)
+ {
+ PartitionBoundSpec *spec = boundspecs[i];
+ PartitionRangeBound *lower,
+ *upper;
+
+ if (spec->strategy != PARTITION_STRATEGY_RANGE)
+ elog(ERROR, "invalid strategy in partition bound spec");
+
+ /*
+ * Note the index of the partition bound spec for the default
+ * partition. There's no datum to add to the all_bounds array for
+ * this partition.
+ */
+ if (spec->is_default)
+ {
+ default_index = i;
+ continue;
+ }
+
+ lower = make_one_partition_rbound(key, i, spec->lowerdatums, true);
+ upper = make_one_partition_rbound(key, i, spec->upperdatums, false);
+ all_bounds[ndatums++] = lower;
+ all_bounds[ndatums++] = upper;
+ }
+
+ Assert(ndatums == nparts * 2 ||
+ (default_index != -1 && ndatums == (nparts - 1) * 2));
+
+ /* Sort all the bounds in ascending order */
+ qsort_arg(all_bounds, ndatums,
+ sizeof(PartitionRangeBound *),
+ qsort_partition_rbound_cmp,
+ (void *) key);
+
+ /* Save distinct bounds from all_bounds into rbounds. */
+ rbounds = (PartitionRangeBound **)
+ palloc(ndatums * sizeof(PartitionRangeBound *));
+ k = 0;
+ prev = NULL;
+ for (i = 0; i < ndatums; i++)
+ {
+ PartitionRangeBound *cur = all_bounds[i];
+ bool is_distinct = false;
+ int j;
+
+ /* Is the current bound distinct from the previous one? */
+ for (j = 0; j < key->partnatts; j++)
+ {
+ Datum cmpval;
+
+ if (prev == NULL || cur->kind[j] != prev->kind[j])
+ {
+ is_distinct = true;
+ break;
+ }
+
+ /*
+ * If the bounds are both MINVALUE or MAXVALUE, stop now and treat
+ * them as equal, since any values after this point must be
+ * ignored.
+ */
+ if (cur->kind[j] != PARTITION_RANGE_DATUM_VALUE)
+ break;
+
+ cmpval = FunctionCall2Coll(&key->partsupfunc[j],
+ key->partcollation[j],
+ cur->datums[j],
+ prev->datums[j]);
+ if (DatumGetInt32(cmpval) != 0)
+ {
+ is_distinct = true;
+ break;
+ }
+ }
+
+ /*
+ * Only if the bound is distinct do we save it into the temporary
+ * rbounds array, which is later copied into the boundinfo datums array.
+ */
+ if (is_distinct)
+ rbounds[k++] = all_bounds[i];
+
+ prev = cur;
+ }
+
+ pfree(all_bounds);
+
+ /* Update ndatums to hold the count of distinct datums. */
+ ndatums = k;
+
+ /*
+ * Add datums to boundinfo. Canonical indexes are values ranging from 0
+ * to nparts - 1, assigned in that order to each partition's upper bound.
+ * For 'datums' elements that are lower bounds, there is -1 in the
+ * 'indexes' array to signify that no partition exists for the values less
+ * than such a bound and greater than or equal to the previous upper
+ * bound.
+ */
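+
+ /*
+ * Illustration: suppose partitions p0 FOR VALUES FROM (1) TO (10) and
+ * p1 FOR VALUES FROM (10) TO (20). The de-duplicated bounds are 1
+ * (lower), 10 (upper) and 20 (upper), and indexes[] becomes
+ * {-1, 0, 1, -1}: values below 1 belong to no partition, [1, 10) to
+ * p0, [10, 20) to p1, and values from 20 up again to no partition.
+ */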
+ boundinfo->ndatums = ndatums;
+ boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
+ boundinfo->kind = (PartitionRangeDatumKind **)
+ palloc(ndatums *
+ sizeof(PartitionRangeDatumKind *));
+ boundinfo->interleaved_parts = NULL;
+
+ /*
+ * For range partitioning, an additional value of -1 is stored as the last
+ * element of the indexes[] array.
+ */
+ boundinfo->nindexes = ndatums + 1;
+ boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));
+
+ /*
+ * To avoid allocating a series of small arrays in the loop below, we
+ * allocate here a single array for the Datums and another for the
+ * PartitionRangeDatumKinds, and assign a portion of these arrays in
+ * each iteration.
+ */
+ partnatts = key->partnatts;
+ boundDatums = (Datum *) palloc(ndatums * partnatts * sizeof(Datum));
+ boundKinds = (PartitionRangeDatumKind *) palloc(ndatums * partnatts *
+ sizeof(PartitionRangeDatumKind));
+
+ for (i = 0; i < ndatums; i++)
+ {
+ int j;
+
+ boundinfo->datums[i] = &boundDatums[i * partnatts];
+ boundinfo->kind[i] = &boundKinds[i * partnatts];
+ for (j = 0; j < partnatts; j++)
+ {
+ if (rbounds[i]->kind[j] == PARTITION_RANGE_DATUM_VALUE)
+ boundinfo->datums[i][j] =
+ datumCopy(rbounds[i]->datums[j],
+ key->parttypbyval[j],
+ key->parttyplen[j]);
+ boundinfo->kind[i][j] = rbounds[i]->kind[j];
+ }
+
+ /*
+ * There is no mapping for invalid indexes.
+ *
+ * Any lower bounds in the rbounds array have invalid indexes
+ * assigned, because the values between the previous bound (if there
+ * is one) and this (lower) bound are not part of the range of any
+ * existing partition.
+ */
+ if (rbounds[i]->lower)
+ boundinfo->indexes[i] = -1;
+ else
+ {
+ int orig_index = rbounds[i]->index;
+
+ /* If the old index has no mapping, assign one */
+ if ((*mapping)[orig_index] == -1)
+ (*mapping)[orig_index] = next_index++;
+
+ boundinfo->indexes[i] = (*mapping)[orig_index];
+ }
+ }
+
+ pfree(rbounds);
+
+ /* Set the canonical value for default_index, if any. */
+ if (default_index != -1)
+ {
+ Assert(default_index >= 0 && (*mapping)[default_index] == -1);
+ (*mapping)[default_index] = next_index++;
+ boundinfo->default_index = (*mapping)[default_index];
+ }
+
+ /* The extra -1 element. */
+ Assert(i == ndatums);
+ boundinfo->indexes[i] = -1;
+
+ /* All partitions must now have been assigned canonical indexes. */
+ Assert(next_index == nparts);
+ return boundinfo;
+}
+
+/*
+ * Are two partition bound collections logically equal?
+ *
+ * Used in the keep logic of relcache.c (i.e., in RelationClearRelation()).
+ * This is also useful when b1 and b2 are the bound collections of two
+ * separate relations, because PartitionBoundInfo is a canonical
+ * representation of partition bounds.
+ */
+bool
+partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
+ PartitionBoundInfo b1, PartitionBoundInfo b2)
+{
+ int i;
+
+ if (b1->strategy != b2->strategy)
+ return false;
+
+ if (b1->ndatums != b2->ndatums)
+ return false;
+
+ if (b1->nindexes != b2->nindexes)
+ return false;
+
+ if (b1->null_index != b2->null_index)
+ return false;
+
+ if (b1->default_index != b2->default_index)
+ return false;
+
+ /* For all partition strategies, the indexes[] arrays have to match */
+ for (i = 0; i < b1->nindexes; i++)
+ {
+ if (b1->indexes[i] != b2->indexes[i])
+ return false;
+ }
+
+ /* Finally, compare the datums[] arrays */
+ if (b1->strategy == PARTITION_STRATEGY_HASH)
+ {
+ /*
+ * We arrange the partitions in ascending order of their moduli and
+ * remainders, and every modulus is a factor of the next larger
+ * modulus. Therefore we can safely store the index of a given
+ * partition in the indexes array at that partition's remainder, and
+ * the entries at positions (remainder + N * modulus) in the indexes
+ * array are all the same for any partition's (modulus, remainder)
+ * specification. Thus the datums arrays from the given bounds are the
+ * same if and only if their indexes arrays are the same, so it
+ * suffices to compare the indexes arrays.
+ *
+ * Nonetheless make sure that the bounds are indeed the same when the
+ * indexes match. A hash partition bound stores its modulus and
+ * remainder at positions b1->datums[i][0] and b1->datums[i][1]
+ * respectively.
+ */
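+
+ /*
+ * For instance, an indexes[] array of {0, 1, 0, 2} can only arise from
+ * the (modulus, remainder) pairs (2, 0), (4, 1) and (4, 3), so equal
+ * indexes arrays imply equal datums arrays.
+ */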
+#ifdef USE_ASSERT_CHECKING
+ for (i = 0; i < b1->ndatums; i++)
+ Assert((b1->datums[i][0] == b2->datums[i][0] &&
+ b1->datums[i][1] == b2->datums[i][1]));
+#endif
+ }
+ else
+ {
+ for (i = 0; i < b1->ndatums; i++)
+ {
+ int j;
+
+ for (j = 0; j < partnatts; j++)
+ {
+ /* For range partitions, the bounds might not be finite. */
+ if (b1->kind != NULL)
+ {
+ /* The different kinds of bound all differ from each other */
+ if (b1->kind[i][j] != b2->kind[i][j])
+ return false;
+
+ /*
+ * Non-finite bounds are equal without further
+ * examination.
+ */
+ if (b1->kind[i][j] != PARTITION_RANGE_DATUM_VALUE)
+ continue;
+ }
+
+ /*
+ * Compare the actual values. Note that it would be both
+ * incorrect and unsafe to invoke the comparison operator
+ * derived from the partitioning specification here. It would
+ * be incorrect because we want the relcache entry to be
+ * updated for ANY change to the partition bounds, not just
+ * those that the partitioning operator thinks are
+ * significant. It would be unsafe because we might reach
+ * this code in the context of an aborted transaction, and an
+ * arbitrary partitioning operator might not be safe in that
+ * context. datumIsEqual() should be simple enough to be
+ * safe.
+ */
+ if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j],
+ parttypbyval[j], parttyplen[j]))
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/*
+ * Return a copy of the given PartitionBoundInfo structure. The data types of
+ * the bounds are described by the given partition key specification.
+ *
+ * Note: it's important that this function and its callees not do any catalog
+ * access, nor anything else that would result in allocating memory other than
+ * the returned data structure. Since this is called in a long-lived context,
+ * that would result in unwanted memory leaks.
+ */
+PartitionBoundInfo
+partition_bounds_copy(PartitionBoundInfo src,
+ PartitionKey key)
+{
+ PartitionBoundInfo dest;
+ int i;
+ int ndatums;
+ int nindexes;
+ int partnatts;
+ bool hash_part;
+ int natts;
+ Datum *boundDatums;
+
+ dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData));
+
+ dest->strategy = src->strategy;
+ ndatums = dest->ndatums = src->ndatums;
+ nindexes = dest->nindexes = src->nindexes;
+ partnatts = key->partnatts;
+
+ /* List partitioned tables have only a single partition key. */
+ Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
+
+ dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums);
+
+ if (src->kind != NULL)
+ {
+ PartitionRangeDatumKind *boundKinds;
+
+ /* only RANGE partition should have a non-NULL kind */
+ Assert(key->strategy == PARTITION_STRATEGY_RANGE);
+
+ dest->kind = (PartitionRangeDatumKind **) palloc(ndatums *
+ sizeof(PartitionRangeDatumKind *));
+
+ /*
+ * In the loop below, to avoid allocating a series of small arrays
+ * for storing the PartitionRangeDatumKind, we allocate a single chunk
+ * here and use a smaller portion of it for each datum.
+ */
+ boundKinds = (PartitionRangeDatumKind *) palloc(ndatums * partnatts *
+ sizeof(PartitionRangeDatumKind));
+
+ for (i = 0; i < ndatums; i++)
+ {
+ dest->kind[i] = &boundKinds[i * partnatts];
+ memcpy(dest->kind[i], src->kind[i],
+ sizeof(PartitionRangeDatumKind) * partnatts);
+ }
+ }
+ else
+ dest->kind = NULL;
+
+ /* copy interleaved partitions for LIST partitioned tables */
+ dest->interleaved_parts = bms_copy(src->interleaved_parts);
+
+ /*
+ * For hash partitioning, the datums array will have two elements:
+ * modulus and remainder.
+ */
+ hash_part = (key->strategy == PARTITION_STRATEGY_HASH);
+ natts = hash_part ? 2 : partnatts;
+ boundDatums = palloc(ndatums * natts * sizeof(Datum));
+
+ for (i = 0; i < ndatums; i++)
+ {
+ int j;
+
+ dest->datums[i] = &boundDatums[i * natts];
+
+ for (j = 0; j < natts; j++)
+ {
+ bool byval;
+ int typlen;
+
+ if (hash_part)
+ {
+ typlen = sizeof(int32); /* Always int4 */
+ byval = true; /* int4 is pass-by-value */
+ }
+ else
+ {
+ byval = key->parttypbyval[j];
+ typlen = key->parttyplen[j];
+ }
+
+ if (dest->kind == NULL ||
+ dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE)
+ dest->datums[i][j] = datumCopy(src->datums[i][j],
+ byval, typlen);
+ }
+ }
+
+ dest->indexes = (int *) palloc(sizeof(int) * nindexes);
+ memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
+
+ dest->null_index = src->null_index;
+ dest->default_index = src->default_index;
+
+ return dest;
+}
+
+/*
+ * partition_bounds_merge
+ * Check to see whether every partition of 'outer_rel' matches/overlaps at
+ * most one partition of 'inner_rel', and vice versa; and if so, build
+ * and return the partition bounds for a join relation between the rels,
+ * generating two lists of the matching/overlapping partitions, which are
+ * returned in *outer_parts and *inner_parts respectively.
+ *
+ * The lists contain the same number of partitions, and the partitions at the
+ * same positions in the lists indicate join pairs used for partitioned join.
+ * If a partition on one side matches/overlaps multiple partitions on the other
+ * side, this function returns NULL, setting *outer_parts and *inner_parts to
+ * NIL.
+ */
+PartitionBoundInfo
+partition_bounds_merge(int partnatts,
+ FmgrInfo *partsupfunc, Oid *partcollation,
+ RelOptInfo *outer_rel, RelOptInfo *inner_rel,
+ JoinType jointype,
+ List **outer_parts, List **inner_parts)
+{
+ /*
+ * Currently, this function is called only from try_partitionwise_join(),
+ * so the join type should be INNER, LEFT, FULL, SEMI, or ANTI.
+ */
+ Assert(jointype == JOIN_INNER || jointype == JOIN_LEFT ||
+ jointype == JOIN_FULL || jointype == JOIN_SEMI ||
+ jointype == JOIN_ANTI);
+
+ /* The partitioning strategies should be the same. */
+ Assert(outer_rel->boundinfo->strategy == inner_rel->boundinfo->strategy);
+
+ *outer_parts = *inner_parts = NIL;
+ switch (outer_rel->boundinfo->strategy)
+ {
+ case PARTITION_STRATEGY_HASH:
+
+ /*
+ * For hash partitioned tables, we currently support partitioned
+ * join only when they have exactly the same partition bounds.
+ *
+ * XXX: it might be possible to relax the restriction to support
+ * cases where hash partitioned tables have missing partitions
+ * and/or different moduli, but it's not clear if it would be
+ * useful to support the former case since it's unusual to have
+ * missing partitions. On the other hand, it would be useful to
+ * support the latter case, but in that case, there is a high
+ * probability that a partition on one side will match multiple
+ * partitions on the other side, which is the scenario the current
+ * implementation of partitioned join can't handle.
+ */
+ return NULL;
+
+ case PARTITION_STRATEGY_LIST:
+ return merge_list_bounds(partsupfunc,
+ partcollation,
+ outer_rel,
+ inner_rel,
+ jointype,
+ outer_parts,
+ inner_parts);
+
+ case PARTITION_STRATEGY_RANGE:
+ return merge_range_bounds(partnatts,
+ partsupfunc,
+ partcollation,
+ outer_rel,
+ inner_rel,
+ jointype,
+ outer_parts,
+ inner_parts);
+
+ default:
+ elog(ERROR, "unexpected partition strategy: %d",
+ (int) outer_rel->boundinfo->strategy);
+ return NULL; /* keep compiler quiet */
+ }
+}
+
+/*
+ * merge_list_bounds
+ * Create the partition bounds for a join relation between list
+ * partitioned tables, if possible
+ *
+ * In this function we try to find sets of matching partitions from both sides
+ * by comparing list values stored in their partition bounds. Since the list
+ * values appear in ascending order, an algorithm similar to merge join is
+ * used for that. If a partition on one side doesn't have a matching
+ * partition on the other side, the algorithm tries to match it with the
+ * default partition on the other side if any; if not, the algorithm tries to
+ * match it with a dummy partition on the other side if it's on the
+ * non-nullable side of an outer join. Also, if both sides have the default
+ * partitions, the algorithm tries to match them with each other. We give up
+ * if the algorithm finds a partition matching multiple partitions on the
+ * other side, which is the scenario the current implementation of partitioned
+ * join can't handle.
+ */
+static PartitionBoundInfo
+merge_list_bounds(FmgrInfo *partsupfunc, Oid *partcollation,
+ RelOptInfo *outer_rel, RelOptInfo *inner_rel,
+ JoinType jointype,
+ List **outer_parts, List **inner_parts)
+{
+ PartitionBoundInfo merged_bounds = NULL;
+ PartitionBoundInfo outer_bi = outer_rel->boundinfo;
+ PartitionBoundInfo inner_bi = inner_rel->boundinfo;
+ bool outer_has_default = partition_bound_has_default(outer_bi);
+ bool inner_has_default = partition_bound_has_default(inner_bi);
+ int outer_default = outer_bi->default_index;
+ int inner_default = inner_bi->default_index;
+ bool outer_has_null = partition_bound_accepts_nulls(outer_bi);
+ bool inner_has_null = partition_bound_accepts_nulls(inner_bi);
+ PartitionMap outer_map;
+ PartitionMap inner_map;
+ int outer_pos;
+ int inner_pos;
+ int next_index = 0;
+ int null_index = -1;
+ int default_index = -1;
+ List *merged_datums = NIL;
+ List *merged_indexes = NIL;
+
+ Assert(*outer_parts == NIL);
+ Assert(*inner_parts == NIL);
+ Assert(outer_bi->strategy == inner_bi->strategy &&
+ outer_bi->strategy == PARTITION_STRATEGY_LIST);
+ /* List partitioning doesn't require kinds. */
+ Assert(!outer_bi->kind && !inner_bi->kind);
+
+ init_partition_map(outer_rel, &outer_map);
+ init_partition_map(inner_rel, &inner_map);
+
+ /*
+ * If the default partitions (if any) have been proven empty, deem them
+ * non-existent.
+ */
+ if (outer_has_default && is_dummy_partition(outer_rel, outer_default))
+ outer_has_default = false;
+ if (inner_has_default && is_dummy_partition(inner_rel, inner_default))
+ inner_has_default = false;
+
+ /*
+ * Merge partitions from both sides. In each iteration we compare a pair
+ * of list values, one from each side, and decide whether the
+ * corresponding partitions match or not. If the two values match
+ * exactly, move to the next pair of list values, otherwise move to the
+ * next list value on the side with a smaller list value.
+ */
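+
+ /*
+ * Illustration (inner join, no default partitions): with outer list
+ * values {1 (partition O0), 3 (O1)} and inner list values {1 (I0),
+ * 2 (I1)}, the value 1 matches exactly and O0 and I0 are merged into
+ * merged partition 0; the values 2 and 3 have no match on the other
+ * side and contribute nothing, so the merged bounds contain the
+ * single datum 1.
+ */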
+ outer_pos = inner_pos = 0;
+ while (outer_pos < outer_bi->ndatums || inner_pos < inner_bi->ndatums)
+ {
+ int outer_index = -1;
+ int inner_index = -1;
+ Datum *outer_datums;
+ Datum *inner_datums;
+ int cmpval;
+ Datum *merged_datum = NULL;
+ int merged_index = -1;
+
+ if (outer_pos < outer_bi->ndatums)
+ {
+ /*
+ * If the partition on the outer side has been proven empty,
+ * ignore it and move to the next datum on the outer side.
+ */
+ outer_index = outer_bi->indexes[outer_pos];
+ if (is_dummy_partition(outer_rel, outer_index))
+ {
+ outer_pos++;
+ continue;
+ }
+ }
+ if (inner_pos < inner_bi->ndatums)
+ {
+ /*
+ * If the partition on the inner side has been proven empty,
+ * ignore it and move to the next datum on the inner side.
+ */
+ inner_index = inner_bi->indexes[inner_pos];
+ if (is_dummy_partition(inner_rel, inner_index))
+ {
+ inner_pos++;
+ continue;
+ }
+ }
+
+ /* Get the list values. */
+ outer_datums = outer_pos < outer_bi->ndatums ?
+ outer_bi->datums[outer_pos] : NULL;
+ inner_datums = inner_pos < inner_bi->ndatums ?
+ inner_bi->datums[inner_pos] : NULL;
+
+ /*
+ * We run this loop until both sides finish. This allows us to avoid
+ * duplicating code to handle the remaining values on the side which
+ * finishes later. For that we set the comparison parameter cmpval in
+ * such a way that it appears as if the side which finishes earlier
+ * has an extra value higher than any other value on the unfinished
+ * side. That way we advance the values on the unfinished side until
+ * all of its values are exhausted.
+ */
+ if (outer_pos >= outer_bi->ndatums)
+ cmpval = 1;
+ else if (inner_pos >= inner_bi->ndatums)
+ cmpval = -1;
+ else
+ {
+ Assert(outer_datums != NULL && inner_datums != NULL);
+ cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[0],
+ partcollation[0],
+ outer_datums[0],
+ inner_datums[0]));
+ }
+
+ if (cmpval == 0)
+ {
+ /* Two list values match exactly. */
+ Assert(outer_pos < outer_bi->ndatums);
+ Assert(inner_pos < inner_bi->ndatums);
+ Assert(outer_index >= 0);
+ Assert(inner_index >= 0);
+
+ /*
+ * Try merging both partitions. If successful, add the list value
+ * and index of the merged partition below.
+ */
+ merged_index = merge_matching_partitions(&outer_map, &inner_map,
+ outer_index, inner_index,
+ &next_index);
+ if (merged_index == -1)
+ goto cleanup;
+
+ merged_datum = outer_datums;
+
+ /* Move to the next pair of list values. */
+ outer_pos++;
+ inner_pos++;
+ }
+ else if (cmpval < 0)
+ {
+ /* A list value missing from the inner side. */
+ Assert(outer_pos < outer_bi->ndatums);
+
+ /*
+ * If the inner side has the default partition, or this is an
+ * outer join, try to assign a merged partition to the outer
+ * partition (see process_outer_partition()). Otherwise, the
+ * outer partition will not contribute to the result.
+ */
+ if (inner_has_default || IS_OUTER_JOIN(jointype))
+ {
+ /* Get the outer partition. */
+ outer_index = outer_bi->indexes[outer_pos];
+ Assert(outer_index >= 0);
+ merged_index = process_outer_partition(&outer_map,
+ &inner_map,
+ outer_has_default,
+ inner_has_default,
+ outer_index,
+ inner_default,
+ jointype,
+ &next_index,
+ &default_index);
+ if (merged_index == -1)
+ goto cleanup;
+ merged_datum = outer_datums;
+ }
+
+ /* Move to the next list value on the outer side. */
+ outer_pos++;
+ }
+ else
+ {
+ /* A list value missing from the outer side. */
+ Assert(cmpval > 0);
+ Assert(inner_pos < inner_bi->ndatums);
+
+ /*
+ * If the outer side has the default partition, or this is a FULL
+ * join, try to assign a merged partition to the inner partition
+ * (see process_inner_partition()). Otherwise, the inner
+ * partition will not contribute to the result.
+ */
+ if (outer_has_default || jointype == JOIN_FULL)
+ {
+ /* Get the inner partition. */
+ inner_index = inner_bi->indexes[inner_pos];
+ Assert(inner_index >= 0);
+ merged_index = process_inner_partition(&outer_map,
+ &inner_map,
+ outer_has_default,
+ inner_has_default,
+ inner_index,
+ outer_default,
+ jointype,
+ &next_index,
+ &default_index);
+ if (merged_index == -1)
+ goto cleanup;
+ merged_datum = inner_datums;
+ }
+
+ /* Move to the next list value on the inner side. */
+ inner_pos++;
+ }
+
+ /*
+ * If we assigned a merged partition, add the list value and index of
+ * the merged partition if appropriate.
+ */
+ if (merged_index >= 0 && merged_index != default_index)
+ {
+ merged_datums = lappend(merged_datums, merged_datum);
+ merged_indexes = lappend_int(merged_indexes, merged_index);
+ }
+ }
+
+ /*
+ * If the NULL partitions (if any) have been proven empty, deem them
+ * non-existent.
+ */
+ if (outer_has_null &&
+ is_dummy_partition(outer_rel, outer_bi->null_index))
+ outer_has_null = false;
+ if (inner_has_null &&
+ is_dummy_partition(inner_rel, inner_bi->null_index))
+ inner_has_null = false;
+
+ /* Merge the NULL partitions if any. */
+ if (outer_has_null || inner_has_null)
+ merge_null_partitions(&outer_map, &inner_map,
+ outer_has_null, inner_has_null,
+ outer_bi->null_index, inner_bi->null_index,
+ jointype, &next_index, &null_index);
+ else
+ Assert(null_index == -1);
+
+ /* Merge the default partitions if any. */
+ if (outer_has_default || inner_has_default)
+ merge_default_partitions(&outer_map, &inner_map,
+ outer_has_default, inner_has_default,
+ outer_default, inner_default,
+ jointype, &next_index, &default_index);
+ else
+ Assert(default_index == -1);
+
+ /* If we have merged partitions, create the partition bounds. */
+ if (next_index > 0)
+ {
+ /* Fix the merged_indexes list if necessary. */
+ if (outer_map.did_remapping || inner_map.did_remapping)
+ {
+ Assert(jointype == JOIN_FULL);
+ fix_merged_indexes(&outer_map, &inner_map,
+ next_index, merged_indexes);
+ }
+
+ /* Use maps to match partitions from inputs. */
+ generate_matching_part_pairs(outer_rel, inner_rel,
+ &outer_map, &inner_map,
+ next_index,
+ outer_parts, inner_parts);
+ Assert(*outer_parts != NIL);
+ Assert(*inner_parts != NIL);
+ Assert(list_length(*outer_parts) == list_length(*inner_parts));
+ Assert(list_length(*outer_parts) <= next_index);
+
+ /* Make a PartitionBoundInfo struct to return. */
+ merged_bounds = build_merged_partition_bounds(outer_bi->strategy,
+ merged_datums,
+ NIL,
+ merged_indexes,
+ null_index,
+ default_index);
+ Assert(merged_bounds);
+ }
+
+cleanup:
+ /* Free local memory before returning. */
+ list_free(merged_datums);
+ list_free(merged_indexes);
+ free_partition_map(&outer_map);
+ free_partition_map(&inner_map);
+
+ return merged_bounds;
+}
+
+/*
+ * merge_range_bounds
+ * Create the partition bounds for a join relation between range
+ * partitioned tables, if possible
+ *
+ * In this function we try to find sets of overlapping partitions from both
+ * sides by comparing ranges stored in their partition bounds. Since the
+ * ranges appear in ascending order, an algorithm similar to merge join is
+ * used for that. If a partition on one side doesn't have an overlapping
+ * partition on the other side, the algorithm tries to match it with the
+ * default partition on the other side if any; if not, the algorithm tries to
+ * match it with a dummy partition on the other side if it's on the
+ * non-nullable side of an outer join. Also, if both sides have the default
+ * partitions, the algorithm tries to match them with each other. We give up
+ * if the algorithm finds a partition overlapping multiple partitions on the
+ * other side, which is the scenario the current implementation of partitioned
+ * join can't handle.
+ */
+static PartitionBoundInfo
+merge_range_bounds(int partnatts, FmgrInfo *partsupfuncs,
+ Oid *partcollations,
+ RelOptInfo *outer_rel, RelOptInfo *inner_rel,
+ JoinType jointype,
+ List **outer_parts, List **inner_parts)
+{
+ PartitionBoundInfo merged_bounds = NULL;
+ PartitionBoundInfo outer_bi = outer_rel->boundinfo;
+ PartitionBoundInfo inner_bi = inner_rel->boundinfo;
+ bool outer_has_default = partition_bound_has_default(outer_bi);
+ bool inner_has_default = partition_bound_has_default(inner_bi);
+ int outer_default = outer_bi->default_index;
+ int inner_default = inner_bi->default_index;
+ PartitionMap outer_map;
+ PartitionMap inner_map;
+ int outer_index;
+ int inner_index;
+ int outer_lb_pos;
+ int inner_lb_pos;
+ PartitionRangeBound outer_lb;
+ PartitionRangeBound outer_ub;
+ PartitionRangeBound inner_lb;
+ PartitionRangeBound inner_ub;
+ int next_index = 0;
+ int default_index = -1;
+ List *merged_datums = NIL;
+ List *merged_kinds = NIL;
+ List *merged_indexes = NIL;
+
+ Assert(*outer_parts == NIL);
+ Assert(*inner_parts == NIL);
+ Assert(outer_bi->strategy == inner_bi->strategy &&
+ outer_bi->strategy == PARTITION_STRATEGY_RANGE);
+
+ init_partition_map(outer_rel, &outer_map);
+ init_partition_map(inner_rel, &inner_map);
+
+ /*
+ * If the default partitions (if any) have been proven empty, deem them
+ * non-existent.
+ */
+ if (outer_has_default && is_dummy_partition(outer_rel, outer_default))
+ outer_has_default = false;
+ if (inner_has_default && is_dummy_partition(inner_rel, inner_default))
+ inner_has_default = false;
+
+ /*
+ * Merge partitions from both sides. In each iteration we compare a pair
+ * of ranges, one from each side, and decide whether the corresponding
+ * partitions match or not. If the two ranges overlap, move to the next
+ * pair of ranges, otherwise move to the next range on the side with a
+ * lower range. outer_lb_pos/inner_lb_pos keep track of the positions of
+ * lower bounds in the datums arrays in the outer/inner
+ * PartitionBoundInfos respectively.
+ */
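+
+ /*
+ * Illustration (inner join, no default partitions): with outer ranges
+ * [0, 10) and [10, 20) and inner ranges [0, 10) and [15, 25), the
+ * first pair matches exactly and the second pair overlaps, yielding
+ * two join pairs with merged ranges [0, 10) and [15, 20). If the
+ * inner side instead held the single range [5, 15), it would overlap
+ * both outer ranges and we would give up.
+ */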
+ outer_lb_pos = inner_lb_pos = 0;
+ outer_index = get_range_partition(outer_rel, outer_bi, &outer_lb_pos,
+ &outer_lb, &outer_ub);
+ inner_index = get_range_partition(inner_rel, inner_bi, &inner_lb_pos,
+ &inner_lb, &inner_ub);
+ while (outer_index >= 0 || inner_index >= 0)
+ {
+ bool overlap;
+ int ub_cmpval;
+ int lb_cmpval;
+ PartitionRangeBound merged_lb = {-1, NULL, NULL, true};
+ PartitionRangeBound merged_ub = {-1, NULL, NULL, false};
+ int merged_index = -1;
+
+ /*
+ * We run this loop until both sides finish. This allows us to avoid
+ * duplicating code to handle the remaining ranges on the side which
+ * finishes later. For that we set the comparison parameter cmpval in
+ * such a way that it appears as if the side which finishes earlier
+ * has an extra range higher than any other range on the unfinished
+ * side. That way we advance the ranges on the unfinished side until
+ * all of its ranges are exhausted.
+ */
+ if (outer_index == -1)
+ {
+ overlap = false;
+ lb_cmpval = 1;
+ ub_cmpval = 1;
+ }
+ else if (inner_index == -1)
+ {
+ overlap = false;
+ lb_cmpval = -1;
+ ub_cmpval = -1;
+ }
+ else
+ overlap = compare_range_partitions(partnatts, partsupfuncs,
+ partcollations,
+ &outer_lb, &outer_ub,
+ &inner_lb, &inner_ub,
+ &lb_cmpval, &ub_cmpval);
+
+ if (overlap)
+ {
+ /* Two ranges overlap; form a join pair. */
+
+ PartitionRangeBound save_outer_ub;
+ PartitionRangeBound save_inner_ub;
+
+ /* Neither partition should have been merged yet. */
+ Assert(outer_index >= 0);
+ Assert(outer_map.merged_indexes[outer_index] == -1 &&
+ outer_map.merged[outer_index] == false);
+ Assert(inner_index >= 0);
+ Assert(inner_map.merged_indexes[inner_index] == -1 &&
+ inner_map.merged[inner_index] == false);
+
+ /*
+ * Get the index of the merged partition. Neither partition has been
+ * merged yet, so the merge should succeed.
+ */
+ merged_index = merge_matching_partitions(&outer_map, &inner_map,
+ outer_index, inner_index,
+ &next_index);
+ Assert(merged_index >= 0);
+
+ /* Get the range bounds of the merged partition. */
+ get_merged_range_bounds(partnatts, partsupfuncs,
+ partcollations, jointype,
+ &outer_lb, &outer_ub,
+ &inner_lb, &inner_ub,
+ lb_cmpval, ub_cmpval,
+ &merged_lb, &merged_ub);
+
+ /* Save the upper bounds of both partitions for use below. */
+ save_outer_ub = outer_ub;
+ save_inner_ub = inner_ub;
+
+ /* Move to the next pair of ranges. */
+ outer_index = get_range_partition(outer_rel, outer_bi, &outer_lb_pos,
+ &outer_lb, &outer_ub);
+ inner_index = get_range_partition(inner_rel, inner_bi, &inner_lb_pos,
+ &inner_lb, &inner_ub);
+
+ /*
+ * If the range of a partition on one side overlaps the range of
+ * the next partition on the other side, that will cause the
+ * partition on one side to match at least two partitions on the
+ * other side, which is the case that we currently don't support
+ * partitioned join for; give up.
+ */
+ if (ub_cmpval > 0 && inner_index >= 0 &&
+ compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ &save_outer_ub, &inner_lb) > 0)
+ goto cleanup;
+ if (ub_cmpval < 0 && outer_index >= 0 &&
+ compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ &outer_lb, &save_inner_ub) < 0)
+ goto cleanup;
+
+ /*
+ * A row from a non-overlapping portion (if any) of a partition on
+ * one side might find its join partner in the default partition
+ * (if any) on the other side, causing the same situation as
+ * above; give up in that case.
+ */
+ if ((outer_has_default && (lb_cmpval > 0 || ub_cmpval < 0)) ||
+ (inner_has_default && (lb_cmpval < 0 || ub_cmpval > 0)))
+ goto cleanup;
+ }
+ else if (ub_cmpval < 0)
+ {
+ /* A non-overlapping outer range. */
+
+ /* The outer partition should not have been merged yet. */
+ Assert(outer_index >= 0);
+ Assert(outer_map.merged_indexes[outer_index] == -1 &&
+ outer_map.merged[outer_index] == false);
+
+ /*
+ * If the inner side has the default partition, or this is an
+ * outer join, try to assign a merged partition to the outer
+ * partition (see process_outer_partition()). Otherwise, the
+ * outer partition will not contribute to the result.
+ */
+ if (inner_has_default || IS_OUTER_JOIN(jointype))
+ {
+ merged_index = process_outer_partition(&outer_map,
+ &inner_map,
+ outer_has_default,
+ inner_has_default,
+ outer_index,
+ inner_default,
+ jointype,
+ &next_index,
+ &default_index);
+ if (merged_index == -1)
+ goto cleanup;
+ merged_lb = outer_lb;
+ merged_ub = outer_ub;
+ }
+
+ /* Move to the next range on the outer side. */
+ outer_index = get_range_partition(outer_rel, outer_bi, &outer_lb_pos,
+ &outer_lb, &outer_ub);
+ }
+ else
+ {
+ /* A non-overlapping inner range. */
+ Assert(ub_cmpval > 0);
+
+ /* The inner partition should not have been merged yet. */
+ Assert(inner_index >= 0);
+ Assert(inner_map.merged_indexes[inner_index] == -1 &&
+ inner_map.merged[inner_index] == false);
+
+ /*
+ * If the outer side has the default partition, or this is a FULL
+ * join, try to assign a merged partition to the inner partition
+ * (see process_inner_partition()). Otherwise, the inner
+ * partition will not contribute to the result.
+ */
+ if (outer_has_default || jointype == JOIN_FULL)
+ {
+ merged_index = process_inner_partition(&outer_map,
+ &inner_map,
+ outer_has_default,
+ inner_has_default,
+ inner_index,
+ outer_default,
+ jointype,
+ &next_index,
+ &default_index);
+ if (merged_index == -1)
+ goto cleanup;
+ merged_lb = inner_lb;
+ merged_ub = inner_ub;
+ }
+
+ /* Move to the next range on the inner side. */
+ inner_index = get_range_partition(inner_rel, inner_bi, &inner_lb_pos,
+ &inner_lb, &inner_ub);
+ }
+
+ /*
+ * If we assigned a merged partition, add the range bounds and index
+ * of the merged partition if appropriate.
+ */
+ if (merged_index >= 0 && merged_index != default_index)
+ add_merged_range_bounds(partnatts, partsupfuncs, partcollations,
+ &merged_lb, &merged_ub, merged_index,
+ &merged_datums, &merged_kinds,
+ &merged_indexes);
+ }
+
+ /* Merge the default partitions if any. */
+ if (outer_has_default || inner_has_default)
+ merge_default_partitions(&outer_map, &inner_map,
+ outer_has_default, inner_has_default,
+ outer_default, inner_default,
+ jointype, &next_index, &default_index);
+ else
+ Assert(default_index == -1);
+
+ /* If we have merged partitions, create the partition bounds. */
+ if (next_index > 0)
+ {
+ /*
+ * Unlike the case of list partitioning, we wouldn't have re-merged
+ * partitions, so did_remapping should still be false.
+ */
+ Assert(!outer_map.did_remapping);
+ Assert(!inner_map.did_remapping);
+
+ /* Use maps to match partitions from inputs. */
+ generate_matching_part_pairs(outer_rel, inner_rel,
+ &outer_map, &inner_map,
+ next_index,
+ outer_parts, inner_parts);
+ Assert(*outer_parts != NIL);
+ Assert(*inner_parts != NIL);
+ Assert(list_length(*outer_parts) == list_length(*inner_parts));
+ Assert(list_length(*outer_parts) == next_index);
+
+ /* Make a PartitionBoundInfo struct to return. */
+ merged_bounds = build_merged_partition_bounds(outer_bi->strategy,
+ merged_datums,
+ merged_kinds,
+ merged_indexes,
+ -1,
+ default_index);
+ Assert(merged_bounds);
+ }
+
+cleanup:
+ /* Free local memory before returning. */
+ list_free(merged_datums);
+ list_free(merged_kinds);
+ list_free(merged_indexes);
+ free_partition_map(&outer_map);
+ free_partition_map(&inner_map);
+
+ return merged_bounds;
+}
+
+/*
+ * init_partition_map
+ * Initialize a PartitionMap struct for given relation
+ */
+static void
+init_partition_map(RelOptInfo *rel, PartitionMap *map)
+{
+ int nparts = rel->nparts;
+ int i;
+
+ map->nparts = nparts;
+ map->merged_indexes = (int *) palloc(sizeof(int) * nparts);
+ map->merged = (bool *) palloc(sizeof(bool) * nparts);
+ map->did_remapping = false;
+ map->old_indexes = (int *) palloc(sizeof(int) * nparts);
+ for (i = 0; i < nparts; i++)
+ {
+ map->merged_indexes[i] = map->old_indexes[i] = -1;
+ map->merged[i] = false;
+ }
+}
+
+/*
+ * free_partition_map
+ */
+static void
+free_partition_map(PartitionMap *map)
+{
+ pfree(map->merged_indexes);
+ pfree(map->merged);
+ pfree(map->old_indexes);
+}
+
+/*
+ * is_dummy_partition --- has partition been proven empty?
+ */
+static bool
+is_dummy_partition(RelOptInfo *rel, int part_index)
+{
+ RelOptInfo *part_rel;
+
+ Assert(part_index >= 0);
+ part_rel = rel->part_rels[part_index];
+ if (part_rel == NULL || IS_DUMMY_REL(part_rel))
+ return true;
+ return false;
+}
+
+/*
+ * merge_matching_partitions
+ * Try to merge given outer/inner partitions, and return the index of a
+ * merged partition produced from them if successful, -1 otherwise
+ *
+ * If the merged partition is newly created, *next_index is incremented.
+ */
+static int
+merge_matching_partitions(PartitionMap *outer_map, PartitionMap *inner_map,
+ int outer_index, int inner_index, int *next_index)
+{
+ int outer_merged_index;
+ int inner_merged_index;
+ bool outer_merged;
+ bool inner_merged;
+
+ Assert(outer_index >= 0 && outer_index < outer_map->nparts);
+ outer_merged_index = outer_map->merged_indexes[outer_index];
+ outer_merged = outer_map->merged[outer_index];
+ Assert(inner_index >= 0 && inner_index < inner_map->nparts);
+ inner_merged_index = inner_map->merged_indexes[inner_index];
+ inner_merged = inner_map->merged[inner_index];
+
+ /*
+ * Handle cases where we have already assigned a merged partition to each
+ * of the given partitions.
+ */
+ if (outer_merged_index >= 0 && inner_merged_index >= 0)
+ {
+ /*
+ * If the merged partitions are the same, no need to do anything;
+ * return the index of the merged partitions. Otherwise, if each of
+ * the given partitions has been merged with a dummy partition on the
+ * other side, re-map them to either of the two merged partitions.
+ * Otherwise, they can't be merged, so return -1.
+ */
+ if (outer_merged_index == inner_merged_index)
+ {
+ Assert(outer_merged);
+ Assert(inner_merged);
+ return outer_merged_index;
+ }
+ if (!outer_merged && !inner_merged)
+ {
+ /*
+ * This can only happen for a list-partitioning case. We re-map
+ * them to the merged partition with the smaller of the two merged
+ * indexes to preserve the property that the canonical order of
+ * list partitions is determined by the indexes assigned to the
+ * smallest list value of each partition.
+ */
+ if (outer_merged_index < inner_merged_index)
+ {
+ outer_map->merged[outer_index] = true;
+ inner_map->merged_indexes[inner_index] = outer_merged_index;
+ inner_map->merged[inner_index] = true;
+ inner_map->did_remapping = true;
+ inner_map->old_indexes[inner_index] = inner_merged_index;
+ return outer_merged_index;
+ }
+ else
+ {
+ inner_map->merged[inner_index] = true;
+ outer_map->merged_indexes[outer_index] = inner_merged_index;
+ outer_map->merged[outer_index] = true;
+ outer_map->did_remapping = true;
+ outer_map->old_indexes[outer_index] = outer_merged_index;
+ return inner_merged_index;
+ }
+ }
+ return -1;
+ }
+
+ /* At least one of the given partitions should not have yet been merged. */
+ Assert(outer_merged_index == -1 || inner_merged_index == -1);
+
+ /*
+ * If neither of them has been merged, merge them. Otherwise, if one has
+ * been merged with a dummy partition on the other side (and the other
+ * hasn't yet been merged with anything), re-merge them. Otherwise, they
+ * can't be merged, so return -1.
+ */
+ if (outer_merged_index == -1 && inner_merged_index == -1)
+ {
+ int merged_index = *next_index;
+
+ Assert(!outer_merged);
+ Assert(!inner_merged);
+ outer_map->merged_indexes[outer_index] = merged_index;
+ outer_map->merged[outer_index] = true;
+ inner_map->merged_indexes[inner_index] = merged_index;
+ inner_map->merged[inner_index] = true;
+ *next_index = *next_index + 1;
+ return merged_index;
+ }
+ if (outer_merged_index >= 0 && !outer_map->merged[outer_index])
+ {
+ Assert(inner_merged_index == -1);
+ Assert(!inner_merged);
+ inner_map->merged_indexes[inner_index] = outer_merged_index;
+ inner_map->merged[inner_index] = true;
+ outer_map->merged[outer_index] = true;
+ return outer_merged_index;
+ }
+ if (inner_merged_index >= 0 && !inner_map->merged[inner_index])
+ {
+ Assert(outer_merged_index == -1);
+ Assert(!outer_merged);
+ outer_map->merged_indexes[outer_index] = inner_merged_index;
+ outer_map->merged[outer_index] = true;
+ inner_map->merged[inner_index] = true;
+ return inner_merged_index;
+ }
+ return -1;
+}
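+
+/*
+ * Illustrative sketch of the re-mapping case above (hypothetical indexes,
+ * not taken from the sources): suppose list partitions O1 (outer) and I1
+ * (inner) were each merged with a dummy partition earlier, receiving merged
+ * indexes 0 and 1 respectively. If O1 and I1 later turn out to match, both
+ * are re-mapped to the smaller index 0, did_remapping is set on the inner
+ * map, and old_indexes records the abandoned index 1 so that
+ * fix_merged_indexes() can rewrite any list entries still pointing at it.
+ */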
+
+/*
+ * process_outer_partition
+ * Try to assign given outer partition a merged partition, and return the
+ * index of the merged partition if successful, -1 otherwise
+ *
+ * If the partition is newly created, *next_index is incremented. Also, if it
+ * is the default partition of the join relation, *default_index is set to the
+ * index if not already done.
+ */
+static int
+process_outer_partition(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_default,
+ bool inner_has_default,
+ int outer_index,
+ int inner_default,
+ JoinType jointype,
+ int *next_index,
+ int *default_index)
+{
+ int merged_index = -1;
+
+ Assert(outer_index >= 0);
+
+ /*
+ * If the inner side has the default partition, a row from the outer
+ * partition might find its join partner in the default partition; try
+ * merging the outer partition with the default partition. Otherwise,
+ * this should be an outer join, in which case the outer partition has to
+ * be scanned all the way anyway; merge the outer partition with a dummy
+ * partition on the other side.
+ */
+ if (inner_has_default)
+ {
+ Assert(inner_default >= 0);
+
+ /*
+ * If the outer side has the default partition as well, the default
+ * partition on the inner side will have two matching partitions on
+ * the other side: the outer partition and the default partition on
+ * the outer side. Partitionwise join doesn't handle this scenario
+ * yet.
+ */
+ if (outer_has_default)
+ return -1;
+
+ merged_index = merge_matching_partitions(outer_map, inner_map,
+ outer_index, inner_default,
+ next_index);
+ if (merged_index == -1)
+ return -1;
+
+ /*
+ * If this is a FULL join, the default partition on the inner side has
+ * to be scanned all the way anyway, so the resulting partition will
+ * contain all key values from the default partition, which any other
+ * partition of the join relation will not contain. Thus the
+ * resulting partition will act as the default partition of the join
+ * relation; record the index in *default_index if not already done.
+ */
+ if (jointype == JOIN_FULL)
+ {
+ if (*default_index == -1)
+ *default_index = merged_index;
+ else
+ Assert(*default_index == merged_index);
+ }
+ }
+ else
+ {
+ Assert(IS_OUTER_JOIN(jointype));
+ Assert(jointype != JOIN_RIGHT);
+
+ /* If we have already assigned a partition, no need to do anything. */
+ merged_index = outer_map->merged_indexes[outer_index];
+ if (merged_index == -1)
+ merged_index = merge_partition_with_dummy(outer_map, outer_index,
+ next_index);
+ }
+ return merged_index;
+}
+
+/*
+ * process_inner_partition
+ * Try to assign given inner partition a merged partition, and return the
+ * index of the merged partition if successful, -1 otherwise
+ *
+ * If the partition is newly created, *next_index is incremented. Also, if it
+ * is the default partition of the join relation, *default_index is set to the
+ * index if not already done.
+ */
+static int
+process_inner_partition(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_default,
+ bool inner_has_default,
+ int inner_index,
+ int outer_default,
+ JoinType jointype,
+ int *next_index,
+ int *default_index)
+{
+ int merged_index = -1;
+
+ Assert(inner_index >= 0);
+
+ /*
+ * If the outer side has the default partition, a row from the inner
+ * partition might find its join partner in the default partition; try
+ * merging the inner partition with the default partition. Otherwise,
+ * this should be a FULL join, in which case the inner partition has to be
+ * scanned all the way anyway; merge the inner partition with a dummy
+ * partition on the other side.
+ */
+ if (outer_has_default)
+ {
+ Assert(outer_default >= 0);
+
+ /*
+ * If the inner side has the default partition as well, the default
+ * partition on the outer side will have two matching partitions on
+ * the other side: the inner partition and the default partition on
+ * the inner side. Partitionwise join doesn't handle this scenario
+ * yet.
+ */
+ if (inner_has_default)
+ return -1;
+
+ merged_index = merge_matching_partitions(outer_map, inner_map,
+ outer_default, inner_index,
+ next_index);
+ if (merged_index == -1)
+ return -1;
+
+ /*
+ * If this is an outer join, the default partition on the outer side
+ * has to be scanned all the way anyway, so the resulting partition
+ * will contain all key values from the default partition, which any
+ * other partition of the join relation will not contain. Thus the
+ * resulting partition will act as the default partition of the join
+ * relation; record the index in *default_index if not already done.
+ */
+ if (IS_OUTER_JOIN(jointype))
+ {
+ Assert(jointype != JOIN_RIGHT);
+ if (*default_index == -1)
+ *default_index = merged_index;
+ else
+ Assert(*default_index == merged_index);
+ }
+ }
+ else
+ {
+ Assert(jointype == JOIN_FULL);
+
+ /* If we have already assigned a partition, no need to do anything. */
+ merged_index = inner_map->merged_indexes[inner_index];
+ if (merged_index == -1)
+ merged_index = merge_partition_with_dummy(inner_map, inner_index,
+ next_index);
+ }
+ return merged_index;
+}
+
+/*
+ * merge_null_partitions
+ * Merge the NULL partitions from a join's outer and inner sides.
+ *
+ * If the merged partition produced from them is the NULL partition of the join
+ * relation, *null_index is set to the index of the merged partition.
+ *
+ * Note: We assume here that the join clause for a partitioned join is strict
+ * because have_partkey_equi_join() requires that the corresponding operator
+ * be mergejoinable, and we currently assume that mergejoinable operators are
+ * strict (see MJEvalOuterValues()/MJEvalInnerValues()).
+ */
+static void
+merge_null_partitions(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_null,
+ bool inner_has_null,
+ int outer_null,
+ int inner_null,
+ JoinType jointype,
+ int *next_index,
+ int *null_index)
+{
+ bool consider_outer_null = false;
+ bool consider_inner_null = false;
+
+ Assert(outer_has_null || inner_has_null);
+ Assert(*null_index == -1);
+
+ /*
+ * Check whether the NULL partitions have already been merged and if so,
+ * set the consider_outer_null/consider_inner_null flags.
+ */
+ if (outer_has_null)
+ {
+ Assert(outer_null >= 0 && outer_null < outer_map->nparts);
+ if (outer_map->merged_indexes[outer_null] == -1)
+ consider_outer_null = true;
+ }
+ if (inner_has_null)
+ {
+ Assert(inner_null >= 0 && inner_null < inner_map->nparts);
+ if (inner_map->merged_indexes[inner_null] == -1)
+ consider_inner_null = true;
+ }
+
+ /* If both flags are set false, we don't need to do anything. */
+ if (!consider_outer_null && !consider_inner_null)
+ return;
+
+ if (consider_outer_null && !consider_inner_null)
+ {
+ Assert(outer_has_null);
+
+ /*
+ * If this is an outer join, the NULL partition on the outer side has
+ * to be scanned all the way anyway; merge the NULL partition with a
+ * dummy partition on the other side. In that case
+ * consider_outer_null means that the NULL partition contains only
+ * NULL values as the key values, so the merged partition will too;
+ * treat it as the NULL partition of the join relation.
+ */
+ if (IS_OUTER_JOIN(jointype))
+ {
+ Assert(jointype != JOIN_RIGHT);
+ *null_index = merge_partition_with_dummy(outer_map, outer_null,
+ next_index);
+ }
+ }
+ else if (!consider_outer_null && consider_inner_null)
+ {
+ Assert(inner_has_null);
+
+ /*
+ * If this is a FULL join, the NULL partition on the inner side has to
+ * be scanned all the way anyway; merge the NULL partition with a
+ * dummy partition on the other side. In that case
+ * consider_inner_null means that the NULL partition contains only
+ * NULL values as the key values, so the merged partition will too;
+ * treat it as the NULL partition of the join relation.
+ */
+ if (jointype == JOIN_FULL)
+ *null_index = merge_partition_with_dummy(inner_map, inner_null,
+ next_index);
+ }
+ else
+ {
+ Assert(consider_outer_null && consider_inner_null);
+ Assert(outer_has_null);
+ Assert(inner_has_null);
+
+ /*
+ * If this is an outer join, the NULL partition on the outer side (and
+ * that on the inner side if this is a FULL join) have to be scanned
+ * all the way anyway, so merge them. Note that each of the NULL
+ * partitions isn't merged yet, so they should be merged successfully.
+ * Like the above, each of the NULL partitions contains only NULL
+ * values as the key values, so the merged partition will too; treat
+ * it as the NULL partition of the join relation.
+ *
+ * Note: if this is an INNER/SEMI join, the join clause will never be
+ * satisfied by two NULL values (see comments above), so both the NULL
+ * partitions can be eliminated.
+ */
+ if (IS_OUTER_JOIN(jointype))
+ {
+ Assert(jointype != JOIN_RIGHT);
+ *null_index = merge_matching_partitions(outer_map, inner_map,
+ outer_null, inner_null,
+ next_index);
+ Assert(*null_index >= 0);
+ }
+ }
+}
+
+/*
+ * merge_default_partitions
+ * Merge the default partitions from a join's outer and inner sides.
+ *
+ * If the merged partition produced from them is the default partition of the
+ * join relation, *default_index is set to the index of the merged partition.
+ */
+static void
+merge_default_partitions(PartitionMap *outer_map,
+ PartitionMap *inner_map,
+ bool outer_has_default,
+ bool inner_has_default,
+ int outer_default,
+ int inner_default,
+ JoinType jointype,
+ int *next_index,
+ int *default_index)
+{
+ int outer_merged_index = -1;
+ int inner_merged_index = -1;
+
+ Assert(outer_has_default || inner_has_default);
+
+ /* Get the merged partition indexes for the default partitions. */
+ if (outer_has_default)
+ {
+ Assert(outer_default >= 0 && outer_default < outer_map->nparts);
+ outer_merged_index = outer_map->merged_indexes[outer_default];
+ }
+ if (inner_has_default)
+ {
+ Assert(inner_default >= 0 && inner_default < inner_map->nparts);
+ inner_merged_index = inner_map->merged_indexes[inner_default];
+ }
+
+ if (outer_has_default && !inner_has_default)
+ {
+ /*
+ * If this is an outer join, the default partition on the outer side
+ * has to be scanned all the way anyway; if we have not yet assigned a
+ * partition, merge the default partition with a dummy partition on
+ * the other side. The merged partition will act as the default
+ * partition of the join relation (see comments in
+ * process_inner_partition()).
+ */
+ if (IS_OUTER_JOIN(jointype))
+ {
+ Assert(jointype != JOIN_RIGHT);
+ if (outer_merged_index == -1)
+ {
+ Assert(*default_index == -1);
+ *default_index = merge_partition_with_dummy(outer_map,
+ outer_default,
+ next_index);
+ }
+ else
+ Assert(*default_index == outer_merged_index);
+ }
+ else
+ Assert(*default_index == -1);
+ }
+ else if (!outer_has_default && inner_has_default)
+ {
+ /*
+ * If this is a FULL join, the default partition on the inner side has
+ * to be scanned all the way anyway; if we have not yet assigned a
+ * partition, merge the default partition with a dummy partition on
+ * the other side. The merged partition will act as the default
+ * partition of the join relation (see comments in
+ * process_outer_partition()).
+ */
+ if (jointype == JOIN_FULL)
+ {
+ if (inner_merged_index == -1)
+ {
+ Assert(*default_index == -1);
+ *default_index = merge_partition_with_dummy(inner_map,
+ inner_default,
+ next_index);
+ }
+ else
+ Assert(*default_index == inner_merged_index);
+ }
+ else
+ Assert(*default_index == -1);
+ }
+ else
+ {
+ Assert(outer_has_default && inner_has_default);
+
+ /*
+ * The default partitions have to be joined with each other, so merge
+ * them. Note that each of the default partitions isn't merged yet
+ * (see process_outer_partition()/process_inner_partition()), so
+ * they should be merged successfully. The merged partition will act
+ * as the default partition of the join relation.
+ */
+ Assert(outer_merged_index == -1);
+ Assert(inner_merged_index == -1);
+ Assert(*default_index == -1);
+ *default_index = merge_matching_partitions(outer_map,
+ inner_map,
+ outer_default,
+ inner_default,
+ next_index);
+ Assert(*default_index >= 0);
+ }
+}
+
+/*
+ * merge_partition_with_dummy
+ * Assign given partition a new partition of a join relation
+ *
+ * Note: The caller assumes that the given partition doesn't have a non-dummy
+ * matching partition on the other side, but if the given partition finds the
+ * matching partition later, we will adjust the assignment.
+ */
+static int
+merge_partition_with_dummy(PartitionMap *map, int index, int *next_index)
+{
+ int merged_index = *next_index;
+
+ Assert(index >= 0 && index < map->nparts);
+ Assert(map->merged_indexes[index] == -1);
+ Assert(!map->merged[index]);
+ map->merged_indexes[index] = merged_index;
+ /* Leave the merged flag alone! */
+ *next_index = *next_index + 1;
+ return merged_index;
+}
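+
+/*
+ * For illustration (hypothetical example): if outer partition 2 is merged
+ * with a dummy here, merged_indexes[2] is set but merged[2] stays false;
+ * merge_matching_partitions() relies on that to detect that partition 2 may
+ * still be re-merged with a real partner found later.
+ */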
+
+/*
+ * fix_merged_indexes
+ * Adjust merged indexes of re-merged partitions
+ */
+static void
+fix_merged_indexes(PartitionMap *outer_map, PartitionMap *inner_map,
+ int nmerged, List *merged_indexes)
+{
+ int *new_indexes;
+ int merged_index;
+ int i;
+ ListCell *lc;
+
+ Assert(nmerged > 0);
+
+ new_indexes = (int *) palloc(sizeof(int) * nmerged);
+ for (i = 0; i < nmerged; i++)
+ new_indexes[i] = -1;
+
+ /* Build the mapping of old merged indexes to new merged indexes. */
+ if (outer_map->did_remapping)
+ {
+ for (i = 0; i < outer_map->nparts; i++)
+ {
+ merged_index = outer_map->old_indexes[i];
+ if (merged_index >= 0)
+ new_indexes[merged_index] = outer_map->merged_indexes[i];
+ }
+ }
+ if (inner_map->did_remapping)
+ {
+ for (i = 0; i < inner_map->nparts; i++)
+ {
+ merged_index = inner_map->old_indexes[i];
+ if (merged_index >= 0)
+ new_indexes[merged_index] = inner_map->merged_indexes[i];
+ }
+ }
+
+ /* Fix the merged_indexes list using the mapping. */
+ foreach(lc, merged_indexes)
+ {
+ merged_index = lfirst_int(lc);
+ Assert(merged_index >= 0);
+ if (new_indexes[merged_index] >= 0)
+ lfirst_int(lc) = new_indexes[merged_index];
+ }
+
+ pfree(new_indexes);
+}
+
+/*
+ * generate_matching_part_pairs
+ * Generate a pair of lists of partitions that produce merged partitions
+ *
+ * The lists of partitions are built in the order of merged partition indexes,
+ * and returned in *outer_parts and *inner_parts.
+ */
+static void
+generate_matching_part_pairs(RelOptInfo *outer_rel, RelOptInfo *inner_rel,
+ PartitionMap *outer_map, PartitionMap *inner_map,
+ int nmerged,
+ List **outer_parts, List **inner_parts)
+{
+ int outer_nparts = outer_map->nparts;
+ int inner_nparts = inner_map->nparts;
+ int *outer_indexes;
+ int *inner_indexes;
+ int max_nparts;
+ int i;
+
+ Assert(nmerged > 0);
+ Assert(*outer_parts == NIL);
+ Assert(*inner_parts == NIL);
+
+ outer_indexes = (int *) palloc(sizeof(int) * nmerged);
+ inner_indexes = (int *) palloc(sizeof(int) * nmerged);
+ for (i = 0; i < nmerged; i++)
+ outer_indexes[i] = inner_indexes[i] = -1;
+
+ /* Set pairs of matching partitions. */
+ Assert(outer_nparts == outer_rel->nparts);
+ Assert(inner_nparts == inner_rel->nparts);
+ max_nparts = Max(outer_nparts, inner_nparts);
+ for (i = 0; i < max_nparts; i++)
+ {
+ if (i < outer_nparts)
+ {
+ int merged_index = outer_map->merged_indexes[i];
+
+ if (merged_index >= 0)
+ {
+ Assert(merged_index < nmerged);
+ outer_indexes[merged_index] = i;
+ }
+ }
+ if (i < inner_nparts)
+ {
+ int merged_index = inner_map->merged_indexes[i];
+
+ if (merged_index >= 0)
+ {
+ Assert(merged_index < nmerged);
+ inner_indexes[merged_index] = i;
+ }
+ }
+ }
+
+ /* Build the list pairs. */
+ for (i = 0; i < nmerged; i++)
+ {
+ int outer_index = outer_indexes[i];
+ int inner_index = inner_indexes[i];
+
+ /*
+ * If both partitions are dummy, it means the merged partition that
+ * had been assigned to the outer/inner partition was removed when
+ * re-merging the outer/inner partition in
+ * merge_matching_partitions(); ignore the merged partition.
+ */
+ if (outer_index == -1 && inner_index == -1)
+ continue;
+
+ *outer_parts = lappend(*outer_parts, outer_index >= 0 ?
+ outer_rel->part_rels[outer_index] : NULL);
+ *inner_parts = lappend(*inner_parts, inner_index >= 0 ?
+ inner_rel->part_rels[inner_index] : NULL);
+ }
+
+ pfree(outer_indexes);
+ pfree(inner_indexes);
+}
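+
+/*
+ * Example for illustration (hypothetical maps): with outer merged_indexes
+ * {0, 1} and inner merged_indexes {1, 0}, merged partition 0 pairs outer
+ * part 0 with inner part 1, and merged partition 1 pairs outer part 1 with
+ * inner part 0. A merged index that appears on only one side yields a NULL
+ * entry in the other list, which the caller treats as a dummy relation.
+ */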
+
+/*
+ * build_merged_partition_bounds
+ * Create a PartitionBoundInfo struct from merged partition bounds
+ */
+static PartitionBoundInfo
+build_merged_partition_bounds(char strategy, List *merged_datums,
+ List *merged_kinds, List *merged_indexes,
+ int null_index, int default_index)
+{
+ PartitionBoundInfo merged_bounds;
+ int ndatums = list_length(merged_datums);
+ int pos;
+ ListCell *lc;
+
+ merged_bounds = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData));
+ merged_bounds->strategy = strategy;
+ merged_bounds->ndatums = ndatums;
+
+ merged_bounds->datums = (Datum **) palloc(sizeof(Datum *) * ndatums);
+ pos = 0;
+ foreach(lc, merged_datums)
+ merged_bounds->datums[pos++] = (Datum *) lfirst(lc);
+
+ if (strategy == PARTITION_STRATEGY_RANGE)
+ {
+ Assert(list_length(merged_kinds) == ndatums);
+ merged_bounds->kind = (PartitionRangeDatumKind **)
+ palloc(sizeof(PartitionRangeDatumKind *) * ndatums);
+ pos = 0;
+ foreach(lc, merged_kinds)
+ merged_bounds->kind[pos++] = (PartitionRangeDatumKind *) lfirst(lc);
+
+ /* There are ndatums+1 indexes in the case of range partitioning. */
+ merged_indexes = lappend_int(merged_indexes, -1);
+ ndatums++;
+ }
+ else
+ {
+ Assert(strategy == PARTITION_STRATEGY_LIST);
+ Assert(merged_kinds == NIL);
+ merged_bounds->kind = NULL;
+ }
+
+ /* interleaved_parts is always NULL for join relations. */
+ merged_bounds->interleaved_parts = NULL;
+
+ Assert(list_length(merged_indexes) == ndatums);
+ merged_bounds->nindexes = ndatums;
+ merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums);
+ pos = 0;
+ foreach(lc, merged_indexes)
+ merged_bounds->indexes[pos++] = lfirst_int(lc);
+
+ merged_bounds->null_index = null_index;
+ merged_bounds->default_index = default_index;
+
+ return merged_bounds;
+}
+
+/*
+ * get_range_partition
+ * Get the next non-dummy partition of a range-partitioned relation,
+ * returning the index of that partition
+ *
+ * *lb and *ub are set to the lower and upper bounds of that partition
+ * respectively, and *lb_pos is advanced to the next lower bound, if any.
+ */
+static int
+get_range_partition(RelOptInfo *rel,
+ PartitionBoundInfo bi,
+ int *lb_pos,
+ PartitionRangeBound *lb,
+ PartitionRangeBound *ub)
+{
+ int part_index;
+
+ Assert(bi->strategy == PARTITION_STRATEGY_RANGE);
+
+ do
+ {
+ part_index = get_range_partition_internal(bi, lb_pos, lb, ub);
+ if (part_index == -1)
+ return -1;
+ } while (is_dummy_partition(rel, part_index));
+
+ return part_index;
+}
+
+static int
+get_range_partition_internal(PartitionBoundInfo bi,
+ int *lb_pos,
+ PartitionRangeBound *lb,
+ PartitionRangeBound *ub)
+{
+ /* Return the index as -1 if we've exhausted all lower bounds. */
+ if (*lb_pos >= bi->ndatums)
+ return -1;
+
+ /* A lower bound should have at least one more bound after it. */
+ Assert(*lb_pos + 1 < bi->ndatums);
+
+ /* Set the lower bound. */
+ lb->index = bi->indexes[*lb_pos];
+ lb->datums = bi->datums[*lb_pos];
+ lb->kind = bi->kind[*lb_pos];
+ lb->lower = true;
+ /* Set the upper bound. */
+ ub->index = bi->indexes[*lb_pos + 1];
+ ub->datums = bi->datums[*lb_pos + 1];
+ ub->kind = bi->kind[*lb_pos + 1];
+ ub->lower = false;
+
+ /* The index assigned to an upper bound should be valid. */
+ Assert(ub->index >= 0);
+
+ /*
+ * Advance the position to the next lower bound. If there are no bounds
+ * left beyond the upper bound, we have reached the last lower bound.
+ */
+ if (*lb_pos + 2 >= bi->ndatums)
+ *lb_pos = bi->ndatums;
+ else
+ {
+ /*
+ * If the index assigned to the bound next to the upper bound isn't
+ * valid, that is the next lower bound; else, the upper bound is also
+ * the lower bound of the next range partition.
+ */
+ if (bi->indexes[*lb_pos + 2] < 0)
+ *lb_pos = *lb_pos + 2;
+ else
+ *lb_pos = *lb_pos + 1;
+ }
+
+ return ub->index;
+}
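+
+/*
+ * Worked example (illustrative values): for partitions P0 = [10, 20) and
+ * P1 = [30, 40), boundinfo holds datums {10, 20, 30, 40} with indexes
+ * {-1, 0, -1, 1, -1}. Starting at *lb_pos = 0, the first call returns P0
+ * with lb = 10 and ub = 20, and advances *lb_pos to 2 because the bound
+ * after the upper bound has index -1. A contiguous layout [10, 20),
+ * [20, 30) would instead have indexes {-1, 0, 1, -1}, and *lb_pos would
+ * advance by 1 so that 20 is reused as the next lower bound.
+ */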
+
+/*
+ * compare_range_partitions
+ * Compare the bounds of two range partitions, and return true if the
+ * two partitions overlap, false otherwise
+ *
+ * *lb_cmpval is set to -1, 0, or 1 if the outer partition's lower bound is
+ * lower than, equal to, or higher than the inner partition's lower bound
+ * respectively. Likewise, *ub_cmpval is set to -1, 0, or 1 if the outer
+ * partition's upper bound is lower than, equal to, or higher than the inner
+ * partition's upper bound respectively.
+ */
+static bool
+compare_range_partitions(int partnatts, FmgrInfo *partsupfuncs,
+ Oid *partcollations,
+ PartitionRangeBound *outer_lb,
+ PartitionRangeBound *outer_ub,
+ PartitionRangeBound *inner_lb,
+ PartitionRangeBound *inner_ub,
+ int *lb_cmpval, int *ub_cmpval)
+{
+ /*
+ * Check if the outer partition's upper bound is lower than the inner
+ * partition's lower bound; if so the partitions aren't overlapping.
+ */
+ if (compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ outer_ub, inner_lb) < 0)
+ {
+ *lb_cmpval = -1;
+ *ub_cmpval = -1;
+ return false;
+ }
+
+ /*
+ * Check if the outer partition's lower bound is higher than the inner
+ * partition's upper bound; if so the partitions aren't overlapping.
+ */
+ if (compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ outer_lb, inner_ub) > 0)
+ {
+ *lb_cmpval = 1;
+ *ub_cmpval = 1;
+ return false;
+ }
+
+ /* All other cases indicate overlapping partitions. */
+ *lb_cmpval = compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ outer_lb, inner_lb);
+ *ub_cmpval = compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ outer_ub, inner_ub);
+ return true;
+}
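+
+/*
+ * For example (illustrative bounds): with outer = [10, 30) and inner =
+ * [20, 40), neither early-exit test above fires, so the function reports
+ * an overlap with *lb_cmpval = -1 and *ub_cmpval = -1, the outer
+ * partition's bounds both being the lower ones.
+ */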
+
+/*
+ * get_merged_range_bounds
+ * Given the bounds of range partitions to be joined, determine the bounds
+ * of a merged partition produced from the range partitions
+ *
+ * *merged_lb and *merged_ub are set to the lower and upper bounds of the
+ * merged partition.
+ */
+static void
+get_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs,
+ Oid *partcollations, JoinType jointype,
+ PartitionRangeBound *outer_lb,
+ PartitionRangeBound *outer_ub,
+ PartitionRangeBound *inner_lb,
+ PartitionRangeBound *inner_ub,
+ int lb_cmpval, int ub_cmpval,
+ PartitionRangeBound *merged_lb,
+ PartitionRangeBound *merged_ub)
+{
+ Assert(compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ outer_lb, inner_lb) == lb_cmpval);
+ Assert(compare_range_bounds(partnatts, partsupfuncs, partcollations,
+ outer_ub, inner_ub) == ub_cmpval);
+
+ switch (jointype)
+ {
+ case JOIN_INNER:
+ case JOIN_SEMI:
+
+ /*
+ * An INNER/SEMI join will have the rows that fit both sides, so
+ * the lower bound of the merged partition will be the higher of
+ * the two lower bounds, and the upper bound of the merged
+ * partition will be the lower of the two upper bounds.
+ */
+ *merged_lb = (lb_cmpval > 0) ? *outer_lb : *inner_lb;
+ *merged_ub = (ub_cmpval < 0) ? *outer_ub : *inner_ub;
+ break;
+
+ case JOIN_LEFT:
+ case JOIN_ANTI:
+
+ /*
+ * A LEFT/ANTI join will have all the rows from the outer side, so
+ * the bounds of the merged partition will be the same as the
+ * outer bounds.
+ */
+ *merged_lb = *outer_lb;
+ *merged_ub = *outer_ub;
+ break;
+
+ case JOIN_FULL:
+
+ /*
+ * A FULL join will have all the rows from both sides, so the
+ * lower bound of the merged partition will be the lower of the
+ * two lower bounds, and the upper bound of the merged partition
+ * will be the higher of the two upper bounds.
+ */
+ *merged_lb = (lb_cmpval < 0) ? *outer_lb : *inner_lb;
+ *merged_ub = (ub_cmpval > 0) ? *outer_ub : *inner_ub;
+ break;
+
+ default:
+ elog(ERROR, "unrecognized join type: %d", (int) jointype);
+ }
+}
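+
+/*
+ * Continuing the example above (illustrative bounds): joining outer =
+ * [10, 30) with inner = [20, 40) yields a merged partition of [20, 30)
+ * for an INNER/SEMI join, [10, 30) for a LEFT/ANTI join, and [10, 40)
+ * for a FULL join.
+ */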
+
+/*
+ * add_merged_range_bounds
+ * Add the bounds of a merged partition to the lists of range bounds
+ */
+static void
+add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs,
+ Oid *partcollations,
+ PartitionRangeBound *merged_lb,
+ PartitionRangeBound *merged_ub,
+ int merged_index,
+ List **merged_datums,
+ List **merged_kinds,
+ List **merged_indexes)
+{
+ int cmpval;
+
+ if (!*merged_datums)
+ {
+ /* First merged partition */
+ Assert(!*merged_kinds);
+ Assert(!*merged_indexes);
+ cmpval = 1;
+ }
+ else
+ {
+ PartitionRangeBound prev_ub;
+
+ Assert(*merged_datums);
+ Assert(*merged_kinds);
+ Assert(*merged_indexes);
+
+ /* Get the last upper bound. */
+ prev_ub.index = llast_int(*merged_indexes);
+ prev_ub.datums = (Datum *) llast(*merged_datums);
+ prev_ub.kind = (PartitionRangeDatumKind *) llast(*merged_kinds);
+ prev_ub.lower = false;
+
+ /*
+ * We pass lower1 = false to partition_rbound_cmp() to prevent it from
+ * considering the last upper bound to be smaller than the lower bound
+ * of the merged partition when the values of the two range bounds
+ * compare equal.
+ */
+ cmpval = partition_rbound_cmp(partnatts, partsupfuncs, partcollations,
+ merged_lb->datums, merged_lb->kind,
+ false, &prev_ub);
+ Assert(cmpval >= 0);
+ }
+
+ /*
+ * If the lower bound is higher than the last upper bound, add the lower
+ * bound with the index as -1 indicating that it is a lower bound; else,
+ * the last upper bound will be reused as the lower bound of the merged
+ * partition, so skip this.
+ */
+ if (cmpval > 0)
+ {
+ *merged_datums = lappend(*merged_datums, merged_lb->datums);
+ *merged_kinds = lappend(*merged_kinds, merged_lb->kind);
+ *merged_indexes = lappend_int(*merged_indexes, -1);
+ }
+
+ /* Add the upper bound and index of the merged partition. */
+ *merged_datums = lappend(*merged_datums, merged_ub->datums);
+ *merged_kinds = lappend(*merged_kinds, merged_ub->kind);
+ *merged_indexes = lappend_int(*merged_indexes, merged_index);
+}
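+
+/*
+ * Illustration (hypothetical values): if the previous merged partition
+ * ended at 30 and the new one is [30, 40), cmpval above is 0, so the datum
+ * 30 is not added again; the stored sequence stays ..., 30, 40 with the
+ * new partition's index attached to 40. A non-contiguous partition
+ * [50, 60) would instead append 50 with index -1, then 60 with the merged
+ * partition's index.
+ */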
+
+/*
+ * partitions_are_ordered
+ * Determine whether the partitions described by 'boundinfo' are ordered,
+ * that is partitions appearing earlier in the PartitionDesc sequence
+ * contain partition keys strictly less than those appearing later.
+ * Also, if NULL values are possible, they must come in the last
+ * partition defined in the PartitionDesc. 'live_parts' marks which
+ * partitions we should include when checking the ordering. Partitions
+ * that do not appear in 'live_parts' are ignored.
+ *
+ * If out of order, or there is insufficient info to know the order,
+ * then we return false.
+ */
+bool
+partitions_are_ordered(PartitionBoundInfo boundinfo, Bitmapset *live_parts)
+{
+ Assert(boundinfo != NULL);
+
+ switch (boundinfo->strategy)
+ {
+ case PARTITION_STRATEGY_RANGE:
+
+ /*
+ * RANGE-type partitioning guarantees that the partitions can be
+ * scanned in the order that they're defined in the PartitionDesc
+ * to provide sequential, non-overlapping ranges of tuples.
+ * However, if a DEFAULT partition exists and it's contained
+ * within live_parts, then the partitions are not ordered.
+ */
+ if (!partition_bound_has_default(boundinfo) ||
+ !bms_is_member(boundinfo->default_index, live_parts))
+ return true;
+ break;
+
+ case PARTITION_STRATEGY_LIST:
+
+ /*
+ * LIST partitions are ordered provided that none of live_parts
+ * overlaps with the partitioned table's interleaved partitions.
+ */
+ if (!bms_overlap(live_parts, boundinfo->interleaved_parts))
+ return true;
+
+ break;
+ default:
+ /* HASH, or some other strategy */
+ break;
+ }
+
+ return false;
+}
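+
+/*
+ * Example (illustrative): list partitions FOR VALUES IN (1, 3) and
+ * FOR VALUES IN (2) are interleaved, so including them in live_parts
+ * defeats ordering; likewise, a range default partition in live_parts
+ * defeats ordering because its values may lie between those of other
+ * partitions.
+ */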
+
+/*
+ * check_new_partition_bound
+ *
+ * Checks if the new partition's bound overlaps any of the existing partitions
+ * of parent. Also performs additional checks as necessary per strategy.
+ */
+void
+check_new_partition_bound(char *relname, Relation parent,
+ PartitionBoundSpec *spec, ParseState *pstate)
+{
+ PartitionKey key = RelationGetPartitionKey(parent);
+ PartitionDesc partdesc = RelationGetPartitionDesc(parent, false);
+ PartitionBoundInfo boundinfo = partdesc->boundinfo;
+ int with = -1;
+ bool overlap = false;
+ int overlap_location = -1;
+
+ if (spec->is_default)
+ {
+ /*
+ * The default partition bound never conflicts with any other
+ * partition's; if that's what we're attaching, the only possible
+ * problem is that one already exists, so check for that and we're
+ * done.
+ */
+ if (boundinfo == NULL || !partition_bound_has_default(boundinfo))
+ return;
+
+ /* Default partition already exists, error out. */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("partition \"%s\" conflicts with existing default partition \"%s\"",
+ relname, get_rel_name(partdesc->oids[boundinfo->default_index])),
+ parser_errposition(pstate, spec->location)));
+ }
+
+ switch (key->strategy)
+ {
+ case PARTITION_STRATEGY_HASH:
+ {
+ Assert(spec->strategy == PARTITION_STRATEGY_HASH);
+ Assert(spec->remainder >= 0 && spec->remainder < spec->modulus);
+
+ if (partdesc->nparts > 0)
+ {
+ int greatest_modulus;
+ int remainder;
+ int offset;
+
+ /*
+ * Check rule that every modulus must be a factor of the
+ * next larger modulus. (For example, if you have a bunch
+ * of partitions that all have modulus 5, you can add a
+ * new partition with modulus 10 or a new partition with
+ * modulus 15, but you cannot add both a partition with
+ * modulus 10 and a partition with modulus 15, because 10
+ * is not a factor of 15.) We need only check the next
+ * smaller and next larger existing moduli, relying on
+ * previous enforcement of this rule to be sure that the
+ * rest are in line.
+ */
+
+ /*
+ * Get the greatest (modulus, remainder) pair contained in
+ * boundinfo->datums that is less than or equal to the
+ * (spec->modulus, spec->remainder) pair.
+ */
+ offset = partition_hash_bsearch(boundinfo,
+ spec->modulus,
+ spec->remainder);
+ if (offset < 0)
+ {
+ int next_modulus;
+
+ /*
+ * All existing moduli are greater than or equal to the new
+ * one, so the new one must be a factor of the smallest one,
+ * which is first in the boundinfo.
+ */
+ next_modulus = DatumGetInt32(boundinfo->datums[0][0]);
+ if (next_modulus % spec->modulus != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("every hash partition modulus must be a factor of the next larger modulus"),
+ errdetail("The new modulus %d is not a factor of %d, the modulus of existing partition \"%s\".",
+ spec->modulus, next_modulus,
+ get_rel_name(partdesc->oids[0]))));
+ }
+ else
+ {
+ int prev_modulus;
+
+ /*
+ * We found the largest (modulus, remainder) pair less
+ * than or equal to the new one. That modulus must be
+ * a divisor of, or equal to, the new modulus.
+ */
+ prev_modulus = DatumGetInt32(boundinfo->datums[offset][0]);
+
+ if (spec->modulus % prev_modulus != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("every hash partition modulus must be a factor of the next larger modulus"),
+ errdetail("The new modulus %d is not divisible by %d, the modulus of existing partition \"%s\".",
+ spec->modulus,
+ prev_modulus,
+ get_rel_name(partdesc->oids[offset]))));
+
+ if (offset + 1 < boundinfo->ndatums)
+ {
+ int next_modulus;
+
+ /*
+ * Look at the next higher (modulus, remainder)
+ * pair. That could have the same modulus and a
+ * larger remainder than the new pair, in which
+ * case we're good. If it has a larger modulus,
+ * the new modulus must divide that one.
+ */
+ next_modulus = DatumGetInt32(boundinfo->datums[offset + 1][0]);
+
+ if (next_modulus % spec->modulus != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("every hash partition modulus must be a factor of the next larger modulus"),
+ errdetail("The new modulus %d is not a factor of %d, the modulus of existing partition \"%s\".",
+ spec->modulus, next_modulus,
+ get_rel_name(partdesc->oids[offset + 1]))));
+ }
+ }
+
+ greatest_modulus = boundinfo->nindexes;
+ remainder = spec->remainder;
+
+ /*
+ * Normally, the lowest remainder that could conflict with
+ * the new partition is equal to the remainder specified
+ * for the new partition, but when the new partition has a
+ * modulus higher than any used so far, we need to adjust.
+ */
+ if (remainder >= greatest_modulus)
+ remainder = remainder % greatest_modulus;
+
+ /* Check every potentially-conflicting remainder. */
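+ /*
+ * For instance (illustrative numbers): with greatest_modulus = 8,
+ * a new partition with modulus 4 and remainder 3 must check index
+ * slots 3 and 7, since hash values with hash % 8 in {3, 7} all
+ * satisfy hash % 4 == 3.
+ */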
+ do
+ {
+ if (boundinfo->indexes[remainder] != -1)
+ {
+ overlap = true;
+ overlap_location = spec->location;
+ with = boundinfo->indexes[remainder];
+ break;
+ }
+ remainder += spec->modulus;
+ } while (remainder < greatest_modulus);
+ }
+
+ break;
+ }
+
+ case PARTITION_STRATEGY_LIST:
+ {
+ Assert(spec->strategy == PARTITION_STRATEGY_LIST);
+
+ if (partdesc->nparts > 0)
+ {
+ ListCell *cell;
+
+ Assert(boundinfo &&
+ boundinfo->strategy == PARTITION_STRATEGY_LIST &&
+ (boundinfo->ndatums > 0 ||
+ partition_bound_accepts_nulls(boundinfo) ||
+ partition_bound_has_default(boundinfo)));
+
+ foreach(cell, spec->listdatums)
+ {
+ Const *val = lfirst_node(Const, cell);
+
+ overlap_location = val->location;
+ if (!val->constisnull)
+ {
+ int offset;
+ bool equal;
+
+ offset = partition_list_bsearch(&key->partsupfunc[0],
+ key->partcollation,
+ boundinfo,
+ val->constvalue,
+ &equal);
+ if (offset >= 0 && equal)
+ {
+ overlap = true;
+ with = boundinfo->indexes[offset];
+ break;
+ }
+ }
+ else if (partition_bound_accepts_nulls(boundinfo))
+ {
+ overlap = true;
+ with = boundinfo->null_index;
+ break;
+ }
+ }
+ }
+
+ break;
+ }
+
+ case PARTITION_STRATEGY_RANGE:
+ {
+ PartitionRangeBound *lower,
+ *upper;
+ int cmpval;
+
+ Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
+ lower = make_one_partition_rbound(key, -1, spec->lowerdatums, true);
+ upper = make_one_partition_rbound(key, -1, spec->upperdatums, false);
+
+ /*
+ * First check if the resulting range would be empty with
+ * specified lower and upper bounds. partition_rbound_cmp
+ * cannot return zero here, since the lower-bound flags are
+ * different.
+ */
+ cmpval = partition_rbound_cmp(key->partnatts,
+ key->partsupfunc,
+ key->partcollation,
+ lower->datums, lower->kind,
+ true, upper);
+ Assert(cmpval != 0);
+ if (cmpval > 0)
+ {
+ /* Point to problematic key in the lower datums list. */
+ PartitionRangeDatum *datum = list_nth(spec->lowerdatums,
+ cmpval - 1);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("empty range bound specified for partition \"%s\"",
+ relname),
+ errdetail("Specified lower bound %s is greater than or equal to upper bound %s.",
+ get_range_partbound_string(spec->lowerdatums),
+ get_range_partbound_string(spec->upperdatums)),
+ parser_errposition(pstate, datum->location)));
+ }
+
+ if (partdesc->nparts > 0)
+ {
+ int offset;
+
+ Assert(boundinfo &&
+ boundinfo->strategy == PARTITION_STRATEGY_RANGE &&
+ (boundinfo->ndatums > 0 ||
+ partition_bound_has_default(boundinfo)));
+
+ /*
+ * Test whether the new lower bound (which is treated
+ * inclusively as part of the new partition) lies inside
+ * an existing partition, or in a gap.
+ *
+ * If it's inside an existing partition, the bound at
+ * offset + 1 will be the upper bound of that partition,
+ * and its index will be >= 0.
+ *
+ * If it's in a gap, the bound at offset + 1 will be the
+ * lower bound of the next partition, and its index will
+ * be -1. This is also true if there is no next partition,
+ * since the index array is initialised with an extra -1
+ * at the end.
+ */
+ offset = partition_range_bsearch(key->partnatts,
+ key->partsupfunc,
+ key->partcollation,
+ boundinfo, lower,
+ &cmpval);
+
+ if (boundinfo->indexes[offset + 1] < 0)
+ {
+ /*
+ * Check that the new partition will fit in the gap.
+ * For it to fit, the new upper bound must be less
+ * than or equal to the lower bound of the next
+ * partition, if there is one.
+ */
+ if (offset + 1 < boundinfo->ndatums)
+ {
+ Datum *datums;
+ PartitionRangeDatumKind *kind;
+ bool is_lower;
+
+ datums = boundinfo->datums[offset + 1];
+ kind = boundinfo->kind[offset + 1];
+ is_lower = (boundinfo->indexes[offset + 1] == -1);
+
+ cmpval = partition_rbound_cmp(key->partnatts,
+ key->partsupfunc,
+ key->partcollation,
+ datums, kind,
+ is_lower, upper);
+ if (cmpval < 0)
+ {
+ /*
+ * Point to problematic key in the upper
+ * datums list.
+ */
+ PartitionRangeDatum *datum =
+ list_nth(spec->upperdatums, Abs(cmpval) - 1);
+
+ /*
+ * The new partition overlaps with the
+ * existing partition between offset + 1 and
+ * offset + 2.
+ */
+ overlap = true;
+ overlap_location = datum->location;
+ with = boundinfo->indexes[offset + 2];
+ }
+ }
+ }
+ else
+ {
+ /*
+ * The new partition overlaps with the existing
+ * partition between offset and offset + 1.
+ */
+ PartitionRangeDatum *datum;
+
+ /*
+ * Point to problematic key in the lower datums list;
+ * if we have equality, point to the first one.
+ */
+ datum = cmpval == 0 ? linitial(spec->lowerdatums) :
+ list_nth(spec->lowerdatums, Abs(cmpval) - 1);
+ overlap = true;
+ overlap_location = datum->location;
+ with = boundinfo->indexes[offset + 1];
+ }
+ }
+
+ break;
+ }
+
+ default:
+ elog(ERROR, "unexpected partition strategy: %d",
+ (int) key->strategy);
+ }
+
+ if (overlap)
+ {
+ Assert(with >= 0);
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("partition \"%s\" would overlap partition \"%s\"",
+ relname, get_rel_name(partdesc->oids[with])),
+ parser_errposition(pstate, overlap_location)));
+ }
+}
+
+/*
+ * check_default_partition_contents
+ *
+ * This function checks if there exists a row in the default partition that
+ * would properly belong to the new partition being added. If it finds one,
+ * it throws an error.
+ */
+void
+check_default_partition_contents(Relation parent, Relation default_rel,
+ PartitionBoundSpec *new_spec)
+{
+ List *new_part_constraints;
+ List *def_part_constraints;
+ List *all_parts;
+ ListCell *lc;
+
+ new_part_constraints = (new_spec->strategy == PARTITION_STRATEGY_LIST)
+ ? get_qual_for_list(parent, new_spec)
+ : get_qual_for_range(parent, new_spec, false);
+ def_part_constraints =
+ get_proposed_default_constraint(new_part_constraints);
+
+ /*
+ * Map the Vars in the constraint expression from parent's attnos to
+ * default_rel's.
+ */
+ def_part_constraints =
+ map_partition_varattnos(def_part_constraints, 1, default_rel,
+ parent);
+
+ /*
+ * If the existing constraints on the default partition imply that it will
+ * not contain any row that would belong to the new partition, we can
+ * avoid scanning the default partition.
+ */
+ if (PartConstraintImpliedByRelConstraint(default_rel, def_part_constraints))
+ {
+ ereport(DEBUG1,
+ (errmsg_internal("updated partition constraint for default partition \"%s\" is implied by existing constraints",
+ RelationGetRelationName(default_rel))));
+ return;
+ }
+
+ /*
+ * Scan the default partition and its subpartitions, and check for rows
+ * that do not satisfy the revised partition constraints.
+ */
+ if (default_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ all_parts = find_all_inheritors(RelationGetRelid(default_rel),
+ AccessExclusiveLock, NULL);
+ else
+ all_parts = list_make1_oid(RelationGetRelid(default_rel));
+
+ foreach(lc, all_parts)
+ {
+ Oid part_relid = lfirst_oid(lc);
+ Relation part_rel;
+ Expr *partition_constraint;
+ EState *estate;
+ ExprState *partqualstate = NULL;
+ Snapshot snapshot;
+ ExprContext *econtext;
+ TableScanDesc scan;
+ MemoryContext oldCxt;
+ TupleTableSlot *tupslot;
+
+ /* Lock already taken above. */
+ if (part_relid != RelationGetRelid(default_rel))
+ {
+ part_rel = table_open(part_relid, NoLock);
+
+ /*
+ * Map the Vars in the constraint expression from default_rel's
+ * attnos to the sub-partition's.
+ */
+ partition_constraint = make_ands_explicit(def_part_constraints);
+ partition_constraint = (Expr *)
+ map_partition_varattnos((List *) partition_constraint, 1,
+ part_rel, default_rel);
+
+ /*
+ * If the partition constraints on default partition child imply
+ * that it will not contain any row that would belong to the new
+ * partition, we can avoid scanning the child table.
+ */
+ if (PartConstraintImpliedByRelConstraint(part_rel,
+ def_part_constraints))
+ {
+ ereport(DEBUG1,
+ (errmsg_internal("updated partition constraint for default partition \"%s\" is implied by existing constraints",
+ RelationGetRelationName(part_rel))));
+
+ table_close(part_rel, NoLock);
+ continue;
+ }
+ }
+ else
+ {
+ part_rel = default_rel;
+ partition_constraint = make_ands_explicit(def_part_constraints);
+ }
+
+ /*
+ * Only RELKIND_RELATION relations (i.e. leaf partitions) need to be
+ * scanned.
+ */
+ if (part_rel->rd_rel->relkind != RELKIND_RELATION)
+ {
+ if (part_rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+ ereport(WARNING,
+ (errcode(ERRCODE_CHECK_VIOLATION),
+ errmsg("skipped scanning foreign table \"%s\" which is a partition of default partition \"%s\"",
+ RelationGetRelationName(part_rel),
+ RelationGetRelationName(default_rel))));
+
+ if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel))
+ table_close(part_rel, NoLock);
+
+ continue;
+ }
+
+ estate = CreateExecutorState();
+
+ /* Build expression execution states for partition check quals */
+ partqualstate = ExecPrepareExpr(partition_constraint, estate);
+
+ econtext = GetPerTupleExprContext(estate);
+ snapshot = RegisterSnapshot(GetLatestSnapshot());
+ tupslot = table_slot_create(part_rel, &estate->es_tupleTable);
+ scan = table_beginscan(part_rel, snapshot, 0, NULL);
+
+ /*
+ * Switch to per-tuple memory context and reset it for each tuple
+ * produced, so we don't leak memory.
+ */
+ oldCxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+
+ while (table_scan_getnextslot(scan, ForwardScanDirection, tupslot))
+ {
+ econtext->ecxt_scantuple = tupslot;
+
+ if (!ExecCheck(partqualstate, econtext))
+ ereport(ERROR,
+ (errcode(ERRCODE_CHECK_VIOLATION),
+ errmsg("updated partition constraint for default partition \"%s\" would be violated by some row",
+ RelationGetRelationName(default_rel)),
+ errtable(default_rel)));
+
+ ResetExprContext(econtext);
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ MemoryContextSwitchTo(oldCxt);
+ table_endscan(scan);
+ UnregisterSnapshot(snapshot);
+ ExecDropSingleTupleTableSlot(tupslot);
+ FreeExecutorState(estate);
+
+ if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel))
+ table_close(part_rel, NoLock); /* keep the lock until commit */
+ }
+}
+
+/*
+ * get_hash_partition_greatest_modulus
+ *
+ * Returns the greatest modulus of the hash partition bound.
+ * This is no longer used in the core code, but we keep it around
+ * in case external modules are using it.
+ */
+int
+get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
+{
+ Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
+ return bound->nindexes;
+}
+
+/*
+ * make_one_partition_rbound
+ *
+ * Return a PartitionRangeBound given a list of PartitionRangeDatum elements
+ * and a flag telling whether the bound is lower or not. Made into a function
+ * because there are multiple sites that want to use this facility.
+ */
+static PartitionRangeBound *
+make_one_partition_rbound(PartitionKey key, int index, List *datums, bool lower)
+{
+ PartitionRangeBound *bound;
+ ListCell *lc;
+ int i;
+
+ Assert(datums != NIL);
+
+ bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound));
+ bound->index = index;
+ bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum));
+ bound->kind = (PartitionRangeDatumKind *) palloc0(key->partnatts *
+ sizeof(PartitionRangeDatumKind));
+ bound->lower = lower;
+
+ i = 0;
+ foreach(lc, datums)
+ {
+ PartitionRangeDatum *datum = lfirst_node(PartitionRangeDatum, lc);
+
+ /* What's contained in this range datum? */
+ bound->kind[i] = datum->kind;
+
+ if (datum->kind == PARTITION_RANGE_DATUM_VALUE)
+ {
+ Const *val = castNode(Const, datum->value);
+
+ if (val->constisnull)
+ elog(ERROR, "invalid range bound datum");
+ bound->datums[i] = val->constvalue;
+ }
+
+ i++;
+ }
+
+ return bound;
+}
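+
+/*
+ * Illustrative note: for a bound such as FROM (10, MINVALUE), only
+ * datums[0] is set; datums[1] is left zeroed because kind[1] is
+ * PARTITION_RANGE_DATUM_MINVALUE, and comparisons consult kind[] before
+ * ever looking at the datum.
+ */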
+
+/*
+ * partition_rbound_cmp
+ *
+ * For two range bounds this decides whether the 1st one (specified by
+ * datums1, kind1, and lower1) is <, =, or > the bound specified in *b2.
+ *
+ * 0 is returned if they are equal, otherwise a non-zero integer whose sign
+ * indicates the ordering, and whose absolute value gives the 1-based
+ * partition key number of the first mismatching column.
+ *
+ * partnatts, partsupfunc and partcollation give the number of attributes in
+ * the bounds to be compared, the comparison functions to be used, and the
+ * collations of the attributes, respectively.
+ *
+ * Note that if the values of the two range bounds compare equal, then we take
+ * into account whether they are upper or lower bounds, and an upper bound is
+ * considered to be smaller than a lower bound. This is important to the way
+ * that RelationBuildPartitionDesc() builds the PartitionBoundInfoData
+ * structure, which only stores the upper bound of a common boundary between
+ * two contiguous partitions.
+ */
+static int32
+partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc,
+ Oid *partcollation,
+ Datum *datums1, PartitionRangeDatumKind *kind1,
+ bool lower1, PartitionRangeBound *b2)
+{
+ int32 colnum = 0;
+ int32 cmpval = 0; /* placate compiler */
+ int i;
+ Datum *datums2 = b2->datums;
+ PartitionRangeDatumKind *kind2 = b2->kind;
+ bool lower2 = b2->lower;
+
+ for (i = 0; i < partnatts; i++)
+ {
+ /* Track column number in case we need it for result */
+ colnum++;
+
+ /*
+ * First, handle cases where the column is unbounded, which should not
+ * invoke the comparison procedure, and should not consider any later
+ * columns. Note that the PartitionRangeDatumKind enum elements
+ * compare the same way as the values they represent.
+ */
+ if (kind1[i] < kind2[i])
+ return -colnum;
+ else if (kind1[i] > kind2[i])
+ return colnum;
+ else if (kind1[i] != PARTITION_RANGE_DATUM_VALUE)
+ {
+ /*
+ * The column bounds are both MINVALUE or both MAXVALUE. No later
+ * columns should be considered, but we still need to compare
+ * whether they are upper or lower bounds.
+ */
+ break;
+ }
+
+ cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i],
+ partcollation[i],
+ datums1[i],
+ datums2[i]));
+ if (cmpval != 0)
+ break;
+ }
+
+ /*
+ * If the comparison is anything other than equal, we're done. If they
+ * compare equal though, we still have to consider whether the boundaries
+ * are inclusive or exclusive. The exclusive one is considered the
+ * smaller of the two.
+ */
+ if (cmpval == 0 && lower1 != lower2)
+ cmpval = lower1 ? 1 : -1;
+
+ return cmpval == 0 ? 0 : (cmpval < 0 ? -colnum : colnum);
+}
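+
+/*
+ * Example for illustration: comparing two-column bounds (10, 20) and
+ * (10, 30) returns -2; the sign says the first bound sorts lower, and the
+ * absolute value says column 2 is the first mismatching column.
+ */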
+
+/*
+ * partition_rbound_datum_cmp
+ *
+ * Return whether range bound (specified in rb_datums and rb_kind)
+ * is <, =, or > partition key of tuple (tuple_datums)
+ *
+ * n_tuple_datums, partsupfunc and partcollation give the number of attributes
+ * in the bounds to be compared, the comparison functions to be used, and the
+ * collations of the attributes, respectively.
+ */
+int32
+partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation,
+ Datum *rb_datums, PartitionRangeDatumKind *rb_kind,
+ Datum *tuple_datums, int n_tuple_datums)
+{
+ int i;
+ int32 cmpval = -1;
+
+ for (i = 0; i < n_tuple_datums; i++)
+ {
+ if (rb_kind[i] == PARTITION_RANGE_DATUM_MINVALUE)
+ return -1;
+ else if (rb_kind[i] == PARTITION_RANGE_DATUM_MAXVALUE)
+ return 1;
+
+ cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i],
+ partcollation[i],
+ rb_datums[i],
+ tuple_datums[i]));
+ if (cmpval != 0)
+ break;
+ }
+
+ return cmpval;
+}
+
+/*
+ * partition_hbound_cmp
+ *
+ * Compares modulus first, then remainder if modulus is equal.
+ */
+static int32
+partition_hbound_cmp(int modulus1, int remainder1, int modulus2, int remainder2)
+{
+ if (modulus1 < modulus2)
+ return -1;
+ if (modulus1 > modulus2)
+ return 1;
+ if (modulus1 == modulus2 && remainder1 != remainder2)
+ return (remainder1 > remainder2) ? 1 : -1;
+ return 0;
+}
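+
+/*
+ * For illustration: under this ordering, (2, 0) < (2, 1) < (4, 0) < (4, 3),
+ * which is the order in which hash bounds appear in boundinfo->datums.
+ */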
+
+/*
+ * partition_list_bsearch
+ * Returns the index of the greatest bound datum that is less than or equal
+ * to the given value or -1 if all of the bound datums are greater
+ *
+ * *is_equal is set to true if the bound datum at the returned index is equal
+ * to the input value.
+ */
+int
+partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation,
+ PartitionBoundInfo boundinfo,
+ Datum value, bool *is_equal)
+{
+ int lo,
+ hi,
+ mid;
+
+ lo = -1;
+ hi = boundinfo->ndatums - 1;
+ while (lo < hi)
+ {
+ int32 cmpval;
+
+ mid = (lo + hi + 1) / 2;
+ cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[0],
+ partcollation[0],
+ boundinfo->datums[mid][0],
+ value));
+ if (cmpval <= 0)
+ {
+ lo = mid;
+ *is_equal = (cmpval == 0);
+ if (*is_equal)
+ break;
+ }
+ else
+ hi = mid - 1;
+ }
+
+ return lo;
+}
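+
+/*
+ * Worked example (illustrative datums): with bound datums {10, 20, 30},
+ * probing for 25 returns index 1 with *is_equal = false, probing for 20
+ * returns 1 with *is_equal = true, and probing for 5 returns -1, leaving
+ * *is_equal untouched.
+ */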
+
+/*
+ * partition_range_bsearch
+ * Returns the index of the greatest range bound that is less than or
+ * equal to the given range bound or -1 if all of the range bounds are
+ * greater
+ *
+ * Upon return from this function, *cmpval is set to 0 if the bound at the
+ * returned index matches the input range bound exactly, otherwise a
+ * non-zero integer whose sign indicates the ordering, and whose absolute
+ * value gives the 1-based partition key number of the first mismatching
+ * column.
+ */
+static int
+partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
+ Oid *partcollation,
+ PartitionBoundInfo boundinfo,
+ PartitionRangeBound *probe, int32 *cmpval)
+{
+ int lo,
+ hi,
+ mid;
+
+ lo = -1;
+ hi = boundinfo->ndatums - 1;
+ while (lo < hi)
+ {
+ mid = (lo + hi + 1) / 2;
+ *cmpval = partition_rbound_cmp(partnatts, partsupfunc,
+ partcollation,
+ boundinfo->datums[mid],
+ boundinfo->kind[mid],
+ (boundinfo->indexes[mid] == -1),
+ probe);
+ if (*cmpval <= 0)
+ {
+ lo = mid;
+ if (*cmpval == 0)
+ break;
+ }
+ else
+ hi = mid - 1;
+ }
+
+ return lo;
+}
+
+/*
+ * partition_range_datum_bsearch
+ * Returns the index of the greatest range bound that is less than or
+ * equal to the given tuple or -1 if all of the range bounds are greater
+ *
+ * *is_equal is set to true if the range bound at the returned index is equal
+ * to the input tuple.
+ */
+int
+partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation,
+ PartitionBoundInfo boundinfo,
+ int nvalues, Datum *values, bool *is_equal)
+{
+ int lo,
+ hi,
+ mid;
+
+ lo = -1;
+ hi = boundinfo->ndatums - 1;
+ while (lo < hi)
+ {
+ int32 cmpval;
+
+ mid = (lo + hi + 1) / 2;
+ cmpval = partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[mid],
+ boundinfo->kind[mid],
+ values,
+ nvalues);
+ if (cmpval <= 0)
+ {
+ lo = mid;
+ *is_equal = (cmpval == 0);
+
+ if (*is_equal)
+ break;
+ }
+ else
+ hi = mid - 1;
+ }
+
+ return lo;
+}
+
+/*
+ * partition_hash_bsearch
+ * Returns the index of the greatest (modulus, remainder) pair that is
+ * less than or equal to the given (modulus, remainder) pair or -1 if
+ * all of them are greater
+ */
+int
+partition_hash_bsearch(PartitionBoundInfo boundinfo,
+ int modulus, int remainder)
+{
+ int lo,
+ hi,
+ mid;
+
+ lo = -1;
+ hi = boundinfo->ndatums - 1;
+ while (lo < hi)
+ {
+ int32 cmpval,
+ bound_modulus,
+ bound_remainder;
+
+ mid = (lo + hi + 1) / 2;
+ bound_modulus = DatumGetInt32(boundinfo->datums[mid][0]);
+ bound_remainder = DatumGetInt32(boundinfo->datums[mid][1]);
+ cmpval = partition_hbound_cmp(bound_modulus, bound_remainder,
+ modulus, remainder);
+ if (cmpval <= 0)
+ {
+ lo = mid;
+
+ if (cmpval == 0)
+ break;
+ }
+ else
+ hi = mid - 1;
+ }
+
+ return lo;
+}
+
+/*
+ * qsort_partition_hbound_cmp
+ *
+ * Hash bounds are sorted by modulus, then by remainder.
+ */
+static int32
+qsort_partition_hbound_cmp(const void *a, const void *b)
+{
+ const PartitionHashBound *h1 = (const PartitionHashBound *) a;
+ const PartitionHashBound *h2 = (const PartitionHashBound *) b;
+
+ return partition_hbound_cmp(h1->modulus, h1->remainder,
+ h2->modulus, h2->remainder);
+}
+
+/*
+ * qsort_partition_list_value_cmp
+ *
+ * Compare two list partition bound datums.
+ */
+static int32
+qsort_partition_list_value_cmp(const void *a, const void *b, void *arg)
+{
+ Datum val1 = ((const PartitionListValue *) a)->value,
+ val2 = ((const PartitionListValue *) b)->value;
+ PartitionKey key = (PartitionKey) arg;
+
+ return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
+ key->partcollation[0],
+ val1, val2));
+}
+
+/*
+ * qsort_partition_rbound_cmp
+ *
+ * Used when sorting range bounds across all range partitions.
+ */
+static int32
+qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
+{
+ PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a);
+ PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b);
+ PartitionKey key = (PartitionKey) arg;
+
+ return compare_range_bounds(key->partnatts, key->partsupfunc,
+ key->partcollation,
+ b1, b2);
+}
+
+/*
+ * get_partition_operator
+ *
+ * Return oid of the operator of the given strategy for the given partition
+ * key column. It is assumed that the partitioning key is of the same type as
+ * the chosen partitioning opclass, or at least binary-compatible. In the
+ * latter case, *need_relabel is set to true if the opclass is not of a
+ * polymorphic type (indicating a RelabelType node needed on top), otherwise
+ * false.
+ */
+static Oid
+get_partition_operator(PartitionKey key, int col, StrategyNumber strategy,
+ bool *need_relabel)
+{
+ Oid operoid;
+
+ /*
+ * Get the operator in the partitioning opfamily using the opclass'
+ * declared input type as both left- and righttype.
+ */
+ operoid = get_opfamily_member(key->partopfamily[col],
+ key->partopcintype[col],
+ key->partopcintype[col],
+ strategy);
+ if (!OidIsValid(operoid))
+ elog(ERROR, "missing operator %d(%u,%u) in partition opfamily %u",
+ strategy, key->partopcintype[col], key->partopcintype[col],
+ key->partopfamily[col]);
+
+ /*
+ * If the partition key column is not of the same type as the operator
+ * class and not polymorphic, tell caller to wrap the non-Const expression
+ * in a RelabelType. This matches what parse_coerce.c does.
+ */
+ *need_relabel = (key->parttypid[col] != key->partopcintype[col] &&
+ key->partopcintype[col] != RECORDOID &&
+ !IsPolymorphicType(key->partopcintype[col]));
+
+ return operoid;
+}
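+
+/*
+ * For instance, for an int4 partition key column using the default btree
+ * operator class, asking for BTLessStrategyNumber here yields the int4 "<"
+ * operator of the integer_ops opfamily.
+ */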
+
+/*
+ * make_partition_op_expr
+ * Returns an Expr for the given partition key column with arg1 and
+ * arg2 as its leftop and rightop, respectively
+ */
+static Expr *
+make_partition_op_expr(PartitionKey key, int keynum,
+ uint16 strategy, Expr *arg1, Expr *arg2)
+{
+ Oid operoid;
+ bool need_relabel = false;
+ Expr *result = NULL;
+
+ /* Get the correct btree operator for this partitioning column */
+ operoid = get_partition_operator(key, keynum, strategy, &need_relabel);
+
+	/*
+	 * The chosen operator may be such that the non-Const operand needs to be
+	 * coerced, so apply the coercion here; see the comment in
+	 * get_partition_operator().
+	 */
+ if (!IsA(arg1, Const) &&
+ (need_relabel ||
+ key->partcollation[keynum] != key->parttypcoll[keynum]))
+ arg1 = (Expr *) makeRelabelType(arg1,
+ key->partopcintype[keynum],
+ -1,
+ key->partcollation[keynum],
+ COERCE_EXPLICIT_CAST);
+
+ /* Generate the actual expression */
+ switch (key->strategy)
+ {
+ case PARTITION_STRATEGY_LIST:
+ {
+ List *elems = (List *) arg2;
+ int nelems = list_length(elems);
+
+ Assert(nelems >= 1);
+ Assert(keynum == 0);
+
+ if (nelems > 1 &&
+ !type_is_array(key->parttypid[keynum]))
+ {
+ ArrayExpr *arrexpr;
+ ScalarArrayOpExpr *saopexpr;
+
+ /* Construct an ArrayExpr for the right-hand inputs */
+ arrexpr = makeNode(ArrayExpr);
+ arrexpr->array_typeid =
+ get_array_type(key->parttypid[keynum]);
+ arrexpr->array_collid = key->parttypcoll[keynum];
+ arrexpr->element_typeid = key->parttypid[keynum];
+ arrexpr->elements = elems;
+ arrexpr->multidims = false;
+ arrexpr->location = -1;
+
+ /* Build leftop = ANY (rightop) */
+ saopexpr = makeNode(ScalarArrayOpExpr);
+ saopexpr->opno = operoid;
+ saopexpr->opfuncid = get_opcode(operoid);
+ saopexpr->hashfuncid = InvalidOid;
+ saopexpr->negfuncid = InvalidOid;
+ saopexpr->useOr = true;
+ saopexpr->inputcollid = key->partcollation[keynum];
+ saopexpr->args = list_make2(arg1, arrexpr);
+ saopexpr->location = -1;
+
+ result = (Expr *) saopexpr;
+ }
+ else
+ {
+ List *elemops = NIL;
+ ListCell *lc;
+
+ foreach(lc, elems)
+ {
+ Expr *elem = lfirst(lc),
+ *elemop;
+
+ elemop = make_opclause(operoid,
+ BOOLOID,
+ false,
+ arg1, elem,
+ InvalidOid,
+ key->partcollation[keynum]);
+ elemops = lappend(elemops, elemop);
+ }
+
+ result = nelems > 1 ? makeBoolExpr(OR_EXPR, elemops, -1) : linitial(elemops);
+ }
+ break;
+ }
+
+ case PARTITION_STRATEGY_RANGE:
+ result = make_opclause(operoid,
+ BOOLOID,
+ false,
+ arg1, arg2,
+ InvalidOid,
+ key->partcollation[keynum]);
+ break;
+
+ default:
+ elog(ERROR, "invalid partitioning strategy");
+ break;
+ }
+
+ return result;
+}
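+
+/*
+ * For illustration: for a list-partitioned column "a" of type int4 with
+ * allowed values (1, 2, 3), the LIST branch above produces the expression
+ * a = ANY (ARRAY[1, 2, 3]), while the RANGE branch produces a simple
+ * OpExpr such as a < 10.
+ */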
+
+/*
+ * get_qual_for_hash
+ *
+ * Returns a CHECK constraint expression to use as a hash partition's
+ * constraint, given the parent relation and partition bound structure.
+ *
+ * The partition constraint for a hash partition is always a call to the
+ * built-in function satisfies_hash_partition().
+ */
+static List *
+get_qual_for_hash(Relation parent, PartitionBoundSpec *spec)
+{
+ PartitionKey key = RelationGetPartitionKey(parent);
+ FuncExpr *fexpr;
+ Node *relidConst;
+ Node *modulusConst;
+ Node *remainderConst;
+ List *args;
+ ListCell *partexprs_item;
+ int i;
+
+ /* Fixed arguments. */
+ relidConst = (Node *) makeConst(OIDOID,
+ -1,
+ InvalidOid,
+ sizeof(Oid),
+ ObjectIdGetDatum(RelationGetRelid(parent)),
+ false,
+ true);
+
+ modulusConst = (Node *) makeConst(INT4OID,
+ -1,
+ InvalidOid,
+ sizeof(int32),
+ Int32GetDatum(spec->modulus),
+ false,
+ true);
+
+ remainderConst = (Node *) makeConst(INT4OID,
+ -1,
+ InvalidOid,
+ sizeof(int32),
+ Int32GetDatum(spec->remainder),
+ false,
+ true);
+
+ args = list_make3(relidConst, modulusConst, remainderConst);
+ partexprs_item = list_head(key->partexprs);
+
+ /* Add an argument for each key column. */
+ for (i = 0; i < key->partnatts; i++)
+ {
+ Node *keyCol;
+
+ /* Left operand */
+ if (key->partattrs[i] != 0)
+ {
+ keyCol = (Node *) makeVar(1,
+ key->partattrs[i],
+ key->parttypid[i],
+ key->parttypmod[i],
+ key->parttypcoll[i],
+ 0);
+ }
+ else
+ {
+ keyCol = (Node *) copyObject(lfirst(partexprs_item));
+ partexprs_item = lnext(key->partexprs, partexprs_item);
+ }
+
+ args = lappend(args, keyCol);
+ }
+
+ fexpr = makeFuncExpr(F_SATISFIES_HASH_PARTITION,
+ BOOLOID,
+ args,
+ InvalidOid,
+ InvalidOid,
+ COERCE_EXPLICIT_CALL);
+
+ return list_make1(fexpr);
+}
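+
+/*
+ * For example, a partition created with FOR VALUES WITH (MODULUS 4,
+ * REMAINDER 0) on a parent hash-partitioned by (a) gets a constraint of
+ * the form
+ *
+ *     satisfies_hash_partition('<parent-oid>'::oid, 4, 0, a)
+ */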
+
+/*
+ * get_qual_for_list
+ *
+ * Returns an implicit-AND list of expressions to use as a list partition's
+ * constraint, given the parent relation and partition bound structure.
+ *
+ * The function returns NIL for a default partition when it is the only
+ * partition, since in that case there is no constraint.
+ */
+static List *
+get_qual_for_list(Relation parent, PartitionBoundSpec *spec)
+{
+ PartitionKey key = RelationGetPartitionKey(parent);
+ List *result;
+ Expr *keyCol;
+ Expr *opexpr;
+ NullTest *nulltest;
+ ListCell *cell;
+ List *elems = NIL;
+ bool list_has_null = false;
+
+ /*
+ * Only single-column list partitioning is supported, so we are worried
+ * only about the partition key with index 0.
+ */
+ Assert(key->partnatts == 1);
+
+ /* Construct Var or expression representing the partition column */
+ if (key->partattrs[0] != 0)
+ keyCol = (Expr *) makeVar(1,
+ key->partattrs[0],
+ key->parttypid[0],
+ key->parttypmod[0],
+ key->parttypcoll[0],
+ 0);
+ else
+ keyCol = (Expr *) copyObject(linitial(key->partexprs));
+
+ /*
+ * For default list partition, collect datums for all the partitions. The
+ * default partition constraint should check that the partition key is
+ * equal to none of those.
+ */
+ if (spec->is_default)
+ {
+ int i;
+ int ndatums = 0;
+ PartitionDesc pdesc = RelationGetPartitionDesc(parent, false);
+ PartitionBoundInfo boundinfo = pdesc->boundinfo;
+
+ if (boundinfo)
+ {
+ ndatums = boundinfo->ndatums;
+
+ if (partition_bound_accepts_nulls(boundinfo))
+ list_has_null = true;
+ }
+
+ /*
+ * If default is the only partition, there need not be any partition
+ * constraint on it.
+ */
+ if (ndatums == 0 && !list_has_null)
+ return NIL;
+
+ for (i = 0; i < ndatums; i++)
+ {
+ Const *val;
+
+ /*
+ * Construct Const from known-not-null datum. We must be careful
+ * to copy the value, because our result has to be able to outlive
+ * the relcache entry we're copying from.
+ */
+ val = makeConst(key->parttypid[0],
+ key->parttypmod[0],
+ key->parttypcoll[0],
+ key->parttyplen[0],
+ datumCopy(*boundinfo->datums[i],
+ key->parttypbyval[0],
+ key->parttyplen[0]),
+ false, /* isnull */
+ key->parttypbyval[0]);
+
+ elems = lappend(elems, val);
+ }
+ }
+ else
+ {
+ /*
+ * Create list of Consts for the allowed values, excluding any nulls.
+ */
+ foreach(cell, spec->listdatums)
+ {
+ Const *val = lfirst_node(Const, cell);
+
+ if (val->constisnull)
+ list_has_null = true;
+ else
+ elems = lappend(elems, copyObject(val));
+ }
+ }
+
+ if (elems)
+ {
+ /*
+ * Generate the operator expression from the non-null partition
+ * values.
+ */
+ opexpr = make_partition_op_expr(key, 0, BTEqualStrategyNumber,
+ keyCol, (Expr *) elems);
+ }
+ else
+ {
+ /*
+ * If there are no partition values, we don't need an operator
+ * expression.
+ */
+ opexpr = NULL;
+ }
+
+ if (!list_has_null)
+ {
+ /*
+ * Gin up a "col IS NOT NULL" test that will be ANDed with the main
+ * expression. This might seem redundant, but the partition routing
+ * machinery needs it.
+ */
+ nulltest = makeNode(NullTest);
+ nulltest->arg = keyCol;
+ nulltest->nulltesttype = IS_NOT_NULL;
+ nulltest->argisrow = false;
+ nulltest->location = -1;
+
+ result = opexpr ? list_make2(nulltest, opexpr) : list_make1(nulltest);
+ }
+ else
+ {
+ /*
+ * Gin up a "col IS NULL" test that will be OR'd with the main
+ * expression.
+ */
+ nulltest = makeNode(NullTest);
+ nulltest->arg = keyCol;
+ nulltest->nulltesttype = IS_NULL;
+ nulltest->argisrow = false;
+ nulltest->location = -1;
+
+ if (opexpr)
+ {
+ Expr *or;
+
+ or = makeBoolExpr(OR_EXPR, list_make2(nulltest, opexpr), -1);
+ result = list_make1(or);
+ }
+ else
+ result = list_make1(nulltest);
+ }
+
+ /*
+ * Note that, in general, applying NOT to a constraint expression doesn't
+ * necessarily invert the set of rows it accepts, because NOT (NULL) is
+ * NULL. However, the partition constraints we construct here never
+ * evaluate to NULL, so applying NOT works as intended.
+ */
+ if (spec->is_default)
+ {
+ result = list_make1(make_ands_explicit(result));
+ result = list_make1(makeBoolExpr(NOT_EXPR, result, -1));
+ }
+
+ return result;
+}
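+
+/*
+ * For example, for a partition defined with FOR VALUES IN (1, 2), the
+ * above produces the implicit-AND list
+ *
+ *     a IS NOT NULL, a = ANY (ARRAY[1, 2])
+ *
+ * whereas FOR VALUES IN (1, 2, NULL) produces
+ *
+ *     (a IS NULL) OR (a = ANY (ARRAY[1, 2]))
+ */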
+
+/*
+ * get_qual_for_range
+ *
+ * Returns an implicit-AND list of expressions to use as a range partition's
+ * constraint, given the parent relation and partition bound structure.
+ *
+ * For a multi-column range partition key, say (a, b, c), with (al, bl, cl)
+ * as the lower bound tuple and (au, bu, cu) as the upper bound tuple, we
+ * generate an expression tree of the following form:
+ *
+ * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL)
+ * AND
+ * (a > al OR (a = al AND b > bl) OR (a = al AND b = bl AND c >= cl))
+ * AND
+ * (a < au OR (a = au AND b < bu) OR (a = au AND b = bu AND c < cu))
+ *
+ * It is often the case that a prefix of lower and upper bound tuples contains
+ * the same values, for example, (al = au), in which case, we will emit an
+ * expression tree of the following form:
+ *
+ * (a IS NOT NULL) and (b IS NOT NULL) and (c IS NOT NULL)
+ * AND
+ * (a = al)
+ * AND
+ * (b > bl OR (b = bl AND c >= cl))
+ * AND
+ * (b < bu OR (b = bu AND c < cu))
+ *
+ * If a bound datum is either MINVALUE or MAXVALUE, these expressions are
+ * simplified using the fact that any value is greater than MINVALUE and less
+ * than MAXVALUE. So, for example, if cu = MAXVALUE, c < cu is automatically
+ * true, and we need not emit any expression for it, and the last line becomes
+ *
+ * (b < bu) OR (b = bu), which is simplified to (b <= bu)
+ *
+ * In most common cases with only one partition column, say a, the following
+ * expression tree will be generated: a IS NOT NULL AND a >= al AND a < au
+ *
+ * For default partition, it returns the negation of the constraints of all
+ * the other partitions.
+ *
+ * External callers should pass for_default as false; we set it to true only
+ * when recursing.
+ */
+static List *
+get_qual_for_range(Relation parent, PartitionBoundSpec *spec,
+ bool for_default)
+{
+ List *result = NIL;
+ ListCell *cell1,
+ *cell2,
+ *partexprs_item,
+ *partexprs_item_saved;
+ int i,
+ j;
+ PartitionRangeDatum *ldatum,
+ *udatum;
+ PartitionKey key = RelationGetPartitionKey(parent);
+ Expr *keyCol;
+ Const *lower_val,
+ *upper_val;
+ List *lower_or_arms,
+ *upper_or_arms;
+ int num_or_arms,
+ current_or_arm;
+ ListCell *lower_or_start_datum,
+ *upper_or_start_datum;
+ bool need_next_lower_arm,
+ need_next_upper_arm;
+
+ if (spec->is_default)
+ {
+ List *or_expr_args = NIL;
+ PartitionDesc pdesc = RelationGetPartitionDesc(parent, false);
+ Oid *inhoids = pdesc->oids;
+ int nparts = pdesc->nparts,
+ i;
+
+ for (i = 0; i < nparts; i++)
+ {
+ Oid inhrelid = inhoids[i];
+ HeapTuple tuple;
+ Datum datum;
+ bool isnull;
+ PartitionBoundSpec *bspec;
+
+ tuple = SearchSysCache1(RELOID, inhrelid);
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for relation %u", inhrelid);
+
+ datum = SysCacheGetAttr(RELOID, tuple,
+ Anum_pg_class_relpartbound,
+ &isnull);
+ if (isnull)
+ elog(ERROR, "null relpartbound for relation %u", inhrelid);
+
+ bspec = (PartitionBoundSpec *)
+ stringToNode(TextDatumGetCString(datum));
+ if (!IsA(bspec, PartitionBoundSpec))
+ elog(ERROR, "expected PartitionBoundSpec");
+
+ if (!bspec->is_default)
+ {
+ List *part_qual;
+
+ part_qual = get_qual_for_range(parent, bspec, true);
+
+ /*
+ * AND the constraints of the partition and add to
+ * or_expr_args
+ */
+ or_expr_args = lappend(or_expr_args, list_length(part_qual) > 1
+ ? makeBoolExpr(AND_EXPR, part_qual, -1)
+ : linitial(part_qual));
+ }
+ ReleaseSysCache(tuple);
+ }
+
+ if (or_expr_args != NIL)
+ {
+ Expr *other_parts_constr;
+
+			/*
+			 * Combine the constraints obtained for non-default partitions
+			 * using OR.  As requested via for_default = true, each of the
+			 * OR's arguments omits the NOT NULL tests for the partition
+			 * keys (to avoid repeating them uselessly); add them once here.
+			 */
+ other_parts_constr =
+ makeBoolExpr(AND_EXPR,
+ lappend(get_range_nulltest(key),
+ list_length(or_expr_args) > 1
+ ? makeBoolExpr(OR_EXPR, or_expr_args,
+ -1)
+ : linitial(or_expr_args)),
+ -1);
+
+ /*
+ * Finally, the default partition contains everything *NOT*
+ * contained in the non-default partitions.
+ */
+ result = list_make1(makeBoolExpr(NOT_EXPR,
+ list_make1(other_parts_constr), -1));
+ }
+
+ return result;
+ }
+
+	/*
+	 * If this is the recursive call for the default partition, skip the
+	 * get_range_nulltest() call, to avoid accumulating NullTests on the same
+	 * keys for each partition.
+	 */
+ if (!for_default)
+ result = get_range_nulltest(key);
+
+ /*
+ * Iterate over the key columns and check if the corresponding lower and
+ * upper datums are equal using the btree equality operator for the
+ * column's type. If equal, we emit single keyCol = common_value
+ * expression. Starting from the first column for which the corresponding
+ * lower and upper bound datums are not equal, we generate OR expressions
+ * as shown in the function's header comment.
+ */
+ i = 0;
+ partexprs_item = list_head(key->partexprs);
+ partexprs_item_saved = partexprs_item; /* placate compiler */
+ forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
+ {
+ EState *estate;
+ MemoryContext oldcxt;
+ Expr *test_expr;
+ ExprState *test_exprstate;
+ Datum test_result;
+ bool isNull;
+
+ ldatum = lfirst_node(PartitionRangeDatum, cell1);
+ udatum = lfirst_node(PartitionRangeDatum, cell2);
+
+ /*
+ * Since get_range_key_properties() modifies partexprs_item, and we
+ * might need to start over from the previous expression in the later
+ * part of this function, save away the current value.
+ */
+ partexprs_item_saved = partexprs_item;
+
+ get_range_key_properties(key, i, ldatum, udatum,
+ &partexprs_item,
+ &keyCol,
+ &lower_val, &upper_val);
+
+ /*
+ * If either value is NULL, the corresponding partition bound is
+ * either MINVALUE or MAXVALUE, and we treat them as unequal, because
+ * even if they're the same, there is no common value to equate the
+ * key column with.
+ */
+ if (!lower_val || !upper_val)
+ break;
+
+ /* Create the test expression */
+ estate = CreateExecutorState();
+ oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
+ test_expr = make_partition_op_expr(key, i, BTEqualStrategyNumber,
+ (Expr *) lower_val,
+ (Expr *) upper_val);
+ fix_opfuncids((Node *) test_expr);
+ test_exprstate = ExecInitExpr(test_expr, NULL);
+ test_result = ExecEvalExprSwitchContext(test_exprstate,
+ GetPerTupleExprContext(estate),
+ &isNull);
+ MemoryContextSwitchTo(oldcxt);
+ FreeExecutorState(estate);
+
+ /* If not equal, go generate the OR expressions */
+ if (!DatumGetBool(test_result))
+ break;
+
+ /*
+ * The bounds for the last key column can't be equal, because such a
+ * range partition would never be allowed to be defined (it would have
+ * an empty range otherwise).
+ */
+ if (i == key->partnatts - 1)
+ elog(ERROR, "invalid range bound specification");
+
+ /* Equal, so generate keyCol = lower_val expression */
+ result = lappend(result,
+ make_partition_op_expr(key, i, BTEqualStrategyNumber,
+ keyCol, (Expr *) lower_val));
+
+ i++;
+ }
+
+ /* First pair of lower_val and upper_val that are not equal. */
+ lower_or_start_datum = cell1;
+ upper_or_start_datum = cell2;
+
+ /* OR will have as many arms as there are key columns left. */
+ num_or_arms = key->partnatts - i;
+ current_or_arm = 0;
+ lower_or_arms = upper_or_arms = NIL;
+ need_next_lower_arm = need_next_upper_arm = true;
+ while (current_or_arm < num_or_arms)
+ {
+ List *lower_or_arm_args = NIL,
+ *upper_or_arm_args = NIL;
+
+ /* Restart scan of columns from the i'th one */
+ j = i;
+ partexprs_item = partexprs_item_saved;
+
+ for_both_cell(cell1, spec->lowerdatums, lower_or_start_datum,
+ cell2, spec->upperdatums, upper_or_start_datum)
+ {
+ PartitionRangeDatum *ldatum_next = NULL,
+ *udatum_next = NULL;
+
+ ldatum = lfirst_node(PartitionRangeDatum, cell1);
+ if (lnext(spec->lowerdatums, cell1))
+ ldatum_next = castNode(PartitionRangeDatum,
+ lfirst(lnext(spec->lowerdatums, cell1)));
+ udatum = lfirst_node(PartitionRangeDatum, cell2);
+ if (lnext(spec->upperdatums, cell2))
+ udatum_next = castNode(PartitionRangeDatum,
+ lfirst(lnext(spec->upperdatums, cell2)));
+ get_range_key_properties(key, j, ldatum, udatum,
+ &partexprs_item,
+ &keyCol,
+ &lower_val, &upper_val);
+
+ if (need_next_lower_arm && lower_val)
+ {
+ uint16 strategy;
+
+ /*
+ * For the non-last columns of this arm, use the EQ operator.
+ * For the last column of this arm, use GT, unless this is the
+ * last column of the whole bound check, or the next bound
+ * datum is MINVALUE, in which case use GE.
+ */
+ if (j - i < current_or_arm)
+ strategy = BTEqualStrategyNumber;
+ else if (j == key->partnatts - 1 ||
+ (ldatum_next &&
+ ldatum_next->kind == PARTITION_RANGE_DATUM_MINVALUE))
+ strategy = BTGreaterEqualStrategyNumber;
+ else
+ strategy = BTGreaterStrategyNumber;
+
+ lower_or_arm_args = lappend(lower_or_arm_args,
+ make_partition_op_expr(key, j,
+ strategy,
+ keyCol,
+ (Expr *) lower_val));
+ }
+
+ if (need_next_upper_arm && upper_val)
+ {
+ uint16 strategy;
+
+ /*
+ * For the non-last columns of this arm, use the EQ operator.
+ * For the last column of this arm, use LT, unless the next
+ * bound datum is MAXVALUE, in which case use LE.
+ */
+ if (j - i < current_or_arm)
+ strategy = BTEqualStrategyNumber;
+ else if (udatum_next &&
+ udatum_next->kind == PARTITION_RANGE_DATUM_MAXVALUE)
+ strategy = BTLessEqualStrategyNumber;
+ else
+ strategy = BTLessStrategyNumber;
+
+ upper_or_arm_args = lappend(upper_or_arm_args,
+ make_partition_op_expr(key, j,
+ strategy,
+ keyCol,
+ (Expr *) upper_val));
+ }
+
+ /*
+ * Did we generate enough of OR's arguments? First arm considers
+ * the first of the remaining columns, second arm considers first
+ * two of the remaining columns, and so on.
+ */
+ ++j;
+ if (j - i > current_or_arm)
+ {
+ /*
+ * We must not emit any more arms if the new column that will
+ * be considered is unbounded, or this one was.
+ */
+ if (!lower_val || !ldatum_next ||
+ ldatum_next->kind != PARTITION_RANGE_DATUM_VALUE)
+ need_next_lower_arm = false;
+ if (!upper_val || !udatum_next ||
+ udatum_next->kind != PARTITION_RANGE_DATUM_VALUE)
+ need_next_upper_arm = false;
+ break;
+ }
+ }
+
+ if (lower_or_arm_args != NIL)
+ lower_or_arms = lappend(lower_or_arms,
+ list_length(lower_or_arm_args) > 1
+ ? makeBoolExpr(AND_EXPR, lower_or_arm_args, -1)
+ : linitial(lower_or_arm_args));
+
+ if (upper_or_arm_args != NIL)
+ upper_or_arms = lappend(upper_or_arms,
+ list_length(upper_or_arm_args) > 1
+ ? makeBoolExpr(AND_EXPR, upper_or_arm_args, -1)
+ : linitial(upper_or_arm_args));
+
+ /* If no work to do in the next iteration, break away. */
+ if (!need_next_lower_arm && !need_next_upper_arm)
+ break;
+
+ ++current_or_arm;
+ }
+
+ /*
+ * Generate the OR expressions for each of lower and upper bounds (if
+ * required), and append to the list of implicitly ANDed list of
+ * expressions.
+ */
+ if (lower_or_arms != NIL)
+ result = lappend(result,
+ list_length(lower_or_arms) > 1
+ ? makeBoolExpr(OR_EXPR, lower_or_arms, -1)
+ : linitial(lower_or_arms));
+ if (upper_or_arms != NIL)
+ result = lappend(result,
+ list_length(upper_or_arms) > 1
+ ? makeBoolExpr(OR_EXPR, upper_or_arms, -1)
+ : linitial(upper_or_arms));
+
+	/*
+	 * If the result list is empty, then for a non-default partition we
+	 * return a list containing a constant TRUE.  If the result is NIL during
+	 * the recursive call for the default partition, it implies this is the
+	 * only other partition, which can hold every value of the key except
+	 * NULL; hence we return the NullTest result skipped earlier.
+	 */
+ if (result == NIL)
+ result = for_default
+ ? get_range_nulltest(key)
+ : list_make1(makeBoolConst(true, false));
+
+ return result;
+}
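+
+/*
+ * For example, a partition of a table partitioned by RANGE (a, b) and
+ * defined with FOR VALUES FROM (1, 10) TO (1, 20) takes the second form
+ * shown in the header comment above: the bounds agree on "a", so the
+ * result is the implicit-AND list
+ *
+ *     a IS NOT NULL, b IS NOT NULL, a = 1, b >= 10, b < 20
+ */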
+
+/*
+ * get_range_key_properties
+ * Returns range partition key information for a given column
+ *
+ * This is a subroutine for get_qual_for_range, and its API is pretty
+ * specialized to that caller.
+ *
+ * Constructs an Expr for the key column (returned in *keyCol) and Consts
+ * for the lower and upper range limits (returned in *lower_val and
+ * *upper_val). For MINVALUE/MAXVALUE limits, NULL is returned instead of
+ * a Const. All of these structures are freshly palloc'd.
+ *
+ * *partexprs_item points to the cell containing the next expression in
+ * the key->partexprs list, or NULL. It may be advanced upon return.
+ */
+static void
+get_range_key_properties(PartitionKey key, int keynum,
+ PartitionRangeDatum *ldatum,
+ PartitionRangeDatum *udatum,
+ ListCell **partexprs_item,
+ Expr **keyCol,
+ Const **lower_val, Const **upper_val)
+{
+ /* Get partition key expression for this column */
+ if (key->partattrs[keynum] != 0)
+ {
+ *keyCol = (Expr *) makeVar(1,
+ key->partattrs[keynum],
+ key->parttypid[keynum],
+ key->parttypmod[keynum],
+ key->parttypcoll[keynum],
+ 0);
+ }
+ else
+ {
+ if (*partexprs_item == NULL)
+ elog(ERROR, "wrong number of partition key expressions");
+ *keyCol = copyObject(lfirst(*partexprs_item));
+ *partexprs_item = lnext(key->partexprs, *partexprs_item);
+ }
+
+ /* Get appropriate Const nodes for the bounds */
+ if (ldatum->kind == PARTITION_RANGE_DATUM_VALUE)
+ *lower_val = castNode(Const, copyObject(ldatum->value));
+ else
+ *lower_val = NULL;
+
+ if (udatum->kind == PARTITION_RANGE_DATUM_VALUE)
+ *upper_val = castNode(Const, copyObject(udatum->value));
+ else
+ *upper_val = NULL;
+}
+
+/*
+ * get_range_nulltest
+ *
+ * A non-default range partition table does not currently allow partition
+ * keys to be null, so emit an IS NOT NULL expression for each key column.
+ */
+static List *
+get_range_nulltest(PartitionKey key)
+{
+ List *result = NIL;
+ NullTest *nulltest;
+ ListCell *partexprs_item;
+ int i;
+
+ partexprs_item = list_head(key->partexprs);
+ for (i = 0; i < key->partnatts; i++)
+ {
+ Expr *keyCol;
+
+ if (key->partattrs[i] != 0)
+ {
+ keyCol = (Expr *) makeVar(1,
+ key->partattrs[i],
+ key->parttypid[i],
+ key->parttypmod[i],
+ key->parttypcoll[i],
+ 0);
+ }
+ else
+ {
+ if (partexprs_item == NULL)
+ elog(ERROR, "wrong number of partition key expressions");
+ keyCol = copyObject(lfirst(partexprs_item));
+ partexprs_item = lnext(key->partexprs, partexprs_item);
+ }
+
+ nulltest = makeNode(NullTest);
+ nulltest->arg = keyCol;
+ nulltest->nulltesttype = IS_NOT_NULL;
+ nulltest->argisrow = false;
+ nulltest->location = -1;
+ result = lappend(result, nulltest);
+ }
+
+ return result;
+}
+
+/*
+ * compute_partition_hash_value
+ *
+ * Compute the hash value for given partition key values.
+ */
+uint64
+compute_partition_hash_value(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation,
+ Datum *values, bool *isnull)
+{
+ int i;
+ uint64 rowHash = 0;
+ Datum seed = UInt64GetDatum(HASH_PARTITION_SEED);
+
+ for (i = 0; i < partnatts; i++)
+ {
+ /* Nulls are just ignored */
+ if (!isnull[i])
+ {
+ Datum hash;
+
+ Assert(OidIsValid(partsupfunc[i].fn_oid));
+
+ /*
+ * Compute hash for each datum value by calling respective
+ * datatype-specific hash functions of each partition key
+ * attribute.
+ */
+ hash = FunctionCall2Coll(&partsupfunc[i], partcollation[i],
+ values[i], seed);
+
+ /* Form a single 64-bit hash value */
+ rowHash = hash_combine64(rowHash, DatumGetUInt64(hash));
+ }
+ }
+
+ return rowHash;
+}
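+
+/*
+ * For instance, with a two-column key (a, b) and neither value null, the
+ * result is effectively
+ *
+ *     hash_combine64(hash_combine64(0, hash(a)), hash(b))
+ *
+ * where hash() stands for the column's partition support (hash) function
+ * called with HASH_PARTITION_SEED; null columns simply drop out of the
+ * combination.
+ */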
+
+/*
+ * satisfies_hash_partition
+ *
+ * This is an SQL-callable function for use in hash partition constraints.
+ * The first three arguments are the parent table OID, modulus, and remainder.
+ * The remaining arguments are the value of the partitioning columns (or
+ * expressions); these are hashed and the results are combined into a single
+ * hash value by calling hash_combine64.
+ *
+ * Returns true if the remainder produced when this combined hash value is
+ * divided by the given modulus equals the given remainder, otherwise false.
+ * NB: it's important that this never return null, as the constraint machinery
+ * would consider that to be a "pass".
+ *
+ * See get_qual_for_hash() for usage.
+ */
+Datum
+satisfies_hash_partition(PG_FUNCTION_ARGS)
+{
+ typedef struct ColumnsHashData
+ {
+ Oid relid;
+ int nkeys;
+ Oid variadic_type;
+ int16 variadic_typlen;
+ bool variadic_typbyval;
+ char variadic_typalign;
+ Oid partcollid[PARTITION_MAX_KEYS];
+ FmgrInfo partsupfunc[FLEXIBLE_ARRAY_MEMBER];
+ } ColumnsHashData;
+ Oid parentId;
+ int modulus;
+ int remainder;
+ Datum seed = UInt64GetDatum(HASH_PARTITION_SEED);
+ ColumnsHashData *my_extra;
+ uint64 rowHash = 0;
+
+ /* Return false if the parent OID, modulus, or remainder is NULL. */
+ if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
+ PG_RETURN_BOOL(false);
+ parentId = PG_GETARG_OID(0);
+ modulus = PG_GETARG_INT32(1);
+ remainder = PG_GETARG_INT32(2);
+
+ /* Sanity check modulus and remainder. */
+ if (modulus <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("modulus for hash partition must be an integer value greater than zero")));
+ if (remainder < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("remainder for hash partition must be an integer value greater than or equal to zero")));
+ if (remainder >= modulus)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("remainder for hash partition must be less than modulus")));
+
+ /*
+ * Cache hash function information.
+ */
+ my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra;
+ if (my_extra == NULL || my_extra->relid != parentId)
+ {
+ Relation parent;
+ PartitionKey key;
+ int j;
+
+ /* Open parent relation and fetch partition key info */
+ parent = relation_open(parentId, AccessShareLock);
+ key = RelationGetPartitionKey(parent);
+
+ /* Reject parent table that is not hash-partitioned. */
+ if (key == NULL || key->strategy != PARTITION_STRATEGY_HASH)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("\"%s\" is not a hash partitioned table",
+ get_rel_name(parentId))));
+
+ if (!get_fn_expr_variadic(fcinfo->flinfo))
+ {
+ int nargs = PG_NARGS() - 3;
+
+ /* complain if wrong number of column values */
+ if (key->partnatts != nargs)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)",
+ key->partnatts, nargs)));
+
+ /* allocate space for our cache */
+ fcinfo->flinfo->fn_extra =
+ MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt,
+ offsetof(ColumnsHashData, partsupfunc) +
+ sizeof(FmgrInfo) * nargs);
+ my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra;
+ my_extra->relid = parentId;
+ my_extra->nkeys = key->partnatts;
+ memcpy(my_extra->partcollid, key->partcollation,
+ key->partnatts * sizeof(Oid));
+
+ /* check argument types and save fmgr_infos */
+ for (j = 0; j < key->partnatts; ++j)
+ {
+ Oid argtype = get_fn_expr_argtype(fcinfo->flinfo, j + 3);
+
+ if (argtype != key->parttypid[j] && !IsBinaryCoercible(argtype, key->parttypid[j]))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("column %d of the partition key has type %s, but supplied value is of type %s",
+ j + 1, format_type_be(key->parttypid[j]), format_type_be(argtype))));
+
+ fmgr_info_copy(&my_extra->partsupfunc[j],
+ &key->partsupfunc[j],
+ fcinfo->flinfo->fn_mcxt);
+ }
+ }
+ else
+ {
+ ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3);
+
+ /* allocate space for our cache -- just one FmgrInfo in this case */
+ fcinfo->flinfo->fn_extra =
+ MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt,
+ offsetof(ColumnsHashData, partsupfunc) +
+ sizeof(FmgrInfo));
+ my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra;
+ my_extra->relid = parentId;
+ my_extra->nkeys = key->partnatts;
+ my_extra->variadic_type = ARR_ELEMTYPE(variadic_array);
+ get_typlenbyvalalign(my_extra->variadic_type,
+ &my_extra->variadic_typlen,
+ &my_extra->variadic_typbyval,
+ &my_extra->variadic_typalign);
+ my_extra->partcollid[0] = key->partcollation[0];
+
+ /* check argument types */
+ for (j = 0; j < key->partnatts; ++j)
+ if (key->parttypid[j] != my_extra->variadic_type)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("column %d of the partition key has type \"%s\", but supplied value is of type \"%s\"",
+ j + 1,
+ format_type_be(key->parttypid[j]),
+ format_type_be(my_extra->variadic_type))));
+
+ fmgr_info_copy(&my_extra->partsupfunc[0],
+ &key->partsupfunc[0],
+ fcinfo->flinfo->fn_mcxt);
+ }
+
+ /* Hold lock until commit */
+ relation_close(parent, NoLock);
+ }
+
+ if (!OidIsValid(my_extra->variadic_type))
+ {
+ int nkeys = my_extra->nkeys;
+ int i;
+
+ /*
+ * For a non-variadic call, neither the number of arguments nor their
+ * types can change across calls, so avoid the expense of rechecking
+ * here.
+ */
+
+ for (i = 0; i < nkeys; i++)
+ {
+ Datum hash;
+
+ /* keys start from fourth argument of function. */
+ int argno = i + 3;
+
+ if (PG_ARGISNULL(argno))
+ continue;
+
+ hash = FunctionCall2Coll(&my_extra->partsupfunc[i],
+ my_extra->partcollid[i],
+ PG_GETARG_DATUM(argno),
+ seed);
+
+ /* Form a single 64-bit hash value */
+ rowHash = hash_combine64(rowHash, DatumGetUInt64(hash));
+ }
+ }
+ else
+ {
+ ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3);
+ int i;
+ int nelems;
+ Datum *datum;
+ bool *isnull;
+
+ deconstruct_array(variadic_array,
+ my_extra->variadic_type,
+ my_extra->variadic_typlen,
+ my_extra->variadic_typbyval,
+ my_extra->variadic_typalign,
+ &datum, &isnull, &nelems);
+
+ /* complain if wrong number of column values */
+ if (nelems != my_extra->nkeys)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)",
+ my_extra->nkeys, nelems)));
+
+ for (i = 0; i < nelems; i++)
+ {
+ Datum hash;
+
+ if (isnull[i])
+ continue;
+
+ hash = FunctionCall2Coll(&my_extra->partsupfunc[0],
+ my_extra->partcollid[0],
+ datum[i],
+ seed);
+
+ /* Form a single 64-bit hash value */
+ rowHash = hash_combine64(rowHash, DatumGetUInt64(hash));
+ }
+ }
+
+ PG_RETURN_BOOL(rowHash % modulus == remainder);
+}
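+
+/*
+ * Illustrative SQL-level call (table name and column value are
+ * hypothetical):
+ *
+ *     SELECT satisfies_hash_partition('parent'::regclass, 4, 0, 2);
+ *
+ * which returns true only if hashing the value 2 with the appropriate
+ * hash function yields remainder 0 when divided by modulus 4.
+ */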
diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c
new file mode 100644
index 0000000..8b6e0bd
--- /dev/null
+++ b/src/backend/partitioning/partdesc.c
@@ -0,0 +1,462 @@
+/*-------------------------------------------------------------------------
+ *
+ * partdesc.c
+ * Support routines for manipulating partition descriptors
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/partitioning/partdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/htup_details.h"
+#include "access/table.h"
+#include "catalog/partition.h"
+#include "catalog/pg_inherits.h"
+#include "partitioning/partbounds.h"
+#include "partitioning/partdesc.h"
+#include "storage/bufmgr.h"
+#include "storage/sinval.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/hsearch.h"
+#include "utils/inval.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/partcache.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+typedef struct PartitionDirectoryData
+{
+ MemoryContext pdir_mcxt;
+ HTAB *pdir_hash;
+ bool omit_detached;
+} PartitionDirectoryData;
+
+typedef struct PartitionDirectoryEntry
+{
+ Oid reloid;
+ Relation rel;
+ PartitionDesc pd;
+} PartitionDirectoryEntry;
+
+static PartitionDesc RelationBuildPartitionDesc(Relation rel,
+ bool omit_detached);
+
+
+/*
+ * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
+ *
+ * We keep two partdescs in relcache: rd_partdesc includes all partitions
+ * (even those being concurrently marked detached), while rd_partdesc_nodetach
+ * omits (some of) those. We store the pg_inherits.xmin value for the latter,
+ * to determine whether it can be validly reused in each case, since that
+ * depends on the active snapshot.
+ *
+ * Note: we arrange for partition descriptors to not get freed until the
+ * relcache entry's refcount goes to zero (see hacks in RelationClose,
+ * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
+ * though we hand back a direct pointer into the relcache entry, it's safe
+ * for callers to continue to use that pointer as long as (a) they hold the
+ * relation open, and (b) they hold a relation lock strong enough to ensure
+ * that the data doesn't become stale.
+ */
+PartitionDesc
+RelationGetPartitionDesc(Relation rel, bool omit_detached)
+{
+ Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+
+	/*
+	 * If relcache has a partition descriptor, use that.  However, we can only
+	 * do so either when we are asked to include all partitions, detached ones
+	 * included, or when we know that there are no detached partitions.
+	 *
+	 * If there is no active snapshot, detached partitions aren't omitted
+	 * either, so we can use the cached descriptor too in that case.
+	 */
+ if (likely(rel->rd_partdesc &&
+ (!rel->rd_partdesc->detached_exist || !omit_detached ||
+ !ActiveSnapshotSet())))
+ return rel->rd_partdesc;
+
+	/*
+	 * If we're asked to omit detached partitions, we may be able to use a
+	 * cached descriptor too.  We decide that using the pg_inherits.xmin value
+	 * that was saved alongside the descriptor: if that xmin was not in
+	 * progress for the active snapshot under which the descriptor was built,
+	 * and is also not in progress for the current active snapshot, then we
+	 * can reuse it.  Otherwise build one from scratch.
+	 */
+ if (omit_detached &&
+ rel->rd_partdesc_nodetached &&
+ ActiveSnapshotSet())
+ {
+ Snapshot activesnap;
+
+ Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
+ activesnap = GetActiveSnapshot();
+
+ if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
+ return rel->rd_partdesc_nodetached;
+ }
+
+ return RelationBuildPartitionDesc(rel, omit_detached);
+}
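+
+/*
+ * Typical usage sketch (assuming "rel" is open and locked strongly enough,
+ * per the rules above):
+ *
+ *     PartitionDesc pd = RelationGetPartitionDesc(rel, true);
+ *
+ *     for (int i = 0; i < pd->nparts; i++)
+ *         ... inspect pd->oids[i] and pd->is_leaf[i] ...
+ */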
+
+/*
+ * RelationBuildPartitionDesc
+ * Form rel's partition descriptor, and store in relcache entry
+ *
+ * Partition descriptor is a complex structure; to avoid complicated logic to
+ * free individual elements whenever the relcache entry is flushed, we give it
+ * its own memory context, a child of CacheMemoryContext, which can easily be
+ * deleted on its own. To avoid leaking memory in that context in case of an
+ * error partway through this function, the context is initially created as a
+ * child of CurTransactionContext and only re-parented to CacheMemoryContext
+ * at the end, when no further errors are possible. Also, we don't make this
+ * context the current context except in very brief code sections, out of fear
+ * that some of our callees allocate memory on their own which would be leaked
+ * permanently.
+ *
+ * As a special case, partition descriptors that are requested to omit
+ * partitions being detached (and which contain such partitions) are transient
+ * and are not associated with the relcache entry. Such descriptors only last
+ * through the requesting Portal, so we use the corresponding memory context
+ * for them.
+ */
+static PartitionDesc
+RelationBuildPartitionDesc(Relation rel, bool omit_detached)
+{
+ PartitionDesc partdesc;
+ PartitionBoundInfo boundinfo = NULL;
+ List *inhoids;
+ PartitionBoundSpec **boundspecs = NULL;
+ Oid *oids = NULL;
+ bool *is_leaf = NULL;
+ bool detached_exist;
+ bool is_omit;
+ TransactionId detached_xmin;
+ ListCell *cell;
+ int i,
+ nparts;
+ PartitionKey key = RelationGetPartitionKey(rel);
+ MemoryContext new_pdcxt;
+ MemoryContext oldcxt;
+ int *mapping;
+
+ /*
+ * Get partition oids from pg_inherits. This uses a single snapshot to
+ * fetch the list of children, so while more children may be getting added
+ * concurrently, whatever this function returns will be accurate as of
+ * some well-defined point in time.
+ */
+ detached_exist = false;
+ detached_xmin = InvalidTransactionId;
+ inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
+ omit_detached, NoLock,
+ &detached_exist,
+ &detached_xmin);
+
+ nparts = list_length(inhoids);
+
+ /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
+ if (nparts > 0)
+ {
+ oids = (Oid *) palloc(nparts * sizeof(Oid));
+ is_leaf = (bool *) palloc(nparts * sizeof(bool));
+ boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
+ }
+
+ /* Collect bound spec nodes for each partition. */
+ i = 0;
+ foreach(cell, inhoids)
+ {
+ Oid inhrelid = lfirst_oid(cell);
+ HeapTuple tuple;
+ PartitionBoundSpec *boundspec = NULL;
+
+ /* Try fetching the tuple from the catcache, for speed. */
+ tuple = SearchSysCache1(RELOID, inhrelid);
+ if (HeapTupleIsValid(tuple))
+ {
+ Datum datum;
+ bool isnull;
+
+ datum = SysCacheGetAttr(RELOID, tuple,
+ Anum_pg_class_relpartbound,
+ &isnull);
+ if (!isnull)
+ boundspec = stringToNode(TextDatumGetCString(datum));
+ ReleaseSysCache(tuple);
+ }
+
+ /*
+ * The system cache may be out of date; if so, we may find no pg_class
+ * tuple or an old one where relpartbound is NULL. In that case, try
+ * the table directly. We can't just AcceptInvalidationMessages() and
+ * retry the system cache lookup because it's possible that a
+ * concurrent ATTACH PARTITION operation has removed itself from the
+ * ProcArray but not yet added invalidation messages to the shared
+ * queue; InvalidateSystemCaches() would work, but seems excessive.
+ *
+ * Note that this algorithm assumes that PartitionBoundSpec we manage
+ * to fetch is the right one -- so this is only good enough for
+ * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
+ * some hypothetical operation that changes the partition bounds.
+ */
+ if (boundspec == NULL)
+ {
+ Relation pg_class;
+ SysScanDesc scan;
+ ScanKeyData key[1];
+ Datum datum;
+ bool isnull;
+
+ pg_class = table_open(RelationRelationId, AccessShareLock);
+ ScanKeyInit(&key[0],
+ Anum_pg_class_oid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(inhrelid));
+ scan = systable_beginscan(pg_class, ClassOidIndexId, true,
+ NULL, 1, key);
+ tuple = systable_getnext(scan);
+ datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
+ RelationGetDescr(pg_class), &isnull);
+ if (!isnull)
+ boundspec = stringToNode(TextDatumGetCString(datum));
+ systable_endscan(scan);
+ table_close(pg_class, AccessShareLock);
+ }
+
+ /* Sanity checks. */
+ if (!boundspec)
+ elog(ERROR, "missing relpartbound for relation %u", inhrelid);
+ if (!IsA(boundspec, PartitionBoundSpec))
+ elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
+
+ /*
+ * If the PartitionBoundSpec says this is the default partition, its
+ * OID should match pg_partitioned_table.partdefid; if not, the
+ * catalog is corrupt.
+ */
+ if (boundspec->is_default)
+ {
+ Oid partdefid;
+
+ partdefid = get_default_partition_oid(RelationGetRelid(rel));
+ if (partdefid != inhrelid)
+ elog(ERROR, "expected partdefid %u, but got %u",
+ inhrelid, partdefid);
+ }
+
+ /* Save results. */
+ oids[i] = inhrelid;
+ is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
+ boundspecs[i] = boundspec;
+ ++i;
+ }
+
+ /*
+ * Create PartitionBoundInfo and mapping, working in the caller's context.
+ * This could fail, but we haven't done any damage if so.
+ */
+ if (nparts > 0)
+ boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
+
+ /*
+ * Now build the actual relcache partition descriptor, copying all the
+ * data into a new, small context. As per above comment, we don't make
+ * this a long-lived context until it's finished.
+ */
+ new_pdcxt = AllocSetContextCreate(CurTransactionContext,
+ "partition descriptor",
+ ALLOCSET_SMALL_SIZES);
+ MemoryContextCopyAndSetIdentifier(new_pdcxt,
+ RelationGetRelationName(rel));
+
+ partdesc = (PartitionDescData *)
+ MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
+ partdesc->nparts = nparts;
+ partdesc->detached_exist = detached_exist;
+ /* If there are no partitions, the rest of the partdesc can stay zero */
+ if (nparts > 0)
+ {
+ oldcxt = MemoryContextSwitchTo(new_pdcxt);
+ partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
+ partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
+ partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
+
+ /*
+ * Assign OIDs from the original array into mapped indexes of the
+ * result array. The order of OIDs in the former is defined by the
+ * catalog scan that retrieved them, whereas that in the latter is
+ * defined by canonicalized representation of the partition bounds.
+ * Also save leaf-ness of each partition.
+ */
+ for (i = 0; i < nparts; i++)
+ {
+ int index = mapping[i];
+
+ partdesc->oids[index] = oids[i];
+ partdesc->is_leaf[index] = is_leaf[i];
+ }
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ /*
+ * Are we working with the partdesc that omits the detached partition, or
+ * the one that includes it?
+ *
+ * Note that if a partition was found by the catalog's scan to have been
+	 * detached, but the pg_inherits tuple saying so was not visible to the
+ * active snapshot (find_inheritance_children_extended will not have set
+ * detached_xmin in that case), we consider there to be no "omittable"
+ * detached partitions.
+ */
+ is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
+ TransactionIdIsValid(detached_xmin);
+
+ /*
+ * We have a fully valid partdesc. Reparent it so that it has the right
+ * lifespan.
+ */
+ MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
+
+ /*
+ * Store it into relcache.
+ *
+ * But first, a kluge: if there's an old context for this type of
+ * descriptor, it contains an old partition descriptor that may still be
+ * referenced somewhere. Preserve it, while not leaking it, by
+ * reattaching it as a child context of the new one. Eventually it will
+ * get dropped by either RelationClose or RelationClearRelation. (We keep
+ * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
+ * detached-partitions in rd_pddcxt.)
+ */
+ if (is_omit)
+ {
+ if (rel->rd_pddcxt != NULL)
+ MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
+ rel->rd_pddcxt = new_pdcxt;
+ rel->rd_partdesc_nodetached = partdesc;
+
+ /*
+ * For partdescs built excluding detached partitions, which we save
+ * separately, we also record the pg_inherits.xmin of the detached
+ * partition that was omitted; this informs a future potential user of
+ * such a cached partdesc to only use it after cross-checking that the
+ * xmin is indeed visible to the snapshot it is going to be working
+ * with.
+ */
+ Assert(TransactionIdIsValid(detached_xmin));
+ rel->rd_partdesc_nodetached_xmin = detached_xmin;
+ }
+ else
+ {
+ if (rel->rd_pdcxt != NULL)
+ MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
+ rel->rd_pdcxt = new_pdcxt;
+ rel->rd_partdesc = partdesc;
+ }
+
+ return partdesc;
+}
+
+/*
+ * CreatePartitionDirectory
+ * Create a new partition directory object.
+ */
+PartitionDirectory
+CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
+{
+ MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
+ PartitionDirectory pdir;
+ HASHCTL ctl;
+
+ pdir = palloc(sizeof(PartitionDirectoryData));
+ pdir->pdir_mcxt = mcxt;
+
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(PartitionDirectoryEntry);
+ ctl.hcxt = mcxt;
+
+ pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ pdir->omit_detached = omit_detached;
+
+ MemoryContextSwitchTo(oldcontext);
+ return pdir;
+}
+
+/*
+ * PartitionDirectoryLookup
+ * Look up the partition descriptor for a relation in the directory.
+ *
+ * The purpose of this function is to ensure that we get the same
+ * PartitionDesc for each relation every time we look it up. In the
+ * face of concurrent DDL, different PartitionDescs may be constructed with
+ * different views of the catalog state, but any single particular OID
+ * will always get the same PartitionDesc for as long as the same
+ * PartitionDirectory is used.
+ */
+PartitionDesc
+PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
+{
+ PartitionDirectoryEntry *pde;
+ Oid relid = RelationGetRelid(rel);
+ bool found;
+
+ pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
+ if (!found)
+ {
+ /*
+ * We must keep a reference count on the relation so that the
+ * PartitionDesc to which we are pointing can't get destroyed.
+ */
+ RelationIncrementReferenceCount(rel);
+ pde->rel = rel;
+ pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
+ Assert(pde->pd != NULL);
+ }
+ return pde->pd;
+}
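+
+/*
+ * Usage sketch: a caller typically creates the directory once, performs
+ * any number of lookups while it plans or executes, and then destroys it
+ * (assuming "rel" is an open, locked partitioned table):
+ *
+ *     pdir = CreatePartitionDirectory(CurrentMemoryContext, true);
+ *     partdesc = PartitionDirectoryLookup(pdir, rel);
+ *     ...
+ *     DestroyPartitionDirectory(pdir);
+ */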
+
+/*
+ * DestroyPartitionDirectory
+ * Destroy a partition directory.
+ *
+ * Release the reference counts we're holding.
+ */
+void
+DestroyPartitionDirectory(PartitionDirectory pdir)
+{
+ HASH_SEQ_STATUS status;
+ PartitionDirectoryEntry *pde;
+
+ hash_seq_init(&status, pdir->pdir_hash);
+ while ((pde = hash_seq_search(&status)) != NULL)
+ RelationDecrementReferenceCount(pde->rel);
+}
+
+/*
+ * get_default_oid_from_partdesc
+ *
+ * Given a partition descriptor, return the OID of the default partition, if
+ * one exists; else, return InvalidOid.
+ */
+Oid
+get_default_oid_from_partdesc(PartitionDesc partdesc)
+{
+ if (partdesc && partdesc->boundinfo &&
+ partition_bound_has_default(partdesc->boundinfo))
+ return partdesc->oids[partdesc->boundinfo->default_index];
+
+ return InvalidOid;
+}
diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c
new file mode 100644
index 0000000..aeb4194
--- /dev/null
+++ b/src/backend/partitioning/partprune.c
@@ -0,0 +1,3726 @@
+/*-------------------------------------------------------------------------
+ *
+ * partprune.c
+ * Support for partition pruning during query planning and execution
+ *
+ * This module implements partition pruning using the information contained in
+ * a table's partition descriptor, query clauses, and run-time parameters.
+ *
+ * During planning, clauses that can be matched to the table's partition key
+ * are turned into a set of "pruning steps", which are then executed to
+ * identify a set of partitions (as indexes in the RelOptInfo->part_rels
+ * array) that satisfy the constraints in the step. Partitions not in the set
+ * are said to have been pruned.
+ *
+ * A base pruning step may involve expressions whose values are only known
+ * during execution, such as Params, in which case pruning cannot occur
+ * entirely during planning. In that case, such steps are included alongside
+ * the plan, so that they can be used by the executor for further pruning.
+ *
+ * There are two kinds of pruning steps. A "base" pruning step represents
+ * tests on partition key column(s), typically comparisons to expressions.
+ * A "combine" pruning step represents a Boolean connector (AND/OR), and
+ * combines the outputs of some previous steps using the appropriate
+ * combination method.
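+ *
+ * For example, the quals "a = 1 OR a = 3" on a table partitioned on "a"
+ * give rise to two base steps (one per comparison) plus a combine step
+ * that unions (ORs) their outputs.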
+ *
+ * See gen_partprune_steps_internal() for more details on step generation.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/partitioning/partprune.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/nbtree.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/appendinfo.h"
+#include "optimizer/cost.h"
+#include "optimizer/optimizer.h"
+#include "optimizer/pathnode.h"
+#include "parser/parsetree.h"
+#include "partitioning/partbounds.h"
+#include "partitioning/partprune.h"
+#include "rewrite/rewriteManip.h"
+#include "utils/array.h"
+#include "utils/lsyscache.h"
+
+
+/*
+ * Information about a clause matched with a partition key.
+ */
+typedef struct PartClauseInfo
+{
+ int keyno; /* Partition key number (0 to partnatts - 1) */
+ Oid opno; /* operator used to compare partkey to expr */
+ bool op_is_ne; /* is clause's original operator <> ? */
+ Expr *expr; /* expr the partition key is compared to */
+ Oid cmpfn; /* Oid of function to compare 'expr' to the
+ * partition key */
+ int op_strategy; /* btree strategy identifying the operator */
+} PartClauseInfo;
+
+/*
+ * PartClauseMatchStatus
+ * Describes the result of match_clause_to_partition_key()
+ */
+typedef enum PartClauseMatchStatus
+{
+ PARTCLAUSE_NOMATCH,
+ PARTCLAUSE_MATCH_CLAUSE,
+ PARTCLAUSE_MATCH_NULLNESS,
+ PARTCLAUSE_MATCH_STEPS,
+ PARTCLAUSE_MATCH_CONTRADICT,
+ PARTCLAUSE_UNSUPPORTED
+} PartClauseMatchStatus;
+
+/*
+ * PartClauseTarget
+ * Identifies which qual clauses we can use for generating pruning steps
+ */
+typedef enum PartClauseTarget
+{
+ PARTTARGET_PLANNER, /* want to prune during planning */
+ PARTTARGET_INITIAL, /* want to prune during executor startup */
+ PARTTARGET_EXEC /* want to prune during each plan node scan */
+} PartClauseTarget;
+
+/*
+ * GeneratePruningStepsContext
+ * Information about the current state of generation of "pruning steps"
+ * for a given set of clauses
+ *
+ * gen_partprune_steps() initializes and returns an instance of this struct.
+ *
+ * Note that has_mutable_op, has_mutable_arg, and has_exec_param are set if
+ * we found any potentially-useful-for-pruning clause having those properties,
+ * whether or not we actually used the clause in the steps list. This
+ * definition allows us to skip the PARTTARGET_EXEC pass in some cases.
+ */
+typedef struct GeneratePruningStepsContext
+{
+ /* Copies of input arguments for gen_partprune_steps: */
+ RelOptInfo *rel; /* the partitioned relation */
+ PartClauseTarget target; /* use-case we're generating steps for */
+ /* Result data: */
+ List *steps; /* list of PartitionPruneSteps */
+ bool has_mutable_op; /* clauses include any stable operators */
+ bool has_mutable_arg; /* clauses include any mutable comparison
+ * values, *other than* exec params */
+ bool has_exec_param; /* clauses include any PARAM_EXEC params */
+ bool contradictory; /* clauses were proven self-contradictory */
+ /* Working state: */
+ int next_step_id;
+} GeneratePruningStepsContext;
+
+/* The result of performing one PartitionPruneStep */
+typedef struct PruneStepResult
+{
+ /*
+ * The offsets of bounds (in a table's boundinfo) whose partition is
+ * selected by the pruning step.
+ */
+ Bitmapset *bound_offsets;
+
+ bool scan_default; /* Scan the default partition? */
+ bool scan_null; /* Scan the partition for NULL values? */
+} PruneStepResult;
+
+
+static List *add_part_relids(List *allpartrelids, Bitmapset *partrelids);
+static List *make_partitionedrel_pruneinfo(PlannerInfo *root,
+ RelOptInfo *parentrel,
+ List *prunequal,
+ Bitmapset *partrelids,
+ int *relid_subplan_map,
+ Bitmapset **matchedsubplans);
+static void gen_partprune_steps(RelOptInfo *rel, List *clauses,
+ PartClauseTarget target,
+ GeneratePruningStepsContext *context);
+static List *gen_partprune_steps_internal(GeneratePruningStepsContext *context,
+ List *clauses);
+static PartitionPruneStep *gen_prune_step_op(GeneratePruningStepsContext *context,
+ StrategyNumber opstrategy, bool op_is_ne,
+ List *exprs, List *cmpfns, Bitmapset *nullkeys);
+static PartitionPruneStep *gen_prune_step_combine(GeneratePruningStepsContext *context,
+ List *source_stepids,
+ PartitionPruneCombineOp combineOp);
+static List *gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
+ List **keyclauses, Bitmapset *nullkeys);
+static PartClauseMatchStatus match_clause_to_partition_key(GeneratePruningStepsContext *context,
+ Expr *clause, Expr *partkey, int partkeyidx,
+ bool *clause_is_not_null,
+ PartClauseInfo **pc, List **clause_steps);
+static List *get_steps_using_prefix(GeneratePruningStepsContext *context,
+ StrategyNumber step_opstrategy,
+ bool step_op_is_ne,
+ Expr *step_lastexpr,
+ Oid step_lastcmpfn,
+ Bitmapset *step_nullkeys,
+ List *prefix);
+static List *get_steps_using_prefix_recurse(GeneratePruningStepsContext *context,
+ StrategyNumber step_opstrategy,
+ bool step_op_is_ne,
+ Expr *step_lastexpr,
+ Oid step_lastcmpfn,
+ Bitmapset *step_nullkeys,
+ List *prefix,
+ ListCell *start,
+ List *step_exprs,
+ List *step_cmpfns);
+static PruneStepResult *get_matching_hash_bounds(PartitionPruneContext *context,
+ StrategyNumber opstrategy, Datum *values, int nvalues,
+ FmgrInfo *partsupfunc, Bitmapset *nullkeys);
+static PruneStepResult *get_matching_list_bounds(PartitionPruneContext *context,
+ StrategyNumber opstrategy, Datum value, int nvalues,
+ FmgrInfo *partsupfunc, Bitmapset *nullkeys);
+static PruneStepResult *get_matching_range_bounds(PartitionPruneContext *context,
+ StrategyNumber opstrategy, Datum *values, int nvalues,
+ FmgrInfo *partsupfunc, Bitmapset *nullkeys);
+static Bitmapset *pull_exec_paramids(Expr *expr);
+static bool pull_exec_paramids_walker(Node *node, Bitmapset **context);
+static Bitmapset *get_partkey_exec_paramids(List *steps);
+static PruneStepResult *perform_pruning_base_step(PartitionPruneContext *context,
+ PartitionPruneStepOp *opstep);
+static PruneStepResult *perform_pruning_combine_step(PartitionPruneContext *context,
+ PartitionPruneStepCombine *cstep,
+ PruneStepResult **step_results);
+static PartClauseMatchStatus match_boolean_partition_clause(Oid partopfamily,
+ Expr *clause,
+ Expr *partkey,
+ Expr **outconst,
+ bool *noteq);
+static void partkey_datum_from_expr(PartitionPruneContext *context,
+ Expr *expr, int stateidx,
+ Datum *value, bool *isnull);
+
+
+/*
+ * make_partition_pruneinfo
+ * Builds a PartitionPruneInfo which can be used in the executor to allow
+ * additional partition pruning to take place. Returns NULL when
+ * partition pruning would be useless.
+ *
+ * 'parentrel' is the RelOptInfo for an appendrel, and 'subpaths' is the list
+ * of scan paths for its child rels.
+ * 'prunequal' is a list of potential pruning quals (i.e., restriction
+ * clauses that are applicable to the appendrel).
+ */
+PartitionPruneInfo *
+make_partition_pruneinfo(PlannerInfo *root, RelOptInfo *parentrel,
+ List *subpaths,
+ List *prunequal)
+{
+ PartitionPruneInfo *pruneinfo;
+ Bitmapset *allmatchedsubplans = NULL;
+ List *allpartrelids;
+ List *prunerelinfos;
+ int *relid_subplan_map;
+ ListCell *lc;
+ int i;
+
+ /*
+ * Scan the subpaths to see which ones are scans of partition child
+ * relations, and identify their parent partitioned rels. (Note: we must
+ * restrict the parent partitioned rels to be parentrel or children of
+ * parentrel, otherwise we couldn't translate prunequal to match.)
+ *
+ * Also construct a temporary array to map from partition-child-relation
+ * relid to the index in 'subpaths' of the scan plan for that partition.
+ * (Use of "subplan" rather than "subpath" is a bit of a misnomer, but
+ * we'll let it stand.) For convenience, we use 1-based indexes here, so
+ * that zero can represent an un-filled array entry.
+ */
+ allpartrelids = NIL;
+ relid_subplan_map = palloc0(sizeof(int) * root->simple_rel_array_size);
+
+ i = 1;
+ foreach(lc, subpaths)
+ {
+ Path *path = (Path *) lfirst(lc);
+ RelOptInfo *pathrel = path->parent;
+
+ /* We don't consider partitioned joins here */
+ if (pathrel->reloptkind == RELOPT_OTHER_MEMBER_REL)
+ {
+ RelOptInfo *prel = pathrel;
+ Bitmapset *partrelids = NULL;
+
+ /*
+ * Traverse up to the pathrel's topmost partitioned parent,
+ * collecting parent relids as we go; but stop if we reach
+ * parentrel. (Normally, a pathrel's topmost partitioned parent
+ * is either parentrel or a UNION ALL appendrel child of
+ * parentrel. But when handling partitionwise joins of
+ * multi-level partitioning trees, we can see an append path whose
+ * parentrel is an intermediate partitioned table.)
+ */
+ do
+ {
+ AppendRelInfo *appinfo;
+
+ Assert(prel->relid < root->simple_rel_array_size);
+ appinfo = root->append_rel_array[prel->relid];
+ prel = find_base_rel(root, appinfo->parent_relid);
+ if (!IS_PARTITIONED_REL(prel))
+ break; /* reached a non-partitioned parent */
+ /* accept this level as an interesting parent */
+ partrelids = bms_add_member(partrelids, prel->relid);
+ if (prel == parentrel)
+ break; /* don't traverse above parentrel */
+ } while (prel->reloptkind == RELOPT_OTHER_MEMBER_REL);
+
+ if (partrelids)
+ {
+ /*
+ * Found some relevant parent partitions, which may or may not
+ * overlap with partition trees we already found. Add new
+ * information to the allpartrelids list.
+ */
+ allpartrelids = add_part_relids(allpartrelids, partrelids);
+ /* Also record the subplan in relid_subplan_map[] */
+ /* No duplicates please */
+ Assert(relid_subplan_map[pathrel->relid] == 0);
+ relid_subplan_map[pathrel->relid] = i;
+ }
+ }
+ i++;
+ }
+
+ /*
+ * We now build a PartitionedRelPruneInfo for each topmost partitioned rel
+ * (omitting any that turn out not to have useful pruning quals).
+ */
+ prunerelinfos = NIL;
+ foreach(lc, allpartrelids)
+ {
+ Bitmapset *partrelids = (Bitmapset *) lfirst(lc);
+ List *pinfolist;
+ Bitmapset *matchedsubplans = NULL;
+
+ pinfolist = make_partitionedrel_pruneinfo(root, parentrel,
+ prunequal,
+ partrelids,
+ relid_subplan_map,
+ &matchedsubplans);
+
+ /* When pruning is possible, record the matched subplans */
+ if (pinfolist != NIL)
+ {
+ prunerelinfos = lappend(prunerelinfos, pinfolist);
+ allmatchedsubplans = bms_join(matchedsubplans,
+ allmatchedsubplans);
+ }
+ }
+
+ pfree(relid_subplan_map);
+
+ /*
+ * If none of the partition hierarchies had any useful run-time pruning
+ * quals, then we can just not bother with run-time pruning.
+ */
+ if (prunerelinfos == NIL)
+ return NULL;
+
+ /* Else build the result data structure */
+ pruneinfo = makeNode(PartitionPruneInfo);
+ pruneinfo->prune_infos = prunerelinfos;
+
+ /*
+ * Some subplans may not belong to any of the identified partitioned rels.
+ * This can happen for UNION ALL queries which include a non-partitioned
+ * table, or when some of the hierarchies aren't run-time prunable. Build
+ * a bitmapset of the indexes of all such subplans, so that the executor
+ * can identify which subplans should never be pruned.
+ */
+ if (bms_num_members(allmatchedsubplans) < list_length(subpaths))
+ {
+ Bitmapset *other_subplans;
+
+ /* Create the complement of allmatchedsubplans */
+ other_subplans = bms_add_range(NULL, 0, list_length(subpaths) - 1);
+ other_subplans = bms_del_members(other_subplans, allmatchedsubplans);
+
+ pruneinfo->other_subplans = other_subplans;
+ }
+ else
+ pruneinfo->other_subplans = NULL;
+
+ return pruneinfo;
+}
+
+/*
+ * add_part_relids
+ * Add new info to a list of Bitmapsets of partitioned relids.
+ *
+ * Within 'allpartrelids', there is one Bitmapset for each topmost parent
+ * partitioned rel. Each Bitmapset contains the RT indexes of the topmost
+ * parent as well as its relevant non-leaf child partitions. Since (by
+ * construction of the rangetable list) parent partitions must have lower
+ * RT indexes than their children, we can distinguish the topmost parent
+ * as being the lowest set bit in the Bitmapset.
+ *
+ * 'partrelids' contains the RT indexes of a parent partitioned rel, and
+ * possibly some non-leaf children, that are newly identified as parents of
+ * some subpath rel passed to make_partition_pruneinfo(). These are added
+ * to an appropriate member of 'allpartrelids'.
+ *
+ * Note that the list contains only RT indexes of partitioned tables that
+ * are parents of some scan-level relation appearing in the 'subpaths' that
+ * make_partition_pruneinfo() is dealing with. Also, "topmost" parents are
+ * not allowed to be higher than the 'parentrel' associated with the append
+ * path. In this way, we avoid expending cycles on partitioned rels that
+ * can't contribute useful pruning information for the problem at hand.
+ * (It is possible for 'parentrel' to be a child partitioned table, and it
+ * is also possible for scan-level relations to be child partitioned tables
+ * rather than leaf partitions. Hence we must construct this relation set
+ * with reference to the particular append path we're dealing with, rather
+ * than looking at the full partitioning structure represented in the
+ * RelOptInfos.)
+ */
+static List *
+add_part_relids(List *allpartrelids, Bitmapset *partrelids)
+{
+ Index targetpart;
+ ListCell *lc;
+
+ /* We can easily get the lowest set bit this way: */
+ targetpart = bms_next_member(partrelids, -1);
+ Assert(targetpart > 0);
+
+ /* Look for a matching topmost parent */
+ foreach(lc, allpartrelids)
+ {
+ Bitmapset *currpartrelids = (Bitmapset *) lfirst(lc);
+ Index currtarget = bms_next_member(currpartrelids, -1);
+
+ if (targetpart == currtarget)
+ {
+ /* Found a match, so add any new RT indexes to this hierarchy */
+ currpartrelids = bms_add_members(currpartrelids, partrelids);
+ lfirst(lc) = currpartrelids;
+ return allpartrelids;
+ }
+ }
+ /* No match, so add the new partition hierarchy to the list */
+ return lappend(allpartrelids, partrelids);
+}
+
+/*
+ * make_partitionedrel_pruneinfo
+ * Build a List of PartitionedRelPruneInfos, one for each interesting
+ * partitioned rel in a partitioning hierarchy. These can be used in the
+ * executor to allow additional partition pruning to take place.
+ *
+ * parentrel: rel associated with the appendpath being considered
+ * prunequal: potential pruning quals, represented for parentrel
+ * partrelids: Set of RT indexes identifying relevant partitioned tables
+ * within a single partitioning hierarchy
+ * relid_subplan_map[]: maps child relation relids to subplan indexes
+ * matchedsubplans: on success, receives the set of subplan indexes which
+ * were matched to this partition hierarchy
+ *
+ * If we cannot find any useful run-time pruning steps, return NIL.
+ * However, on success, each rel identified in partrelids will have
+ * an element in the result list, even if some of them are useless.
+ */
+static List *
+make_partitionedrel_pruneinfo(PlannerInfo *root, RelOptInfo *parentrel,
+ List *prunequal,
+ Bitmapset *partrelids,
+ int *relid_subplan_map,
+ Bitmapset **matchedsubplans)
+{
+ RelOptInfo *targetpart = NULL;
+ List *pinfolist = NIL;
+ bool doruntimeprune = false;
+ int *relid_subpart_map;
+ Bitmapset *subplansfound = NULL;
+ ListCell *lc;
+ int rti;
+ int i;
+
+ /*
+ * Examine each partitioned rel, constructing a temporary array to map
+ * from planner relids to index of the partitioned rel, and building a
+ * PartitionedRelPruneInfo for each partitioned rel.
+ *
+ * In this phase we discover whether runtime pruning is needed at all; if
+ * not, we can avoid doing further work.
+ */
+ relid_subpart_map = palloc0(sizeof(int) * root->simple_rel_array_size);
+
+ i = 1;
+ rti = -1;
+ while ((rti = bms_next_member(partrelids, rti)) > 0)
+ {
+ RelOptInfo *subpart = find_base_rel(root, rti);
+ PartitionedRelPruneInfo *pinfo;
+ List *partprunequal;
+ List *initial_pruning_steps;
+ List *exec_pruning_steps;
+ Bitmapset *execparamids;
+ GeneratePruningStepsContext context;
+
+ /*
+ * Fill the mapping array.
+ *
+ * relid_subpart_map maps relid of a non-leaf partition to the index
+ * in the returned PartitionedRelPruneInfo list of the info for that
+ * partition. We use 1-based indexes here, so that zero can represent
+ * an un-filled array entry.
+ */
+ Assert(rti < root->simple_rel_array_size);
+ relid_subpart_map[rti] = i++;
+
+ /*
+ * Translate pruning qual, if necessary, for this partition.
+ *
+ * The first item in the list is the target partitioned relation.
+ */
+ if (!targetpart)
+ {
+ targetpart = subpart;
+
+ /*
+ * The prunequal is presented to us as a qual for 'parentrel'.
+ * Frequently this rel is the same as targetpart, so we can skip
+ * an adjust_appendrel_attrs step. But it might not be, and then
+ * we have to translate. We update the prunequal parameter here,
+ * because in later iterations of the loop for child partitions,
+ * we want to translate from parent to child variables.
+ */
+ if (!bms_equal(parentrel->relids, subpart->relids))
+ {
+ int nappinfos;
+ AppendRelInfo **appinfos = find_appinfos_by_relids(root,
+ subpart->relids,
+ &nappinfos);
+
+ prunequal = (List *) adjust_appendrel_attrs(root, (Node *)
+ prunequal,
+ nappinfos,
+ appinfos);
+
+ pfree(appinfos);
+ }
+
+ partprunequal = prunequal;
+ }
+ else
+ {
+ /*
+ * For sub-partitioned tables the columns may not be in the same
+ * order as the parent, so we must translate the prunequal to make
+ * it compatible with this relation.
+ */
+ partprunequal = (List *)
+ adjust_appendrel_attrs_multilevel(root,
+ (Node *) prunequal,
+ subpart->relids,
+ targetpart->relids);
+ }
+
+ /*
+ * Convert pruning qual to pruning steps. We may need to do this
+ * twice, once to obtain executor startup pruning steps, and once for
+ * executor per-scan pruning steps. This first pass creates startup
+ * pruning steps and detects whether there's any possibly-useful quals
+ * that would require per-scan pruning.
+ */
+ gen_partprune_steps(subpart, partprunequal, PARTTARGET_INITIAL,
+ &context);
+
+ if (context.contradictory)
+ {
+ /*
+ * This shouldn't happen as the planner should have detected this
+ * earlier. However, we do use additional quals from parameterized
+ * paths here. These do only compare Params to the partition key,
+ * so this shouldn't cause the discovery of any new qual
+ * contradictions that were not previously discovered as the Param
+ * values are unknown during planning. Anyway, we'd better do
+ * something sane here, so let's just disable run-time pruning.
+ */
+ return NIL;
+ }
+
+ /*
+ * If no mutable operators or expressions appear in usable pruning
+ * clauses, then there's no point in running startup pruning, because
+ * plan-time pruning should have pruned everything prunable.
+ */
+ if (context.has_mutable_op || context.has_mutable_arg)
+ initial_pruning_steps = context.steps;
+ else
+ initial_pruning_steps = NIL;
+
+ /*
+ * If no exec Params appear in potentially-usable pruning clauses,
+ * then there's no point in even thinking about per-scan pruning.
+ */
+ if (context.has_exec_param)
+ {
+ /* ... OK, we'd better think about it */
+ gen_partprune_steps(subpart, partprunequal, PARTTARGET_EXEC,
+ &context);
+
+ if (context.contradictory)
+ {
+ /* As above, skip run-time pruning if anything fishy happens */
+ return NIL;
+ }
+
+ exec_pruning_steps = context.steps;
+
+ /*
+ * Detect which exec Params actually got used; the fact that some
+ * were in available clauses doesn't mean we actually used them.
+ * Skip per-scan pruning if there are none.
+ */
+ execparamids = get_partkey_exec_paramids(exec_pruning_steps);
+
+ if (bms_is_empty(execparamids))
+ exec_pruning_steps = NIL;
+ }
+ else
+ {
+ /* No exec Params anywhere, so forget about scan-time pruning */
+ exec_pruning_steps = NIL;
+ execparamids = NULL;
+ }
+
+ if (initial_pruning_steps || exec_pruning_steps)
+ doruntimeprune = true;
+
+ /* Begin constructing the PartitionedRelPruneInfo for this rel */
+ pinfo = makeNode(PartitionedRelPruneInfo);
+ pinfo->rtindex = rti;
+ pinfo->initial_pruning_steps = initial_pruning_steps;
+ pinfo->exec_pruning_steps = exec_pruning_steps;
+ pinfo->execparamids = execparamids;
+ /* Remaining fields will be filled in the next loop */
+
+ pinfolist = lappend(pinfolist, pinfo);
+ }
+
+ if (!doruntimeprune)
+ {
+ /* No run-time pruning required. */
+ pfree(relid_subpart_map);
+ return NIL;
+ }
+
+ /*
+ * Run-time pruning will be required, so initialize other information.
+ * That includes two maps -- one needed to convert partition indexes of
+ * leaf partitions to the indexes of their subplans in the subplan list,
+ * another needed to convert partition indexes of sub-partitioned
+ * partitions to the indexes of their PartitionedRelPruneInfo in the
+ * PartitionedRelPruneInfo list.
+ */
+ foreach(lc, pinfolist)
+ {
+ PartitionedRelPruneInfo *pinfo = lfirst(lc);
+ RelOptInfo *subpart = find_base_rel(root, pinfo->rtindex);
+ Bitmapset *present_parts;
+ int nparts = subpart->nparts;
+ int *subplan_map;
+ int *subpart_map;
+ Oid *relid_map;
+
+ /*
+ * Construct the subplan and subpart maps for this partitioning level.
+ * Here we convert to zero-based indexes, with -1 for empty entries.
+ * Also construct a Bitmapset of all partitions that are present (that
+ * is, not pruned already).
+ */
+ subplan_map = (int *) palloc(nparts * sizeof(int));
+ memset(subplan_map, -1, nparts * sizeof(int));
+ subpart_map = (int *) palloc(nparts * sizeof(int));
+ memset(subpart_map, -1, nparts * sizeof(int));
+ relid_map = (Oid *) palloc0(nparts * sizeof(Oid));
+ present_parts = NULL;
+
+ i = -1;
+ while ((i = bms_next_member(subpart->live_parts, i)) >= 0)
+ {
+ RelOptInfo *partrel = subpart->part_rels[i];
+ int subplanidx;
+ int subpartidx;
+
+ Assert(partrel != NULL);
+
+ subplan_map[i] = subplanidx = relid_subplan_map[partrel->relid] - 1;
+ subpart_map[i] = subpartidx = relid_subpart_map[partrel->relid] - 1;
+ relid_map[i] = planner_rt_fetch(partrel->relid, root)->relid;
+ if (subplanidx >= 0)
+ {
+ present_parts = bms_add_member(present_parts, i);
+
+ /* Record finding this subplan */
+ subplansfound = bms_add_member(subplansfound, subplanidx);
+ }
+ else if (subpartidx >= 0)
+ present_parts = bms_add_member(present_parts, i);
+ }
+
+ /*
+ * Ensure there were no stray PartitionedRelPruneInfo generated for
+ * partitioned tables that we have no sub-paths or
+ * sub-PartitionedRelPruneInfo for.
+ */
+ Assert(!bms_is_empty(present_parts));
+
+ /* Record the maps and other information. */
+ pinfo->present_parts = present_parts;
+ pinfo->nparts = nparts;
+ pinfo->subplan_map = subplan_map;
+ pinfo->subpart_map = subpart_map;
+ pinfo->relid_map = relid_map;
+ }
+
+ pfree(relid_subpart_map);
+
+ *matchedsubplans = subplansfound;
+
+ return pinfolist;
+}
+
+/*
+ * gen_partprune_steps
+ * Process 'clauses' (typically a rel's baserestrictinfo list of clauses)
+ * and create a list of "partition pruning steps".
+ *
+ * 'target' tells whether to generate pruning steps for planning (use
+ * immutable clauses only), or for executor startup (use any allowable
+ * clause except ones containing PARAM_EXEC Params), or for executor
+ * per-scan pruning (use any allowable clause).
+ *
+ * 'context' is an output argument that receives the steps list as well as
+ * some subsidiary flags; see the GeneratePruningStepsContext typedef.
+ */
+static void
+gen_partprune_steps(RelOptInfo *rel, List *clauses, PartClauseTarget target,
+ GeneratePruningStepsContext *context)
+{
+ /* Initialize all output values to zero/false/NULL */
+ memset(context, 0, sizeof(GeneratePruningStepsContext));
+ context->rel = rel;
+ context->target = target;
+
+ /*
+ * If this partitioned table is in turn a partition, and it shares any
+ * partition keys with its parent, then it's possible that the hierarchy
+ * allows the parent a narrower range of values than some of its
+ * partitions (particularly the default one). This is normally not
+ * useful, but it can be to prune the default partition.
+ */
+ if (partition_bound_has_default(rel->boundinfo) && rel->partition_qual)
+ {
+ /* Make a copy to avoid modifying the passed-in List */
+ clauses = list_concat_copy(clauses, rel->partition_qual);
+ }
+
+ /* Down into the rabbit-hole. */
+ (void) gen_partprune_steps_internal(context, clauses);
+}
+
+/*
+ * prune_append_rel_partitions
+ * Process rel's baserestrictinfo and make use of quals which can be
+ * evaluated during query planning in order to determine the minimum set
+ * of partitions which must be scanned to satisfy these quals. Returns
+ * the matching partitions in the form of a Bitmapset containing the
+ * partitions' indexes in the rel's part_rels array.
+ *
+ * Callers must ensure that 'rel' is a partitioned table.
+ */
+Bitmapset *
+prune_append_rel_partitions(RelOptInfo *rel)
+{
+ List *clauses = rel->baserestrictinfo;
+ List *pruning_steps;
+ GeneratePruningStepsContext gcontext;
+ PartitionPruneContext context;
+
+ Assert(rel->part_scheme != NULL);
+
+ /* If there are no partitions, return the empty set */
+ if (rel->nparts == 0)
+ return NULL;
+
+ /*
+ * If pruning is disabled or if there are no clauses to prune with, return
+ * all partitions.
+ */
+ if (!enable_partition_pruning || clauses == NIL)
+ return bms_add_range(NULL, 0, rel->nparts - 1);
+
+ /*
+ * Process clauses to extract pruning steps that are usable at plan time.
+ * If the clauses are found to be contradictory, we can return the empty
+ * set.
+ */
+ gen_partprune_steps(rel, clauses, PARTTARGET_PLANNER,
+ &gcontext);
+ if (gcontext.contradictory)
+ return NULL;
+ pruning_steps = gcontext.steps;
+
+ /* If there's nothing usable, return all partitions */
+ if (pruning_steps == NIL)
+ return bms_add_range(NULL, 0, rel->nparts - 1);
+
+ /* Set up PartitionPruneContext */
+ context.strategy = rel->part_scheme->strategy;
+ context.partnatts = rel->part_scheme->partnatts;
+ context.nparts = rel->nparts;
+ context.boundinfo = rel->boundinfo;
+ context.partcollation = rel->part_scheme->partcollation;
+ context.partsupfunc = rel->part_scheme->partsupfunc;
+ context.stepcmpfuncs = (FmgrInfo *) palloc0(sizeof(FmgrInfo) *
+ context.partnatts *
+ list_length(pruning_steps));
+ context.ppccontext = CurrentMemoryContext;
+
+ /* These are not valid when being called from the planner */
+ context.planstate = NULL;
+ context.exprcontext = NULL;
+ context.exprstates = NULL;
+
+ /* Actual pruning happens here. */
+ return get_matching_partitions(&context, pruning_steps);
+}
+
+/*
+ * get_matching_partitions
+ * Determine partitions that survive partition pruning
+ *
+ * Note: context->exprcontext must be valid when the pruning_steps were
+ * generated with a target other than PARTTARGET_PLANNER.
+ *
+ * Returns a Bitmapset of the RelOptInfo->part_rels indexes of the surviving
+ * partitions.
+ */
+Bitmapset *
+get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
+{
+ Bitmapset *result;
+ int num_steps = list_length(pruning_steps),
+ i;
+ PruneStepResult **results,
+ *final_result;
+ ListCell *lc;
+ bool scan_default;
+
+ /* If there are no pruning steps then all partitions match. */
+ if (num_steps == 0)
+ {
+ Assert(context->nparts > 0);
+ return bms_add_range(NULL, 0, context->nparts - 1);
+ }
+
+ /*
+ * Allocate space for individual pruning steps to store its result. Each
+ * slot will hold a PruneStepResult after performing a given pruning step.
+ * Later steps may use the result of one or more earlier steps. The
+ * result of applying all pruning steps is the value contained in the slot
+ * of the last pruning step.
+ */
+ results = (PruneStepResult **)
+ palloc0(num_steps * sizeof(PruneStepResult *));
+ foreach(lc, pruning_steps)
+ {
+ PartitionPruneStep *step = lfirst(lc);
+
+ switch (nodeTag(step))
+ {
+ case T_PartitionPruneStepOp:
+ results[step->step_id] =
+ perform_pruning_base_step(context,
+ (PartitionPruneStepOp *) step);
+ break;
+
+ case T_PartitionPruneStepCombine:
+ results[step->step_id] =
+ perform_pruning_combine_step(context,
+ (PartitionPruneStepCombine *) step,
+ results);
+ break;
+
+ default:
+ elog(ERROR, "invalid pruning step type: %d",
+ (int) nodeTag(step));
+ }
+ }
+
+ /*
+ * At this point we know the offsets of all the datums whose corresponding
+ * partitions need to be in the result, including special null-accepting
+ * and default partitions. Collect the actual partition indexes now.
+ */
+ final_result = results[num_steps - 1];
+ Assert(final_result != NULL);
+ i = -1;
+ result = NULL;
+ scan_default = final_result->scan_default;
+ while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
+ {
+ int partindex;
+
+ Assert(i < context->boundinfo->nindexes);
+ partindex = context->boundinfo->indexes[i];
+
+ if (partindex < 0)
+ {
+ /*
+ * In range partitioning cases, if a partition index is -1 it
+ * means that the bound at the offset is the upper bound for a
+ * range not covered by any partition (other than a possible
+ * default partition). In hash partitioning, the same means no
+ * partition has been defined for the corresponding remainder
+ * value.
+ *
+ * In either case, the value is still part of the queried range of
+ * values, so mark to scan the default partition if one exists.
+ */
+ scan_default |= partition_bound_has_default(context->boundinfo);
+ continue;
+ }
+
+ result = bms_add_member(result, partindex);
+ }
+
+ /* Add the null and/or default partition if needed and present. */
+ if (final_result->scan_null)
+ {
+ Assert(context->strategy == PARTITION_STRATEGY_LIST);
+ Assert(partition_bound_accepts_nulls(context->boundinfo));
+ result = bms_add_member(result, context->boundinfo->null_index);
+ }
+ if (scan_default)
+ {
+ Assert(context->strategy == PARTITION_STRATEGY_LIST ||
+ context->strategy == PARTITION_STRATEGY_RANGE);
+ Assert(partition_bound_has_default(context->boundinfo));
+ result = bms_add_member(result, context->boundinfo->default_index);
+ }
+
+ return result;
+}
+
+/*
+ * gen_partprune_steps_internal
+ * Processes 'clauses' to generate a List of partition pruning steps. We
+ * return NIL when no steps were generated.
+ *
+ * These partition pruning steps come in 2 forms; operator steps and combine
+ * steps.
+ *
+ * Operator steps (PartitionPruneStepOp) contain details of clauses that we
+ * determined that we can use for partition pruning. These contain details of
+ * the expression which is being compared to the partition key and the
+ * comparison function.
+ *
+ * Combine steps (PartitionPruneStepCombine) instruct the partition pruning
+ * code how it should produce a single set of partitions from multiple input
+ * operator and other combine steps. A PARTPRUNE_COMBINE_INTERSECT type
+ * combine step will merge its input steps to produce a result which only
+ * contains the partitions which are present in all of the input operator
+ * steps. A PARTPRUNE_COMBINE_UNION combine step will produce a result that
+ * has all of the partitions from each of the input operator steps.
+ *
+ * For BoolExpr clauses, each argument is processed recursively. Steps
+ * generated from processing an OR BoolExpr will be combined using
+ * PARTPRUNE_COMBINE_UNION. AND BoolExprs get combined using
+ * PARTPRUNE_COMBINE_INTERSECT.
+ *
+ * Otherwise, the list of clauses we receive we assume to be mutually ANDed.
+ * We generate all of the pruning steps we can based on these clauses and then
+ * at the end, if we have more than 1 step, we combine each step with a
+ * PARTPRUNE_COMBINE_INTERSECT combine step. Single steps are returned as-is.
+ *
+ * If we find clauses that are mutually contradictory, or contradictory with
+ * the partitioning constraint, or a pseudoconstant clause that contains
+ * false, we set context->contradictory to true and return NIL (that is, no
+ * pruning steps). Caller should consider all partitions as pruned in that
+ * case.
+ */
+static List *
+gen_partprune_steps_internal(GeneratePruningStepsContext *context,
+ List *clauses)
+{
+ PartitionScheme part_scheme = context->rel->part_scheme;
+ List *keyclauses[PARTITION_MAX_KEYS];
+ Bitmapset *nullkeys = NULL,
+ *notnullkeys = NULL;
+ bool generate_opsteps = false;
+ List *result = NIL;
+ ListCell *lc;
+
+ /*
+ * If this partitioned relation has a default partition and is itself a
+ * partition (as evidenced by partition_qual being not NIL), we first
+ * check if the clauses contradict the partition constraint. If they do,
+ * there's no need to generate any steps as it'd already be proven that no
+ * partitions need to be scanned.
+ *
+ * This is a measure of last resort only to be used because the default
+ * partition cannot be pruned using the steps generated from clauses that
+ * contradict the parent's partition constraint; regular pruning, which is
+ * cheaper, is sufficient when no default partition exists.
+ */
+ if (partition_bound_has_default(context->rel->boundinfo) &&
+ predicate_refuted_by(context->rel->partition_qual, clauses, false))
+ {
+ context->contradictory = true;
+ return NIL;
+ }
+
+ memset(keyclauses, 0, sizeof(keyclauses));
+ foreach(lc, clauses)
+ {
+ Expr *clause = (Expr *) lfirst(lc);
+ int i;
+
+ /* Look through RestrictInfo, if any */
+ if (IsA(clause, RestrictInfo))
+ clause = ((RestrictInfo *) clause)->clause;
+
+ /* Constant-false-or-null is contradictory */
+ if (IsA(clause, Const) &&
+ (((Const *) clause)->constisnull ||
+ !DatumGetBool(((Const *) clause)->constvalue)))
+ {
+ context->contradictory = true;
+ return NIL;
+ }
+
+ /* Get the BoolExpr's out of the way. */
+ if (IsA(clause, BoolExpr))
+ {
+ /*
+ * Generate steps for arguments.
+ *
+ * While steps generated for the arguments themselves will be
+ * added to context->steps during recursion and will be evaluated
+ * independently, collect their step IDs to be stored in the
+ * combine step we'll be creating.
+ */
+ if (is_orclause(clause))
+ {
+ List *arg_stepids = NIL;
+ bool all_args_contradictory = true;
+ ListCell *lc1;
+
+ /*
+ * We can share the outer context area with the recursive
+ * call, but contradictory had better not be true yet.
+ */
+ Assert(!context->contradictory);
+
+ /*
+ * Get pruning step for each arg. If we get contradictory for
+ * all args, it means the OR expression is false as a whole.
+ */
+ foreach(lc1, ((BoolExpr *) clause)->args)
+ {
+ Expr *arg = lfirst(lc1);
+ bool arg_contradictory;
+ List *argsteps;
+
+ argsteps = gen_partprune_steps_internal(context,
+ list_make1(arg));
+ arg_contradictory = context->contradictory;
+ /* Keep context->contradictory clear till we're done */
+ context->contradictory = false;
+
+ if (arg_contradictory)
+ {
+ /* Just ignore self-contradictory arguments. */
+ continue;
+ }
+ else
+ all_args_contradictory = false;
+
+ if (argsteps != NIL)
+ {
+ /*
+ * gen_partprune_steps_internal() always adds a single
+ * combine step when it generates multiple steps, so
+ * here we can just pay attention to the last one in
+ * the list. If it just generated one, then the last
+ * one in the list is still the one we want.
+ */
+ PartitionPruneStep *last = llast(argsteps);
+
+ arg_stepids = lappend_int(arg_stepids, last->step_id);
+ }
+ else
+ {
+ PartitionPruneStep *orstep;
+
+ /*
+ * The arg didn't contain a clause matching this
+ * partition key. We cannot prune using such an arg.
+ * To indicate that to the pruning code, we must
+ * construct a dummy PartitionPruneStepCombine whose
+ * source_stepids is set to an empty List.
+ */
+ orstep = gen_prune_step_combine(context, NIL,
+ PARTPRUNE_COMBINE_UNION);
+ arg_stepids = lappend_int(arg_stepids, orstep->step_id);
+ }
+ }
+
+ /* If all the OR arms are contradictory, we can stop */
+ if (all_args_contradictory)
+ {
+ context->contradictory = true;
+ return NIL;
+ }
+
+ if (arg_stepids != NIL)
+ {
+ PartitionPruneStep *step;
+
+ step = gen_prune_step_combine(context, arg_stepids,
+ PARTPRUNE_COMBINE_UNION);
+ result = lappend(result, step);
+ }
+ continue;
+ }
+ else if (is_andclause(clause))
+ {
+ List *args = ((BoolExpr *) clause)->args;
+ List *argsteps;
+
+ /*
+ * args may itself contain clauses of arbitrary type, so just
+ * recurse and later combine the component partitions sets
+ * using a combine step.
+ */
+ argsteps = gen_partprune_steps_internal(context, args);
+
+ /* If any AND arm is contradictory, we can stop immediately */
+ if (context->contradictory)
+ return NIL;
+
+ /*
+ * gen_partprune_steps_internal() always adds a single combine
+ * step when it generates multiple steps, so here we can just
+ * pay attention to the last one in the list. If it just
+ * generated one, then the last one in the list is still the
+ * one we want.
+ */
+ if (argsteps != NIL)
+ result = lappend(result, llast(argsteps));
+
+ continue;
+ }
+
+ /*
+ * Fall-through for a NOT clause, which if it's a Boolean clause,
+ * will be handled in match_clause_to_partition_key(). We
+ * currently don't perform any pruning for more complex NOT
+ * clauses.
+ */
+ }
+
+ /*
+ * See if we can match this clause to any of the partition keys.
+ */
+ for (i = 0; i < part_scheme->partnatts; i++)
+ {
+ Expr *partkey = linitial(context->rel->partexprs[i]);
+ bool clause_is_not_null = false;
+ PartClauseInfo *pc = NULL;
+ List *clause_steps = NIL;
+
+ switch (match_clause_to_partition_key(context,
+ clause, partkey, i,
+ &clause_is_not_null,
+ &pc, &clause_steps))
+ {
+ case PARTCLAUSE_MATCH_CLAUSE:
+ Assert(pc != NULL);
+
+ /*
+ * Since we only allow strict operators, check for any
+ * contradicting IS NULL.
+ */
+ if (bms_is_member(i, nullkeys))
+ {
+ context->contradictory = true;
+ return NIL;
+ }
+ generate_opsteps = true;
+ keyclauses[i] = lappend(keyclauses[i], pc);
+ break;
+
+ case PARTCLAUSE_MATCH_NULLNESS:
+ if (!clause_is_not_null)
+ {
+ /*
+ * check for conflicting IS NOT NULL as well as
+ * contradicting strict clauses
+ */
+ if (bms_is_member(i, notnullkeys) ||
+ keyclauses[i] != NIL)
+ {
+ context->contradictory = true;
+ return NIL;
+ }
+ nullkeys = bms_add_member(nullkeys, i);
+ }
+ else
+ {
+ /* check for conflicting IS NULL */
+ if (bms_is_member(i, nullkeys))
+ {
+ context->contradictory = true;
+ return NIL;
+ }
+ notnullkeys = bms_add_member(notnullkeys, i);
+ }
+ break;
+
+ case PARTCLAUSE_MATCH_STEPS:
+ Assert(clause_steps != NIL);
+ result = list_concat(result, clause_steps);
+ break;
+
+ case PARTCLAUSE_MATCH_CONTRADICT:
+ /* We've nothing more to do if a contradiction was found. */
+ context->contradictory = true;
+ return NIL;
+
+ case PARTCLAUSE_NOMATCH:
+
+ /*
+ * Clause didn't match this key, but it might match the
+ * next one.
+ */
+ continue;
+
+ case PARTCLAUSE_UNSUPPORTED:
+ /* This clause cannot be used for pruning. */
+ break;
+ }
+
+ /* done; go check the next clause. */
+ break;
+ }
+ }
+
+ /*-----------
+ * Now generate some (more) pruning steps. We have three strategies:
+ *
+ * 1) Generate pruning steps based on IS NULL clauses:
+ * a) For list partitioning, null partition keys can only be found in
+ * the designated null-accepting partition, so if there are IS NULL
+ * clauses containing partition keys we should generate a pruning
+ * step that gets rid of all partitions but that one. We can
+ * disregard any OpExpr we may have found.
+ * b) For range partitioning, only the default partition can contain
+ * NULL values, so the same rationale applies.
+ * c) For hash partitioning, we only apply this strategy if we have
+ * IS NULL clauses for all the keys. Strategy 2 below will take
+ * care of the case where some keys have OpExprs and others have
+ * IS NULL clauses.
+ *
+ * 2) If not, generate steps based on OpExprs we have (if any).
+ *
+ * 3) If this doesn't work either, we may be able to generate steps to
+ * prune just the null-accepting partition (if one exists), if we have
+ * IS NOT NULL clauses for all partition keys.
+ */
+ if (!bms_is_empty(nullkeys) &&
+ (part_scheme->strategy == PARTITION_STRATEGY_LIST ||
+ part_scheme->strategy == PARTITION_STRATEGY_RANGE ||
+ (part_scheme->strategy == PARTITION_STRATEGY_HASH &&
+ bms_num_members(nullkeys) == part_scheme->partnatts)))
+ {
+ PartitionPruneStep *step;
+
+ /* Strategy 1 */
+ step = gen_prune_step_op(context, InvalidStrategy,
+ false, NIL, NIL, nullkeys);
+ result = lappend(result, step);
+ }
+ else if (generate_opsteps)
+ {
+ List *opsteps;
+
+ /* Strategy 2 */
+ opsteps = gen_prune_steps_from_opexps(context, keyclauses, nullkeys);
+ result = list_concat(result, opsteps);
+ }
+ else if (bms_num_members(notnullkeys) == part_scheme->partnatts)
+ {
+ PartitionPruneStep *step;
+
+ /* Strategy 3 */
+ step = gen_prune_step_op(context, InvalidStrategy,
+ false, NIL, NIL, NULL);
+ result = lappend(result, step);
+ }
+
+ /*
+ * Finally, if there are multiple steps, since the 'clauses' are mutually
+ * ANDed, add an INTERSECT step to combine the partition sets resulting
+ * from them and append it to the result list.
+ */
+ if (list_length(result) > 1)
+ {
+ List *step_ids = NIL;
+ PartitionPruneStep *final;
+
+ foreach(lc, result)
+ {
+ PartitionPruneStep *step = lfirst(lc);
+
+ step_ids = lappend_int(step_ids, step->step_id);
+ }
+
+ final = gen_prune_step_combine(context, step_ids,
+ PARTPRUNE_COMBINE_INTERSECT);
+ result = lappend(result, final);
+ }
+
+ return result;
+}
+
+/*
+ * gen_prune_step_op
+ * Generate a pruning step for a specific operator
+ *
+ * The step is assigned a unique step identifier and added to context's 'steps'
+ * list.
+ */
+static PartitionPruneStep *
+gen_prune_step_op(GeneratePruningStepsContext *context,
+ StrategyNumber opstrategy, bool op_is_ne,
+ List *exprs, List *cmpfns,
+ Bitmapset *nullkeys)
+{
+ PartitionPruneStepOp *opstep = makeNode(PartitionPruneStepOp);
+
+ opstep->step.step_id = context->next_step_id++;
+
+ /*
+ * For clauses that contain an <> operator, set opstrategy to
+ * InvalidStrategy to signal get_matching_list_bounds to do the right
+ * thing.
+ */
+ opstep->opstrategy = op_is_ne ? InvalidStrategy : opstrategy;
+ Assert(list_length(exprs) == list_length(cmpfns));
+ opstep->exprs = exprs;
+ opstep->cmpfns = cmpfns;
+ opstep->nullkeys = nullkeys;
+
+ context->steps = lappend(context->steps, opstep);
+
+ return (PartitionPruneStep *) opstep;
+}
+
+/*
+ * gen_prune_step_combine
+ * Generate a pruning step for a combination of several other steps
+ *
+ * The step is assigned a unique step identifier and added to context's
+ * 'steps' list.
+ */
+static PartitionPruneStep *
+gen_prune_step_combine(GeneratePruningStepsContext *context,
+ List *source_stepids,
+ PartitionPruneCombineOp combineOp)
+{
+ PartitionPruneStepCombine *cstep = makeNode(PartitionPruneStepCombine);
+
+ cstep->step.step_id = context->next_step_id++;
+ cstep->combineOp = combineOp;
+ cstep->source_stepids = source_stepids;
+
+ context->steps = lappend(context->steps, cstep);
+
+ return (PartitionPruneStep *) cstep;
+}
+
+/*
+ * gen_prune_steps_from_opexps
+ * Generate and return a list of PartitionPruneStepOp that are based on
+ * OpExpr and BooleanTest clauses that have been matched to the partition
+ * key.
+ *
+ * 'keyclauses' is an array of List pointers, indexed by the partition key's
+ * index. Each List element in the array can contain clauses that match to
+ * the corresponding partition key column. Partition key columns without any
+ * matched clauses will have an empty List.
+ *
+ * Some partitioning strategies allow pruning to still occur when we only have
+ * clauses for a prefix of the partition key columns, for example, RANGE
+ * partitioning. Other strategies, such as HASH partitioning, require clauses
+ * for all partition key columns.
+ *
+ * When we return multiple pruning steps here, it's up to the caller to add a
+ * relevant "combine" step to combine the returned steps. This is not done
+ * here as callers may wish to include additional pruning steps before
+ * combining them all.
+ */
+static List *
+gen_prune_steps_from_opexps(GeneratePruningStepsContext *context,
+ List **keyclauses, Bitmapset *nullkeys)
+{
+ PartitionScheme part_scheme = context->rel->part_scheme;
+ List *opsteps = NIL;
+ List *btree_clauses[BTMaxStrategyNumber + 1],
+ *hash_clauses[HTMaxStrategyNumber + 1];
+ int i;
+ ListCell *lc;
+
+ memset(btree_clauses, 0, sizeof(btree_clauses));
+ memset(hash_clauses, 0, sizeof(hash_clauses));
+ for (i = 0; i < part_scheme->partnatts; i++)
+ {
+ List *clauselist = keyclauses[i];
+ bool consider_next_key = true;
+
+ /*
+ * For range partitioning, if we have no clauses for the current key,
+ * we can't consider any later keys either, so we can stop here.
+ */
+ if (part_scheme->strategy == PARTITION_STRATEGY_RANGE &&
+ clauselist == NIL)
+ break;
+
+ /*
+ * For hash partitioning, if a column doesn't have the necessary
+ * equality clause, there should be an IS NULL clause, otherwise
+ * pruning is not possible.
+ */
+ if (part_scheme->strategy == PARTITION_STRATEGY_HASH &&
+ clauselist == NIL && !bms_is_member(i, nullkeys))
+ return NIL;
+
+ foreach(lc, clauselist)
+ {
+ PartClauseInfo *pc = (PartClauseInfo *) lfirst(lc);
+ Oid lefttype,
+ righttype;
+
+ /* Look up the operator's btree/hash strategy number. */
+ if (pc->op_strategy == InvalidStrategy)
+ get_op_opfamily_properties(pc->opno,
+ part_scheme->partopfamily[i],
+ false,
+ &pc->op_strategy,
+ &lefttype,
+ &righttype);
+
+ switch (part_scheme->strategy)
+ {
+ case PARTITION_STRATEGY_LIST:
+ case PARTITION_STRATEGY_RANGE:
+ btree_clauses[pc->op_strategy] =
+ lappend(btree_clauses[pc->op_strategy], pc);
+
+ /*
+ * We can't consider subsequent partition keys if the
+ * clause for the current key contains a non-inclusive
+ * operator.
+ */
+ if (pc->op_strategy == BTLessStrategyNumber ||
+ pc->op_strategy == BTGreaterStrategyNumber)
+ consider_next_key = false;
+ break;
+
+ case PARTITION_STRATEGY_HASH:
+ if (pc->op_strategy != HTEqualStrategyNumber)
+ elog(ERROR, "invalid clause for hash partitioning");
+ hash_clauses[pc->op_strategy] =
+ lappend(hash_clauses[pc->op_strategy], pc);
+ break;
+
+ default:
+ elog(ERROR, "invalid partition strategy: %c",
+ part_scheme->strategy);
+ break;
+ }
+ }
+
+ /*
+ * If we've decided that clauses for subsequent partition keys
+ * wouldn't be useful for pruning, don't search any further.
+ */
+ if (!consider_next_key)
+ break;
+ }
+
+ /*
+ * Now, we have divided clauses according to their operator strategies.
+ * Check for each strategy if we can generate pruning step(s) by
+ * collecting a list of expressions whose values will constitute a vector
+ * that can be used as a lookup key by a partition bound searching
+ * function.
+ */
+ switch (part_scheme->strategy)
+ {
+ case PARTITION_STRATEGY_LIST:
+ case PARTITION_STRATEGY_RANGE:
+ {
+ List *eq_clauses = btree_clauses[BTEqualStrategyNumber];
+ List *le_clauses = btree_clauses[BTLessEqualStrategyNumber];
+ List *ge_clauses = btree_clauses[BTGreaterEqualStrategyNumber];
+ int strat;
+
+ /*
+ * For each clause under consideration for a given strategy,
+ * we collect expressions from clauses for earlier keys, whose
+ * operator strategy is inclusive, into a list called
+ * 'prefix'. By appending the clause's own expression to the
+ * 'prefix', we'll generate one step using the so generated
+ * vector and assign the current strategy to it. Actually,
+ * 'prefix' might contain multiple clauses for the same key,
+ * in which case, we must generate steps for various
+ * combinations of expressions of different keys, which
+ * get_steps_using_prefix takes care of for us.
+ */
+ for (strat = 1; strat <= BTMaxStrategyNumber; strat++)
+ {
+ foreach(lc, btree_clauses[strat])
+ {
+ PartClauseInfo *pc = lfirst(lc);
+ ListCell *eq_start;
+ ListCell *le_start;
+ ListCell *ge_start;
+ ListCell *lc1;
+ List *prefix = NIL;
+ List *pc_steps;
+ bool prefix_valid = true;
+ bool pk_has_clauses;
+ int keyno;
+
+ /*
+ * If this is a clause for the first partition key,
+ * there are no preceding expressions; generate a
+ * pruning step without a prefix.
+ *
+ * Note that we pass NULL for step_nullkeys, because
+ * we don't search list/range partition bounds where
+ * some keys are NULL.
+ */
+ if (pc->keyno == 0)
+ {
+ Assert(pc->op_strategy == strat);
+ pc_steps = get_steps_using_prefix(context, strat,
+ pc->op_is_ne,
+ pc->expr,
+ pc->cmpfn,
+ NULL,
+ NIL);
+ opsteps = list_concat(opsteps, pc_steps);
+ continue;
+ }
+
+ eq_start = list_head(eq_clauses);
+ le_start = list_head(le_clauses);
+ ge_start = list_head(ge_clauses);
+
+ /*
+ * We arrange clauses into prefix in ascending order
+ * of their partition key numbers.
+ */
+ for (keyno = 0; keyno < pc->keyno; keyno++)
+ {
+ pk_has_clauses = false;
+
+ /*
+ * Expressions from = clauses can always be in the
+ * prefix, provided they're from an earlier key.
+ */
+ for_each_cell(lc1, eq_clauses, eq_start)
+ {
+ PartClauseInfo *eqpc = lfirst(lc1);
+
+ if (eqpc->keyno == keyno)
+ {
+ prefix = lappend(prefix, eqpc);
+ pk_has_clauses = true;
+ }
+ else
+ {
+ Assert(eqpc->keyno > keyno);
+ break;
+ }
+ }
+ eq_start = lc1;
+
+ /*
+ * If we're generating steps for </<= strategy, we
+ * can add other <= clauses to the prefix,
+ * provided they're from an earlier key.
+ */
+ if (strat == BTLessStrategyNumber ||
+ strat == BTLessEqualStrategyNumber)
+ {
+ for_each_cell(lc1, le_clauses, le_start)
+ {
+ PartClauseInfo *lepc = lfirst(lc1);
+
+ if (lepc->keyno == keyno)
+ {
+ prefix = lappend(prefix, lepc);
+ pk_has_clauses = true;
+ }
+ else
+ {
+ Assert(lepc->keyno > keyno);
+ break;
+ }
+ }
+ le_start = lc1;
+ }
+
+ /*
+ * If we're generating steps for >/>= strategy, we
+ * can add other >= clauses to the prefix,
+ * provided they're from an earlier key.
+ */
+ if (strat == BTGreaterStrategyNumber ||
+ strat == BTGreaterEqualStrategyNumber)
+ {
+ for_each_cell(lc1, ge_clauses, ge_start)
+ {
+ PartClauseInfo *gepc = lfirst(lc1);
+
+ if (gepc->keyno == keyno)
+ {
+ prefix = lappend(prefix, gepc);
+ pk_has_clauses = true;
+ }
+ else
+ {
+ Assert(gepc->keyno > keyno);
+ break;
+ }
+ }
+ ge_start = lc1;
+ }
+
+ /*
+ * If this key has no clauses, prefix is not valid
+ * anymore.
+ */
+ if (!pk_has_clauses)
+ {
+ prefix_valid = false;
+ break;
+ }
+ }
+
+ /*
+ * If prefix_valid, generate PartitionPruneStepOps.
+ * Otherwise, we would not find clauses for a valid
+ * subset of the partition keys anymore for the
+ * strategy; give up on generating partition pruning
+ * steps further for the strategy.
+ *
+ * As mentioned above, if 'prefix' contains multiple
+ * expressions for the same key, the following will
+ * generate multiple steps, one for each combination
+ * of the expressions for different keys.
+ *
+ * Note that we pass NULL for step_nullkeys, because
+ * we don't search list/range partition bounds where
+ * some keys are NULL.
+ */
+ if (prefix_valid)
+ {
+ Assert(pc->op_strategy == strat);
+ pc_steps = get_steps_using_prefix(context, strat,
+ pc->op_is_ne,
+ pc->expr,
+ pc->cmpfn,
+ NULL,
+ prefix);
+ opsteps = list_concat(opsteps, pc_steps);
+ }
+ else
+ break;
+ }
+ }
+ break;
+ }
+
+ case PARTITION_STRATEGY_HASH:
+ {
+ List *eq_clauses = hash_clauses[HTEqualStrategyNumber];
+
+ /* For hash partitioning, we have just the = strategy. */
+ if (eq_clauses != NIL)
+ {
+ PartClauseInfo *pc;
+ List *pc_steps;
+ List *prefix = NIL;
+ int last_keyno;
+ ListCell *lc1;
+
+ /*
+ * Locate the clause for the greatest column. This may
+ * not belong to the last partition key, but it is the
+ * clause belonging to the last partition key we found a
+ * clause for above.
+ */
+ pc = llast(eq_clauses);
+
+ /*
+ * There might be multiple clauses which matched to that
+ * partition key; find the first such clause. While at
+ * it, add all the clauses before that one to 'prefix'.
+ */
+ last_keyno = pc->keyno;
+ foreach(lc, eq_clauses)
+ {
+ pc = lfirst(lc);
+ if (pc->keyno == last_keyno)
+ break;
+ prefix = lappend(prefix, pc);
+ }
+
+ /*
+ * For each clause for the "last" column, after appending
+ * the clause's own expression to the 'prefix', we'll
+ * generate one step using the so generated vector and
+ * assign = as its strategy. Actually, 'prefix' might
+ * contain multiple clauses for the same key, in which
+ * case, we must generate steps for various combinations
+ * of expressions of different keys, which
+ * get_steps_using_prefix will take care of for us.
+ */
+ for_each_cell(lc1, eq_clauses, lc)
+ {
+ pc = lfirst(lc1);
+
+ /*
+ * Note that we pass nullkeys for step_nullkeys,
+ * because we need to tell hash partition bound search
+ * function which of the keys we found IS NULL clauses
+ * for.
+ */
+ Assert(pc->op_strategy == HTEqualStrategyNumber);
+ pc_steps =
+ get_steps_using_prefix(context,
+ HTEqualStrategyNumber,
+ false,
+ pc->expr,
+ pc->cmpfn,
+ nullkeys,
+ prefix);
+ opsteps = list_concat(opsteps, pc_steps);
+ }
+ }
+ break;
+ }
+
+ default:
+ elog(ERROR, "invalid partition strategy: %c",
+ part_scheme->strategy);
+ break;
+ }
+
+ return opsteps;
+}
+
+/*
+ * If the partition key has a collation, then the clause must have the same
+ * input collation. If the partition key is non-collatable, we assume the
+ * collation doesn't matter, because while collation wasn't considered when
+ * performing partitioning, the clause still may have a collation assigned
+ * due to the other input being of a collatable type.
+ *
+ * See also IndexCollMatchesExprColl.
+ */
+#define PartCollMatchesExprColl(partcoll, exprcoll) \
+ ((partcoll) == InvalidOid || (partcoll) == (exprcoll))
+
+/*
+ * match_clause_to_partition_key
+ * Attempt to match the given 'clause' with the specified partition key.
+ *
+ * Return value is:
+ * * PARTCLAUSE_NOMATCH if the clause doesn't match this partition key (but
+ * caller should keep trying, because it might match a subsequent key).
+ * Output arguments: none set.
+ *
+ * * PARTCLAUSE_MATCH_CLAUSE if there is a match.
+ * Output arguments: *pc is set to a PartClauseInfo constructed for the
+ * matched clause.
+ *
+ * * PARTCLAUSE_MATCH_NULLNESS if there is a match, and the matched clause was
+ * either a "a IS NULL" or "a IS NOT NULL" clause.
+ * Output arguments: *clause_is_not_null is set to false in the former case
+ * true otherwise.
+ *
+ * * PARTCLAUSE_MATCH_STEPS if there is a match.
+ * Output arguments: *clause_steps is set to the list of recursively
+ * generated steps for the clause.
+ *
+ * * PARTCLAUSE_MATCH_CONTRADICT if the clause is self-contradictory, ie
+ * it provably returns FALSE or NULL.
+ * Output arguments: none set.
+ *
+ * * PARTCLAUSE_UNSUPPORTED if the clause doesn't match this partition key
+ * and couldn't possibly match any other one either, due to its form or
+ * properties (such as containing a volatile function).
+ * Output arguments: none set.
+ */
+static PartClauseMatchStatus
+match_clause_to_partition_key(GeneratePruningStepsContext *context,
+ Expr *clause, Expr *partkey, int partkeyidx,
+ bool *clause_is_not_null, PartClauseInfo **pc,
+ List **clause_steps)
+{
+ PartClauseMatchStatus boolmatchstatus;
+ PartitionScheme part_scheme = context->rel->part_scheme;
+ Oid partopfamily = part_scheme->partopfamily[partkeyidx],
+ partcoll = part_scheme->partcollation[partkeyidx];
+ Expr *expr;
+ bool noteq;
+
+ /*
+ * Recognize specially shaped clauses that match a Boolean partition key.
+ */
+ boolmatchstatus = match_boolean_partition_clause(partopfamily, clause,
+ partkey, &expr, &noteq);
+
+ if (boolmatchstatus == PARTCLAUSE_MATCH_CLAUSE)
+ {
+ PartClauseInfo *partclause;
+
+ partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo));
+ partclause->keyno = partkeyidx;
+ /* Do pruning with the Boolean equality operator. */
+ partclause->opno = BooleanEqualOperator;
+ partclause->op_is_ne = noteq;
+ partclause->expr = expr;
+ /* We know that expr is of Boolean type. */
+ partclause->cmpfn = part_scheme->partsupfunc[partkeyidx].fn_oid;
+ partclause->op_strategy = InvalidStrategy;
+
+ *pc = partclause;
+
+ return PARTCLAUSE_MATCH_CLAUSE;
+ }
+ else if (IsA(clause, OpExpr) &&
+ list_length(((OpExpr *) clause)->args) == 2)
+ {
+ OpExpr *opclause = (OpExpr *) clause;
+ Expr *leftop,
+ *rightop;
+ Oid opno,
+ op_lefttype,
+ op_righttype,
+ negator = InvalidOid;
+ Oid cmpfn;
+ int op_strategy;
+ bool is_opne_listp = false;
+ PartClauseInfo *partclause;
+
+ leftop = (Expr *) get_leftop(clause);
+ if (IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+ rightop = (Expr *) get_rightop(clause);
+ if (IsA(rightop, RelabelType))
+ rightop = ((RelabelType *) rightop)->arg;
+ opno = opclause->opno;
+
+ /* check if the clause matches this partition key */
+ if (equal(leftop, partkey))
+ expr = rightop;
+ else if (equal(rightop, partkey))
+ {
+ /*
+ * It's only useful if we can commute the operator to put the
+ * partkey on the left. If we can't, the clause can be deemed
+ * UNSUPPORTED. Even if its leftop matches some later partkey, we
+ * now know it has Vars on the right, so it's no use.
+ */
+ opno = get_commutator(opno);
+ if (!OidIsValid(opno))
+ return PARTCLAUSE_UNSUPPORTED;
+ expr = leftop;
+ }
+ else
+ /* clause does not match this partition key, but perhaps next. */
+ return PARTCLAUSE_NOMATCH;
+
+ /*
+ * Partition key match also requires collation match. There may be
+ * multiple partkeys with the same expression but different
+ * collations, so failure is NOMATCH.
+ */
+ if (!PartCollMatchesExprColl(partcoll, opclause->inputcollid))
+ return PARTCLAUSE_NOMATCH;
+
+ /*
+ * See if the operator is relevant to the partitioning opfamily.
+ *
+ * Normally we only care about operators that are listed as being part
+ * of the partitioning operator family. But there is one exception:
+ * the not-equals operators are not listed in any operator family
+ * whatsoever, but their negators (equality) are. We can use one of
+ * those if we find it, but only for list partitioning.
+ *
+ * Note: we report NOMATCH on failure, in case a later partkey has the
+ * same expression but different opfamily. That's unlikely, but not
+ * much more so than duplicate expressions with different collations.
+ */
+ if (op_in_opfamily(opno, partopfamily))
+ {
+ get_op_opfamily_properties(opno, partopfamily, false,
+ &op_strategy, &op_lefttype,
+ &op_righttype);
+ }
+ else
+ {
+ if (part_scheme->strategy != PARTITION_STRATEGY_LIST)
+ return PARTCLAUSE_NOMATCH;
+
+ /* See if the negator is equality */
+ negator = get_negator(opno);
+ if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily))
+ {
+ get_op_opfamily_properties(negator, partopfamily, false,
+ &op_strategy, &op_lefttype,
+ &op_righttype);
+ if (op_strategy == BTEqualStrategyNumber)
+ is_opne_listp = true; /* bingo */
+ }
+
+ /* Nope, it's not <> either. */
+ if (!is_opne_listp)
+ return PARTCLAUSE_NOMATCH;
+ }
+
+ /*
+ * Only allow strict operators. This will guarantee nulls are
+ * filtered. (This test is likely useless, since btree and hash
+ * comparison operators are generally strict.)
+ */
+ if (!op_strict(opno))
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * OK, we have a match to the partition key and a suitable operator.
+ * Examine the other argument to see if it's usable for pruning.
+ *
+ * In most of these cases, we can return UNSUPPORTED because the same
+ * failure would occur no matter which partkey it's matched to. (In
+ * particular, now that we've successfully matched one side of the
+ * opclause to a partkey, there is no chance that matching the other
+ * side to another partkey will produce a usable result, since that'd
+ * mean there are Vars on both sides.)
+ *
+ * Also, if we reject an argument for a target-dependent reason, set
+ * appropriate fields of *context to report that. We postpone these
+ * tests until after matching the partkey and the operator, so as to
+ * reduce the odds of setting the context fields for clauses that do
+ * not end up contributing to pruning steps.
+ *
+ * First, check for non-Const argument. (We assume that any immutable
+ * subexpression will have been folded to a Const already.)
+ */
+ if (!IsA(expr, Const))
+ {
+ Bitmapset *paramids;
+
+ /*
+ * When pruning in the planner, we only support pruning using
+ * comparisons to constants. We cannot prune on the basis of
+ * anything that's not immutable. (Note that has_mutable_arg and
+ * has_exec_param do not get set for this target value.)
+ */
+ if (context->target == PARTTARGET_PLANNER)
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * We can never prune using an expression that contains Vars.
+ */
+ if (contain_var_clause((Node *) expr))
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * And we must reject anything containing a volatile function.
+ * Stable functions are OK though.
+ */
+ if (contain_volatile_functions((Node *) expr))
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * See if there are any exec Params. If so, we can only use this
+ * expression during per-scan pruning.
+ */
+ paramids = pull_exec_paramids(expr);
+ if (!bms_is_empty(paramids))
+ {
+ context->has_exec_param = true;
+ if (context->target != PARTTARGET_EXEC)
+ return PARTCLAUSE_UNSUPPORTED;
+ }
+ else
+ {
+ /* It's potentially usable, but mutable */
+ context->has_mutable_arg = true;
+ }
+ }
+
+ /*
+ * Check whether the comparison operator itself is immutable. (We
+ * assume anything that's in a btree or hash opclass is at least
+ * stable, but we need to check for immutability.)
+ */
+ if (op_volatile(opno) != PROVOLATILE_IMMUTABLE)
+ {
+ context->has_mutable_op = true;
+
+ /*
+ * When pruning in the planner, we cannot prune with mutable
+ * operators.
+ */
+ if (context->target == PARTTARGET_PLANNER)
+ return PARTCLAUSE_UNSUPPORTED;
+ }
+
+ /*
+ * Now find the procedure to use, based on the types. If the clause's
+ * other argument is of the same type as the partitioning opclass's
+ * declared input type, we can use the procedure cached in
+ * PartitionKey. If not, search for a cross-type one in the same
+ * opfamily; if one doesn't exist, report no match.
+ */
+ if (op_righttype == part_scheme->partopcintype[partkeyidx])
+ cmpfn = part_scheme->partsupfunc[partkeyidx].fn_oid;
+ else
+ {
+ switch (part_scheme->strategy)
+ {
+ /*
+ * For range and list partitioning, we need the ordering
+ * procedure with lefttype being the partition key's type,
+ * and righttype the clause's operator's right type.
+ */
+ case PARTITION_STRATEGY_LIST:
+ case PARTITION_STRATEGY_RANGE:
+ cmpfn =
+ get_opfamily_proc(part_scheme->partopfamily[partkeyidx],
+ part_scheme->partopcintype[partkeyidx],
+ op_righttype, BTORDER_PROC);
+ break;
+
+ /*
+ * For hash partitioning, we need the hashing procedure
+ * for the clause's type.
+ */
+ case PARTITION_STRATEGY_HASH:
+ cmpfn =
+ get_opfamily_proc(part_scheme->partopfamily[partkeyidx],
+ op_righttype, op_righttype,
+ HASHEXTENDED_PROC);
+ break;
+
+ default:
+ elog(ERROR, "invalid partition strategy: %c",
+ part_scheme->strategy);
+ cmpfn = InvalidOid; /* keep compiler quiet */
+ break;
+ }
+
+ if (!OidIsValid(cmpfn))
+ return PARTCLAUSE_NOMATCH;
+ }
+
+ /*
+ * Build the clause, passing the negator if applicable.
+ */
+ partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo));
+ partclause->keyno = partkeyidx;
+ if (is_opne_listp)
+ {
+ Assert(OidIsValid(negator));
+ partclause->opno = negator;
+ partclause->op_is_ne = true;
+ partclause->op_strategy = InvalidStrategy;
+ }
+ else
+ {
+ partclause->opno = opno;
+ partclause->op_is_ne = false;
+ partclause->op_strategy = op_strategy;
+ }
+ partclause->expr = expr;
+ partclause->cmpfn = cmpfn;
+
+ *pc = partclause;
+
+ return PARTCLAUSE_MATCH_CLAUSE;
+ }
+ else if (IsA(clause, ScalarArrayOpExpr))
+ {
+ ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+ Oid saop_op = saop->opno;
+ Oid saop_coll = saop->inputcollid;
+ Expr *leftop = (Expr *) linitial(saop->args),
+ *rightop = (Expr *) lsecond(saop->args);
+ List *elem_exprs,
+ *elem_clauses;
+ ListCell *lc1;
+
+ if (IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+
+ /* check if the LHS matches this partition key */
+ if (!equal(leftop, partkey) ||
+ !PartCollMatchesExprColl(partcoll, saop->inputcollid))
+ return PARTCLAUSE_NOMATCH;
+
+ /*
+ * See if the operator is relevant to the partitioning opfamily.
+ *
+ * In the case of NOT IN (..), we get a '<>', which we handle if list
+ * partitioning is in use and we're able to confirm that its negator
+ * is a btree equality operator belonging to the partitioning operator
+ * family. As above, report NOMATCH for a non-matching operator.
+ */
+ if (!op_in_opfamily(saop_op, partopfamily))
+ {
+ Oid negator;
+
+ if (part_scheme->strategy != PARTITION_STRATEGY_LIST)
+ return PARTCLAUSE_NOMATCH;
+
+ negator = get_negator(saop_op);
+ if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily))
+ {
+ int strategy;
+ Oid lefttype,
+ righttype;
+
+ get_op_opfamily_properties(negator, partopfamily,
+ false, &strategy,
+ &lefttype, &righttype);
+ if (strategy != BTEqualStrategyNumber)
+ return PARTCLAUSE_NOMATCH;
+ }
+ else
+ return PARTCLAUSE_NOMATCH; /* no useful negator */
+ }
+
+ /*
+ * Only allow strict operators. This will guarantee nulls are
+ * filtered. (This test is likely useless, since btree and hash
+ * comparison operators are generally strict.)
+ */
+ if (!op_strict(saop_op))
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * OK, we have a match to the partition key and a suitable operator.
+ * Examine the array argument to see if it's usable for pruning. This
+ * is identical to the logic for a plain OpExpr.
+ */
+ if (!IsA(rightop, Const))
+ {
+ Bitmapset *paramids;
+
+ /*
+ * When pruning in the planner, we only support pruning using
+ * comparisons to constants. We cannot prune on the basis of
+ * anything that's not immutable. (Note that has_mutable_arg and
+ * has_exec_param do not get set for this target value.)
+ */
+ if (context->target == PARTTARGET_PLANNER)
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * We can never prune using an expression that contains Vars.
+ */
+ if (contain_var_clause((Node *) rightop))
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * And we must reject anything containing a volatile function.
+ * Stable functions are OK though.
+ */
+ if (contain_volatile_functions((Node *) rightop))
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * See if there are any exec Params. If so, we can only use this
+ * expression during per-scan pruning.
+ */
+ paramids = pull_exec_paramids(rightop);
+ if (!bms_is_empty(paramids))
+ {
+ context->has_exec_param = true;
+ if (context->target != PARTTARGET_EXEC)
+ return PARTCLAUSE_UNSUPPORTED;
+ }
+ else
+ {
+ /* It's potentially usable, but mutable */
+ context->has_mutable_arg = true;
+ }
+ }
+
+ /*
+ * Check whether the comparison operator itself is immutable. (We
+ * assume anything that's in a btree or hash opclass is at least
+ * stable, but we need to check for immutability.)
+ */
+ if (op_volatile(saop_op) != PROVOLATILE_IMMUTABLE)
+ {
+ context->has_mutable_op = true;
+
+ /*
+ * When pruning in the planner, we cannot prune with mutable
+ * operators.
+ */
+ if (context->target == PARTTARGET_PLANNER)
+ return PARTCLAUSE_UNSUPPORTED;
+ }
+
+ /*
+ * Examine the contents of the array argument.
+ */
+ elem_exprs = NIL;
+ if (IsA(rightop, Const))
+ {
+ /*
+ * For a constant array, convert the elements to a list of Const
+ * nodes, one for each array element (excepting nulls).
+ */
+ Const *arr = (Const *) rightop;
+ ArrayType *arrval;
+ int16 elemlen;
+ bool elembyval;
+ char elemalign;
+ Datum *elem_values;
+ bool *elem_nulls;
+ int num_elems,
+ i;
+
+ /* If the array itself is null, the saop returns null */
+ if (arr->constisnull)
+ return PARTCLAUSE_MATCH_CONTRADICT;
+
+ arrval = DatumGetArrayTypeP(arr->constvalue);
+ get_typlenbyvalalign(ARR_ELEMTYPE(arrval),
+ &elemlen, &elembyval, &elemalign);
+ deconstruct_array(arrval,
+ ARR_ELEMTYPE(arrval),
+ elemlen, elembyval, elemalign,
+ &elem_values, &elem_nulls,
+ &num_elems);
+ for (i = 0; i < num_elems; i++)
+ {
+ Const *elem_expr;
+
+ /*
+ * A null array element must lead to a null comparison result,
+ * since saop_op is known strict. We can ignore it in the
+ * useOr case, but otherwise it implies self-contradiction.
+ */
+ if (elem_nulls[i])
+ {
+ if (saop->useOr)
+ continue;
+ return PARTCLAUSE_MATCH_CONTRADICT;
+ }
+
+ elem_expr = makeConst(ARR_ELEMTYPE(arrval), -1,
+ arr->constcollid, elemlen,
+ elem_values[i], false, elembyval);
+ elem_exprs = lappend(elem_exprs, elem_expr);
+ }
+ }
+ else if (IsA(rightop, ArrayExpr))
+ {
+ ArrayExpr *arrexpr = castNode(ArrayExpr, rightop);
+
+ /*
+ * For a nested ArrayExpr, we don't know how to get the actual
+ * scalar values out into a flat list, so we give up doing
+ * anything with this ScalarArrayOpExpr.
+ */
+ if (arrexpr->multidims)
+ return PARTCLAUSE_UNSUPPORTED;
+
+ /*
+ * Otherwise, we can just use the list of element values.
+ */
+ elem_exprs = arrexpr->elements;
+ }
+ else
+ {
+ /* Give up on any other clause types. */
+ return PARTCLAUSE_UNSUPPORTED;
+ }
+
+ /*
+ * Now generate a list of clauses, one for each array element, of the
+ * form leftop saop_op elem_expr
+ */
+ elem_clauses = NIL;
+ foreach(lc1, elem_exprs)
+ {
+ Expr *rightop = (Expr *) lfirst(lc1),
+ *elem_clause;
+
+ elem_clause = make_opclause(saop_op, BOOLOID, false,
+ leftop, rightop,
+ InvalidOid, saop_coll);
+ elem_clauses = lappend(elem_clauses, elem_clause);
+ }
+
+ /*
+ * If we have an ANY clause and multiple elements, now turn the list
+ * of clauses into an OR expression.
+ */
+ if (saop->useOr && list_length(elem_clauses) > 1)
+ elem_clauses = list_make1(makeBoolExpr(OR_EXPR, elem_clauses, -1));
+
+ /* Finally, generate steps */
+ *clause_steps = gen_partprune_steps_internal(context, elem_clauses);
+ if (context->contradictory)
+ return PARTCLAUSE_MATCH_CONTRADICT;
+ else if (*clause_steps == NIL)
+ return PARTCLAUSE_UNSUPPORTED; /* step generation failed */
+ return PARTCLAUSE_MATCH_STEPS;
+ }
+ else if (IsA(clause, NullTest))
+ {
+ NullTest *nulltest = (NullTest *) clause;
+ Expr *arg = nulltest->arg;
+
+ if (IsA(arg, RelabelType))
+ arg = ((RelabelType *) arg)->arg;
+
+ /* Does arg match with this partition key column? */
+ if (!equal(arg, partkey))
+ return PARTCLAUSE_NOMATCH;
+
+ *clause_is_not_null = (nulltest->nulltesttype == IS_NOT_NULL);
+
+ return PARTCLAUSE_MATCH_NULLNESS;
+ }
+
+ /*
+ * If we get here, the return value depends on the result of the
+ * match_boolean_partition_clause call above. If the call returned
+ * PARTCLAUSE_UNSUPPORTED, then we're either not dealing with a bool qual
+ * or the bool qual is not suitable for pruning. Since the qual didn't
+ * match any of the other qual types supported here, trying to match it
+ * against any other partition key would be a waste of time, so just
+ * return PARTCLAUSE_UNSUPPORTED. If the qual just couldn't be matched to
+ * this partition key, then it may match another, so return
+ * PARTCLAUSE_NOMATCH. The only other value that
+ * match_boolean_partition_clause can return is PARTCLAUSE_MATCH_CLAUSE,
+ * and since that value was already dealt with above, we can just return
+ * boolmatchstatus.
+ */
+ return boolmatchstatus;
+}
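+
+/*
+ * Illustrative example, not part of the original code (hypothetical
+ * schema): for a table partitioned BY RANGE (a), where a is int4, the qual
+ * "a < 10" is matched by the OpExpr handling above, producing a
+ * PartClauseInfo with keyno = 0, opno = the int4 '<' operator,
+ * op_strategy = BTLessStrategyNumber, expr = the Const 10, and cmpfn set
+ * to the partition key's cached btree comparison function, and the
+ * function returns PARTCLAUSE_MATCH_CLAUSE.
+ */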
+
+/*
+ * get_steps_using_prefix
+ * Generate a list of PartitionPruneStepOps based on the given input.
+ *
+ * 'step_lastexpr' and 'step_lastcmpfn' are the Expr and comparison function
+ * belonging to the final partition key that we have a clause for. 'prefix'
+ * is a list of PartClauseInfos for partition key numbers prior to the given
+ * 'step_lastexpr' and 'step_lastcmpfn'. 'prefix' may contain multiple
+ * PartClauseInfos belonging to a single partition key. We will generate a
+ * PartitionPruneStepOp for each combination of the given PartClauseInfos
+ * using, at most, one PartClauseInfo per partition key.
+ *
+ * For LIST and RANGE partitioned tables, callers must ensure that
+ * step_nullkeys is NULL, and that prefix contains at least one clause for
+ * each of the partition keys prior to the key that 'step_lastexpr' and
+ * 'step_lastcmpfn' belong to.
+ *
+ * For HASH partitioned tables, callers must ensure that 'prefix' contains at
+ * least one clause for each of the partition keys apart from the final key
+ * (the expr and comparison function for the final key are in 'step_lastexpr'
+ * and 'step_lastcmpfn'). A bit set in step_nullkeys can substitute clauses
+ * in the 'prefix' list for any given key. If a bit is set in 'step_nullkeys'
+ * for a given key, then there must be no PartClauseInfo for that key in the
+ * 'prefix' list.
+ *
+ * For each of the above cases, callers must ensure that PartClauseInfos in
+ * 'prefix' are sorted in ascending order of keyno.
+ */
+static List *
+get_steps_using_prefix(GeneratePruningStepsContext *context,
+ StrategyNumber step_opstrategy,
+ bool step_op_is_ne,
+ Expr *step_lastexpr,
+ Oid step_lastcmpfn,
+ Bitmapset *step_nullkeys,
+ List *prefix)
+{
+ /* step_nullkeys must be empty for RANGE and LIST partitioned tables */
+ Assert(step_nullkeys == NULL ||
+ context->rel->part_scheme->strategy == PARTITION_STRATEGY_HASH);
+
+ /*
+ * No recursive processing is required when 'prefix' is an empty list. This
+ * occurs when there is only 1 partition key column.
+ */
+ if (list_length(prefix) == 0)
+ {
+ PartitionPruneStep *step;
+
+ step = gen_prune_step_op(context,
+ step_opstrategy,
+ step_op_is_ne,
+ list_make1(step_lastexpr),
+ list_make1_oid(step_lastcmpfn),
+ step_nullkeys);
+ return list_make1(step);
+ }
+
+ /* Recurse to generate steps for every combination of clauses. */
+ return get_steps_using_prefix_recurse(context,
+ step_opstrategy,
+ step_op_is_ne,
+ step_lastexpr,
+ step_lastcmpfn,
+ step_nullkeys,
+ prefix,
+ list_head(prefix),
+ NIL, NIL);
+}
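+
+/*
+ * Worked example for get_steps_using_prefix (hypothetical schema, for
+ * illustration only): for a table HASH partitioned on (a, b, c) with the
+ * clauses a = 1, b = 2 and b = 3, where the expr and cmpfn for c are passed
+ * as 'step_lastexpr' and 'step_lastcmpfn', 'prefix' holds the
+ * PartClauseInfos for a and b, and the recursion below yields two
+ * PartitionPruneStepOps: one for (a = 1, b = 2, c) and one for
+ * (a = 1, b = 3, c).
+ */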
+
+/*
+ * get_steps_using_prefix_recurse
+ * Generate and return a list of PartitionPruneStepOps using the 'prefix'
+ * list of PartClauseInfos starting at the 'start' cell.
+ *
+ * When 'prefix' contains multiple PartClauseInfos for a single partition key
+ * we create a PartitionPruneStepOp for each combination of duplicated
+ * PartClauseInfos. The returned list will contain a PartitionPruneStepOp
+ * for each unique combination of input PartClauseInfos containing at most one
+ * PartClauseInfo per partition key.
+ *
+ * 'prefix' is the input list of PartClauseInfos sorted by keyno.
+ * 'start' marks the cell that searching the 'prefix' list should start from.
+ * 'step_exprs' and 'step_cmpfns' each contains the expressions and cmpfns
+ * we've generated so far from the clauses for the previous part keys.
+ */
+static List *
+get_steps_using_prefix_recurse(GeneratePruningStepsContext *context,
+ StrategyNumber step_opstrategy,
+ bool step_op_is_ne,
+ Expr *step_lastexpr,
+ Oid step_lastcmpfn,
+ Bitmapset *step_nullkeys,
+ List *prefix,
+ ListCell *start,
+ List *step_exprs,
+ List *step_cmpfns)
+{
+ List *result = NIL;
+ ListCell *lc;
+ int cur_keyno;
+ int final_keyno;
+
+ /* Actually, recursion would be limited by PARTITION_MAX_KEYS. */
+ check_stack_depth();
+
+ Assert(start != NULL);
+ cur_keyno = ((PartClauseInfo *) lfirst(start))->keyno;
+ final_keyno = ((PartClauseInfo *) llast(prefix))->keyno;
+
+ /* Check if we need to recurse. */
+ if (cur_keyno < final_keyno)
+ {
+ PartClauseInfo *pc;
+ ListCell *next_start;
+
+ /*
+ * Find the first PartClauseInfo belonging to the next partition key; the
+ * next recursive call must start iterating the 'prefix' list from that
+ * point.
+ */
+ for_each_cell(lc, prefix, start)
+ {
+ pc = lfirst(lc);
+
+ if (pc->keyno > cur_keyno)
+ break;
+ }
+
+ /* record where to start iterating in the next recursive call */
+ next_start = lc;
+
+ /*
+ * For each PartClauseInfo with keyno set to cur_keyno, add its expr and
+ * cmpfn to step_exprs and step_cmpfns, respectively, and recurse using
+ * 'next_start' as the starting point in the 'prefix' list.
+ */
+ for_each_cell(lc, prefix, start)
+ {
+ List *moresteps;
+ List *step_exprs1,
+ *step_cmpfns1;
+
+ pc = lfirst(lc);
+ if (pc->keyno == cur_keyno)
+ {
+ /* Leave the original step_exprs unmodified. */
+ step_exprs1 = list_copy(step_exprs);
+ step_exprs1 = lappend(step_exprs1, pc->expr);
+
+ /* Leave the original step_cmpfns unmodified. */
+ step_cmpfns1 = list_copy(step_cmpfns);
+ step_cmpfns1 = lappend_oid(step_cmpfns1, pc->cmpfn);
+ }
+ else
+ {
+ /* check the 'prefix' list is sorted correctly */
+ Assert(pc->keyno > cur_keyno);
+ break;
+ }
+
+ moresteps = get_steps_using_prefix_recurse(context,
+ step_opstrategy,
+ step_op_is_ne,
+ step_lastexpr,
+ step_lastcmpfn,
+ step_nullkeys,
+ prefix,
+ next_start,
+ step_exprs1,
+ step_cmpfns1);
+ result = list_concat(result, moresteps);
+
+ list_free(step_exprs1);
+ list_free(step_cmpfns1);
+ }
+ }
+ else
+ {
+ /*
+ * End the current recursion cycle and start generating steps, one for
+ * each clause with cur_keyno, which is all clauses from here onward
+ * till the end of the list. Note that for hash partitioning,
+ * step_nullkeys is allowed to be non-empty, in which case step_exprs
+ * would only contain expressions for the partition keys that are not
+ * specified in step_nullkeys.
+ */
+ Assert(list_length(step_exprs) == cur_keyno ||
+ !bms_is_empty(step_nullkeys));
+
+ /*
+ * Note also that for hash partitioning, each partition key should
+ * have either equality clauses or an IS NULL clause, so if a
+ * partition key doesn't have an expression, it would be specified in
+ * step_nullkeys.
+ */
+ Assert(context->rel->part_scheme->strategy
+ != PARTITION_STRATEGY_HASH ||
+ list_length(step_exprs) + 2 + bms_num_members(step_nullkeys) ==
+ context->rel->part_scheme->partnatts);
+ for_each_cell(lc, prefix, start)
+ {
+ PartClauseInfo *pc = lfirst(lc);
+ PartitionPruneStep *step;
+ List *step_exprs1,
+ *step_cmpfns1;
+
+ Assert(pc->keyno == cur_keyno);
+
+ /* Leave the original step_exprs unmodified. */
+ step_exprs1 = list_copy(step_exprs);
+ step_exprs1 = lappend(step_exprs1, pc->expr);
+ step_exprs1 = lappend(step_exprs1, step_lastexpr);
+
+ /* Leave the original step_cmpfns unmodified. */
+ step_cmpfns1 = list_copy(step_cmpfns);
+ step_cmpfns1 = lappend_oid(step_cmpfns1, pc->cmpfn);
+ step_cmpfns1 = lappend_oid(step_cmpfns1, step_lastcmpfn);
+
+ step = gen_prune_step_op(context,
+ step_opstrategy, step_op_is_ne,
+ step_exprs1, step_cmpfns1,
+ step_nullkeys);
+ result = lappend(result, step);
+ }
+ }
+
+ return result;
+}
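+
+/*
+ * Note on the recursion above (illustrative): one step is emitted per
+ * combination, so the number of PartitionPruneStepOps generated equals the
+ * product of the clause counts for each partition key present in 'prefix';
+ * e.g. two clauses for each of two keys yield four steps.
+ */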
+
+/*
+ * get_matching_hash_bounds
+ * Determine offset of the hash bound matching the specified values,
+ * considering that all the non-null values come from clauses containing
+ * a compatible hash equality operator and any keys that are null come
+ * from an IS NULL clause.
+ *
+ * Generally this function will return a single matching bound offset,
+ * although if a partition has not been set up for a given modulus then we
+ * may return no matches. If the clauses found don't cover the entire
+ * partition key, then we'll need to return all offsets.
+ *
+ * 'opstrategy' if non-zero must be HTEqualStrategyNumber.
+ *
+ * 'values' contains Datums indexed by the partition key to use for pruning.
+ *
+ * 'nvalues', the number of Datums in the 'values' array.
+ *
+ * 'partsupfunc' contains the partition hashing functions that can produce a
+ * correct hash for the type of the values contained in 'values'.
+ *
+ * 'nullkeys' is the set of partition keys that are null.
+ */
+static PruneStepResult *
+get_matching_hash_bounds(PartitionPruneContext *context,
+ StrategyNumber opstrategy, Datum *values, int nvalues,
+ FmgrInfo *partsupfunc, Bitmapset *nullkeys)
+{
+ PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
+ PartitionBoundInfo boundinfo = context->boundinfo;
+ int *partindices = boundinfo->indexes;
+ int partnatts = context->partnatts;
+ bool isnull[PARTITION_MAX_KEYS];
+ int i;
+ uint64 rowHash;
+ int greatest_modulus;
+ Oid *partcollation = context->partcollation;
+
+ Assert(context->strategy == PARTITION_STRATEGY_HASH);
+
+ /*
+ * For hash partitioning we can only perform pruning based on equality
+ * clauses to the partition key or IS NULL clauses. We also can only
+ * prune if we got values for all keys.
+ */
+ if (nvalues + bms_num_members(nullkeys) == partnatts)
+ {
+ /*
+ * If there are any values, they must have come from clauses
+ * containing an equality operator compatible with hash partitioning.
+ */
+ Assert(opstrategy == HTEqualStrategyNumber || nvalues == 0);
+
+ for (i = 0; i < partnatts; i++)
+ isnull[i] = bms_is_member(i, nullkeys);
+
+ rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
+ values, isnull);
+
+ greatest_modulus = boundinfo->nindexes;
+ if (partindices[rowHash % greatest_modulus] >= 0)
+ result->bound_offsets =
+ bms_make_singleton(rowHash % greatest_modulus);
+ }
+ else
+ {
+ /* Report all valid offsets into the boundinfo->indexes array. */
+ result->bound_offsets = bms_add_range(NULL, 0,
+ boundinfo->nindexes - 1);
+ }
+
+ /*
+ * Hash partitioning has neither a special null-accepting partition nor
+ * a default partition.
+ */
+ result->scan_null = result->scan_default = false;
+
+ return result;
+}
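+
+/*
+ * Illustrative example (hypothetical schema): for a table HASH partitioned
+ * into four partitions with (MODULUS 4, REMAINDER 0..3) and equality
+ * clauses supplying values for every partition key, rowHash modulo
+ * boundinfo->nindexes (4 here) selects the single matching bound offset;
+ * if any key is unconstrained, all offsets are returned instead.
+ */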
+
+/*
+ * get_matching_list_bounds
+ * Determine the offsets of list bounds matching the specified value,
+ * according to the semantics of the given operator strategy
+ *
+ * scan_default will be set in the returned struct, if the default partition
+ * needs to be scanned, provided one exists at all. scan_null will be set if
+ * the special null-accepting partition needs to be scanned.
+ *
+ * 'opstrategy' if non-zero must be a btree strategy number.
+ *
+ * 'value' contains the value to use for pruning.
+ *
+ * 'nvalues', if non-zero, should be exactly 1, because of list partitioning.
+ *
+ * 'partsupfunc' contains the list partitioning comparison function to be used
+ * to perform partition_list_bsearch.
+ *
+ * 'nullkeys' is the set of partition keys that are null.
+ */
+static PruneStepResult *
+get_matching_list_bounds(PartitionPruneContext *context,
+ StrategyNumber opstrategy, Datum value, int nvalues,
+ FmgrInfo *partsupfunc, Bitmapset *nullkeys)
+{
+ PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
+ PartitionBoundInfo boundinfo = context->boundinfo;
+ int off,
+ minoff,
+ maxoff;
+ bool is_equal;
+ bool inclusive = false;
+ Oid *partcollation = context->partcollation;
+
+ Assert(context->strategy == PARTITION_STRATEGY_LIST);
+ Assert(context->partnatts == 1);
+
+ result->scan_null = result->scan_default = false;
+
+ if (!bms_is_empty(nullkeys))
+ {
+ /*
+ * Nulls may exist in only one partition - the partition whose
+ * accepted set of values includes null or the default partition if
+ * the former doesn't exist.
+ */
+ if (partition_bound_accepts_nulls(boundinfo))
+ result->scan_null = true;
+ else
+ result->scan_default = partition_bound_has_default(boundinfo);
+ return result;
+ }
+
+ /*
+ * If there are no datums to compare keys with, but there are partitions,
+ * just return the default partition if one exists.
+ */
+ if (boundinfo->ndatums == 0)
+ {
+ result->scan_default = partition_bound_has_default(boundinfo);
+ return result;
+ }
+
+ minoff = 0;
+ maxoff = boundinfo->ndatums - 1;
+
+ /*
+ * If there are no values to compare with the datums in boundinfo, it
+ * means the caller asked for partitions for all non-null datums. Add
+ * indexes of *all* partitions, including the default if any.
+ */
+ if (nvalues == 0)
+ {
+ Assert(boundinfo->ndatums > 0);
+ result->bound_offsets = bms_add_range(NULL, 0,
+ boundinfo->ndatums - 1);
+ result->scan_default = partition_bound_has_default(boundinfo);
+ return result;
+ }
+
+ /* Special case handling of values coming from a <> operator clause. */
+ if (opstrategy == InvalidStrategy)
+ {
+ /*
+ * First match to all bounds. We'll remove any matching datums below.
+ */
+ Assert(boundinfo->ndatums > 0);
+ result->bound_offsets = bms_add_range(NULL, 0,
+ boundinfo->ndatums - 1);
+
+ off = partition_list_bsearch(partsupfunc, partcollation, boundinfo,
+ value, &is_equal);
+ if (off >= 0 && is_equal)
+ {
+ /* We have a match. Remove from the result. */
+ Assert(boundinfo->indexes[off] >= 0);
+ result->bound_offsets = bms_del_member(result->bound_offsets,
+ off);
+ }
+
+ /* Always include the default partition if any. */
+ result->scan_default = partition_bound_has_default(boundinfo);
+
+ return result;
+ }
+
+ /*
+ * With range queries, always include the default list partition, because
+ * list partitions divide the key space in a discontinuous manner, so not
+ * all values in the given range will have a partition assigned. This may
+ * not technically be true for some data types (e.g. integer types);
+ * however, we currently lack any sort of infrastructure to provide us
+ * with proofs that would allow us to do anything smarter here.
+ */
+ if (opstrategy != BTEqualStrategyNumber)
+ result->scan_default = partition_bound_has_default(boundinfo);
+
+ switch (opstrategy)
+ {
+ case BTEqualStrategyNumber:
+ off = partition_list_bsearch(partsupfunc,
+ partcollation,
+ boundinfo, value,
+ &is_equal);
+ if (off >= 0 && is_equal)
+ {
+ Assert(boundinfo->indexes[off] >= 0);
+ result->bound_offsets = bms_make_singleton(off);
+ }
+ else
+ result->scan_default = partition_bound_has_default(boundinfo);
+ return result;
+
+ case BTGreaterEqualStrategyNumber:
+ inclusive = true;
+ /* fall through */
+ case BTGreaterStrategyNumber:
+ off = partition_list_bsearch(partsupfunc,
+ partcollation,
+ boundinfo, value,
+ &is_equal);
+ if (off >= 0)
+ {
+ /* We don't want the matched datum to be in the result. */
+ if (!is_equal || !inclusive)
+ off++;
+ }
+ else
+ {
+ /*
+ * This case means all partition bounds are greater, which in
+ * turn means that all partitions satisfy this key.
+ */
+ off = 0;
+ }
+
+ /*
+ * off is greater than the number of datums we have partitions
+ * for. The only possible partition that could contain a match is
+ * the default partition, but we must've set result->scan_default
+ * above anyway if one exists.
+ */
+ if (off > boundinfo->ndatums - 1)
+ return result;
+
+ minoff = off;
+ break;
+
+ case BTLessEqualStrategyNumber:
+ inclusive = true;
+ /* fall through */
+ case BTLessStrategyNumber:
+ off = partition_list_bsearch(partsupfunc,
+ partcollation,
+ boundinfo, value,
+ &is_equal);
+ if (off >= 0 && is_equal && !inclusive)
+ off--;
+
+ /*
+ * 'off' is negative, meaning no non-default partition's datum
+ * can satisfy this key. The only possible partition that could
+ * contain a match is the default partition, but we must've set
+ * result->scan_default above anyway if one exists.
+ */
+ if (off < 0)
+ return result;
+
+ maxoff = off;
+ break;
+
+ default:
+ elog(ERROR, "invalid strategy number %d", opstrategy);
+ break;
+ }
+
+ Assert(minoff >= 0 && maxoff >= 0);
+ result->bound_offsets = bms_add_range(NULL, minoff, maxoff);
+ return result;
+}
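+
+/*
+ * Illustrative example (hypothetical schema): for a table partitioned BY
+ * LIST (a) with bounds IN (1, 2) and IN (4), boundinfo->datums holds
+ * {1, 2, 4}. The qual "a = 4" yields just the offset of datum 4, while
+ * "a > 1" yields the offsets of datums 2 and 4, plus the default
+ * partition if one exists, per the range-query rule above.
+ */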
+
+
+/*
+ * get_matching_range_bounds
+ * Determine the offsets of range bounds matching the specified values,
+ * according to the semantics of the given operator strategy
+ *
+ * Each datum whose offset is in result is to be treated as the upper bound of
+ * the partition that will contain the desired values.
+ *
+ * scan_default is set in the returned struct if a default partition exists
+ * and we're absolutely certain that it needs to be scanned. We do *not* set
+ * it just because values match portions of the key space uncovered by
+ * partitions other than default (space which we normally assume to belong to
+ * the default partition): the final set of bounds obtained after combining
+ * multiple pruning steps might exclude it, so we infer its inclusion
+ * elsewhere.
+ *
+ * 'opstrategy' if non-zero must be a btree strategy number.
+ *
+ * 'values' contains Datums indexed by the partition key to use for pruning.
+ *
+ * 'nvalues', number of Datums in 'values' array. Must be <= context->partnatts.
+ *
+ * 'partsupfunc' contains the range partitioning comparison functions to be
+ * used by partition_range_datum_bsearch or partition_rbound_datum_cmp.
+ *
+ * 'nullkeys' is the set of partition keys that are null.
+ */
+static PruneStepResult *
+get_matching_range_bounds(PartitionPruneContext *context,
+ StrategyNumber opstrategy, Datum *values, int nvalues,
+ FmgrInfo *partsupfunc, Bitmapset *nullkeys)
+{
+ PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
+ PartitionBoundInfo boundinfo = context->boundinfo;
+ Oid *partcollation = context->partcollation;
+ int partnatts = context->partnatts;
+ int *partindices = boundinfo->indexes;
+ int off,
+ minoff,
+ maxoff;
+ bool is_equal;
+ bool inclusive = false;
+
+ Assert(context->strategy == PARTITION_STRATEGY_RANGE);
+ Assert(nvalues <= partnatts);
+
+ result->scan_null = result->scan_default = false;
+
+ /*
+ * If there are no datums to compare keys with, or if we got an IS NULL
+ * clause, just return the default partition, if it exists.
+ */
+ if (boundinfo->ndatums == 0 || !bms_is_empty(nullkeys))
+ {
+ result->scan_default = partition_bound_has_default(boundinfo);
+ return result;
+ }
+
+ minoff = 0;
+ maxoff = boundinfo->ndatums;
+
+ /*
+ * If there are no values to compare with the datums in boundinfo, it
+ * means the caller asked for partitions for all non-null datums. Add
+ * indexes of *all* partitions, including the default partition if one
+ * exists.
+ */
+ if (nvalues == 0)
+ {
+ /* ignore key space not covered by any partitions */
+ if (partindices[minoff] < 0)
+ minoff++;
+ if (partindices[maxoff] < 0)
+ maxoff--;
+
+ result->scan_default = partition_bound_has_default(boundinfo);
+ Assert(partindices[minoff] >= 0 &&
+ partindices[maxoff] >= 0);
+ result->bound_offsets = bms_add_range(NULL, minoff, maxoff);
+
+ return result;
+ }
+
+ /*
+ * If the query does not constrain all key columns, we'll need to scan the
+ * default partition, if any.
+ */
+ if (nvalues < partnatts)
+ result->scan_default = partition_bound_has_default(boundinfo);
+
+ switch (opstrategy)
+ {
+ case BTEqualStrategyNumber:
+ /* Look for the smallest bound that is = lookup value. */
+ off = partition_range_datum_bsearch(partsupfunc,
+ partcollation,
+ boundinfo,
+ nvalues, values,
+ &is_equal);
+
+ if (off >= 0 && is_equal)
+ {
+ if (nvalues == partnatts)
+ {
+ /* There can only be zero or one matching partition. */
+ result->bound_offsets = bms_make_singleton(off + 1);
+ return result;
+ }
+ else
+ {
+ int saved_off = off;
+
+ /*
+ * Since the lookup value contains only a prefix of keys,
+ * we must find other bounds that may also match the
+ * prefix. partition_range_datum_bsearch() returns the
+ * offset of one of them; find the others by checking
+ * adjacent bounds.
+ */
+
+ /*
+ * First find greatest bound that's smaller than the
+ * lookup value.
+ */
+ while (off >= 1)
+ {
+ int32 cmpval;
+
+ cmpval =
+ partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[off - 1],
+ boundinfo->kind[off - 1],
+ values, nvalues);
+ if (cmpval != 0)
+ break;
+ off--;
+ }
+
+ Assert(0 ==
+ partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[off],
+ boundinfo->kind[off],
+ values, nvalues));
+
+ /*
+ * We can treat 'off' as the offset of the smallest bound
+ * to be included in the result, if we know it is the
+ * upper bound of the partition in which the lookup value
+ * could possibly exist. One case where it couldn't is
+ * when the bound, or more precisely the matched portion
+ * of its prefix, is not inclusive.
+ */
+ if (boundinfo->kind[off][nvalues] ==
+ PARTITION_RANGE_DATUM_MINVALUE)
+ off++;
+
+ minoff = off;
+
+ /*
+ * Now find smallest bound that's greater than the lookup
+ * value.
+ */
+ off = saved_off;
+ while (off < boundinfo->ndatums - 1)
+ {
+ int32 cmpval;
+
+ cmpval = partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[off + 1],
+ boundinfo->kind[off + 1],
+ values, nvalues);
+ if (cmpval != 0)
+ break;
+ off++;
+ }
+
+ Assert(0 ==
+ partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[off],
+ boundinfo->kind[off],
+ values, nvalues));
+
+ /*
+ * 'off' + 1 would then be the offset of the greatest bound
+ * to be included in the result.
+ */
+ maxoff = off + 1;
+ }
+
+ Assert(minoff >= 0 && maxoff >= 0);
+ result->bound_offsets = bms_add_range(NULL, minoff, maxoff);
+ }
+ else
+ {
+ /*
+ * The lookup value falls in the range between some bounds in
+ * boundinfo. 'off' would be the offset of the greatest bound
+ * that is <= lookup value, so add off + 1 to the result
+ * instead as the offset of the upper bound of the only
+ * partition that may contain the lookup value. If 'off' is
+ * -1 indicating that all bounds are greater, then we simply
+ * end up adding the first bound's offset, that is, 0.
+ */
+ result->bound_offsets = bms_make_singleton(off + 1);
+ }
+
+ return result;
+
+ case BTGreaterEqualStrategyNumber:
+ inclusive = true;
+ /* fall through */
+ case BTGreaterStrategyNumber:
+
+ /*
+ * Look for the smallest bound that is > or >= lookup value and
+ * set minoff to its offset.
+ */
+ off = partition_range_datum_bsearch(partsupfunc,
+ partcollation,
+ boundinfo,
+ nvalues, values,
+ &is_equal);
+ if (off < 0)
+ {
+ /*
+ * All bounds are greater than the lookup value, so include
+ * all of them in the result.
+ */
+ minoff = 0;
+ }
+ else
+ {
+ if (is_equal && nvalues < partnatts)
+ {
+ /*
+ * Since the lookup value contains only a prefix of keys,
+ * we must find other bounds that may also match the
+ * prefix. partition_range_datum_bsearch() returns the
+ * offset of one of them; find the others by checking
+ * adjacent bounds.
+ *
+ * Based on whether the lookup values are inclusive or
+ * not, we must either include the indexes of all such
+ * bounds in the result (that is, set minoff to the index
+ * of smallest such bound) or find the smallest one that's
+ * greater than the lookup values and set minoff to that.
+ */
+ while (off >= 1 && off < boundinfo->ndatums - 1)
+ {
+ int32 cmpval;
+ int nextoff;
+
+ nextoff = inclusive ? off - 1 : off + 1;
+ cmpval =
+ partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[nextoff],
+ boundinfo->kind[nextoff],
+ values, nvalues);
+ if (cmpval != 0)
+ break;
+
+ off = nextoff;
+ }
+
+ Assert(0 ==
+ partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[off],
+ boundinfo->kind[off],
+ values, nvalues));
+
+ minoff = inclusive ? off : off + 1;
+ }
+ else
+ {
+ /*
+ * lookup value falls in the range between some bounds in
+ * boundinfo. off would be the offset of the greatest
+ * bound that is <= lookup value, so add off + 1 to the
+ * result instead as the offset of the upper bound of the
+ * smallest partition that may contain the lookup value.
+ */
+ minoff = off + 1;
+ }
+ }
+ break;
+
+ case BTLessEqualStrategyNumber:
+ inclusive = true;
+ /* fall through */
+ case BTLessStrategyNumber:
+
+ /*
+ * Look for the greatest bound that is < or <= lookup value and
+ * set maxoff to its offset.
+ */
+ off = partition_range_datum_bsearch(partsupfunc,
+ partcollation,
+ boundinfo,
+ nvalues, values,
+ &is_equal);
+ if (off >= 0)
+ {
+ /*
+ * See the comment above.
+ */
+ if (is_equal && nvalues < partnatts)
+ {
+ while (off >= 1 && off < boundinfo->ndatums - 1)
+ {
+ int32 cmpval;
+ int nextoff;
+
+ nextoff = inclusive ? off + 1 : off - 1;
+ cmpval = partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[nextoff],
+ boundinfo->kind[nextoff],
+ values, nvalues);
+ if (cmpval != 0)
+ break;
+
+ off = nextoff;
+ }
+
+ Assert(0 ==
+ partition_rbound_datum_cmp(partsupfunc,
+ partcollation,
+ boundinfo->datums[off],
+ boundinfo->kind[off],
+ values, nvalues));
+
+ maxoff = inclusive ? off + 1 : off;
+ }
+
+ /*
+ * The lookup value falls in the range between some bounds in
+ * boundinfo. 'off' would be the offset of the greatest bound
+ * that is <= lookup value, so add off + 1 to the result
+ * instead as the offset of the upper bound of the greatest
+ * partition that may contain the lookup value. If the lookup
+ * value had exactly matched the bound, but it isn't
+ * inclusive, there is no need to add the adjacent partition.
+ */
+ else if (!is_equal || inclusive)
+ maxoff = off + 1;
+ else
+ maxoff = off;
+ }
+ else
+ {
+ /*
+ * 'off' is -1 indicating that all bounds are greater, so just
+ * set the first bound's offset as maxoff.
+ */
+ maxoff = off + 1;
+ }
+ break;
+
+ default:
+ elog(ERROR, "invalid strategy number %d", opstrategy);
+ break;
+ }
+
+ Assert(minoff >= 0 && minoff <= boundinfo->ndatums);
+ Assert(maxoff >= 0 && maxoff <= boundinfo->ndatums);
+
+ /*
+ * If the smallest partition to return has MINVALUE (negative infinity) as
+ * its lower bound, increment it to point to the next finite bound
+ * (supposedly its upper bound), so that we don't inadvertently end up
+ * scanning the default partition.
+ */
+ if (minoff < boundinfo->ndatums && partindices[minoff] < 0)
+ {
+ int lastkey = nvalues - 1;
+
+ if (boundinfo->kind[minoff][lastkey] ==
+ PARTITION_RANGE_DATUM_MINVALUE)
+ {
+ minoff++;
+ Assert(boundinfo->indexes[minoff] >= 0);
+ }
+ }
+
+ /*
+ * If the previous greatest partition has MAXVALUE (positive infinity) as
+ * its upper bound (something only possible with multi-column range
+ * partitioning), switch to it as the greatest partition to return, again
+ * so that we don't inadvertently end up scanning the default partition.
+ */
+ if (maxoff >= 1 && partindices[maxoff] < 0)
+ {
+ int lastkey = nvalues - 1;
+
+ if (boundinfo->kind[maxoff - 1][lastkey] ==
+ PARTITION_RANGE_DATUM_MAXVALUE)
+ {
+ maxoff--;
+ Assert(boundinfo->indexes[maxoff] >= 0);
+ }
+ }
+
+ Assert(minoff >= 0 && maxoff >= 0);
+ if (minoff <= maxoff)
+ result->bound_offsets = bms_add_range(NULL, minoff, maxoff);
+
+ return result;
+}
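+
+/*
+ * Illustrative example (hypothetical schema): for a table partitioned BY
+ * RANGE (a) with partitions FROM (1) TO (10) and FROM (10) TO (20),
+ * boundinfo->datums holds {1, 10, 20}. The qual "a = 15" falls between
+ * datums 10 and 20, so the offset of datum 20 (the upper bound of the
+ * containing partition) is what gets returned, per the "off + 1"
+ * convention described above.
+ */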
+
+/*
+ * pull_exec_paramids
+ * Returns a Bitmapset containing the paramids of all Params with
+ * paramkind = PARAM_EXEC in 'expr'.
+ */
+static Bitmapset *
+pull_exec_paramids(Expr *expr)
+{
+ Bitmapset *result = NULL;
+
+ (void) pull_exec_paramids_walker((Node *) expr, &result);
+
+ return result;
+}
+
+static bool
+pull_exec_paramids_walker(Node *node, Bitmapset **context)
+{
+ if (node == NULL)
+ return false;
+ if (IsA(node, Param))
+ {
+ Param *param = (Param *) node;
+
+ if (param->paramkind == PARAM_EXEC)
+ *context = bms_add_member(*context, param->paramid);
+ return false;
+ }
+ return expression_tree_walker(node, pull_exec_paramids_walker,
+ (void *) context);
+}
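+
+/*
+ * Illustrative example: for an expression such as "$1 + 1" in which $1 is
+ * a PARAM_EXEC Param with paramid 1, pull_exec_paramids returns a
+ * Bitmapset containing just 1; Consts and PARAM_EXTERN Params contribute
+ * nothing.
+ */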
+
+/*
+ * get_partkey_exec_paramids
+ * Loop through given pruning steps and find out which exec Params
+ * are used.
+ *
+ * Returns a Bitmapset of Param IDs.
+ */
+static Bitmapset *
+get_partkey_exec_paramids(List *steps)
+{
+ Bitmapset *execparamids = NULL;
+ ListCell *lc;
+
+ foreach(lc, steps)
+ {
+ PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
+ ListCell *lc2;
+
+ if (!IsA(step, PartitionPruneStepOp))
+ continue;
+
+ foreach(lc2, step->exprs)
+ {
+ Expr *expr = lfirst(lc2);
+
+ /* We can be quick for plain Consts */
+ if (!IsA(expr, Const))
+ execparamids = bms_join(execparamids,
+ pull_exec_paramids(expr));
+ }
+ }
+
+ return execparamids;
+}
+
+/*
+ * perform_pruning_base_step
+ * Determines the indexes of datums that satisfy conditions specified in
+ * 'opstep'.
+ *
+ * Result also contains whether special null-accepting and/or default
+ * partition need to be scanned.
+ */
+static PruneStepResult *
+perform_pruning_base_step(PartitionPruneContext *context,
+ PartitionPruneStepOp *opstep)
+{
+ ListCell *lc1,
+ *lc2;
+ int keyno,
+ nvalues;
+ Datum values[PARTITION_MAX_KEYS];
+ FmgrInfo *partsupfunc;
+ int stateidx;
+
+ /*
+ * There better be the same number of expressions and compare functions.
+ */
+ Assert(list_length(opstep->exprs) == list_length(opstep->cmpfns));
+
+ nvalues = 0;
+ lc1 = list_head(opstep->exprs);
+ lc2 = list_head(opstep->cmpfns);
+
+ /*
+ * Generate the partition lookup key that will be used by one of the
+ * get_matching_*_bounds functions called below.
+ */
+ for (keyno = 0; keyno < context->partnatts; keyno++)
+ {
+ /*
+ * For hash partitioning, it is possible that values of some keys are
+ * not provided in operator clauses, but instead the planner found
+ * that they appeared in an IS NULL clause.
+ */
+ if (bms_is_member(keyno, opstep->nullkeys))
+ continue;
+
+ /*
+ * For range partitioning, we must only perform pruning with values
+ * for either all partition keys or a prefix thereof.
+ */
+ if (keyno > nvalues && context->strategy == PARTITION_STRATEGY_RANGE)
+ break;
+
+ if (lc1 != NULL)
+ {
+ Expr *expr;
+ Datum datum;
+ bool isnull;
+ Oid cmpfn;
+
+ expr = lfirst(lc1);
+ stateidx = PruneCxtStateIdx(context->partnatts,
+ opstep->step.step_id, keyno);
+ partkey_datum_from_expr(context, expr, stateidx,
+ &datum, &isnull);
+
+ /*
+ * Since we only allow strict operators in pruning steps, any
+ * null-valued comparison value must cause the comparison to fail,
+ * so that no partitions could match.
+ */
+ if (isnull)
+ {
+ PruneStepResult *result;
+
+ result = (PruneStepResult *) palloc(sizeof(PruneStepResult));
+ result->bound_offsets = NULL;
+ result->scan_default = false;
+ result->scan_null = false;
+
+ return result;
+ }
+
+ /* Set up the stepcmpfuncs entry, unless we already did */
+ cmpfn = lfirst_oid(lc2);
+ Assert(OidIsValid(cmpfn));
+ if (cmpfn != context->stepcmpfuncs[stateidx].fn_oid)
+ {
+ /*
+ * If the needed support function is the same one cached in
+ * the relation's partition key, copy the cached FmgrInfo.
+ * Otherwise (i.e., when we have a cross-type comparison), an
+ * actual lookup is required.
+ */
+ if (cmpfn == context->partsupfunc[keyno].fn_oid)
+ fmgr_info_copy(&context->stepcmpfuncs[stateidx],
+ &context->partsupfunc[keyno],
+ context->ppccontext);
+ else
+ fmgr_info_cxt(cmpfn, &context->stepcmpfuncs[stateidx],
+ context->ppccontext);
+ }
+
+ values[keyno] = datum;
+ nvalues++;
+
+ lc1 = lnext(opstep->exprs, lc1);
+ lc2 = lnext(opstep->cmpfns, lc2);
+ }
+ }
+
+ /*
+ * Point partsupfunc to the entry for the 0th key of this step; the
+ * additional support functions, if any, follow consecutively.
+ */
+ stateidx = PruneCxtStateIdx(context->partnatts, opstep->step.step_id, 0);
+ partsupfunc = &context->stepcmpfuncs[stateidx];
+
+ switch (context->strategy)
+ {
+ case PARTITION_STRATEGY_HASH:
+ return get_matching_hash_bounds(context,
+ opstep->opstrategy,
+ values, nvalues,
+ partsupfunc,
+ opstep->nullkeys);
+
+ case PARTITION_STRATEGY_LIST:
+ return get_matching_list_bounds(context,
+ opstep->opstrategy,
+ values[0], nvalues,
+ &partsupfunc[0],
+ opstep->nullkeys);
+
+ case PARTITION_STRATEGY_RANGE:
+ return get_matching_range_bounds(context,
+ opstep->opstrategy,
+ values, nvalues,
+ partsupfunc,
+ opstep->nullkeys);
+
+ default:
+ elog(ERROR, "unexpected partition strategy: %d",
+ (int) context->strategy);
+ break;
+ }
+
+ return NULL;
+}
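+
+/*
+ * Illustrative example (hypothetical schema): for a table HASH partitioned
+ * on (a, b) with the quals "a = 10 AND b IS NULL", the step carries one
+ * expression (for a) and has key 1 set in its nullkeys, so the loop above
+ * fills in values[0] only and get_matching_hash_bounds() then hashes a's
+ * value together with the null flag for b.
+ */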
+
+/*
+ * perform_pruning_combine_step
+ * Determines the indexes of datums obtained by combining those given
+ * by the steps identified by cstep->source_stepids using the specified
+ * combination method
+ *
+ * Since cstep may refer to the result of earlier steps, we also receive
+ * step_results here.
+ */
+static PruneStepResult *
+perform_pruning_combine_step(PartitionPruneContext *context,
+ PartitionPruneStepCombine *cstep,
+ PruneStepResult **step_results)
+{
+ PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
+ bool firststep;
+ ListCell *lc1;
+
+ /*
+ * A combine step without any source steps is an indication to not perform
+ * any partition pruning. Return all datum indexes in that case.
+ */
+ if (cstep->source_stepids == NIL)
+ {
+ PartitionBoundInfo boundinfo = context->boundinfo;
+
+ result->bound_offsets =
+ bms_add_range(NULL, 0, boundinfo->nindexes - 1);
+ result->scan_default = partition_bound_has_default(boundinfo);
+ result->scan_null = partition_bound_accepts_nulls(boundinfo);
+ return result;
+ }
+
+ switch (cstep->combineOp)
+ {
+ case PARTPRUNE_COMBINE_UNION:
+ foreach(lc1, cstep->source_stepids)
+ {
+ int step_id = lfirst_int(lc1);
+ PruneStepResult *step_result;
+
+ /*
+ * step_results[step_id] must contain a valid result, which is
+ * confirmed by the fact that cstep's step_id is greater than
+ * step_id and the fact that results of the individual steps
+ * are evaluated in sequence of their step_ids.
+ */
+ if (step_id >= cstep->step.step_id)
+ elog(ERROR, "invalid pruning combine step argument");
+ step_result = step_results[step_id];
+ Assert(step_result != NULL);
+
+ /* Record any additional datum indexes from this step */
+ result->bound_offsets = bms_add_members(result->bound_offsets,
+ step_result->bound_offsets);
+
+ /* Update whether to scan null and default partitions. */
+ if (!result->scan_null)
+ result->scan_null = step_result->scan_null;
+ if (!result->scan_default)
+ result->scan_default = step_result->scan_default;
+ }
+ break;
+
+ case PARTPRUNE_COMBINE_INTERSECT:
+ firststep = true;
+ foreach(lc1, cstep->source_stepids)
+ {
+ int step_id = lfirst_int(lc1);
+ PruneStepResult *step_result;
+
+ if (step_id >= cstep->step.step_id)
+ elog(ERROR, "invalid pruning combine step argument");
+ step_result = step_results[step_id];
+ Assert(step_result != NULL);
+
+ if (firststep)
+ {
+ /* Copy step's result the first time. */
+ result->bound_offsets =
+ bms_copy(step_result->bound_offsets);
+ result->scan_null = step_result->scan_null;
+ result->scan_default = step_result->scan_default;
+ firststep = false;
+ }
+ else
+ {
+ /* Record datum indexes common to both steps */
+ result->bound_offsets =
+ bms_int_members(result->bound_offsets,
+ step_result->bound_offsets);
+
+ /* Update whether to scan null and default partitions. */
+ if (result->scan_null)
+ result->scan_null = step_result->scan_null;
+ if (result->scan_default)
+ result->scan_default = step_result->scan_default;
+ }
+ }
+ break;
+ }
+
+ return result;
+}
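+
+/*
+ * Illustrative example: for the qual "a = 1 OR a = 2", each arm becomes a
+ * base step yielding one bound offset, and a PARTPRUNE_COMBINE_UNION step
+ * merges them so that both matched partitions survive; ANDed quals instead
+ * use PARTPRUNE_COMBINE_INTERSECT, keeping only the offsets common to all
+ * source steps.
+ */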
+
+/*
+ * match_boolean_partition_clause
+ *
+ * If we're able to match the clause to the partition key as a
+ * specially-shaped boolean clause, set *outconst to a Const containing a
+ * true or false value, set *noteq according to whether the clause was in
+ * the "not" form, i.e. "is not true" or "is not false", and return
+ * PARTCLAUSE_MATCH_CLAUSE. Returns PARTCLAUSE_UNSUPPORTED if the clause is
+ * not a boolean clause or if the boolean clause is unsuitable for partition
+ * pruning. Returns PARTCLAUSE_NOMATCH if it's a bool qual that just does
+ * not match this partition key. *outconst is set to NULL in the latter two
+ * cases.
+ */
+static PartClauseMatchStatus
+match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey,
+ Expr **outconst, bool *noteq)
+{
+ Expr *leftop;
+
+ *outconst = NULL;
+ *noteq = false;
+
+ if (!IsBooleanOpfamily(partopfamily))
+ return PARTCLAUSE_UNSUPPORTED;
+
+ if (IsA(clause, BooleanTest))
+ {
+ BooleanTest *btest = (BooleanTest *) clause;
+
+ /* Only IS [NOT] TRUE/FALSE are any good to us */
+ if (btest->booltesttype == IS_UNKNOWN ||
+ btest->booltesttype == IS_NOT_UNKNOWN)
+ return PARTCLAUSE_UNSUPPORTED;
+
+ leftop = btest->arg;
+ if (IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+
+ if (equal(leftop, partkey))
+ {
+ switch (btest->booltesttype)
+ {
+ case IS_NOT_TRUE:
+ *noteq = true;
+ /* fall through */
+ case IS_TRUE:
+ *outconst = (Expr *) makeBoolConst(true, false);
+ break;
+ case IS_NOT_FALSE:
+ *noteq = true;
+ /* fall through */
+ case IS_FALSE:
+ *outconst = (Expr *) makeBoolConst(false, false);
+ break;
+ default:
+ return PARTCLAUSE_UNSUPPORTED;
+ }
+ }
+ if (*outconst)
+ return PARTCLAUSE_MATCH_CLAUSE;
+ }
+ else
+ {
+ bool is_not_clause = is_notclause(clause);
+
+ leftop = is_not_clause ? get_notclausearg(clause) : clause;
+
+ if (IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+
+ /* Compare to the partition key, and make up a clause ... */
+ if (equal(leftop, partkey))
+ *outconst = (Expr *) makeBoolConst(!is_not_clause, false);
+ else if (equal(negate_clause((Node *) leftop), partkey))
+ *outconst = (Expr *) makeBoolConst(is_not_clause, false);
+
+ if (*outconst)
+ return PARTCLAUSE_MATCH_CLAUSE;
+ }
+
+ return PARTCLAUSE_NOMATCH;
+}
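+
+/*
+ * Illustrative example: for a boolean partition key "b", the clause
+ * "b IS NOT TRUE" sets *outconst to a true Const with *noteq = true (the
+ * key is compared as "<> true"), while a bare "NOT b" sets *outconst to a
+ * false Const with *noteq = false.
+ */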
+
+/*
+ * partkey_datum_from_expr
+ * Evaluate expression for potential partition pruning
+ *
+ * Evaluate 'expr'; set *value and *isnull to the resulting Datum and nullflag.
+ *
+ * If expr isn't a Const, its ExprState is in stateidx of the context
+ * exprstate array.
+ *
+ * Note that the evaluated result may be in the per-tuple memory context of
+ * context->exprcontext, and we may have leaked other memory there too.
+ * This memory must be recovered by resetting that ExprContext after
+ * we're done with the pruning operation (see execPartition.c).
+ */
+static void
+partkey_datum_from_expr(PartitionPruneContext *context,
+ Expr *expr, int stateidx,
+ Datum *value, bool *isnull)
+{
+ if (IsA(expr, Const))
+ {
+ /* We can always determine the value of a constant */
+ Const *con = (Const *) expr;
+
+ *value = con->constvalue;
+ *isnull = con->constisnull;
+ }
+ else
+ {
+ ExprState *exprstate;
+ ExprContext *ectx;
+
+ /*
+ * We should never see a non-Const in a step unless the caller has
+ * passed a valid ExprContext.
+ *
+ * When context->planstate is valid, context->exprcontext is the same as
+ * context->planstate->ps_ExprContext.
+ */
+ Assert(context->planstate != NULL || context->exprcontext != NULL);
+ Assert(context->planstate == NULL ||
+ (context->exprcontext == context->planstate->ps_ExprContext));
+
+ exprstate = context->exprstates[stateidx];
+ ectx = context->exprcontext;
+ *value = ExecEvalExprSwitchContext(exprstate, ectx, isnull);
+ }
+}