path: root/src/backend/executor
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
commit     46651ce6fe013220ed397add242004d764fc0153 (patch)
tree       6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/executor
parent     Initial commit. (diff)
download   postgresql-14-upstream.tar.xz
           postgresql-14-upstream.zip

Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--  src/backend/executor/Makefile | 82
-rw-r--r--  src/backend/executor/README | 405
-rw-r--r--  src/backend/executor/execAmi.c | 662
-rw-r--r--  src/backend/executor/execAsync.c | 154
-rw-r--r--  src/backend/executor/execCurrent.c | 426
-rw-r--r--  src/backend/executor/execExpr.c | 3965
-rw-r--r--  src/backend/executor/execExprInterp.c | 4373
-rw-r--r--  src/backend/executor/execGrouping.c | 560
-rw-r--r--  src/backend/executor/execIndexing.c | 921
-rw-r--r--  src/backend/executor/execJunk.c | 304
-rw-r--r--  src/backend/executor/execMain.c | 2886
-rw-r--r--  src/backend/executor/execParallel.c | 1498
-rw-r--r--  src/backend/executor/execPartition.c | 2107
-rw-r--r--  src/backend/executor/execProcnode.c | 981
-rw-r--r--  src/backend/executor/execReplication.c | 629
-rw-r--r--  src/backend/executor/execSRF.c | 980
-rw-r--r--  src/backend/executor/execScan.c | 342
-rw-r--r--  src/backend/executor/execTuples.c | 2339
-rw-r--r--  src/backend/executor/execUtils.c | 1351
-rw-r--r--  src/backend/executor/functions.c | 2103
-rw-r--r--  src/backend/executor/instrument.c | 279
-rw-r--r--  src/backend/executor/nodeAgg.c | 4829
-rw-r--r--  src/backend/executor/nodeAppend.c | 1186
-rw-r--r--  src/backend/executor/nodeBitmapAnd.c | 223
-rw-r--r--  src/backend/executor/nodeBitmapHeapscan.c | 954
-rw-r--r--  src/backend/executor/nodeBitmapIndexscan.c | 330
-rw-r--r--  src/backend/executor/nodeBitmapOr.c | 241
-rw-r--r--  src/backend/executor/nodeCtescan.c | 351
-rw-r--r--  src/backend/executor/nodeCustom.c | 228
-rw-r--r--  src/backend/executor/nodeForeignscan.c | 504
-rw-r--r--  src/backend/executor/nodeFunctionscan.c | 620
-rw-r--r--  src/backend/executor/nodeGather.c | 477
-rw-r--r--  src/backend/executor/nodeGatherMerge.c | 789
-rw-r--r--  src/backend/executor/nodeGroup.c | 255
-rw-r--r--  src/backend/executor/nodeHash.c | 3434
-rw-r--r--  src/backend/executor/nodeHashjoin.c | 1551
-rw-r--r--  src/backend/executor/nodeIncrementalSort.c | 1257
-rw-r--r--  src/backend/executor/nodeIndexonlyscan.c | 735
-rw-r--r--  src/backend/executor/nodeIndexscan.c | 1747
-rw-r--r--  src/backend/executor/nodeLimit.c | 558
-rw-r--r--  src/backend/executor/nodeLockRows.c | 403
-rw-r--r--  src/backend/executor/nodeMaterial.c | 368
-rw-r--r--  src/backend/executor/nodeMemoize.c | 1225
-rw-r--r--  src/backend/executor/nodeMergeAppend.c | 389
-rw-r--r--  src/backend/executor/nodeMergejoin.c | 1678
-rw-r--r--  src/backend/executor/nodeModifyTable.c | 3243
-rw-r--r--  src/backend/executor/nodeNamedtuplestorescan.c | 201
-rw-r--r--  src/backend/executor/nodeNestloop.c | 411
-rw-r--r--  src/backend/executor/nodeProjectSet.c | 351
-rw-r--r--  src/backend/executor/nodeRecursiveunion.c | 331
-rw-r--r--  src/backend/executor/nodeResult.c | 272
-rw-r--r--  src/backend/executor/nodeSamplescan.c | 378
-rw-r--r--  src/backend/executor/nodeSeqscan.c | 314
-rw-r--r--  src/backend/executor/nodeSetOp.c | 651
-rw-r--r--  src/backend/executor/nodeSort.c | 430
-rw-r--r--  src/backend/executor/nodeSubplan.c | 1313
-rw-r--r--  src/backend/executor/nodeSubqueryscan.c | 213
-rw-r--r--  src/backend/executor/nodeTableFuncscan.c | 523
-rw-r--r--  src/backend/executor/nodeTidrangescan.c | 413
-rw-r--r--  src/backend/executor/nodeTidscan.c | 558
-rw-r--r--  src/backend/executor/nodeUnique.c | 192
-rw-r--r--  src/backend/executor/nodeValuesscan.c | 361
-rw-r--r--  src/backend/executor/nodeWindowAgg.c | 3463
-rw-r--r--  src/backend/executor/nodeWorktablescan.c | 223
-rw-r--r--  src/backend/executor/spi.c | 3383
-rw-r--r--  src/backend/executor/tqueue.c | 210
-rw-r--r--  src/backend/executor/tstoreReceiver.c | 283
67 files changed, 69396 insertions, 0 deletions
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
new file mode 100644
index 0000000..11118d0
--- /dev/null
+++ b/src/backend/executor/Makefile
@@ -0,0 +1,82 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for executor
+#
+# IDENTIFICATION
+# src/backend/executor/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/executor
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ execAmi.o \
+ execAsync.o \
+ execCurrent.o \
+ execExpr.o \
+ execExprInterp.o \
+ execGrouping.o \
+ execIndexing.o \
+ execJunk.o \
+ execMain.o \
+ execParallel.o \
+ execPartition.o \
+ execProcnode.o \
+ execReplication.o \
+ execSRF.o \
+ execScan.o \
+ execTuples.o \
+ execUtils.o \
+ functions.o \
+ instrument.o \
+ nodeAgg.o \
+ nodeAppend.o \
+ nodeBitmapAnd.o \
+ nodeBitmapHeapscan.o \
+ nodeBitmapIndexscan.o \
+ nodeBitmapOr.o \
+ nodeCtescan.o \
+ nodeCustom.o \
+ nodeForeignscan.o \
+ nodeFunctionscan.o \
+ nodeGather.o \
+ nodeGatherMerge.o \
+ nodeGroup.o \
+ nodeHash.o \
+ nodeHashjoin.o \
+ nodeIncrementalSort.o \
+ nodeIndexonlyscan.o \
+ nodeIndexscan.o \
+ nodeLimit.o \
+ nodeLockRows.o \
+ nodeMaterial.o \
+ nodeMemoize.o \
+ nodeMergeAppend.o \
+ nodeMergejoin.o \
+ nodeModifyTable.o \
+ nodeNamedtuplestorescan.o \
+ nodeNestloop.o \
+ nodeProjectSet.o \
+ nodeRecursiveunion.o \
+ nodeResult.o \
+ nodeSamplescan.o \
+ nodeSeqscan.o \
+ nodeSetOp.o \
+ nodeSort.o \
+ nodeSubplan.o \
+ nodeSubqueryscan.o \
+ nodeTableFuncscan.o \
+ nodeTidrangescan.o \
+ nodeTidscan.o \
+ nodeUnique.o \
+ nodeValuesscan.o \
+ nodeWindowAgg.o \
+ nodeWorktablescan.o \
+ spi.o \
+ tqueue.o \
+ tstoreReceiver.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/executor/README b/src/backend/executor/README
new file mode 100644
index 0000000..bf5e708
--- /dev/null
+++ b/src/backend/executor/README
@@ -0,0 +1,405 @@
+src/backend/executor/README
+
+The Postgres Executor
+=====================
+
+The executor processes a tree of "plan nodes". The plan tree is essentially
+a demand-pull pipeline of tuple processing operations. Each node, when
+called, will produce the next tuple in its output sequence, or NULL if no
+more tuples are available. If the node is not a primitive relation-scanning
+node, it will have child node(s) that it calls in turn to obtain input
+tuples.
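+
+As an illustration (not a verbatim excerpt from the sources), a caller such
+as ExecutePlan() drives this demand-pull protocol with a loop of roughly the
+following shape; setup and per-tuple processing are elided:
+
+    for (;;)
+    {
+        TupleTableSlot *slot = ExecProcNode(planstate); /* pull next tuple */
+
+        if (TupIsNull(slot))    /* no more tuples available */
+            break;
+        /* ... send the tuple in 'slot' to its destination ... */
+    }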
+
+Refinements on this basic model include:
+
+* Choice of scan direction (forwards or backwards). Caution: this is not
+currently well-supported. It works for primitive scan nodes, but not very
+well for joins, aggregates, etc.
+
+* Rescan command to reset a node and make it generate its output sequence
+over again.
+
+* Parameters that can alter a node's results. After adjusting a parameter,
+the rescan command must be applied to that node and all nodes above it.
+There is a moderately intelligent scheme to avoid rescanning nodes
+unnecessarily (for example, Sort does not rescan its input if no parameters
+of the input have changed, since it can just reread its stored sorted data).
+
+For a SELECT, it is only necessary to deliver the top-level result tuples
+to the client. For INSERT/UPDATE/DELETE, the actual table modification
+operations happen in a top-level ModifyTable plan node. If the query
+includes a RETURNING clause, the ModifyTable node delivers the computed
+RETURNING rows as output, otherwise it returns nothing. Handling INSERT
+is pretty straightforward: the tuples returned from the plan tree below
+ModifyTable are inserted into the correct result relation. For UPDATE,
+the plan tree returns the new values of the updated columns, plus "junk"
+(hidden) column(s) identifying which table row is to be updated. The
+ModifyTable node must fetch that row to extract values for the unchanged
+columns, combine the values into a new row, and apply the update. (For a
+heap table, the row-identity junk column is a CTID, but other things may
+be used for other table types.) For DELETE, the plan tree need only deliver
+junk row-identity column(s), and the ModifyTable node visits each of those
+rows and marks the row deleted.
+
+XXX a great deal more documentation needs to be written here...
+
+
+Plan Trees and State Trees
+--------------------------
+
+The plan tree delivered by the planner contains a tree of Plan nodes (struct
+types derived from struct Plan). During executor startup we build a parallel
+tree of identical structure containing executor state nodes --- generally,
+every plan node type has a corresponding executor state node type. Each node
+in the state tree has a pointer to its corresponding node in the plan tree,
+plus executor state data as needed to implement that node type. This
+arrangement allows the plan tree to be completely read-only so far as the
+executor is concerned: all data that is modified during execution is in the
+state tree. Read-only plan trees make life much simpler for plan caching and
+reuse.
+
+A corresponding executor state node may not be created during executor startup
+if the executor determines that an entire subplan is not required due to
+execution time partition pruning determining that no matching records will be
+found there. This currently only occurs for Append and MergeAppend nodes. In
+this case the non-required subplans are ignored and the executor state's
+subnode array will become out of sequence to the plan's subplan list.
+
+Each Plan node may have expression trees associated with it, to represent
+its target list, qualification conditions, etc. These trees are also
+read-only to the executor, but the executor state for expression evaluation
+does not mirror the Plan expression's tree shape, as explained below.
+Rather, there's just one ExprState node per expression tree, although this
+may have sub-nodes for some complex expression node types.
+
+Altogether there are four classes of nodes used in these trees: Plan nodes,
+their corresponding PlanState nodes, Expr nodes, and ExprState nodes.
+(Actually, there are also List nodes, which are used as "glue" in all
+three tree-based representations.)
+
+
+Expression Trees and ExprState nodes
+------------------------------------
+
+Expression trees, in contrast to Plan trees, are not mirrored into a
+corresponding tree of state nodes. Instead each separately executable
+expression tree (e.g. a Plan's qual or targetlist) is represented by one
+ExprState node. The ExprState node contains the information needed to
+evaluate the expression in a compact, linear form. That compact form is
+stored as a flat array in ExprState->steps[] (an array of ExprEvalStep,
+not ExprEvalStep *).
+
+The reasons for choosing such a representation include:
+- commonly the amount of work needed to evaluate one Expr-type node is
+ small enough that the overhead of having to perform a tree-walk
+ during evaluation is significant.
+- the flat representation can be evaluated non-recursively within a single
+ function, reducing stack depth and function call overhead.
+- such a representation is usable both for fast interpreted execution,
+ and for compiling into native code.
+
+The Plan-tree representation of an expression is compiled into an
+ExprState node by ExecInitExpr(). As much complexity as possible should
+be handled by ExecInitExpr() (and helpers), instead of execution time
+where both interpreted and compiled versions would need to deal with the
+complexity. Besides duplicating effort between execution approaches,
+runtime initialization checks also have a small but noticeable cost every
+time the expression is evaluated. Therefore, we allow ExecInitExpr() to
+precompute information that we do not expect to vary across execution of a
+single query, for example the set of CHECK constraint expressions to be
+applied to a domain type. This could not be done at plan time without
+greatly increasing the number of events that require plan invalidation.
+(Previously, some information of this kind was rechecked on each
+expression evaluation, but that seems like unnecessary overhead.)
+
+
+Expression Initialization
+-------------------------
+
+During ExecInitExpr() and similar routines, Expr trees are converted
+into the flat representation. Each Expr node might be represented by
+zero, one, or more ExprEvalSteps.
+
+Each ExprEvalStep's work is determined by its opcode (of enum ExprEvalOp)
+and it stores the result of its work into the Datum variable and boolean
+null flag variable pointed to by ExprEvalStep->resvalue/resnull.
+Complex expressions are performed by chaining together several steps.
+For example, "a + b" (one OpExpr, with two Var expressions) would be
+represented as two steps to fetch the Var values, and one step for the
+evaluation of the function underlying the + operator. The steps for the
+Vars would have their resvalue/resnull pointing directly to the appropriate
+args[].value and .isnull elements in the FunctionCallInfoBaseData struct that
+is used by the function evaluation step, thus avoiding extra work to copy
+the result values around.
+
+The last entry in a completed ExprState->steps array is always an
+EEOP_DONE step; this removes the need to test for end-of-array while
+iterating. Also, if the expression contains any variable references (to
+user columns of the ExprContext's INNER, OUTER, or SCAN tuples), the steps
+array begins with EEOP_*_FETCHSOME steps that ensure that the relevant
+tuples have been deconstructed to make the required columns directly
+available (cf. slot_getsomeattrs()). This allows individual Var-fetching
+steps to be little more than an array lookup.
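+
+Putting the two preceding paragraphs together, the steps array for the
+"a + b" example could look roughly like this (an illustrative sketch; the
+exact opcodes depend on where the Vars come from and on whether the
+operator's underlying function, assumed here to be int4pl, is strict):
+
+    steps[0]  EEOP_SCAN_FETCHSOME   deform the scan tuple far enough to reach a and b
+    steps[1]  EEOP_SCAN_VAR         fetch a into fcinfo->args[0].value/isnull
+    steps[2]  EEOP_SCAN_VAR         fetch b into fcinfo->args[1].value/isnull
+    steps[3]  EEOP_FUNCEXPR_STRICT  call int4pl, storing into resvalue/resnull
+    steps[4]  EEOP_DONE             evaluation complete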
+
+Most of ExecInitExpr()'s work is done by the recursive function
+ExecInitExprRec() and its subroutines. ExecInitExprRec() maps one Expr
+node into the steps required for execution, recursing as needed for
+sub-expressions.
+
+Each ExecInitExprRec() call has to specify where that subexpression's
+results are to be stored (via the resv/resnull parameters). This allows
+the above scenario of evaluating a (sub-)expression directly into
+fcinfo->args[].value/isnull, but also requires some care: target Datum/isnull
+variables may not be shared with another ExecInitExprRec() unless the
+results are only needed by steps executing before further usages of those
+target Datum/isnull variables. Due to the non-recursiveness of the
+ExprEvalStep representation that's usually easy to guarantee.
+
+ExecInitExprRec() pushes new operations into the ExprState->steps array
+using ExprEvalPushStep(). To keep the steps as a consecutively laid out
+array, ExprEvalPushStep() has to repalloc the entire array when there's
+not enough space. Because of that it is *not* allowed to point directly
+into any of the steps during expression initialization. Therefore, the
+resv/resnull for a subexpression usually point to some storage that is
+palloc'd separately from the steps array. For instance, the
+FunctionCallInfoBaseData for a function call step is separately allocated
+rather than being part of the ExprEvalStep array. The overall result
+of a complete expression is typically returned into the resvalue/resnull
+fields of the ExprState node itself.
+
+Some steps, e.g. boolean expressions, allow skipping evaluation of
+certain subexpressions. In the flat representation this amounts to
+jumping to some later step rather than just continuing consecutively
+with the next step. The target for such a jump is represented by
+the integer index in the ExprState->steps array of the step to execute
+next. (Compare the EEO_NEXT and EEO_JUMP macros in execExprInterp.c.)
+
+Typically, ExecInitExprRec() has to push a jumping step into the steps
+array, then recursively generate steps for the subexpression that might
+get skipped over, then go back and fix up the jump target index using
+the now-known length of the subexpression's steps. This is handled by
+adjust_jumps lists in execExpr.c.
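+
+For instance, an expression like "a AND b" might compile to a sequence of
+this rough shape (illustrative only; the real opcodes also track NULL
+inputs, and the jump target is filled in by the adjust_jumps fixup):
+
+    steps[i]    ... evaluate a into resvalue/resnull ...
+    steps[i+1]  EEOP_BOOL_AND_STEP_FIRST  if a is false, jump to steps[i+4]
+    steps[i+2]  ... evaluate b into resvalue/resnull ...
+    steps[i+3]  EEOP_BOOL_AND_STEP_LAST   fold b into the overall AND result
+    steps[i+4]  whatever step follows the AND expression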
+
+The last step in constructing an ExprState is to apply ExecReadyExpr(),
+which readies it for execution using whichever execution method has been
+selected.
+
+
+Expression Evaluation
+---------------------
+
+To allow for different methods of expression evaluation, and for
+better branch/jump target prediction, expressions are evaluated by
+calling ExprState->evalfunc (via ExecEvalExpr() and friends).
+
+ExecReadyExpr() can choose the method of interpretation by setting
+evalfunc to an appropriate function. The default execution function,
+ExecInterpExpr, is implemented in execExprInterp.c; see its header
+comment for details. Special-case evalfuncs are used for certain
+especially-simple expressions.
+
+Note that a lot of the more complex expression evaluation steps, which are
+less performance-critical than the simpler ones, are implemented as
+separate functions outside the fast-path of expression execution, allowing
+their implementation to be shared between interpreted and compiled
+expression evaluation. This means that these helper functions are not
+allowed to perform expression step dispatch themselves, as the method of
+dispatch will vary based on the caller. The helpers therefore cannot call
+for the execution of subexpressions; all subexpression results they need
+must be computed by earlier steps. And dispatch to the following
+expression step must be performed after returning from the helper.
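+
+From a caller's perspective the dispatch is invisible: it just invokes
+ExecEvalExpr() and receives a Datum plus a null flag. A minimal sketch,
+assuming 'qualstate' is an ExprState built by ExecInitExpr() and 'econtext'
+is the node's ExprContext:
+
+    Datum   result;
+    bool    isnull;
+
+    result = ExecEvalExpr(qualstate, econtext, &isnull);  /* calls evalfunc */
+    if (!isnull && DatumGetBool(result))
+    {
+        /* expression evaluated to true */
+    }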
+
+
+Targetlist Evaluation
+---------------------
+
+ExecBuildProjectionInfo builds an ExprState that has the effect of
+evaluating a targetlist into ExprState->resultslot. A generic targetlist
+expression is executed by evaluating it as discussed above (storing the
+result into the ExprState's resvalue/resnull fields) and then using an
+EEOP_ASSIGN_TMP step to move the result into the appropriate tts_values[]
+and tts_isnull[] array elements of the result slot. There are special
+fast-path step types (EEOP_ASSIGN_*_VAR) to handle targetlist entries that
+are simple Vars using only one step instead of two.
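+
+For illustration, a node might set up and run such a projection as follows
+(the variable names are placeholders for node-specific state, not executor
+requirements):
+
+    ProjectionInfo *proj;
+    TupleTableSlot *result;
+
+    proj = ExecBuildProjectionInfo(targetlist,   /* list of TargetEntry nodes */
+                                   econtext,     /* expressions evaluated here */
+                                   resultslot,   /* receives tts_values[]/tts_isnull[] */
+                                   planstate,    /* parent PlanState */
+                                   inputdesc);   /* descriptor of the input tuples */
+    result = ExecProject(proj);                  /* fills and returns resultslot */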
+
+
+Memory Management
+-----------------
+
+A "per query" memory context is created during CreateExecutorState();
+all storage allocated during an executor invocation is allocated in that
+context or a child context. This allows easy reclamation of storage
+during executor shutdown --- rather than messing with retail pfree's and
+probable storage leaks, we just destroy the memory context.
+
+In particular, the plan state trees and expression state trees described
+in the previous section are allocated in the per-query memory context.
+
+To avoid intra-query memory leaks, most processing while a query runs
+is done in "per tuple" memory contexts, which are so-called because they
+are typically reset to empty once per tuple. Per-tuple contexts are usually
+associated with ExprContexts, and commonly each PlanState node has its own
+ExprContext to evaluate its qual and targetlist expressions in.
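+
+The typical per-tuple coding pattern, sketched here with an assumed
+'oldcontext' variable, is to reset the context and then do any transient
+allocations inside it:
+
+    MemoryContext oldcontext;
+
+    ResetExprContext(econtext);   /* discard the previous cycle's allocations */
+    oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+    /* ... per-tuple work whose allocations need not outlive this cycle ... */
+    MemoryContextSwitchTo(oldcontext);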
+
+
+Query Processing Control Flow
+-----------------------------
+
+This is a sketch of control flow for full query processing:
+
+ CreateQueryDesc
+
+ ExecutorStart
+ CreateExecutorState
+ creates per-query context
+ switch to per-query context to run ExecInitNode
+ AfterTriggerBeginQuery
+ ExecInitNode --- recursively scans plan tree
+ ExecInitNode
+ recurse into subsidiary nodes
+ CreateExprContext
+ creates per-tuple context
+ ExecInitExpr
+
+ ExecutorRun
+ ExecProcNode --- recursively called in per-query context
+ ExecEvalExpr --- called in per-tuple context
+ ResetExprContext --- to free memory
+
+ ExecutorFinish
+ ExecPostprocessPlan --- run any unfinished ModifyTable nodes
+ AfterTriggerEndQuery
+
+ ExecutorEnd
+ ExecEndNode --- recursively releases resources
+ FreeExecutorState
+ frees per-query context and child contexts
+
+ FreeQueryDesc
+
+Per above comments, it's not really critical for ExecEndNode to free any
+memory; it'll all go away in FreeExecutorState anyway. However, we do need to
+be careful to close relations, drop buffer pins, etc, so we do need to scan
+the plan state tree to find these sorts of resources.
+
+
+The executor can also be used to evaluate simple expressions without any Plan
+tree ("simple" meaning "no aggregates and no sub-selects", though such might
+be hidden inside function calls). This case has a flow of control like
+
+ CreateExecutorState
+ creates per-query context
+
+ CreateExprContext -- or use GetPerTupleExprContext(estate)
+ creates per-tuple context
+
+ ExecPrepareExpr
+ temporarily switch to per-query context
+ run the expression through expression_planner
+ ExecInitExpr
+
+ Repeatedly do:
+ ExecEvalExprSwitchContext
+ ExecEvalExpr --- called in per-tuple context
+ ResetExprContext --- to free memory
+
+ FreeExecutorState
+ frees per-query context, as well as ExprContext
+ (a separate FreeExprContext call is not necessary)
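+
+In C, that flow looks roughly like the following sketch (error handling
+omitted; 'expr' is assumed to be an already-parsed Expr tree):
+
+    EState      *estate = CreateExecutorState();
+    ExprContext *econtext = GetPerTupleExprContext(estate);
+    ExprState   *exprstate = ExecPrepareExpr(expr, estate);
+    Datum        value;
+    bool         isnull;
+
+    value = ExecEvalExprSwitchContext(exprstate, econtext, &isnull);
+    ResetExprContext(econtext);    /* free per-tuple memory */
+
+    FreeExecutorState(estate);     /* also frees the ExprContext */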
+
+
+EvalPlanQual (READ COMMITTED Update Checking)
+---------------------------------------------
+
+For simple SELECTs, the executor need only pay attention to tuples that are
+valid according to the snapshot seen by the current transaction (ie, they
+were inserted by a previously committed transaction, and not deleted by any
+previously committed transaction). However, for UPDATE and DELETE it is not
+cool to modify or delete a tuple that's been modified by an open or
+concurrently-committed transaction. If we are running in SERIALIZABLE
+isolation level then we just raise an error when this condition is seen to
+occur. In READ COMMITTED isolation level, we must work a lot harder.
+
+The basic idea in READ COMMITTED mode is to take the modified tuple
+committed by the concurrent transaction (after waiting for it to commit,
+if need be) and re-evaluate the query qualifications to see if it would
+still meet the quals. If so, we regenerate the updated tuple (if we are
+doing an UPDATE) from the modified tuple, and finally update/delete the
+modified tuple. SELECT FOR UPDATE/SHARE behaves similarly, except that its
+action is just to lock the modified tuple and return results based on that
+version of the tuple.
+
+To implement this checking, we actually re-run the query from scratch for
+each modified tuple (or set of tuples, for SELECT FOR UPDATE), with the
+relation scan nodes tweaked to return only the current tuples --- either
+the original ones, or the updated (and now locked) versions of the modified
+tuple(s). If this query returns a tuple, then the modified tuple(s) pass
+the quals (and the query output is the suitably modified update tuple, if
+we're doing UPDATE). If no tuple is returned, then the modified tuple(s)
+fail the quals, so we ignore the current result tuple and continue the
+original query.
+
+In UPDATE/DELETE, only the target relation needs to be handled this way.
+In SELECT FOR UPDATE, there may be multiple relations flagged FOR UPDATE,
+so we obtain lock on the current tuple version in each such relation before
+executing the recheck.
+
+It is also possible that there are relations in the query that are not
+to be locked (they are neither the UPDATE/DELETE target nor specified to
+be locked in SELECT FOR UPDATE/SHARE). When re-running the test query
+we want to use the same rows from these relations that were joined to
+the locked rows. For ordinary relations this can be implemented relatively
+cheaply by including the row TID in the join outputs and re-fetching that
+TID. (The re-fetch is expensive, but we're trying to optimize the normal
+case where no re-test is needed.) We have also to consider non-table
+relations, such as a ValuesScan or FunctionScan. For these, since there
+is no equivalent of TID, the only practical solution seems to be to include
+the entire row value in the join output row.
+
+We disallow set-returning functions in the targetlist of SELECT FOR UPDATE,
+so as to ensure that at most one tuple can be returned for any particular
+set of scan tuples. Otherwise we'd get duplicates due to the original
+query returning the same set of scan tuples multiple times. Likewise,
+SRFs are disallowed in an UPDATE's targetlist. There, they would have the
+effect of the same row being updated multiple times, which is not very
+useful --- and updates after the first would have no effect anyway.
+
+
+Asynchronous Execution
+----------------------
+
+In cases where a node is waiting on an event external to the database system,
+such as a ForeignScan awaiting network I/O, it's desirable for the node to
+indicate that it cannot return any tuple immediately but may be able to do so
+at a later time. A process which discovers this type of situation can always
+handle it simply by blocking, but this may waste time that could be spent
+executing some other part of the plan tree where progress could be made
+immediately. This is particularly likely to occur when the plan tree contains
+an Append node. Asynchronous execution runs multiple parts of an Append node
+concurrently rather than serially to improve performance.
+
+For asynchronous execution, an Append node must first request a tuple from an
+async-capable child node using ExecAsyncRequest. Next, it must execute the
+asynchronous event loop using ExecAppendAsyncEventWait. Eventually, when a
+child node to which an asynchronous request has been made produces a tuple,
+the Append node will receive it from the event loop via ExecAsyncResponse. In
+the current implementation of asynchronous execution, the only node type that
+requests tuples from an async-capable child node is an Append, while the only
+node type that might be async-capable is a ForeignScan.
+
+Typically, the ExecAsyncResponse callback is the only one required for nodes
+that wish to request tuples asynchronously. On the other hand, async-capable
+nodes generally need to implement three methods:
+
+1. When an asynchronous request is made, the node's ExecAsyncRequest callback
+ will be invoked; it should use ExecAsyncRequestPending to indicate that the
+ request is pending for a callback described below. Alternatively, it can
+ instead use ExecAsyncRequestDone if a result is available immediately.
+
+2. When the event loop wishes to wait or poll for file descriptor events, the
+ node's ExecAsyncConfigureWait callback will be invoked to configure the
+ file descriptor event for which the node wishes to wait.
+
+3. When the file descriptor becomes ready, the node's ExecAsyncNotify callback
+ will be invoked; like #1, it should use ExecAsyncRequestPending for another
+ callback or ExecAsyncRequestDone to return a result immediately.
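+
+As a sketch of methods #1 and #3, an async-capable node's callbacks reduce
+to choosing between the two result-delivery calls defined in execAsync.c
+(the helper name and 'slot' variable below are hypothetical, standing in
+for node-specific logic):
+
+    static void
+    my_node_async_request(AsyncRequest *areq)
+    {
+        TupleTableSlot *slot;
+
+        if (my_node_tuple_is_ready(areq->requestee, &slot))
+            ExecAsyncRequestDone(areq, slot);   /* result available now */
+        else
+            ExecAsyncRequestPending(areq);      /* call back via ExecAsyncNotify */
+    }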
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
new file mode 100644
index 0000000..c3aa650
--- /dev/null
+++ b/src/backend/executor/execAmi.c
@@ -0,0 +1,662 @@
+/*-------------------------------------------------------------------------
+ *
+ * execAmi.c
+ * miscellaneous executor access method routines
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/executor/execAmi.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amapi.h"
+#include "access/htup_details.h"
+#include "executor/execdebug.h"
+#include "executor/nodeAgg.h"
+#include "executor/nodeAppend.h"
+#include "executor/nodeBitmapAnd.h"
+#include "executor/nodeBitmapHeapscan.h"
+#include "executor/nodeBitmapIndexscan.h"
+#include "executor/nodeBitmapOr.h"
+#include "executor/nodeCtescan.h"
+#include "executor/nodeCustom.h"
+#include "executor/nodeForeignscan.h"
+#include "executor/nodeFunctionscan.h"
+#include "executor/nodeGather.h"
+#include "executor/nodeGatherMerge.h"
+#include "executor/nodeGroup.h"
+#include "executor/nodeHash.h"
+#include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
+#include "executor/nodeIndexonlyscan.h"
+#include "executor/nodeIndexscan.h"
+#include "executor/nodeLimit.h"
+#include "executor/nodeLockRows.h"
+#include "executor/nodeMaterial.h"
+#include "executor/nodeMemoize.h"
+#include "executor/nodeMergeAppend.h"
+#include "executor/nodeMergejoin.h"
+#include "executor/nodeModifyTable.h"
+#include "executor/nodeNamedtuplestorescan.h"
+#include "executor/nodeNestloop.h"
+#include "executor/nodeProjectSet.h"
+#include "executor/nodeRecursiveunion.h"
+#include "executor/nodeResult.h"
+#include "executor/nodeSamplescan.h"
+#include "executor/nodeSeqscan.h"
+#include "executor/nodeSetOp.h"
+#include "executor/nodeSort.h"
+#include "executor/nodeSubplan.h"
+#include "executor/nodeSubqueryscan.h"
+#include "executor/nodeTableFuncscan.h"
+#include "executor/nodeTidrangescan.h"
+#include "executor/nodeTidscan.h"
+#include "executor/nodeUnique.h"
+#include "executor/nodeValuesscan.h"
+#include "executor/nodeWindowAgg.h"
+#include "executor/nodeWorktablescan.h"
+#include "nodes/extensible.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/pathnodes.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+static bool IndexSupportsBackwardScan(Oid indexid);
+
+
+/*
+ * ExecReScan
+ * Reset a plan node so that its output can be re-scanned.
+ *
+ * Note that if the plan node has parameters that have changed value,
+ * the output might be different from last time.
+ */
+void
+ExecReScan(PlanState *node)
+{
+ /* If collecting timing stats, update them */
+ if (node->instrument)
+ InstrEndLoop(node->instrument);
+
+ /*
+ * If we have changed parameters, propagate that info.
+ *
+ * Note: ExecReScanSetParamPlan() can add bits to node->chgParam,
+ * corresponding to the output param(s) that the InitPlan will update.
+ * Since we make only one pass over the list, that means that an InitPlan
+ * can depend on the output param(s) of a sibling InitPlan only if that
+ * sibling appears earlier in the list. This is workable for now given
+ * the limited ways in which one InitPlan could depend on another, but
+ * eventually we might need to work harder (or else make the planner
+ * enlarge the extParam/allParam sets to include the params of depended-on
+ * InitPlans).
+ */
+ if (node->chgParam != NULL)
+ {
+ ListCell *l;
+
+ foreach(l, node->initPlan)
+ {
+ SubPlanState *sstate = (SubPlanState *) lfirst(l);
+ PlanState *splan = sstate->planstate;
+
+ if (splan->plan->extParam != NULL) /* don't care about child
+ * local Params */
+ UpdateChangedParamSet(splan, node->chgParam);
+ if (splan->chgParam != NULL)
+ ExecReScanSetParamPlan(sstate, node);
+ }
+ foreach(l, node->subPlan)
+ {
+ SubPlanState *sstate = (SubPlanState *) lfirst(l);
+ PlanState *splan = sstate->planstate;
+
+ if (splan->plan->extParam != NULL)
+ UpdateChangedParamSet(splan, node->chgParam);
+ }
+ /* Well. Now set chgParam for left/right trees. */
+ if (node->lefttree != NULL)
+ UpdateChangedParamSet(node->lefttree, node->chgParam);
+ if (node->righttree != NULL)
+ UpdateChangedParamSet(node->righttree, node->chgParam);
+ }
+
+ /* Call expression callbacks */
+ if (node->ps_ExprContext)
+ ReScanExprContext(node->ps_ExprContext);
+
+ /* And do node-type-specific processing */
+ switch (nodeTag(node))
+ {
+ case T_ResultState:
+ ExecReScanResult((ResultState *) node);
+ break;
+
+ case T_ProjectSetState:
+ ExecReScanProjectSet((ProjectSetState *) node);
+ break;
+
+ case T_ModifyTableState:
+ ExecReScanModifyTable((ModifyTableState *) node);
+ break;
+
+ case T_AppendState:
+ ExecReScanAppend((AppendState *) node);
+ break;
+
+ case T_MergeAppendState:
+ ExecReScanMergeAppend((MergeAppendState *) node);
+ break;
+
+ case T_RecursiveUnionState:
+ ExecReScanRecursiveUnion((RecursiveUnionState *) node);
+ break;
+
+ case T_BitmapAndState:
+ ExecReScanBitmapAnd((BitmapAndState *) node);
+ break;
+
+ case T_BitmapOrState:
+ ExecReScanBitmapOr((BitmapOrState *) node);
+ break;
+
+ case T_SeqScanState:
+ ExecReScanSeqScan((SeqScanState *) node);
+ break;
+
+ case T_SampleScanState:
+ ExecReScanSampleScan((SampleScanState *) node);
+ break;
+
+ case T_GatherState:
+ ExecReScanGather((GatherState *) node);
+ break;
+
+ case T_GatherMergeState:
+ ExecReScanGatherMerge((GatherMergeState *) node);
+ break;
+
+ case T_IndexScanState:
+ ExecReScanIndexScan((IndexScanState *) node);
+ break;
+
+ case T_IndexOnlyScanState:
+ ExecReScanIndexOnlyScan((IndexOnlyScanState *) node);
+ break;
+
+ case T_BitmapIndexScanState:
+ ExecReScanBitmapIndexScan((BitmapIndexScanState *) node);
+ break;
+
+ case T_BitmapHeapScanState:
+ ExecReScanBitmapHeapScan((BitmapHeapScanState *) node);
+ break;
+
+ case T_TidScanState:
+ ExecReScanTidScan((TidScanState *) node);
+ break;
+
+ case T_TidRangeScanState:
+ ExecReScanTidRangeScan((TidRangeScanState *) node);
+ break;
+
+ case T_SubqueryScanState:
+ ExecReScanSubqueryScan((SubqueryScanState *) node);
+ break;
+
+ case T_FunctionScanState:
+ ExecReScanFunctionScan((FunctionScanState *) node);
+ break;
+
+ case T_TableFuncScanState:
+ ExecReScanTableFuncScan((TableFuncScanState *) node);
+ break;
+
+ case T_ValuesScanState:
+ ExecReScanValuesScan((ValuesScanState *) node);
+ break;
+
+ case T_CteScanState:
+ ExecReScanCteScan((CteScanState *) node);
+ break;
+
+ case T_NamedTuplestoreScanState:
+ ExecReScanNamedTuplestoreScan((NamedTuplestoreScanState *) node);
+ break;
+
+ case T_WorkTableScanState:
+ ExecReScanWorkTableScan((WorkTableScanState *) node);
+ break;
+
+ case T_ForeignScanState:
+ ExecReScanForeignScan((ForeignScanState *) node);
+ break;
+
+ case T_CustomScanState:
+ ExecReScanCustomScan((CustomScanState *) node);
+ break;
+
+ case T_NestLoopState:
+ ExecReScanNestLoop((NestLoopState *) node);
+ break;
+
+ case T_MergeJoinState:
+ ExecReScanMergeJoin((MergeJoinState *) node);
+ break;
+
+ case T_HashJoinState:
+ ExecReScanHashJoin((HashJoinState *) node);
+ break;
+
+ case T_MaterialState:
+ ExecReScanMaterial((MaterialState *) node);
+ break;
+
+ case T_MemoizeState:
+ ExecReScanMemoize((MemoizeState *) node);
+ break;
+
+ case T_SortState:
+ ExecReScanSort((SortState *) node);
+ break;
+
+ case T_IncrementalSortState:
+ ExecReScanIncrementalSort((IncrementalSortState *) node);
+ break;
+
+ case T_GroupState:
+ ExecReScanGroup((GroupState *) node);
+ break;
+
+ case T_AggState:
+ ExecReScanAgg((AggState *) node);
+ break;
+
+ case T_WindowAggState:
+ ExecReScanWindowAgg((WindowAggState *) node);
+ break;
+
+ case T_UniqueState:
+ ExecReScanUnique((UniqueState *) node);
+ break;
+
+ case T_HashState:
+ ExecReScanHash((HashState *) node);
+ break;
+
+ case T_SetOpState:
+ ExecReScanSetOp((SetOpState *) node);
+ break;
+
+ case T_LockRowsState:
+ ExecReScanLockRows((LockRowsState *) node);
+ break;
+
+ case T_LimitState:
+ ExecReScanLimit((LimitState *) node);
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
+ break;
+ }
+
+ if (node->chgParam != NULL)
+ {
+ bms_free(node->chgParam);
+ node->chgParam = NULL;
+ }
+}
+
+/*
+ * ExecMarkPos
+ *
+ * Marks the current scan position.
+ *
+ * NOTE: mark/restore capability is currently needed only for plan nodes
+ * that are the immediate inner child of a MergeJoin node. Since MergeJoin
+ * requires sorted input, there is never any need to support mark/restore in
+ * node types that cannot produce sorted output. There are some cases in
+ * which a node can pass through sorted data from its child; if we don't
+ * implement mark/restore for such a node type, the planner compensates by
+ * inserting a Material node above that node.
+ */
+void
+ExecMarkPos(PlanState *node)
+{
+ switch (nodeTag(node))
+ {
+ case T_IndexScanState:
+ ExecIndexMarkPos((IndexScanState *) node);
+ break;
+
+ case T_IndexOnlyScanState:
+ ExecIndexOnlyMarkPos((IndexOnlyScanState *) node);
+ break;
+
+ case T_CustomScanState:
+ ExecCustomMarkPos((CustomScanState *) node);
+ break;
+
+ case T_MaterialState:
+ ExecMaterialMarkPos((MaterialState *) node);
+ break;
+
+ case T_SortState:
+ ExecSortMarkPos((SortState *) node);
+ break;
+
+ case T_ResultState:
+ ExecResultMarkPos((ResultState *) node);
+ break;
+
+ default:
+ /* don't make hard error unless caller asks to restore... */
+ elog(DEBUG2, "unrecognized node type: %d", (int) nodeTag(node));
+ break;
+ }
+}
+
+/*
+ * ExecRestrPos
+ *
+ * restores the scan position previously saved with ExecMarkPos()
+ *
+ * NOTE: the semantics of this are that the first ExecProcNode following
+ * the restore operation will yield the same tuple as the first one following
+ * the mark operation. It is unspecified what happens to the plan node's
+ * result TupleTableSlot. (In most cases the result slot is unchanged by
+ * a restore, but the node may choose to clear it or to load it with the
+ * restored-to tuple.) Hence the caller should discard any previously
+ * returned TupleTableSlot after doing a restore.
+ */
+void
+ExecRestrPos(PlanState *node)
+{
+ switch (nodeTag(node))
+ {
+ case T_IndexScanState:
+ ExecIndexRestrPos((IndexScanState *) node);
+ break;
+
+ case T_IndexOnlyScanState:
+ ExecIndexOnlyRestrPos((IndexOnlyScanState *) node);
+ break;
+
+ case T_CustomScanState:
+ ExecCustomRestrPos((CustomScanState *) node);
+ break;
+
+ case T_MaterialState:
+ ExecMaterialRestrPos((MaterialState *) node);
+ break;
+
+ case T_SortState:
+ ExecSortRestrPos((SortState *) node);
+ break;
+
+ case T_ResultState:
+ ExecResultRestrPos((ResultState *) node);
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
+ break;
+ }
+}
+
+/*
+ * ExecSupportsMarkRestore - does a Path support mark/restore?
+ *
+ * This is used during planning and so must accept a Path, not a Plan.
+ * We keep it here to be adjacent to the routines above, which also must
+ * know which plan types support mark/restore.
+ */
+bool
+ExecSupportsMarkRestore(Path *pathnode)
+{
+ /*
+ * For consistency with the routines above, we do not examine the nodeTag
+ * but rather the pathtype, which is the Plan node type the Path would
+ * produce.
+ */
+ switch (pathnode->pathtype)
+ {
+ case T_IndexScan:
+ case T_IndexOnlyScan:
+
+ /*
+ * Not all index types support mark/restore.
+ */
+ return castNode(IndexPath, pathnode)->indexinfo->amcanmarkpos;
+
+ case T_Material:
+ case T_Sort:
+ return true;
+
+ case T_CustomScan:
+ {
+ CustomPath *customPath = castNode(CustomPath, pathnode);
+
+ if (customPath->flags & CUSTOMPATH_SUPPORT_MARK_RESTORE)
+ return true;
+ return false;
+ }
+ case T_Result:
+
+ /*
+ * Result supports mark/restore iff it has a child plan that does.
+ *
+ * We have to be careful here because there is more than one Path
+ * type that can produce a Result plan node.
+ */
+ if (IsA(pathnode, ProjectionPath))
+ return ExecSupportsMarkRestore(((ProjectionPath *) pathnode)->subpath);
+ else if (IsA(pathnode, MinMaxAggPath))
+ return false; /* childless Result */
+ else if (IsA(pathnode, GroupResultPath))
+ return false; /* childless Result */
+ else
+ {
+ /* Simple RTE_RESULT base relation */
+ Assert(IsA(pathnode, Path));
+ return false; /* childless Result */
+ }
+
+ case T_Append:
+ {
+ AppendPath *appendPath = castNode(AppendPath, pathnode);
+
+ /*
+ * If there's exactly one child, then there will be no Append
+ * in the final plan, so we can handle mark/restore if the
+ * child plan node can.
+ */
+ if (list_length(appendPath->subpaths) == 1)
+ return ExecSupportsMarkRestore((Path *) linitial(appendPath->subpaths));
+ /* Otherwise, Append can't handle it */
+ return false;
+ }
+
+ case T_MergeAppend:
+ {
+ MergeAppendPath *mapath = castNode(MergeAppendPath, pathnode);
+
+ /*
+ * Like the Append case above, single-subpath MergeAppends
+ * won't be in the final plan, so just return the child's
+ * mark/restore ability.
+ */
+ if (list_length(mapath->subpaths) == 1)
+ return ExecSupportsMarkRestore((Path *) linitial(mapath->subpaths));
+ /* Otherwise, MergeAppend can't handle it */
+ return false;
+ }
+
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * ExecSupportsBackwardScan - does a plan type support backwards scanning?
+ *
+ * Ideally, all plan types would support backwards scan, but that seems
+ * unlikely to happen soon. In some cases, a plan node passes the backwards
+ * scan down to its children, and so supports backwards scan only if its
+ * children do. Therefore, this routine must be passed a complete plan tree.
+ */
+bool
+ExecSupportsBackwardScan(Plan *node)
+{
+ if (node == NULL)
+ return false;
+
+ /*
+ * Parallel-aware nodes return a subset of the tuples in each worker, and
+ * in general we can't expect to have enough bookkeeping state to know
+ * which ones we returned in this worker as opposed to some other worker.
+ */
+ if (node->parallel_aware)
+ return false;
+
+ switch (nodeTag(node))
+ {
+ case T_Result:
+ if (outerPlan(node) != NULL)
+ return ExecSupportsBackwardScan(outerPlan(node));
+ else
+ return false;
+
+ case T_Append:
+ {
+ ListCell *l;
+
+ /* With async, tuples may be interleaved, so can't back up. */
+ if (((Append *) node)->nasyncplans > 0)
+ return false;
+
+ foreach(l, ((Append *) node)->appendplans)
+ {
+ if (!ExecSupportsBackwardScan((Plan *) lfirst(l)))
+ return false;
+ }
+ /* need not check tlist because Append doesn't evaluate it */
+ return true;
+ }
+
+ case T_SampleScan:
+ /* Simplify life for tablesample methods by disallowing this */
+ return false;
+
+ case T_Gather:
+ return false;
+
+ case T_IndexScan:
+ return IndexSupportsBackwardScan(((IndexScan *) node)->indexid);
+
+ case T_IndexOnlyScan:
+ return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid);
+
+ case T_SubqueryScan:
+ return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan);
+
+ case T_CustomScan:
+ {
+ uint32 flags = ((CustomScan *) node)->flags;
+
+ if (flags & CUSTOMPATH_SUPPORT_BACKWARD_SCAN)
+ return true;
+ }
+ return false;
+
+ case T_SeqScan:
+ case T_TidScan:
+ case T_TidRangeScan:
+ case T_FunctionScan:
+ case T_ValuesScan:
+ case T_CteScan:
+ case T_Material:
+ case T_Sort:
+ /* these don't evaluate tlist */
+ return true;
+
+ case T_IncrementalSort:
+
+ /*
+ * Unlike full sort, incremental sort keeps only a single group of
+ * tuples in memory, so it can't scan backwards.
+ */
+ return false;
+
+ case T_LockRows:
+ case T_Limit:
+ return ExecSupportsBackwardScan(outerPlan(node));
+
+ default:
+ return false;
+ }
+}
+
+/*
+ * An IndexScan or IndexOnlyScan node supports backward scan only if the
+ * index's AM does.
+ */
+static bool
+IndexSupportsBackwardScan(Oid indexid)
+{
+ bool result;
+ HeapTuple ht_idxrel;
+ Form_pg_class idxrelrec;
+ IndexAmRoutine *amroutine;
+
+ /* Fetch the pg_class tuple of the index relation */
+ ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexid));
+ if (!HeapTupleIsValid(ht_idxrel))
+ elog(ERROR, "cache lookup failed for relation %u", indexid);
+ idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel);
+
+ /* Fetch the index AM's API struct */
+ amroutine = GetIndexAmRoutineByAmId(idxrelrec->relam, false);
+
+ result = amroutine->amcanbackward;
+
+ pfree(amroutine);
+ ReleaseSysCache(ht_idxrel);
+
+ return result;
+}
+
+/*
+ * ExecMaterializesOutput - does a plan type materialize its output?
+ *
+ * Returns true if the plan node type is one that automatically materializes
+ * its output (typically by keeping it in a tuplestore). For such plans,
+ * a rescan without any parameter change will have zero startup cost and
+ * very low per-tuple cost.
+ */
+bool
+ExecMaterializesOutput(NodeTag plantype)
+{
+ switch (plantype)
+ {
+ case T_Material:
+ case T_FunctionScan:
+ case T_TableFuncScan:
+ case T_CteScan:
+ case T_NamedTuplestoreScan:
+ case T_WorkTableScan:
+ case T_Sort:
+ return true;
+
+ default:
+ break;
+ }
+
+ return false;
+}
diff --git a/src/backend/executor/execAsync.c b/src/backend/executor/execAsync.c
new file mode 100644
index 0000000..94a284a
--- /dev/null
+++ b/src/backend/executor/execAsync.c
@@ -0,0 +1,154 @@
+/*-------------------------------------------------------------------------
+ *
+ * execAsync.c
+ * Support routines for asynchronous execution
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execAsync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/execAsync.h"
+#include "executor/executor.h"
+#include "executor/nodeAppend.h"
+#include "executor/nodeForeignscan.h"
+
+/*
+ * Asynchronously request a tuple from a designated async-capable node.
+ */
+void
+ExecAsyncRequest(AsyncRequest *areq)
+{
+ if (areq->requestee->chgParam != NULL) /* something changed? */
+ ExecReScan(areq->requestee); /* let ReScan handle this */
+
+ /* must provide our own instrumentation support */
+ if (areq->requestee->instrument)
+ InstrStartNode(areq->requestee->instrument);
+
+ switch (nodeTag(areq->requestee))
+ {
+ case T_ForeignScanState:
+ ExecAsyncForeignScanRequest(areq);
+ break;
+ default:
+ /* If the node doesn't support async, caller messed up. */
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(areq->requestee));
+ }
+
+ ExecAsyncResponse(areq);
+
+ /* must provide our own instrumentation support */
+ if (areq->requestee->instrument)
+ InstrStopNode(areq->requestee->instrument,
+ TupIsNull(areq->result) ? 0.0 : 1.0);
+}
+
+/*
+ * Give the asynchronous node a chance to configure the file descriptor event
+ * for which it wishes to wait. We expect the node-type specific callback to
+ * make a single call of the following form:
+ *
+ * AddWaitEventToSet(set, WL_SOCKET_READABLE, fd, NULL, areq);
+ */
+void
+ExecAsyncConfigureWait(AsyncRequest *areq)
+{
+ /* must provide our own instrumentation support */
+ if (areq->requestee->instrument)
+ InstrStartNode(areq->requestee->instrument);
+
+ switch (nodeTag(areq->requestee))
+ {
+ case T_ForeignScanState:
+ ExecAsyncForeignScanConfigureWait(areq);
+ break;
+ default:
+ /* If the node doesn't support async, caller messed up. */
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(areq->requestee));
+ }
+
+ /* must provide our own instrumentation support */
+ if (areq->requestee->instrument)
+ InstrStopNode(areq->requestee->instrument, 0.0);
+}
+
+/*
+ * Call the asynchronous node back when a relevant event has occurred.
+ */
+void
+ExecAsyncNotify(AsyncRequest *areq)
+{
+ /* must provide our own instrumentation support */
+ if (areq->requestee->instrument)
+ InstrStartNode(areq->requestee->instrument);
+
+ switch (nodeTag(areq->requestee))
+ {
+ case T_ForeignScanState:
+ ExecAsyncForeignScanNotify(areq);
+ break;
+ default:
+ /* If the node doesn't support async, caller messed up. */
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(areq->requestee));
+ }
+
+ ExecAsyncResponse(areq);
+
+ /* must provide our own instrumentation support */
+ if (areq->requestee->instrument)
+ InstrStopNode(areq->requestee->instrument,
+ TupIsNull(areq->result) ? 0.0 : 1.0);
+}
+
+/*
+ * Call the requestor back when an asynchronous node has produced a result.
+ */
+void
+ExecAsyncResponse(AsyncRequest *areq)
+{
+ switch (nodeTag(areq->requestor))
+ {
+ case T_AppendState:
+ ExecAsyncAppendResponse(areq);
+ break;
+ default:
+ /* If the node doesn't support async, caller messed up. */
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(areq->requestor));
+ }
+}
+
+/*
+ * A requestee node should call this function to deliver the tuple to its
+ * requestor node. The requestee node can call this from its ExecAsyncRequest
+ * or ExecAsyncNotify callback.
+ */
+void
+ExecAsyncRequestDone(AsyncRequest *areq, TupleTableSlot *result)
+{
+ areq->request_complete = true;
+ areq->result = result;
+}
+
+/*
+ * A requestee node should call this function to indicate that it is pending
+ * for a callback. The requestee node can call this from its ExecAsyncRequest
+ * or ExecAsyncNotify callback.
+ */
+void
+ExecAsyncRequestPending(AsyncRequest *areq)
+{
+ areq->callback_pending = true;
+ areq->request_complete = false;
+ areq->result = NULL;
+}
diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c
new file mode 100644
index 0000000..4f430fb
--- /dev/null
+++ b/src/backend/executor/execCurrent.c
@@ -0,0 +1,426 @@
+/*-------------------------------------------------------------------------
+ *
+ * execCurrent.c
+ * executor support for WHERE CURRENT OF cursor
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/executor/execCurrent.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/relscan.h"
+#include "access/sysattr.h"
+#include "catalog/pg_type.h"
+#include "executor/executor.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/portal.h"
+#include "utils/rel.h"
+
+
+static char *fetch_cursor_param_value(ExprContext *econtext, int paramId);
+static ScanState *search_plan_tree(PlanState *node, Oid table_oid,
+ bool *pending_rescan);
+
+
+/*
+ * execCurrentOf
+ *
+ * Given a CURRENT OF expression and the OID of a table, determine which row
+ * of the table is currently being scanned by the cursor named by CURRENT OF,
+ * and return the row's TID into *current_tid.
+ *
+ * Returns true if a row was identified. Returns false if the cursor is valid
+ * for the table but is not currently scanning a row of the table (this is a
+ * legal situation in inheritance cases). Raises error if cursor is not a
+ * valid updatable scan of the specified table.
+ */
+bool
+execCurrentOf(CurrentOfExpr *cexpr,
+ ExprContext *econtext,
+ Oid table_oid,
+ ItemPointer current_tid)
+{
+ char *cursor_name;
+ char *table_name;
+ Portal portal;
+ QueryDesc *queryDesc;
+
+ /* Get the cursor name --- may have to look up a parameter reference */
+ if (cexpr->cursor_name)
+ cursor_name = cexpr->cursor_name;
+ else
+ cursor_name = fetch_cursor_param_value(econtext, cexpr->cursor_param);
+
+ /* Fetch table name for possible use in error messages */
+ table_name = get_rel_name(table_oid);
+ if (table_name == NULL)
+ elog(ERROR, "cache lookup failed for relation %u", table_oid);
+
+ /* Find the cursor's portal */
+ portal = GetPortalByName(cursor_name);
+ if (!PortalIsValid(portal))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_CURSOR),
+ errmsg("cursor \"%s\" does not exist", cursor_name)));
+
+ /*
+ * We have to watch out for non-SELECT queries as well as held cursors,
+ * both of which may have null queryDesc.
+ */
+ if (portal->strategy != PORTAL_ONE_SELECT)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" is not a SELECT query",
+ cursor_name)));
+ queryDesc = portal->queryDesc;
+ if (queryDesc == NULL || queryDesc->estate == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" is held from a previous transaction",
+ cursor_name)));
+
+ /*
+ * We have two different strategies depending on whether the cursor uses
+ * FOR UPDATE/SHARE or not. The reason for supporting both is that the
+ * FOR UPDATE code is able to identify a target table in many cases where
+ * the other code can't, while the non-FOR-UPDATE case allows use of WHERE
+ * CURRENT OF with an insensitive cursor.
+ */
+ if (queryDesc->estate->es_rowmarks)
+ {
+ ExecRowMark *erm;
+ Index i;
+
+ /*
+ * Here, the query must have exactly one FOR UPDATE/SHARE reference to
+ * the target table, and we dig the ctid info out of that.
+ */
+ erm = NULL;
+ for (i = 0; i < queryDesc->estate->es_range_table_size; i++)
+ {
+ ExecRowMark *thiserm = queryDesc->estate->es_rowmarks[i];
+
+ if (thiserm == NULL ||
+ !RowMarkRequiresRowShareLock(thiserm->markType))
+ continue; /* ignore non-FOR UPDATE/SHARE items */
+
+ if (thiserm->relid == table_oid)
+ {
+ if (erm)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" has multiple FOR UPDATE/SHARE references to table \"%s\"",
+ cursor_name, table_name)));
+ erm = thiserm;
+ }
+ }
+
+ if (erm == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" does not have a FOR UPDATE/SHARE reference to table \"%s\"",
+ cursor_name, table_name)));
+
+ /*
+ * The cursor must have a current result row: per the SQL spec, it's
+ * an error if not.
+ */
+ if (portal->atStart || portal->atEnd)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" is not positioned on a row",
+ cursor_name)));
+
+ /* Return the currently scanned TID, if there is one */
+ if (ItemPointerIsValid(&(erm->curCtid)))
+ {
+ *current_tid = erm->curCtid;
+ return true;
+ }
+
+ /*
+ * This table didn't produce the cursor's current row; some other
+ * inheritance child of the same parent must have. Signal caller to
+ * do nothing on this table.
+ */
+ return false;
+ }
+ else
+ {
+ /*
+ * Without FOR UPDATE, we dig through the cursor's plan to find the
+ * scan node. Fail if it's not there or buried underneath
+ * aggregation.
+ */
+ ScanState *scanstate;
+ bool pending_rescan = false;
+
+ scanstate = search_plan_tree(queryDesc->planstate, table_oid,
+ &pending_rescan);
+ if (!scanstate)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"",
+ cursor_name, table_name)));
+
+ /*
+ * The cursor must have a current result row: per the SQL spec, it's
+ * an error if not. We test this at the top level, rather than at the
+ * scan node level, because in inheritance cases any one table scan
+ * could easily not be on a row. We want to return false, not raise
+ * error, if the passed-in table OID is for one of the inactive scans.
+ */
+ if (portal->atStart || portal->atEnd)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" is not positioned on a row",
+ cursor_name)));
+
+ /*
+ * Now OK to return false if we found an inactive scan. It is
+ * inactive either if it's not positioned on a row, or there's a
+ * rescan pending for it.
+ */
+ if (TupIsNull(scanstate->ss_ScanTupleSlot) || pending_rescan)
+ return false;
+
+ /*
+ * Extract TID of the scan's current row. The mechanism for this is
+ * in principle scan-type-dependent, but for most scan types, we can
+ * just dig the TID out of the physical scan tuple.
+ */
+ if (IsA(scanstate, IndexOnlyScanState))
+ {
+ /*
+ * For IndexOnlyScan, the tuple stored in ss_ScanTupleSlot may be
+ * a virtual tuple that does not have the ctid column, so we have
+ * to get the TID from xs_ctup.t_self.
+ */
+ IndexScanDesc scan = ((IndexOnlyScanState *) scanstate)->ioss_ScanDesc;
+
+ *current_tid = scan->xs_heaptid;
+ }
+ else
+ {
+ /*
+ * Default case: try to fetch TID from the scan node's current
+ * tuple. As an extra cross-check, verify tableoid in the current
+ * tuple. If the scan hasn't provided a physical tuple, we have
+ * to fail.
+ */
+ Datum ldatum;
+ bool lisnull;
+ ItemPointer tuple_tid;
+
+#ifdef USE_ASSERT_CHECKING
+ ldatum = slot_getsysattr(scanstate->ss_ScanTupleSlot,
+ TableOidAttributeNumber,
+ &lisnull);
+ if (lisnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"",
+ cursor_name, table_name)));
+ Assert(DatumGetObjectId(ldatum) == table_oid);
+#endif
+
+ ldatum = slot_getsysattr(scanstate->ss_ScanTupleSlot,
+ SelfItemPointerAttributeNumber,
+ &lisnull);
+ if (lisnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"",
+ cursor_name, table_name)));
+ tuple_tid = (ItemPointer) DatumGetPointer(ldatum);
+
+ *current_tid = *tuple_tid;
+ }
+
+ Assert(ItemPointerIsValid(current_tid));
+
+ return true;
+ }
+}
+
+/*
+ * fetch_cursor_param_value
+ *
+ * Fetch the string value of a param, verifying it is of type REFCURSOR.
+ */
+static char *
+fetch_cursor_param_value(ExprContext *econtext, int paramId)
+{
+ ParamListInfo paramInfo = econtext->ecxt_param_list_info;
+
+ if (paramInfo &&
+ paramId > 0 && paramId <= paramInfo->numParams)
+ {
+ ParamExternData *prm;
+ ParamExternData prmdata;
+
+ /* give hook a chance in case parameter is dynamic */
+ if (paramInfo->paramFetch != NULL)
+ prm = paramInfo->paramFetch(paramInfo, paramId, false, &prmdata);
+ else
+ prm = &paramInfo->params[paramId - 1];
+
+ if (OidIsValid(prm->ptype) && !prm->isnull)
+ {
+ /* safety check in case hook did something unexpected */
+ if (prm->ptype != REFCURSOROID)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("type of parameter %d (%s) does not match that when preparing the plan (%s)",
+ paramId,
+ format_type_be(prm->ptype),
+ format_type_be(REFCURSOROID))));
+
+ /* We know that refcursor uses text's I/O routines */
+ return TextDatumGetCString(prm->value);
+ }
+ }
+
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("no value found for parameter %d", paramId)));
+ return NULL;
+}
+
+/*
+ * search_plan_tree
+ *
+ * Search through a PlanState tree for a scan node on the specified table.
+ * Return NULL if not found or multiple candidates.
+ *
+ * CAUTION: this function is not charged simply with finding some candidate
+ * scan, but with ensuring that that scan returned the plan tree's current
+ * output row. That's why we must reject multiple-match cases.
+ *
+ * If a candidate is found, set *pending_rescan to true if that candidate
+ * or any node above it has a pending rescan action, i.e. chgParam != NULL.
+ * That indicates that we shouldn't consider the node to be positioned on a
+ * valid tuple, even if its own state would indicate that it is. (Caller
+ * must initialize *pending_rescan to false, and should not trust its state
+ * if multiple candidates are found.)
+ */
+static ScanState *
+search_plan_tree(PlanState *node, Oid table_oid,
+ bool *pending_rescan)
+{
+ ScanState *result = NULL;
+
+ if (node == NULL)
+ return NULL;
+ switch (nodeTag(node))
+ {
+ /*
+ * Relation scan nodes can all be treated alike: check to see if
+ * they are scanning the specified table.
+ *
+ * ForeignScan and CustomScan might not have a currentRelation, in
+ * which case we just ignore them. (We dare not descend to any
+ * child plan nodes they might have, since we do not know the
+ * relationship of such a node's current output tuple to the
+ * children's current outputs.)
+ */
+ case T_SeqScanState:
+ case T_SampleScanState:
+ case T_IndexScanState:
+ case T_IndexOnlyScanState:
+ case T_BitmapHeapScanState:
+ case T_TidScanState:
+ case T_TidRangeScanState:
+ case T_ForeignScanState:
+ case T_CustomScanState:
+ {
+ ScanState *sstate = (ScanState *) node;
+
+ if (sstate->ss_currentRelation &&
+ RelationGetRelid(sstate->ss_currentRelation) == table_oid)
+ result = sstate;
+ break;
+ }
+
+ /*
+ * For Append, we can check each input node. It is safe to
+			 * descend to the inputs because only the input that produced the
+			 * Append's current output tuple could be positioned on a row at
+			 * all; the other inputs are either at EOF or not yet started.
+ * Hence, if the desired table is scanned by some
+ * currently-inactive input node, we will find that node but then
+ * our caller will realize that it didn't emit the tuple of
+ * interest.
+ *
+ * We do need to watch out for multiple matches (possible if
+ * Append was from UNION ALL rather than an inheritance tree).
+ *
+ * Note: we can NOT descend through MergeAppend similarly, since
+ * its inputs are likely all active, and we don't know which one
+ * returned the current output tuple. (Perhaps that could be
+ * fixed if we were to let this code know more about MergeAppend's
+ * internal state, but it does not seem worth the trouble. Users
+ * should not expect plans for ORDER BY queries to be considered
+ * simply-updatable, since they won't be if the sorting is
+ * implemented by a Sort node.)
+ */
+ case T_AppendState:
+ {
+ AppendState *astate = (AppendState *) node;
+ int i;
+
+ for (i = 0; i < astate->as_nplans; i++)
+ {
+ ScanState *elem = search_plan_tree(astate->appendplans[i],
+ table_oid,
+ pending_rescan);
+
+ if (!elem)
+ continue;
+ if (result)
+ return NULL; /* multiple matches */
+ result = elem;
+ }
+ break;
+ }
+
+ /*
+ * Result and Limit can be descended through (these are safe
+ * because they always return their input's current row)
+ */
+ case T_ResultState:
+ case T_LimitState:
+ result = search_plan_tree(node->lefttree,
+ table_oid,
+ pending_rescan);
+ break;
+
+ /*
+ * SubqueryScan too, but it keeps the child in a different place
+ */
+ case T_SubqueryScanState:
+ result = search_plan_tree(((SubqueryScanState *) node)->subplan,
+ table_oid,
+ pending_rescan);
+ break;
+
+ default:
+ /* Otherwise, assume we can't descend through it */
+ break;
+ }
+
+ /*
+ * If we found a candidate at or below this node, then this node's
+ * chgParam indicates a pending rescan that will affect the candidate.
+ */
+ if (result && node->chgParam != NULL)
+ *pending_rescan = true;
+
+ return result;
+}
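+
+/*
+ * For exposition, a few example plan shapes and how the search above treats
+ * them:
+ *
+ * - "Limit -> Seq Scan on t": accepted, since Limit is descended through.
+ *
+ * - "Append -> (Seq Scan on t1, Seq Scan on t2)": accepted, provided exactly
+ *   one of the children scans the requested table.
+ *
+ * - "Sort -> Seq Scan on t": rejected; Sort is not descended through, so we
+ *   could not prove that the scan produced the plan's current output row.
+ */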
diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c
new file mode 100644
index 0000000..bec249f
--- /dev/null
+++ b/src/backend/executor/execExpr.c
@@ -0,0 +1,3965 @@
+/*-------------------------------------------------------------------------
+ *
+ * execExpr.c
+ * Expression evaluation infrastructure.
+ *
+ * During executor startup, we compile each expression tree (which has
+ * previously been processed by the parser and planner) into an ExprState,
+ * using ExecInitExpr() et al. This converts the tree into a flat array
+ * of ExprEvalSteps, which may be thought of as instructions in a program.
+ * At runtime, we'll execute steps, starting with the first, until we reach
+ * an EEOP_DONE opcode.
+ *
+ * This file contains the "compilation" logic. It is independent of the
+ * specific execution technology we use (switch statement, computed goto,
+ * JIT compilation, etc).
+ *
+ * See src/backend/executor/README for some background, specifically the
+ * "Expression Trees and ExprState nodes", "Expression Initialization",
+ * and "Expression Evaluation" sections.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execExpr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "catalog/objectaccess.h"
+#include "catalog/pg_type.h"
+#include "executor/execExpr.h"
+#include "executor/nodeSubplan.h"
+#include "funcapi.h"
+#include "jit/jit.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/subscripting.h"
+#include "optimizer/optimizer.h"
+#include "pgstat.h"
+#include "utils/acl.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/typcache.h"
+
+
+typedef struct LastAttnumInfo
+{
+ AttrNumber last_inner;
+ AttrNumber last_outer;
+ AttrNumber last_scan;
+} LastAttnumInfo;
+
+static void ExecReadyExpr(ExprState *state);
+static void ExecInitExprRec(Expr *node, ExprState *state,
+ Datum *resv, bool *resnull);
+static void ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args,
+ Oid funcid, Oid inputcollid,
+ ExprState *state);
+static void ExecInitExprSlots(ExprState *state, Node *node);
+static void ExecPushExprSlots(ExprState *state, LastAttnumInfo *info);
+static bool get_last_attnums_walker(Node *node, LastAttnumInfo *info);
+static bool ExecComputeSlotInfo(ExprState *state, ExprEvalStep *op);
+static void ExecInitWholeRowVar(ExprEvalStep *scratch, Var *variable,
+ ExprState *state);
+static void ExecInitSubscriptingRef(ExprEvalStep *scratch,
+ SubscriptingRef *sbsref,
+ ExprState *state,
+ Datum *resv, bool *resnull);
+static bool isAssignmentIndirectionExpr(Expr *expr);
+static void ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest,
+ ExprState *state,
+ Datum *resv, bool *resnull);
+static void ExecBuildAggTransCall(ExprState *state, AggState *aggstate,
+ ExprEvalStep *scratch,
+ FunctionCallInfo fcinfo, AggStatePerTrans pertrans,
+ int transno, int setno, int setoff, bool ishash,
+ bool nullcheck);
+
+
+/*
+ * ExecInitExpr: prepare an expression tree for execution
+ *
+ * This function builds and returns an ExprState implementing the given
+ * Expr node tree. The returned ExprState can then be handed to ExecEvalExpr
+ * for execution. Because the Expr tree itself is read-only as far as
+ * ExecInitExpr and ExecEvalExpr are concerned, several different executions
+ * of the same plan tree can occur concurrently. (But note that an ExprState
+ * does mutate at runtime, so it can't be re-used concurrently.)
+ *
+ * This must be called in a memory context that will last as long as repeated
+ * executions of the expression are needed. Typically the context will be
+ * the same as the per-query context of the associated ExprContext.
+ *
+ * Any Aggref, WindowFunc, or SubPlan nodes found in the tree are added to
+ * the lists of such nodes held by the parent PlanState.
+ *
+ * Note: there is no ExecEndExpr function; we assume that any resource
+ * cleanup needed will be handled by just releasing the memory context
+ * in which the state tree is built. Functions that require additional
+ * cleanup work can register a shutdown callback in the ExprContext.
+ *
+ * 'node' is the root of the expression tree to compile.
+ * 'parent' is the PlanState node that owns the expression.
+ *
+ * 'parent' may be NULL if we are preparing an expression that is not
+ * associated with a plan tree. (If so, it can't have aggs or subplans.)
+ * Such cases should usually come through ExecPrepareExpr, not directly here.
+ *
+ * Also, if 'node' is NULL, we just return NULL. This is convenient for some
+ * callers that may or may not have an expression that needs to be compiled.
+ * Note that a NULL ExprState pointer *cannot* be handed to ExecEvalExpr,
+ * although ExecQual and ExecCheck will accept one (and treat it as "true").
+ */
+ExprState *
+ExecInitExpr(Expr *node, PlanState *parent)
+{
+ ExprState *state;
+ ExprEvalStep scratch = {0};
+
+ /* Special case: NULL expression produces a NULL ExprState pointer */
+ if (node == NULL)
+ return NULL;
+
+ /* Initialize ExprState with empty step list */
+ state = makeNode(ExprState);
+ state->expr = node;
+ state->parent = parent;
+ state->ext_params = NULL;
+
+ /* Insert EEOP_*_FETCHSOME steps as needed */
+ ExecInitExprSlots(state, (Node *) node);
+
+ /* Compile the expression proper */
+ ExecInitExprRec(node, state, &state->resvalue, &state->resnull);
+
+ /* Finally, append a DONE step */
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return state;
+}
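+
+/*
+ * For exposition, a sketch of the step program this produces: given a
+ * scan-level expression such as "x + 1", where x is the first (int4) column
+ * of the scanned relation, the result is roughly
+ *
+ *		EEOP_SCAN_FETCHSOME		deform the scan tuple up to column 1
+ *		EEOP_SCAN_VAR			fetch x into the function's first argument
+ *		EEOP_CONST				load the constant 1 into the second argument
+ *		EEOP_FUNCEXPR_STRICT	call int4pl(), storing into the result
+ *		EEOP_DONE				expression is complete
+ *
+ * The exact opcodes depend on details such as function strictness; the
+ * listing is only meant to convey the flat program structure.
+ */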
+
+/*
+ * ExecInitExprWithParams: prepare a standalone expression tree for execution
+ *
+ * This is the same as ExecInitExpr, except that there is no parent PlanState,
+ * and instead we may have a ParamListInfo describing PARAM_EXTERN Params.
+ */
+ExprState *
+ExecInitExprWithParams(Expr *node, ParamListInfo ext_params)
+{
+ ExprState *state;
+ ExprEvalStep scratch = {0};
+
+ /* Special case: NULL expression produces a NULL ExprState pointer */
+ if (node == NULL)
+ return NULL;
+
+ /* Initialize ExprState with empty step list */
+ state = makeNode(ExprState);
+ state->expr = node;
+ state->parent = NULL;
+ state->ext_params = ext_params;
+
+ /* Insert EEOP_*_FETCHSOME steps as needed */
+ ExecInitExprSlots(state, (Node *) node);
+
+ /* Compile the expression proper */
+ ExecInitExprRec(node, state, &state->resvalue, &state->resnull);
+
+ /* Finally, append a DONE step */
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return state;
+}
+
+/*
+ * ExecInitQual: prepare a qual for execution by ExecQual
+ *
+ * Prepares for the evaluation of a conjunctive boolean expression (qual list
+ * with implicit AND semantics) that returns true if none of the
+ * subexpressions are false.
+ *
+ * We must return true if the list is empty. Since that's a very common case,
+ * we optimize it a bit further by translating to a NULL ExprState pointer
+ * rather than setting up an ExprState that computes constant TRUE. (Some
+ * especially hot-spot callers of ExecQual detect this and avoid calling
+ * ExecQual at all.)
+ *
+ * If any of the subexpressions yield NULL, then the result of the conjunction
+ * is false. This makes ExecQual primarily useful for evaluating WHERE
+ * clauses, since SQL specifies that tuples with null WHERE results do not
+ * get selected.
+ */
+ExprState *
+ExecInitQual(List *qual, PlanState *parent)
+{
+ ExprState *state;
+ ExprEvalStep scratch = {0};
+ List *adjust_jumps = NIL;
+ ListCell *lc;
+
+ /* short-circuit (here and in ExecQual) for empty restriction list */
+ if (qual == NIL)
+ return NULL;
+
+ Assert(IsA(qual, List));
+
+ state = makeNode(ExprState);
+ state->expr = (Expr *) qual;
+ state->parent = parent;
+ state->ext_params = NULL;
+
+ /* mark expression as to be used with ExecQual() */
+ state->flags = EEO_FLAG_IS_QUAL;
+
+ /* Insert EEOP_*_FETCHSOME steps as needed */
+ ExecInitExprSlots(state, (Node *) qual);
+
+ /*
+ * ExecQual() needs to return false for an expression returning NULL. That
+ * allows us to short-circuit the evaluation the first time a NULL is
+	 * encountered.  Since qual evaluation is a hot path, this warrants a
+	 * special opcode that's simpler than BOOL_AND (which has more complex
+	 * NULL handling).
+ */
+ scratch.opcode = EEOP_QUAL;
+
+ /*
+ * We can use ExprState's resvalue/resnull as target for each qual expr.
+ */
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+
+ foreach(lc, qual)
+ {
+ Expr *node = (Expr *) lfirst(lc);
+
+ /* first evaluate expression */
+ ExecInitExprRec(node, state, &state->resvalue, &state->resnull);
+
+ /* then emit EEOP_QUAL to detect if it's false (or null) */
+ scratch.d.qualexpr.jumpdone = -1;
+ ExprEvalPushStep(state, &scratch);
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ }
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ Assert(as->opcode == EEOP_QUAL);
+ Assert(as->d.qualexpr.jumpdone == -1);
+ as->d.qualexpr.jumpdone = state->steps_len;
+ }
+
+ /*
+ * At the end, we don't need to do anything more. The last qual expr must
+ * have yielded TRUE, and since its result is stored in the desired output
+ * location, we're done.
+ */
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return state;
+}
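+
+/*
+ * Illustrative sketch (a hypothetical helper, not used anywhere; assumes
+ * executor/executor.h is included for ExecQual()): how a plan node typically
+ * applies a qual compiled by ExecInitQual() to one candidate tuple.
+ */
+static bool
+example_tuple_passes_qual(ExprState *qual, TupleTableSlot *slot,
+						  ExprContext *econtext)
+{
+	/* make the candidate tuple visible to scan-level Vars in the qual */
+	econtext->ecxt_scantuple = slot;
+
+	/* a NULL ExprState means an empty, always-true restriction list */
+	return qual == NULL || ExecQual(qual, econtext);
+}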
+
+/*
+ * ExecInitCheck: prepare a check constraint for execution by ExecCheck
+ *
+ * This is much like ExecInitQual/ExecQual, except that a null result from
+ * the conjunction is treated as TRUE. This behavior is appropriate for
+ * evaluating CHECK constraints, since SQL specifies that NULL constraint
+ * conditions are not failures.
+ *
+ * Note that like ExecInitQual, this expects input in implicit-AND format.
+ * Users of ExecCheck that have expressions in normal explicit-AND format
+ * can just apply ExecInitExpr to produce suitable input for ExecCheck.
+ */
+ExprState *
+ExecInitCheck(List *qual, PlanState *parent)
+{
+ /* short-circuit (here and in ExecCheck) for empty restriction list */
+ if (qual == NIL)
+ return NULL;
+
+ Assert(IsA(qual, List));
+
+ /*
+ * Just convert the implicit-AND list to an explicit AND (if there's more
+ * than one entry), and compile normally. Unlike ExecQual, we can't
+ * short-circuit on NULL results, so the regular AND behavior is needed.
+ */
+ return ExecInitExpr(make_ands_explicit(qual), parent);
+}
+
+/*
+ * Call ExecInitExpr() on a list of expressions, return a list of ExprStates.
+ */
+List *
+ExecInitExprList(List *nodes, PlanState *parent)
+{
+ List *result = NIL;
+ ListCell *lc;
+
+ foreach(lc, nodes)
+ {
+ Expr *e = lfirst(lc);
+
+ result = lappend(result, ExecInitExpr(e, parent));
+ }
+
+ return result;
+}
+
+/*
+ * ExecBuildProjectionInfo
+ *
+ * Build a ProjectionInfo node for evaluating the given tlist in the given
+ * econtext, and storing the result into the tuple slot. (Caller must have
+ * ensured that tuple slot has a descriptor matching the tlist!)
+ *
+ * inputDesc can be NULL, but if it is not, we check to see whether simple
+ * Vars in the tlist match the descriptor. It is important to provide
+ * inputDesc for relation-scan plan nodes, as a cross check that the relation
+ * hasn't been changed since the plan was made. At higher levels of a plan,
+ * there is no need to recheck.
+ *
+ * This is implemented by internally building an ExprState that performs the
+ * whole projection in one go.
+ *
+ * Caution: before PG v10, the targetList was a list of ExprStates; now it
+ * should be the planner-created targetlist, since we do the compilation here.
+ */
+ProjectionInfo *
+ExecBuildProjectionInfo(List *targetList,
+ ExprContext *econtext,
+ TupleTableSlot *slot,
+ PlanState *parent,
+ TupleDesc inputDesc)
+{
+ ProjectionInfo *projInfo = makeNode(ProjectionInfo);
+ ExprState *state;
+ ExprEvalStep scratch = {0};
+ ListCell *lc;
+
+ projInfo->pi_exprContext = econtext;
+ /* We embed ExprState into ProjectionInfo instead of doing extra palloc */
+ projInfo->pi_state.tag = T_ExprState;
+ state = &projInfo->pi_state;
+ state->expr = (Expr *) targetList;
+ state->parent = parent;
+ state->ext_params = NULL;
+
+ state->resultslot = slot;
+
+ /* Insert EEOP_*_FETCHSOME steps as needed */
+ ExecInitExprSlots(state, (Node *) targetList);
+
+ /* Now compile each tlist column */
+ foreach(lc, targetList)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc);
+ Var *variable = NULL;
+ AttrNumber attnum = 0;
+ bool isSafeVar = false;
+
+ /*
+ * If tlist expression is a safe non-system Var, use the fast-path
+ * ASSIGN_*_VAR opcodes. "Safe" means that we don't need to apply
+ * CheckVarSlotCompatibility() during plan startup. If a source slot
+ * was provided, we make the equivalent tests here; if a slot was not
+ * provided, we assume that no check is needed because we're dealing
+ * with a non-relation-scan-level expression.
+ */
+ if (tle->expr != NULL &&
+ IsA(tle->expr, Var) &&
+ ((Var *) tle->expr)->varattno > 0)
+ {
+ /* Non-system Var, but how safe is it? */
+ variable = (Var *) tle->expr;
+ attnum = variable->varattno;
+
+ if (inputDesc == NULL)
+ isSafeVar = true; /* can't check, just assume OK */
+ else if (attnum <= inputDesc->natts)
+ {
+ Form_pg_attribute attr = TupleDescAttr(inputDesc, attnum - 1);
+
+ /*
+ * If user attribute is dropped or has a type mismatch, don't
+ * use ASSIGN_*_VAR. Instead let the normal expression
+ * machinery handle it (which'll possibly error out).
+ */
+ if (!attr->attisdropped && variable->vartype == attr->atttypid)
+ {
+ isSafeVar = true;
+ }
+ }
+ }
+
+ if (isSafeVar)
+ {
+ /* Fast-path: just generate an EEOP_ASSIGN_*_VAR step */
+ switch (variable->varno)
+ {
+ case INNER_VAR:
+ /* get the tuple from the inner node */
+ scratch.opcode = EEOP_ASSIGN_INNER_VAR;
+ break;
+
+ case OUTER_VAR:
+ /* get the tuple from the outer node */
+ scratch.opcode = EEOP_ASSIGN_OUTER_VAR;
+ break;
+
+ /* INDEX_VAR is handled by default case */
+
+ default:
+ /* get the tuple from the relation being scanned */
+ scratch.opcode = EEOP_ASSIGN_SCAN_VAR;
+ break;
+ }
+
+ scratch.d.assign_var.attnum = attnum - 1;
+ scratch.d.assign_var.resultnum = tle->resno - 1;
+ ExprEvalPushStep(state, &scratch);
+ }
+ else
+ {
+ /*
+ * Otherwise, compile the column expression normally.
+ *
+ * We can't tell the expression to evaluate directly into the
+ * result slot, as the result slot (and the exprstate for that
+ * matter) can change between executions. We instead evaluate
+ * into the ExprState's resvalue/resnull and then move.
+ */
+ ExecInitExprRec(tle->expr, state,
+ &state->resvalue, &state->resnull);
+
+ /*
+ * Column might be referenced multiple times in upper nodes, so
+ * force value to R/O - but only if it could be an expanded datum.
+ */
+ if (get_typlen(exprType((Node *) tle->expr)) == -1)
+ scratch.opcode = EEOP_ASSIGN_TMP_MAKE_RO;
+ else
+ scratch.opcode = EEOP_ASSIGN_TMP;
+ scratch.d.assign_tmp.resultnum = tle->resno - 1;
+ ExprEvalPushStep(state, &scratch);
+ }
+ }
+
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return projInfo;
+}
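+
+/*
+ * Illustrative sketch (a hypothetical helper, not used anywhere; assumes
+ * executor/executor.h is included for ExecProject()): evaluating a
+ * projection built above against one row coming from the node's outer child.
+ */
+static TupleTableSlot *
+example_project_outer_row(ProjectionInfo *projInfo, TupleTableSlot *outerslot)
+{
+	ExprContext *econtext = projInfo->pi_exprContext;
+
+	/* point OUTER_VAR references at the row to be projected */
+	econtext->ecxt_outertuple = outerslot;
+
+	/* evaluate the tlist into the result slot supplied at build time */
+	return ExecProject(projInfo);
+}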
+
+/*
+ * ExecBuildUpdateProjection
+ *
+ * Build a ProjectionInfo node for constructing a new tuple during UPDATE.
+ * The projection will be executed in the given econtext and the result will
+ * be stored into the given tuple slot. (Caller must have ensured that tuple
+ * slot has a descriptor matching the target rel!)
+ *
+ * When evalTargetList is false, targetList contains the UPDATE ... SET
+ * expressions that have already been computed by a subplan node; the values
+ * from this tlist are assumed to be available in the "outer" tuple slot.
+ * When evalTargetList is true, targetList contains the UPDATE ... SET
+ * expressions that must be computed (which could contain references to
+ * the outer, inner, or scan tuple slots).
+ *
+ * In either case, targetColnos contains a list of the target column numbers
+ * corresponding to the non-resjunk entries of targetList. The tlist values
+ * are assigned into these columns of the result tuple slot. Target columns
+ * not listed in targetColnos are filled from the UPDATE's old tuple, which
+ * is assumed to be available in the "scan" tuple slot.
+ *
+ * targetList can also contain resjunk columns. These must be evaluated
+ * if evalTargetList is true, but their values are discarded.
+ *
+ * relDesc must describe the relation we intend to update.
+ *
+ * This is basically a specialized variant of ExecBuildProjectionInfo.
+ * However, it also performs sanity checks equivalent to ExecCheckPlanOutput.
+ * Since we never make a normal tlist equivalent to the whole
+ * tuple-to-be-assigned, there is no convenient way to apply
+ * ExecCheckPlanOutput, so we must do our safety checks here.
+ */
+ProjectionInfo *
+ExecBuildUpdateProjection(List *targetList,
+ bool evalTargetList,
+ List *targetColnos,
+ TupleDesc relDesc,
+ ExprContext *econtext,
+ TupleTableSlot *slot,
+ PlanState *parent)
+{
+ ProjectionInfo *projInfo = makeNode(ProjectionInfo);
+ ExprState *state;
+ int nAssignableCols;
+ bool sawJunk;
+ Bitmapset *assignedCols;
+ LastAttnumInfo deform = {0, 0, 0};
+ ExprEvalStep scratch = {0};
+ int outerattnum;
+ ListCell *lc,
+ *lc2;
+
+ projInfo->pi_exprContext = econtext;
+ /* We embed ExprState into ProjectionInfo instead of doing extra palloc */
+ projInfo->pi_state.tag = T_ExprState;
+ state = &projInfo->pi_state;
+ if (evalTargetList)
+ state->expr = (Expr *) targetList;
+ else
+ state->expr = NULL; /* not used */
+ state->parent = parent;
+ state->ext_params = NULL;
+
+ state->resultslot = slot;
+
+ /*
+ * Examine the targetList to see how many non-junk columns there are, and
+ * to verify that the non-junk columns come before the junk ones.
+ */
+ nAssignableCols = 0;
+ sawJunk = false;
+ foreach(lc, targetList)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc);
+
+ if (tle->resjunk)
+ sawJunk = true;
+ else
+ {
+ if (sawJunk)
+ elog(ERROR, "subplan target list is out of order");
+ nAssignableCols++;
+ }
+ }
+
+ /* We should have one targetColnos entry per non-junk column */
+ if (nAssignableCols != list_length(targetColnos))
+ elog(ERROR, "targetColnos does not match subplan target list");
+
+ /*
+ * Build a bitmapset of the columns in targetColnos. (We could just use
+ * list_member_int() tests, but that risks O(N^2) behavior with many
+ * columns.)
+ */
+ assignedCols = NULL;
+ foreach(lc, targetColnos)
+ {
+ AttrNumber targetattnum = lfirst_int(lc);
+
+ assignedCols = bms_add_member(assignedCols, targetattnum);
+ }
+
+ /*
+ * We need to insert EEOP_*_FETCHSOME steps to ensure the input tuples are
+ * sufficiently deconstructed. The scan tuple must be deconstructed at
+ * least as far as the last old column we need.
+ */
+ for (int attnum = relDesc->natts; attnum > 0; attnum--)
+ {
+ Form_pg_attribute attr = TupleDescAttr(relDesc, attnum - 1);
+
+ if (attr->attisdropped)
+ continue;
+ if (bms_is_member(attnum, assignedCols))
+ continue;
+ deform.last_scan = attnum;
+ break;
+ }
+
+ /*
+ * If we're actually evaluating the tlist, incorporate its input
+ * requirements too; otherwise, we'll just need to fetch the appropriate
+ * number of columns of the "outer" tuple.
+ */
+ if (evalTargetList)
+ get_last_attnums_walker((Node *) targetList, &deform);
+ else
+ deform.last_outer = nAssignableCols;
+
+ ExecPushExprSlots(state, &deform);
+
+ /*
+ * Now generate code to evaluate the tlist's assignable expressions or
+ * fetch them from the outer tuple, incidentally validating that they'll
+ * be of the right data type. The checks above ensure that the forboth()
+ * will iterate over exactly the non-junk columns.
+ */
+ outerattnum = 0;
+ forboth(lc, targetList, lc2, targetColnos)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc);
+ AttrNumber targetattnum = lfirst_int(lc2);
+ Form_pg_attribute attr;
+
+ Assert(!tle->resjunk);
+
+ /*
+ * Apply sanity checks comparable to ExecCheckPlanOutput().
+ */
+ if (targetattnum <= 0 || targetattnum > relDesc->natts)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Query has too many columns.")));
+ attr = TupleDescAttr(relDesc, targetattnum - 1);
+
+ if (attr->attisdropped)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Query provides a value for a dropped column at ordinal position %d.",
+ targetattnum)));
+ if (exprType((Node *) tle->expr) != attr->atttypid)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Table has type %s at ordinal position %d, but query expects %s.",
+ format_type_be(attr->atttypid),
+ targetattnum,
+ format_type_be(exprType((Node *) tle->expr)))));
+
+ /* OK, generate code to perform the assignment. */
+ if (evalTargetList)
+ {
+ /*
+ * We must evaluate the TLE's expression and assign it. We do not
+ * bother jumping through hoops for "safe" Vars like
+			 * ExecBuildProjectionInfo does; this is a less commonly used
+			 * path and it doesn't seem worth expending extra code on it.
+ */
+ ExecInitExprRec(tle->expr, state,
+ &state->resvalue, &state->resnull);
+ /* Needn't worry about read-only-ness here, either. */
+ scratch.opcode = EEOP_ASSIGN_TMP;
+ scratch.d.assign_tmp.resultnum = targetattnum - 1;
+ ExprEvalPushStep(state, &scratch);
+ }
+ else
+ {
+ /* Just assign from the outer tuple. */
+ scratch.opcode = EEOP_ASSIGN_OUTER_VAR;
+ scratch.d.assign_var.attnum = outerattnum;
+ scratch.d.assign_var.resultnum = targetattnum - 1;
+ ExprEvalPushStep(state, &scratch);
+ }
+ outerattnum++;
+ }
+
+ /*
+ * If we're evaluating the tlist, must evaluate any resjunk columns too.
+ * (This matters for things like MULTIEXPR_SUBLINK SubPlans.)
+ */
+ if (evalTargetList)
+ {
+ for_each_cell(lc, targetList, lc)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc);
+
+ Assert(tle->resjunk);
+ ExecInitExprRec(tle->expr, state,
+ &state->resvalue, &state->resnull);
+ }
+ }
+
+ /*
+ * Now generate code to copy over any old columns that were not assigned
+ * to, and to ensure that dropped columns are set to NULL.
+ */
+ for (int attnum = 1; attnum <= relDesc->natts; attnum++)
+ {
+ Form_pg_attribute attr = TupleDescAttr(relDesc, attnum - 1);
+
+ if (attr->attisdropped)
+ {
+ /* Put a null into the ExprState's resvalue/resnull ... */
+ scratch.opcode = EEOP_CONST;
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+ scratch.d.constval.value = (Datum) 0;
+ scratch.d.constval.isnull = true;
+ ExprEvalPushStep(state, &scratch);
+ /* ... then assign it to the result slot */
+ scratch.opcode = EEOP_ASSIGN_TMP;
+ scratch.d.assign_tmp.resultnum = attnum - 1;
+ ExprEvalPushStep(state, &scratch);
+ }
+ else if (!bms_is_member(attnum, assignedCols))
+ {
+ /* Certainly the right type, so needn't check */
+ scratch.opcode = EEOP_ASSIGN_SCAN_VAR;
+ scratch.d.assign_var.attnum = attnum - 1;
+ scratch.d.assign_var.resultnum = attnum - 1;
+ ExprEvalPushStep(state, &scratch);
+ }
+ }
+
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return projInfo;
+}
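+
+/*
+ * For exposition, consider "UPDATE t SET b = b + 1" on a table t(a, b, c).
+ * Here targetColnos is [2], and the projection built above assigns:
+ *
+ *		column a	copied from the old tuple in the "scan" slot
+ *		column b	the evaluated SET expression (or the corresponding
+ *					"outer" column, when evalTargetList is false)
+ *		column c	copied from the old tuple in the "scan" slot
+ *
+ * Any dropped columns would simply be set to NULL.
+ */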
+
+/*
+ * ExecPrepareExpr --- initialize for expression execution outside a normal
+ * Plan tree context.
+ *
+ * This differs from ExecInitExpr in that we don't assume the caller is
+ * already running in the EState's per-query context. Also, we run the
+ * passed expression tree through expression_planner() to prepare it for
+ * execution. (In ordinary Plan trees the regular planning process will have
+ * made the appropriate transformations on expressions, but for standalone
+ * expressions this won't have happened.)
+ */
+ExprState *
+ExecPrepareExpr(Expr *node, EState *estate)
+{
+ ExprState *result;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ node = expression_planner(node);
+
+ result = ExecInitExpr(node, NULL);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return result;
+}
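+
+/*
+ * Illustrative sketch (a hypothetical helper, not used anywhere; assumes
+ * executor/executor.h is included): the usual pattern for evaluating a
+ * standalone expression once, using a throwaway EState.  typByVal/typLen
+ * describe the expression's result type, so that a pass-by-reference result
+ * can be copied out before the EState's memory is released.
+ */
+static Datum
+example_eval_standalone_expr(Expr *expr, bool typByVal, int16 typLen,
+							 bool *isnull)
+{
+	EState	   *estate = CreateExecutorState();
+	ExprState  *exprstate;
+	Datum		result;
+
+	/* run expression_planner() and compile the result to an ExprState */
+	exprstate = ExecPrepareExpr(expr, estate);
+
+	/* evaluate in the EState's short-lived per-tuple memory context */
+	result = ExecEvalExprSwitchContext(exprstate,
+									   GetPerTupleExprContext(estate),
+									   isnull);
+
+	/* copy a pass-by-reference result out of the EState's memory */
+	if (!*isnull)
+		result = datumCopy(result, typByVal, typLen);
+
+	FreeExecutorState(estate);
+
+	return result;
+}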
+
+/*
+ * ExecPrepareQual --- initialize for qual execution outside a normal
+ * Plan tree context.
+ *
+ * This differs from ExecInitQual in that we don't assume the caller is
+ * already running in the EState's per-query context. Also, we run the
+ * passed expression tree through expression_planner() to prepare it for
+ * execution. (In ordinary Plan trees the regular planning process will have
+ * made the appropriate transformations on expressions, but for standalone
+ * expressions this won't have happened.)
+ */
+ExprState *
+ExecPrepareQual(List *qual, EState *estate)
+{
+ ExprState *result;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ qual = (List *) expression_planner((Expr *) qual);
+
+ result = ExecInitQual(qual, NULL);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return result;
+}
+
+/*
+ * ExecPrepareCheck -- initialize check constraint for execution outside a
+ * normal Plan tree context.
+ *
+ * See ExecPrepareExpr() and ExecInitCheck() for details.
+ */
+ExprState *
+ExecPrepareCheck(List *qual, EState *estate)
+{
+ ExprState *result;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ qual = (List *) expression_planner((Expr *) qual);
+
+ result = ExecInitCheck(qual, NULL);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return result;
+}
+
+/*
+ * Call ExecPrepareExpr() on each member of a list of Exprs, and return
+ * a list of ExprStates.
+ *
+ * See ExecPrepareExpr() for details.
+ */
+List *
+ExecPrepareExprList(List *nodes, EState *estate)
+{
+ List *result = NIL;
+ MemoryContext oldcontext;
+ ListCell *lc;
+
+ /* Ensure that the list cell nodes are in the right context too */
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ foreach(lc, nodes)
+ {
+ Expr *e = (Expr *) lfirst(lc);
+
+ result = lappend(result, ExecPrepareExpr(e, estate));
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return result;
+}
+
+/*
+ * ExecCheck - evaluate a check constraint
+ *
+ * For check constraints, a null result is taken as TRUE, ie the constraint
+ * passes.
+ *
+ * The check constraint may have been prepared with ExecInitCheck
+ * (possibly via ExecPrepareCheck) if the caller had it in implicit-AND
+ * format, but a regular boolean expression prepared with ExecInitExpr or
+ * ExecPrepareExpr works too.
+ */
+bool
+ExecCheck(ExprState *state, ExprContext *econtext)
+{
+ Datum ret;
+ bool isnull;
+
+ /* short-circuit (here and in ExecInitCheck) for empty restriction list */
+ if (state == NULL)
+ return true;
+
+ /* verify that expression was not compiled using ExecInitQual */
+ Assert(!(state->flags & EEO_FLAG_IS_QUAL));
+
+ ret = ExecEvalExprSwitchContext(state, econtext, &isnull);
+
+ if (isnull)
+ return true;
+
+ return DatumGetBool(ret);
+}
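+
+/*
+ * Illustrative sketch (a hypothetical helper, not used anywhere; assumes
+ * executor/executor.h is included): testing one tuple against a CHECK
+ * constraint given as an implicit-AND qual list.  Real callers would build
+ * the ExprState once and reuse it for every row rather than recompiling here.
+ */
+static bool
+example_tuple_satisfies_check(List *check_quals, TupleTableSlot *slot,
+							  EState *estate)
+{
+	ExprState  *checkstate = ExecPrepareCheck(check_quals, estate);
+	ExprContext *econtext = GetPerTupleExprContext(estate);
+
+	/* constraint expressions see the stored tuple as scan-level Vars */
+	econtext->ecxt_scantuple = slot;
+
+	/* a NULL result from the constraint counts as success */
+	return ExecCheck(checkstate, econtext);
+}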
+
+/*
+ * Prepare a compiled expression for execution. This has to be called for
+ * every ExprState before it can be executed.
+ *
+ * NB: While this currently only calls ExecReadyInterpretedExpr(),
+ * this will likely get extended to further expression evaluation methods.
+ * Therefore this should be used instead of directly calling
+ * ExecReadyInterpretedExpr().
+ */
+static void
+ExecReadyExpr(ExprState *state)
+{
+ if (jit_compile_expr(state))
+ return;
+
+ ExecReadyInterpretedExpr(state);
+}
+
+/*
+ * Append the steps necessary for the evaluation of node to ExprState->steps,
+ * possibly recursing into sub-expressions of node.
+ *
+ * node - expression to evaluate
+ * state - ExprState to whose ->steps to append the necessary operations
+ * resv / resnull - where to store the result of the node into
+ */
+static void
+ExecInitExprRec(Expr *node, ExprState *state,
+ Datum *resv, bool *resnull)
+{
+ ExprEvalStep scratch = {0};
+
+ /* Guard against stack overflow due to overly complex expressions */
+ check_stack_depth();
+
+ /* Step's output location is always what the caller gave us */
+ Assert(resv != NULL && resnull != NULL);
+ scratch.resvalue = resv;
+ scratch.resnull = resnull;
+
+ /* cases should be ordered as they are in enum NodeTag */
+ switch (nodeTag(node))
+ {
+ case T_Var:
+ {
+ Var *variable = (Var *) node;
+
+ if (variable->varattno == InvalidAttrNumber)
+ {
+ /* whole-row Var */
+ ExecInitWholeRowVar(&scratch, variable, state);
+ }
+ else if (variable->varattno <= 0)
+ {
+ /* system column */
+ scratch.d.var.attnum = variable->varattno;
+ scratch.d.var.vartype = variable->vartype;
+ switch (variable->varno)
+ {
+ case INNER_VAR:
+ scratch.opcode = EEOP_INNER_SYSVAR;
+ break;
+ case OUTER_VAR:
+ scratch.opcode = EEOP_OUTER_SYSVAR;
+ break;
+
+ /* INDEX_VAR is handled by default case */
+
+ default:
+ scratch.opcode = EEOP_SCAN_SYSVAR;
+ break;
+ }
+ }
+ else
+ {
+ /* regular user column */
+ scratch.d.var.attnum = variable->varattno - 1;
+ scratch.d.var.vartype = variable->vartype;
+ switch (variable->varno)
+ {
+ case INNER_VAR:
+ scratch.opcode = EEOP_INNER_VAR;
+ break;
+ case OUTER_VAR:
+ scratch.opcode = EEOP_OUTER_VAR;
+ break;
+
+ /* INDEX_VAR is handled by default case */
+
+ default:
+ scratch.opcode = EEOP_SCAN_VAR;
+ break;
+ }
+ }
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_Const:
+ {
+ Const *con = (Const *) node;
+
+ scratch.opcode = EEOP_CONST;
+ scratch.d.constval.value = con->constvalue;
+ scratch.d.constval.isnull = con->constisnull;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_Param:
+ {
+ Param *param = (Param *) node;
+ ParamListInfo params;
+
+ switch (param->paramkind)
+ {
+ case PARAM_EXEC:
+ scratch.opcode = EEOP_PARAM_EXEC;
+ scratch.d.param.paramid = param->paramid;
+ scratch.d.param.paramtype = param->paramtype;
+ ExprEvalPushStep(state, &scratch);
+ break;
+ case PARAM_EXTERN:
+
+ /*
+ * If we have a relevant ParamCompileHook, use it;
+ * otherwise compile a standard EEOP_PARAM_EXTERN
+ * step. ext_params, if supplied, takes precedence
+ * over info from the parent node's EState (if any).
+ */
+ if (state->ext_params)
+ params = state->ext_params;
+ else if (state->parent &&
+ state->parent->state)
+ params = state->parent->state->es_param_list_info;
+ else
+ params = NULL;
+ if (params && params->paramCompile)
+ {
+ params->paramCompile(params, param, state,
+ resv, resnull);
+ }
+ else
+ {
+ scratch.opcode = EEOP_PARAM_EXTERN;
+ scratch.d.param.paramid = param->paramid;
+ scratch.d.param.paramtype = param->paramtype;
+ ExprEvalPushStep(state, &scratch);
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized paramkind: %d",
+ (int) param->paramkind);
+ break;
+ }
+ break;
+ }
+
+ case T_Aggref:
+ {
+ Aggref *aggref = (Aggref *) node;
+
+ scratch.opcode = EEOP_AGGREF;
+ scratch.d.aggref.aggno = aggref->aggno;
+
+ if (state->parent && IsA(state->parent, AggState))
+ {
+ AggState *aggstate = (AggState *) state->parent;
+
+ aggstate->aggs = lappend(aggstate->aggs, aggref);
+ }
+ else
+ {
+ /* planner messed up */
+ elog(ERROR, "Aggref found in non-Agg plan node");
+ }
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_GroupingFunc:
+ {
+ GroupingFunc *grp_node = (GroupingFunc *) node;
+ Agg *agg;
+
+ if (!state->parent || !IsA(state->parent, AggState) ||
+ !IsA(state->parent->plan, Agg))
+ elog(ERROR, "GroupingFunc found in non-Agg plan node");
+
+ scratch.opcode = EEOP_GROUPING_FUNC;
+
+ agg = (Agg *) (state->parent->plan);
+
+ if (agg->groupingSets)
+ scratch.d.grouping_func.clauses = grp_node->cols;
+ else
+ scratch.d.grouping_func.clauses = NIL;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_WindowFunc:
+ {
+ WindowFunc *wfunc = (WindowFunc *) node;
+ WindowFuncExprState *wfstate = makeNode(WindowFuncExprState);
+
+ wfstate->wfunc = wfunc;
+
+ if (state->parent && IsA(state->parent, WindowAggState))
+ {
+ WindowAggState *winstate = (WindowAggState *) state->parent;
+ int nfuncs;
+
+ winstate->funcs = lappend(winstate->funcs, wfstate);
+ nfuncs = ++winstate->numfuncs;
+ if (wfunc->winagg)
+ winstate->numaggs++;
+
+ /* for now initialize agg using old style expressions */
+ wfstate->args = ExecInitExprList(wfunc->args,
+ state->parent);
+ wfstate->aggfilter = ExecInitExpr(wfunc->aggfilter,
+ state->parent);
+
+ /*
+ * Complain if the windowfunc's arguments contain any
+ * windowfuncs; nested window functions are semantically
+ * nonsensical. (This should have been caught earlier,
+ * but we defend against it here anyway.)
+ */
+ if (nfuncs != winstate->numfuncs)
+ ereport(ERROR,
+ (errcode(ERRCODE_WINDOWING_ERROR),
+ errmsg("window function calls cannot be nested")));
+ }
+ else
+ {
+ /* planner messed up */
+ elog(ERROR, "WindowFunc found in non-WindowAgg plan node");
+ }
+
+ scratch.opcode = EEOP_WINDOW_FUNC;
+ scratch.d.window_func.wfstate = wfstate;
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_SubscriptingRef:
+ {
+ SubscriptingRef *sbsref = (SubscriptingRef *) node;
+
+ ExecInitSubscriptingRef(&scratch, sbsref, state, resv, resnull);
+ break;
+ }
+
+ case T_FuncExpr:
+ {
+ FuncExpr *func = (FuncExpr *) node;
+
+ ExecInitFunc(&scratch, node,
+ func->args, func->funcid, func->inputcollid,
+ state);
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_OpExpr:
+ {
+ OpExpr *op = (OpExpr *) node;
+
+ ExecInitFunc(&scratch, node,
+ op->args, op->opfuncid, op->inputcollid,
+ state);
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_DistinctExpr:
+ {
+ DistinctExpr *op = (DistinctExpr *) node;
+
+ ExecInitFunc(&scratch, node,
+ op->args, op->opfuncid, op->inputcollid,
+ state);
+
+ /*
+ * Change opcode of call instruction to EEOP_DISTINCT.
+ *
+ * XXX: historically we've not called the function usage
+ * pgstat infrastructure - that seems inconsistent given that
+ * we do so for normal function *and* operator evaluation. If
+ * we decided to do that here, we'd probably want separate
+ * opcodes for FUSAGE or not.
+ */
+ scratch.opcode = EEOP_DISTINCT;
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_NullIfExpr:
+ {
+ NullIfExpr *op = (NullIfExpr *) node;
+
+ ExecInitFunc(&scratch, node,
+ op->args, op->opfuncid, op->inputcollid,
+ state);
+
+ /*
+ * Change opcode of call instruction to EEOP_NULLIF.
+ *
+ * XXX: historically we've not called the function usage
+ * pgstat infrastructure - that seems inconsistent given that
+ * we do so for normal function *and* operator evaluation. If
+ * we decided to do that here, we'd probably want separate
+ * opcodes for FUSAGE or not.
+ */
+ scratch.opcode = EEOP_NULLIF;
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_ScalarArrayOpExpr:
+ {
+ ScalarArrayOpExpr *opexpr = (ScalarArrayOpExpr *) node;
+ Expr *scalararg;
+ Expr *arrayarg;
+ FmgrInfo *finfo;
+ FunctionCallInfo fcinfo;
+ AclResult aclresult;
+ FmgrInfo *hash_finfo;
+ FunctionCallInfo hash_fcinfo;
+
+ Assert(list_length(opexpr->args) == 2);
+ scalararg = (Expr *) linitial(opexpr->args);
+ arrayarg = (Expr *) lsecond(opexpr->args);
+
+ /* Check permission to call function */
+ aclresult = pg_proc_aclcheck(opexpr->opfuncid,
+ GetUserId(),
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(opexpr->opfuncid));
+ InvokeFunctionExecuteHook(opexpr->opfuncid);
+
+ if (OidIsValid(opexpr->hashfuncid))
+ {
+ aclresult = pg_proc_aclcheck(opexpr->hashfuncid,
+ GetUserId(),
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(opexpr->hashfuncid));
+ InvokeFunctionExecuteHook(opexpr->hashfuncid);
+ }
+
+ /* Set up the primary fmgr lookup information */
+ finfo = palloc0(sizeof(FmgrInfo));
+ fcinfo = palloc0(SizeForFunctionCallInfo(2));
+ fmgr_info(opexpr->opfuncid, finfo);
+ fmgr_info_set_expr((Node *) node, finfo);
+ InitFunctionCallInfoData(*fcinfo, finfo, 2,
+ opexpr->inputcollid, NULL, NULL);
+
+ /*
+				 * If hashfuncid is set, we create an EEOP_HASHED_SCALARARRAYOP
+				 * step instead of an EEOP_SCALARARRAYOP step. This provides much
+ * faster lookup performance than the normal linear search
+ * when the number of items in the array is anything but very
+ * small.
+ */
+ if (OidIsValid(opexpr->hashfuncid))
+ {
+ hash_finfo = palloc0(sizeof(FmgrInfo));
+ hash_fcinfo = palloc0(SizeForFunctionCallInfo(1));
+ fmgr_info(opexpr->hashfuncid, hash_finfo);
+ fmgr_info_set_expr((Node *) node, hash_finfo);
+ InitFunctionCallInfoData(*hash_fcinfo, hash_finfo,
+ 1, opexpr->inputcollid, NULL,
+ NULL);
+
+ scratch.d.hashedscalararrayop.hash_finfo = hash_finfo;
+ scratch.d.hashedscalararrayop.hash_fcinfo_data = hash_fcinfo;
+ scratch.d.hashedscalararrayop.hash_fn_addr = hash_finfo->fn_addr;
+
+ /* Evaluate scalar directly into left function argument */
+ ExecInitExprRec(scalararg, state,
+ &fcinfo->args[0].value, &fcinfo->args[0].isnull);
+
+ /*
+ * Evaluate array argument into our return value. There's
+ * no danger in that, because the return value is
+ * guaranteed to be overwritten by
+ * EEOP_HASHED_SCALARARRAYOP, and will not be passed to
+ * any other expression.
+ */
+ ExecInitExprRec(arrayarg, state, resv, resnull);
+
+ /* And perform the operation */
+ scratch.opcode = EEOP_HASHED_SCALARARRAYOP;
+ scratch.d.hashedscalararrayop.finfo = finfo;
+ scratch.d.hashedscalararrayop.fcinfo_data = fcinfo;
+ scratch.d.hashedscalararrayop.fn_addr = finfo->fn_addr;
+
+ scratch.d.hashedscalararrayop.hash_finfo = hash_finfo;
+ scratch.d.hashedscalararrayop.hash_fcinfo_data = hash_fcinfo;
+ scratch.d.hashedscalararrayop.hash_fn_addr = hash_finfo->fn_addr;
+
+ ExprEvalPushStep(state, &scratch);
+ }
+ else
+ {
+ /* Evaluate scalar directly into left function argument */
+ ExecInitExprRec(scalararg, state,
+ &fcinfo->args[0].value,
+ &fcinfo->args[0].isnull);
+
+ /*
+ * Evaluate array argument into our return value. There's
+ * no danger in that, because the return value is
+ * guaranteed to be overwritten by EEOP_SCALARARRAYOP, and
+ * will not be passed to any other expression.
+ */
+ ExecInitExprRec(arrayarg, state, resv, resnull);
+
+ /* And perform the operation */
+ scratch.opcode = EEOP_SCALARARRAYOP;
+ scratch.d.scalararrayop.element_type = InvalidOid;
+ scratch.d.scalararrayop.useOr = opexpr->useOr;
+ scratch.d.scalararrayop.finfo = finfo;
+ scratch.d.scalararrayop.fcinfo_data = fcinfo;
+ scratch.d.scalararrayop.fn_addr = finfo->fn_addr;
+ ExprEvalPushStep(state, &scratch);
+ }
+ break;
+ }
+
+ case T_BoolExpr:
+ {
+ BoolExpr *boolexpr = (BoolExpr *) node;
+ int nargs = list_length(boolexpr->args);
+ List *adjust_jumps = NIL;
+ int off;
+ ListCell *lc;
+
+ /* allocate scratch memory used by all steps of AND/OR */
+ if (boolexpr->boolop != NOT_EXPR)
+ scratch.d.boolexpr.anynull = (bool *) palloc(sizeof(bool));
+
+ /*
+ * For each argument evaluate the argument itself, then
+ * perform the bool operation's appropriate handling.
+ *
+ * We can evaluate each argument into our result area, since
+ * the short-circuiting logic means we only need to remember
+ * previous NULL values.
+ *
+ * AND/OR is split into separate STEP_FIRST (one) / STEP (zero
+ * or more) / STEP_LAST (one) steps, as each of those has to
+ * perform different work. The FIRST/LAST split is valid
+ * because AND/OR have at least two arguments.
+ */
+ off = 0;
+ foreach(lc, boolexpr->args)
+ {
+ Expr *arg = (Expr *) lfirst(lc);
+
+ /* Evaluate argument into our output variable */
+ ExecInitExprRec(arg, state, resv, resnull);
+
+ /* Perform the appropriate step type */
+ switch (boolexpr->boolop)
+ {
+ case AND_EXPR:
+ Assert(nargs >= 2);
+
+ if (off == 0)
+ scratch.opcode = EEOP_BOOL_AND_STEP_FIRST;
+ else if (off + 1 == nargs)
+ scratch.opcode = EEOP_BOOL_AND_STEP_LAST;
+ else
+ scratch.opcode = EEOP_BOOL_AND_STEP;
+ break;
+ case OR_EXPR:
+ Assert(nargs >= 2);
+
+ if (off == 0)
+ scratch.opcode = EEOP_BOOL_OR_STEP_FIRST;
+ else if (off + 1 == nargs)
+ scratch.opcode = EEOP_BOOL_OR_STEP_LAST;
+ else
+ scratch.opcode = EEOP_BOOL_OR_STEP;
+ break;
+ case NOT_EXPR:
+ Assert(nargs == 1);
+
+ scratch.opcode = EEOP_BOOL_NOT_STEP;
+ break;
+ default:
+ elog(ERROR, "unrecognized boolop: %d",
+ (int) boolexpr->boolop);
+ break;
+ }
+
+ scratch.d.boolexpr.jumpdone = -1;
+ ExprEvalPushStep(state, &scratch);
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ off++;
+ }
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ Assert(as->d.boolexpr.jumpdone == -1);
+ as->d.boolexpr.jumpdone = state->steps_len;
+ }
+
+ break;
+ }
+
+ case T_SubPlan:
+ {
+ SubPlan *subplan = (SubPlan *) node;
+ SubPlanState *sstate;
+
+ if (!state->parent)
+ elog(ERROR, "SubPlan found with no parent plan");
+
+ sstate = ExecInitSubPlan(subplan, state->parent);
+
+ /* add SubPlanState nodes to state->parent->subPlan */
+ state->parent->subPlan = lappend(state->parent->subPlan,
+ sstate);
+
+ scratch.opcode = EEOP_SUBPLAN;
+ scratch.d.subplan.sstate = sstate;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_FieldSelect:
+ {
+ FieldSelect *fselect = (FieldSelect *) node;
+
+ /* evaluate row/record argument into result area */
+ ExecInitExprRec(fselect->arg, state, resv, resnull);
+
+ /* and extract field */
+ scratch.opcode = EEOP_FIELDSELECT;
+ scratch.d.fieldselect.fieldnum = fselect->fieldnum;
+ scratch.d.fieldselect.resulttype = fselect->resulttype;
+ scratch.d.fieldselect.rowcache.cacheptr = NULL;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_FieldStore:
+ {
+ FieldStore *fstore = (FieldStore *) node;
+ TupleDesc tupDesc;
+ ExprEvalRowtypeCache *rowcachep;
+ Datum *values;
+ bool *nulls;
+ int ncolumns;
+ ListCell *l1,
+ *l2;
+
+ /* find out the number of columns in the composite type */
+ tupDesc = lookup_rowtype_tupdesc(fstore->resulttype, -1);
+ ncolumns = tupDesc->natts;
+ DecrTupleDescRefCount(tupDesc);
+
+ /* create workspace for column values */
+ values = (Datum *) palloc(sizeof(Datum) * ncolumns);
+ nulls = (bool *) palloc(sizeof(bool) * ncolumns);
+
+ /* create shared composite-type-lookup cache struct */
+ rowcachep = palloc(sizeof(ExprEvalRowtypeCache));
+ rowcachep->cacheptr = NULL;
+
+ /* emit code to evaluate the composite input value */
+ ExecInitExprRec(fstore->arg, state, resv, resnull);
+
+ /* next, deform the input tuple into our workspace */
+ scratch.opcode = EEOP_FIELDSTORE_DEFORM;
+ scratch.d.fieldstore.fstore = fstore;
+ scratch.d.fieldstore.rowcache = rowcachep;
+ scratch.d.fieldstore.values = values;
+ scratch.d.fieldstore.nulls = nulls;
+ scratch.d.fieldstore.ncolumns = ncolumns;
+ ExprEvalPushStep(state, &scratch);
+
+ /* evaluate new field values, store in workspace columns */
+ forboth(l1, fstore->newvals, l2, fstore->fieldnums)
+ {
+ Expr *e = (Expr *) lfirst(l1);
+ AttrNumber fieldnum = lfirst_int(l2);
+ Datum *save_innermost_caseval;
+ bool *save_innermost_casenull;
+
+ if (fieldnum <= 0 || fieldnum > ncolumns)
+ elog(ERROR, "field number %d is out of range in FieldStore",
+ fieldnum);
+
+ /*
+ * Use the CaseTestExpr mechanism to pass down the old
+ * value of the field being replaced; this is needed in
+ * case the newval is itself a FieldStore or
+ * SubscriptingRef that has to obtain and modify the old
+ * value. It's safe to reuse the CASE mechanism because
+ * there cannot be a CASE between here and where the value
+ * would be needed, and a field assignment can't be within
+ * a CASE either. (So saving and restoring
+ * innermost_caseval is just paranoia, but let's do it
+ * anyway.)
+ *
+ * Another non-obvious point is that it's safe to use the
+ * field's values[]/nulls[] entries as both the caseval
+ * source and the result address for this subexpression.
+ * That's okay only because (1) both FieldStore and
+ * SubscriptingRef evaluate their arg or refexpr inputs
+ * first, and (2) any such CaseTestExpr is directly the
+ * arg or refexpr input. So any read of the caseval will
+ * occur before there's a chance to overwrite it. Also,
+ * if multiple entries in the newvals/fieldnums lists
+ * target the same field, they'll effectively be applied
+ * left-to-right which is what we want.
+ */
+ save_innermost_caseval = state->innermost_caseval;
+ save_innermost_casenull = state->innermost_casenull;
+ state->innermost_caseval = &values[fieldnum - 1];
+ state->innermost_casenull = &nulls[fieldnum - 1];
+
+ ExecInitExprRec(e, state,
+ &values[fieldnum - 1],
+ &nulls[fieldnum - 1]);
+
+ state->innermost_caseval = save_innermost_caseval;
+ state->innermost_casenull = save_innermost_casenull;
+ }
+
+ /* finally, form result tuple */
+ scratch.opcode = EEOP_FIELDSTORE_FORM;
+ scratch.d.fieldstore.fstore = fstore;
+ scratch.d.fieldstore.rowcache = rowcachep;
+ scratch.d.fieldstore.values = values;
+ scratch.d.fieldstore.nulls = nulls;
+ scratch.d.fieldstore.ncolumns = ncolumns;
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_RelabelType:
+ {
+ /* relabel doesn't need to do anything at runtime */
+ RelabelType *relabel = (RelabelType *) node;
+
+ ExecInitExprRec(relabel->arg, state, resv, resnull);
+ break;
+ }
+
+ case T_CoerceViaIO:
+ {
+ CoerceViaIO *iocoerce = (CoerceViaIO *) node;
+ Oid iofunc;
+ bool typisvarlena;
+ Oid typioparam;
+ FunctionCallInfo fcinfo_in;
+
+ /* evaluate argument into step's result area */
+ ExecInitExprRec(iocoerce->arg, state, resv, resnull);
+
+ /*
+ * Prepare both output and input function calls, to be
+ * evaluated inside a single evaluation step for speed - this
+ * can be a very common operation.
+ *
+				 * We don't check permissions here, as a type's input/output
+				 * functions are assumed to be executable by everyone.
+ */
+ scratch.opcode = EEOP_IOCOERCE;
+
+ /* lookup the source type's output function */
+ scratch.d.iocoerce.finfo_out = palloc0(sizeof(FmgrInfo));
+ scratch.d.iocoerce.fcinfo_data_out = palloc0(SizeForFunctionCallInfo(1));
+
+ getTypeOutputInfo(exprType((Node *) iocoerce->arg),
+ &iofunc, &typisvarlena);
+ fmgr_info(iofunc, scratch.d.iocoerce.finfo_out);
+ fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_out);
+ InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_out,
+ scratch.d.iocoerce.finfo_out,
+ 1, InvalidOid, NULL, NULL);
+
+ /* lookup the result type's input function */
+ scratch.d.iocoerce.finfo_in = palloc0(sizeof(FmgrInfo));
+ scratch.d.iocoerce.fcinfo_data_in = palloc0(SizeForFunctionCallInfo(3));
+
+ getTypeInputInfo(iocoerce->resulttype,
+ &iofunc, &typioparam);
+ fmgr_info(iofunc, scratch.d.iocoerce.finfo_in);
+ fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_in);
+ InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_in,
+ scratch.d.iocoerce.finfo_in,
+ 3, InvalidOid, NULL, NULL);
+
+ /*
+ * We can preload the second and third arguments for the input
+ * function, since they're constants.
+ */
+ fcinfo_in = scratch.d.iocoerce.fcinfo_data_in;
+ fcinfo_in->args[1].value = ObjectIdGetDatum(typioparam);
+ fcinfo_in->args[1].isnull = false;
+ fcinfo_in->args[2].value = Int32GetDatum(-1);
+ fcinfo_in->args[2].isnull = false;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_ArrayCoerceExpr:
+ {
+ ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
+ Oid resultelemtype;
+ ExprState *elemstate;
+
+ /* evaluate argument into step's result area */
+ ExecInitExprRec(acoerce->arg, state, resv, resnull);
+
+ resultelemtype = get_element_type(acoerce->resulttype);
+ if (!OidIsValid(resultelemtype))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("target type is not an array")));
+
+ /*
+ * Construct a sub-expression for the per-element expression;
+ * but don't ready it until after we check it for triviality.
+				 * We assume it has no Var references, but does have a
+ * CaseTestExpr representing the source array element values.
+ */
+ elemstate = makeNode(ExprState);
+ elemstate->expr = acoerce->elemexpr;
+ elemstate->parent = state->parent;
+ elemstate->ext_params = state->ext_params;
+
+ elemstate->innermost_caseval = (Datum *) palloc(sizeof(Datum));
+ elemstate->innermost_casenull = (bool *) palloc(sizeof(bool));
+
+ ExecInitExprRec(acoerce->elemexpr, elemstate,
+ &elemstate->resvalue, &elemstate->resnull);
+
+ if (elemstate->steps_len == 1 &&
+ elemstate->steps[0].opcode == EEOP_CASE_TESTVAL)
+ {
+ /* Trivial, so we need no per-element work at runtime */
+ elemstate = NULL;
+ }
+ else
+ {
+ /* Not trivial, so append a DONE step */
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(elemstate, &scratch);
+ /* and ready the subexpression */
+ ExecReadyExpr(elemstate);
+ }
+
+ scratch.opcode = EEOP_ARRAYCOERCE;
+ scratch.d.arraycoerce.elemexprstate = elemstate;
+ scratch.d.arraycoerce.resultelemtype = resultelemtype;
+
+ if (elemstate)
+ {
+ /* Set up workspace for array_map */
+ scratch.d.arraycoerce.amstate =
+ (ArrayMapState *) palloc0(sizeof(ArrayMapState));
+ }
+ else
+ {
+ /* Don't need workspace if there's no subexpression */
+ scratch.d.arraycoerce.amstate = NULL;
+ }
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_ConvertRowtypeExpr:
+ {
+ ConvertRowtypeExpr *convert = (ConvertRowtypeExpr *) node;
+ ExprEvalRowtypeCache *rowcachep;
+
+ /* cache structs must be out-of-line for space reasons */
+ rowcachep = palloc(2 * sizeof(ExprEvalRowtypeCache));
+ rowcachep[0].cacheptr = NULL;
+ rowcachep[1].cacheptr = NULL;
+
+ /* evaluate argument into step's result area */
+ ExecInitExprRec(convert->arg, state, resv, resnull);
+
+ /* and push conversion step */
+ scratch.opcode = EEOP_CONVERT_ROWTYPE;
+ scratch.d.convert_rowtype.inputtype =
+ exprType((Node *) convert->arg);
+ scratch.d.convert_rowtype.outputtype = convert->resulttype;
+ scratch.d.convert_rowtype.incache = &rowcachep[0];
+ scratch.d.convert_rowtype.outcache = &rowcachep[1];
+ scratch.d.convert_rowtype.map = NULL;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ /* note that CaseWhen expressions are handled within this block */
+ case T_CaseExpr:
+ {
+ CaseExpr *caseExpr = (CaseExpr *) node;
+ List *adjust_jumps = NIL;
+ Datum *caseval = NULL;
+ bool *casenull = NULL;
+ ListCell *lc;
+
+ /*
+ * If there's a test expression, we have to evaluate it and
+ * save the value where the CaseTestExpr placeholders can find
+ * it.
+ */
+ if (caseExpr->arg != NULL)
+ {
+ /* Evaluate testexpr into caseval/casenull workspace */
+ caseval = palloc(sizeof(Datum));
+ casenull = palloc(sizeof(bool));
+
+ ExecInitExprRec(caseExpr->arg, state,
+ caseval, casenull);
+
+ /*
+ * Since value might be read multiple times, force to R/O
+ * - but only if it could be an expanded datum.
+ */
+ if (get_typlen(exprType((Node *) caseExpr->arg)) == -1)
+ {
+ /* change caseval in-place */
+ scratch.opcode = EEOP_MAKE_READONLY;
+ scratch.resvalue = caseval;
+ scratch.resnull = casenull;
+ scratch.d.make_readonly.value = caseval;
+ scratch.d.make_readonly.isnull = casenull;
+ ExprEvalPushStep(state, &scratch);
+ /* restore normal settings of scratch fields */
+ scratch.resvalue = resv;
+ scratch.resnull = resnull;
+ }
+ }
+
+ /*
+ * Prepare to evaluate each of the WHEN clauses in turn; as
+ * soon as one is true we return the value of the
+ * corresponding THEN clause. If none are true then we return
+ * the value of the ELSE clause, or NULL if there is none.
+ */
+ foreach(lc, caseExpr->args)
+ {
+ CaseWhen *when = (CaseWhen *) lfirst(lc);
+ Datum *save_innermost_caseval;
+ bool *save_innermost_casenull;
+ int whenstep;
+
+ /*
+ * Make testexpr result available to CaseTestExpr nodes
+ * within the condition. We must save and restore prior
+ * setting of innermost_caseval fields, in case this node
+ * is itself within a larger CASE.
+ *
+ * If there's no test expression, we don't actually need
+ * to save and restore these fields; but it's less code to
+ * just do so unconditionally.
+ */
+ save_innermost_caseval = state->innermost_caseval;
+ save_innermost_casenull = state->innermost_casenull;
+ state->innermost_caseval = caseval;
+ state->innermost_casenull = casenull;
+
+ /* evaluate condition into CASE's result variables */
+ ExecInitExprRec(when->expr, state, resv, resnull);
+
+ state->innermost_caseval = save_innermost_caseval;
+ state->innermost_casenull = save_innermost_casenull;
+
+ /* If WHEN result isn't true, jump to next CASE arm */
+ scratch.opcode = EEOP_JUMP_IF_NOT_TRUE;
+ scratch.d.jump.jumpdone = -1; /* computed later */
+ ExprEvalPushStep(state, &scratch);
+ whenstep = state->steps_len - 1;
+
+ /*
+ * If WHEN result is true, evaluate THEN result, storing
+ * it into the CASE's result variables.
+ */
+ ExecInitExprRec(when->result, state, resv, resnull);
+
+ /* Emit JUMP step to jump to end of CASE's code */
+ scratch.opcode = EEOP_JUMP;
+ scratch.d.jump.jumpdone = -1; /* computed later */
+ ExprEvalPushStep(state, &scratch);
+
+ /*
+ * Don't know address for that jump yet, compute once the
+ * whole CASE expression is built.
+ */
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+
+ /*
+ * But we can set WHEN test's jump target now, to make it
+ * jump to the next WHEN subexpression or the ELSE.
+ */
+ state->steps[whenstep].d.jump.jumpdone = state->steps_len;
+ }
+
+ /* transformCaseExpr always adds a default */
+ Assert(caseExpr->defresult);
+
+ /* evaluate ELSE expr into CASE's result variables */
+ ExecInitExprRec(caseExpr->defresult, state,
+ resv, resnull);
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ Assert(as->opcode == EEOP_JUMP);
+ Assert(as->d.jump.jumpdone == -1);
+ as->d.jump.jumpdone = state->steps_len;
+ }
+
+ break;
+ }
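+			/*
+			 * Rough sketch (simplified) of what the block above builds: a
+			 * fragment like
+			 *		CASE WHEN c1 THEN r1 WHEN c2 THEN r2 ELSE d END
+			 * compiles to approximately
+			 *		eval c1 into resv/resnull
+			 *		EEOP_JUMP_IF_NOT_TRUE	-> W2
+			 *		eval r1 into resv/resnull
+			 *		EEOP_JUMP				-> END
+			 *	W2:	eval c2 into resv/resnull
+			 *		EEOP_JUMP_IF_NOT_TRUE	-> ELSE
+			 *		eval r2 into resv/resnull
+			 *		EEOP_JUMP				-> END
+			 * ELSE: eval d into resv/resnull
+			 * END:	(next step)
+			 */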
+
+ case T_CaseTestExpr:
+ {
+ /*
+ * Read from location identified by innermost_caseval. Note
+ * that innermost_caseval could be NULL, if this node isn't
+ * actually within a CaseExpr, ArrayCoerceExpr, etc structure.
+ * That can happen because some parts of the system abuse
+ * CaseTestExpr to cause a read of a value externally supplied
+ * in econtext->caseValue_datum. We'll take care of that
+ * scenario at runtime.
+ */
+ scratch.opcode = EEOP_CASE_TESTVAL;
+ scratch.d.casetest.value = state->innermost_caseval;
+ scratch.d.casetest.isnull = state->innermost_casenull;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_ArrayExpr:
+ {
+ ArrayExpr *arrayexpr = (ArrayExpr *) node;
+ int nelems = list_length(arrayexpr->elements);
+ ListCell *lc;
+ int elemoff;
+
+ /*
+ * Evaluate by computing each element, and then forming the
+ * array. Elements are computed into scratch arrays
+ * associated with the ARRAYEXPR step.
+ */
+ scratch.opcode = EEOP_ARRAYEXPR;
+ scratch.d.arrayexpr.elemvalues =
+ (Datum *) palloc(sizeof(Datum) * nelems);
+ scratch.d.arrayexpr.elemnulls =
+ (bool *) palloc(sizeof(bool) * nelems);
+ scratch.d.arrayexpr.nelems = nelems;
+
+ /* fill remaining fields of step */
+ scratch.d.arrayexpr.multidims = arrayexpr->multidims;
+ scratch.d.arrayexpr.elemtype = arrayexpr->element_typeid;
+
+ /* do one-time catalog lookup for type info */
+ get_typlenbyvalalign(arrayexpr->element_typeid,
+ &scratch.d.arrayexpr.elemlength,
+ &scratch.d.arrayexpr.elembyval,
+ &scratch.d.arrayexpr.elemalign);
+
+ /* prepare to evaluate all arguments */
+ elemoff = 0;
+ foreach(lc, arrayexpr->elements)
+ {
+ Expr *e = (Expr *) lfirst(lc);
+
+ ExecInitExprRec(e, state,
+ &scratch.d.arrayexpr.elemvalues[elemoff],
+ &scratch.d.arrayexpr.elemnulls[elemoff]);
+ elemoff++;
+ }
+
+ /* and then collect all into an array */
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
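+			/*
+			 * Rough sketch (simplified): for ARRAY[a, b, c] the loop above
+			 * emits steps evaluating a, b and c into elemvalues[0..2] /
+			 * elemnulls[0..2], followed by a single EEOP_ARRAYEXPR step that
+			 * assembles those scratch values into the result array.
+			 */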
+
+ case T_RowExpr:
+ {
+ RowExpr *rowexpr = (RowExpr *) node;
+ int nelems = list_length(rowexpr->args);
+ TupleDesc tupdesc;
+ int i;
+ ListCell *l;
+
+ /* Build tupdesc to describe result tuples */
+ if (rowexpr->row_typeid == RECORDOID)
+ {
+ /* generic record, use types of given expressions */
+ tupdesc = ExecTypeFromExprList(rowexpr->args);
+ /* ... but adopt RowExpr's column aliases */
+ ExecTypeSetColNames(tupdesc, rowexpr->colnames);
+ /* Bless the tupdesc so it can be looked up later */
+ BlessTupleDesc(tupdesc);
+ }
+ else
+ {
+ /* it's been cast to a named type, use that */
+ tupdesc = lookup_rowtype_tupdesc_copy(rowexpr->row_typeid, -1);
+ }
+
+ /*
+ * In the named-type case, the tupdesc could have more columns
+ * than are in the args list, since the type might have had
+ * columns added since the ROW() was parsed. We want those
+ * extra columns to go to nulls, so we make sure that the
+ * workspace arrays are large enough and then initialize any
+ * extra columns to read as NULLs.
+ */
+ Assert(nelems <= tupdesc->natts);
+ nelems = Max(nelems, tupdesc->natts);
+
+ /*
+ * Evaluate by first building datums for each field, and then
+ * a final step forming the composite datum.
+ */
+ scratch.opcode = EEOP_ROW;
+ scratch.d.row.tupdesc = tupdesc;
+
+ /* space for the individual field datums */
+ scratch.d.row.elemvalues =
+ (Datum *) palloc(sizeof(Datum) * nelems);
+ scratch.d.row.elemnulls =
+ (bool *) palloc(sizeof(bool) * nelems);
+ /* as explained above, make sure any extra columns are null */
+ memset(scratch.d.row.elemnulls, true, sizeof(bool) * nelems);
+
+ /* Set up evaluation, skipping any deleted columns */
+ i = 0;
+ foreach(l, rowexpr->args)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupdesc, i);
+ Expr *e = (Expr *) lfirst(l);
+
+ if (!att->attisdropped)
+ {
+ /*
+ * Guard against ALTER COLUMN TYPE on rowtype since
+ * the RowExpr was created. XXX should we check
+ * typmod too? Not sure we can be sure it'll be the
+ * same.
+ */
+ if (exprType((Node *) e) != att->atttypid)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("ROW() column has type %s instead of type %s",
+ format_type_be(exprType((Node *) e)),
+ format_type_be(att->atttypid))));
+ }
+ else
+ {
+ /*
+ * Ignore original expression and insert a NULL. We
+ * don't really care what type of NULL it is, so
+ * always make an int4 NULL.
+ */
+ e = (Expr *) makeNullConst(INT4OID, -1, InvalidOid);
+ }
+
+ /* Evaluate column expr into appropriate workspace slot */
+ ExecInitExprRec(e, state,
+ &scratch.d.row.elemvalues[i],
+ &scratch.d.row.elemnulls[i]);
+ i++;
+ }
+
+ /* And finally build the row value */
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_RowCompareExpr:
+ {
+ RowCompareExpr *rcexpr = (RowCompareExpr *) node;
+ int nopers = list_length(rcexpr->opnos);
+ List *adjust_jumps = NIL;
+ ListCell *l_left_expr,
+ *l_right_expr,
+ *l_opno,
+ *l_opfamily,
+ *l_inputcollid;
+ ListCell *lc;
+
+ /*
+ * Iterate over each field, prepare comparisons. To handle
+ * NULL results, prepare jumps to after the expression. If a
+ * comparison yields a != 0 result, jump to the final step.
+ */
+ Assert(list_length(rcexpr->largs) == nopers);
+ Assert(list_length(rcexpr->rargs) == nopers);
+ Assert(list_length(rcexpr->opfamilies) == nopers);
+ Assert(list_length(rcexpr->inputcollids) == nopers);
+
+ forfive(l_left_expr, rcexpr->largs,
+ l_right_expr, rcexpr->rargs,
+ l_opno, rcexpr->opnos,
+ l_opfamily, rcexpr->opfamilies,
+ l_inputcollid, rcexpr->inputcollids)
+ {
+ Expr *left_expr = (Expr *) lfirst(l_left_expr);
+ Expr *right_expr = (Expr *) lfirst(l_right_expr);
+ Oid opno = lfirst_oid(l_opno);
+ Oid opfamily = lfirst_oid(l_opfamily);
+ Oid inputcollid = lfirst_oid(l_inputcollid);
+ int strategy;
+ Oid lefttype;
+ Oid righttype;
+ Oid proc;
+ FmgrInfo *finfo;
+ FunctionCallInfo fcinfo;
+
+ get_op_opfamily_properties(opno, opfamily, false,
+ &strategy,
+ &lefttype,
+ &righttype);
+ proc = get_opfamily_proc(opfamily,
+ lefttype,
+ righttype,
+ BTORDER_PROC);
+ if (!OidIsValid(proc))
+ elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
+ BTORDER_PROC, lefttype, righttype, opfamily);
+
+ /* Set up the primary fmgr lookup information */
+ finfo = palloc0(sizeof(FmgrInfo));
+ fcinfo = palloc0(SizeForFunctionCallInfo(2));
+ fmgr_info(proc, finfo);
+ fmgr_info_set_expr((Node *) node, finfo);
+ InitFunctionCallInfoData(*fcinfo, finfo, 2,
+ inputcollid, NULL, NULL);
+
+ /*
+ * If we enforced permissions checks on index support
+ * functions, we'd need to make a check here. But the
+ * index support machinery doesn't do that, and thus
+ * neither does this code.
+ */
+
+ /* evaluate left and right args directly into fcinfo */
+ ExecInitExprRec(left_expr, state,
+ &fcinfo->args[0].value, &fcinfo->args[0].isnull);
+ ExecInitExprRec(right_expr, state,
+ &fcinfo->args[1].value, &fcinfo->args[1].isnull);
+
+ scratch.opcode = EEOP_ROWCOMPARE_STEP;
+ scratch.d.rowcompare_step.finfo = finfo;
+ scratch.d.rowcompare_step.fcinfo_data = fcinfo;
+ scratch.d.rowcompare_step.fn_addr = finfo->fn_addr;
+ /* jump targets filled below */
+ scratch.d.rowcompare_step.jumpnull = -1;
+ scratch.d.rowcompare_step.jumpdone = -1;
+
+ ExprEvalPushStep(state, &scratch);
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ }
+
+ /*
+ * We could have a zero-column rowtype, in which case the rows
+ * necessarily compare equal.
+ */
+ if (nopers == 0)
+ {
+ scratch.opcode = EEOP_CONST;
+ scratch.d.constval.value = Int32GetDatum(0);
+ scratch.d.constval.isnull = false;
+ ExprEvalPushStep(state, &scratch);
+ }
+
+ /* Finally, examine the last comparison result */
+ scratch.opcode = EEOP_ROWCOMPARE_FINAL;
+ scratch.d.rowcompare_final.rctype = rcexpr->rctype;
+ ExprEvalPushStep(state, &scratch);
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ Assert(as->opcode == EEOP_ROWCOMPARE_STEP);
+ Assert(as->d.rowcompare_step.jumpdone == -1);
+ Assert(as->d.rowcompare_step.jumpnull == -1);
+
+ /* jump to comparison evaluation */
+ as->d.rowcompare_step.jumpdone = state->steps_len - 1;
+ /* jump to the following expression */
+ as->d.rowcompare_step.jumpnull = state->steps_len;
+ }
+
+ break;
+ }
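+			/*
+			 * Rough sketch (simplified): for (a1, a2) < (b1, b2) the loop
+			 * above emits, per column pair, steps evaluating both inputs into
+			 * a private fcinfo followed by an EEOP_ROWCOMPARE_STEP. Each
+			 * step's jumpdone target is the EEOP_ROWCOMPARE_FINAL step (taken
+			 * as soon as a column pair compares unequal), while jumpnull
+			 * points just past it, so a NULL comparison input leaves the
+			 * overall result NULL.
+			 */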
+
+ case T_CoalesceExpr:
+ {
+ CoalesceExpr *coalesce = (CoalesceExpr *) node;
+ List *adjust_jumps = NIL;
+ ListCell *lc;
+
+ /* We assume there's at least one arg */
+ Assert(coalesce->args != NIL);
+
+ /*
+ * Prepare evaluation of all coalesced arguments, after each
+ * one push a step that short-circuits if not null.
+ */
+ foreach(lc, coalesce->args)
+ {
+ Expr *e = (Expr *) lfirst(lc);
+
+ /* evaluate argument, directly into result datum */
+ ExecInitExprRec(e, state, resv, resnull);
+
+ /* if it's not null, skip to end of COALESCE expr */
+ scratch.opcode = EEOP_JUMP_IF_NOT_NULL;
+ scratch.d.jump.jumpdone = -1; /* adjust later */
+ ExprEvalPushStep(state, &scratch);
+
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ }
+
+ /*
+				 * No need to add a constant NULL return - we can only get to
+				 * the end of the expression if a NULL is already being
+				 * returned.
+ */
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ Assert(as->opcode == EEOP_JUMP_IF_NOT_NULL);
+ Assert(as->d.jump.jumpdone == -1);
+ as->d.jump.jumpdone = state->steps_len;
+ }
+
+ break;
+ }
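+			/*
+			 * Rough sketch (simplified): COALESCE(a, b, c) compiles to
+			 *		eval a into resv/resnull; EEOP_JUMP_IF_NOT_NULL -> END
+			 *		eval b into resv/resnull; EEOP_JUMP_IF_NOT_NULL -> END
+			 *		eval c into resv/resnull
+			 *	END: (next step)
+			 * Falling through all arms leaves the last (NULL) value in place,
+			 * which is why no explicit NULL return is needed.
+			 */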
+
+ case T_MinMaxExpr:
+ {
+ MinMaxExpr *minmaxexpr = (MinMaxExpr *) node;
+ int nelems = list_length(minmaxexpr->args);
+ TypeCacheEntry *typentry;
+ FmgrInfo *finfo;
+ FunctionCallInfo fcinfo;
+ ListCell *lc;
+ int off;
+
+ /* Look up the btree comparison function for the datatype */
+ typentry = lookup_type_cache(minmaxexpr->minmaxtype,
+ TYPECACHE_CMP_PROC);
+ if (!OidIsValid(typentry->cmp_proc))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_FUNCTION),
+ errmsg("could not identify a comparison function for type %s",
+ format_type_be(minmaxexpr->minmaxtype))));
+
+ /*
+ * If we enforced permissions checks on index support
+ * functions, we'd need to make a check here. But the index
+ * support machinery doesn't do that, and thus neither does
+ * this code.
+ */
+
+ /* Perform function lookup */
+ finfo = palloc0(sizeof(FmgrInfo));
+ fcinfo = palloc0(SizeForFunctionCallInfo(2));
+ fmgr_info(typentry->cmp_proc, finfo);
+ fmgr_info_set_expr((Node *) node, finfo);
+ InitFunctionCallInfoData(*fcinfo, finfo, 2,
+ minmaxexpr->inputcollid, NULL, NULL);
+
+ scratch.opcode = EEOP_MINMAX;
+ /* allocate space to store arguments */
+ scratch.d.minmax.values =
+ (Datum *) palloc(sizeof(Datum) * nelems);
+ scratch.d.minmax.nulls =
+ (bool *) palloc(sizeof(bool) * nelems);
+ scratch.d.minmax.nelems = nelems;
+
+ scratch.d.minmax.op = minmaxexpr->op;
+ scratch.d.minmax.finfo = finfo;
+ scratch.d.minmax.fcinfo_data = fcinfo;
+
+ /* evaluate expressions into minmax->values/nulls */
+ off = 0;
+ foreach(lc, minmaxexpr->args)
+ {
+ Expr *e = (Expr *) lfirst(lc);
+
+ ExecInitExprRec(e, state,
+ &scratch.d.minmax.values[off],
+ &scratch.d.minmax.nulls[off]);
+ off++;
+ }
+
+ /* and push the final comparison */
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_SQLValueFunction:
+ {
+ SQLValueFunction *svf = (SQLValueFunction *) node;
+
+ scratch.opcode = EEOP_SQLVALUEFUNCTION;
+ scratch.d.sqlvaluefunction.svf = svf;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_XmlExpr:
+ {
+ XmlExpr *xexpr = (XmlExpr *) node;
+ int nnamed = list_length(xexpr->named_args);
+ int nargs = list_length(xexpr->args);
+ int off;
+ ListCell *arg;
+
+ scratch.opcode = EEOP_XMLEXPR;
+ scratch.d.xmlexpr.xexpr = xexpr;
+
+ /* allocate space for storing all the arguments */
+ if (nnamed)
+ {
+ scratch.d.xmlexpr.named_argvalue =
+ (Datum *) palloc(sizeof(Datum) * nnamed);
+ scratch.d.xmlexpr.named_argnull =
+ (bool *) palloc(sizeof(bool) * nnamed);
+ }
+ else
+ {
+ scratch.d.xmlexpr.named_argvalue = NULL;
+ scratch.d.xmlexpr.named_argnull = NULL;
+ }
+
+ if (nargs)
+ {
+ scratch.d.xmlexpr.argvalue =
+ (Datum *) palloc(sizeof(Datum) * nargs);
+ scratch.d.xmlexpr.argnull =
+ (bool *) palloc(sizeof(bool) * nargs);
+ }
+ else
+ {
+ scratch.d.xmlexpr.argvalue = NULL;
+ scratch.d.xmlexpr.argnull = NULL;
+ }
+
+ /* prepare argument execution */
+ off = 0;
+ foreach(arg, xexpr->named_args)
+ {
+ Expr *e = (Expr *) lfirst(arg);
+
+ ExecInitExprRec(e, state,
+ &scratch.d.xmlexpr.named_argvalue[off],
+ &scratch.d.xmlexpr.named_argnull[off]);
+ off++;
+ }
+
+ off = 0;
+ foreach(arg, xexpr->args)
+ {
+ Expr *e = (Expr *) lfirst(arg);
+
+ ExecInitExprRec(e, state,
+ &scratch.d.xmlexpr.argvalue[off],
+ &scratch.d.xmlexpr.argnull[off]);
+ off++;
+ }
+
+ /* and evaluate the actual XML expression */
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_NullTest:
+ {
+ NullTest *ntest = (NullTest *) node;
+
+ if (ntest->nulltesttype == IS_NULL)
+ {
+ if (ntest->argisrow)
+ scratch.opcode = EEOP_NULLTEST_ROWISNULL;
+ else
+ scratch.opcode = EEOP_NULLTEST_ISNULL;
+ }
+ else if (ntest->nulltesttype == IS_NOT_NULL)
+ {
+ if (ntest->argisrow)
+ scratch.opcode = EEOP_NULLTEST_ROWISNOTNULL;
+ else
+ scratch.opcode = EEOP_NULLTEST_ISNOTNULL;
+ }
+ else
+ {
+ elog(ERROR, "unrecognized nulltesttype: %d",
+ (int) ntest->nulltesttype);
+ }
+ /* initialize cache in case it's a row test */
+ scratch.d.nulltest_row.rowcache.cacheptr = NULL;
+
+ /* first evaluate argument into result variable */
+ ExecInitExprRec(ntest->arg, state,
+ resv, resnull);
+
+ /* then push the test of that argument */
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_BooleanTest:
+ {
+ BooleanTest *btest = (BooleanTest *) node;
+
+ /*
+ * Evaluate argument, directly into result datum. That's ok,
+ * because resv/resnull is definitely not used anywhere else,
+ * and will get overwritten by the below EEOP_BOOLTEST_IS_*
+ * step.
+ */
+ ExecInitExprRec(btest->arg, state, resv, resnull);
+
+ switch (btest->booltesttype)
+ {
+ case IS_TRUE:
+ scratch.opcode = EEOP_BOOLTEST_IS_TRUE;
+ break;
+ case IS_NOT_TRUE:
+ scratch.opcode = EEOP_BOOLTEST_IS_NOT_TRUE;
+ break;
+ case IS_FALSE:
+ scratch.opcode = EEOP_BOOLTEST_IS_FALSE;
+ break;
+ case IS_NOT_FALSE:
+ scratch.opcode = EEOP_BOOLTEST_IS_NOT_FALSE;
+ break;
+ case IS_UNKNOWN:
+ /* Same as scalar IS NULL test */
+ scratch.opcode = EEOP_NULLTEST_ISNULL;
+ break;
+ case IS_NOT_UNKNOWN:
+ /* Same as scalar IS NOT NULL test */
+ scratch.opcode = EEOP_NULLTEST_ISNOTNULL;
+ break;
+ default:
+ elog(ERROR, "unrecognized booltesttype: %d",
+ (int) btest->booltesttype);
+ }
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_CoerceToDomain:
+ {
+ CoerceToDomain *ctest = (CoerceToDomain *) node;
+
+ ExecInitCoerceToDomain(&scratch, ctest, state,
+ resv, resnull);
+ break;
+ }
+
+ case T_CoerceToDomainValue:
+ {
+ /*
+ * Read from location identified by innermost_domainval. Note
+ * that innermost_domainval could be NULL, if we're compiling
+ * a standalone domain check rather than one embedded in a
+ * larger expression. In that case we must read from
+ * econtext->domainValue_datum. We'll take care of that
+ * scenario at runtime.
+ */
+ scratch.opcode = EEOP_DOMAIN_TESTVAL;
+ /* we share instruction union variant with case testval */
+ scratch.d.casetest.value = state->innermost_domainval;
+ scratch.d.casetest.isnull = state->innermost_domainnull;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_CurrentOfExpr:
+ {
+ scratch.opcode = EEOP_CURRENTOFEXPR;
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ case T_NextValueExpr:
+ {
+ NextValueExpr *nve = (NextValueExpr *) node;
+
+ scratch.opcode = EEOP_NEXTVALUEEXPR;
+ scratch.d.nextvalueexpr.seqid = nve->seqid;
+ scratch.d.nextvalueexpr.seqtypid = nve->typeId;
+
+ ExprEvalPushStep(state, &scratch);
+ break;
+ }
+
+ default:
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(node));
+ break;
+ }
+}
+
+/*
+ * Add another expression evaluation step to ExprState->steps.
+ *
+ * Note that this potentially re-allocates es->steps, therefore no pointer
+ * into that array may be used while the expression is still being built.
+ */
+void
+ExprEvalPushStep(ExprState *es, const ExprEvalStep *s)
+{
+ if (es->steps_alloc == 0)
+ {
+ es->steps_alloc = 16;
+ es->steps = palloc(sizeof(ExprEvalStep) * es->steps_alloc);
+ }
+ else if (es->steps_alloc == es->steps_len)
+ {
+ es->steps_alloc *= 2;
+ es->steps = repalloc(es->steps,
+ sizeof(ExprEvalStep) * es->steps_alloc);
+ }
+
+ memcpy(&es->steps[es->steps_len++], s, sizeof(ExprEvalStep));
+}
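+/*
+ * Because of the possible repalloc() above, code in this file never keeps an
+ * ExprEvalStep pointer across a push. Instead it records the step's index,
+ * e.g. (sketch of the idiom used throughout this file):
+ *
+ *		ExprEvalPushStep(state, &scratch);
+ *		adjust_jumps = lappend_int(adjust_jumps, state->steps_len - 1);
+ *		...
+ *		state->steps[lfirst_int(lc)].d.jump.jumpdone = state->steps_len;
+ */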
+
+/*
+ * Perform setup necessary for the evaluation of a function-like expression,
+ * appending argument evaluation steps to the steps list in *state, and
+ * setting up *scratch so it is ready to be pushed.
+ *
+ * *scratch is not pushed here, so that callers may override the opcode,
+ * which is useful for function-like cases like DISTINCT.
+ */
+static void
+ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, Oid funcid,
+ Oid inputcollid, ExprState *state)
+{
+ int nargs = list_length(args);
+ AclResult aclresult;
+ FmgrInfo *flinfo;
+ FunctionCallInfo fcinfo;
+ int argno;
+ ListCell *lc;
+
+ /* Check permission to call function */
+ aclresult = pg_proc_aclcheck(funcid, GetUserId(), ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(funcid));
+ InvokeFunctionExecuteHook(funcid);
+
+ /*
+	 * Safety check on nargs. Under normal circumstances this should never
+	 * fail, as the parser should have checked sooner. But it might
+	 * conceivably fail if the server has been compiled with a FUNC_MAX_ARGS
+	 * smaller than some functions declared in pg_proc.
+ */
+ if (nargs > FUNC_MAX_ARGS)
+ ereport(ERROR,
+ (errcode(ERRCODE_TOO_MANY_ARGUMENTS),
+ errmsg_plural("cannot pass more than %d argument to a function",
+ "cannot pass more than %d arguments to a function",
+ FUNC_MAX_ARGS,
+ FUNC_MAX_ARGS)));
+
+ /* Allocate function lookup data and parameter workspace for this call */
+ scratch->d.func.finfo = palloc0(sizeof(FmgrInfo));
+ scratch->d.func.fcinfo_data = palloc0(SizeForFunctionCallInfo(nargs));
+ flinfo = scratch->d.func.finfo;
+ fcinfo = scratch->d.func.fcinfo_data;
+
+ /* Set up the primary fmgr lookup information */
+ fmgr_info(funcid, flinfo);
+ fmgr_info_set_expr((Node *) node, flinfo);
+
+ /* Initialize function call parameter structure too */
+ InitFunctionCallInfoData(*fcinfo, flinfo,
+ nargs, inputcollid, NULL, NULL);
+
+ /* Keep extra copies of this info to save an indirection at runtime */
+ scratch->d.func.fn_addr = flinfo->fn_addr;
+ scratch->d.func.nargs = nargs;
+
+ /* We only support non-set functions here */
+ if (flinfo->fn_retset)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set"),
+ state->parent ?
+ executor_errposition(state->parent->state,
+ exprLocation((Node *) node)) : 0));
+
+ /* Build code to evaluate arguments directly into the fcinfo struct */
+ argno = 0;
+ foreach(lc, args)
+ {
+ Expr *arg = (Expr *) lfirst(lc);
+
+ if (IsA(arg, Const))
+ {
+ /*
+ * Don't evaluate const arguments every round; especially
+ * interesting for constants in comparisons.
+ */
+ Const *con = (Const *) arg;
+
+ fcinfo->args[argno].value = con->constvalue;
+ fcinfo->args[argno].isnull = con->constisnull;
+ }
+ else
+ {
+ ExecInitExprRec(arg, state,
+ &fcinfo->args[argno].value,
+ &fcinfo->args[argno].isnull);
+ }
+ argno++;
+ }
+
+ /* Insert appropriate opcode depending on strictness and stats level */
+ if (pgstat_track_functions <= flinfo->fn_stats)
+ {
+ if (flinfo->fn_strict && nargs > 0)
+ scratch->opcode = EEOP_FUNCEXPR_STRICT;
+ else
+ scratch->opcode = EEOP_FUNCEXPR;
+ }
+ else
+ {
+ if (flinfo->fn_strict && nargs > 0)
+ scratch->opcode = EEOP_FUNCEXPR_STRICT_FUSAGE;
+ else
+ scratch->opcode = EEOP_FUNCEXPR_FUSAGE;
+ }
+}
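+/*
+ * Rough sketch (simplified): for a call such as int4eq(x, 42), the code above
+ * leaves the Const argument pre-loaded in fcinfo->args[1], emits a step
+ * evaluating x into fcinfo->args[0], and selects EEOP_FUNCEXPR_STRICT (int4eq
+ * being strict and having arguments), or a _FUSAGE variant when function
+ * usage statistics are being tracked. The caller then pushes the step.
+ */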
+
+/*
+ * Add expression steps deforming the ExprState's inner/outer/scan slots
+ * as much as required by the expression.
+ */
+static void
+ExecInitExprSlots(ExprState *state, Node *node)
+{
+ LastAttnumInfo info = {0, 0, 0};
+
+ /*
+ * Figure out which attributes we're going to need.
+ */
+ get_last_attnums_walker(node, &info);
+
+ ExecPushExprSlots(state, &info);
+}
+
+/*
+ * Add steps deforming the ExprState's inner/outer/scan slots as much as
+ * indicated by info. This is useful when building an ExprState covering more
+ * than one expression.
+ */
+static void
+ExecPushExprSlots(ExprState *state, LastAttnumInfo *info)
+{
+ ExprEvalStep scratch = {0};
+
+ scratch.resvalue = NULL;
+ scratch.resnull = NULL;
+
+ /* Emit steps as needed */
+ if (info->last_inner > 0)
+ {
+ scratch.opcode = EEOP_INNER_FETCHSOME;
+ scratch.d.fetch.last_var = info->last_inner;
+ scratch.d.fetch.fixed = false;
+ scratch.d.fetch.kind = NULL;
+ scratch.d.fetch.known_desc = NULL;
+ if (ExecComputeSlotInfo(state, &scratch))
+ ExprEvalPushStep(state, &scratch);
+ }
+ if (info->last_outer > 0)
+ {
+ scratch.opcode = EEOP_OUTER_FETCHSOME;
+ scratch.d.fetch.last_var = info->last_outer;
+ scratch.d.fetch.fixed = false;
+ scratch.d.fetch.kind = NULL;
+ scratch.d.fetch.known_desc = NULL;
+ if (ExecComputeSlotInfo(state, &scratch))
+ ExprEvalPushStep(state, &scratch);
+ }
+ if (info->last_scan > 0)
+ {
+ scratch.opcode = EEOP_SCAN_FETCHSOME;
+ scratch.d.fetch.last_var = info->last_scan;
+ scratch.d.fetch.fixed = false;
+ scratch.d.fetch.kind = NULL;
+ scratch.d.fetch.known_desc = NULL;
+ if (ExecComputeSlotInfo(state, &scratch))
+ ExprEvalPushStep(state, &scratch);
+ }
+}
+
+/*
+ * get_last_attnums_walker: expression walker for ExecInitExprSlots
+ */
+static bool
+get_last_attnums_walker(Node *node, LastAttnumInfo *info)
+{
+ if (node == NULL)
+ return false;
+ if (IsA(node, Var))
+ {
+ Var *variable = (Var *) node;
+ AttrNumber attnum = variable->varattno;
+
+ switch (variable->varno)
+ {
+ case INNER_VAR:
+ info->last_inner = Max(info->last_inner, attnum);
+ break;
+
+ case OUTER_VAR:
+ info->last_outer = Max(info->last_outer, attnum);
+ break;
+
+ /* INDEX_VAR is handled by default case */
+
+ default:
+ info->last_scan = Max(info->last_scan, attnum);
+ break;
+ }
+ return false;
+ }
+
+ /*
+ * Don't examine the arguments or filters of Aggrefs or WindowFuncs,
+ * because those do not represent expressions to be evaluated within the
+ * calling expression's econtext. GroupingFunc arguments are never
+ * evaluated at all.
+ */
+ if (IsA(node, Aggref))
+ return false;
+ if (IsA(node, WindowFunc))
+ return false;
+ if (IsA(node, GroupingFunc))
+ return false;
+ return expression_tree_walker(node, get_last_attnums_walker,
+ (void *) info);
+}
+
+/*
+ * Compute additional information for EEOP_*_FETCHSOME ops.
+ *
+ * The goal is to determine whether a slot is 'fixed', that is, every
+ * evaluation of the expression will have the same type of slot, with an
+ * equivalent descriptor.
+ *
+ * Returns true if the deforming step is required, false otherwise.
+ */
+static bool
+ExecComputeSlotInfo(ExprState *state, ExprEvalStep *op)
+{
+ PlanState *parent = state->parent;
+ TupleDesc desc = NULL;
+ const TupleTableSlotOps *tts_ops = NULL;
+ bool isfixed = false;
+ ExprEvalOp opcode = op->opcode;
+
+ Assert(opcode == EEOP_INNER_FETCHSOME ||
+ opcode == EEOP_OUTER_FETCHSOME ||
+ opcode == EEOP_SCAN_FETCHSOME);
+
+ if (op->d.fetch.known_desc != NULL)
+ {
+ desc = op->d.fetch.known_desc;
+ tts_ops = op->d.fetch.kind;
+ isfixed = op->d.fetch.kind != NULL;
+ }
+ else if (!parent)
+ {
+ isfixed = false;
+ }
+ else if (opcode == EEOP_INNER_FETCHSOME)
+ {
+ PlanState *is = innerPlanState(parent);
+
+ if (parent->inneropsset && !parent->inneropsfixed)
+ {
+ isfixed = false;
+ }
+ else if (parent->inneropsset && parent->innerops)
+ {
+ isfixed = true;
+ tts_ops = parent->innerops;
+ desc = ExecGetResultType(is);
+ }
+ else if (is)
+ {
+ tts_ops = ExecGetResultSlotOps(is, &isfixed);
+ desc = ExecGetResultType(is);
+ }
+ }
+ else if (opcode == EEOP_OUTER_FETCHSOME)
+ {
+ PlanState *os = outerPlanState(parent);
+
+ if (parent->outeropsset && !parent->outeropsfixed)
+ {
+ isfixed = false;
+ }
+ else if (parent->outeropsset && parent->outerops)
+ {
+ isfixed = true;
+ tts_ops = parent->outerops;
+ desc = ExecGetResultType(os);
+ }
+ else if (os)
+ {
+ tts_ops = ExecGetResultSlotOps(os, &isfixed);
+ desc = ExecGetResultType(os);
+ }
+ }
+ else if (opcode == EEOP_SCAN_FETCHSOME)
+ {
+ desc = parent->scandesc;
+
+ if (parent->scanops)
+ tts_ops = parent->scanops;
+
+ if (parent->scanopsset)
+ isfixed = parent->scanopsfixed;
+ }
+
+ if (isfixed && desc != NULL && tts_ops != NULL)
+ {
+ op->d.fetch.fixed = true;
+ op->d.fetch.kind = tts_ops;
+ op->d.fetch.known_desc = desc;
+ }
+ else
+ {
+ op->d.fetch.fixed = false;
+ op->d.fetch.kind = NULL;
+ op->d.fetch.known_desc = NULL;
+ }
+
+	/* if the slot is known to always be virtual we never need to deform */
+ if (op->d.fetch.fixed && op->d.fetch.kind == &TTSOpsVirtual)
+ return false;
+
+ return true;
+}
+
+/*
+ * Prepare step for the evaluation of a whole-row variable.
+ * The caller still has to push the step.
+ */
+static void
+ExecInitWholeRowVar(ExprEvalStep *scratch, Var *variable, ExprState *state)
+{
+ PlanState *parent = state->parent;
+
+ /* fill in all but the target */
+ scratch->opcode = EEOP_WHOLEROW;
+ scratch->d.wholerow.var = variable;
+ scratch->d.wholerow.first = true;
+ scratch->d.wholerow.slow = false;
+ scratch->d.wholerow.tupdesc = NULL; /* filled at runtime */
+ scratch->d.wholerow.junkFilter = NULL;
+
+ /*
+ * If the input tuple came from a subquery, it might contain "resjunk"
+ * columns (such as GROUP BY or ORDER BY columns), which we don't want to
+ * keep in the whole-row result. We can get rid of such columns by
+ * passing the tuple through a JunkFilter --- but to make one, we have to
+ * lay our hands on the subquery's targetlist. Fortunately, there are not
+ * very many cases where this can happen, and we can identify all of them
+ * by examining our parent PlanState. We assume this is not an issue in
+ * standalone expressions that don't have parent plans. (Whole-row Vars
+ * can occur in such expressions, but they will always be referencing
+ * table rows.)
+ */
+ if (parent)
+ {
+ PlanState *subplan = NULL;
+
+ switch (nodeTag(parent))
+ {
+ case T_SubqueryScanState:
+ subplan = ((SubqueryScanState *) parent)->subplan;
+ break;
+ case T_CteScanState:
+ subplan = ((CteScanState *) parent)->cteplanstate;
+ break;
+ default:
+ break;
+ }
+
+ if (subplan)
+ {
+ bool junk_filter_needed = false;
+ ListCell *tlist;
+
+ /* Detect whether subplan tlist actually has any junk columns */
+ foreach(tlist, subplan->plan->targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(tlist);
+
+ if (tle->resjunk)
+ {
+ junk_filter_needed = true;
+ break;
+ }
+ }
+
+ /* If so, build the junkfilter now */
+ if (junk_filter_needed)
+ {
+ scratch->d.wholerow.junkFilter =
+ ExecInitJunkFilter(subplan->plan->targetlist,
+ ExecInitExtraTupleSlot(parent->state, NULL,
+ &TTSOpsVirtual));
+ }
+ }
+ }
+}
+
+/*
+ * Prepare evaluation of a SubscriptingRef expression.
+ */
+static void
+ExecInitSubscriptingRef(ExprEvalStep *scratch, SubscriptingRef *sbsref,
+ ExprState *state, Datum *resv, bool *resnull)
+{
+ bool isAssignment = (sbsref->refassgnexpr != NULL);
+ int nupper = list_length(sbsref->refupperindexpr);
+ int nlower = list_length(sbsref->reflowerindexpr);
+ const SubscriptRoutines *sbsroutines;
+ SubscriptingRefState *sbsrefstate;
+ SubscriptExecSteps methods;
+ char *ptr;
+ List *adjust_jumps = NIL;
+ ListCell *lc;
+ int i;
+
+ /* Look up the subscripting support methods */
+ sbsroutines = getSubscriptingRoutines(sbsref->refcontainertype, NULL);
+ if (!sbsroutines)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("cannot subscript type %s because it does not support subscripting",
+ format_type_be(sbsref->refcontainertype)),
+ state->parent ?
+ executor_errposition(state->parent->state,
+ exprLocation((Node *) sbsref)) : 0));
+
+ /* Allocate sbsrefstate, with enough space for per-subscript arrays too */
+ sbsrefstate = palloc0(MAXALIGN(sizeof(SubscriptingRefState)) +
+ (nupper + nlower) * (sizeof(Datum) +
+ 2 * sizeof(bool)));
+
+ /* Fill constant fields of SubscriptingRefState */
+ sbsrefstate->isassignment = isAssignment;
+ sbsrefstate->numupper = nupper;
+ sbsrefstate->numlower = nlower;
+ /* Set up per-subscript arrays */
+ ptr = ((char *) sbsrefstate) + MAXALIGN(sizeof(SubscriptingRefState));
+ sbsrefstate->upperindex = (Datum *) ptr;
+ ptr += nupper * sizeof(Datum);
+ sbsrefstate->lowerindex = (Datum *) ptr;
+ ptr += nlower * sizeof(Datum);
+ sbsrefstate->upperprovided = (bool *) ptr;
+ ptr += nupper * sizeof(bool);
+ sbsrefstate->lowerprovided = (bool *) ptr;
+ ptr += nlower * sizeof(bool);
+ sbsrefstate->upperindexnull = (bool *) ptr;
+ ptr += nupper * sizeof(bool);
+ sbsrefstate->lowerindexnull = (bool *) ptr;
+ /* ptr += nlower * sizeof(bool); */
+
+ /*
+ * Let the container-type-specific code have a chance. It must fill the
+ * "methods" struct with function pointers for us to possibly use in
+ * execution steps below; and it can optionally set up some data pointed
+ * to by the workspace field.
+ */
+ memset(&methods, 0, sizeof(methods));
+ sbsroutines->exec_setup(sbsref, sbsrefstate, &methods);
+
+ /*
+ * Evaluate array input. It's safe to do so into resv/resnull, because we
+ * won't use that as target for any of the other subexpressions, and it'll
+ * be overwritten by the final EEOP_SBSREF_FETCH/ASSIGN step, which is
+ * pushed last.
+ */
+ ExecInitExprRec(sbsref->refexpr, state, resv, resnull);
+
+ /*
+ * If refexpr yields NULL, and the operation should be strict, then result
+ * is NULL. We can implement this with just JUMP_IF_NULL, since we
+ * evaluated the array into the desired target location.
+ */
+ if (!isAssignment && sbsroutines->fetch_strict)
+ {
+ scratch->opcode = EEOP_JUMP_IF_NULL;
+ scratch->d.jump.jumpdone = -1; /* adjust later */
+ ExprEvalPushStep(state, scratch);
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ }
+
+ /* Evaluate upper subscripts */
+ i = 0;
+ foreach(lc, sbsref->refupperindexpr)
+ {
+ Expr *e = (Expr *) lfirst(lc);
+
+ /* When slicing, individual subscript bounds can be omitted */
+ if (!e)
+ {
+ sbsrefstate->upperprovided[i] = false;
+ sbsrefstate->upperindexnull[i] = true;
+ }
+ else
+ {
+ sbsrefstate->upperprovided[i] = true;
+ /* Each subscript is evaluated into appropriate array entry */
+ ExecInitExprRec(e, state,
+ &sbsrefstate->upperindex[i],
+ &sbsrefstate->upperindexnull[i]);
+ }
+ i++;
+ }
+
+ /* Evaluate lower subscripts similarly */
+ i = 0;
+ foreach(lc, sbsref->reflowerindexpr)
+ {
+ Expr *e = (Expr *) lfirst(lc);
+
+ /* When slicing, individual subscript bounds can be omitted */
+ if (!e)
+ {
+ sbsrefstate->lowerprovided[i] = false;
+ sbsrefstate->lowerindexnull[i] = true;
+ }
+ else
+ {
+ sbsrefstate->lowerprovided[i] = true;
+ /* Each subscript is evaluated into appropriate array entry */
+ ExecInitExprRec(e, state,
+ &sbsrefstate->lowerindex[i],
+ &sbsrefstate->lowerindexnull[i]);
+ }
+ i++;
+ }
+
+ /* SBSREF_SUBSCRIPTS checks and converts all the subscripts at once */
+ if (methods.sbs_check_subscripts)
+ {
+ scratch->opcode = EEOP_SBSREF_SUBSCRIPTS;
+ scratch->d.sbsref_subscript.subscriptfunc = methods.sbs_check_subscripts;
+ scratch->d.sbsref_subscript.state = sbsrefstate;
+ scratch->d.sbsref_subscript.jumpdone = -1; /* adjust later */
+ ExprEvalPushStep(state, scratch);
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ }
+
+ if (isAssignment)
+ {
+ Datum *save_innermost_caseval;
+ bool *save_innermost_casenull;
+
+ /* Check for unimplemented methods */
+ if (!methods.sbs_assign)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("type %s does not support subscripted assignment",
+ format_type_be(sbsref->refcontainertype))));
+
+ /*
+ * We might have a nested-assignment situation, in which the
+ * refassgnexpr is itself a FieldStore or SubscriptingRef that needs
+ * to obtain and modify the previous value of the array element or
+ * slice being replaced. If so, we have to extract that value from
+ * the array and pass it down via the CaseTestExpr mechanism. It's
+ * safe to reuse the CASE mechanism because there cannot be a CASE
+ * between here and where the value would be needed, and an array
+ * assignment can't be within a CASE either. (So saving and restoring
+ * innermost_caseval is just paranoia, but let's do it anyway.)
+ *
+ * Since fetching the old element might be a nontrivial expense, do it
+ * only if the argument actually needs it.
+ */
+ if (isAssignmentIndirectionExpr(sbsref->refassgnexpr))
+ {
+ if (!methods.sbs_fetch_old)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("type %s does not support subscripted assignment",
+ format_type_be(sbsref->refcontainertype))));
+ scratch->opcode = EEOP_SBSREF_OLD;
+ scratch->d.sbsref.subscriptfunc = methods.sbs_fetch_old;
+ scratch->d.sbsref.state = sbsrefstate;
+ ExprEvalPushStep(state, scratch);
+ }
+
+ /* SBSREF_OLD puts extracted value into prevvalue/prevnull */
+ save_innermost_caseval = state->innermost_caseval;
+ save_innermost_casenull = state->innermost_casenull;
+ state->innermost_caseval = &sbsrefstate->prevvalue;
+ state->innermost_casenull = &sbsrefstate->prevnull;
+
+ /* evaluate replacement value into replacevalue/replacenull */
+ ExecInitExprRec(sbsref->refassgnexpr, state,
+ &sbsrefstate->replacevalue, &sbsrefstate->replacenull);
+
+ state->innermost_caseval = save_innermost_caseval;
+ state->innermost_casenull = save_innermost_casenull;
+
+ /* and perform the assignment */
+ scratch->opcode = EEOP_SBSREF_ASSIGN;
+ scratch->d.sbsref.subscriptfunc = methods.sbs_assign;
+ scratch->d.sbsref.state = sbsrefstate;
+ ExprEvalPushStep(state, scratch);
+ }
+ else
+ {
+ /* array fetch is much simpler */
+ scratch->opcode = EEOP_SBSREF_FETCH;
+ scratch->d.sbsref.subscriptfunc = methods.sbs_fetch;
+ scratch->d.sbsref.state = sbsrefstate;
+ ExprEvalPushStep(state, scratch);
+ }
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ if (as->opcode == EEOP_SBSREF_SUBSCRIPTS)
+ {
+ Assert(as->d.sbsref_subscript.jumpdone == -1);
+ as->d.sbsref_subscript.jumpdone = state->steps_len;
+ }
+ else
+ {
+ Assert(as->opcode == EEOP_JUMP_IF_NULL);
+ Assert(as->d.jump.jumpdone == -1);
+ as->d.jump.jumpdone = state->steps_len;
+ }
+ }
+}
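+/*
+ * Rough sketch (simplified): a plain fetch such as arr[i] compiles to
+ *		eval arr into resv/resnull
+ *		EEOP_JUMP_IF_NULL		-> END	(when the fetch method is strict)
+ *		eval i into upperindex[0]/upperindexnull[0]
+ *		EEOP_SBSREF_SUBSCRIPTS	-> END	(when the subscript check fails)
+ *		EEOP_SBSREF_FETCH
+ *	END: (next step)
+ * An assignment instead evaluates the replacement value (optionally after
+ * EEOP_SBSREF_OLD has extracted the old element) and finishes with
+ * EEOP_SBSREF_ASSIGN.
+ */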
+
+/*
+ * Helper for preparing SubscriptingRef expressions for evaluation: is expr
+ * a nested FieldStore or SubscriptingRef that needs the old element value
+ * passed down?
+ *
+ * (We could use this in FieldStore too, but in that case passing the old
+ * value is so cheap there's no need.)
+ *
+ * Note: it might seem that this needs to recurse, but in most cases it does
+ * not; the CaseTestExpr, if any, will be directly the arg or refexpr of the
+ * top-level node. Nested-assignment situations give rise to expression
+ * trees in which each level of assignment has its own CaseTestExpr, and the
+ * recursive structure appears within the newvals or refassgnexpr field.
+ * There is an exception, though: if the array is an array-of-domain, we will
+ * have a CoerceToDomain as the refassgnexpr, and we need to be able to look
+ * through that.
+ */
+static bool
+isAssignmentIndirectionExpr(Expr *expr)
+{
+ if (expr == NULL)
+ return false; /* just paranoia */
+ if (IsA(expr, FieldStore))
+ {
+ FieldStore *fstore = (FieldStore *) expr;
+
+ if (fstore->arg && IsA(fstore->arg, CaseTestExpr))
+ return true;
+ }
+ else if (IsA(expr, SubscriptingRef))
+ {
+ SubscriptingRef *sbsRef = (SubscriptingRef *) expr;
+
+ if (sbsRef->refexpr && IsA(sbsRef->refexpr, CaseTestExpr))
+ return true;
+ }
+ else if (IsA(expr, CoerceToDomain))
+ {
+ CoerceToDomain *cd = (CoerceToDomain *) expr;
+
+ return isAssignmentIndirectionExpr(cd->arg);
+ }
+ return false;
+}
+
+/*
+ * Prepare evaluation of a CoerceToDomain expression.
+ */
+static void
+ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest,
+ ExprState *state, Datum *resv, bool *resnull)
+{
+ DomainConstraintRef *constraint_ref;
+ Datum *domainval = NULL;
+ bool *domainnull = NULL;
+ ListCell *l;
+
+ scratch->d.domaincheck.resulttype = ctest->resulttype;
+ /* we'll allocate workspace only if needed */
+ scratch->d.domaincheck.checkvalue = NULL;
+ scratch->d.domaincheck.checknull = NULL;
+
+ /*
+	 * Evaluate the argument - it's fine to store it directly into
+	 * resv/resnull: if there are constraint failures we'll error out,
+	 * otherwise it's exactly what needs to be returned.
+ */
+ ExecInitExprRec(ctest->arg, state, resv, resnull);
+
+ /*
+ * Note: if the argument is of varlena type, it could be a R/W expanded
+ * object. We want to return the R/W pointer as the final result, but we
+ * have to pass a R/O pointer as the value to be tested by any functions
+ * in check expressions. We don't bother to emit a MAKE_READONLY step
+ * unless there's actually at least one check expression, though. Until
+ * we've tested that, domainval/domainnull are NULL.
+ */
+
+ /*
+ * Collect the constraints associated with the domain.
+ *
+ * Note: before PG v10 we'd recheck the set of constraints during each
+ * evaluation of the expression. Now we bake them into the ExprState
+ * during executor initialization. That means we don't need typcache.c to
+ * provide compiled exprs.
+ */
+ constraint_ref = (DomainConstraintRef *)
+ palloc(sizeof(DomainConstraintRef));
+ InitDomainConstraintRef(ctest->resulttype,
+ constraint_ref,
+ CurrentMemoryContext,
+ false);
+
+ /*
+ * Compile code to check each domain constraint. NOTNULL constraints can
+ * just be applied on the resv/resnull value, but for CHECK constraints we
+ * need more pushups.
+ */
+ foreach(l, constraint_ref->constraints)
+ {
+ DomainConstraintState *con = (DomainConstraintState *) lfirst(l);
+ Datum *save_innermost_domainval;
+ bool *save_innermost_domainnull;
+
+ scratch->d.domaincheck.constraintname = con->name;
+
+ switch (con->constrainttype)
+ {
+ case DOM_CONSTRAINT_NOTNULL:
+ scratch->opcode = EEOP_DOMAIN_NOTNULL;
+ ExprEvalPushStep(state, scratch);
+ break;
+ case DOM_CONSTRAINT_CHECK:
+ /* Allocate workspace for CHECK output if we didn't yet */
+ if (scratch->d.domaincheck.checkvalue == NULL)
+ {
+ scratch->d.domaincheck.checkvalue =
+ (Datum *) palloc(sizeof(Datum));
+ scratch->d.domaincheck.checknull =
+ (bool *) palloc(sizeof(bool));
+ }
+
+ /*
+ * If first time through, determine where CoerceToDomainValue
+ * nodes should read from.
+ */
+ if (domainval == NULL)
+ {
+ /*
+ * Since value might be read multiple times, force to R/O
+ * - but only if it could be an expanded datum.
+ */
+ if (get_typlen(ctest->resulttype) == -1)
+ {
+ ExprEvalStep scratch2 = {0};
+
+ /* Yes, so make output workspace for MAKE_READONLY */
+ domainval = (Datum *) palloc(sizeof(Datum));
+ domainnull = (bool *) palloc(sizeof(bool));
+
+ /* Emit MAKE_READONLY */
+ scratch2.opcode = EEOP_MAKE_READONLY;
+ scratch2.resvalue = domainval;
+ scratch2.resnull = domainnull;
+ scratch2.d.make_readonly.value = resv;
+ scratch2.d.make_readonly.isnull = resnull;
+ ExprEvalPushStep(state, &scratch2);
+ }
+ else
+ {
+ /* No, so it's fine to read from resv/resnull */
+ domainval = resv;
+ domainnull = resnull;
+ }
+ }
+
+ /*
+ * Set up value to be returned by CoerceToDomainValue nodes.
+ * We must save and restore innermost_domainval/null fields,
+ * in case this node is itself within a check expression for
+ * another domain.
+ */
+ save_innermost_domainval = state->innermost_domainval;
+ save_innermost_domainnull = state->innermost_domainnull;
+ state->innermost_domainval = domainval;
+ state->innermost_domainnull = domainnull;
+
+ /* evaluate check expression value */
+ ExecInitExprRec(con->check_expr, state,
+ scratch->d.domaincheck.checkvalue,
+ scratch->d.domaincheck.checknull);
+
+ state->innermost_domainval = save_innermost_domainval;
+ state->innermost_domainnull = save_innermost_domainnull;
+
+ /* now test result */
+ scratch->opcode = EEOP_DOMAIN_CHECK;
+ ExprEvalPushStep(state, scratch);
+
+ break;
+ default:
+ elog(ERROR, "unrecognized constraint type: %d",
+ (int) con->constrainttype);
+ break;
+ }
+ }
+}
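+/*
+ * Rough sketch (simplified): coercing to a domain with a NOT NULL and one
+ * CHECK constraint yields roughly
+ *		eval argument into resv/resnull
+ *		EEOP_DOMAIN_NOTNULL
+ *		EEOP_MAKE_READONLY		(only for varlena result types)
+ *		eval check expr into checkvalue/checknull
+ *		EEOP_DOMAIN_CHECK
+ * with CoerceToDomainValue nodes inside the check expression reading the
+ * (read-only copy of the) value being tested.
+ */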
+
+/*
+ * Build transition/combine function invocations for all aggregate transition
+ * / combination function invocations in a grouping sets phase. This has to
+ * invoke all sort based transitions in a phase (if doSort is true), all hash
+ * based transitions (if doHash is true), or both (both true).
+ *
+ * The resulting expression will, for each set of transition values, first
+ * check for filters, evaluate aggregate input, check that that input is not
+ * NULL for a strict transition function, and then finally invoke the
+ * transition for each of the concurrently computed grouping sets.
+ *
+ * If nullcheck is true, the generated code will check for a NULL pointer to
+ * the array of AggStatePerGroup, and skip evaluation if so.
+ */
+ExprState *
+ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase,
+ bool doSort, bool doHash, bool nullcheck)
+{
+ ExprState *state = makeNode(ExprState);
+ PlanState *parent = &aggstate->ss.ps;
+ ExprEvalStep scratch = {0};
+ bool isCombine = DO_AGGSPLIT_COMBINE(aggstate->aggsplit);
+ LastAttnumInfo deform = {0, 0, 0};
+
+ state->expr = (Expr *) aggstate;
+ state->parent = parent;
+
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+
+ /*
+ * First figure out which slots, and how many columns from each, we're
+ * going to need.
+ */
+ for (int transno = 0; transno < aggstate->numtrans; transno++)
+ {
+ AggStatePerTrans pertrans = &aggstate->pertrans[transno];
+
+ get_last_attnums_walker((Node *) pertrans->aggref->aggdirectargs,
+ &deform);
+ get_last_attnums_walker((Node *) pertrans->aggref->args,
+ &deform);
+ get_last_attnums_walker((Node *) pertrans->aggref->aggorder,
+ &deform);
+ get_last_attnums_walker((Node *) pertrans->aggref->aggdistinct,
+ &deform);
+ get_last_attnums_walker((Node *) pertrans->aggref->aggfilter,
+ &deform);
+ }
+ ExecPushExprSlots(state, &deform);
+
+ /*
+ * Emit instructions for each transition value / grouping set combination.
+ */
+ for (int transno = 0; transno < aggstate->numtrans; transno++)
+ {
+ AggStatePerTrans pertrans = &aggstate->pertrans[transno];
+ FunctionCallInfo trans_fcinfo = pertrans->transfn_fcinfo;
+ List *adjust_bailout = NIL;
+ NullableDatum *strictargs = NULL;
+ bool *strictnulls = NULL;
+ int argno;
+ ListCell *bail;
+
+ /*
+ * If filter present, emit. Do so before evaluating the input, to
+ * avoid potentially unneeded computations, or even worse, unintended
+ * side-effects. When combining, all the necessary filtering has
+ * already been done.
+ */
+ if (pertrans->aggref->aggfilter && !isCombine)
+ {
+ /* evaluate filter expression */
+ ExecInitExprRec(pertrans->aggref->aggfilter, state,
+ &state->resvalue, &state->resnull);
+ /* and jump out if false */
+ scratch.opcode = EEOP_JUMP_IF_NOT_TRUE;
+ scratch.d.jump.jumpdone = -1; /* adjust later */
+ ExprEvalPushStep(state, &scratch);
+ adjust_bailout = lappend_int(adjust_bailout,
+ state->steps_len - 1);
+ }
+
+ /*
+ * Evaluate arguments to aggregate/combine function.
+ */
+ argno = 0;
+ if (isCombine)
+ {
+ /*
+			 * Combining two aggregate transition values. Instead of coming
+			 * directly from a tuple, the input is a (potentially
+			 * deserialized) transition value.
+ */
+ TargetEntry *source_tle;
+
+ Assert(pertrans->numSortCols == 0);
+ Assert(list_length(pertrans->aggref->args) == 1);
+
+ strictargs = trans_fcinfo->args + 1;
+ source_tle = (TargetEntry *) linitial(pertrans->aggref->args);
+
+ /*
+ * deserialfn_oid will be set if we must deserialize the input
+ * state before calling the combine function.
+ */
+ if (!OidIsValid(pertrans->deserialfn_oid))
+ {
+ /*
+ * Start from 1, since the 0th arg will be the transition
+ * value
+ */
+ ExecInitExprRec(source_tle->expr, state,
+ &trans_fcinfo->args[argno + 1].value,
+ &trans_fcinfo->args[argno + 1].isnull);
+ }
+ else
+ {
+ FunctionCallInfo ds_fcinfo = pertrans->deserialfn_fcinfo;
+
+ /* evaluate argument */
+ ExecInitExprRec(source_tle->expr, state,
+ &ds_fcinfo->args[0].value,
+ &ds_fcinfo->args[0].isnull);
+
+ /* Dummy second argument for type-safety reasons */
+ ds_fcinfo->args[1].value = PointerGetDatum(NULL);
+ ds_fcinfo->args[1].isnull = false;
+
+ /*
+ * Don't call a strict deserialization function with NULL
+ * input
+ */
+ if (pertrans->deserialfn.fn_strict)
+ scratch.opcode = EEOP_AGG_STRICT_DESERIALIZE;
+ else
+ scratch.opcode = EEOP_AGG_DESERIALIZE;
+
+ scratch.d.agg_deserialize.fcinfo_data = ds_fcinfo;
+ scratch.d.agg_deserialize.jumpnull = -1; /* adjust later */
+ scratch.resvalue = &trans_fcinfo->args[argno + 1].value;
+ scratch.resnull = &trans_fcinfo->args[argno + 1].isnull;
+
+ ExprEvalPushStep(state, &scratch);
+ /* don't add an adjustment unless the function is strict */
+ if (pertrans->deserialfn.fn_strict)
+ adjust_bailout = lappend_int(adjust_bailout,
+ state->steps_len - 1);
+
+ /* restore normal settings of scratch fields */
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+ }
+ argno++;
+ }
+ else if (pertrans->numSortCols == 0)
+ {
+ ListCell *arg;
+
+ /*
+ * Normal transition function without ORDER BY / DISTINCT.
+ */
+ strictargs = trans_fcinfo->args + 1;
+
+ foreach(arg, pertrans->aggref->args)
+ {
+ TargetEntry *source_tle = (TargetEntry *) lfirst(arg);
+
+ /*
+ * Start from 1, since the 0th arg will be the transition
+ * value
+ */
+ ExecInitExprRec(source_tle->expr, state,
+ &trans_fcinfo->args[argno + 1].value,
+ &trans_fcinfo->args[argno + 1].isnull);
+ argno++;
+ }
+ }
+ else if (pertrans->numInputs == 1)
+ {
+ /*
+ * DISTINCT and/or ORDER BY case, with a single column sorted on.
+ */
+ TargetEntry *source_tle =
+ (TargetEntry *) linitial(pertrans->aggref->args);
+
+ Assert(list_length(pertrans->aggref->args) == 1);
+
+ ExecInitExprRec(source_tle->expr, state,
+ &state->resvalue,
+ &state->resnull);
+ strictnulls = &state->resnull;
+ argno++;
+ }
+ else
+ {
+ /*
+ * DISTINCT and/or ORDER BY case, with multiple columns sorted on.
+ */
+ Datum *values = pertrans->sortslot->tts_values;
+ bool *nulls = pertrans->sortslot->tts_isnull;
+ ListCell *arg;
+
+ strictnulls = nulls;
+
+ foreach(arg, pertrans->aggref->args)
+ {
+ TargetEntry *source_tle = (TargetEntry *) lfirst(arg);
+
+ ExecInitExprRec(source_tle->expr, state,
+ &values[argno], &nulls[argno]);
+ argno++;
+ }
+ }
+ Assert(pertrans->numInputs == argno);
+
+ /*
+ * For a strict transfn, nothing happens when there's a NULL input; we
+ * just keep the prior transValue. This is true for both plain and
+ * sorted/distinct aggregates.
+ */
+ if (trans_fcinfo->flinfo->fn_strict && pertrans->numTransInputs > 0)
+ {
+ if (strictnulls)
+ scratch.opcode = EEOP_AGG_STRICT_INPUT_CHECK_NULLS;
+ else
+ scratch.opcode = EEOP_AGG_STRICT_INPUT_CHECK_ARGS;
+ scratch.d.agg_strict_input_check.nulls = strictnulls;
+ scratch.d.agg_strict_input_check.args = strictargs;
+ scratch.d.agg_strict_input_check.jumpnull = -1; /* adjust later */
+ scratch.d.agg_strict_input_check.nargs = pertrans->numTransInputs;
+ ExprEvalPushStep(state, &scratch);
+ adjust_bailout = lappend_int(adjust_bailout,
+ state->steps_len - 1);
+ }
+
+ /*
+ * Call transition function (once for each concurrently evaluated
+ * grouping set). Do so for both sort and hash based computations, as
+ * applicable.
+ */
+ if (doSort)
+ {
+ int processGroupingSets = Max(phase->numsets, 1);
+ int setoff = 0;
+
+ for (int setno = 0; setno < processGroupingSets; setno++)
+ {
+ ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo,
+ pertrans, transno, setno, setoff, false,
+ nullcheck);
+ setoff++;
+ }
+ }
+
+ if (doHash)
+ {
+ int numHashes = aggstate->num_hashes;
+ int setoff;
+
+ /* in MIXED mode, there'll be preceding transition values */
+ if (aggstate->aggstrategy != AGG_HASHED)
+ setoff = aggstate->maxsets;
+ else
+ setoff = 0;
+
+ for (int setno = 0; setno < numHashes; setno++)
+ {
+ ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo,
+ pertrans, transno, setno, setoff, true,
+ nullcheck);
+ setoff++;
+ }
+ }
+
+ /* adjust early bail out jump target(s) */
+ foreach(bail, adjust_bailout)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(bail)];
+
+ if (as->opcode == EEOP_JUMP_IF_NOT_TRUE)
+ {
+ Assert(as->d.jump.jumpdone == -1);
+ as->d.jump.jumpdone = state->steps_len;
+ }
+ else if (as->opcode == EEOP_AGG_STRICT_INPUT_CHECK_ARGS ||
+ as->opcode == EEOP_AGG_STRICT_INPUT_CHECK_NULLS)
+ {
+ Assert(as->d.agg_strict_input_check.jumpnull == -1);
+ as->d.agg_strict_input_check.jumpnull = state->steps_len;
+ }
+ else if (as->opcode == EEOP_AGG_STRICT_DESERIALIZE)
+ {
+ Assert(as->d.agg_deserialize.jumpnull == -1);
+ as->d.agg_deserialize.jumpnull = state->steps_len;
+ }
+ else
+ Assert(false);
+ }
+ }
+
+ scratch.resvalue = NULL;
+ scratch.resnull = NULL;
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return state;
+}
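+/*
+ * Rough sketch (simplified): for one transition value that has a FILTER
+ * clause and a strict transition function, evaluated for a single grouping
+ * set, the loop above emits roughly
+ *		eval filter qual; EEOP_JUMP_IF_NOT_TRUE	-> SKIP
+ *		eval aggregated argument(s) into the transfn fcinfo
+ *		EEOP_AGG_STRICT_INPUT_CHECK_ARGS		-> SKIP on NULL input
+ *		EEOP_AGG_PLAIN_TRANS_*					(one per grouping set)
+ *	SKIP: (steps for the next transition value, or EEOP_DONE)
+ */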
+
+/*
+ * Build transition/combine function invocation for a single transition
+ * value. This is separated from ExecBuildAggTrans() because there are
+ * multiple callsites (hash and sort in some grouping set cases).
+ */
+static void
+ExecBuildAggTransCall(ExprState *state, AggState *aggstate,
+ ExprEvalStep *scratch,
+ FunctionCallInfo fcinfo, AggStatePerTrans pertrans,
+ int transno, int setno, int setoff, bool ishash,
+ bool nullcheck)
+{
+ ExprContext *aggcontext;
+ int adjust_jumpnull = -1;
+
+ if (ishash)
+ aggcontext = aggstate->hashcontext;
+ else
+ aggcontext = aggstate->aggcontexts[setno];
+
+ /* add check for NULL pointer? */
+ if (nullcheck)
+ {
+ scratch->opcode = EEOP_AGG_PLAIN_PERGROUP_NULLCHECK;
+ scratch->d.agg_plain_pergroup_nullcheck.setoff = setoff;
+ /* adjust later */
+ scratch->d.agg_plain_pergroup_nullcheck.jumpnull = -1;
+ ExprEvalPushStep(state, scratch);
+ adjust_jumpnull = state->steps_len - 1;
+ }
+
+ /*
+ * Determine appropriate transition implementation.
+ *
+ * For non-ordered aggregates:
+ *
+ * If the initial value for the transition state doesn't exist in the
+ * pg_aggregate table then we will let the first non-NULL value returned
+ * from the outer procNode become the initial value. (This is useful for
+ * aggregates like max() and min().) The noTransValue flag signals that we
+	 * need to do so. If true, generate an
+	 * EEOP_AGG_PLAIN_TRANS_INIT_STRICT_{BYVAL,BYREF} step. This step also
+	 * needs to do the work described next:
+	 *
+	 * If the function is strict, but does have an initial value, choose
+	 * EEOP_AGG_PLAIN_TRANS_STRICT_{BYVAL,BYREF}, which skips the transition
+	 * function if the transition value has become NULL (because a previous
+	 * transition function returned NULL). This step also needs to do the
+	 * work described next:
+	 *
+	 * Otherwise we call EEOP_AGG_PLAIN_TRANS_{BYVAL,BYREF}, which does not
+	 * have to perform either of the above checks.
+ *
+ * Having steps with overlapping responsibilities is not nice, but
+ * aggregations are very performance sensitive, making this worthwhile.
+ *
+ * For ordered aggregates:
+ *
+	 * Only need to choose between the faster path for a single ordered
+	 * column and the one for multiple columns. Checking strictness etc
+ * is done when finalizing the aggregate. See
+ * process_ordered_aggregate_{single, multi} and
+ * advance_transition_function.
+ */
+ if (pertrans->numSortCols == 0)
+ {
+ if (pertrans->transtypeByVal)
+ {
+ if (fcinfo->flinfo->fn_strict &&
+ pertrans->initValueIsNull)
+ scratch->opcode = EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL;
+ else if (fcinfo->flinfo->fn_strict)
+ scratch->opcode = EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL;
+ else
+ scratch->opcode = EEOP_AGG_PLAIN_TRANS_BYVAL;
+ }
+ else
+ {
+ if (fcinfo->flinfo->fn_strict &&
+ pertrans->initValueIsNull)
+ scratch->opcode = EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF;
+ else if (fcinfo->flinfo->fn_strict)
+ scratch->opcode = EEOP_AGG_PLAIN_TRANS_STRICT_BYREF;
+ else
+ scratch->opcode = EEOP_AGG_PLAIN_TRANS_BYREF;
+ }
+ }
+ else if (pertrans->numInputs == 1)
+ scratch->opcode = EEOP_AGG_ORDERED_TRANS_DATUM;
+ else
+ scratch->opcode = EEOP_AGG_ORDERED_TRANS_TUPLE;
+
+ scratch->d.agg_trans.pertrans = pertrans;
+ scratch->d.agg_trans.setno = setno;
+ scratch->d.agg_trans.setoff = setoff;
+ scratch->d.agg_trans.transno = transno;
+ scratch->d.agg_trans.aggcontext = aggcontext;
+ ExprEvalPushStep(state, scratch);
+
+ /* fix up jumpnull */
+ if (adjust_jumpnull != -1)
+ {
+ ExprEvalStep *as = &state->steps[adjust_jumpnull];
+
+ Assert(as->opcode == EEOP_AGG_PLAIN_PERGROUP_NULLCHECK);
+ Assert(as->d.agg_plain_pergroup_nullcheck.jumpnull == -1);
+ as->d.agg_plain_pergroup_nullcheck.jumpnull = state->steps_len;
+ }
+}
+
+/*
+ * Build equality expression that can be evaluated using ExecQual(), returning
+ * true if the expression context's inner/outer tuples are NOT DISTINCT; i.e.,
+ * two nulls match, while a null and a not-null don't match.
+ *
+ * ldesc/rdesc: tuple descriptors of the to-be-compared tuples
+ * lops/rops: the slot ops for the inner/outer tuple slots
+ * numCols: the number of attributes to be examined
+ * keyColIdx: array of attribute column numbers
+ * eqfunctions: array of function oids of the equality functions to use
+ * collations: collation Oids to use for equality comparison
+ * parent: parent executor node
+ */
+ExprState *
+ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc,
+ const TupleTableSlotOps *lops, const TupleTableSlotOps *rops,
+ int numCols,
+ const AttrNumber *keyColIdx,
+ const Oid *eqfunctions,
+ const Oid *collations,
+ PlanState *parent)
+{
+ ExprState *state = makeNode(ExprState);
+ ExprEvalStep scratch = {0};
+ int maxatt = -1;
+ List *adjust_jumps = NIL;
+ ListCell *lc;
+
+ /*
+ * When no columns are actually compared, the result's always true. See
+ * special case in ExecQual().
+ */
+ if (numCols == 0)
+ return NULL;
+
+ state->expr = NULL;
+ state->flags = EEO_FLAG_IS_QUAL;
+ state->parent = parent;
+
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+
+ /* compute max needed attribute */
+ for (int natt = 0; natt < numCols; natt++)
+ {
+ int attno = keyColIdx[natt];
+
+ if (attno > maxatt)
+ maxatt = attno;
+ }
+ Assert(maxatt >= 0);
+
+ /* push deform steps */
+ scratch.opcode = EEOP_INNER_FETCHSOME;
+ scratch.d.fetch.last_var = maxatt;
+ scratch.d.fetch.fixed = false;
+ scratch.d.fetch.known_desc = ldesc;
+ scratch.d.fetch.kind = lops;
+ if (ExecComputeSlotInfo(state, &scratch))
+ ExprEvalPushStep(state, &scratch);
+
+ scratch.opcode = EEOP_OUTER_FETCHSOME;
+ scratch.d.fetch.last_var = maxatt;
+ scratch.d.fetch.fixed = false;
+ scratch.d.fetch.known_desc = rdesc;
+ scratch.d.fetch.kind = rops;
+ if (ExecComputeSlotInfo(state, &scratch))
+ ExprEvalPushStep(state, &scratch);
+
+ /*
+ * Start comparing at the last field (least significant sort key). That's
+ * the most likely to be different if we are dealing with sorted input.
+ */
+ for (int natt = numCols; --natt >= 0;)
+ {
+ int attno = keyColIdx[natt];
+ Form_pg_attribute latt = TupleDescAttr(ldesc, attno - 1);
+ Form_pg_attribute ratt = TupleDescAttr(rdesc, attno - 1);
+ Oid foid = eqfunctions[natt];
+ Oid collid = collations[natt];
+ FmgrInfo *finfo;
+ FunctionCallInfo fcinfo;
+ AclResult aclresult;
+
+ /* Check permission to call function */
+ aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid));
+
+ InvokeFunctionExecuteHook(foid);
+
+ /* Set up the primary fmgr lookup information */
+ finfo = palloc0(sizeof(FmgrInfo));
+ fcinfo = palloc0(SizeForFunctionCallInfo(2));
+ fmgr_info(foid, finfo);
+ fmgr_info_set_expr(NULL, finfo);
+ InitFunctionCallInfoData(*fcinfo, finfo, 2,
+ collid, NULL, NULL);
+
+ /* left arg */
+ scratch.opcode = EEOP_INNER_VAR;
+ scratch.d.var.attnum = attno - 1;
+ scratch.d.var.vartype = latt->atttypid;
+ scratch.resvalue = &fcinfo->args[0].value;
+ scratch.resnull = &fcinfo->args[0].isnull;
+ ExprEvalPushStep(state, &scratch);
+
+ /* right arg */
+ scratch.opcode = EEOP_OUTER_VAR;
+ scratch.d.var.attnum = attno - 1;
+ scratch.d.var.vartype = ratt->atttypid;
+ scratch.resvalue = &fcinfo->args[1].value;
+ scratch.resnull = &fcinfo->args[1].isnull;
+ ExprEvalPushStep(state, &scratch);
+
+ /* evaluate distinctness */
+ scratch.opcode = EEOP_NOT_DISTINCT;
+ scratch.d.func.finfo = finfo;
+ scratch.d.func.fcinfo_data = fcinfo;
+ scratch.d.func.fn_addr = finfo->fn_addr;
+ scratch.d.func.nargs = 2;
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+ ExprEvalPushStep(state, &scratch);
+
+ /* then emit EEOP_QUAL to detect if result is false (or null) */
+ scratch.opcode = EEOP_QUAL;
+ scratch.d.qualexpr.jumpdone = -1;
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+ ExprEvalPushStep(state, &scratch);
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ }
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ Assert(as->opcode == EEOP_QUAL);
+ Assert(as->d.qualexpr.jumpdone == -1);
+ as->d.qualexpr.jumpdone = state->steps_len;
+ }
+
+ scratch.resvalue = NULL;
+ scratch.resnull = NULL;
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return state;
+}
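+
+/*
+ * A minimal usage sketch (illustrative only; the slot and variable names
+ * below are hypothetical, not taken from this file): a typical caller
+ * evaluates the ExprState built above by pointing the per-tuple
+ * ExprContext's inner/outer slots at the two tuples and running
+ * ExecQualAndReset().  A NULL return (numCols == 0) relies on ExecQual()'s
+ * treat-as-true special case mentioned above.
+ *
+ *		econtext->ecxt_innertuple = slot1;
+ *		econtext->ecxt_outertuple = slot2;
+ *		if (ExecQualAndReset(eqstate, econtext))
+ *			... the tuples are NOT DISTINCT on the key columns ...
+ */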
+
+/*
+ * Build equality expression that can be evaluated using ExecQual(), returning
+ * true if the expression context's inner/outer tuples are equal. Datums in
+ * the inner/outer slots are assumed to be in the same order and quantity as
+ * the 'eqfunctions' parameter. NULLs are treated as equal.
+ *
+ * desc: tuple descriptor of the to-be-compared tuples
+ * lops: the slot ops for the inner tuple slots
+ * rops: the slot ops for the outer tuple slots
+ * eqfunctions: array of function oids of the equality functions to use;
+ * must be the same length as the 'param_exprs' list.
+ * collations: collation Oids to use for equality comparison; must be the
+ * same length as the 'param_exprs' list.
+ * param_exprs: the parameter expressions being compared; only its length
+ * is used here.
+ * parent: parent executor node
+ */
+ExprState *
+ExecBuildParamSetEqual(TupleDesc desc,
+ const TupleTableSlotOps *lops,
+ const TupleTableSlotOps *rops,
+ const Oid *eqfunctions,
+ const Oid *collations,
+ const List *param_exprs,
+ PlanState *parent)
+{
+ ExprState *state = makeNode(ExprState);
+ ExprEvalStep scratch = {0};
+ int maxatt = list_length(param_exprs);
+ List *adjust_jumps = NIL;
+ ListCell *lc;
+
+ state->expr = NULL;
+ state->flags = EEO_FLAG_IS_QUAL;
+ state->parent = parent;
+
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+
+ /* push deform steps */
+ scratch.opcode = EEOP_INNER_FETCHSOME;
+ scratch.d.fetch.last_var = maxatt;
+ scratch.d.fetch.fixed = false;
+ scratch.d.fetch.known_desc = desc;
+ scratch.d.fetch.kind = lops;
+ if (ExecComputeSlotInfo(state, &scratch))
+ ExprEvalPushStep(state, &scratch);
+
+ scratch.opcode = EEOP_OUTER_FETCHSOME;
+ scratch.d.fetch.last_var = maxatt;
+ scratch.d.fetch.fixed = false;
+ scratch.d.fetch.known_desc = desc;
+ scratch.d.fetch.kind = rops;
+ if (ExecComputeSlotInfo(state, &scratch))
+ ExprEvalPushStep(state, &scratch);
+
+ for (int attno = 0; attno < maxatt; attno++)
+ {
+ Form_pg_attribute att = TupleDescAttr(desc, attno);
+ Oid foid = eqfunctions[attno];
+ Oid collid = collations[attno];
+ FmgrInfo *finfo;
+ FunctionCallInfo fcinfo;
+ AclResult aclresult;
+
+ /* Check permission to call function */
+ aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid));
+
+ InvokeFunctionExecuteHook(foid);
+
+ /* Set up the primary fmgr lookup information */
+ finfo = palloc0(sizeof(FmgrInfo));
+ fcinfo = palloc0(SizeForFunctionCallInfo(2));
+ fmgr_info(foid, finfo);
+ fmgr_info_set_expr(NULL, finfo);
+ InitFunctionCallInfoData(*fcinfo, finfo, 2,
+ collid, NULL, NULL);
+
+ /* left arg */
+ scratch.opcode = EEOP_INNER_VAR;
+ scratch.d.var.attnum = attno;
+ scratch.d.var.vartype = att->atttypid;
+ scratch.resvalue = &fcinfo->args[0].value;
+ scratch.resnull = &fcinfo->args[0].isnull;
+ ExprEvalPushStep(state, &scratch);
+
+ /* right arg */
+ scratch.opcode = EEOP_OUTER_VAR;
+ scratch.d.var.attnum = attno;
+ scratch.d.var.vartype = att->atttypid;
+ scratch.resvalue = &fcinfo->args[1].value;
+ scratch.resnull = &fcinfo->args[1].isnull;
+ ExprEvalPushStep(state, &scratch);
+
+ /* evaluate distinctness */
+ scratch.opcode = EEOP_NOT_DISTINCT;
+ scratch.d.func.finfo = finfo;
+ scratch.d.func.fcinfo_data = fcinfo;
+ scratch.d.func.fn_addr = finfo->fn_addr;
+ scratch.d.func.nargs = 2;
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+ ExprEvalPushStep(state, &scratch);
+
+ /* then emit EEOP_QUAL to detect if result is false (or null) */
+ scratch.opcode = EEOP_QUAL;
+ scratch.d.qualexpr.jumpdone = -1;
+ scratch.resvalue = &state->resvalue;
+ scratch.resnull = &state->resnull;
+ ExprEvalPushStep(state, &scratch);
+ adjust_jumps = lappend_int(adjust_jumps,
+ state->steps_len - 1);
+ }
+
+ /* adjust jump targets */
+ foreach(lc, adjust_jumps)
+ {
+ ExprEvalStep *as = &state->steps[lfirst_int(lc)];
+
+ Assert(as->opcode == EEOP_QUAL);
+ Assert(as->d.qualexpr.jumpdone == -1);
+ as->d.qualexpr.jumpdone = state->steps_len;
+ }
+
+ scratch.resvalue = NULL;
+ scratch.resnull = NULL;
+ scratch.opcode = EEOP_DONE;
+ ExprEvalPushStep(state, &scratch);
+
+ ExecReadyExpr(state);
+
+ return state;
+}
diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c
new file mode 100644
index 0000000..6b63f93
--- /dev/null
+++ b/src/backend/executor/execExprInterp.c
@@ -0,0 +1,4373 @@
+/*-------------------------------------------------------------------------
+ *
+ * execExprInterp.c
+ * Interpreted evaluation of an expression step list.
+ *
+ * This file provides either a "direct threaded" (for gcc, clang and
+ * compatible) or a "switch threaded" (for all compilers) implementation of
+ * expression evaluation. The former is amongst the fastest known methods
+ * of interpreting programs without resorting to assembly level work, or
+ * just-in-time compilation, but it requires support for computed gotos.
+ * The latter is amongst the fastest approaches doable in standard C.
+ *
+ * In either case we use ExprEvalStep->opcode to dispatch to the code block
+ * within ExecInterpExpr() that implements the specific opcode type.
+ *
+ * Switch-threading uses a plain switch() statement to perform the
+ * dispatch. This has the advantages of being plain C and allowing the
+ * compiler to warn if implementation of a specific opcode has been forgotten.
+ * The disadvantage is that dispatches will, as commonly implemented by
+ * compilers, happen from a single location, requiring more jumps and causing
+ * bad branch prediction.
+ *
+ * In direct threading, we use gcc's label-as-values extension - also adopted
+ * by some other compilers - to replace ExprEvalStep->opcode with the address
+ * of the block implementing the instruction. Dispatch to the next instruction
+ * is done by a "computed goto". This allows for better branch prediction
+ * (as the jumps are happening from different locations) and fewer jumps
+ * (as no preparatory jump to a common dispatch location is needed).
+ *
+ * When using direct threading, ExecReadyInterpretedExpr will replace
+ * each step's opcode field with the address of the relevant code block and
+ * ExprState->flags will contain EEO_FLAG_DIRECT_THREADED to remember that
+ * that's been done.
+ *
+ * For very simple instructions the overhead of the full interpreter
+ * "startup", as minimal as it is, is noticeable. Therefore
+ * ExecReadyInterpretedExpr will choose to implement certain simple
+ * opcode patterns using special fast-path routines (ExecJust*).
+ *
+ * Complex or uncommon instructions are not implemented in-line in
+ * ExecInterpExpr(), rather we call out to a helper function appearing later
+ * in this file. For one thing, there would be no noticeable performance
+ * benefit, but more importantly those complex routines are intended to be
+ * shared between different expression evaluation approaches. For instance
+ * a JIT compiler would generate calls to them. (This is why they are
+ * exported rather than being "static" in this file.)
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execExprInterp.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heaptoast.h"
+#include "catalog/pg_type.h"
+#include "commands/sequence.h"
+#include "executor/execExpr.h"
+#include "executor/nodeSubplan.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "parser/parsetree.h"
+#include "pgstat.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/date.h"
+#include "utils/datum.h"
+#include "utils/expandedrecord.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+#include "utils/typcache.h"
+#include "utils/xml.h"
+
+/*
+ * Use computed-goto-based opcode dispatch when computed gotos are available.
+ * But use a separate symbol so that it's easy to adjust locally in this file
+ * for development and testing.
+ */
+#ifdef HAVE_COMPUTED_GOTO
+#define EEO_USE_COMPUTED_GOTO
+#endif /* HAVE_COMPUTED_GOTO */
+
+/*
+ * Macros for opcode dispatch.
+ *
+ * EEO_SWITCH - just hides the switch if not in use.
+ * EEO_CASE - labels the implementation of named expression step type.
+ * EEO_DISPATCH - jump to the implementation of the step type for 'op'.
+ * EEO_OPCODE - compute opcode required by used expression evaluation method.
+ * EEO_NEXT - increment 'op' and jump to correct next step type.
+ * EEO_JUMP - jump to the specified step number within the current expression.
+ */
+#if defined(EEO_USE_COMPUTED_GOTO)
+
+/* struct for jump target -> opcode lookup table */
+typedef struct ExprEvalOpLookup
+{
+ const void *opcode;
+ ExprEvalOp op;
+} ExprEvalOpLookup;
+
+/* to make dispatch_table accessible outside ExecInterpExpr() */
+static const void **dispatch_table = NULL;
+
+/* jump target -> opcode lookup table */
+static ExprEvalOpLookup reverse_dispatch_table[EEOP_LAST];
+
+#define EEO_SWITCH()
+#define EEO_CASE(name) CASE_##name:
+#define EEO_DISPATCH() goto *((void *) op->opcode)
+#define EEO_OPCODE(opcode) ((intptr_t) dispatch_table[opcode])
+
+#else /* !EEO_USE_COMPUTED_GOTO */
+
+#define EEO_SWITCH() starteval: switch ((ExprEvalOp) op->opcode)
+#define EEO_CASE(name) case name:
+#define EEO_DISPATCH() goto starteval
+#define EEO_OPCODE(opcode) (opcode)
+
+#endif /* EEO_USE_COMPUTED_GOTO */
+
+#define EEO_NEXT() \
+ do { \
+ op++; \
+ EEO_DISPATCH(); \
+ } while (0)
+
+#define EEO_JUMP(stepno) \
+ do { \
+ op = &state->steps[stepno]; \
+ EEO_DISPATCH(); \
+ } while (0)
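+
+/*
+ * Toy illustration (not PostgreSQL code) of the two dispatch styles the
+ * macros above abstract over.  With computed gotos each step ends by
+ * jumping directly to the next step's code block via an array of label
+ * addresses; the switch-threaded fallback instead loops back to a single
+ * switch statement:
+ *
+ *		static const void *const labels[] = {&&do_halt, &&do_incr};
+ *
+ *		goto *labels[*pc];
+ *	do_incr:
+ *		acc++;
+ *		pc++;
+ *		goto *labels[*pc];
+ *	do_halt:
+ *		return acc;
+ *
+ * versus "for (;;) switch (*pc) { case INCR: ...; case HALT: return acc; }".
+ */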
+
+
+static Datum ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull);
+static void ExecInitInterpreter(void);
+
+/* support functions */
+static void CheckVarSlotCompatibility(TupleTableSlot *slot, int attnum, Oid vartype);
+static void CheckOpSlotCompatibility(ExprEvalStep *op, TupleTableSlot *slot);
+static TupleDesc get_cached_rowtype(Oid type_id, int32 typmod,
+ ExprEvalRowtypeCache *rowcache,
+ bool *changed);
+static void ExecEvalRowNullInt(ExprState *state, ExprEvalStep *op,
+ ExprContext *econtext, bool checkisnull);
+
+/* fast-path evaluation functions */
+static Datum ExecJustInnerVar(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustOuterVar(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustScanVar(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustAssignInnerVar(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustAssignOuterVar(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustAssignScanVar(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustApplyFuncToCase(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustConst(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustAssignInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustAssignOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull);
+static Datum ExecJustAssignScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull);
+
+/* execution helper functions */
+static pg_attribute_always_inline void ExecAggPlainTransByVal(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroup,
+ ExprContext *aggcontext,
+ int setno);
+static pg_attribute_always_inline void ExecAggPlainTransByRef(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroup,
+ ExprContext *aggcontext,
+ int setno);
+
+/*
+ * ScalarArrayOpExprHashEntry
+ * Hash table entry type used during EEOP_HASHED_SCALARARRAYOP
+ */
+typedef struct ScalarArrayOpExprHashEntry
+{
+ Datum key;
+ uint32 status; /* hash status */
+ uint32 hash; /* hash value (cached) */
+} ScalarArrayOpExprHashEntry;
+
+#define SH_PREFIX saophash
+#define SH_ELEMENT_TYPE ScalarArrayOpExprHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_SCOPE static inline
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+static bool saop_hash_element_match(struct saophash_hash *tb, Datum key1,
+ Datum key2);
+static uint32 saop_element_hash(struct saophash_hash *tb, Datum key);
+
+/*
+ * ScalarArrayOpExprHashTable
+ * Hash table for EEOP_HASHED_SCALARARRAYOP
+ */
+typedef struct ScalarArrayOpExprHashTable
+{
+ saophash_hash *hashtab; /* underlying hash table */
+ struct ExprEvalStep *op;
+} ScalarArrayOpExprHashTable;
+
+/* Define parameters for ScalarArrayOpExpr hash table code generation. */
+#define SH_PREFIX saophash
+#define SH_ELEMENT_TYPE ScalarArrayOpExprHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_KEY key
+#define SH_HASH_KEY(tb, key) saop_element_hash(tb, key)
+#define SH_EQUAL(tb, a, b) saop_hash_element_match(tb, a, b)
+#define SH_SCOPE static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a) a->hash
+#define SH_DEFINE
+#include "lib/simplehash.h"
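+
+/*
+ * For reference, a rough sketch of how the simplehash-generated functions
+ * are typically driven (the names follow from SH_PREFIX above; the real
+ * call sites appear later in this file, so treat this as an approximation):
+ *
+ *		bool		found;
+ *		saophash_hash *ht = saophash_create(CurrentMemoryContext, nitems,
+ *											private_data);
+ *		ScalarArrayOpExprHashEntry *entry = saophash_insert(ht, key, &found);
+ *		...
+ *		if (saophash_lookup(ht, probe_key) != NULL)
+ *			... the probe value was present in the array ...
+ */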
+
+/*
+ * Prepare ExprState for interpreted execution.
+ */
+void
+ExecReadyInterpretedExpr(ExprState *state)
+{
+ /* Ensure one-time interpreter setup has been done */
+ ExecInitInterpreter();
+
+ /* Simple validity checks on expression */
+ Assert(state->steps_len >= 1);
+ Assert(state->steps[state->steps_len - 1].opcode == EEOP_DONE);
+
+ /*
+ * Don't perform redundant initialization. This is unreachable in current
+ * cases, but might be hit if there are additional expression evaluation
+ * methods that rely on interpreted execution to work.
+ */
+ if (state->flags & EEO_FLAG_INTERPRETER_INITIALIZED)
+ return;
+
+ /*
+ * First time through, check whether attribute matches Var. Might not be
+ * ok anymore, due to schema changes. We do that by setting up a callback
+ * that does checking on the first call, which then sets the evalfunc
+ * callback to the actual method of execution.
+ */
+ state->evalfunc = ExecInterpExprStillValid;
+
+ /* DIRECT_THREADED should not already be set */
+ Assert((state->flags & EEO_FLAG_DIRECT_THREADED) == 0);
+
+ /*
+ * There shouldn't be any errors before the expression is fully
+ * initialized, and even if so, it'd lead to the expression being
+ * abandoned. So we can set the flag now and save some code.
+ */
+ state->flags |= EEO_FLAG_INTERPRETER_INITIALIZED;
+
+ /*
+ * Select fast-path evalfuncs for very simple expressions. "Starting up"
+ * the full interpreter is a measurable overhead for these, and these
+ * patterns occur often enough to be worth optimizing.
+ */
+ if (state->steps_len == 3)
+ {
+ ExprEvalOp step0 = state->steps[0].opcode;
+ ExprEvalOp step1 = state->steps[1].opcode;
+
+ if (step0 == EEOP_INNER_FETCHSOME &&
+ step1 == EEOP_INNER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustInnerVar;
+ return;
+ }
+ else if (step0 == EEOP_OUTER_FETCHSOME &&
+ step1 == EEOP_OUTER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustOuterVar;
+ return;
+ }
+ else if (step0 == EEOP_SCAN_FETCHSOME &&
+ step1 == EEOP_SCAN_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustScanVar;
+ return;
+ }
+ else if (step0 == EEOP_INNER_FETCHSOME &&
+ step1 == EEOP_ASSIGN_INNER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustAssignInnerVar;
+ return;
+ }
+ else if (step0 == EEOP_OUTER_FETCHSOME &&
+ step1 == EEOP_ASSIGN_OUTER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustAssignOuterVar;
+ return;
+ }
+ else if (step0 == EEOP_SCAN_FETCHSOME &&
+ step1 == EEOP_ASSIGN_SCAN_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustAssignScanVar;
+ return;
+ }
+ else if (step0 == EEOP_CASE_TESTVAL &&
+ step1 == EEOP_FUNCEXPR_STRICT &&
+ state->steps[0].d.casetest.value)
+ {
+ state->evalfunc_private = (void *) ExecJustApplyFuncToCase;
+ return;
+ }
+ }
+ else if (state->steps_len == 2)
+ {
+ ExprEvalOp step0 = state->steps[0].opcode;
+
+ if (step0 == EEOP_CONST)
+ {
+ state->evalfunc_private = (void *) ExecJustConst;
+ return;
+ }
+ else if (step0 == EEOP_INNER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustInnerVarVirt;
+ return;
+ }
+ else if (step0 == EEOP_OUTER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustOuterVarVirt;
+ return;
+ }
+ else if (step0 == EEOP_SCAN_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustScanVarVirt;
+ return;
+ }
+ else if (step0 == EEOP_ASSIGN_INNER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustAssignInnerVarVirt;
+ return;
+ }
+ else if (step0 == EEOP_ASSIGN_OUTER_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustAssignOuterVarVirt;
+ return;
+ }
+ else if (step0 == EEOP_ASSIGN_SCAN_VAR)
+ {
+ state->evalfunc_private = (void *) ExecJustAssignScanVarVirt;
+ return;
+ }
+ }
+
+#if defined(EEO_USE_COMPUTED_GOTO)
+
+ /*
+ * In the direct-threaded implementation, replace each opcode with the
+ * address to jump to. (Use ExecEvalStepOp() to get back the opcode.)
+ */
+ for (int off = 0; off < state->steps_len; off++)
+ {
+ ExprEvalStep *op = &state->steps[off];
+
+ op->opcode = EEO_OPCODE(op->opcode);
+ }
+
+ state->flags |= EEO_FLAG_DIRECT_THREADED;
+#endif /* EEO_USE_COMPUTED_GOTO */
+
+ state->evalfunc_private = (void *) ExecInterpExpr;
+}
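+
+/*
+ * Note that after the direct-threaded conversion above, op->opcode holds a
+ * jump address rather than an ExprEvalOp; code that needs the logical
+ * opcode back (e.g. CheckExprStillValid() below, or the JIT provider) goes
+ * through ExecEvalStepOp(), roughly:
+ *
+ *		if (ExecEvalStepOp(state, &state->steps[i]) == EEOP_INNER_VAR)
+ *			...
+ */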
+
+
+/*
+ * Evaluate expression identified by "state" in the execution context
+ * given by "econtext". *isnull is set to the is-null flag for the result,
+ * and the Datum value is the function result.
+ *
+ * As a special case, return the dispatch table's address if state is NULL.
+ * This is used by ExecInitInterpreter to set up the dispatch_table global.
+ * (Only applies when EEO_USE_COMPUTED_GOTO is defined.)
+ */
+static Datum
+ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ ExprEvalStep *op;
+ TupleTableSlot *resultslot;
+ TupleTableSlot *innerslot;
+ TupleTableSlot *outerslot;
+ TupleTableSlot *scanslot;
+
+ /*
+ * This array has to be in the same order as enum ExprEvalOp.
+ */
+#if defined(EEO_USE_COMPUTED_GOTO)
+ static const void *const dispatch_table[] = {
+ &&CASE_EEOP_DONE,
+ &&CASE_EEOP_INNER_FETCHSOME,
+ &&CASE_EEOP_OUTER_FETCHSOME,
+ &&CASE_EEOP_SCAN_FETCHSOME,
+ &&CASE_EEOP_INNER_VAR,
+ &&CASE_EEOP_OUTER_VAR,
+ &&CASE_EEOP_SCAN_VAR,
+ &&CASE_EEOP_INNER_SYSVAR,
+ &&CASE_EEOP_OUTER_SYSVAR,
+ &&CASE_EEOP_SCAN_SYSVAR,
+ &&CASE_EEOP_WHOLEROW,
+ &&CASE_EEOP_ASSIGN_INNER_VAR,
+ &&CASE_EEOP_ASSIGN_OUTER_VAR,
+ &&CASE_EEOP_ASSIGN_SCAN_VAR,
+ &&CASE_EEOP_ASSIGN_TMP,
+ &&CASE_EEOP_ASSIGN_TMP_MAKE_RO,
+ &&CASE_EEOP_CONST,
+ &&CASE_EEOP_FUNCEXPR,
+ &&CASE_EEOP_FUNCEXPR_STRICT,
+ &&CASE_EEOP_FUNCEXPR_FUSAGE,
+ &&CASE_EEOP_FUNCEXPR_STRICT_FUSAGE,
+ &&CASE_EEOP_BOOL_AND_STEP_FIRST,
+ &&CASE_EEOP_BOOL_AND_STEP,
+ &&CASE_EEOP_BOOL_AND_STEP_LAST,
+ &&CASE_EEOP_BOOL_OR_STEP_FIRST,
+ &&CASE_EEOP_BOOL_OR_STEP,
+ &&CASE_EEOP_BOOL_OR_STEP_LAST,
+ &&CASE_EEOP_BOOL_NOT_STEP,
+ &&CASE_EEOP_QUAL,
+ &&CASE_EEOP_JUMP,
+ &&CASE_EEOP_JUMP_IF_NULL,
+ &&CASE_EEOP_JUMP_IF_NOT_NULL,
+ &&CASE_EEOP_JUMP_IF_NOT_TRUE,
+ &&CASE_EEOP_NULLTEST_ISNULL,
+ &&CASE_EEOP_NULLTEST_ISNOTNULL,
+ &&CASE_EEOP_NULLTEST_ROWISNULL,
+ &&CASE_EEOP_NULLTEST_ROWISNOTNULL,
+ &&CASE_EEOP_BOOLTEST_IS_TRUE,
+ &&CASE_EEOP_BOOLTEST_IS_NOT_TRUE,
+ &&CASE_EEOP_BOOLTEST_IS_FALSE,
+ &&CASE_EEOP_BOOLTEST_IS_NOT_FALSE,
+ &&CASE_EEOP_PARAM_EXEC,
+ &&CASE_EEOP_PARAM_EXTERN,
+ &&CASE_EEOP_PARAM_CALLBACK,
+ &&CASE_EEOP_CASE_TESTVAL,
+ &&CASE_EEOP_MAKE_READONLY,
+ &&CASE_EEOP_IOCOERCE,
+ &&CASE_EEOP_DISTINCT,
+ &&CASE_EEOP_NOT_DISTINCT,
+ &&CASE_EEOP_NULLIF,
+ &&CASE_EEOP_SQLVALUEFUNCTION,
+ &&CASE_EEOP_CURRENTOFEXPR,
+ &&CASE_EEOP_NEXTVALUEEXPR,
+ &&CASE_EEOP_ARRAYEXPR,
+ &&CASE_EEOP_ARRAYCOERCE,
+ &&CASE_EEOP_ROW,
+ &&CASE_EEOP_ROWCOMPARE_STEP,
+ &&CASE_EEOP_ROWCOMPARE_FINAL,
+ &&CASE_EEOP_MINMAX,
+ &&CASE_EEOP_FIELDSELECT,
+ &&CASE_EEOP_FIELDSTORE_DEFORM,
+ &&CASE_EEOP_FIELDSTORE_FORM,
+ &&CASE_EEOP_SBSREF_SUBSCRIPTS,
+ &&CASE_EEOP_SBSREF_OLD,
+ &&CASE_EEOP_SBSREF_ASSIGN,
+ &&CASE_EEOP_SBSREF_FETCH,
+ &&CASE_EEOP_DOMAIN_TESTVAL,
+ &&CASE_EEOP_DOMAIN_NOTNULL,
+ &&CASE_EEOP_DOMAIN_CHECK,
+ &&CASE_EEOP_CONVERT_ROWTYPE,
+ &&CASE_EEOP_SCALARARRAYOP,
+ &&CASE_EEOP_HASHED_SCALARARRAYOP,
+ &&CASE_EEOP_XMLEXPR,
+ &&CASE_EEOP_AGGREF,
+ &&CASE_EEOP_GROUPING_FUNC,
+ &&CASE_EEOP_WINDOW_FUNC,
+ &&CASE_EEOP_SUBPLAN,
+ &&CASE_EEOP_AGG_STRICT_DESERIALIZE,
+ &&CASE_EEOP_AGG_DESERIALIZE,
+ &&CASE_EEOP_AGG_STRICT_INPUT_CHECK_ARGS,
+ &&CASE_EEOP_AGG_STRICT_INPUT_CHECK_NULLS,
+ &&CASE_EEOP_AGG_PLAIN_PERGROUP_NULLCHECK,
+ &&CASE_EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL,
+ &&CASE_EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL,
+ &&CASE_EEOP_AGG_PLAIN_TRANS_BYVAL,
+ &&CASE_EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF,
+ &&CASE_EEOP_AGG_PLAIN_TRANS_STRICT_BYREF,
+ &&CASE_EEOP_AGG_PLAIN_TRANS_BYREF,
+ &&CASE_EEOP_AGG_ORDERED_TRANS_DATUM,
+ &&CASE_EEOP_AGG_ORDERED_TRANS_TUPLE,
+ &&CASE_EEOP_LAST
+ };
+
+ StaticAssertStmt(EEOP_LAST + 1 == lengthof(dispatch_table),
+ "dispatch_table out of whack with ExprEvalOp");
+
+ if (unlikely(state == NULL))
+ return PointerGetDatum(dispatch_table);
+#else
+ Assert(state != NULL);
+#endif /* EEO_USE_COMPUTED_GOTO */
+
+ /* setup state */
+ op = state->steps;
+ resultslot = state->resultslot;
+ innerslot = econtext->ecxt_innertuple;
+ outerslot = econtext->ecxt_outertuple;
+ scanslot = econtext->ecxt_scantuple;
+
+#if defined(EEO_USE_COMPUTED_GOTO)
+ EEO_DISPATCH();
+#endif
+
+ EEO_SWITCH()
+ {
+ EEO_CASE(EEOP_DONE)
+ {
+ goto out;
+ }
+
+ EEO_CASE(EEOP_INNER_FETCHSOME)
+ {
+ CheckOpSlotCompatibility(op, innerslot);
+
+ slot_getsomeattrs(innerslot, op->d.fetch.last_var);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_OUTER_FETCHSOME)
+ {
+ CheckOpSlotCompatibility(op, outerslot);
+
+ slot_getsomeattrs(outerslot, op->d.fetch.last_var);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_SCAN_FETCHSOME)
+ {
+ CheckOpSlotCompatibility(op, scanslot);
+
+ slot_getsomeattrs(scanslot, op->d.fetch.last_var);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_INNER_VAR)
+ {
+ int attnum = op->d.var.attnum;
+
+ /*
+ * Since we already extracted all referenced columns from the
+ * tuple with a FETCHSOME step, we can just grab the value
+ * directly out of the slot's decomposed-data arrays. But let's
+ * have an Assert to check that that did happen.
+ */
+ Assert(attnum >= 0 && attnum < innerslot->tts_nvalid);
+ *op->resvalue = innerslot->tts_values[attnum];
+ *op->resnull = innerslot->tts_isnull[attnum];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_OUTER_VAR)
+ {
+ int attnum = op->d.var.attnum;
+
+ /* See EEOP_INNER_VAR comments */
+
+ Assert(attnum >= 0 && attnum < outerslot->tts_nvalid);
+ *op->resvalue = outerslot->tts_values[attnum];
+ *op->resnull = outerslot->tts_isnull[attnum];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_SCAN_VAR)
+ {
+ int attnum = op->d.var.attnum;
+
+ /* See EEOP_INNER_VAR comments */
+
+ Assert(attnum >= 0 && attnum < scanslot->tts_nvalid);
+ *op->resvalue = scanslot->tts_values[attnum];
+ *op->resnull = scanslot->tts_isnull[attnum];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_INNER_SYSVAR)
+ {
+ ExecEvalSysVar(state, op, econtext, innerslot);
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_OUTER_SYSVAR)
+ {
+ ExecEvalSysVar(state, op, econtext, outerslot);
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_SCAN_SYSVAR)
+ {
+ ExecEvalSysVar(state, op, econtext, scanslot);
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_WHOLEROW)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalWholeRowVar(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ASSIGN_INNER_VAR)
+ {
+ int resultnum = op->d.assign_var.resultnum;
+ int attnum = op->d.assign_var.attnum;
+
+ /*
+ * We do not need CheckVarSlotCompatibility here; that was taken
+ * care of at compilation time. But see EEOP_INNER_VAR comments.
+ */
+ Assert(attnum >= 0 && attnum < innerslot->tts_nvalid);
+ Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts);
+ resultslot->tts_values[resultnum] = innerslot->tts_values[attnum];
+ resultslot->tts_isnull[resultnum] = innerslot->tts_isnull[attnum];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ASSIGN_OUTER_VAR)
+ {
+ int resultnum = op->d.assign_var.resultnum;
+ int attnum = op->d.assign_var.attnum;
+
+ /*
+ * We do not need CheckVarSlotCompatibility here; that was taken
+ * care of at compilation time. But see EEOP_INNER_VAR comments.
+ */
+ Assert(attnum >= 0 && attnum < outerslot->tts_nvalid);
+ Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts);
+ resultslot->tts_values[resultnum] = outerslot->tts_values[attnum];
+ resultslot->tts_isnull[resultnum] = outerslot->tts_isnull[attnum];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ASSIGN_SCAN_VAR)
+ {
+ int resultnum = op->d.assign_var.resultnum;
+ int attnum = op->d.assign_var.attnum;
+
+ /*
+ * We do not need CheckVarSlotCompatibility here; that was taken
+ * care of at compilation time. But see EEOP_INNER_VAR comments.
+ */
+ Assert(attnum >= 0 && attnum < scanslot->tts_nvalid);
+ Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts);
+ resultslot->tts_values[resultnum] = scanslot->tts_values[attnum];
+ resultslot->tts_isnull[resultnum] = scanslot->tts_isnull[attnum];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ASSIGN_TMP)
+ {
+ int resultnum = op->d.assign_tmp.resultnum;
+
+ Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts);
+ resultslot->tts_values[resultnum] = state->resvalue;
+ resultslot->tts_isnull[resultnum] = state->resnull;
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ASSIGN_TMP_MAKE_RO)
+ {
+ int resultnum = op->d.assign_tmp.resultnum;
+
+ Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts);
+ resultslot->tts_isnull[resultnum] = state->resnull;
+ if (!resultslot->tts_isnull[resultnum])
+ resultslot->tts_values[resultnum] =
+ MakeExpandedObjectReadOnlyInternal(state->resvalue);
+ else
+ resultslot->tts_values[resultnum] = state->resvalue;
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_CONST)
+ {
+ *op->resnull = op->d.constval.isnull;
+ *op->resvalue = op->d.constval.value;
+
+ EEO_NEXT();
+ }
+
+ /*
+ * Function-call implementations. Arguments have previously been
+ * evaluated directly into fcinfo->args.
+ *
+ * As both STRICT checks and function-usage tracking are noticeable
+ * performance-wise, and function calls are a very hot path (they also back
+ * operators!), it's worth having this many separate opcodes.
+ *
+ * Note: the reason for using a temporary variable "d", here and in
+ * other places, is that some compilers think "*op->resvalue = f();"
+ * requires them to evaluate op->resvalue into a register before
+ * calling f(), just in case f() is able to modify op->resvalue
+ * somehow. The extra line of code can save a useless register spill
+ * and reload across the function call.
+ */
+ EEO_CASE(EEOP_FUNCEXPR)
+ {
+ FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
+ Datum d;
+
+ fcinfo->isnull = false;
+ d = op->d.func.fn_addr(fcinfo);
+ *op->resvalue = d;
+ *op->resnull = fcinfo->isnull;
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_FUNCEXPR_STRICT)
+ {
+ FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
+ NullableDatum *args = fcinfo->args;
+ int nargs = op->d.func.nargs;
+ Datum d;
+
+ /* strict function, so check for NULL args */
+ for (int argno = 0; argno < nargs; argno++)
+ {
+ if (args[argno].isnull)
+ {
+ *op->resnull = true;
+ goto strictfail;
+ }
+ }
+ fcinfo->isnull = false;
+ d = op->d.func.fn_addr(fcinfo);
+ *op->resvalue = d;
+ *op->resnull = fcinfo->isnull;
+
+ strictfail:
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_FUNCEXPR_FUSAGE)
+ {
+ /* not common enough to inline */
+ ExecEvalFuncExprFusage(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_FUNCEXPR_STRICT_FUSAGE)
+ {
+ /* not common enough to inline */
+ ExecEvalFuncExprStrictFusage(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ /*
+ * If any of its clauses is FALSE, an AND's result is FALSE regardless
+ * of the states of the rest of the clauses, so we can stop evaluating
+ * and return FALSE immediately. If none are FALSE and one or more is
+ * NULL, we return NULL; otherwise we return TRUE. This makes sense
+ * when you interpret NULL as "don't know": perhaps one of the "don't
+ * knows" would have been FALSE if we'd known its value. Only when
+ * all the inputs are known to be TRUE can we state confidently that
+ * the AND's result is TRUE.
+ */
+ EEO_CASE(EEOP_BOOL_AND_STEP_FIRST)
+ {
+ *op->d.boolexpr.anynull = false;
+
+ /*
+ * EEOP_BOOL_AND_STEP_FIRST resets anynull, otherwise it's the
+ * same as EEOP_BOOL_AND_STEP - so fall through to that.
+ */
+
+ /* FALL THROUGH */
+ }
+
+ EEO_CASE(EEOP_BOOL_AND_STEP)
+ {
+ if (*op->resnull)
+ {
+ *op->d.boolexpr.anynull = true;
+ }
+ else if (!DatumGetBool(*op->resvalue))
+ {
+ /* result is already set to FALSE, need not change it */
+ /* bail out early */
+ EEO_JUMP(op->d.boolexpr.jumpdone);
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_BOOL_AND_STEP_LAST)
+ {
+ if (*op->resnull)
+ {
+ /* result is already set to NULL, need not change it */
+ }
+ else if (!DatumGetBool(*op->resvalue))
+ {
+ /* result is already set to FALSE, need not change it */
+
+ /*
+ * No point jumping early to jumpdone - would be same target
+ * (as this is the last argument to the AND expression),
+ * except more expensive.
+ */
+ }
+ else if (*op->d.boolexpr.anynull)
+ {
+ *op->resvalue = (Datum) 0;
+ *op->resnull = true;
+ }
+ else
+ {
+ /* result is already set to TRUE, need not change it */
+ }
+
+ EEO_NEXT();
+ }
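+
+ /*
+ * To make the three-valued logic above concrete: with clauses evaluating
+ * to (FALSE, NULL) the first step jumps straight to jumpdone with the
+ * FALSE result; with (NULL, TRUE) the first step merely records anynull,
+ * and the _LAST step then turns the final TRUE into a NULL result.
+ */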
+
+ /*
+ * If any of its clauses is TRUE, an OR's result is TRUE regardless of
+ * the states of the rest of the clauses, so we can stop evaluating
+ * and return TRUE immediately. If none are TRUE and one or more is
+ * NULL, we return NULL; otherwise we return FALSE. This makes sense
+ * when you interpret NULL as "don't know": perhaps one of the "don't
+ * knows" would have been TRUE if we'd known its value. Only when all
+ * the inputs are known to be FALSE can we state confidently that the
+ * OR's result is FALSE.
+ */
+ EEO_CASE(EEOP_BOOL_OR_STEP_FIRST)
+ {
+ *op->d.boolexpr.anynull = false;
+
+ /*
+ * EEOP_BOOL_OR_STEP_FIRST resets anynull, otherwise it's the same
+ * as EEOP_BOOL_OR_STEP - so fall through to that.
+ */
+
+ /* FALL THROUGH */
+ }
+
+ EEO_CASE(EEOP_BOOL_OR_STEP)
+ {
+ if (*op->resnull)
+ {
+ *op->d.boolexpr.anynull = true;
+ }
+ else if (DatumGetBool(*op->resvalue))
+ {
+ /* result is already set to TRUE, need not change it */
+ /* bail out early */
+ EEO_JUMP(op->d.boolexpr.jumpdone);
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_BOOL_OR_STEP_LAST)
+ {
+ if (*op->resnull)
+ {
+ /* result is already set to NULL, need not change it */
+ }
+ else if (DatumGetBool(*op->resvalue))
+ {
+ /* result is already set to TRUE, need not change it */
+
+ /*
+ * No point jumping early to jumpdone - would be same target (as
+ * this is the last argument to the OR expression), except
+ * more expensive.
+ */
+ }
+ else if (*op->d.boolexpr.anynull)
+ {
+ *op->resvalue = (Datum) 0;
+ *op->resnull = true;
+ }
+ else
+ {
+ /* result is already set to FALSE, need not change it */
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_BOOL_NOT_STEP)
+ {
+ /*
+ * Evaluation of 'not' is simple... if expr is false, then return
+ * 'true' and vice versa. It's safe to do this even on a
+ * nominally null value, so we ignore resnull; that means that
+ * NULL in produces NULL out, which is what we want.
+ */
+ *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue));
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_QUAL)
+ {
+ /* simplified version of BOOL_AND_STEP for use by ExecQual() */
+
+ /* If argument (also result) is false or null ... */
+ if (*op->resnull ||
+ !DatumGetBool(*op->resvalue))
+ {
+ /* ... bail out early, returning FALSE */
+ *op->resnull = false;
+ *op->resvalue = BoolGetDatum(false);
+ EEO_JUMP(op->d.qualexpr.jumpdone);
+ }
+
+ /*
+ * Otherwise, leave the TRUE value in place, in case this is the
+ * last qual. Then, TRUE is the correct answer.
+ */
+
+ EEO_NEXT();
+ }
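+
+ /*
+ * For example, a two-clause qual built by ExecInitQual() compiles roughly
+ * to: <clause 1 steps>, EEOP_QUAL, <clause 2 steps>, EEOP_QUAL, EEOP_DONE,
+ * with both EEOP_QUAL steps' jumpdone pointing at the final EEOP_DONE.
+ */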
+
+ EEO_CASE(EEOP_JUMP)
+ {
+ /* Unconditionally jump to target step */
+ EEO_JUMP(op->d.jump.jumpdone);
+ }
+
+ EEO_CASE(EEOP_JUMP_IF_NULL)
+ {
+ /* Transfer control if current result is null */
+ if (*op->resnull)
+ EEO_JUMP(op->d.jump.jumpdone);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_JUMP_IF_NOT_NULL)
+ {
+ /* Transfer control if current result is non-null */
+ if (!*op->resnull)
+ EEO_JUMP(op->d.jump.jumpdone);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_JUMP_IF_NOT_TRUE)
+ {
+ /* Transfer control if current result is null or false */
+ if (*op->resnull || !DatumGetBool(*op->resvalue))
+ EEO_JUMP(op->d.jump.jumpdone);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_NULLTEST_ISNULL)
+ {
+ *op->resvalue = BoolGetDatum(*op->resnull);
+ *op->resnull = false;
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_NULLTEST_ISNOTNULL)
+ {
+ *op->resvalue = BoolGetDatum(!*op->resnull);
+ *op->resnull = false;
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_NULLTEST_ROWISNULL)
+ {
+ /* out of line implementation: too large */
+ ExecEvalRowNull(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_NULLTEST_ROWISNOTNULL)
+ {
+ /* out of line implementation: too large */
+ ExecEvalRowNotNull(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ /* BooleanTest implementations for all booltesttypes */
+
+ EEO_CASE(EEOP_BOOLTEST_IS_TRUE)
+ {
+ if (*op->resnull)
+ {
+ *op->resvalue = BoolGetDatum(false);
+ *op->resnull = false;
+ }
+ /* else, input value is the correct output as well */
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_BOOLTEST_IS_NOT_TRUE)
+ {
+ if (*op->resnull)
+ {
+ *op->resvalue = BoolGetDatum(true);
+ *op->resnull = false;
+ }
+ else
+ *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue));
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_BOOLTEST_IS_FALSE)
+ {
+ if (*op->resnull)
+ {
+ *op->resvalue = BoolGetDatum(false);
+ *op->resnull = false;
+ }
+ else
+ *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue));
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_BOOLTEST_IS_NOT_FALSE)
+ {
+ if (*op->resnull)
+ {
+ *op->resvalue = BoolGetDatum(true);
+ *op->resnull = false;
+ }
+ /* else, input value is the correct output as well */
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_PARAM_EXEC)
+ {
+ /* out of line implementation: too large */
+ ExecEvalParamExec(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_PARAM_EXTERN)
+ {
+ /* out of line implementation: too large */
+ ExecEvalParamExtern(state, op, econtext);
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_PARAM_CALLBACK)
+ {
+ /* allow an extension module to supply a PARAM_EXTERN value */
+ op->d.cparam.paramfunc(state, op, econtext);
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_CASE_TESTVAL)
+ {
+ /*
+ * Normally upper parts of the expression tree have set up the
+ * values to be returned here, but some parts of the system
+ * currently misuse {caseValue,domainValue}_{datum,isNull} to set
+ * run-time data. So if no values have been set up, use
+ * ExprContext's. This isn't pretty, but also not *that* ugly,
+ * and this is unlikely to be performance sensitive enough to
+ * worry about an extra branch.
+ */
+ if (op->d.casetest.value)
+ {
+ *op->resvalue = *op->d.casetest.value;
+ *op->resnull = *op->d.casetest.isnull;
+ }
+ else
+ {
+ *op->resvalue = econtext->caseValue_datum;
+ *op->resnull = econtext->caseValue_isNull;
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_DOMAIN_TESTVAL)
+ {
+ /*
+ * See EEOP_CASE_TESTVAL comment.
+ */
+ if (op->d.casetest.value)
+ {
+ *op->resvalue = *op->d.casetest.value;
+ *op->resnull = *op->d.casetest.isnull;
+ }
+ else
+ {
+ *op->resvalue = econtext->domainValue_datum;
+ *op->resnull = econtext->domainValue_isNull;
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_MAKE_READONLY)
+ {
+ /*
+ * Force a varlena value that might be read multiple times to R/O
+ */
+ if (!*op->d.make_readonly.isnull)
+ *op->resvalue =
+ MakeExpandedObjectReadOnlyInternal(*op->d.make_readonly.value);
+ *op->resnull = *op->d.make_readonly.isnull;
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_IOCOERCE)
+ {
+ /*
+ * Evaluate a CoerceViaIO node. This can be quite a hot path, so
+ * inline as much work as possible. The source value is in our
+ * result variable.
+ */
+ char *str;
+
+ /* call output function (similar to OutputFunctionCall) */
+ if (*op->resnull)
+ {
+ /* output functions are not called on nulls */
+ str = NULL;
+ }
+ else
+ {
+ FunctionCallInfo fcinfo_out;
+
+ fcinfo_out = op->d.iocoerce.fcinfo_data_out;
+ fcinfo_out->args[0].value = *op->resvalue;
+ fcinfo_out->args[0].isnull = false;
+
+ fcinfo_out->isnull = false;
+ str = DatumGetCString(FunctionCallInvoke(fcinfo_out));
+
+ /* OutputFunctionCall assumes result isn't null */
+ Assert(!fcinfo_out->isnull);
+ }
+
+ /* call input function (similar to InputFunctionCall) */
+ if (!op->d.iocoerce.finfo_in->fn_strict || str != NULL)
+ {
+ FunctionCallInfo fcinfo_in;
+ Datum d;
+
+ fcinfo_in = op->d.iocoerce.fcinfo_data_in;
+ fcinfo_in->args[0].value = PointerGetDatum(str);
+ fcinfo_in->args[0].isnull = *op->resnull;
+ /* second and third arguments are already set up */
+
+ fcinfo_in->isnull = false;
+ d = FunctionCallInvoke(fcinfo_in);
+ *op->resvalue = d;
+
+ /* Should get null result if and only if str is NULL */
+ if (str == NULL)
+ {
+ Assert(*op->resnull);
+ Assert(fcinfo_in->isnull);
+ }
+ else
+ {
+ Assert(!*op->resnull);
+ Assert(!fcinfo_in->isnull);
+ }
+ }
+
+ EEO_NEXT();
+ }
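+
+ /*
+ * As a concrete example of the above: evaluating 42::text goes through
+ * CoerceViaIO, so this step calls int4out() to produce the cstring "42"
+ * and then textin() to form the text result.
+ */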
+
+ EEO_CASE(EEOP_DISTINCT)
+ {
+ /*
+ * IS DISTINCT FROM must evaluate arguments (already done into
+ * fcinfo->args) to determine whether they are NULL; if either is
+ * NULL then the result is determined. If neither is NULL, then
+ * proceed to evaluate the comparison function, which is just the
+ * type's standard equality operator. We need not care whether
+ * that function is strict. Because the handling of nulls is
+ * different, we can't just reuse EEOP_FUNCEXPR.
+ */
+ FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
+
+ /* check function arguments for NULLness */
+ if (fcinfo->args[0].isnull && fcinfo->args[1].isnull)
+ {
+ /* Both NULL? Then is not distinct... */
+ *op->resvalue = BoolGetDatum(false);
+ *op->resnull = false;
+ }
+ else if (fcinfo->args[0].isnull || fcinfo->args[1].isnull)
+ {
+ /* Only one is NULL? Then is distinct... */
+ *op->resvalue = BoolGetDatum(true);
+ *op->resnull = false;
+ }
+ else
+ {
+ /* Neither null, so apply the equality function */
+ Datum eqresult;
+
+ fcinfo->isnull = false;
+ eqresult = op->d.func.fn_addr(fcinfo);
+ /* Must invert result of "="; safe to do even if null */
+ *op->resvalue = BoolGetDatum(!DatumGetBool(eqresult));
+ *op->resnull = fcinfo->isnull;
+ }
+
+ EEO_NEXT();
+ }
+
+ /* see EEOP_DISTINCT for comments, this is just inverted */
+ EEO_CASE(EEOP_NOT_DISTINCT)
+ {
+ FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
+
+ if (fcinfo->args[0].isnull && fcinfo->args[1].isnull)
+ {
+ *op->resvalue = BoolGetDatum(true);
+ *op->resnull = false;
+ }
+ else if (fcinfo->args[0].isnull || fcinfo->args[1].isnull)
+ {
+ *op->resvalue = BoolGetDatum(false);
+ *op->resnull = false;
+ }
+ else
+ {
+ Datum eqresult;
+
+ fcinfo->isnull = false;
+ eqresult = op->d.func.fn_addr(fcinfo);
+ *op->resvalue = eqresult;
+ *op->resnull = fcinfo->isnull;
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_NULLIF)
+ {
+ /*
+ * The arguments are already evaluated into fcinfo->args.
+ */
+ FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
+
+ /* if either argument is NULL they can't be equal */
+ if (!fcinfo->args[0].isnull && !fcinfo->args[1].isnull)
+ {
+ Datum result;
+
+ fcinfo->isnull = false;
+ result = op->d.func.fn_addr(fcinfo);
+
+ /* if the arguments are equal return null */
+ if (!fcinfo->isnull && DatumGetBool(result))
+ {
+ *op->resvalue = (Datum) 0;
+ *op->resnull = true;
+
+ EEO_NEXT();
+ }
+ }
+
+ /* Otherwise (unequal, or either input is NULL) return the first argument */
+ *op->resvalue = fcinfo->args[0].value;
+ *op->resnull = fcinfo->args[0].isnull;
+
+ EEO_NEXT();
+ }
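+
+ /*
+ * E.g. NULLIF(1, 1) yields NULL, NULLIF(1, 2) yields 1, and
+ * NULLIF(NULL, 0) yields NULL simply because the first argument is
+ * returned unchanged.
+ */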
+
+ EEO_CASE(EEOP_SQLVALUEFUNCTION)
+ {
+ /*
+ * Doesn't seem worthwhile to have an inline implementation
+ * efficiency-wise.
+ */
+ ExecEvalSQLValueFunction(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_CURRENTOFEXPR)
+ {
+ /* out of line: the error invocation is bulky, and it shouldn't ever be reached anyway */
+ ExecEvalCurrentOfExpr(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_NEXTVALUEEXPR)
+ {
+ /*
+ * Doesn't seem worthwhile to have an inline implementation
+ * efficiency-wise.
+ */
+ ExecEvalNextValueExpr(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ARRAYEXPR)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalArrayExpr(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ARRAYCOERCE)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalArrayCoerce(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ROW)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalRow(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ROWCOMPARE_STEP)
+ {
+ FunctionCallInfo fcinfo = op->d.rowcompare_step.fcinfo_data;
+ Datum d;
+
+ /* force NULL result if strict fn and NULL input */
+ if (op->d.rowcompare_step.finfo->fn_strict &&
+ (fcinfo->args[0].isnull || fcinfo->args[1].isnull))
+ {
+ *op->resnull = true;
+ EEO_JUMP(op->d.rowcompare_step.jumpnull);
+ }
+
+ /* Apply comparison function */
+ fcinfo->isnull = false;
+ d = op->d.rowcompare_step.fn_addr(fcinfo);
+ *op->resvalue = d;
+
+ /* force NULL result if NULL function result */
+ if (fcinfo->isnull)
+ {
+ *op->resnull = true;
+ EEO_JUMP(op->d.rowcompare_step.jumpnull);
+ }
+ *op->resnull = false;
+
+ /* If unequal, no need to compare remaining columns */
+ if (DatumGetInt32(*op->resvalue) != 0)
+ {
+ EEO_JUMP(op->d.rowcompare_step.jumpdone);
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_ROWCOMPARE_FINAL)
+ {
+ int32 cmpresult = DatumGetInt32(*op->resvalue);
+ RowCompareType rctype = op->d.rowcompare_final.rctype;
+
+ *op->resnull = false;
+ switch (rctype)
+ {
+ /* EQ and NE cases aren't allowed here */
+ case ROWCOMPARE_LT:
+ *op->resvalue = BoolGetDatum(cmpresult < 0);
+ break;
+ case ROWCOMPARE_LE:
+ *op->resvalue = BoolGetDatum(cmpresult <= 0);
+ break;
+ case ROWCOMPARE_GE:
+ *op->resvalue = BoolGetDatum(cmpresult >= 0);
+ break;
+ case ROWCOMPARE_GT:
+ *op->resvalue = BoolGetDatum(cmpresult > 0);
+ break;
+ default:
+ Assert(false);
+ break;
+ }
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_MINMAX)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalMinMax(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_FIELDSELECT)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalFieldSelect(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_FIELDSTORE_DEFORM)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalFieldStoreDeForm(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_FIELDSTORE_FORM)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalFieldStoreForm(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_SBSREF_SUBSCRIPTS)
+ {
+ /* Precheck SubscriptingRef subscript(s) */
+ if (op->d.sbsref_subscript.subscriptfunc(state, op, econtext))
+ {
+ EEO_NEXT();
+ }
+ else
+ {
+ /* Subscript is null, short-circuit SubscriptingRef to NULL */
+ EEO_JUMP(op->d.sbsref_subscript.jumpdone);
+ }
+ }
+
+ EEO_CASE(EEOP_SBSREF_OLD)
+ EEO_CASE(EEOP_SBSREF_ASSIGN)
+ EEO_CASE(EEOP_SBSREF_FETCH)
+ {
+ /* Perform a SubscriptingRef fetch or assignment */
+ op->d.sbsref.subscriptfunc(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_CONVERT_ROWTYPE)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalConvertRowtype(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_SCALARARRAYOP)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalScalarArrayOp(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_HASHED_SCALARARRAYOP)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalHashedScalarArrayOp(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_DOMAIN_NOTNULL)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalConstraintNotNull(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_DOMAIN_CHECK)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalConstraintCheck(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_XMLEXPR)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalXmlExpr(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_AGGREF)
+ {
+ /*
+ * Returns a Datum whose value is the precomputed aggregate value
+ * found in the given expression context.
+ */
+ int aggno = op->d.aggref.aggno;
+
+ Assert(econtext->ecxt_aggvalues != NULL);
+
+ *op->resvalue = econtext->ecxt_aggvalues[aggno];
+ *op->resnull = econtext->ecxt_aggnulls[aggno];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_GROUPING_FUNC)
+ {
+ /* too complex/uncommon for an inline implementation */
+ ExecEvalGroupingFunc(state, op);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_WINDOW_FUNC)
+ {
+ /*
+ * Like Aggref, just return a precomputed value from the econtext.
+ */
+ WindowFuncExprState *wfunc = op->d.window_func.wfstate;
+
+ Assert(econtext->ecxt_aggvalues != NULL);
+
+ *op->resvalue = econtext->ecxt_aggvalues[wfunc->wfuncno];
+ *op->resnull = econtext->ecxt_aggnulls[wfunc->wfuncno];
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_SUBPLAN)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalSubPlan(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ /* evaluate a strict aggregate deserialization function */
+ EEO_CASE(EEOP_AGG_STRICT_DESERIALIZE)
+ {
+ /* Don't call a strict deserialization function with NULL input */
+ if (op->d.agg_deserialize.fcinfo_data->args[0].isnull)
+ EEO_JUMP(op->d.agg_deserialize.jumpnull);
+
+ /* fallthrough */
+ }
+
+ /* evaluate aggregate deserialization function (non-strict portion) */
+ EEO_CASE(EEOP_AGG_DESERIALIZE)
+ {
+ FunctionCallInfo fcinfo = op->d.agg_deserialize.fcinfo_data;
+ AggState *aggstate = castNode(AggState, state->parent);
+ MemoryContext oldContext;
+
+ /*
+ * We run the deserialization functions in per-input-tuple memory
+ * context.
+ */
+ oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);
+ fcinfo->isnull = false;
+ *op->resvalue = FunctionCallInvoke(fcinfo);
+ *op->resnull = fcinfo->isnull;
+ MemoryContextSwitchTo(oldContext);
+
+ EEO_NEXT();
+ }
+
+ /*
+ * Check that a strict aggregate transition / combination function's
+ * input is not NULL.
+ */
+
+ EEO_CASE(EEOP_AGG_STRICT_INPUT_CHECK_ARGS)
+ {
+ NullableDatum *args = op->d.agg_strict_input_check.args;
+ int nargs = op->d.agg_strict_input_check.nargs;
+
+ for (int argno = 0; argno < nargs; argno++)
+ {
+ if (args[argno].isnull)
+ EEO_JUMP(op->d.agg_strict_input_check.jumpnull);
+ }
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_AGG_STRICT_INPUT_CHECK_NULLS)
+ {
+ bool *nulls = op->d.agg_strict_input_check.nulls;
+ int nargs = op->d.agg_strict_input_check.nargs;
+
+ for (int argno = 0; argno < nargs; argno++)
+ {
+ if (nulls[argno])
+ EEO_JUMP(op->d.agg_strict_input_check.jumpnull);
+ }
+ EEO_NEXT();
+ }
+
+ /*
+ * Check for a NULL pointer to the per-group states.
+ */
+
+ EEO_CASE(EEOP_AGG_PLAIN_PERGROUP_NULLCHECK)
+ {
+ AggState *aggstate = castNode(AggState, state->parent);
+ AggStatePerGroup pergroup_allaggs =
+ aggstate->all_pergroups[op->d.agg_plain_pergroup_nullcheck.setoff];
+
+ if (pergroup_allaggs == NULL)
+ EEO_JUMP(op->d.agg_plain_pergroup_nullcheck.jumpnull);
+
+ EEO_NEXT();
+ }
+
+ /*
+ * Different types of aggregate transition functions are implemented
+ * as different types of steps, to avoid incurring unnecessary
+ * overhead. There's a step type for each valid combination of having
+ * a by value / by reference transition type, [not] needing to the
+ * initialize the transition value for the first row in a group from
+ * input, and [not] strict transition function.
+ *
+ * Could optimize further by splitting off by-reference for
+ * fixed-length types, but currently that doesn't seem worth it.
+ */
+
+ EEO_CASE(EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL)
+ {
+ AggState *aggstate = castNode(AggState, state->parent);
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ AggStatePerGroup pergroup =
+ &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno];
+
+ Assert(pertrans->transtypeByVal);
+
+ if (pergroup->noTransValue)
+ {
+ /* If transValue has not yet been initialized, do so now. */
+ ExecAggInitGroup(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext);
+ /* copied trans value from input, done this round */
+ }
+ else if (likely(!pergroup->transValueIsNull))
+ {
+ /* invoke transition function, unless prevented by strictness */
+ ExecAggPlainTransByVal(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext,
+ op->d.agg_trans.setno);
+ }
+
+ EEO_NEXT();
+ }
+
+ /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */
+ EEO_CASE(EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL)
+ {
+ AggState *aggstate = castNode(AggState, state->parent);
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ AggStatePerGroup pergroup =
+ &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno];
+
+ Assert(pertrans->transtypeByVal);
+
+ if (likely(!pergroup->transValueIsNull))
+ ExecAggPlainTransByVal(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext,
+ op->d.agg_trans.setno);
+
+ EEO_NEXT();
+ }
+
+ /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */
+ EEO_CASE(EEOP_AGG_PLAIN_TRANS_BYVAL)
+ {
+ AggState *aggstate = castNode(AggState, state->parent);
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ AggStatePerGroup pergroup =
+ &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno];
+
+ Assert(pertrans->transtypeByVal);
+
+ ExecAggPlainTransByVal(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext,
+ op->d.agg_trans.setno);
+
+ EEO_NEXT();
+ }
+
+ /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */
+ EEO_CASE(EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF)
+ {
+ AggState *aggstate = castNode(AggState, state->parent);
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ AggStatePerGroup pergroup =
+ &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno];
+
+ Assert(!pertrans->transtypeByVal);
+
+ if (pergroup->noTransValue)
+ ExecAggInitGroup(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext);
+ else if (likely(!pergroup->transValueIsNull))
+ ExecAggPlainTransByRef(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext,
+ op->d.agg_trans.setno);
+
+ EEO_NEXT();
+ }
+
+ /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */
+ EEO_CASE(EEOP_AGG_PLAIN_TRANS_STRICT_BYREF)
+ {
+ AggState *aggstate = castNode(AggState, state->parent);
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ AggStatePerGroup pergroup =
+ &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno];
+
+ Assert(!pertrans->transtypeByVal);
+
+ if (likely(!pergroup->transValueIsNull))
+ ExecAggPlainTransByRef(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext,
+ op->d.agg_trans.setno);
+ EEO_NEXT();
+ }
+
+ /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */
+ EEO_CASE(EEOP_AGG_PLAIN_TRANS_BYREF)
+ {
+ AggState *aggstate = castNode(AggState, state->parent);
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ AggStatePerGroup pergroup =
+ &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno];
+
+ Assert(!pertrans->transtypeByVal);
+
+ ExecAggPlainTransByRef(aggstate, pertrans, pergroup,
+ op->d.agg_trans.aggcontext,
+ op->d.agg_trans.setno);
+
+ EEO_NEXT();
+ }
+
+ /* process single-column ordered aggregate datum */
+ EEO_CASE(EEOP_AGG_ORDERED_TRANS_DATUM)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalAggOrderedTransDatum(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ /* process multi-column ordered aggregate tuple */
+ EEO_CASE(EEOP_AGG_ORDERED_TRANS_TUPLE)
+ {
+ /* too complex for an inline implementation */
+ ExecEvalAggOrderedTransTuple(state, op, econtext);
+
+ EEO_NEXT();
+ }
+
+ EEO_CASE(EEOP_LAST)
+ {
+ /* unreachable */
+ Assert(false);
+ goto out;
+ }
+ }
+
+out:
+ *isnull = state->resnull;
+ return state->resvalue;
+}
+
+/*
+ * Expression evaluation callback that performs extra checks before executing
+ * the expression. Declared extern so other methods of execution can use it
+ * too.
+ */
+Datum
+ExecInterpExprStillValid(ExprState *state, ExprContext *econtext, bool *isNull)
+{
+ /*
+ * First time through, check whether attribute matches Var. Might not be
+ * ok anymore, due to schema changes.
+ */
+ CheckExprStillValid(state, econtext);
+
+ /* skip the check during further executions */
+ state->evalfunc = (ExprStateEvalFunc) state->evalfunc_private;
+
+ /* and actually execute */
+ return state->evalfunc(state, econtext, isNull);
+}
+
+/*
+ * Check that an expression is still valid in the face of potential schema
+ * changes since the plan has been created.
+ */
+void
+CheckExprStillValid(ExprState *state, ExprContext *econtext)
+{
+ TupleTableSlot *innerslot;
+ TupleTableSlot *outerslot;
+ TupleTableSlot *scanslot;
+
+ innerslot = econtext->ecxt_innertuple;
+ outerslot = econtext->ecxt_outertuple;
+ scanslot = econtext->ecxt_scantuple;
+
+ for (int i = 0; i < state->steps_len; i++)
+ {
+ ExprEvalStep *op = &state->steps[i];
+
+ switch (ExecEvalStepOp(state, op))
+ {
+ case EEOP_INNER_VAR:
+ {
+ int attnum = op->d.var.attnum;
+
+ CheckVarSlotCompatibility(innerslot, attnum + 1, op->d.var.vartype);
+ break;
+ }
+
+ case EEOP_OUTER_VAR:
+ {
+ int attnum = op->d.var.attnum;
+
+ CheckVarSlotCompatibility(outerslot, attnum + 1, op->d.var.vartype);
+ break;
+ }
+
+ case EEOP_SCAN_VAR:
+ {
+ int attnum = op->d.var.attnum;
+
+ CheckVarSlotCompatibility(scanslot, attnum + 1, op->d.var.vartype);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * Check whether a user attribute in a slot can be referenced by a Var
+ * expression. This should succeed unless there have been schema changes
+ * since the expression tree has been created.
+ */
+static void
+CheckVarSlotCompatibility(TupleTableSlot *slot, int attnum, Oid vartype)
+{
+ /*
+ * What we have to check for here is the possibility of an attribute
+ * having been dropped or changed in type since the plan tree was created.
+ * Ideally the plan will get invalidated and not re-used, but just in
+ * case, we keep these defenses. Fortunately it's sufficient to check
+ * once on the first time through.
+ *
+ * Note: ideally we'd check typmod as well as typid, but that seems
+ * impractical at the moment: in many cases the tupdesc will have been
+ * generated by ExecTypeFromTL(), and that can't guarantee to generate an
+ * accurate typmod in all cases, because some expression node types don't
+ * carry typmod. Fortunately, for precisely that reason, there should be
+ * no places with a critical dependency on the typmod of a value.
+ *
+ * System attributes don't require checking since their types never
+ * change.
+ */
+ if (attnum > 0)
+ {
+ TupleDesc slot_tupdesc = slot->tts_tupleDescriptor;
+ Form_pg_attribute attr;
+
+ if (attnum > slot_tupdesc->natts) /* should never happen */
+ elog(ERROR, "attribute number %d exceeds number of columns %d",
+ attnum, slot_tupdesc->natts);
+
+ attr = TupleDescAttr(slot_tupdesc, attnum - 1);
+
+ if (attr->attisdropped)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("attribute %d of type %s has been dropped",
+ attnum, format_type_be(slot_tupdesc->tdtypeid))));
+
+ if (vartype != attr->atttypid)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("attribute %d of type %s has wrong type",
+ attnum, format_type_be(slot_tupdesc->tdtypeid)),
+ errdetail("Table has type %s, but query expects %s.",
+ format_type_be(attr->atttypid),
+ format_type_be(vartype))));
+ }
+}
+
+/*
+ * Verify that the slot is compatible with a EEOP_*_FETCHSOME operation.
+ */
+static void
+CheckOpSlotCompatibility(ExprEvalStep *op, TupleTableSlot *slot)
+{
+#ifdef USE_ASSERT_CHECKING
+ /* there's nothing to check */
+ if (!op->d.fetch.fixed)
+ return;
+
+ /*
+ * This should probably be fixed at some point, but for now it's easier to
+ * allow buffer and heap tuples to be used interchangeably.
+ */
+ if (slot->tts_ops == &TTSOpsBufferHeapTuple &&
+ op->d.fetch.kind == &TTSOpsHeapTuple)
+ return;
+ if (slot->tts_ops == &TTSOpsHeapTuple &&
+ op->d.fetch.kind == &TTSOpsBufferHeapTuple)
+ return;
+
+ /*
+ * At the moment we consider it OK if a virtual slot is used instead of a
+ * specific type of slot, as a virtual slot never needs to be deformed.
+ */
+ if (slot->tts_ops == &TTSOpsVirtual)
+ return;
+
+ Assert(op->d.fetch.kind == slot->tts_ops);
+#endif
+}
+
+/*
+ * get_cached_rowtype: utility function to lookup a rowtype tupdesc
+ *
+ * type_id, typmod: identity of the rowtype
+ * rowcache: space for caching identity info
+ * (rowcache->cacheptr must be initialized to NULL)
+ * changed: if not NULL, *changed is set to true on any update
+ *
+ * The returned TupleDesc is not guaranteed pinned; caller must pin it
+ * to use it across any operation that might incur cache invalidation.
+ * (The TupleDesc is always refcounted, so just use IncrTupleDescRefCount.)
+ *
+ * NOTE: because composite types can change contents, we must be prepared
+ * to re-do this during any node execution; cannot call just once during
+ * expression initialization.
+ */
+static TupleDesc
+get_cached_rowtype(Oid type_id, int32 typmod,
+ ExprEvalRowtypeCache *rowcache,
+ bool *changed)
+{
+ if (type_id != RECORDOID)
+ {
+ /*
+ * It's a named composite type, so use the regular typcache. Do a
+ * lookup first time through, or if the composite type changed. Note:
+ * "tupdesc_id == 0" may look redundant, but it protects against the
+ * admittedly-theoretical possibility that type_id was RECORDOID the
+ * last time through, so that the cacheptr isn't TypeCacheEntry *.
+ */
+ TypeCacheEntry *typentry = (TypeCacheEntry *) rowcache->cacheptr;
+
+ if (unlikely(typentry == NULL ||
+ rowcache->tupdesc_id == 0 ||
+ typentry->tupDesc_identifier != rowcache->tupdesc_id))
+ {
+ typentry = lookup_type_cache(type_id, TYPECACHE_TUPDESC);
+ if (typentry->tupDesc == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("type %s is not composite",
+ format_type_be(type_id))));
+ rowcache->cacheptr = (void *) typentry;
+ rowcache->tupdesc_id = typentry->tupDesc_identifier;
+ if (changed)
+ *changed = true;
+ }
+ return typentry->tupDesc;
+ }
+ else
+ {
+ /*
+ * A RECORD type, once registered, doesn't change for the life of the
+ * backend. So we don't need a typcache entry as such, which is good
+ * because there isn't one. It's possible that the caller is asking
+ * about a different type than before, though.
+ */
+ TupleDesc tupDesc = (TupleDesc) rowcache->cacheptr;
+
+ if (unlikely(tupDesc == NULL ||
+ rowcache->tupdesc_id != 0 ||
+ type_id != tupDesc->tdtypeid ||
+ typmod != tupDesc->tdtypmod))
+ {
+ tupDesc = lookup_rowtype_tupdesc(type_id, typmod);
+ /* Drop pin acquired by lookup_rowtype_tupdesc */
+ ReleaseTupleDesc(tupDesc);
+ rowcache->cacheptr = (void *) tupDesc;
+ rowcache->tupdesc_id = 0; /* not a valid value for non-RECORD */
+ if (changed)
+ *changed = true;
+ }
+ return tupDesc;
+ }
+}
+
+
+/*
+ * Fast-path functions, for very simple expressions
+ */
+
+/* implementation of ExecJust(Inner|Outer|Scan)Var */
+static pg_attribute_always_inline Datum
+ExecJustVarImpl(ExprState *state, TupleTableSlot *slot, bool *isnull)
+{
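+ /*
+ * This fast path assumes steps[0] is the EEOP_*_FETCHSOME step and
+ * steps[1] the Var step itself. op->d.var.attnum is zero-based, while
+ * slot_getattr() expects a 1-based attribute number.
+ */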
+ ExprEvalStep *op = &state->steps[1];
+ int attnum = op->d.var.attnum + 1;
+
+ CheckOpSlotCompatibility(&state->steps[0], slot);
+
+ /*
+ * Since we use slot_getattr(), we don't need to implement the FETCHSOME
+ * step explicitly, and we also needn't Assert that the attnum is in range
+ * --- slot_getattr() will take care of any problems.
+ */
+ return slot_getattr(slot, attnum, isnull);
+}
+
+/* Simple reference to inner Var */
+static Datum
+ExecJustInnerVar(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustVarImpl(state, econtext->ecxt_innertuple, isnull);
+}
+
+/* Simple reference to outer Var */
+static Datum
+ExecJustOuterVar(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustVarImpl(state, econtext->ecxt_outertuple, isnull);
+}
+
+/* Simple reference to scan Var */
+static Datum
+ExecJustScanVar(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustVarImpl(state, econtext->ecxt_scantuple, isnull);
+}
+
+/* implementation of ExecJustAssign(Inner|Outer|Scan)Var */
+static pg_attribute_always_inline Datum
+ExecJustAssignVarImpl(ExprState *state, TupleTableSlot *inslot, bool *isnull)
+{
+ ExprEvalStep *op = &state->steps[1];
+ int attnum = op->d.assign_var.attnum + 1;
+ int resultnum = op->d.assign_var.resultnum;
+ TupleTableSlot *outslot = state->resultslot;
+
+ CheckOpSlotCompatibility(&state->steps[0], inslot);
+
+ /*
+ * We do not need CheckVarSlotCompatibility here; that was taken care of
+ * at compilation time.
+ *
+ * Since we use slot_getattr(), we don't need to implement the FETCHSOME
+ * step explicitly, and we also needn't Assert that the attnum is in range
+ * --- slot_getattr() will take care of any problems. Nonetheless, check
+ * that resultnum is in range.
+ */
+ Assert(resultnum >= 0 && resultnum < outslot->tts_tupleDescriptor->natts);
+ outslot->tts_values[resultnum] =
+ slot_getattr(inslot, attnum, &outslot->tts_isnull[resultnum]);
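+ /* The returned Datum is a dummy; the result slot now holds the output. */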
+ return 0;
+}
+
+/* Evaluate inner Var and assign to appropriate column of result tuple */
+static Datum
+ExecJustAssignInnerVar(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustAssignVarImpl(state, econtext->ecxt_innertuple, isnull);
+}
+
+/* Evaluate outer Var and assign to appropriate column of result tuple */
+static Datum
+ExecJustAssignOuterVar(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustAssignVarImpl(state, econtext->ecxt_outertuple, isnull);
+}
+
+/* Evaluate scan Var and assign to appropriate column of result tuple */
+static Datum
+ExecJustAssignScanVar(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustAssignVarImpl(state, econtext->ecxt_scantuple, isnull);
+}
+
+/* Evaluate CASE_TESTVAL and apply a strict function to it */
+static Datum
+ExecJustApplyFuncToCase(ExprState *state, ExprContext *econtext, bool *isnull)
+{
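+ /* steps[0] is the CASE_TESTVAL step, steps[1] the strict function call */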
+ ExprEvalStep *op = &state->steps[0];
+ FunctionCallInfo fcinfo;
+ NullableDatum *args;
+ int nargs;
+ Datum d;
+
+ /*
+ * XXX with some redesign of the CaseTestExpr mechanism, maybe we could
+ * get rid of this data shuffling?
+ */
+ *op->resvalue = *op->d.casetest.value;
+ *op->resnull = *op->d.casetest.isnull;
+
+ op++;
+
+ nargs = op->d.func.nargs;
+ fcinfo = op->d.func.fcinfo_data;
+ args = fcinfo->args;
+
+ /* strict function, so check for NULL args */
+ for (int argno = 0; argno < nargs; argno++)
+ {
+ if (args[argno].isnull)
+ {
+ *isnull = true;
+ return (Datum) 0;
+ }
+ }
+ fcinfo->isnull = false;
+ d = op->d.func.fn_addr(fcinfo);
+ *isnull = fcinfo->isnull;
+ return d;
+}
+
+/* Simple Const expression */
+static Datum
+ExecJustConst(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ ExprEvalStep *op = &state->steps[0];
+
+ *isnull = op->d.constval.isnull;
+ return op->d.constval.value;
+}
+
+/* implementation of ExecJust(Inner|Outer|Scan)VarVirt */
+static pg_attribute_always_inline Datum
+ExecJustVarVirtImpl(ExprState *state, TupleTableSlot *slot, bool *isnull)
+{
+ ExprEvalStep *op = &state->steps[0];
+ int attnum = op->d.var.attnum;
+
+ /*
+ * As it is guaranteed that a virtual slot is used, there never is a need
+ * to perform tuple deforming (nor would it be possible). Therefore
+ * execExpr.c has not emitted an EEOP_*_FETCHSOME step. Verify, as much as
+ * possible, that that determination was accurate.
+ */
+ Assert(TTS_IS_VIRTUAL(slot));
+ Assert(TTS_FIXED(slot));
+ Assert(attnum >= 0 && attnum < slot->tts_nvalid);
+
+ *isnull = slot->tts_isnull[attnum];
+
+ return slot->tts_values[attnum];
+}
+
+/* Like ExecJustInnerVar, optimized for virtual slots */
+static Datum
+ExecJustInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustVarVirtImpl(state, econtext->ecxt_innertuple, isnull);
+}
+
+/* Like ExecJustOuterVar, optimized for virtual slots */
+static Datum
+ExecJustOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustVarVirtImpl(state, econtext->ecxt_outertuple, isnull);
+}
+
+/* Like ExecJustScanVar, optimized for virtual slots */
+static Datum
+ExecJustScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustVarVirtImpl(state, econtext->ecxt_scantuple, isnull);
+}
+
+/* implementation of ExecJustAssign(Inner|Outer|Scan)VarVirt */
+static pg_attribute_always_inline Datum
+ExecJustAssignVarVirtImpl(ExprState *state, TupleTableSlot *inslot, bool *isnull)
+{
+ ExprEvalStep *op = &state->steps[0];
+ int attnum = op->d.assign_var.attnum;
+ int resultnum = op->d.assign_var.resultnum;
+ TupleTableSlot *outslot = state->resultslot;
+
+ /* see ExecJustVarVirtImpl for comments */
+
+ Assert(TTS_IS_VIRTUAL(inslot));
+ Assert(TTS_FIXED(inslot));
+ Assert(attnum >= 0 && attnum < inslot->tts_nvalid);
+ Assert(resultnum >= 0 && resultnum < outslot->tts_tupleDescriptor->natts);
+
+ outslot->tts_values[resultnum] = inslot->tts_values[attnum];
+ outslot->tts_isnull[resultnum] = inslot->tts_isnull[attnum];
+
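+ /* As in ExecJustAssignVarImpl, the returned Datum is a dummy value. */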
+ return 0;
+}
+
+/* Like ExecJustAssignInnerVar, optimized for virtual slots */
+static Datum
+ExecJustAssignInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustAssignVarVirtImpl(state, econtext->ecxt_innertuple, isnull);
+}
+
+/* Like ExecJustAssignOuterVar, optimized for virtual slots */
+static Datum
+ExecJustAssignOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustAssignVarVirtImpl(state, econtext->ecxt_outertuple, isnull);
+}
+
+/* Like ExecJustAssignScanVar, optimized for virtual slots */
+static Datum
+ExecJustAssignScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull)
+{
+ return ExecJustAssignVarVirtImpl(state, econtext->ecxt_scantuple, isnull);
+}
+
+#if defined(EEO_USE_COMPUTED_GOTO)
+/*
+ * Comparator used when building address->opcode lookup table for
+ * ExecEvalStepOp() in the threaded dispatch case.
+ */
+static int
+dispatch_compare_ptr(const void *a, const void *b)
+{
+ const ExprEvalOpLookup *la = (const ExprEvalOpLookup *) a;
+ const ExprEvalOpLookup *lb = (const ExprEvalOpLookup *) b;
+
+ if (la->opcode < lb->opcode)
+ return -1;
+ else if (la->opcode > lb->opcode)
+ return 1;
+ return 0;
+}
+#endif
+
+/*
+ * Do one-time initialization of interpretation machinery.
+ */
+static void
+ExecInitInterpreter(void)
+{
+#if defined(EEO_USE_COMPUTED_GOTO)
+ /* Set up externally-visible pointer to dispatch table */
+ if (dispatch_table == NULL)
+ {
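+ /*
+ * Calling ExecInterpExpr with a NULL ExprState doesn't evaluate
+ * anything; by convention it returns the address of the interpreter's
+ * computed-goto dispatch table instead.
+ */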
+ dispatch_table = (const void **)
+ DatumGetPointer(ExecInterpExpr(NULL, NULL, NULL));
+
+ /* build reverse lookup table */
+ for (int i = 0; i < EEOP_LAST; i++)
+ {
+ reverse_dispatch_table[i].opcode = dispatch_table[i];
+ reverse_dispatch_table[i].op = (ExprEvalOp) i;
+ }
+
+ /* make it bsearch()able */
+ qsort(reverse_dispatch_table,
+ EEOP_LAST /* nmembers */ ,
+ sizeof(ExprEvalOpLookup),
+ dispatch_compare_ptr);
+ }
+#endif
+}
+
+/*
+ * Function to return the opcode of an expression step.
+ *
+ * When direct-threading is in use, ExprEvalStep->opcode isn't easily
+ * decipherable. This function returns the appropriate enum member.
+ */
+ExprEvalOp
+ExecEvalStepOp(ExprState *state, ExprEvalStep *op)
+{
+#if defined(EEO_USE_COMPUTED_GOTO)
+ if (state->flags & EEO_FLAG_DIRECT_THREADED)
+ {
+ ExprEvalOpLookup key;
+ ExprEvalOpLookup *res;
+
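+ /* op->opcode here holds a dispatch-table address, not an ExprEvalOp value */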
+ key.opcode = (void *) op->opcode;
+ res = bsearch(&key,
+ reverse_dispatch_table,
+ EEOP_LAST /* nmembers */ ,
+ sizeof(ExprEvalOpLookup),
+ dispatch_compare_ptr);
+ Assert(res); /* unknown ops shouldn't get looked up */
+ return res->op;
+ }
+#endif
+ return (ExprEvalOp) op->opcode;
+}
+
+
+/*
+ * Out-of-line helper functions for complex instructions.
+ */
+
+/*
+ * Evaluate EEOP_FUNCEXPR_FUSAGE
+ */
+void
+ExecEvalFuncExprFusage(ExprState *state, ExprEvalStep *op,
+ ExprContext *econtext)
+{
+ FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
+ PgStat_FunctionCallUsage fcusage;
+ Datum d;
+
+ pgstat_init_function_usage(fcinfo, &fcusage);
+
+ fcinfo->isnull = false;
+ d = op->d.func.fn_addr(fcinfo);
+ *op->resvalue = d;
+ *op->resnull = fcinfo->isnull;
+
+ pgstat_end_function_usage(&fcusage, true);
+}
+
+/*
+ * Evaluate EEOP_FUNCEXPR_STRICT_FUSAGE
+ */
+void
+ExecEvalFuncExprStrictFusage(ExprState *state, ExprEvalStep *op,
+ ExprContext *econtext)
+{
+ FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
+ PgStat_FunctionCallUsage fcusage;
+ NullableDatum *args = fcinfo->args;
+ int nargs = op->d.func.nargs;
+ Datum d;
+
+ /* strict function, so check for NULL args */
+ for (int argno = 0; argno < nargs; argno++)
+ {
+ if (args[argno].isnull)
+ {
+ *op->resnull = true;
+ return;
+ }
+ }
+
+ pgstat_init_function_usage(fcinfo, &fcusage);
+
+ fcinfo->isnull = false;
+ d = op->d.func.fn_addr(fcinfo);
+ *op->resvalue = d;
+ *op->resnull = fcinfo->isnull;
+
+ pgstat_end_function_usage(&fcusage, true);
+}
+
+/*
+ * Evaluate a PARAM_EXEC parameter.
+ *
+ * PARAM_EXEC params (internal executor parameters) are stored in the
+ * ecxt_param_exec_vals array, and can be accessed by array index.
+ */
+void
+ExecEvalParamExec(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ ParamExecData *prm;
+
+ prm = &(econtext->ecxt_param_exec_vals[op->d.param.paramid]);
+ if (unlikely(prm->execPlan != NULL))
+ {
+ /* Parameter not evaluated yet, so go do it */
+ ExecSetParamPlan(prm->execPlan, econtext);
+ /* ExecSetParamPlan should have processed this param... */
+ Assert(prm->execPlan == NULL);
+ }
+ *op->resvalue = prm->value;
+ *op->resnull = prm->isnull;
+}
+
+/*
+ * Evaluate a PARAM_EXTERN parameter.
+ *
+ * PARAM_EXTERN parameters must be sought in ecxt_param_list_info.
+ */
+void
+ExecEvalParamExtern(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ ParamListInfo paramInfo = econtext->ecxt_param_list_info;
+ int paramId = op->d.param.paramid;
+
+ if (likely(paramInfo &&
+ paramId > 0 && paramId <= paramInfo->numParams))
+ {
+ ParamExternData *prm;
+ ParamExternData prmdata;
+
+ /* give hook a chance in case parameter is dynamic */
+ if (paramInfo->paramFetch != NULL)
+ prm = paramInfo->paramFetch(paramInfo, paramId, false, &prmdata);
+ else
+ prm = &paramInfo->params[paramId - 1];
+
+ if (likely(OidIsValid(prm->ptype)))
+ {
+ /* safety check in case hook did something unexpected */
+ if (unlikely(prm->ptype != op->d.param.paramtype))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("type of parameter %d (%s) does not match that when preparing the plan (%s)",
+ paramId,
+ format_type_be(prm->ptype),
+ format_type_be(op->d.param.paramtype))));
+ *op->resvalue = prm->value;
+ *op->resnull = prm->isnull;
+ return;
+ }
+ }
+
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("no value found for parameter %d", paramId)));
+}
+
+/*
+ * Evaluate a SQLValueFunction expression.
+ */
+void
+ExecEvalSQLValueFunction(ExprState *state, ExprEvalStep *op)
+{
+ LOCAL_FCINFO(fcinfo, 0);
+ SQLValueFunction *svf = op->d.sqlvaluefunction.svf;
+
+ *op->resnull = false;
+
+ /*
+ * Note: current_schema() can return NULL. current_user() etc currently
+ * cannot, but might as well code those cases the same way for safety.
+ */
+ switch (svf->op)
+ {
+ case SVFOP_CURRENT_DATE:
+ *op->resvalue = DateADTGetDatum(GetSQLCurrentDate());
+ break;
+ case SVFOP_CURRENT_TIME:
+ case SVFOP_CURRENT_TIME_N:
+ *op->resvalue = TimeTzADTPGetDatum(GetSQLCurrentTime(svf->typmod));
+ break;
+ case SVFOP_CURRENT_TIMESTAMP:
+ case SVFOP_CURRENT_TIMESTAMP_N:
+ *op->resvalue = TimestampTzGetDatum(GetSQLCurrentTimestamp(svf->typmod));
+ break;
+ case SVFOP_LOCALTIME:
+ case SVFOP_LOCALTIME_N:
+ *op->resvalue = TimeADTGetDatum(GetSQLLocalTime(svf->typmod));
+ break;
+ case SVFOP_LOCALTIMESTAMP:
+ case SVFOP_LOCALTIMESTAMP_N:
+ *op->resvalue = TimestampGetDatum(GetSQLLocalTimestamp(svf->typmod));
+ break;
+ case SVFOP_CURRENT_ROLE:
+ case SVFOP_CURRENT_USER:
+ case SVFOP_USER:
+ InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+ *op->resvalue = current_user(fcinfo);
+ *op->resnull = fcinfo->isnull;
+ break;
+ case SVFOP_SESSION_USER:
+ InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+ *op->resvalue = session_user(fcinfo);
+ *op->resnull = fcinfo->isnull;
+ break;
+ case SVFOP_CURRENT_CATALOG:
+ InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+ *op->resvalue = current_database(fcinfo);
+ *op->resnull = fcinfo->isnull;
+ break;
+ case SVFOP_CURRENT_SCHEMA:
+ InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+ *op->resvalue = current_schema(fcinfo);
+ *op->resnull = fcinfo->isnull;
+ break;
+ }
+}
+
+/*
+ * Raise error if a CURRENT OF expression is evaluated.
+ *
+ * The planner should convert CURRENT OF into a TidScan qualification, or some
+ * other special handling in a ForeignScan node. So we have to be able to do
+ * ExecInitExpr on a CurrentOfExpr, but we shouldn't ever actually execute it.
+ * If we get here, we suppose we must be dealing with CURRENT OF on a foreign
+ * table whose FDW doesn't handle it, and complain accordingly.
+ */
+void
+ExecEvalCurrentOfExpr(ExprState *state, ExprEvalStep *op)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("WHERE CURRENT OF is not supported for this table type")));
+}
+
+/*
+ * Evaluate NextValueExpr.
+ */
+void
+ExecEvalNextValueExpr(ExprState *state, ExprEvalStep *op)
+{
+ int64 newval = nextval_internal(op->d.nextvalueexpr.seqid, false);
+
+ switch (op->d.nextvalueexpr.seqtypid)
+ {
+ case INT2OID:
+ *op->resvalue = Int16GetDatum((int16) newval);
+ break;
+ case INT4OID:
+ *op->resvalue = Int32GetDatum((int32) newval);
+ break;
+ case INT8OID:
+ *op->resvalue = Int64GetDatum((int64) newval);
+ break;
+ default:
+ elog(ERROR, "unsupported sequence type %u",
+ op->d.nextvalueexpr.seqtypid);
+ }
+ *op->resnull = false;
+}
+
+/*
+ * Evaluate NullTest / IS NULL for rows.
+ */
+void
+ExecEvalRowNull(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ ExecEvalRowNullInt(state, op, econtext, true);
+}
+
+/*
+ * Evaluate NullTest / IS NOT NULL for rows.
+ */
+void
+ExecEvalRowNotNull(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ ExecEvalRowNullInt(state, op, econtext, false);
+}
+
+/* Common code for IS [NOT] NULL on a row value */
+static void
+ExecEvalRowNullInt(ExprState *state, ExprEvalStep *op,
+ ExprContext *econtext, bool checkisnull)
+{
+ Datum value = *op->resvalue;
+ bool isnull = *op->resnull;
+ HeapTupleHeader tuple;
+ Oid tupType;
+ int32 tupTypmod;
+ TupleDesc tupDesc;
+ HeapTupleData tmptup;
+
+ *op->resnull = false;
+
+ /* NULL row variables are treated just as NULL scalar columns */
+ if (isnull)
+ {
+ *op->resvalue = BoolGetDatum(checkisnull);
+ return;
+ }
+
+ /*
+ * The SQL standard defines IS [NOT] NULL for a non-null rowtype argument
+ * as:
+ *
+ * "R IS NULL" is true if every field is the null value.
+ *
+ * "R IS NOT NULL" is true if no field is the null value.
+ *
+ * This definition is (apparently intentionally) not recursive; so our
+ * tests on the fields are primitive attisnull tests, not recursive checks
+ * to see if they are all-nulls or no-nulls rowtypes.
+ *
+ * The standard does not consider the possibility of zero-field rows, but
+ * here we consider them to vacuously satisfy both predicates.
+ */
+
+ tuple = DatumGetHeapTupleHeader(value);
+
+ tupType = HeapTupleHeaderGetTypeId(tuple);
+ tupTypmod = HeapTupleHeaderGetTypMod(tuple);
+
+ /* Lookup tupdesc if first time through or if type changes */
+ tupDesc = get_cached_rowtype(tupType, tupTypmod,
+ &op->d.nulltest_row.rowcache, NULL);
+
+ /*
+ * heap_attisnull needs a HeapTuple not a bare HeapTupleHeader.
+ */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple);
+ tmptup.t_data = tuple;
+
+ for (int att = 1; att <= tupDesc->natts; att++)
+ {
+ /* ignore dropped columns */
+ if (TupleDescAttr(tupDesc, att - 1)->attisdropped)
+ continue;
+ if (heap_attisnull(&tmptup, att, tupDesc))
+ {
+ /* null field disproves IS NOT NULL */
+ if (!checkisnull)
+ {
+ *op->resvalue = BoolGetDatum(false);
+ return;
+ }
+ }
+ else
+ {
+ /* non-null field disproves IS NULL */
+ if (checkisnull)
+ {
+ *op->resvalue = BoolGetDatum(false);
+ return;
+ }
+ }
+ }
+
+ *op->resvalue = BoolGetDatum(true);
+}
+
+/*
+ * Evaluate an ARRAY[] expression.
+ *
+ * The individual array elements (or subarrays) have already been evaluated
+ * into op->d.arrayexpr.elemvalues[]/elemnulls[].
+ */
+void
+ExecEvalArrayExpr(ExprState *state, ExprEvalStep *op)
+{
+ ArrayType *result;
+ Oid element_type = op->d.arrayexpr.elemtype;
+ int nelems = op->d.arrayexpr.nelems;
+ int ndims = 0;
+ int dims[MAXDIM];
+ int lbs[MAXDIM];
+
+ /* Set non-null as default */
+ *op->resnull = false;
+
+ if (!op->d.arrayexpr.multidims)
+ {
+ /* Elements are presumably of scalar type */
+ Datum *dvalues = op->d.arrayexpr.elemvalues;
+ bool *dnulls = op->d.arrayexpr.elemnulls;
+
+ /* setup for 1-D array of the given length */
+ ndims = 1;
+ dims[0] = nelems;
+ lbs[0] = 1;
+
+ result = construct_md_array(dvalues, dnulls, ndims, dims, lbs,
+ element_type,
+ op->d.arrayexpr.elemlength,
+ op->d.arrayexpr.elembyval,
+ op->d.arrayexpr.elemalign);
+ }
+ else
+ {
+ /* Must be nested array expressions */
+ int nbytes = 0;
+ int nitems = 0;
+ int outer_nelems = 0;
+ int elem_ndims = 0;
+ int *elem_dims = NULL;
+ int *elem_lbs = NULL;
+ bool firstone = true;
+ bool havenulls = false;
+ bool haveempty = false;
+ char **subdata;
+ bits8 **subbitmaps;
+ int *subbytes;
+ int *subnitems;
+ int32 dataoffset;
+ char *dat;
+ int iitem;
+
+ subdata = (char **) palloc(nelems * sizeof(char *));
+ subbitmaps = (bits8 **) palloc(nelems * sizeof(bits8 *));
+ subbytes = (int *) palloc(nelems * sizeof(int));
+ subnitems = (int *) palloc(nelems * sizeof(int));
+
+ /* loop through and get data area from each element */
+ for (int elemoff = 0; elemoff < nelems; elemoff++)
+ {
+ Datum arraydatum;
+ bool eisnull;
+ ArrayType *array;
+ int this_ndims;
+
+ arraydatum = op->d.arrayexpr.elemvalues[elemoff];
+ eisnull = op->d.arrayexpr.elemnulls[elemoff];
+
+ /* temporarily ignore null subarrays */
+ if (eisnull)
+ {
+ haveempty = true;
+ continue;
+ }
+
+ array = DatumGetArrayTypeP(arraydatum);
+
+ /* run-time double-check on element type */
+ if (element_type != ARR_ELEMTYPE(array))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("cannot merge incompatible arrays"),
+ errdetail("Array with element type %s cannot be "
+ "included in ARRAY construct with element type %s.",
+ format_type_be(ARR_ELEMTYPE(array)),
+ format_type_be(element_type))));
+
+ this_ndims = ARR_NDIM(array);
+ /* temporarily ignore zero-dimensional subarrays */
+ if (this_ndims <= 0)
+ {
+ haveempty = true;
+ continue;
+ }
+
+ if (firstone)
+ {
+ /* Get sub-array details from first member */
+ elem_ndims = this_ndims;
+ ndims = elem_ndims + 1;
+ if (ndims <= 0 || ndims > MAXDIM)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("number of array dimensions (%d) exceeds the maximum allowed (%d)",
+ ndims, MAXDIM)));
+
+ elem_dims = (int *) palloc(elem_ndims * sizeof(int));
+ memcpy(elem_dims, ARR_DIMS(array), elem_ndims * sizeof(int));
+ elem_lbs = (int *) palloc(elem_ndims * sizeof(int));
+ memcpy(elem_lbs, ARR_LBOUND(array), elem_ndims * sizeof(int));
+
+ firstone = false;
+ }
+ else
+ {
+ /* Check other sub-arrays are compatible */
+ if (elem_ndims != this_ndims ||
+ memcmp(elem_dims, ARR_DIMS(array),
+ elem_ndims * sizeof(int)) != 0 ||
+ memcmp(elem_lbs, ARR_LBOUND(array),
+ elem_ndims * sizeof(int)) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
+ errmsg("multidimensional arrays must have array "
+ "expressions with matching dimensions")));
+ }
+
+ subdata[outer_nelems] = ARR_DATA_PTR(array);
+ subbitmaps[outer_nelems] = ARR_NULLBITMAP(array);
+ subbytes[outer_nelems] = ARR_SIZE(array) - ARR_DATA_OFFSET(array);
+ nbytes += subbytes[outer_nelems];
+ subnitems[outer_nelems] = ArrayGetNItems(this_ndims,
+ ARR_DIMS(array));
+ nitems += subnitems[outer_nelems];
+ havenulls |= ARR_HASNULL(array);
+ outer_nelems++;
+ }
+
+ /*
+ * If all items were null or empty arrays, return an empty array;
+ * otherwise, if some were and some weren't, raise error. (Note: we
+ * must special-case this somehow to avoid trying to generate a 1-D
+ * array formed from empty arrays. It's not ideal...)
+ */
+ if (haveempty)
+ {
+ if (ndims == 0) /* didn't find any nonempty array */
+ {
+ *op->resvalue = PointerGetDatum(construct_empty_array(element_type));
+ return;
+ }
+ ereport(ERROR,
+ (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
+ errmsg("multidimensional arrays must have array "
+ "expressions with matching dimensions")));
+ }
+
+ /* setup for multi-D array */
+ dims[0] = outer_nelems;
+ lbs[0] = 1;
+ for (int i = 1; i < ndims; i++)
+ {
+ dims[i] = elem_dims[i - 1];
+ lbs[i] = elem_lbs[i - 1];
+ }
+
+ /* check for subscript overflow */
+ (void) ArrayGetNItems(ndims, dims);
+ ArrayCheckBounds(ndims, dims, lbs);
+
+ if (havenulls)
+ {
+ dataoffset = ARR_OVERHEAD_WITHNULLS(ndims, nitems);
+ nbytes += dataoffset;
+ }
+ else
+ {
+ dataoffset = 0; /* marker for no null bitmap */
+ nbytes += ARR_OVERHEAD_NONULLS(ndims);
+ }
+
+ result = (ArrayType *) palloc(nbytes);
+ SET_VARSIZE(result, nbytes);
+ result->ndim = ndims;
+ result->dataoffset = dataoffset;
+ result->elemtype = element_type;
+ memcpy(ARR_DIMS(result), dims, ndims * sizeof(int));
+ memcpy(ARR_LBOUND(result), lbs, ndims * sizeof(int));
+
+ dat = ARR_DATA_PTR(result);
+ iitem = 0;
+ for (int i = 0; i < outer_nelems; i++)
+ {
+ memcpy(dat, subdata[i], subbytes[i]);
+ dat += subbytes[i];
+ if (havenulls)
+ array_bitmap_copy(ARR_NULLBITMAP(result), iitem,
+ subbitmaps[i], 0,
+ subnitems[i]);
+ iitem += subnitems[i];
+ }
+ }
+
+ *op->resvalue = PointerGetDatum(result);
+}
+
+/*
+ * Evaluate an ArrayCoerceExpr expression.
+ *
+ * Source array is in step's result variable.
+ */
+void
+ExecEvalArrayCoerce(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ Datum arraydatum;
+
+ /* NULL array -> NULL result */
+ if (*op->resnull)
+ return;
+
+ arraydatum = *op->resvalue;
+
+ /*
+ * If it's binary-compatible, modify the element type in the array header,
+ * but otherwise leave the array as we received it.
+ */
+ if (op->d.arraycoerce.elemexprstate == NULL)
+ {
+ /* Detoast input array if necessary, and copy in any case */
+ ArrayType *array = DatumGetArrayTypePCopy(arraydatum);
+
+ ARR_ELEMTYPE(array) = op->d.arraycoerce.resultelemtype;
+ *op->resvalue = PointerGetDatum(array);
+ return;
+ }
+
+ /*
+ * Use array_map to apply the sub-expression to each array element.
+ */
+ *op->resvalue = array_map(arraydatum,
+ op->d.arraycoerce.elemexprstate,
+ econtext,
+ op->d.arraycoerce.resultelemtype,
+ op->d.arraycoerce.amstate);
+}
+
+/*
+ * Evaluate a ROW() expression.
+ *
+ * The individual columns have already been evaluated into
+ * op->d.row.elemvalues[]/elemnulls[].
+ */
+void
+ExecEvalRow(ExprState *state, ExprEvalStep *op)
+{
+ HeapTuple tuple;
+
+ /* build tuple from evaluated field values */
+ tuple = heap_form_tuple(op->d.row.tupdesc,
+ op->d.row.elemvalues,
+ op->d.row.elemnulls);
+
+ *op->resvalue = HeapTupleGetDatum(tuple);
+ *op->resnull = false;
+}
+
+/*
+ * Evaluate GREATEST() or LEAST() expression (note this is *not* MIN()/MAX()).
+ *
+ * All of the to-be-compared expressions have already been evaluated into
+ * op->d.minmax.values[]/nulls[].
+ */
+void
+ExecEvalMinMax(ExprState *state, ExprEvalStep *op)
+{
+ Datum *values = op->d.minmax.values;
+ bool *nulls = op->d.minmax.nulls;
+ FunctionCallInfo fcinfo = op->d.minmax.fcinfo_data;
+ MinMaxOp operator = op->d.minmax.op;
+
+ /* set at initialization */
+ Assert(fcinfo->args[0].isnull == false);
+ Assert(fcinfo->args[1].isnull == false);
+
+ /* default to null result */
+ *op->resnull = true;
+
+ for (int off = 0; off < op->d.minmax.nelems; off++)
+ {
+ /* ignore NULL inputs */
+ if (nulls[off])
+ continue;
+
+ if (*op->resnull)
+ {
+ /* first nonnull input, adopt value */
+ *op->resvalue = values[off];
+ *op->resnull = false;
+ }
+ else
+ {
+ int cmpresult;
+
+ /* apply comparison function */
+ fcinfo->args[0].value = *op->resvalue;
+ fcinfo->args[1].value = values[off];
+
+ fcinfo->isnull = false;
+ cmpresult = DatumGetInt32(FunctionCallInvoke(fcinfo));
+ if (fcinfo->isnull) /* probably should not happen */
+ continue;
+
+ if (cmpresult > 0 && operator == IS_LEAST)
+ *op->resvalue = values[off];
+ else if (cmpresult < 0 && operator == IS_GREATEST)
+ *op->resvalue = values[off];
+ }
+ }
+}
+
+/*
+ * Evaluate a FieldSelect node.
+ *
+ * Source record is in step's result variable.
+ */
+void
+ExecEvalFieldSelect(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ AttrNumber fieldnum = op->d.fieldselect.fieldnum;
+ Datum tupDatum;
+ HeapTupleHeader tuple;
+ Oid tupType;
+ int32 tupTypmod;
+ TupleDesc tupDesc;
+ Form_pg_attribute attr;
+ HeapTupleData tmptup;
+
+ /* NULL record -> NULL result */
+ if (*op->resnull)
+ return;
+
+ tupDatum = *op->resvalue;
+
+ /* We can special-case expanded records for speed */
+ if (VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(tupDatum)))
+ {
+ ExpandedRecordHeader *erh = (ExpandedRecordHeader *) DatumGetEOHP(tupDatum);
+
+ Assert(erh->er_magic == ER_MAGIC);
+
+ /* Extract record's TupleDesc */
+ tupDesc = expanded_record_get_tupdesc(erh);
+
+ /*
+ * Find field's attr record. Note we don't support system columns
+ * here: a datum tuple doesn't have valid values for most of the
+ * interesting system columns anyway.
+ */
+ if (fieldnum <= 0) /* should never happen */
+ elog(ERROR, "unsupported reference to system column %d in FieldSelect",
+ fieldnum);
+ if (fieldnum > tupDesc->natts) /* should never happen */
+ elog(ERROR, "attribute number %d exceeds number of columns %d",
+ fieldnum, tupDesc->natts);
+ attr = TupleDescAttr(tupDesc, fieldnum - 1);
+
+ /* Check for dropped column, and force a NULL result if so */
+ if (attr->attisdropped)
+ {
+ *op->resnull = true;
+ return;
+ }
+
+ /* Check for type mismatch --- possible after ALTER COLUMN TYPE? */
+ /* As in CheckVarSlotCompatibility, we should but can't check typmod */
+ if (op->d.fieldselect.resulttype != attr->atttypid)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("attribute %d has wrong type", fieldnum),
+ errdetail("Table has type %s, but query expects %s.",
+ format_type_be(attr->atttypid),
+ format_type_be(op->d.fieldselect.resulttype))));
+
+ /* extract the field */
+ *op->resvalue = expanded_record_get_field(erh, fieldnum,
+ op->resnull);
+ }
+ else
+ {
+ /* Get the composite datum and extract its type fields */
+ tuple = DatumGetHeapTupleHeader(tupDatum);
+
+ tupType = HeapTupleHeaderGetTypeId(tuple);
+ tupTypmod = HeapTupleHeaderGetTypMod(tuple);
+
+ /* Lookup tupdesc if first time through or if type changes */
+ tupDesc = get_cached_rowtype(tupType, tupTypmod,
+ &op->d.fieldselect.rowcache, NULL);
+
+ /*
+ * Find field's attr record. Note we don't support system columns
+ * here: a datum tuple doesn't have valid values for most of the
+ * interesting system columns anyway.
+ */
+ if (fieldnum <= 0) /* should never happen */
+ elog(ERROR, "unsupported reference to system column %d in FieldSelect",
+ fieldnum);
+ if (fieldnum > tupDesc->natts) /* should never happen */
+ elog(ERROR, "attribute number %d exceeds number of columns %d",
+ fieldnum, tupDesc->natts);
+ attr = TupleDescAttr(tupDesc, fieldnum - 1);
+
+ /* Check for dropped column, and force a NULL result if so */
+ if (attr->attisdropped)
+ {
+ *op->resnull = true;
+ return;
+ }
+
+ /* Check for type mismatch --- possible after ALTER COLUMN TYPE? */
+ /* As in CheckVarSlotCompatibility, we should but can't check typmod */
+ if (op->d.fieldselect.resulttype != attr->atttypid)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("attribute %d has wrong type", fieldnum),
+ errdetail("Table has type %s, but query expects %s.",
+ format_type_be(attr->atttypid),
+ format_type_be(op->d.fieldselect.resulttype))));
+
+ /* heap_getattr needs a HeapTuple not a bare HeapTupleHeader */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple);
+ tmptup.t_data = tuple;
+
+ /* extract the field */
+ *op->resvalue = heap_getattr(&tmptup,
+ fieldnum,
+ tupDesc,
+ op->resnull);
+ }
+}
+
+/*
+ * Deform source tuple, filling in the step's values/nulls arrays, before
+ * evaluating individual new values as part of a FieldStore expression.
+ * Subsequent steps will overwrite individual elements of the values/nulls
+ * arrays with the new field values, and then FIELDSTORE_FORM will build the
+ * new tuple value.
+ *
+ * Source record is in step's result variable.
+ */
+void
+ExecEvalFieldStoreDeForm(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ TupleDesc tupDesc;
+
+ /* Lookup tupdesc if first time through or if type changes */
+ tupDesc = get_cached_rowtype(op->d.fieldstore.fstore->resulttype, -1,
+ op->d.fieldstore.rowcache, NULL);
+
+ /* Check that current tupdesc doesn't have more fields than we allocated */
+ if (unlikely(tupDesc->natts > op->d.fieldstore.ncolumns))
+ elog(ERROR, "too many columns in composite type %u",
+ op->d.fieldstore.fstore->resulttype);
+
+ if (*op->resnull)
+ {
+ /* Convert null input tuple into an all-nulls row */
+ memset(op->d.fieldstore.nulls, true,
+ op->d.fieldstore.ncolumns * sizeof(bool));
+ }
+ else
+ {
+ /*
+ * heap_deform_tuple needs a HeapTuple not a bare HeapTupleHeader. We
+ * set all the fields in the struct just in case.
+ */
+ Datum tupDatum = *op->resvalue;
+ HeapTupleHeader tuphdr;
+ HeapTupleData tmptup;
+
+ tuphdr = DatumGetHeapTupleHeader(tupDatum);
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(tuphdr);
+ ItemPointerSetInvalid(&(tmptup.t_self));
+ tmptup.t_tableOid = InvalidOid;
+ tmptup.t_data = tuphdr;
+
+ heap_deform_tuple(&tmptup, tupDesc,
+ op->d.fieldstore.values,
+ op->d.fieldstore.nulls);
+ }
+}
+
+/*
+ * Compute the new composite datum after each individual field value of a
+ * FieldStore expression has been evaluated.
+ */
+void
+ExecEvalFieldStoreForm(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ TupleDesc tupDesc;
+ HeapTuple tuple;
+
+ /* Lookup tupdesc (should be valid already) */
+ tupDesc = get_cached_rowtype(op->d.fieldstore.fstore->resulttype, -1,
+ op->d.fieldstore.rowcache, NULL);
+
+ tuple = heap_form_tuple(tupDesc,
+ op->d.fieldstore.values,
+ op->d.fieldstore.nulls);
+
+ *op->resvalue = HeapTupleGetDatum(tuple);
+ *op->resnull = false;
+}
+
+/*
+ * Evaluate a rowtype coercion operation.
+ * This may require rearranging field positions.
+ *
+ * Source record is in step's result variable.
+ */
+void
+ExecEvalConvertRowtype(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ HeapTuple result;
+ Datum tupDatum;
+ HeapTupleHeader tuple;
+ HeapTupleData tmptup;
+ TupleDesc indesc,
+ outdesc;
+ bool changed = false;
+
+ /* NULL in -> NULL out */
+ if (*op->resnull)
+ return;
+
+ tupDatum = *op->resvalue;
+ tuple = DatumGetHeapTupleHeader(tupDatum);
+
+ /*
+ * Lookup tupdescs if first time through or if type changes. We'd better
+ * pin them since type conversion functions could do catalog lookups and
+ * hence cause cache invalidation.
+ */
+ indesc = get_cached_rowtype(op->d.convert_rowtype.inputtype, -1,
+ op->d.convert_rowtype.incache,
+ &changed);
+ IncrTupleDescRefCount(indesc);
+ outdesc = get_cached_rowtype(op->d.convert_rowtype.outputtype, -1,
+ op->d.convert_rowtype.outcache,
+ &changed);
+ IncrTupleDescRefCount(outdesc);
+
+ /*
+ * We used to be able to assert that incoming tuples are marked with
+ * exactly the rowtype of indesc. However, now that ExecEvalWholeRowVar
+ * might change the tuples' marking to plain RECORD due to inserting
+ * aliases, we can only make this weak test:
+ */
+ Assert(HeapTupleHeaderGetTypeId(tuple) == indesc->tdtypeid ||
+ HeapTupleHeaderGetTypeId(tuple) == RECORDOID);
+
+ /* if first time through, or after change, initialize conversion map */
+ if (changed)
+ {
+ MemoryContext old_cxt;
+
+ /* allocate map in long-lived memory context */
+ old_cxt = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ /* prepare map from old to new attribute numbers */
+ op->d.convert_rowtype.map = convert_tuples_by_name(indesc, outdesc);
+
+ MemoryContextSwitchTo(old_cxt);
+ }
+
+ /* Following steps need a HeapTuple not a bare HeapTupleHeader */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple);
+ tmptup.t_data = tuple;
+
+ if (op->d.convert_rowtype.map != NULL)
+ {
+ /* Full conversion with attribute rearrangement needed */
+ result = execute_attr_map_tuple(&tmptup, op->d.convert_rowtype.map);
+ /* Result already has appropriate composite-datum header fields */
+ *op->resvalue = HeapTupleGetDatum(result);
+ }
+ else
+ {
+ /*
+ * The tuple is physically compatible as-is, but we need to insert the
+ * destination rowtype OID in its composite-datum header field, so we
+ * have to copy it anyway. heap_copy_tuple_as_datum() is convenient
+ * for this since it will both make the physical copy and insert the
+ * correct composite header fields. Note that we aren't expecting to
+ * have to flatten any toasted fields: the input was a composite
+ * datum, so it shouldn't contain any. So heap_copy_tuple_as_datum()
+ * is overkill here, but its check for external fields is cheap.
+ */
+ *op->resvalue = heap_copy_tuple_as_datum(&tmptup, outdesc);
+ }
+
+ DecrTupleDescRefCount(indesc);
+ DecrTupleDescRefCount(outdesc);
+}
+
+/*
+ * Evaluate "scalar op ANY/ALL (array)".
+ *
+ * Source array is in our result area, scalar arg is already evaluated into
+ * fcinfo->args[0].
+ *
+ * The operator always yields boolean, and we combine the results across all
+ * array elements using OR and AND (for ANY and ALL respectively). Of course
+ * we short-circuit as soon as the result is known.
+ */
+void
+ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op)
+{
+ FunctionCallInfo fcinfo = op->d.scalararrayop.fcinfo_data;
+ bool useOr = op->d.scalararrayop.useOr;
+ bool strictfunc = op->d.scalararrayop.finfo->fn_strict;
+ ArrayType *arr;
+ int nitems;
+ Datum result;
+ bool resultnull;
+ int16 typlen;
+ bool typbyval;
+ char typalign;
+ char *s;
+ bits8 *bitmap;
+ int bitmask;
+
+ /*
+ * If the array is NULL then we return NULL --- it's not very meaningful
+ * to do anything else, even if the operator isn't strict.
+ */
+ if (*op->resnull)
+ return;
+
+ /* Else okay to fetch and detoast the array */
+ arr = DatumGetArrayTypeP(*op->resvalue);
+
+ /*
+ * If the array is empty, we return either FALSE or TRUE per the useOr
+ * flag. This is correct even if the scalar is NULL; since we would
+ * evaluate the operator zero times, it matters not whether it would want
+ * to return NULL.
+ */
+ nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr));
+ if (nitems <= 0)
+ {
+ *op->resvalue = BoolGetDatum(!useOr);
+ *op->resnull = false;
+ return;
+ }
+
+ /*
+ * If the scalar is NULL, and the function is strict, return NULL; no
+ * point in iterating the loop.
+ */
+ if (fcinfo->args[0].isnull && strictfunc)
+ {
+ *op->resnull = true;
+ return;
+ }
+
+ /*
+ * We arrange to look up info about the element type only once per series
+ * of calls, assuming the element type doesn't change underneath us.
+ */
+ if (op->d.scalararrayop.element_type != ARR_ELEMTYPE(arr))
+ {
+ get_typlenbyvalalign(ARR_ELEMTYPE(arr),
+ &op->d.scalararrayop.typlen,
+ &op->d.scalararrayop.typbyval,
+ &op->d.scalararrayop.typalign);
+ op->d.scalararrayop.element_type = ARR_ELEMTYPE(arr);
+ }
+
+ typlen = op->d.scalararrayop.typlen;
+ typbyval = op->d.scalararrayop.typbyval;
+ typalign = op->d.scalararrayop.typalign;
+
+ /* Initialize result appropriately depending on useOr */
+ result = BoolGetDatum(!useOr);
+ resultnull = false;
+
+ /* Loop over the array elements */
+ s = (char *) ARR_DATA_PTR(arr);
+ bitmap = ARR_NULLBITMAP(arr);
+ bitmask = 1;
+
+ for (int i = 0; i < nitems; i++)
+ {
+ Datum elt;
+ Datum thisresult;
+
+ /* Get array element, checking for NULL */
+ if (bitmap && (*bitmap & bitmask) == 0)
+ {
+ fcinfo->args[1].value = (Datum) 0;
+ fcinfo->args[1].isnull = true;
+ }
+ else
+ {
+ elt = fetch_att(s, typbyval, typlen);
+ s = att_addlength_pointer(s, typlen, s);
+ s = (char *) att_align_nominal(s, typalign);
+ fcinfo->args[1].value = elt;
+ fcinfo->args[1].isnull = false;
+ }
+
+ /* Call comparison function */
+ if (fcinfo->args[1].isnull && strictfunc)
+ {
+ fcinfo->isnull = true;
+ thisresult = (Datum) 0;
+ }
+ else
+ {
+ fcinfo->isnull = false;
+ thisresult = op->d.scalararrayop.fn_addr(fcinfo);
+ }
+
+ /* Combine results per OR or AND semantics */
+ if (fcinfo->isnull)
+ resultnull = true;
+ else if (useOr)
+ {
+ if (DatumGetBool(thisresult))
+ {
+ result = BoolGetDatum(true);
+ resultnull = false;
+ break; /* needn't look at any more elements */
+ }
+ }
+ else
+ {
+ if (!DatumGetBool(thisresult))
+ {
+ result = BoolGetDatum(false);
+ resultnull = false;
+ break; /* needn't look at any more elements */
+ }
+ }
+
+ /* advance bitmap pointer if any */
+ if (bitmap)
+ {
+ bitmask <<= 1;
+ if (bitmask == 0x100)
+ {
+ bitmap++;
+ bitmask = 1;
+ }
+ }
+ }
+
+ *op->resvalue = result;
+ *op->resnull = resultnull;
+}
+
+/*
+ * Hash function for scalar array hash op elements.
+ *
+ * We use the element type's default hash opclass, and the column collation
+ * if the type is collation-sensitive.
+ */
+static uint32
+saop_element_hash(struct saophash_hash *tb, Datum key)
+{
+ ScalarArrayOpExprHashTable *elements_tab = (ScalarArrayOpExprHashTable *) tb->private_data;
+ FunctionCallInfo fcinfo = elements_tab->op->d.hashedscalararrayop.hash_fcinfo_data;
+ Datum hash;
+
+ fcinfo->args[0].value = key;
+ fcinfo->args[0].isnull = false;
+
+ hash = elements_tab->op->d.hashedscalararrayop.hash_fn_addr(fcinfo);
+
+ return DatumGetUInt32(hash);
+}
+
+/*
+ * Matching function for scalar array hash op elements, to be used in hashtable
+ * lookups.
+ */
+static bool
+saop_hash_element_match(struct saophash_hash *tb, Datum key1, Datum key2)
+{
+ Datum result;
+
+ ScalarArrayOpExprHashTable *elements_tab = (ScalarArrayOpExprHashTable *) tb->private_data;
+ FunctionCallInfo fcinfo = elements_tab->op->d.hashedscalararrayop.fcinfo_data;
+
+ fcinfo->args[0].value = key1;
+ fcinfo->args[0].isnull = false;
+ fcinfo->args[1].value = key2;
+ fcinfo->args[1].isnull = false;
+
+ result = elements_tab->op->d.hashedscalararrayop.fn_addr(fcinfo);
+
+ return DatumGetBool(result);
+}
+
+/*
+ * Evaluate "scalar op ANY (const array)".
+ *
+ * Similar to ExecEvalScalarArrayOp, but optimized for faster repeat lookups
+ * by building a hashtable on the first lookup. This hashtable will be reused
+ * by subsequent lookups. Unlike ExecEvalScalarArrayOp, this version only
+ * supports OR semantics.
+ *
+ * Source array is in our result area, scalar arg is already evaluated into
+ * fcinfo->args[0].
+ *
+ * The operator always yields boolean.
+ */
+void
+ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ ScalarArrayOpExprHashTable *elements_tab = op->d.hashedscalararrayop.elements_tab;
+ FunctionCallInfo fcinfo = op->d.hashedscalararrayop.fcinfo_data;
+ bool strictfunc = op->d.hashedscalararrayop.finfo->fn_strict;
+ Datum scalar = fcinfo->args[0].value;
+ bool scalar_isnull = fcinfo->args[0].isnull;
+ Datum result;
+ bool resultnull;
+ bool hashfound;
+
+ /* We don't set up a hashed scalar array op if the array const is null. */
+ Assert(!*op->resnull);
+
+ /*
+ * If the scalar is NULL, and the function is strict, return NULL; no
+ * point in executing the search.
+ */
+ if (fcinfo->args[0].isnull && strictfunc)
+ {
+ *op->resnull = true;
+ return;
+ }
+
+ /* Build the hash table on first evaluation */
+ if (elements_tab == NULL)
+ {
+ int16 typlen;
+ bool typbyval;
+ char typalign;
+ int nitems;
+ bool has_nulls = false;
+ char *s;
+ bits8 *bitmap;
+ int bitmask;
+ MemoryContext oldcontext;
+ ArrayType *arr;
+
+ arr = DatumGetArrayTypeP(*op->resvalue);
+ nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr));
+
+ get_typlenbyvalalign(ARR_ELEMTYPE(arr),
+ &typlen,
+ &typbyval,
+ &typalign);
+
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ elements_tab = (ScalarArrayOpExprHashTable *)
+ palloc(sizeof(ScalarArrayOpExprHashTable));
+ op->d.hashedscalararrayop.elements_tab = elements_tab;
+ elements_tab->op = op;
+
+ /*
+ * Create the hash table sizing it according to the number of elements
+ * in the array. This does assume that the array has no duplicates.
+ * If the array happens to contain many duplicate values then it'll
+ * just mean that we sized the table a bit on the large side.
+ */
+ elements_tab->hashtab = saophash_create(CurrentMemoryContext, nitems,
+ elements_tab);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ s = (char *) ARR_DATA_PTR(arr);
+ bitmap = ARR_NULLBITMAP(arr);
+ bitmask = 1;
+ for (int i = 0; i < nitems; i++)
+ {
+ /* Get array element, checking for NULL. */
+ if (bitmap && (*bitmap & bitmask) == 0)
+ {
+ has_nulls = true;
+ }
+ else
+ {
+ Datum element;
+
+ element = fetch_att(s, typbyval, typlen);
+ s = att_addlength_pointer(s, typlen, s);
+ s = (char *) att_align_nominal(s, typalign);
+
+ saophash_insert(elements_tab->hashtab, element, &hashfound);
+ }
+
+ /* Advance bitmap pointer if any. */
+ if (bitmap)
+ {
+ bitmask <<= 1;
+ if (bitmask == 0x100)
+ {
+ bitmap++;
+ bitmask = 1;
+ }
+ }
+ }
+
+ /*
+ * Remember if we had any nulls so that we know if we need to execute
+ * non-strict functions with a null rhs value if no match is found.
+ */
+ op->d.hashedscalararrayop.has_nulls = has_nulls;
+ }
+
+ /* Check the hash to see if we have a match. */
+ hashfound = NULL != saophash_lookup(elements_tab->hashtab, scalar);
+
+ result = BoolGetDatum(hashfound);
+ resultnull = false;
+
+ /*
+ * If we didn't find a match in the array, we still might need to handle
+ * the possibility of null values. We didn't put any NULLs into the
+ * hashtable, but instead marked if we found any when building the table
+ * in has_nulls.
+ */
+ if (!DatumGetBool(result) && op->d.hashedscalararrayop.has_nulls)
+ {
+ if (strictfunc)
+ {
+ /*
+ * We have nulls in the array so a non-null lhs and no match must
+ * yield NULL.
+ */
+ result = (Datum) 0;
+ resultnull = true;
+ }
+ else
+ {
+ /*
+ * Execute the function with a null rhs just once.
+ *
+ * The hash lookup path will have scribbled on the lhs argument, so
+ * we need to set it up also (even though we entered this function
+ * with it already set).
+ */
+ fcinfo->args[0].value = scalar;
+ fcinfo->args[0].isnull = scalar_isnull;
+ fcinfo->args[1].value = (Datum) 0;
+ fcinfo->args[1].isnull = true;
+
+ result = op->d.hashedscalararrayop.fn_addr(fcinfo);
+ resultnull = fcinfo->isnull;
+ }
+ }
+
+ *op->resvalue = result;
+ *op->resnull = resultnull;
+}
+
+/*
+ * Evaluate a NOT NULL domain constraint.
+ */
+void
+ExecEvalConstraintNotNull(ExprState *state, ExprEvalStep *op)
+{
+ if (*op->resnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NOT_NULL_VIOLATION),
+ errmsg("domain %s does not allow null values",
+ format_type_be(op->d.domaincheck.resulttype)),
+ errdatatype(op->d.domaincheck.resulttype)));
+}
+
+/*
+ * Evaluate a CHECK domain constraint.
+ */
+void
+ExecEvalConstraintCheck(ExprState *state, ExprEvalStep *op)
+{
+ if (!*op->d.domaincheck.checknull &&
+ !DatumGetBool(*op->d.domaincheck.checkvalue))
+ ereport(ERROR,
+ (errcode(ERRCODE_CHECK_VIOLATION),
+ errmsg("value for domain %s violates check constraint \"%s\"",
+ format_type_be(op->d.domaincheck.resulttype),
+ op->d.domaincheck.constraintname),
+ errdomainconstraint(op->d.domaincheck.resulttype,
+ op->d.domaincheck.constraintname)));
+}
+
+/*
+ * Evaluate the various forms of XmlExpr.
+ *
+ * Arguments have been evaluated into named_argvalue/named_argnull
+ * and/or argvalue/argnull arrays.
+ */
+void
+ExecEvalXmlExpr(ExprState *state, ExprEvalStep *op)
+{
+ XmlExpr *xexpr = op->d.xmlexpr.xexpr;
+ Datum value;
+
+ *op->resnull = true; /* until we get a result */
+ *op->resvalue = (Datum) 0;
+
+ switch (xexpr->op)
+ {
+ case IS_XMLCONCAT:
+ {
+ Datum *argvalue = op->d.xmlexpr.argvalue;
+ bool *argnull = op->d.xmlexpr.argnull;
+ List *values = NIL;
+
+ for (int i = 0; i < list_length(xexpr->args); i++)
+ {
+ if (!argnull[i])
+ values = lappend(values, DatumGetPointer(argvalue[i]));
+ }
+
+ if (values != NIL)
+ {
+ *op->resvalue = PointerGetDatum(xmlconcat(values));
+ *op->resnull = false;
+ }
+ }
+ break;
+
+ case IS_XMLFOREST:
+ {
+ Datum *argvalue = op->d.xmlexpr.named_argvalue;
+ bool *argnull = op->d.xmlexpr.named_argnull;
+ StringInfoData buf;
+ ListCell *lc;
+ ListCell *lc2;
+ int i;
+
+ initStringInfo(&buf);
+
+ i = 0;
+ forboth(lc, xexpr->named_args, lc2, xexpr->arg_names)
+ {
+ Expr *e = (Expr *) lfirst(lc);
+ char *argname = strVal(lfirst(lc2));
+
+ if (!argnull[i])
+ {
+ value = argvalue[i];
+ appendStringInfo(&buf, "<%s>%s</%s>",
+ argname,
+ map_sql_value_to_xml_value(value,
+ exprType((Node *) e), true),
+ argname);
+ *op->resnull = false;
+ }
+ i++;
+ }
+
+ if (!*op->resnull)
+ {
+ text *result;
+
+ result = cstring_to_text_with_len(buf.data, buf.len);
+ *op->resvalue = PointerGetDatum(result);
+ }
+
+ pfree(buf.data);
+ }
+ break;
+
+ case IS_XMLELEMENT:
+ *op->resvalue = PointerGetDatum(xmlelement(xexpr,
+ op->d.xmlexpr.named_argvalue,
+ op->d.xmlexpr.named_argnull,
+ op->d.xmlexpr.argvalue,
+ op->d.xmlexpr.argnull));
+ *op->resnull = false;
+ break;
+
+ case IS_XMLPARSE:
+ {
+ Datum *argvalue = op->d.xmlexpr.argvalue;
+ bool *argnull = op->d.xmlexpr.argnull;
+ text *data;
+ bool preserve_whitespace;
+
+ /* arguments are known to be text, bool */
+ Assert(list_length(xexpr->args) == 2);
+
+ if (argnull[0])
+ return;
+ value = argvalue[0];
+ data = DatumGetTextPP(value);
+
+ if (argnull[1]) /* probably can't happen */
+ return;
+ value = argvalue[1];
+ preserve_whitespace = DatumGetBool(value);
+
+ *op->resvalue = PointerGetDatum(xmlparse(data,
+ xexpr->xmloption,
+ preserve_whitespace));
+ *op->resnull = false;
+ }
+ break;
+
+ case IS_XMLPI:
+ {
+ text *arg;
+ bool isnull;
+
+ /* optional argument is known to be text */
+ Assert(list_length(xexpr->args) <= 1);
+
+ if (xexpr->args)
+ {
+ isnull = op->d.xmlexpr.argnull[0];
+ if (isnull)
+ arg = NULL;
+ else
+ arg = DatumGetTextPP(op->d.xmlexpr.argvalue[0]);
+ }
+ else
+ {
+ arg = NULL;
+ isnull = false;
+ }
+
+ *op->resvalue = PointerGetDatum(xmlpi(xexpr->name,
+ arg,
+ isnull,
+ op->resnull));
+ }
+ break;
+
+ case IS_XMLROOT:
+ {
+ Datum *argvalue = op->d.xmlexpr.argvalue;
+ bool *argnull = op->d.xmlexpr.argnull;
+ xmltype *data;
+ text *version;
+ int standalone;
+
+ /* arguments are known to be xml, text, int */
+ Assert(list_length(xexpr->args) == 3);
+
+ if (argnull[0])
+ return;
+ data = DatumGetXmlP(argvalue[0]);
+
+ if (argnull[1])
+ version = NULL;
+ else
+ version = DatumGetTextPP(argvalue[1]);
+
+ Assert(!argnull[2]); /* always present */
+ standalone = DatumGetInt32(argvalue[2]);
+
+ *op->resvalue = PointerGetDatum(xmlroot(data,
+ version,
+ standalone));
+ *op->resnull = false;
+ }
+ break;
+
+ case IS_XMLSERIALIZE:
+ {
+ Datum *argvalue = op->d.xmlexpr.argvalue;
+ bool *argnull = op->d.xmlexpr.argnull;
+
+ /* argument type is known to be xml */
+ Assert(list_length(xexpr->args) == 1);
+
+ if (argnull[0])
+ return;
+ value = argvalue[0];
+
+ *op->resvalue = PointerGetDatum(xmltotext_with_xmloption(DatumGetXmlP(value),
+ xexpr->xmloption));
+ *op->resnull = false;
+ }
+ break;
+
+ case IS_DOCUMENT:
+ {
+ Datum *argvalue = op->d.xmlexpr.argvalue;
+ bool *argnull = op->d.xmlexpr.argnull;
+
+ /* optional argument is known to be xml */
+ Assert(list_length(xexpr->args) == 1);
+
+ if (argnull[0])
+ return;
+ value = argvalue[0];
+
+ *op->resvalue =
+ BoolGetDatum(xml_is_document(DatumGetXmlP(value)));
+ *op->resnull = false;
+ }
+ break;
+
+ default:
+ elog(ERROR, "unrecognized XML operation");
+ break;
+ }
+}
+
+/*
+ * ExecEvalGroupingFunc
+ *
+ * Computes a bitmask with a bit for each (unevaluated) argument expression
+ * (rightmost arg is least significant bit).
+ *
+ * A bit is set if the corresponding expression is NOT part of the set of
+ * grouping expressions in the current grouping set.
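+ *
+ * For example, GROUPING(a, b) computed while grouping by a alone yields
+ * binary 01 = 1, since only b is absent from the current grouping set.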
+ */
+void
+ExecEvalGroupingFunc(ExprState *state, ExprEvalStep *op)
+{
+ AggState *aggstate = castNode(AggState, state->parent);
+ int result = 0;
+ Bitmapset *grouped_cols = aggstate->grouped_cols;
+ ListCell *lc;
+
+ foreach(lc, op->d.grouping_func.clauses)
+ {
+ int attnum = lfirst_int(lc);
+
+ result <<= 1;
+
+ if (!bms_is_member(attnum, grouped_cols))
+ result |= 1;
+ }
+
+ *op->resvalue = Int32GetDatum(result);
+ *op->resnull = false;
+}
+
+/*
+ * Hand off evaluation of a subplan to nodeSubplan.c
+ */
+void
+ExecEvalSubPlan(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ SubPlanState *sstate = op->d.subplan.sstate;
+
+ /* could potentially be nested, so make sure there's enough stack */
+ check_stack_depth();
+
+ *op->resvalue = ExecSubPlan(sstate, econtext, op->resnull);
+}
+
+/*
+ * Evaluate a wholerow Var expression.
+ *
+ * Returns a Datum whose value is the value of a whole-row range variable
+ * with respect to given expression context.
+ */
+void
+ExecEvalWholeRowVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+ Var *variable = op->d.wholerow.var;
+ TupleTableSlot *slot;
+ TupleDesc output_tupdesc;
+ MemoryContext oldcontext;
+ HeapTupleHeader dtuple;
+ HeapTuple tuple;
+
+ /* This was checked by ExecInitExpr */
+ Assert(variable->varattno == InvalidAttrNumber);
+
+ /* Get the input slot we want */
+ switch (variable->varno)
+ {
+ case INNER_VAR:
+ /* get the tuple from the inner node */
+ slot = econtext->ecxt_innertuple;
+ break;
+
+ case OUTER_VAR:
+ /* get the tuple from the outer node */
+ slot = econtext->ecxt_outertuple;
+ break;
+
+ /* INDEX_VAR is handled by default case */
+
+ default:
+ /* get the tuple from the relation being scanned */
+ slot = econtext->ecxt_scantuple;
+ break;
+ }
+
+ /* Apply the junkfilter if any */
+ if (op->d.wholerow.junkFilter != NULL)
+ slot = ExecFilterJunk(op->d.wholerow.junkFilter, slot);
+
+ /*
+ * If first time through, obtain tuple descriptor and check compatibility.
+ *
+ * XXX: It'd be great if this could be moved to the expression
+ * initialization phase, but due to using slots that's currently not
+ * feasible.
+ */
+ if (op->d.wholerow.first)
+ {
+ /* optimistically assume we don't need slow path */
+ op->d.wholerow.slow = false;
+
+ /*
+ * If the Var identifies a named composite type, we must check that
+ * the actual tuple type is compatible with it.
+ */
+ if (variable->vartype != RECORDOID)
+ {
+ TupleDesc var_tupdesc;
+ TupleDesc slot_tupdesc;
+
+ /*
+ * We really only care about numbers of attributes and data types.
+ * Also, we can ignore type mismatch on columns that are dropped
+ * in the destination type, so long as (1) the physical storage
+ * matches or (2) the actual column value is NULL. Case (1) is
+ * helpful in some cases involving out-of-date cached plans, while
+ * case (2) is expected behavior in situations such as an INSERT
+ * into a table with dropped columns (the planner typically
+ * generates an INT4 NULL regardless of the dropped column type).
+ * If we find a dropped column and cannot verify that case (1)
+ * holds, we have to use the slow path to check (2) for each row.
+ *
+ * If vartype is a domain over composite, just look through that
+ * to the base composite type.
+ */
+ var_tupdesc = lookup_rowtype_tupdesc_domain(variable->vartype,
+ -1, false);
+
+ slot_tupdesc = slot->tts_tupleDescriptor;
+
+ if (var_tupdesc->natts != slot_tupdesc->natts)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail_plural("Table row contains %d attribute, but query expects %d.",
+ "Table row contains %d attributes, but query expects %d.",
+ slot_tupdesc->natts,
+ slot_tupdesc->natts,
+ var_tupdesc->natts)));
+
+ for (int i = 0; i < var_tupdesc->natts; i++)
+ {
+ Form_pg_attribute vattr = TupleDescAttr(var_tupdesc, i);
+ Form_pg_attribute sattr = TupleDescAttr(slot_tupdesc, i);
+
+ if (vattr->atttypid == sattr->atttypid)
+ continue; /* no worries */
+ if (!vattr->attisdropped)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Table has type %s at ordinal position %d, but query expects %s.",
+ format_type_be(sattr->atttypid),
+ i + 1,
+ format_type_be(vattr->atttypid))));
+
+ if (vattr->attlen != sattr->attlen ||
+ vattr->attalign != sattr->attalign)
+ op->d.wholerow.slow = true; /* need to check for nulls */
+ }
+
+ /*
+ * Use the variable's declared rowtype as the descriptor for the
+ * output values. In particular, we *must* absorb any
+ * attisdropped markings.
+ */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+ output_tupdesc = CreateTupleDescCopy(var_tupdesc);
+ MemoryContextSwitchTo(oldcontext);
+
+ ReleaseTupleDesc(var_tupdesc);
+ }
+ else
+ {
+ /*
+ * In the RECORD case, we use the input slot's rowtype as the
+ * descriptor for the output values, modulo possibly assigning new
+ * column names below.
+ */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+ output_tupdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * It's possible that the input slot is a relation scan slot and
+ * so is marked with that relation's rowtype. But we're supposed
+ * to be returning RECORD, so reset to that.
+ */
+ output_tupdesc->tdtypeid = RECORDOID;
+ output_tupdesc->tdtypmod = -1;
+
+ /*
+ * We already got the correct physical datatype info above, but
+ * now we should try to find the source RTE and adopt its column
+ * aliases, since it's unlikely that the input slot has the
+ * desired names.
+ *
+ * If we can't locate the RTE, assume the column names we've got
+ * are OK. (As of this writing, the only cases where we can't
+ * locate the RTE are in execution of trigger WHEN clauses, and
+ * then the Var will have the trigger's relation's rowtype, so its
+ * names are fine.) Also, if the creator of the RTE didn't bother
+ * to fill in an eref field, assume our column names are OK. (This
+ * happens in COPY, and perhaps other places.)
+ */
+ if (econtext->ecxt_estate &&
+ variable->varno <= econtext->ecxt_estate->es_range_table_size)
+ {
+ RangeTblEntry *rte = exec_rt_fetch(variable->varno,
+ econtext->ecxt_estate);
+
+ if (rte->eref)
+ ExecTypeSetColNames(output_tupdesc, rte->eref->colnames);
+ }
+ }
+
+ /* Bless the tupdesc if needed, and save it in the execution state */
+ op->d.wholerow.tupdesc = BlessTupleDesc(output_tupdesc);
+
+ op->d.wholerow.first = false;
+ }
+
+ /*
+ * Make sure all columns of the slot are accessible in the slot's
+ * Datum/isnull arrays.
+ */
+ slot_getallattrs(slot);
+
+ if (op->d.wholerow.slow)
+ {
+ /* Check to see if any dropped attributes are non-null */
+ TupleDesc tupleDesc = slot->tts_tupleDescriptor;
+ TupleDesc var_tupdesc = op->d.wholerow.tupdesc;
+
+ Assert(var_tupdesc->natts == tupleDesc->natts);
+
+ for (int i = 0; i < var_tupdesc->natts; i++)
+ {
+ Form_pg_attribute vattr = TupleDescAttr(var_tupdesc, i);
+ Form_pg_attribute sattr = TupleDescAttr(tupleDesc, i);
+
+ if (!vattr->attisdropped)
+ continue; /* already checked non-dropped cols */
+ if (slot->tts_isnull[i])
+ continue; /* null is always okay */
+ if (vattr->attlen != sattr->attlen ||
+ vattr->attalign != sattr->attalign)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Physical storage mismatch on dropped attribute at ordinal position %d.",
+ i + 1)));
+ }
+ }
+
+ /*
+ * Build a composite datum, making sure any toasted fields get detoasted.
+ *
+ * (Note: it is critical that we not change the slot's state here.)
+ */
+ tuple = toast_build_flattened_tuple(slot->tts_tupleDescriptor,
+ slot->tts_values,
+ slot->tts_isnull);
+ dtuple = tuple->t_data;
+
+ /*
+ * Label the datum with the composite type info we identified before.
+ *
+ * (Note: we could skip doing this by passing op->d.wholerow.tupdesc to
+ * the tuple build step; but that seems a tad risky so let's not.)
+ */
+ HeapTupleHeaderSetTypeId(dtuple, op->d.wholerow.tupdesc->tdtypeid);
+ HeapTupleHeaderSetTypMod(dtuple, op->d.wholerow.tupdesc->tdtypmod);
+
+ *op->resvalue = PointerGetDatum(dtuple);
+ *op->resnull = false;
+}
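+
+/*
+ * For illustration only (not part of the evaluation logic above): a
+ * whole-row Var is what gets evaluated when a query references a range
+ * variable as a composite value, e.g. hypothetically
+ *
+ *		SELECT t FROM mytable t;
+ *		SELECT row_to_json(t) FROM mytable t;
+ *
+ * The "slow" path above is needed only when the declared rowtype contains
+ * dropped columns whose physical storage could not be verified to match;
+ * in that case each row is checked to ensure those columns are NULL.
+ */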
+
+void
+ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext,
+ TupleTableSlot *slot)
+{
+ Datum d;
+
+ /* slot_getsysattr has sufficient defenses against bad attnums */
+ d = slot_getsysattr(slot,
+ op->d.var.attnum,
+ op->resnull);
+ *op->resvalue = d;
+ /* this ought to be unreachable, but it's cheap enough to check */
+ if (unlikely(*op->resnull))
+ elog(ERROR, "failed to fetch attribute from slot");
+}
+
+/*
+ * Transition value has not been initialized. This is the first non-NULL input
+ * value for a group. We use it as the initial value for transValue.
+ */
+void
+ExecAggInitGroup(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroup,
+ ExprContext *aggcontext)
+{
+ FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
+ MemoryContext oldContext;
+
+ /*
+ * We must copy the datum into aggcontext if it is pass-by-ref. We do not
+ * need to pfree the old transValue, since it's NULL. (We already checked
+ * that the agg's input type is binary-compatible with its transtype, so
+ * straight copy here is OK.)
+ */
+ oldContext = MemoryContextSwitchTo(aggcontext->ecxt_per_tuple_memory);
+ pergroup->transValue = datumCopy(fcinfo->args[1].value,
+ pertrans->transtypeByVal,
+ pertrans->transtypeLen);
+ pergroup->transValueIsNull = false;
+ pergroup->noTransValue = false;
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * Ensure that the current transition value is a child of the aggcontext,
+ * rather than the per-tuple context.
+ *
+ * NB: This can change the current memory context.
+ */
+Datum
+ExecAggTransReparent(AggState *aggstate, AggStatePerTrans pertrans,
+ Datum newValue, bool newValueIsNull,
+ Datum oldValue, bool oldValueIsNull)
+{
+ Assert(newValue != oldValue);
+
+ if (!newValueIsNull)
+ {
+ MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory);
+ if (DatumIsReadWriteExpandedObject(newValue,
+ false,
+ pertrans->transtypeLen) &&
+ MemoryContextGetParent(DatumGetEOHP(newValue)->eoh_context) == CurrentMemoryContext)
+ /* do nothing */ ;
+ else
+ newValue = datumCopy(newValue,
+ pertrans->transtypeByVal,
+ pertrans->transtypeLen);
+ }
+ else
+ {
+ /*
+ * Ensure that AggStatePerGroup->transValue ends up being 0, so
+ * callers can safely compare newValue/oldValue without having to
+ * check their respective nullness.
+ */
+ newValue = (Datum) 0;
+ }
+
+ if (!oldValueIsNull)
+ {
+ if (DatumIsReadWriteExpandedObject(oldValue,
+ false,
+ pertrans->transtypeLen))
+ DeleteExpandedObject(oldValue);
+ else
+ pfree(DatumGetPointer(oldValue));
+ }
+
+ return newValue;
+}
+
+/*
+ * Invoke ordered transition function, with a datum argument.
+ */
+void
+ExecEvalAggOrderedTransDatum(ExprState *state, ExprEvalStep *op,
+ ExprContext *econtext)
+{
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ int setno = op->d.agg_trans.setno;
+
+ tuplesort_putdatum(pertrans->sortstates[setno],
+ *op->resvalue, *op->resnull);
+}
+
+/*
+ * Invoke ordered transition function, with a tuple argument.
+ */
+void
+ExecEvalAggOrderedTransTuple(ExprState *state, ExprEvalStep *op,
+ ExprContext *econtext)
+{
+ AggStatePerTrans pertrans = op->d.agg_trans.pertrans;
+ int setno = op->d.agg_trans.setno;
+
+ ExecClearTuple(pertrans->sortslot);
+ pertrans->sortslot->tts_nvalid = pertrans->numInputs;
+ ExecStoreVirtualTuple(pertrans->sortslot);
+ tuplesort_puttupleslot(pertrans->sortstates[setno], pertrans->sortslot);
+}
+
+/* implementation of transition function invocation for byval types */
+static pg_attribute_always_inline void
+ExecAggPlainTransByVal(AggState *aggstate, AggStatePerTrans pertrans,
+ AggStatePerGroup pergroup,
+ ExprContext *aggcontext, int setno)
+{
+ FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
+ MemoryContext oldContext;
+ Datum newVal;
+
+ /* cf. select_current_set() */
+ aggstate->curaggcontext = aggcontext;
+ aggstate->current_set = setno;
+
+ /* set up aggstate->curpertrans for AggGetAggref() */
+ aggstate->curpertrans = pertrans;
+
+ /* invoke transition function in per-tuple context */
+ oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);
+
+ fcinfo->args[0].value = pergroup->transValue;
+ fcinfo->args[0].isnull = pergroup->transValueIsNull;
+ fcinfo->isnull = false; /* just in case transfn doesn't set it */
+
+ newVal = FunctionCallInvoke(fcinfo);
+
+ pergroup->transValue = newVal;
+ pergroup->transValueIsNull = fcinfo->isnull;
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/* implementation of transition function invocation for byref types */
+static pg_attribute_always_inline void
+ExecAggPlainTransByRef(AggState *aggstate, AggStatePerTrans pertrans,
+ AggStatePerGroup pergroup,
+ ExprContext *aggcontext, int setno)
+{
+ FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
+ MemoryContext oldContext;
+ Datum newVal;
+
+ /* cf. select_current_set() */
+ aggstate->curaggcontext = aggcontext;
+ aggstate->current_set = setno;
+
+ /* set up aggstate->curpertrans for AggGetAggref() */
+ aggstate->curpertrans = pertrans;
+
+ /* invoke transition function in per-tuple context */
+ oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);
+
+ fcinfo->args[0].value = pergroup->transValue;
+ fcinfo->args[0].isnull = pergroup->transValueIsNull;
+ fcinfo->isnull = false; /* just in case transfn doesn't set it */
+
+ newVal = FunctionCallInvoke(fcinfo);
+
+ /*
+ * For pass-by-ref datatype, must copy the new value into aggcontext and
+ * free the prior transValue. But if transfn returned a pointer to its
+ * first input, we don't need to do anything. Also, if transfn returned a
+ * pointer to a R/W expanded object that is already a child of the
+ * aggcontext, assume we can adopt that value without copying it.
+ *
+ * It's safe to compare newVal with pergroup->transValue without regard
+ * for either being NULL, because ExecAggTransReparent() takes care to set
+ * transValue to 0 when NULL. Otherwise we could end up accidentally not
+ * reparenting, when the transValue has the same numerical value as
+ * newValue, despite being NULL. This is a somewhat hot path, making it
+ * undesirable to instead solve this with another branch for the common
+ * case of the transition function returning its (modified) input
+ * argument.
+ */
+ if (DatumGetPointer(newVal) != DatumGetPointer(pergroup->transValue))
+ newVal = ExecAggTransReparent(aggstate, pertrans,
+ newVal, fcinfo->isnull,
+ pergroup->transValue,
+ pergroup->transValueIsNull);
+
+ pergroup->transValue = newVal;
+ pergroup->transValueIsNull = fcinfo->isnull;
+
+ MemoryContextSwitchTo(oldContext);
+}
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
new file mode 100644
index 0000000..c11427a
--- /dev/null
+++ b/src/backend/executor/execGrouping.c
@@ -0,0 +1,560 @@
+/*-------------------------------------------------------------------------
+ *
+ * execGrouping.c
+ * executor utility routines for grouping, hashing, and aggregation
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execGrouping.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "common/hashfn.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+
+static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
+static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
+ const MinimalTuple tuple);
+static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable,
+ TupleTableSlot *slot,
+ bool *isnew, uint32 hash);
+
+/*
+ * Define parameters for tuple hash table code generation. The interface is
+ * *also* declared in execnodes.h (to generate the types, which are externally
+ * visible).
+ */
+#define SH_PREFIX tuplehash
+#define SH_ELEMENT_TYPE TupleHashEntryData
+#define SH_KEY_TYPE MinimalTuple
+#define SH_KEY firstTuple
+#define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key)
+#define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0
+#define SH_SCOPE extern
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a) a->hash
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+
+/*****************************************************************************
+ * Utility routines for grouping tuples together
+ *****************************************************************************/
+
+/*
+ * execTuplesMatchPrepare
+ * Build expression that can be evaluated using ExecQual(), returning
+ * whether an ExprContext's inner/outer tuples are NOT DISTINCT
+ */
+ExprState *
+execTuplesMatchPrepare(TupleDesc desc,
+ int numCols,
+ const AttrNumber *keyColIdx,
+ const Oid *eqOperators,
+ const Oid *collations,
+ PlanState *parent)
+{
+ Oid *eqFunctions = (Oid *) palloc(numCols * sizeof(Oid));
+ int i;
+ ExprState *expr;
+
+ if (numCols == 0)
+ return NULL;
+
+ /* lookup equality functions */
+ for (i = 0; i < numCols; i++)
+ eqFunctions[i] = get_opcode(eqOperators[i]);
+
+ /* build actual expression */
+ expr = ExecBuildGroupingEqual(desc, desc, NULL, NULL,
+ numCols, keyColIdx, eqFunctions, collations,
+ parent);
+
+ return expr;
+}
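+
+/*
+ * A minimal usage sketch for the expression returned above (hypothetical
+ * caller; "eqexpr", "slot1" and "slot2" are illustrative names):
+ *
+ *		econtext->ecxt_outertuple = slot1;
+ *		econtext->ecxt_innertuple = slot2;
+ *		if (ExecQualAndReset(eqexpr, econtext))
+ *			... slot1 and slot2 are NOT DISTINCT on the key columns ...
+ */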
+
+/*
+ * execTuplesHashPrepare
+ * Look up the equality and hashing functions needed for a TupleHashTable.
+ *
+ * This is similar to execTuplesMatchPrepare, but we also need to find the
+ * hash functions associated with the equality operators. *eqFuncOids and
+ * *hashFunctions receive the palloc'd result arrays.
+ *
+ * Note: we expect that the given operators are not cross-type comparisons.
+ */
+void
+execTuplesHashPrepare(int numCols,
+ const Oid *eqOperators,
+ Oid **eqFuncOids,
+ FmgrInfo **hashFunctions)
+{
+ int i;
+
+ *eqFuncOids = (Oid *) palloc(numCols * sizeof(Oid));
+ *hashFunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
+
+ for (i = 0; i < numCols; i++)
+ {
+ Oid eq_opr = eqOperators[i];
+ Oid eq_function;
+ Oid left_hash_function;
+ Oid right_hash_function;
+
+ eq_function = get_opcode(eq_opr);
+ if (!get_op_hash_functions(eq_opr,
+ &left_hash_function, &right_hash_function))
+ elog(ERROR, "could not find hash function for hash operator %u",
+ eq_opr);
+ /* We're not supporting cross-type cases here */
+ Assert(left_hash_function == right_hash_function);
+ (*eqFuncOids)[i] = eq_function;
+ fmgr_info(right_hash_function, &(*hashFunctions)[i]);
+ }
+}
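+
+/*
+ * Sketch of the typical call sequence, assuming the key-column arrays and
+ * memory contexts are set up elsewhere (names are illustrative, not taken
+ * from a specific caller):
+ *
+ *		Oid *eqfuncoids;
+ *		FmgrInfo *hashfunctions;
+ *
+ *		execTuplesHashPrepare(numCols, eqOperators,
+ *							  &eqfuncoids, &hashfunctions);
+ *		hashtable = BuildTupleHashTableExt(parent, inputDesc,
+ *										   numCols, keyColIdx,
+ *										   eqfuncoids, hashfunctions,
+ *										   collations,
+ *										   nbuckets, 0,
+ *										   metacxt, tablecxt, tempcxt,
+ *										   false);
+ */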
+
+
+/*****************************************************************************
+ * Utility routines for all-in-memory hash tables
+ *
+ * These routines build hash tables for grouping tuples together (eg, for
+ * hash aggregation). There is one entry for each not-distinct set of tuples
+ * presented.
+ *****************************************************************************/
+
+/*
+ * Construct an empty TupleHashTable
+ *
+ * numCols, keyColIdx: identify the tuple fields to use as lookup key
+ * eqfunctions: equality comparison functions to use
+ * hashfunctions: datatype-specific hashing functions to use
+ * nbuckets: initial estimate of hashtable size
+ * additionalsize: size of data stored in ->additional
+ * metacxt: memory context for long-lived allocation, but not per-entry data
+ * tablecxt: memory context in which to store table entries
+ * tempcxt: short-lived context for evaluating hash and comparison functions
+ *
+ * The function arrays may be made with execTuplesHashPrepare(). Note they
+ * are not cross-type functions, but expect to see the table datatype(s)
+ * on both sides.
+ *
+ * Note that keyColIdx, eqfunctions, and hashfunctions must be allocated in
+ * storage that will live as long as the hashtable does.
+ */
+TupleHashTable
+BuildTupleHashTableExt(PlanState *parent,
+ TupleDesc inputDesc,
+ int numCols, AttrNumber *keyColIdx,
+ const Oid *eqfuncoids,
+ FmgrInfo *hashfunctions,
+ Oid *collations,
+ long nbuckets, Size additionalsize,
+ MemoryContext metacxt,
+ MemoryContext tablecxt,
+ MemoryContext tempcxt,
+ bool use_variable_hash_iv)
+{
+ TupleHashTable hashtable;
+ Size entrysize = sizeof(TupleHashEntryData) + additionalsize;
+ Size hash_mem_limit;
+ MemoryContext oldcontext;
+ bool allow_jit;
+
+ Assert(nbuckets > 0);
+
+ /* Limit initial table size request to not more than hash_mem */
+ hash_mem_limit = get_hash_memory_limit() / entrysize;
+ if (nbuckets > hash_mem_limit)
+ nbuckets = hash_mem_limit;
+
+ oldcontext = MemoryContextSwitchTo(metacxt);
+
+ hashtable = (TupleHashTable) palloc(sizeof(TupleHashTableData));
+
+ hashtable->numCols = numCols;
+ hashtable->keyColIdx = keyColIdx;
+ hashtable->tab_hash_funcs = hashfunctions;
+ hashtable->tab_collations = collations;
+ hashtable->tablecxt = tablecxt;
+ hashtable->tempcxt = tempcxt;
+ hashtable->entrysize = entrysize;
+ hashtable->tableslot = NULL; /* will be made on first lookup */
+ hashtable->inputslot = NULL;
+ hashtable->in_hash_funcs = NULL;
+ hashtable->cur_eq_func = NULL;
+
+ /*
+ * If parallelism is in use, even if the leader backend is performing the
+ * scan itself, we don't want to create the hashtable exactly the same way
+ * in all workers. As hashtables are iterated over in keyspace-order,
+ * doing so in all processes in the same way is likely to lead to
+ * "unbalanced" hashtables when the table size initially is
+ * underestimated.
+ */
+ if (use_variable_hash_iv)
+ hashtable->hash_iv = murmurhash32(ParallelWorkerNumber);
+ else
+ hashtable->hash_iv = 0;
+
+ hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable);
+
+ /*
+ * We copy the input tuple descriptor just for safety --- we assume all
+ * input tuples will have equivalent descriptors.
+ */
+ hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc),
+ &TTSOpsMinimalTuple);
+
+ /*
+ * If the old reset interface is used (i.e. BuildTupleHashTable, rather
+ * than BuildTupleHashTableExt), allowing JIT would lead to the generated
+ * functions a) living longer than the query and b) being re-generated each
+ * time the table is reset. Therefore prevent JIT from being used in that
+ * case, by not providing a parent node (which prevents accessing the
+ * JitContext in the EState).
+ */
+ allow_jit = metacxt != tablecxt;
+
+ /* build comparator for all columns */
+ /* XXX: should we support non-minimal tuples for the inputslot? */
+ hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc,
+ &TTSOpsMinimalTuple, &TTSOpsMinimalTuple,
+ numCols,
+ keyColIdx, eqfuncoids, collations,
+ allow_jit ? parent : NULL);
+
+ /*
+ * While not pretty, it's ok to not shut down this context, but instead
+ * rely on the containing memory context being reset, as
+ * ExecBuildGroupingEqual() only builds a very simple expression calling
+ * functions (i.e. nothing that'd employ RegisterExprContextCallback()).
+ */
+ hashtable->exprcontext = CreateStandaloneExprContext();
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return hashtable;
+}
+
+/*
+ * BuildTupleHashTable is a backwards-compatibility wrapper for
+ * BuildTupleHashTableExt(), that allocates the hashtable's metadata in
+ * tablecxt. Note that hashtables created this way cannot be reset leak-free
+ * with ResetTupleHashTable().
+ */
+TupleHashTable
+BuildTupleHashTable(PlanState *parent,
+ TupleDesc inputDesc,
+ int numCols, AttrNumber *keyColIdx,
+ const Oid *eqfuncoids,
+ FmgrInfo *hashfunctions,
+ Oid *collations,
+ long nbuckets, Size additionalsize,
+ MemoryContext tablecxt,
+ MemoryContext tempcxt,
+ bool use_variable_hash_iv)
+{
+ return BuildTupleHashTableExt(parent,
+ inputDesc,
+ numCols, keyColIdx,
+ eqfuncoids,
+ hashfunctions,
+ collations,
+ nbuckets, additionalsize,
+ tablecxt,
+ tablecxt,
+ tempcxt,
+ use_variable_hash_iv);
+}
+
+/*
+ * Reset contents of the hashtable to be empty, preserving all the non-content
+ * state. Note that the tablecxt passed to BuildTupleHashTableExt() should
+ * also be reset, otherwise there will be leaks.
+ */
+void
+ResetTupleHashTable(TupleHashTable hashtable)
+{
+ tuplehash_reset(hashtable->hashtab);
+}
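+
+/*
+ * Illustrative reset pattern for rebuilding the table between batches
+ * (hypothetical caller; "tablecxt" stands for the context that was passed
+ * as tablecxt to BuildTupleHashTableExt()):
+ *
+ *		ResetTupleHashTable(hashtable);
+ *		MemoryContextReset(tablecxt);
+ */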
+
+/*
+ * Find or create a hashtable entry for the tuple group containing the
+ * given tuple. The tuple must be the same type as the hashtable entries.
+ *
+ * If isnew is NULL, we do not create new entries; we return NULL if no
+ * match is found.
+ *
+ * If hash is not NULL, we set it to the calculated hash value. This allows
+ * callers access to the hash value even if no entry is returned.
+ *
+ * If isnew isn't NULL, then a new entry is created if no existing entry
+ * matches. On return, *isnew is true if the entry is newly created,
+ * false if it existed already. The ->additional field in the new entry
+ * has been zeroed.
+ */
+TupleHashEntry
+LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
+ bool *isnew, uint32 *hash)
+{
+ TupleHashEntry entry;
+ MemoryContext oldContext;
+ uint32 local_hash;
+
+ /* Need to run the hash functions in short-lived context */
+ oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
+
+ /* set up data needed by hash and match functions */
+ hashtable->inputslot = slot;
+ hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+ hashtable->cur_eq_func = hashtable->tab_eq_func;
+
+ local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
+ entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash);
+
+ if (hash != NULL)
+ *hash = local_hash;
+
+ Assert(entry == NULL || entry->hash == local_hash);
+
+ MemoryContextSwitchTo(oldContext);
+
+ return entry;
+}
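+
+/*
+ * Typical lookup-or-insert pattern (hypothetical caller;
+ * "build_pergroup_state" is an illustrative placeholder for whatever
+ * per-group state the caller attaches to ->additional, normally allocated
+ * in the table context):
+ *
+ *		bool isnew;
+ *		TupleHashEntry entry;
+ *
+ *		entry = LookupTupleHashEntry(hashtable, slot, &isnew, NULL);
+ *		if (isnew)
+ *			entry->additional = build_pergroup_state(hashtable->tablecxt);
+ */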
+
+/*
+ * Compute the hash value for a tuple
+ */
+uint32
+TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot)
+{
+ MemoryContext oldContext;
+ uint32 hash;
+
+ hashtable->inputslot = slot;
+ hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+
+ /* Need to run the hash functions in short-lived context */
+ oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
+
+ hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
+
+ MemoryContextSwitchTo(oldContext);
+
+ return hash;
+}
+
+/*
+ * A variant of LookupTupleHashEntry for callers that have already computed
+ * the hash value.
+ */
+TupleHashEntry
+LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
+ bool *isnew, uint32 hash)
+{
+ TupleHashEntry entry;
+ MemoryContext oldContext;
+
+ /* Need to run the hash functions in short-lived context */
+ oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
+
+ /* set up data needed by hash and match functions */
+ hashtable->inputslot = slot;
+ hashtable->in_hash_funcs = hashtable->tab_hash_funcs;
+ hashtable->cur_eq_func = hashtable->tab_eq_func;
+
+ entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
+ Assert(entry == NULL || entry->hash == hash);
+
+ MemoryContextSwitchTo(oldContext);
+
+ return entry;
+}
+
+/*
+ * Search for a hashtable entry matching the given tuple. No entry is
+ * created if there's not a match. This is similar to the non-creating
+ * case of LookupTupleHashEntry, except that it supports cross-type
+ * comparisons, in which the given tuple is not of the same type as the
+ * table entries. The caller must provide the hash functions to use for
+ * the input tuple, as well as the equality functions, since these may be
+ * different from the table's internal functions.
+ */
+TupleHashEntry
+FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
+ ExprState *eqcomp,
+ FmgrInfo *hashfunctions)
+{
+ TupleHashEntry entry;
+ MemoryContext oldContext;
+ MinimalTuple key;
+
+ /* Need to run the hash functions in short-lived context */
+ oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
+
+ /* Set up data needed by hash and match functions */
+ hashtable->inputslot = slot;
+ hashtable->in_hash_funcs = hashfunctions;
+ hashtable->cur_eq_func = eqcomp;
+
+ /* Search the hash table */
+ key = NULL; /* flag to reference inputslot */
+ entry = tuplehash_lookup(hashtable->hashtab, key);
+ MemoryContextSwitchTo(oldContext);
+
+ return entry;
+}
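+
+/*
+ * Sketch of a cross-type probe (hypothetical caller; "crosstype_eq_expr"
+ * and "lhs_hash_functions" are illustrative names for comparators built
+ * against the probe tuple's types rather than the table's):
+ *
+ *		entry = FindTupleHashEntry(hashtable, probeslot,
+ *								   crosstype_eq_expr, lhs_hash_functions);
+ *		if (entry == NULL)
+ *			... no stored group matches the probe tuple ...
+ */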
+
+/*
+ * If tuple is NULL, use the input slot instead. This convention avoids the
+ * need to materialize virtual input tuples unless they actually need to get
+ * copied into the table.
+ *
+ * Also, the caller must select an appropriate memory context for running
+ * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.)
+ */
+static uint32
+TupleHashTableHash_internal(struct tuplehash_hash *tb,
+ const MinimalTuple tuple)
+{
+ TupleHashTable hashtable = (TupleHashTable) tb->private_data;
+ int numCols = hashtable->numCols;
+ AttrNumber *keyColIdx = hashtable->keyColIdx;
+ uint32 hashkey = hashtable->hash_iv;
+ TupleTableSlot *slot;
+ FmgrInfo *hashfunctions;
+ int i;
+
+ if (tuple == NULL)
+ {
+ /* Process the current input tuple for the table */
+ slot = hashtable->inputslot;
+ hashfunctions = hashtable->in_hash_funcs;
+ }
+ else
+ {
+ /*
+ * Process a tuple already stored in the table.
+ *
+ * (this case never actually occurs due to the way simplehash.h is
+ * used, as the hash-value is stored in the entries)
+ */
+ slot = hashtable->tableslot;
+ ExecStoreMinimalTuple(tuple, slot, false);
+ hashfunctions = hashtable->tab_hash_funcs;
+ }
+
+ for (i = 0; i < numCols; i++)
+ {
+ AttrNumber att = keyColIdx[i];
+ Datum attr;
+ bool isNull;
+
+ /* rotate hashkey left 1 bit at each step */
+ hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
+
+ attr = slot_getattr(slot, att, &isNull);
+
+ if (!isNull) /* treat nulls as having hash key 0 */
+ {
+ uint32 hkey;
+
+ hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i],
+ hashtable->tab_collations[i],
+ attr));
+ hashkey ^= hkey;
+ }
+ }
+
+ /*
+ * The way hashes are combined above, among each other and with the IV,
+ * doesn't lead to good bit perturbation. As the IV's goal is precisely to
+ * achieve that, perform a final round of hashing of the combined value,
+ * resulting in near-perfect perturbation.
+ */
+ return murmurhash32(hashkey);
+}
+
+/*
+ * Does the work of LookupTupleHashEntry and LookupTupleHashEntryHash. Useful
+ * so that we can avoid switching the memory context multiple times for
+ * LookupTupleHashEntry.
+ *
+ * NB: This function may or may not change the memory context. Caller is
+ * expected to change it back.
+ */
+static inline TupleHashEntry
+LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
+ bool *isnew, uint32 hash)
+{
+ TupleHashEntryData *entry;
+ bool found;
+ MinimalTuple key;
+
+ key = NULL; /* flag to reference inputslot */
+
+ if (isnew)
+ {
+ entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found);
+
+ if (found)
+ {
+ /* found pre-existing entry */
+ *isnew = false;
+ }
+ else
+ {
+ /* created new entry */
+ *isnew = true;
+ /* zero caller data */
+ entry->additional = NULL;
+ MemoryContextSwitchTo(hashtable->tablecxt);
+ /* Copy the first tuple into the table context */
+ entry->firstTuple = ExecCopySlotMinimalTuple(slot);
+ }
+ }
+ else
+ {
+ entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash);
+ }
+
+ return entry;
+}
+
+/*
+ * See whether two tuples (presumably of the same hash value) match
+ */
+static int
+TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2)
+{
+ TupleTableSlot *slot1;
+ TupleTableSlot *slot2;
+ TupleHashTable hashtable = (TupleHashTable) tb->private_data;
+ ExprContext *econtext = hashtable->exprcontext;
+
+ /*
+ * We assume that simplehash.h will only ever call us with the first
+ * argument being an actual table entry, and the second argument being
+ * LookupTupleHashEntry's dummy TupleHashEntryData. The other direction
+ * could be supported too, but is not currently required.
+ */
+ Assert(tuple1 != NULL);
+ slot1 = hashtable->tableslot;
+ ExecStoreMinimalTuple(tuple1, slot1, false);
+ Assert(tuple2 == NULL);
+ slot2 = hashtable->inputslot;
+
+ /* For crosstype comparisons, the inputslot must be first */
+ econtext->ecxt_innertuple = slot2;
+ econtext->ecxt_outertuple = slot1;
+ return !ExecQualAndReset(hashtable->cur_eq_func, econtext);
+}
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
new file mode 100644
index 0000000..74becdc
--- /dev/null
+++ b/src/backend/executor/execIndexing.c
@@ -0,0 +1,921 @@
+/*-------------------------------------------------------------------------
+ *
+ * execIndexing.c
+ * routines for inserting index tuples and enforcing unique and
+ * exclusion constraints.
+ *
+ * ExecInsertIndexTuples() is the main entry point. It's called after
+ * inserting a tuple to the heap, and it inserts corresponding index tuples
+ * into all indexes. At the same time, it enforces any unique and
+ * exclusion constraints:
+ *
+ * Unique Indexes
+ * --------------
+ *
+ * Enforcing a unique constraint is straightforward. When the index AM
+ * inserts the tuple to the index, it also checks that there are no
+ * conflicting tuples in the index already. It does so atomically, so that
+ * even if two backends try to insert the same key concurrently, only one
+ * of them will succeed. All the logic to ensure atomicity, and to wait
+ * for in-progress transactions to finish, is handled by the index AM.
+ *
+ * If a unique constraint is deferred, we request the index AM to not
+ * throw an error if a conflict is found. Instead, we make note that there
+ * was a conflict and return the list of indexes with conflicts to the
+ * caller. The caller must re-check them later, by calling index_insert()
+ * with the UNIQUE_CHECK_EXISTING option.
+ *
+ * Exclusion Constraints
+ * ---------------------
+ *
+ * Exclusion constraints are different from unique indexes in that when the
+ * tuple is inserted to the index, the index AM does not check for
+ * duplicate keys at the same time. After the insertion, we perform a
+ * separate scan on the index to check for conflicting tuples, and if one
+ * is found, we throw an error and the transaction is aborted. If the
+ * conflicting tuple's inserter or deleter is in-progress, we wait for it
+ * to finish first.
+ *
+ * There is a chance of deadlock, if two backends insert a tuple at the
+ * same time, and then perform the scan to check for conflicts. They will
+ * find each other's tuple, and both try to wait for each other. The
+ * deadlock detector will detect that, and abort one of the transactions.
+ * That's fairly harmless, as one of them was bound to abort with a
+ * "duplicate key error" anyway, although you get a different error
+ * message.
+ *
+ * If an exclusion constraint is deferred, we still perform the conflict
+ * checking scan immediately after inserting the index tuple. But instead
+ * of throwing an error if a conflict is found, we return that information
+ * to the caller. The caller must re-check them later by calling
+ * check_exclusion_constraint().
+ *
+ * Speculative insertion
+ * ---------------------
+ *
+ * Speculative insertion is a two-phase mechanism used to implement
+ * INSERT ... ON CONFLICT DO UPDATE/NOTHING. The tuple is first inserted
+ * into the heap and the indexes are updated as usual, but if a constraint is
+ * violated, we can still back out the insertion without aborting the whole
+ * transaction. In an INSERT ... ON CONFLICT statement, if a conflict is
+ * detected, the inserted tuple is backed out and the ON CONFLICT action is
+ * executed instead.
+ *
+ * Insertion to a unique index works as usual: the index AM checks for
+ * duplicate keys atomically with the insertion. But instead of throwing
+ * an error on a conflict, the speculatively inserted heap tuple is backed
+ * out.
+ *
+ * Exclusion constraints are slightly more complicated. As mentioned
+ * earlier, there is a risk of deadlock when two backends insert the same
+ * key concurrently. That was not a problem for regular insertions, when
+ * one of the transactions has to be aborted anyway, but with a speculative
+ * insertion we cannot let a deadlock happen, because we only want to back
+ * out the speculatively inserted tuple on conflict, not abort the whole
+ * transaction.
+ *
+ * When a backend detects that the speculative insertion conflicts with
+ * another in-progress tuple, it has two options:
+ *
+ * 1. back out the speculatively inserted tuple, then wait for the other
+ * transaction, and retry. Or,
+ * 2. wait for the other transaction, with the speculatively inserted tuple
+ * still in place.
+ *
+ * If two backends insert at the same time, and both try to wait for each
+ * other, they will deadlock. So option 2 is not acceptable. Option 1
+ * avoids the deadlock, but it is prone to a livelock instead. Both
+ * transactions will wake up immediately as the other transaction backs
+ * out. Then they both retry, and conflict with each other again, lather,
+ * rinse, repeat.
+ *
+ * To avoid the livelock, one of the backends must back out first, and then
+ * wait, while the other one waits without backing out. It doesn't matter
+ * which one backs out, so we employ an arbitrary rule that the transaction
+ * with the higher XID backs out.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execIndexing.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/xact.h"
+#include "catalog/index.h"
+#include "executor/executor.h"
+#include "nodes/nodeFuncs.h"
+#include "storage/lmgr.h"
+#include "utils/snapmgr.h"
+
+/* waitMode argument to check_exclusion_or_unique_constraint() */
+typedef enum
+{
+ CEOUC_WAIT,
+ CEOUC_NOWAIT,
+ CEOUC_LIVELOCK_PREVENTING_WAIT
+} CEOUC_WAIT_MODE;
+
+static bool check_exclusion_or_unique_constraint(Relation heap, Relation index,
+ IndexInfo *indexInfo,
+ ItemPointer tupleid,
+ Datum *values, bool *isnull,
+ EState *estate, bool newIndex,
+ CEOUC_WAIT_MODE waitMode,
+ bool errorOK,
+ ItemPointer conflictTid);
+
+static bool index_recheck_constraint(Relation index, Oid *constr_procs,
+ Datum *existing_values, bool *existing_isnull,
+ Datum *new_values);
+static bool index_unchanged_by_update(ResultRelInfo *resultRelInfo,
+ EState *estate, IndexInfo *indexInfo,
+ Relation indexRelation);
+
+/* ----------------------------------------------------------------
+ * ExecOpenIndices
+ *
+ * Find the indices associated with a result relation, open them,
+ * and save information about them in the result ResultRelInfo.
+ *
+ * At entry, caller has already opened and locked
+ * resultRelInfo->ri_RelationDesc.
+ * ----------------------------------------------------------------
+ */
+void
+ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative)
+{
+ Relation resultRelation = resultRelInfo->ri_RelationDesc;
+ List *indexoidlist;
+ ListCell *l;
+ int len,
+ i;
+ RelationPtr relationDescs;
+ IndexInfo **indexInfoArray;
+
+ resultRelInfo->ri_NumIndices = 0;
+
+ /* fast path if no indexes */
+ if (!RelationGetForm(resultRelation)->relhasindex)
+ return;
+
+ /*
+ * Get cached list of index OIDs
+ */
+ indexoidlist = RelationGetIndexList(resultRelation);
+ len = list_length(indexoidlist);
+ if (len == 0)
+ return;
+
+ /*
+ * allocate space for result arrays
+ */
+ relationDescs = (RelationPtr) palloc(len * sizeof(Relation));
+ indexInfoArray = (IndexInfo **) palloc(len * sizeof(IndexInfo *));
+
+ resultRelInfo->ri_NumIndices = len;
+ resultRelInfo->ri_IndexRelationDescs = relationDescs;
+ resultRelInfo->ri_IndexRelationInfo = indexInfoArray;
+
+ /*
+ * For each index, open the index relation and save pg_index info. We
+ * acquire RowExclusiveLock, signifying we will update the index.
+ *
+ * Note: we do this even if the index is not indisready; it's not worth
+ * the trouble to optimize for the case where it isn't.
+ */
+ i = 0;
+ foreach(l, indexoidlist)
+ {
+ Oid indexOid = lfirst_oid(l);
+ Relation indexDesc;
+ IndexInfo *ii;
+
+ indexDesc = index_open(indexOid, RowExclusiveLock);
+
+ /* extract index key information from the index's pg_index info */
+ ii = BuildIndexInfo(indexDesc);
+
+ /*
+ * If the indexes are to be used for speculative insertion, add extra
+ * information required by unique index entries.
+ */
+ if (speculative && ii->ii_Unique)
+ BuildSpeculativeIndexInfo(indexDesc, ii);
+
+ relationDescs[i] = indexDesc;
+ indexInfoArray[i] = ii;
+ i++;
+ }
+
+ list_free(indexoidlist);
+}
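+
+/*
+ * Typical lifecycle from a caller's point of view (hypothetical sketch;
+ * error handling and batching omitted):
+ *
+ *		ExecOpenIndices(resultRelInfo, false);
+ *		... insert the heap tuple, filling slot->tts_tid ...
+ *		recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot, estate,
+ *											   false, false, NULL, NIL);
+ *		...
+ *		ExecCloseIndices(resultRelInfo);
+ */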
+
+/* ----------------------------------------------------------------
+ * ExecCloseIndices
+ *
+ * Close the index relations stored in resultRelInfo
+ * ----------------------------------------------------------------
+ */
+void
+ExecCloseIndices(ResultRelInfo *resultRelInfo)
+{
+ int i;
+ int numIndices;
+ RelationPtr indexDescs;
+
+ numIndices = resultRelInfo->ri_NumIndices;
+ indexDescs = resultRelInfo->ri_IndexRelationDescs;
+
+ for (i = 0; i < numIndices; i++)
+ {
+ if (indexDescs[i] == NULL)
+ continue; /* shouldn't happen? */
+
+ /* Drop lock acquired by ExecOpenIndices */
+ index_close(indexDescs[i], RowExclusiveLock);
+ }
+
+ /*
+ * XXX should free indexInfo array here too? Currently we assume that
+ * such stuff will be cleaned up automatically in FreeExecutorState.
+ */
+}
+
+/* ----------------------------------------------------------------
+ * ExecInsertIndexTuples
+ *
+ * This routine takes care of inserting index tuples
+ * into all the relations indexing the result relation
+ * when a heap tuple is inserted into the result relation.
+ *
+ * When 'update' is true, executor is performing an UPDATE
+ * that could not use an optimization like heapam's HOT (in
+ * more general terms a call to table_tuple_update() took
+ * place and set 'update_indexes' to true). Receiving this
+ * hint makes us consider if we should pass down the
+ * 'indexUnchanged' hint in turn. That's something that we
+ * figure out for each index_insert() call iff 'update' is
+ * true. (When 'update' is false we already know not to pass
+ * the hint to any index.)
+ *
+ * Unique and exclusion constraints are enforced at the same
+ * time. This returns a list of index OIDs for any unique or
+ * exclusion constraints that are deferred and that had
+ * potential (unconfirmed) conflicts. (If noDupErr == true,
+ * the same is done for non-deferred constraints, and whether
+ * the conflict was speculative or deferred is reported to
+ * the caller.)
+ *
+ * If 'arbiterIndexes' is nonempty, noDupErr applies only to
+ * those indexes. NIL means noDupErr applies to all indexes.
+ * ----------------------------------------------------------------
+ */
+List *
+ExecInsertIndexTuples(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot,
+ EState *estate,
+ bool update,
+ bool noDupErr,
+ bool *specConflict,
+ List *arbiterIndexes)
+{
+ ItemPointer tupleid = &slot->tts_tid;
+ List *result = NIL;
+ int i;
+ int numIndices;
+ RelationPtr relationDescs;
+ Relation heapRelation;
+ IndexInfo **indexInfoArray;
+ ExprContext *econtext;
+ Datum values[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+
+ Assert(ItemPointerIsValid(tupleid));
+
+ /*
+ * Get information from the result relation info structure.
+ */
+ numIndices = resultRelInfo->ri_NumIndices;
+ relationDescs = resultRelInfo->ri_IndexRelationDescs;
+ indexInfoArray = resultRelInfo->ri_IndexRelationInfo;
+ heapRelation = resultRelInfo->ri_RelationDesc;
+
+ /* Sanity check: slot must belong to the same rel as the resultRelInfo. */
+ Assert(slot->tts_tableOid == RelationGetRelid(heapRelation));
+
+ /*
+ * We will use the EState's per-tuple context for evaluating predicates
+ * and index expressions (creating it if it's not already there).
+ */
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /*
+ * for each index, form and insert the index tuple
+ */
+ for (i = 0; i < numIndices; i++)
+ {
+ Relation indexRelation = relationDescs[i];
+ IndexInfo *indexInfo;
+ bool applyNoDupErr;
+ IndexUniqueCheck checkUnique;
+ bool indexUnchanged;
+ bool satisfiesConstraint;
+
+ if (indexRelation == NULL)
+ continue;
+
+ indexInfo = indexInfoArray[i];
+
+ /* If the index is marked as read-only, ignore it */
+ if (!indexInfo->ii_ReadyForInserts)
+ continue;
+
+ /* Check for partial index */
+ if (indexInfo->ii_Predicate != NIL)
+ {
+ ExprState *predicate;
+
+ /*
+ * If predicate state not set up yet, create it (in the estate's
+ * per-query context)
+ */
+ predicate = indexInfo->ii_PredicateState;
+ if (predicate == NULL)
+ {
+ predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
+ indexInfo->ii_PredicateState = predicate;
+ }
+
+ /* Skip this index-update if the predicate isn't satisfied */
+ if (!ExecQual(predicate, econtext))
+ continue;
+ }
+
+ /*
+ * FormIndexDatum fills in its values and isnull parameters with the
+ * appropriate values for the column(s) of the index.
+ */
+ FormIndexDatum(indexInfo,
+ slot,
+ estate,
+ values,
+ isnull);
+
+ /* Check whether to apply noDupErr to this index */
+ applyNoDupErr = noDupErr &&
+ (arbiterIndexes == NIL ||
+ list_member_oid(arbiterIndexes,
+ indexRelation->rd_index->indexrelid));
+
+ /*
+ * The index AM does the actual insertion, plus uniqueness checking.
+ *
+ * For an immediate-mode unique index, we just tell the index AM to
+ * throw error if not unique.
+ *
+ * For a deferrable unique index, we tell the index AM to just detect
+ * possible non-uniqueness, and we add the index OID to the result
+ * list if further checking is needed.
+ *
+ * For a speculative insertion (used by INSERT ... ON CONFLICT), do
+ * the same as for a deferrable unique index.
+ */
+ if (!indexRelation->rd_index->indisunique)
+ checkUnique = UNIQUE_CHECK_NO;
+ else if (applyNoDupErr)
+ checkUnique = UNIQUE_CHECK_PARTIAL;
+ else if (indexRelation->rd_index->indimmediate)
+ checkUnique = UNIQUE_CHECK_YES;
+ else
+ checkUnique = UNIQUE_CHECK_PARTIAL;
+
+ /*
+ * There's definitely going to be an index_insert() call for this
+ * index. If we're being called as part of an UPDATE statement,
+ * consider if the 'indexUnchanged' = true hint should be passed.
+ */
+ indexUnchanged = update && index_unchanged_by_update(resultRelInfo,
+ estate,
+ indexInfo,
+ indexRelation);
+
+ satisfiesConstraint =
+ index_insert(indexRelation, /* index relation */
+ values, /* array of index Datums */
+ isnull, /* null flags */
+ tupleid, /* tid of heap tuple */
+ heapRelation, /* heap relation */
+ checkUnique, /* type of uniqueness check to do */
+ indexUnchanged, /* UPDATE without logical change? */
+ indexInfo); /* index AM may need this */
+
+ /*
+ * If the index has an associated exclusion constraint, check that.
+ * This is simpler than the process for uniqueness checks since we
+ * always insert first and then check. If the constraint is deferred,
+ * we check now anyway, but don't throw error on violation or wait for
+ * a conclusive outcome from a concurrent insertion; instead we'll
+ * queue a recheck event. Similarly, noDupErr callers (speculative
+ * inserters) will recheck later, and wait for a conclusive outcome
+ * then.
+ *
+ * An index for an exclusion constraint can't also be UNIQUE (not an
+ * essential property, we just don't allow it in the grammar), so no
+ * need to preserve the prior state of satisfiesConstraint.
+ */
+ if (indexInfo->ii_ExclusionOps != NULL)
+ {
+ bool violationOK;
+ CEOUC_WAIT_MODE waitMode;
+
+ if (applyNoDupErr)
+ {
+ violationOK = true;
+ waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT;
+ }
+ else if (!indexRelation->rd_index->indimmediate)
+ {
+ violationOK = true;
+ waitMode = CEOUC_NOWAIT;
+ }
+ else
+ {
+ violationOK = false;
+ waitMode = CEOUC_WAIT;
+ }
+
+ satisfiesConstraint =
+ check_exclusion_or_unique_constraint(heapRelation,
+ indexRelation, indexInfo,
+ tupleid, values, isnull,
+ estate, false,
+ waitMode, violationOK, NULL);
+ }
+
+ if ((checkUnique == UNIQUE_CHECK_PARTIAL ||
+ indexInfo->ii_ExclusionOps != NULL) &&
+ !satisfiesConstraint)
+ {
+ /*
+ * The tuple potentially violates the uniqueness or exclusion
+ * constraint, so make a note of the index so that we can re-check
+ * it later. Speculative inserters are told if there was a
+ * speculative conflict, since that always requires a restart.
+ */
+ result = lappend_oid(result, RelationGetRelid(indexRelation));
+ if (indexRelation->rd_index->indimmediate && specConflict)
+ *specConflict = true;
+ }
+ }
+
+ return result;
+}
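+
+/*
+ * Sketch of the speculative-insertion variant of the call above
+ * (hypothetical caller; see the file header comment for the overall
+ * protocol):
+ *
+ *		specConflict = false;
+ *		recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot, estate,
+ *											   false, true, &specConflict,
+ *											   arbiterIndexes);
+ *		if (specConflict)
+ *			... back out the speculatively inserted tuple and retry ...
+ */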
+
+/* ----------------------------------------------------------------
+ * ExecCheckIndexConstraints
+ *
+ * This routine checks if a tuple violates any unique or
+ * exclusion constraints. Returns true if there is no conflict.
+ * Otherwise returns false, and the TID of the conflicting
+ * tuple is returned in *conflictTid.
+ *
+ * If 'arbiterIndexes' is given, only those indexes are checked.
+ * NIL means all indexes.
+ *
+ * Note that this doesn't lock the values in any way, so it's
+ * possible that a conflicting tuple is inserted immediately
+ * after this returns. But this can be used for a pre-check
+ * before insertion.
+ * ----------------------------------------------------------------
+ */
+bool
+ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot,
+ EState *estate, ItemPointer conflictTid,
+ List *arbiterIndexes)
+{
+ int i;
+ int numIndices;
+ RelationPtr relationDescs;
+ Relation heapRelation;
+ IndexInfo **indexInfoArray;
+ ExprContext *econtext;
+ Datum values[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ ItemPointerData invalidItemPtr;
+ bool checkedIndex = false;
+
+ ItemPointerSetInvalid(conflictTid);
+ ItemPointerSetInvalid(&invalidItemPtr);
+
+ /*
+ * Get information from the result relation info structure.
+ */
+ numIndices = resultRelInfo->ri_NumIndices;
+ relationDescs = resultRelInfo->ri_IndexRelationDescs;
+ indexInfoArray = resultRelInfo->ri_IndexRelationInfo;
+ heapRelation = resultRelInfo->ri_RelationDesc;
+
+ /*
+ * We will use the EState's per-tuple context for evaluating predicates
+ * and index expressions (creating it if it's not already there).
+ */
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /*
+ * For each index, form index tuple and check if it satisfies the
+ * constraint.
+ */
+ for (i = 0; i < numIndices; i++)
+ {
+ Relation indexRelation = relationDescs[i];
+ IndexInfo *indexInfo;
+ bool satisfiesConstraint;
+
+ if (indexRelation == NULL)
+ continue;
+
+ indexInfo = indexInfoArray[i];
+
+ if (!indexInfo->ii_Unique && !indexInfo->ii_ExclusionOps)
+ continue;
+
+ /* If the index is marked as read-only, ignore it */
+ if (!indexInfo->ii_ReadyForInserts)
+ continue;
+
+ /* When specific arbiter indexes requested, only examine them */
+ if (arbiterIndexes != NIL &&
+ !list_member_oid(arbiterIndexes,
+ indexRelation->rd_index->indexrelid))
+ continue;
+
+ if (!indexRelation->rd_index->indimmediate)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("ON CONFLICT does not support deferrable unique constraints/exclusion constraints as arbiters"),
+ errtableconstraint(heapRelation,
+ RelationGetRelationName(indexRelation))));
+
+ checkedIndex = true;
+
+ /* Check for partial index */
+ if (indexInfo->ii_Predicate != NIL)
+ {
+ ExprState *predicate;
+
+ /*
+ * If predicate state not set up yet, create it (in the estate's
+ * per-query context)
+ */
+ predicate = indexInfo->ii_PredicateState;
+ if (predicate == NULL)
+ {
+ predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
+ indexInfo->ii_PredicateState = predicate;
+ }
+
+ /* Skip this index-update if the predicate isn't satisfied */
+ if (!ExecQual(predicate, econtext))
+ continue;
+ }
+
+ /*
+ * FormIndexDatum fills in its values and isnull parameters with the
+ * appropriate values for the column(s) of the index.
+ */
+ FormIndexDatum(indexInfo,
+ slot,
+ estate,
+ values,
+ isnull);
+
+ satisfiesConstraint =
+ check_exclusion_or_unique_constraint(heapRelation, indexRelation,
+ indexInfo, &invalidItemPtr,
+ values, isnull, estate, false,
+ CEOUC_WAIT, true,
+ conflictTid);
+ if (!satisfiesConstraint)
+ return false;
+ }
+
+ if (arbiterIndexes != NIL && !checkedIndex)
+ elog(ERROR, "unexpected failure to find arbiter index");
+
+ return true;
+}
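+
+/*
+ * Illustrative pre-check pattern for INSERT ... ON CONFLICT (hypothetical
+ * caller; the DO NOTHING / DO UPDATE handling is only sketched):
+ *
+ *		if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate,
+ *									   &conflictTid, arbiterIndexes))
+ *		{
+ *			... a conflicting row exists; DO NOTHING skips the insert,
+ *			DO UPDATE locks and updates the row at conflictTid ...
+ *		}
+ */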
+
+/*
+ * Check for violation of an exclusion or unique constraint
+ *
+ * heap: the table containing the new tuple
+ * index: the index supporting the constraint
+ * indexInfo: info about the index, including the exclusion properties
+ * tupleid: heap TID of the new tuple we have just inserted (invalid if we
+ * haven't inserted a new tuple yet)
+ * values, isnull: the *index* column values computed for the new tuple
+ * estate: an EState we can do evaluation in
+ * newIndex: if true, we are trying to build a new index (this affects
+ * only the wording of error messages)
+ * waitMode: whether to wait for concurrent inserters/deleters
+ * violationOK: if true, don't throw error for violation
+ * conflictTid: if not-NULL, the TID of the conflicting tuple is returned here
+ *
+ * Returns true if OK, false if actual or potential violation
+ *
+ * 'waitMode' determines what happens if a conflict is detected with a tuple
+ * that was inserted or deleted by a transaction that's still running.
+ * CEOUC_WAIT means that we wait for the transaction to commit, before
+ * throwing an error or returning. CEOUC_NOWAIT means that we report the
+ * violation immediately; so the violation is only potential, and the caller
+ * must recheck sometime later. This behavior is convenient for deferred
+ * exclusion checks; we need not bother queuing a deferred event if there is
+ * definitely no conflict at insertion time.
+ *
+ * CEOUC_LIVELOCK_PREVENTING_WAIT is like CEOUC_NOWAIT, but we will sometimes
+ * wait anyway, to prevent livelocking if two transactions try inserting at
+ * the same time. This is used with speculative insertions, for INSERT ON
+ * CONFLICT statements. (See notes in file header)
+ *
+ * If violationOK is true, we just report the potential or actual violation to
+ * the caller by returning 'false'. Otherwise we throw a descriptive error
+ * message here. When violationOK is false, a false result is impossible.
+ *
+ * Note: The indexam is normally responsible for checking unique constraints,
+ * so this normally only needs to be used for exclusion constraints. But this
+ * function is also called when doing a "pre-check" for conflicts on a unique
+ * constraint, when doing speculative insertion. Caller may use the returned
+ * conflict TID to take further steps.
+ */
+static bool
+check_exclusion_or_unique_constraint(Relation heap, Relation index,
+ IndexInfo *indexInfo,
+ ItemPointer tupleid,
+ Datum *values, bool *isnull,
+ EState *estate, bool newIndex,
+ CEOUC_WAIT_MODE waitMode,
+ bool violationOK,
+ ItemPointer conflictTid)
+{
+ Oid *constr_procs;
+ uint16 *constr_strats;
+ Oid *index_collations = index->rd_indcollation;
+ int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index);
+ IndexScanDesc index_scan;
+ ScanKeyData scankeys[INDEX_MAX_KEYS];
+ SnapshotData DirtySnapshot;
+ int i;
+ bool conflict;
+ bool found_self;
+ ExprContext *econtext;
+ TupleTableSlot *existing_slot;
+ TupleTableSlot *save_scantuple;
+
+ if (indexInfo->ii_ExclusionOps)
+ {
+ constr_procs = indexInfo->ii_ExclusionProcs;
+ constr_strats = indexInfo->ii_ExclusionStrats;
+ }
+ else
+ {
+ constr_procs = indexInfo->ii_UniqueProcs;
+ constr_strats = indexInfo->ii_UniqueStrats;
+ }
+
+ /*
+ * If any of the input values are NULL, the constraint check is assumed to
+ * pass (i.e., we assume the operators are strict).
+ */
+ for (i = 0; i < indnkeyatts; i++)
+ {
+ if (isnull[i])
+ return true;
+ }
+
+ /*
+ * Search the tuples that are in the index for any violations, including
+ * tuples that aren't visible yet.
+ */
+ InitDirtySnapshot(DirtySnapshot);
+
+ for (i = 0; i < indnkeyatts; i++)
+ {
+ ScanKeyEntryInitialize(&scankeys[i],
+ 0,
+ i + 1,
+ constr_strats[i],
+ InvalidOid,
+ index_collations[i],
+ constr_procs[i],
+ values[i]);
+ }
+
+ /*
+ * Need a TupleTableSlot to put existing tuples in.
+ *
+ * To use FormIndexDatum, we have to make the econtext's scantuple point
+ * to this slot. Be sure to save and restore caller's value for
+ * scantuple.
+ */
+ existing_slot = table_slot_create(heap, NULL);
+
+ econtext = GetPerTupleExprContext(estate);
+ save_scantuple = econtext->ecxt_scantuple;
+ econtext->ecxt_scantuple = existing_slot;
+
+ /*
+ * May have to restart scan from this point if a potential conflict is
+ * found.
+ */
+retry:
+ conflict = false;
+ found_self = false;
+ index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0);
+ index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0);
+
+ while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot))
+ {
+ TransactionId xwait;
+ XLTW_Oper reason_wait;
+ Datum existing_values[INDEX_MAX_KEYS];
+ bool existing_isnull[INDEX_MAX_KEYS];
+ char *error_new;
+ char *error_existing;
+
+ /*
+ * Ignore the entry for the tuple we're trying to check.
+ */
+ if (ItemPointerIsValid(tupleid) &&
+ ItemPointerEquals(tupleid, &existing_slot->tts_tid))
+ {
+ if (found_self) /* should not happen */
+ elog(ERROR, "found self tuple multiple times in index \"%s\"",
+ RelationGetRelationName(index));
+ found_self = true;
+ continue;
+ }
+
+ /*
+ * Extract the index column values and isnull flags from the existing
+ * tuple.
+ */
+ FormIndexDatum(indexInfo, existing_slot, estate,
+ existing_values, existing_isnull);
+
+ /* If lossy indexscan, must recheck the condition */
+ if (index_scan->xs_recheck)
+ {
+ if (!index_recheck_constraint(index,
+ constr_procs,
+ existing_values,
+ existing_isnull,
+ values))
+ continue; /* tuple doesn't actually match, so no
+ * conflict */
+ }
+
+ /*
+ * At this point we have either a conflict or a potential conflict.
+ *
+ * If an in-progress transaction is affecting the visibility of this
+ * tuple, we need to wait for it to complete and then recheck (unless
+ * the caller requested not to). For simplicity we do rechecking by
+ * just restarting the whole scan --- this case probably doesn't
+ * happen often enough to be worth trying harder, and anyway we don't
+ * want to hold any index internal locks while waiting.
+ */
+ xwait = TransactionIdIsValid(DirtySnapshot.xmin) ?
+ DirtySnapshot.xmin : DirtySnapshot.xmax;
+
+ if (TransactionIdIsValid(xwait) &&
+ (waitMode == CEOUC_WAIT ||
+ (waitMode == CEOUC_LIVELOCK_PREVENTING_WAIT &&
+ DirtySnapshot.speculativeToken &&
+ TransactionIdPrecedes(GetCurrentTransactionId(), xwait))))
+ {
+ reason_wait = indexInfo->ii_ExclusionOps ?
+ XLTW_RecheckExclusionConstr : XLTW_InsertIndex;
+ index_endscan(index_scan);
+ if (DirtySnapshot.speculativeToken)
+ SpeculativeInsertionWait(DirtySnapshot.xmin,
+ DirtySnapshot.speculativeToken);
+ else
+ XactLockTableWait(xwait, heap,
+ &existing_slot->tts_tid, reason_wait);
+ goto retry;
+ }
+
+ /*
+ * We have a definite conflict (or a potential one, but the caller
+ * didn't want to wait). Return it to caller, or report it.
+ */
+ if (violationOK)
+ {
+ conflict = true;
+ if (conflictTid)
+ *conflictTid = existing_slot->tts_tid;
+ break;
+ }
+
+ error_new = BuildIndexValueDescription(index, values, isnull);
+ error_existing = BuildIndexValueDescription(index, existing_values,
+ existing_isnull);
+ if (newIndex)
+ ereport(ERROR,
+ (errcode(ERRCODE_EXCLUSION_VIOLATION),
+ errmsg("could not create exclusion constraint \"%s\"",
+ RelationGetRelationName(index)),
+ error_new && error_existing ?
+ errdetail("Key %s conflicts with key %s.",
+ error_new, error_existing) :
+ errdetail("Key conflicts exist."),
+ errtableconstraint(heap,
+ RelationGetRelationName(index))));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_EXCLUSION_VIOLATION),
+ errmsg("conflicting key value violates exclusion constraint \"%s\"",
+ RelationGetRelationName(index)),
+ error_new && error_existing ?
+ errdetail("Key %s conflicts with existing key %s.",
+ error_new, error_existing) :
+ errdetail("Key conflicts with existing key."),
+ errtableconstraint(heap,
+ RelationGetRelationName(index))));
+ }
+
+ index_endscan(index_scan);
+
+ /*
+ * Ordinarily, at this point the search should have found the originally
+ * inserted tuple (if any), unless we exited the loop early because of
+ * conflict. However, it is possible to define exclusion constraints for
+ * which that wouldn't be true --- for instance, if the operator is <>. So
+ * we no longer complain if found_self is still false.
+ */
+
+ econtext->ecxt_scantuple = save_scantuple;
+
+ ExecDropSingleTupleTableSlot(existing_slot);
+
+ return !conflict;
+}
+
+/*
+ * Check for violation of an exclusion constraint
+ *
+ * This is a dumbed down version of check_exclusion_or_unique_constraint
+ * for external callers. They don't need all the special modes.
+ */
+void
+check_exclusion_constraint(Relation heap, Relation index,
+ IndexInfo *indexInfo,
+ ItemPointer tupleid,
+ Datum *values, bool *isnull,
+ EState *estate, bool newIndex)
+{
+ (void) check_exclusion_or_unique_constraint(heap, index, indexInfo, tupleid,
+ values, isnull,
+ estate, newIndex,
+ CEOUC_WAIT, false, NULL);
+}
+
+/*
+ * Check existing tuple's index values to see if it really matches the
+ * exclusion condition against the new_values. Returns true if conflict.
+ */
+static bool
+index_recheck_constraint(Relation index, Oid *constr_procs,
+ Datum *existing_values, bool *existing_isnull,
+ Datum *new_values)
+{
+ int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index);
+ int i;
+
+ for (i = 0; i < indnkeyatts; i++)
+ {
+ /* Assume the exclusion operators are strict */
+ if (existing_isnull[i])
+ return false;
+
+ if (!DatumGetBool(OidFunctionCall2Coll(constr_procs[i],
+ index->rd_indcollation[i],
+ existing_values[i],
+ new_values[i])))
+ return false;
+ }
+
+ return true;
+}
diff --git a/src/backend/executor/execJunk.c b/src/backend/executor/execJunk.c
new file mode 100644
index 0000000..9741897
--- /dev/null
+++ b/src/backend/executor/execJunk.c
@@ -0,0 +1,304 @@
+/*-------------------------------------------------------------------------
+ *
+ * execJunk.c
+ * Junk attribute support stuff....
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execJunk.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+
+/*-------------------------------------------------------------------------
+ * XXX this stuff should be rewritten to take advantage
+ * of ExecProject() and the ProjectionInfo node.
+ * -cim 6/3/91
+ *
+ * An attribute of a tuple living inside the executor can be
+ * either a normal attribute or a "junk" attribute. "junk" attributes
+ * never make it out of the executor, i.e. they are never printed,
+ * returned or stored on disk. Their only purpose in life is to
+ * store some information useful only to the executor, mainly the values
+ * of system attributes like "ctid", or sort key columns that are not to
+ * be output.
+ *
+ * The general idea is the following: A target list consists of a list of
+ * TargetEntry nodes containing expressions. Each TargetEntry has a field
+ * called 'resjunk'. If the value of this field is true then the
+ * corresponding attribute is a "junk" attribute.
+ *
+ * When we initialize a plan we call ExecInitJunkFilter to create a filter.
+ *
+ * We then execute the plan, treating the resjunk attributes like any others.
+ *
+ * Finally, when at the top level we get back a tuple, we can call
+ * ExecFindJunkAttribute/ExecGetJunkAttribute to retrieve the values of the
+ * junk attributes we are interested in, and ExecFilterJunk to remove all the
+ * junk attributes from a tuple. This new "clean" tuple is then printed,
+ * inserted, or updated.
+ *
+ *-------------------------------------------------------------------------
+ */
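+
+/*
+ * Illustrative sketch: given a subplan targetlist "tlist" and a tuple "slot"
+ * produced by that subplan (hypothetical local names), a caller typically
+ * uses this module roughly as follows:
+ *
+ *    JunkFilter *jf = ExecInitJunkFilter(tlist, NULL);
+ *    AttrNumber  ctidno = ExecFindJunkAttribute(jf, "ctid");
+ *    bool        isNull;
+ *    Datum       ctid = ExecGetJunkAttribute(slot, ctidno, &isNull);
+ *    TupleTableSlot *clean = ExecFilterJunk(jf, slot);
+ *
+ * after which "clean" carries only the non-junk columns of "slot".
+ */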
+
+/*
+ * ExecInitJunkFilter
+ *
+ * Initialize the Junk filter.
+ *
+ * The source targetlist is passed in. The output tuple descriptor is
+ * built from the non-junk tlist entries.
+ * An optional resultSlot can be passed as well; otherwise, we create one.
+ */
+JunkFilter *
+ExecInitJunkFilter(List *targetList, TupleTableSlot *slot)
+{
+ JunkFilter *junkfilter;
+ TupleDesc cleanTupType;
+ int cleanLength;
+ AttrNumber *cleanMap;
+
+ /*
+ * Compute the tuple descriptor for the cleaned tuple.
+ */
+ cleanTupType = ExecCleanTypeFromTL(targetList);
+
+ /*
+ * Use the given slot, or make a new slot if we weren't given one.
+ */
+ if (slot)
+ ExecSetSlotDescriptor(slot, cleanTupType);
+ else
+ slot = MakeSingleTupleTableSlot(cleanTupType, &TTSOpsVirtual);
+
+ /*
+ * Now calculate the mapping between the original tuple's attributes and
+ * the "clean" tuple's attributes.
+ *
+ * The "map" is an array of "cleanLength" attribute numbers, i.e. one
+ * entry for every attribute of the "clean" tuple. The value of this entry
+ * is the attribute number of the corresponding attribute of the
+ * "original" tuple. (Zero indicates a NULL output attribute, but we do
+ * not use that feature in this routine.)
+ */
+ cleanLength = cleanTupType->natts;
+ if (cleanLength > 0)
+ {
+ AttrNumber cleanResno;
+ ListCell *t;
+
+ cleanMap = (AttrNumber *) palloc(cleanLength * sizeof(AttrNumber));
+ cleanResno = 0;
+ foreach(t, targetList)
+ {
+ TargetEntry *tle = lfirst(t);
+
+ if (!tle->resjunk)
+ {
+ cleanMap[cleanResno] = tle->resno;
+ cleanResno++;
+ }
+ }
+ Assert(cleanResno == cleanLength);
+ }
+ else
+ cleanMap = NULL;
+
+ /*
+ * Finally create and initialize the JunkFilter struct.
+ */
+ junkfilter = makeNode(JunkFilter);
+
+ junkfilter->jf_targetList = targetList;
+ junkfilter->jf_cleanTupType = cleanTupType;
+ junkfilter->jf_cleanMap = cleanMap;
+ junkfilter->jf_resultSlot = slot;
+
+ return junkfilter;
+}
+
+/*
+ * ExecInitJunkFilterConversion
+ *
+ * Initialize a JunkFilter for rowtype conversions.
+ *
+ * Here, we are given the target "clean" tuple descriptor rather than
+ * inferring it from the targetlist. The target descriptor can contain
+ * deleted columns. It is assumed that the caller has checked that the
+ * non-deleted columns match up with the non-junk columns of the targetlist.
+ */
+JunkFilter *
+ExecInitJunkFilterConversion(List *targetList,
+ TupleDesc cleanTupType,
+ TupleTableSlot *slot)
+{
+ JunkFilter *junkfilter;
+ int cleanLength;
+ AttrNumber *cleanMap;
+ ListCell *t;
+ int i;
+
+ /*
+ * Use the given slot, or make a new slot if we weren't given one.
+ */
+ if (slot)
+ ExecSetSlotDescriptor(slot, cleanTupType);
+ else
+ slot = MakeSingleTupleTableSlot(cleanTupType, &TTSOpsVirtual);
+
+ /*
+ * Calculate the mapping between the original tuple's attributes and the
+ * "clean" tuple's attributes.
+ *
+ * The "map" is an array of "cleanLength" attribute numbers, i.e. one
+ * entry for every attribute of the "clean" tuple. The value of this entry
+ * is the attribute number of the corresponding attribute of the
+ * "original" tuple. We store zero for any deleted attributes, marking
+ * that a NULL is needed in the output tuple.
+ */
+ cleanLength = cleanTupType->natts;
+ if (cleanLength > 0)
+ {
+ cleanMap = (AttrNumber *) palloc0(cleanLength * sizeof(AttrNumber));
+ t = list_head(targetList);
+ for (i = 0; i < cleanLength; i++)
+ {
+ if (TupleDescAttr(cleanTupType, i)->attisdropped)
+ continue; /* map entry is already zero */
+ for (;;)
+ {
+ TargetEntry *tle = lfirst(t);
+
+ t = lnext(targetList, t);
+ if (!tle->resjunk)
+ {
+ cleanMap[i] = tle->resno;
+ break;
+ }
+ }
+ }
+ }
+ else
+ cleanMap = NULL;
+
+ /*
+ * Finally create and initialize the JunkFilter struct.
+ */
+ junkfilter = makeNode(JunkFilter);
+
+ junkfilter->jf_targetList = targetList;
+ junkfilter->jf_cleanTupType = cleanTupType;
+ junkfilter->jf_cleanMap = cleanMap;
+ junkfilter->jf_resultSlot = slot;
+
+ return junkfilter;
+}
+
+/*
+ * ExecFindJunkAttribute
+ *
+ * Locate the specified junk attribute in the junk filter's targetlist,
+ * and return its resno. Returns InvalidAttrNumber if not found.
+ */
+AttrNumber
+ExecFindJunkAttribute(JunkFilter *junkfilter, const char *attrName)
+{
+ return ExecFindJunkAttributeInTlist(junkfilter->jf_targetList, attrName);
+}
+
+/*
+ * ExecFindJunkAttributeInTlist
+ *
+ * Find a junk attribute given a subplan's targetlist (not necessarily
+ * part of a JunkFilter).
+ */
+AttrNumber
+ExecFindJunkAttributeInTlist(List *targetlist, const char *attrName)
+{
+ ListCell *t;
+
+ foreach(t, targetlist)
+ {
+ TargetEntry *tle = lfirst(t);
+
+ if (tle->resjunk && tle->resname &&
+ (strcmp(tle->resname, attrName) == 0))
+ {
+ /* We found it! */
+ return tle->resno;
+ }
+ }
+
+ return InvalidAttrNumber;
+}
+
+/*
+ * ExecFilterJunk
+ *
+ * Construct and return a slot with all the junk attributes removed.
+ */
+TupleTableSlot *
+ExecFilterJunk(JunkFilter *junkfilter, TupleTableSlot *slot)
+{
+ TupleTableSlot *resultSlot;
+ AttrNumber *cleanMap;
+ TupleDesc cleanTupType;
+ int cleanLength;
+ int i;
+ Datum *values;
+ bool *isnull;
+ Datum *old_values;
+ bool *old_isnull;
+
+ /*
+ * Extract all the values of the old tuple.
+ */
+ slot_getallattrs(slot);
+ old_values = slot->tts_values;
+ old_isnull = slot->tts_isnull;
+
+ /*
+ * get info from the junk filter
+ */
+ cleanTupType = junkfilter->jf_cleanTupType;
+ cleanLength = cleanTupType->natts;
+ cleanMap = junkfilter->jf_cleanMap;
+ resultSlot = junkfilter->jf_resultSlot;
+
+ /*
+ * Prepare to build a virtual result tuple.
+ */
+ ExecClearTuple(resultSlot);
+ values = resultSlot->tts_values;
+ isnull = resultSlot->tts_isnull;
+
+ /*
+ * Transpose data into proper fields of the new tuple.
+ */
+ for (i = 0; i < cleanLength; i++)
+ {
+ int j = cleanMap[i];
+
+ if (j == 0)
+ {
+ values[i] = (Datum) 0;
+ isnull[i] = true;
+ }
+ else
+ {
+ values[i] = old_values[j - 1];
+ isnull[i] = old_isnull[j - 1];
+ }
+ }
+
+ /*
+ * And return the virtual tuple.
+ */
+ return ExecStoreVirtualTuple(resultSlot);
+}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
new file mode 100644
index 0000000..b3ce4ba
--- /dev/null
+++ b/src/backend/executor/execMain.c
@@ -0,0 +1,2886 @@
+/*-------------------------------------------------------------------------
+ *
+ * execMain.c
+ * top level executor interface routines
+ *
+ * INTERFACE ROUTINES
+ * ExecutorStart()
+ * ExecutorRun()
+ * ExecutorFinish()
+ * ExecutorEnd()
+ *
+ * These four procedures are the external interface to the executor.
+ * In each case, the query descriptor is required as an argument.
+ *
+ * ExecutorStart must be called at the beginning of execution of any
+ * query plan and ExecutorEnd must always be called at the end of
+ * execution of a plan (unless it is aborted due to error).
+ *
+ * ExecutorRun accepts direction and count arguments that specify whether
+ * the plan is to be executed forwards or backwards, and for how many tuples.
+ * In some cases ExecutorRun may be called multiple times to process all
+ * the tuples for a plan. It is also acceptable to stop short of executing
+ * the whole plan (but only if it is a SELECT).
+ *
+ * ExecutorFinish must be called after the final ExecutorRun call and
+ * before ExecutorEnd. This can be omitted only in case of EXPLAIN,
+ * which should also omit ExecutorRun.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execMain.c
+ *
+ *-------------------------------------------------------------------------
+ */
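+
+/*
+ * Illustrative sketch: a typical caller (the traffic cop, for instance)
+ * drives one execution of a query roughly as follows, assuming "queryDesc"
+ * was built with CreateQueryDesc:
+ *
+ *    ExecutorStart(queryDesc, 0);
+ *    ExecutorRun(queryDesc, ForwardScanDirection, 0, true);
+ *    ExecutorFinish(queryDesc);
+ *    ExecutorEnd(queryDesc);
+ *    FreeQueryDesc(queryDesc);
+ *
+ * A count of zero in ExecutorRun means "run to completion"; EXPLAIN-only
+ * execution omits the ExecutorRun and ExecutorFinish steps, per the rules
+ * above.
+ */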
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "access/tableam.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_publication.h"
+#include "commands/matview.h"
+#include "commands/trigger.h"
+#include "executor/execdebug.h"
+#include "executor/nodeSubplan.h"
+#include "foreign/fdwapi.h"
+#include "jit/jit.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "parser/parsetree.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "tcop/utility.h"
+#include "utils/acl.h"
+#include "utils/backend_status.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/partcache.h"
+#include "utils/rls.h"
+#include "utils/ruleutils.h"
+#include "utils/snapmgr.h"
+
+
+/* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */
+ExecutorStart_hook_type ExecutorStart_hook = NULL;
+ExecutorRun_hook_type ExecutorRun_hook = NULL;
+ExecutorFinish_hook_type ExecutorFinish_hook = NULL;
+ExecutorEnd_hook_type ExecutorEnd_hook = NULL;
+
+/* Hook for plugin to get control in ExecCheckRTPerms() */
+ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL;
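+
+/*
+ * Illustrative sketch: extensions normally chain onto these hooks by saving
+ * the previous value at load time and falling back to the standard_ routine
+ * when no previous hook exists, e.g. for ExecutorStart (hypothetical names):
+ *
+ *    static ExecutorStart_hook_type prev_ExecutorStart = NULL;
+ *
+ *    static void
+ *    plugin_ExecutorStart(QueryDesc *queryDesc, int eflags)
+ *    {
+ *        if (prev_ExecutorStart)
+ *            prev_ExecutorStart(queryDesc, eflags);
+ *        else
+ *            standard_ExecutorStart(queryDesc, eflags);
+ *    }
+ *
+ *    void
+ *    _PG_init(void)
+ *    {
+ *        prev_ExecutorStart = ExecutorStart_hook;
+ *        ExecutorStart_hook = plugin_ExecutorStart;
+ *    }
+ */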
+
+/* decls for local routines only used within this module */
+static void InitPlan(QueryDesc *queryDesc, int eflags);
+static void CheckValidRowMarkRel(Relation rel, RowMarkType markType);
+static void ExecPostprocessPlan(EState *estate);
+static void ExecEndPlan(PlanState *planstate, EState *estate);
+static void ExecutePlan(EState *estate, PlanState *planstate,
+ bool use_parallel_mode,
+ CmdType operation,
+ bool sendTuples,
+ uint64 numberTuples,
+ ScanDirection direction,
+ DestReceiver *dest,
+ bool execute_once);
+static bool ExecCheckRTEPerms(RangeTblEntry *rte);
+static bool ExecCheckRTEPermsModified(Oid relOid, Oid userid,
+ Bitmapset *modifiedCols,
+ AclMode requiredPerms);
+static void ExecCheckXactReadOnly(PlannedStmt *plannedstmt);
+static char *ExecBuildSlotValueDescription(Oid reloid,
+ TupleTableSlot *slot,
+ TupleDesc tupdesc,
+ Bitmapset *modifiedCols,
+ int maxfieldlen);
+static void EvalPlanQualStart(EPQState *epqstate, Plan *planTree);
+
+/* end of local decls */
+
+
+/* ----------------------------------------------------------------
+ * ExecutorStart
+ *
+ * This routine must be called at the beginning of any execution of any
+ * query plan
+ *
+ * Takes a QueryDesc previously created by CreateQueryDesc (which is separate
+ * only because some places use QueryDescs for utility commands). The tupDesc
+ * field of the QueryDesc is filled in to describe the tuples that will be
+ * returned, and the internal fields (estate and planstate) are set up.
+ *
+ * eflags contains flag bits as described in executor.h.
+ *
+ * NB: the CurrentMemoryContext when this is called will become the parent
+ * of the per-query context used for this Executor invocation.
+ *
+ * We provide a function hook variable that lets loadable plugins
+ * get control when ExecutorStart is called. Such a plugin would
+ * normally call standard_ExecutorStart().
+ *
+ * ----------------------------------------------------------------
+ */
+void
+ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+ /*
+ * In some cases (e.g. an EXECUTE statement) a query execution will skip
+ * parse analysis, which means that the query_id won't be reported. Note
+ * that it's harmless to report the query_id multiple times, as the call
+ * will be ignored if the top level query_id has already been reported.
+ */
+ pgstat_report_query_id(queryDesc->plannedstmt->queryId, false);
+
+ if (ExecutorStart_hook)
+ (*ExecutorStart_hook) (queryDesc, eflags);
+ else
+ standard_ExecutorStart(queryDesc, eflags);
+}
+
+void
+standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+ EState *estate;
+ MemoryContext oldcontext;
+
+ /* sanity checks: queryDesc must not be started already */
+ Assert(queryDesc != NULL);
+ Assert(queryDesc->estate == NULL);
+
+ /*
+ * If the transaction is read-only, we need to check if any writes are
+ * planned to non-temporary tables. EXPLAIN is considered read-only.
+ *
+ * Don't allow writes in parallel mode. Supporting UPDATE and DELETE
+ * would require (a) storing the combo CID hash in shared memory, rather
+ * than synchronizing it just once at the start of parallelism, and (b) an
+ * alternative to heap_update()'s reliance on xmax for mutual exclusion.
+ * INSERT may have no such troubles, but we forbid it to simplify the
+ * checks.
+ *
+ * We have lower-level defenses in CommandCounterIncrement and elsewhere
+ * against performing unsafe operations in parallel mode, but this gives a
+ * more user-friendly error message.
+ */
+ if ((XactReadOnly || IsInParallelMode()) &&
+ !(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ ExecCheckXactReadOnly(queryDesc->plannedstmt);
+
+ /*
+ * Build EState, switch into per-query memory context for startup.
+ */
+ estate = CreateExecutorState();
+ queryDesc->estate = estate;
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ /*
+ * Fill in external parameters, if any, from queryDesc; and allocate
+ * workspace for internal parameters
+ */
+ estate->es_param_list_info = queryDesc->params;
+
+ if (queryDesc->plannedstmt->paramExecTypes != NIL)
+ {
+ int nParamExec;
+
+ nParamExec = list_length(queryDesc->plannedstmt->paramExecTypes);
+ estate->es_param_exec_vals = (ParamExecData *)
+ palloc0(nParamExec * sizeof(ParamExecData));
+ }
+
+ /* We now require all callers to provide sourceText */
+ Assert(queryDesc->sourceText != NULL);
+ estate->es_sourceText = queryDesc->sourceText;
+
+ /*
+ * Fill in the query environment, if any, from queryDesc.
+ */
+ estate->es_queryEnv = queryDesc->queryEnv;
+
+ /*
+ * If non-read-only query, set the command ID to mark output tuples with
+ */
+ switch (queryDesc->operation)
+ {
+ case CMD_SELECT:
+
+ /*
+ * SELECT FOR [KEY] UPDATE/SHARE and modifying CTEs need to mark
+ * tuples
+ */
+ if (queryDesc->plannedstmt->rowMarks != NIL ||
+ queryDesc->plannedstmt->hasModifyingCTE)
+ estate->es_output_cid = GetCurrentCommandId(true);
+
+ /*
+ * A SELECT without modifying CTEs can't possibly queue triggers,
+ * so force skip-triggers mode. This is just a marginal efficiency
+ * hack, since AfterTriggerBeginQuery/AfterTriggerEndQuery aren't
+ * all that expensive, but we might as well do it.
+ */
+ if (!queryDesc->plannedstmt->hasModifyingCTE)
+ eflags |= EXEC_FLAG_SKIP_TRIGGERS;
+ break;
+
+ case CMD_INSERT:
+ case CMD_DELETE:
+ case CMD_UPDATE:
+ estate->es_output_cid = GetCurrentCommandId(true);
+ break;
+
+ default:
+ elog(ERROR, "unrecognized operation code: %d",
+ (int) queryDesc->operation);
+ break;
+ }
+
+ /*
+ * Copy other important information into the EState
+ */
+ estate->es_snapshot = RegisterSnapshot(queryDesc->snapshot);
+ estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot);
+ estate->es_top_eflags = eflags;
+ estate->es_instrument = queryDesc->instrument_options;
+ estate->es_jit_flags = queryDesc->plannedstmt->jitFlags;
+
+ /*
+ * Set up an AFTER-trigger statement context, unless told not to, or
+ * unless it's EXPLAIN-only mode (when ExecutorFinish won't be called).
+ */
+ if (!(eflags & (EXEC_FLAG_SKIP_TRIGGERS | EXEC_FLAG_EXPLAIN_ONLY)))
+ AfterTriggerBeginQuery();
+
+ /*
+ * Initialize the plan state tree
+ */
+ InitPlan(queryDesc, eflags);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/* ----------------------------------------------------------------
+ * ExecutorRun
+ *
+ * This is the main routine of the executor module. It accepts
+ * the query descriptor from the traffic cop and executes the
+ * query plan.
+ *
+ * ExecutorStart must have been called already.
+ *
+ * If direction is NoMovementScanDirection then nothing is done
+ * except to start up/shut down the destination. Otherwise,
+ * we retrieve up to 'count' tuples in the specified direction.
+ *
+ * Note: count = 0 is interpreted as no portal limit, i.e., run to
+ * completion. Also note that the count limit is only applied to
+ * retrieved tuples, not for instance to those inserted/updated/deleted
+ * by a ModifyTable plan node.
+ *
+ * There is no return value, but output tuples (if any) are sent to
+ * the destination receiver specified in the QueryDesc; and the number
+ * of tuples processed at the top level can be found in
+ * estate->es_processed.
+ *
+ * We provide a function hook variable that lets loadable plugins
+ * get control when ExecutorRun is called. Such a plugin would
+ * normally call standard_ExecutorRun().
+ *
+ * ----------------------------------------------------------------
+ */
+void
+ExecutorRun(QueryDesc *queryDesc,
+ ScanDirection direction, uint64 count,
+ bool execute_once)
+{
+ if (ExecutorRun_hook)
+ (*ExecutorRun_hook) (queryDesc, direction, count, execute_once);
+ else
+ standard_ExecutorRun(queryDesc, direction, count, execute_once);
+}
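+
+/*
+ * Illustrative sketch: a cursor-style caller can fetch in batches by passing
+ * a nonzero count and execute_once = false, e.g. 50 rows per call:
+ *
+ *    ExecutorRun(queryDesc, ForwardScanDirection, 50, false);
+ *
+ * The number of tuples produced by the call is left in estate->es_processed,
+ * as noted above.  Passing execute_once = false also forces the plan to run
+ * without parallelism, since execution may be resumed later.
+ */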
+
+void
+standard_ExecutorRun(QueryDesc *queryDesc,
+ ScanDirection direction, uint64 count, bool execute_once)
+{
+ EState *estate;
+ CmdType operation;
+ DestReceiver *dest;
+ bool sendTuples;
+ MemoryContext oldcontext;
+
+ /* sanity checks */
+ Assert(queryDesc != NULL);
+
+ estate = queryDesc->estate;
+
+ Assert(estate != NULL);
+ Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY));
+
+ /*
+ * Switch into per-query memory context
+ */
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ /* Allow instrumentation of Executor overall runtime */
+ if (queryDesc->totaltime)
+ InstrStartNode(queryDesc->totaltime);
+
+ /*
+ * extract information from the query descriptor.
+ */
+ operation = queryDesc->operation;
+ dest = queryDesc->dest;
+
+ /*
+ * startup tuple receiver, if we will be emitting tuples
+ */
+ estate->es_processed = 0;
+
+ sendTuples = (operation == CMD_SELECT ||
+ queryDesc->plannedstmt->hasReturning);
+
+ if (sendTuples)
+ dest->rStartup(dest, operation, queryDesc->tupDesc);
+
+ /*
+ * run plan
+ */
+ if (!ScanDirectionIsNoMovement(direction))
+ {
+ if (execute_once && queryDesc->already_executed)
+ elog(ERROR, "can't re-execute query flagged for single execution");
+ queryDesc->already_executed = true;
+
+ ExecutePlan(estate,
+ queryDesc->planstate,
+ queryDesc->plannedstmt->parallelModeNeeded,
+ operation,
+ sendTuples,
+ count,
+ direction,
+ dest,
+ execute_once);
+ }
+
+ /*
+ * shutdown tuple receiver, if we started it
+ */
+ if (sendTuples)
+ dest->rShutdown(dest);
+
+ if (queryDesc->totaltime)
+ InstrStopNode(queryDesc->totaltime, estate->es_processed);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/* ----------------------------------------------------------------
+ * ExecutorFinish
+ *
+ * This routine must be called after the last ExecutorRun call.
+ * It performs cleanup such as firing AFTER triggers. It is
+ * separate from ExecutorEnd because EXPLAIN ANALYZE needs to
+ * include these actions in the total runtime.
+ *
+ * We provide a function hook variable that lets loadable plugins
+ * get control when ExecutorFinish is called. Such a plugin would
+ * normally call standard_ExecutorFinish().
+ *
+ * ----------------------------------------------------------------
+ */
+void
+ExecutorFinish(QueryDesc *queryDesc)
+{
+ if (ExecutorFinish_hook)
+ (*ExecutorFinish_hook) (queryDesc);
+ else
+ standard_ExecutorFinish(queryDesc);
+}
+
+void
+standard_ExecutorFinish(QueryDesc *queryDesc)
+{
+ EState *estate;
+ MemoryContext oldcontext;
+
+ /* sanity checks */
+ Assert(queryDesc != NULL);
+
+ estate = queryDesc->estate;
+
+ Assert(estate != NULL);
+ Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY));
+
+ /* This should be run once and only once per Executor instance */
+ Assert(!estate->es_finished);
+
+ /* Switch into per-query memory context */
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ /* Allow instrumentation of Executor overall runtime */
+ if (queryDesc->totaltime)
+ InstrStartNode(queryDesc->totaltime);
+
+ /* Run ModifyTable nodes to completion */
+ ExecPostprocessPlan(estate);
+
+ /* Execute queued AFTER triggers, unless told not to */
+ if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS))
+ AfterTriggerEndQuery(estate);
+
+ if (queryDesc->totaltime)
+ InstrStopNode(queryDesc->totaltime, 0);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ estate->es_finished = true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecutorEnd
+ *
+ * This routine must be called at the end of execution of any
+ * query plan
+ *
+ * We provide a function hook variable that lets loadable plugins
+ * get control when ExecutorEnd is called. Such a plugin would
+ * normally call standard_ExecutorEnd().
+ *
+ * ----------------------------------------------------------------
+ */
+void
+ExecutorEnd(QueryDesc *queryDesc)
+{
+ if (ExecutorEnd_hook)
+ (*ExecutorEnd_hook) (queryDesc);
+ else
+ standard_ExecutorEnd(queryDesc);
+}
+
+void
+standard_ExecutorEnd(QueryDesc *queryDesc)
+{
+ EState *estate;
+ MemoryContext oldcontext;
+
+ /* sanity checks */
+ Assert(queryDesc != NULL);
+
+ estate = queryDesc->estate;
+
+ Assert(estate != NULL);
+
+ /*
+ * Check that ExecutorFinish was called, unless in EXPLAIN-only mode. This
+ * Assert is needed because ExecutorFinish is new as of 9.1, and callers
+ * might forget to call it.
+ */
+ Assert(estate->es_finished ||
+ (estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY));
+
+ /*
+ * Switch into per-query memory context to run ExecEndPlan
+ */
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ ExecEndPlan(queryDesc->planstate, estate);
+
+ /* do away with our snapshots */
+ UnregisterSnapshot(estate->es_snapshot);
+ UnregisterSnapshot(estate->es_crosscheck_snapshot);
+
+ /*
+ * Must switch out of context before destroying it
+ */
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * Release EState and per-query memory context. This should release
+ * everything the executor has allocated.
+ */
+ FreeExecutorState(estate);
+
+ /* Reset queryDesc fields that no longer point to anything */
+ queryDesc->tupDesc = NULL;
+ queryDesc->estate = NULL;
+ queryDesc->planstate = NULL;
+ queryDesc->totaltime = NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecutorRewind
+ *
+ * This routine may be called on an open queryDesc to rewind it
+ * to the start.
+ * ----------------------------------------------------------------
+ */
+void
+ExecutorRewind(QueryDesc *queryDesc)
+{
+ EState *estate;
+ MemoryContext oldcontext;
+
+ /* sanity checks */
+ Assert(queryDesc != NULL);
+
+ estate = queryDesc->estate;
+
+ Assert(estate != NULL);
+
+ /* It's probably not sensible to rescan updating queries */
+ Assert(queryDesc->operation == CMD_SELECT);
+
+ /*
+ * Switch into per-query memory context
+ */
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ /*
+ * rescan plan
+ */
+ ExecReScan(queryDesc->planstate);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * ExecCheckRTPerms
+ * Check access permissions for all relations listed in a range table.
+ *
+ * Returns true if permissions are adequate. Otherwise, throws an appropriate
+ * error if ereport_on_violation is true, or simply returns false.
+ *
+ * Note that this does NOT address row-level security policies (aka: RLS). If
+ * rows will be returned to the user as a result of this permission check
+ * passing, then RLS also needs to be consulted (and check_enable_rls()).
+ *
+ * See rewrite/rowsecurity.c.
+ */
+bool
+ExecCheckRTPerms(List *rangeTable, bool ereport_on_violation)
+{
+ ListCell *l;
+ bool result = true;
+
+ foreach(l, rangeTable)
+ {
+ RangeTblEntry *rte = (RangeTblEntry *) lfirst(l);
+
+ result = ExecCheckRTEPerms(rte);
+ if (!result)
+ {
+ Assert(rte->rtekind == RTE_RELATION);
+ if (ereport_on_violation)
+ aclcheck_error(ACLCHECK_NO_PRIV, get_relkind_objtype(get_rel_relkind(rte->relid)),
+ get_rel_name(rte->relid));
+ return false;
+ }
+ }
+
+ if (ExecutorCheckPerms_hook)
+ result = (*ExecutorCheckPerms_hook) (rangeTable,
+ ereport_on_violation);
+ return result;
+}
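+
+/*
+ * Illustrative sketch: a security extension can tighten these checks by
+ * installing ExecutorCheckPerms_hook with a function of the matching
+ * signature (hypothetical name):
+ *
+ *    static bool
+ *    plugin_check_perms(List *rangeTable, bool ereport_on_violation)
+ *    {
+ *        ... inspect each RangeTblEntry; to deny access, ereport an error
+ *        ... (when ereport_on_violation is true) or return false ...
+ *        return true;
+ *    }
+ *
+ * Note that the hook is reached only after the built-in checks above have
+ * succeeded for every relation in the range table.
+ */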
+
+/*
+ * ExecCheckRTEPerms
+ * Check access permissions for a single RTE.
+ */
+static bool
+ExecCheckRTEPerms(RangeTblEntry *rte)
+{
+ AclMode requiredPerms;
+ AclMode relPerms;
+ AclMode remainingPerms;
+ Oid relOid;
+ Oid userid;
+
+ /*
+ * Only plain-relation RTEs need to be checked here. Function RTEs are
+ * checked when the function is prepared for execution. Join, subquery,
+ * and special RTEs need no checks.
+ */
+ if (rte->rtekind != RTE_RELATION)
+ return true;
+
+ /*
+ * No work if requiredPerms is empty.
+ */
+ requiredPerms = rte->requiredPerms;
+ if (requiredPerms == 0)
+ return true;
+
+ relOid = rte->relid;
+
+ /*
+ * userid to check as: current user unless we have a setuid indication.
+ *
+ * Note: GetUserId() is presently fast enough that there's no harm in
+ * calling it separately for each RTE. If that stops being true, we could
+ * call it once in ExecCheckRTPerms and pass the userid down from there.
+ * But for now, no need for the extra clutter.
+ */
+ userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
+
+ /*
+ * We must have *all* the requiredPerms bits, but some of the bits can be
+ * satisfied from column-level rather than relation-level permissions.
+ * First, remove any bits that are satisfied by relation permissions.
+ */
+ relPerms = pg_class_aclmask(relOid, userid, requiredPerms, ACLMASK_ALL);
+ remainingPerms = requiredPerms & ~relPerms;
+ if (remainingPerms != 0)
+ {
+ int col = -1;
+
+ /*
+ * If we lack any permissions that exist only as relation permissions,
+ * we can fail straight away.
+ */
+ if (remainingPerms & ~(ACL_SELECT | ACL_INSERT | ACL_UPDATE))
+ return false;
+
+ /*
+ * Check to see if we have the needed privileges at column level.
+ *
+ * Note: failures just report a table-level error; it would be nicer
+ * to report a column-level error if we have some but not all of the
+ * column privileges.
+ */
+ if (remainingPerms & ACL_SELECT)
+ {
+ /*
+ * When the query doesn't explicitly reference any columns (for
+ * example, SELECT COUNT(*) FROM table), allow the query if we
+ * have SELECT on any column of the rel, as per SQL spec.
+ */
+ if (bms_is_empty(rte->selectedCols))
+ {
+ if (pg_attribute_aclcheck_all(relOid, userid, ACL_SELECT,
+ ACLMASK_ANY) != ACLCHECK_OK)
+ return false;
+ }
+
+ while ((col = bms_next_member(rte->selectedCols, col)) >= 0)
+ {
+ /* bit #s are offset by FirstLowInvalidHeapAttributeNumber */
+ AttrNumber attno = col + FirstLowInvalidHeapAttributeNumber;
+
+ if (attno == InvalidAttrNumber)
+ {
+ /* Whole-row reference, must have priv on all cols */
+ if (pg_attribute_aclcheck_all(relOid, userid, ACL_SELECT,
+ ACLMASK_ALL) != ACLCHECK_OK)
+ return false;
+ }
+ else
+ {
+ if (pg_attribute_aclcheck(relOid, attno, userid,
+ ACL_SELECT) != ACLCHECK_OK)
+ return false;
+ }
+ }
+ }
+
+ /*
+ * Basically the same for the mod columns, for both INSERT and UPDATE
+ * privilege as specified by remainingPerms.
+ */
+ if (remainingPerms & ACL_INSERT && !ExecCheckRTEPermsModified(relOid,
+ userid,
+ rte->insertedCols,
+ ACL_INSERT))
+ return false;
+
+ if (remainingPerms & ACL_UPDATE && !ExecCheckRTEPermsModified(relOid,
+ userid,
+ rte->updatedCols,
+ ACL_UPDATE))
+ return false;
+ }
+ return true;
+}
+
+/*
+ * ExecCheckRTEPermsModified
+ * Check INSERT or UPDATE access permissions for a single RTE (these
+ * are processed uniformly).
+ */
+static bool
+ExecCheckRTEPermsModified(Oid relOid, Oid userid, Bitmapset *modifiedCols,
+ AclMode requiredPerms)
+{
+ int col = -1;
+
+ /*
+ * When the query doesn't explicitly update any columns, allow the query
+ * if we have permission on any column of the rel. This is to handle
+ * SELECT FOR UPDATE as well as possible corner cases in UPDATE.
+ */
+ if (bms_is_empty(modifiedCols))
+ {
+ if (pg_attribute_aclcheck_all(relOid, userid, requiredPerms,
+ ACLMASK_ANY) != ACLCHECK_OK)
+ return false;
+ }
+
+ while ((col = bms_next_member(modifiedCols, col)) >= 0)
+ {
+ /* bit #s are offset by FirstLowInvalidHeapAttributeNumber */
+ AttrNumber attno = col + FirstLowInvalidHeapAttributeNumber;
+
+ if (attno == InvalidAttrNumber)
+ {
+ /* whole-row reference can't happen here */
+ elog(ERROR, "whole-row update is not implemented");
+ }
+ else
+ {
+ if (pg_attribute_aclcheck(relOid, attno, userid,
+ requiredPerms) != ACLCHECK_OK)
+ return false;
+ }
+ }
+ return true;
+}
+
+/*
+ * Check that the query does not imply any writes to non-temp tables;
+ * unless we're in parallel mode, in which case don't even allow writes
+ * to temp tables.
+ *
+ * Note: in a Hot Standby this would need to reject writes to temp
+ * tables just as we do in parallel mode; but an HS standby can't have created
+ * any temp tables in the first place, so no need to check that.
+ */
+static void
+ExecCheckXactReadOnly(PlannedStmt *plannedstmt)
+{
+ ListCell *l;
+
+ /*
+ * Fail if write permissions are requested in parallel mode for any table
+ * (temp or non-temp); otherwise fail only for non-temp tables.
+ */
+ foreach(l, plannedstmt->rtable)
+ {
+ RangeTblEntry *rte = (RangeTblEntry *) lfirst(l);
+
+ if (rte->rtekind != RTE_RELATION)
+ continue;
+
+ if ((rte->requiredPerms & (~ACL_SELECT)) == 0)
+ continue;
+
+ if (isTempNamespace(get_rel_namespace(rte->relid)))
+ continue;
+
+ PreventCommandIfReadOnly(CreateCommandName((Node *) plannedstmt));
+ }
+
+ if (plannedstmt->commandType != CMD_SELECT || plannedstmt->hasModifyingCTE)
+ PreventCommandIfParallelMode(CreateCommandName((Node *) plannedstmt));
+}
+
+
+/* ----------------------------------------------------------------
+ * InitPlan
+ *
+ * Initializes the query plan: open files, allocate storage
+ * and start up the rule manager
+ * ----------------------------------------------------------------
+ */
+static void
+InitPlan(QueryDesc *queryDesc, int eflags)
+{
+ CmdType operation = queryDesc->operation;
+ PlannedStmt *plannedstmt = queryDesc->plannedstmt;
+ Plan *plan = plannedstmt->planTree;
+ List *rangeTable = plannedstmt->rtable;
+ EState *estate = queryDesc->estate;
+ PlanState *planstate;
+ TupleDesc tupType;
+ ListCell *l;
+ int i;
+
+ /*
+ * Do permissions checks
+ */
+ ExecCheckRTPerms(rangeTable, true);
+
+ /*
+ * initialize the node's execution state
+ */
+ ExecInitRangeTable(estate, rangeTable);
+
+ estate->es_plannedstmt = plannedstmt;
+
+ /*
+ * Next, build the ExecRowMark array from the PlanRowMark(s), if any.
+ */
+ if (plannedstmt->rowMarks)
+ {
+ estate->es_rowmarks = (ExecRowMark **)
+ palloc0(estate->es_range_table_size * sizeof(ExecRowMark *));
+ foreach(l, plannedstmt->rowMarks)
+ {
+ PlanRowMark *rc = (PlanRowMark *) lfirst(l);
+ Oid relid;
+ Relation relation;
+ ExecRowMark *erm;
+
+ /* ignore "parent" rowmarks; they are irrelevant at runtime */
+ if (rc->isParent)
+ continue;
+
+ /* get relation's OID (will produce InvalidOid if subquery) */
+ relid = exec_rt_fetch(rc->rti, estate)->relid;
+
+ /* open relation, if we need to access it for this mark type */
+ switch (rc->markType)
+ {
+ case ROW_MARK_EXCLUSIVE:
+ case ROW_MARK_NOKEYEXCLUSIVE:
+ case ROW_MARK_SHARE:
+ case ROW_MARK_KEYSHARE:
+ case ROW_MARK_REFERENCE:
+ relation = ExecGetRangeTableRelation(estate, rc->rti);
+ break;
+ case ROW_MARK_COPY:
+ /* no physical table access is required */
+ relation = NULL;
+ break;
+ default:
+ elog(ERROR, "unrecognized markType: %d", rc->markType);
+ relation = NULL; /* keep compiler quiet */
+ break;
+ }
+
+ /* Check that relation is a legal target for marking */
+ if (relation)
+ CheckValidRowMarkRel(relation, rc->markType);
+
+ erm = (ExecRowMark *) palloc(sizeof(ExecRowMark));
+ erm->relation = relation;
+ erm->relid = relid;
+ erm->rti = rc->rti;
+ erm->prti = rc->prti;
+ erm->rowmarkId = rc->rowmarkId;
+ erm->markType = rc->markType;
+ erm->strength = rc->strength;
+ erm->waitPolicy = rc->waitPolicy;
+ erm->ermActive = false;
+ ItemPointerSetInvalid(&(erm->curCtid));
+ erm->ermExtra = NULL;
+
+ Assert(erm->rti > 0 && erm->rti <= estate->es_range_table_size &&
+ estate->es_rowmarks[erm->rti - 1] == NULL);
+
+ estate->es_rowmarks[erm->rti - 1] = erm;
+ }
+ }
+
+ /*
+ * Initialize the executor's tuple table to empty.
+ */
+ estate->es_tupleTable = NIL;
+
+ /* signal that this EState is not used for EPQ */
+ estate->es_epq_active = NULL;
+
+ /*
+ * Initialize private state information for each SubPlan. We must do this
+ * before running ExecInitNode on the main query tree, since
+ * ExecInitSubPlan expects to be able to find these entries.
+ */
+ Assert(estate->es_subplanstates == NIL);
+ i = 1; /* subplan indices count from 1 */
+ foreach(l, plannedstmt->subplans)
+ {
+ Plan *subplan = (Plan *) lfirst(l);
+ PlanState *subplanstate;
+ int sp_eflags;
+
+ /*
+ * A subplan will never need to do BACKWARD scan nor MARK/RESTORE. If
+ * it is a parameterless subplan (not initplan), we suggest that it be
+ * prepared to handle REWIND efficiently; otherwise there is no need.
+ */
+ sp_eflags = eflags
+ & (EXEC_FLAG_EXPLAIN_ONLY | EXEC_FLAG_WITH_NO_DATA);
+ if (bms_is_member(i, plannedstmt->rewindPlanIDs))
+ sp_eflags |= EXEC_FLAG_REWIND;
+
+ subplanstate = ExecInitNode(subplan, estate, sp_eflags);
+
+ estate->es_subplanstates = lappend(estate->es_subplanstates,
+ subplanstate);
+
+ i++;
+ }
+
+ /*
+ * Initialize the private state information for all the nodes in the query
+ * tree. This opens files, allocates storage and leaves us ready to start
+ * processing tuples.
+ */
+ planstate = ExecInitNode(plan, estate, eflags);
+
+ /*
+ * Get the tuple descriptor describing the type of tuples to return.
+ */
+ tupType = ExecGetResultType(planstate);
+
+ /*
+ * Initialize the junk filter if needed. SELECT queries need a filter if
+ * there are any junk attrs in the top-level tlist.
+ */
+ if (operation == CMD_SELECT)
+ {
+ bool junk_filter_needed = false;
+ ListCell *tlist;
+
+ foreach(tlist, plan->targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(tlist);
+
+ if (tle->resjunk)
+ {
+ junk_filter_needed = true;
+ break;
+ }
+ }
+
+ if (junk_filter_needed)
+ {
+ JunkFilter *j;
+ TupleTableSlot *slot;
+
+ slot = ExecInitExtraTupleSlot(estate, NULL, &TTSOpsVirtual);
+ j = ExecInitJunkFilter(planstate->plan->targetlist,
+ slot);
+ estate->es_junkFilter = j;
+
+ /* Want to return the cleaned tuple type */
+ tupType = j->jf_cleanTupType;
+ }
+ }
+
+ queryDesc->tupDesc = tupType;
+ queryDesc->planstate = planstate;
+}
+
+/*
+ * Check that a proposed result relation is a legal target for the operation
+ *
+ * Generally the parser and/or planner should have noticed any such mistake
+ * already, but let's make sure.
+ *
+ * Note: when changing this function, you probably also need to look at
+ * CheckValidRowMarkRel.
+ */
+void
+CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation)
+{
+ Relation resultRel = resultRelInfo->ri_RelationDesc;
+ TriggerDesc *trigDesc = resultRel->trigdesc;
+ FdwRoutine *fdwroutine;
+
+ switch (resultRel->rd_rel->relkind)
+ {
+ case RELKIND_RELATION:
+ case RELKIND_PARTITIONED_TABLE:
+ CheckCmdReplicaIdentity(resultRel, operation);
+ break;
+ case RELKIND_SEQUENCE:
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot change sequence \"%s\"",
+ RelationGetRelationName(resultRel))));
+ break;
+ case RELKIND_TOASTVALUE:
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot change TOAST relation \"%s\"",
+ RelationGetRelationName(resultRel))));
+ break;
+ case RELKIND_VIEW:
+
+ /*
+ * Okay only if there's a suitable INSTEAD OF trigger. Messages
+ * here should match rewriteHandler.c's rewriteTargetView and
+ * RewriteQuery, except that we omit errdetail because we haven't
+ * got the information handy (and given that we really shouldn't
+ * get here anyway, it's not worth great exertion to get).
+ */
+ switch (operation)
+ {
+ case CMD_INSERT:
+ if (!trigDesc || !trigDesc->trig_insert_instead_row)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot insert into view \"%s\"",
+ RelationGetRelationName(resultRel)),
+ errhint("To enable inserting into the view, provide an INSTEAD OF INSERT trigger or an unconditional ON INSERT DO INSTEAD rule.")));
+ break;
+ case CMD_UPDATE:
+ if (!trigDesc || !trigDesc->trig_update_instead_row)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot update view \"%s\"",
+ RelationGetRelationName(resultRel)),
+ errhint("To enable updating the view, provide an INSTEAD OF UPDATE trigger or an unconditional ON UPDATE DO INSTEAD rule.")));
+ break;
+ case CMD_DELETE:
+ if (!trigDesc || !trigDesc->trig_delete_instead_row)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot delete from view \"%s\"",
+ RelationGetRelationName(resultRel)),
+ errhint("To enable deleting from the view, provide an INSTEAD OF DELETE trigger or an unconditional ON DELETE DO INSTEAD rule.")));
+ break;
+ default:
+ elog(ERROR, "unrecognized CmdType: %d", (int) operation);
+ break;
+ }
+ break;
+ case RELKIND_MATVIEW:
+ if (!MatViewIncrementalMaintenanceIsEnabled())
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot change materialized view \"%s\"",
+ RelationGetRelationName(resultRel))));
+ break;
+ case RELKIND_FOREIGN_TABLE:
+ /* Okay only if the FDW supports it */
+ fdwroutine = resultRelInfo->ri_FdwRoutine;
+ switch (operation)
+ {
+ case CMD_INSERT:
+ if (fdwroutine->ExecForeignInsert == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot insert into foreign table \"%s\"",
+ RelationGetRelationName(resultRel))));
+ if (fdwroutine->IsForeignRelUpdatable != NULL &&
+ (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_INSERT)) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("foreign table \"%s\" does not allow inserts",
+ RelationGetRelationName(resultRel))));
+ break;
+ case CMD_UPDATE:
+ if (fdwroutine->ExecForeignUpdate == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot update foreign table \"%s\"",
+ RelationGetRelationName(resultRel))));
+ if (fdwroutine->IsForeignRelUpdatable != NULL &&
+ (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_UPDATE)) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("foreign table \"%s\" does not allow updates",
+ RelationGetRelationName(resultRel))));
+ break;
+ case CMD_DELETE:
+ if (fdwroutine->ExecForeignDelete == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot delete from foreign table \"%s\"",
+ RelationGetRelationName(resultRel))));
+ if (fdwroutine->IsForeignRelUpdatable != NULL &&
+ (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_DELETE)) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("foreign table \"%s\" does not allow deletes",
+ RelationGetRelationName(resultRel))));
+ break;
+ default:
+ elog(ERROR, "unrecognized CmdType: %d", (int) operation);
+ break;
+ }
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot change relation \"%s\"",
+ RelationGetRelationName(resultRel))));
+ break;
+ }
+}
+
+/*
+ * Check that a proposed rowmark target relation is a legal target
+ *
+ * In most cases parser and/or planner should have noticed this already, but
+ * they don't cover all cases.
+ */
+static void
+CheckValidRowMarkRel(Relation rel, RowMarkType markType)
+{
+ FdwRoutine *fdwroutine;
+
+ switch (rel->rd_rel->relkind)
+ {
+ case RELKIND_RELATION:
+ case RELKIND_PARTITIONED_TABLE:
+ /* OK */
+ break;
+ case RELKIND_SEQUENCE:
+ /* Must disallow this because we don't vacuum sequences */
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot lock rows in sequence \"%s\"",
+ RelationGetRelationName(rel))));
+ break;
+ case RELKIND_TOASTVALUE:
+ /* We could allow this, but there seems no good reason to */
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot lock rows in TOAST relation \"%s\"",
+ RelationGetRelationName(rel))));
+ break;
+ case RELKIND_VIEW:
+ /* Should not get here; planner should have expanded the view */
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot lock rows in view \"%s\"",
+ RelationGetRelationName(rel))));
+ break;
+ case RELKIND_MATVIEW:
+ /* Allow referencing a matview, but not actual locking clauses */
+ if (markType != ROW_MARK_REFERENCE)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot lock rows in materialized view \"%s\"",
+ RelationGetRelationName(rel))));
+ break;
+ case RELKIND_FOREIGN_TABLE:
+ /* Okay only if the FDW supports it */
+ fdwroutine = GetFdwRoutineForRelation(rel, false);
+ if (fdwroutine->RefetchForeignRow == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot lock rows in foreign table \"%s\"",
+ RelationGetRelationName(rel))));
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot lock rows in relation \"%s\"",
+ RelationGetRelationName(rel))));
+ break;
+ }
+}
+
+/*
+ * Initialize ResultRelInfo data for one result relation
+ *
+ * Caution: before Postgres 9.1, this function included the relkind checking
+ * that's now in CheckValidResultRel, and it also did ExecOpenIndices if
+ * appropriate. Be sure callers cover those needs.
+ */
+void
+InitResultRelInfo(ResultRelInfo *resultRelInfo,
+ Relation resultRelationDesc,
+ Index resultRelationIndex,
+ ResultRelInfo *partition_root_rri,
+ int instrument_options)
+{
+ MemSet(resultRelInfo, 0, sizeof(ResultRelInfo));
+ resultRelInfo->type = T_ResultRelInfo;
+ resultRelInfo->ri_RangeTableIndex = resultRelationIndex;
+ resultRelInfo->ri_RelationDesc = resultRelationDesc;
+ resultRelInfo->ri_NumIndices = 0;
+ resultRelInfo->ri_IndexRelationDescs = NULL;
+ resultRelInfo->ri_IndexRelationInfo = NULL;
+ /* make a copy so as not to depend on relcache info not changing... */
+ resultRelInfo->ri_TrigDesc = CopyTriggerDesc(resultRelationDesc->trigdesc);
+ if (resultRelInfo->ri_TrigDesc)
+ {
+ int n = resultRelInfo->ri_TrigDesc->numtriggers;
+
+ resultRelInfo->ri_TrigFunctions = (FmgrInfo *)
+ palloc0(n * sizeof(FmgrInfo));
+ resultRelInfo->ri_TrigWhenExprs = (ExprState **)
+ palloc0(n * sizeof(ExprState *));
+ if (instrument_options)
+ resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false);
+ }
+ else
+ {
+ resultRelInfo->ri_TrigFunctions = NULL;
+ resultRelInfo->ri_TrigWhenExprs = NULL;
+ resultRelInfo->ri_TrigInstrument = NULL;
+ }
+ if (resultRelationDesc->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+ resultRelInfo->ri_FdwRoutine = GetFdwRoutineForRelation(resultRelationDesc, true);
+ else
+ resultRelInfo->ri_FdwRoutine = NULL;
+
+ /* The following fields are set later if needed */
+ resultRelInfo->ri_RowIdAttNo = 0;
+ resultRelInfo->ri_projectNew = NULL;
+ resultRelInfo->ri_newTupleSlot = NULL;
+ resultRelInfo->ri_oldTupleSlot = NULL;
+ resultRelInfo->ri_projectNewInfoValid = false;
+ resultRelInfo->ri_FdwState = NULL;
+ resultRelInfo->ri_usesFdwDirectModify = false;
+ resultRelInfo->ri_ConstraintExprs = NULL;
+ resultRelInfo->ri_GeneratedExprs = NULL;
+ resultRelInfo->ri_projectReturning = NULL;
+ resultRelInfo->ri_onConflictArbiterIndexes = NIL;
+ resultRelInfo->ri_onConflict = NULL;
+ resultRelInfo->ri_ReturningSlot = NULL;
+ resultRelInfo->ri_TrigOldSlot = NULL;
+ resultRelInfo->ri_TrigNewSlot = NULL;
+
+ /*
+ * Only ExecInitPartitionInfo() and ExecInitPartitionDispatchInfo() pass
+ * non-NULL partition_root_rri. For child relations that are part of the
+ * initial query rather than being dynamically added by tuple routing,
+ * this field is filled in ExecInitModifyTable().
+ */
+ resultRelInfo->ri_RootResultRelInfo = partition_root_rri;
+ resultRelInfo->ri_RootToPartitionMap = NULL; /* set by
+ * ExecInitRoutingInfo */
+ resultRelInfo->ri_PartitionTupleSlot = NULL; /* ditto */
+ resultRelInfo->ri_ChildToRootMap = NULL;
+ resultRelInfo->ri_ChildToRootMapValid = false;
+ resultRelInfo->ri_CopyMultiInsertBuffer = NULL;
+}
+
+/*
+ * ExecGetTriggerResultRel
+ * Get a ResultRelInfo for a trigger target relation.
+ *
+ * Most of the time, triggers are fired on one of the result relations of the
+ * query, and so we can just return a member of the es_result_relations array,
+ * or the es_tuple_routing_result_relations list (if any). (Note: in self-join
+ * situations there might be multiple members with the same OID; if so it
+ * doesn't matter which one we pick.)
+ *
+ * However, it is sometimes necessary to fire triggers on other relations;
+ * this happens mainly when an RI update trigger queues additional triggers
+ * on other relations, which will be processed in the context of the outer
+ * query. For efficiency's sake, we want to have a ResultRelInfo for those
+ * triggers too; that can avoid repeated re-opening of the relation. (It
+ * also provides a way for EXPLAIN ANALYZE to report the runtimes of such
+ * triggers.) So we make additional ResultRelInfo's as needed, and save them
+ * in es_trig_target_relations.
+ */
+ResultRelInfo *
+ExecGetTriggerResultRel(EState *estate, Oid relid)
+{
+ ResultRelInfo *rInfo;
+ ListCell *l;
+ Relation rel;
+ MemoryContext oldcontext;
+
+ /* Search through the query result relations */
+ foreach(l, estate->es_opened_result_relations)
+ {
+ rInfo = lfirst(l);
+ if (RelationGetRelid(rInfo->ri_RelationDesc) == relid)
+ return rInfo;
+ }
+
+ /*
+ * Search through the result relations that were created during tuple
+ * routing, if any.
+ */
+ foreach(l, estate->es_tuple_routing_result_relations)
+ {
+ rInfo = (ResultRelInfo *) lfirst(l);
+ if (RelationGetRelid(rInfo->ri_RelationDesc) == relid)
+ return rInfo;
+ }
+
+ /* Nope, but maybe we already made an extra ResultRelInfo for it */
+ foreach(l, estate->es_trig_target_relations)
+ {
+ rInfo = (ResultRelInfo *) lfirst(l);
+ if (RelationGetRelid(rInfo->ri_RelationDesc) == relid)
+ return rInfo;
+ }
+ /* Nope, so we need a new one */
+
+ /*
+ * Open the target relation's relcache entry. We assume that an
+ * appropriate lock is still held by the backend from whenever the trigger
+ * event got queued, so we need take no new lock here. Also, we need not
+ * recheck the relkind, so no need for CheckValidResultRel.
+ */
+ rel = table_open(relid, NoLock);
+
+ /*
+ * Make the new entry in the right context.
+ */
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+ rInfo = makeNode(ResultRelInfo);
+ InitResultRelInfo(rInfo,
+ rel,
+ 0, /* dummy rangetable index */
+ NULL,
+ estate->es_instrument);
+ estate->es_trig_target_relations =
+ lappend(estate->es_trig_target_relations, rInfo);
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * Currently, we don't need any index information in ResultRelInfos used
+ * only for triggers, so no need to call ExecOpenIndices.
+ */
+
+ return rInfo;
+}
+
+/* ----------------------------------------------------------------
+ * ExecPostprocessPlan
+ *
+ * Give plan nodes a final chance to execute before shutdown
+ * ----------------------------------------------------------------
+ */
+static void
+ExecPostprocessPlan(EState *estate)
+{
+ ListCell *lc;
+
+ /*
+ * Make sure nodes run forward.
+ */
+ estate->es_direction = ForwardScanDirection;
+
+ /*
+ * Run any secondary ModifyTable nodes to completion, in case the main
+ * query did not fetch all rows from them. (We do this to ensure that
+ * such nodes have predictable results.)
+ */
+ foreach(lc, estate->es_auxmodifytables)
+ {
+ PlanState *ps = (PlanState *) lfirst(lc);
+
+ for (;;)
+ {
+ TupleTableSlot *slot;
+
+ /* Reset the per-output-tuple exprcontext each time */
+ ResetPerTupleExprContext(estate);
+
+ slot = ExecProcNode(ps);
+
+ if (TupIsNull(slot))
+ break;
+ }
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndPlan
+ *
+ * Cleans up the query plan -- closes files and frees up storage
+ *
+ * NOTE: we are no longer very worried about freeing storage per se
+ * in this code; FreeExecutorState should be guaranteed to release all
+ * memory that needs to be released. What we are worried about doing
+ * is closing relations and dropping buffer pins. Thus, for example,
+ * tuple tables must be cleared or dropped to ensure pins are released.
+ * ----------------------------------------------------------------
+ */
+static void
+ExecEndPlan(PlanState *planstate, EState *estate)
+{
+ ListCell *l;
+
+ /*
+ * shut down the node-type-specific query processing
+ */
+ ExecEndNode(planstate);
+
+ /*
+ * for subplans too
+ */
+ foreach(l, estate->es_subplanstates)
+ {
+ PlanState *subplanstate = (PlanState *) lfirst(l);
+
+ ExecEndNode(subplanstate);
+ }
+
+ /*
+ * destroy the executor's tuple table. Actually we only care about
+ * releasing buffer pins and tupdesc refcounts; there's no need to pfree
+ * the TupleTableSlots, since the containing memory context is about to go
+ * away anyway.
+ */
+ ExecResetTupleTable(estate->es_tupleTable, false);
+
+ /*
+ * Close any Relations that have been opened for range table entries or
+ * result relations.
+ */
+ ExecCloseResultRelations(estate);
+ ExecCloseRangeTableRelations(estate);
+}
+
+/*
+ * Close any relations that have been opened for ResultRelInfos.
+ */
+void
+ExecCloseResultRelations(EState *estate)
+{
+ ListCell *l;
+
+ /*
+ * close indexes of result relation(s) if any. (Rels themselves are
+ * closed in ExecCloseRangeTableRelations())
+ */
+ foreach(l, estate->es_opened_result_relations)
+ {
+ ResultRelInfo *resultRelInfo = lfirst(l);
+
+ ExecCloseIndices(resultRelInfo);
+ }
+
+ /* Close any relations that have been opened by ExecGetTriggerResultRel(). */
+ foreach(l, estate->es_trig_target_relations)
+ {
+ ResultRelInfo *resultRelInfo = (ResultRelInfo *) lfirst(l);
+
+ /*
+ * Assert this is a "dummy" ResultRelInfo, see above. Otherwise we
+ * might be issuing a duplicate close against a Relation opened by
+ * ExecGetRangeTableRelation.
+ */
+ Assert(resultRelInfo->ri_RangeTableIndex == 0);
+
+ /*
+ * Since ExecGetTriggerResultRel doesn't call ExecOpenIndices for
+ * these rels, we needn't call ExecCloseIndices either.
+ */
+ Assert(resultRelInfo->ri_NumIndices == 0);
+
+ table_close(resultRelInfo->ri_RelationDesc, NoLock);
+ }
+}
+
+/*
+ * Close all relations opened by ExecGetRangeTableRelation().
+ *
+ * We do not release any locks we might hold on those rels.
+ */
+void
+ExecCloseRangeTableRelations(EState *estate)
+{
+ int i;
+
+ for (i = 0; i < estate->es_range_table_size; i++)
+ {
+ if (estate->es_relations[i])
+ table_close(estate->es_relations[i], NoLock);
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecutePlan
+ *
+ * Processes the query plan until we have retrieved 'numberTuples' tuples,
+ * moving in the specified direction.
+ *
+ * Runs to completion if numberTuples is 0
+ *
+ * Note: the ctid attribute is a 'junk' attribute that is removed before the
+ * user can see it
+ * ----------------------------------------------------------------
+ */
+static void
+ExecutePlan(EState *estate,
+ PlanState *planstate,
+ bool use_parallel_mode,
+ CmdType operation,
+ bool sendTuples,
+ uint64 numberTuples,
+ ScanDirection direction,
+ DestReceiver *dest,
+ bool execute_once)
+{
+ TupleTableSlot *slot;
+ uint64 current_tuple_count;
+
+ /*
+ * initialize local variables
+ */
+ current_tuple_count = 0;
+
+ /*
+ * Set the direction.
+ */
+ estate->es_direction = direction;
+
+ /*
+ * If the plan might potentially be executed multiple times, we must force
+ * it to run without parallelism, because we might exit early.
+ */
+ if (!execute_once)
+ use_parallel_mode = false;
+
+ estate->es_use_parallel_mode = use_parallel_mode;
+ if (use_parallel_mode)
+ EnterParallelMode();
+
+ /*
+ * Loop until we've processed the proper number of tuples from the plan.
+ */
+ for (;;)
+ {
+ /* Reset the per-output-tuple exprcontext */
+ ResetPerTupleExprContext(estate);
+
+ /*
+ * Execute the plan and obtain a tuple
+ */
+ slot = ExecProcNode(planstate);
+
+ /*
+ * if the tuple is null, then we assume there is nothing more to
+ * process so we just end the loop...
+ */
+ if (TupIsNull(slot))
+ break;
+
+ /*
+ * If we have a junk filter, then project a new tuple with the junk
+ * removed.
+ *
+ * Store this new "clean" tuple in the junkfilter's resultSlot.
+ * (Formerly, we stored it back over the "dirty" tuple, which is WRONG
+ * because that tuple slot has the wrong descriptor.)
+ */
+ if (estate->es_junkFilter != NULL)
+ slot = ExecFilterJunk(estate->es_junkFilter, slot);
+
+ /*
+ * If we are supposed to send the tuple somewhere, do so. (In
+ * practice, this is probably always the case at this point.)
+ */
+ if (sendTuples)
+ {
+ /*
+ * If we are not able to send the tuple, we assume the destination
+ * has closed and no more tuples can be sent. If that's the case,
+ * end the loop.
+ */
+ if (!dest->receiveSlot(slot, dest))
+ break;
+ }
+
+ /*
+ * Count tuples processed, if this is a SELECT. (For other operation
+ * types, the ModifyTable plan node must count the appropriate
+ * events.)
+ */
+ if (operation == CMD_SELECT)
+ (estate->es_processed)++;
+
+ /*
+ * check our tuple count. If we've processed the proper number then
+ * quit, else loop again and process more tuples. Zero numberTuples
+ * means no limit.
+ */
+ current_tuple_count++;
+ if (numberTuples && numberTuples == current_tuple_count)
+ break;
+ }
+
+ /*
+ * If we know we won't need to back up, we can release resources at this
+ * point.
+ */
+ if (!(estate->es_top_eflags & EXEC_FLAG_BACKWARD))
+ (void) ExecShutdownNode(planstate);
+
+ if (use_parallel_mode)
+ ExitParallelMode();
+}
+
+
+/*
+ * ExecRelCheck --- check that tuple meets constraints for result relation
+ *
+ * Returns NULL if OK, else name of failed check constraint
+ */
+static const char *
+ExecRelCheck(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot, EState *estate)
+{
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ int ncheck = rel->rd_att->constr->num_check;
+ ConstrCheck *check = rel->rd_att->constr->check;
+ ExprContext *econtext;
+ MemoryContext oldContext;
+ int i;
+
+ /*
+ * CheckConstraintFetch let this pass with only a warning, but now we
+ * should fail rather than possibly failing to enforce an important
+ * constraint.
+ */
+ if (ncheck != rel->rd_rel->relchecks)
+ elog(ERROR, "%d pg_constraint record(s) missing for relation \"%s\"",
+ rel->rd_rel->relchecks - ncheck, RelationGetRelationName(rel));
+
+ /*
+ * If first time through for this result relation, build expression
+ * nodetrees for rel's constraint expressions. Keep them in the per-query
+ * memory context so they'll survive throughout the query.
+ */
+ if (resultRelInfo->ri_ConstraintExprs == NULL)
+ {
+ oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
+ resultRelInfo->ri_ConstraintExprs =
+ (ExprState **) palloc(ncheck * sizeof(ExprState *));
+ for (i = 0; i < ncheck; i++)
+ {
+ Expr *checkconstr;
+
+ checkconstr = stringToNode(check[i].ccbin);
+ resultRelInfo->ri_ConstraintExprs[i] =
+ ExecPrepareExpr(checkconstr, estate);
+ }
+ MemoryContextSwitchTo(oldContext);
+ }
+
+ /*
+ * We will use the EState's per-tuple context for evaluating constraint
+ * expressions (creating it if it's not already there).
+ */
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* And evaluate the constraints */
+ for (i = 0; i < ncheck; i++)
+ {
+ ExprState *checkconstr = resultRelInfo->ri_ConstraintExprs[i];
+
+ /*
+ * NOTE: SQL specifies that a NULL result from a constraint expression
+ * is not to be treated as a failure. Therefore, use ExecCheck not
+ * ExecQual.
+ */
+ if (!ExecCheck(checkconstr, econtext))
+ return check[i].ccname;
+ }
+
+ /* NULL result means no error */
+ return NULL;
+}
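+
+/*
+ * A minimal sketch of the ExecCheck-vs-ExecQual distinction relied on above,
+ * reduced to SQL's three-valued logic.  The helper names are hypothetical and
+ * the block is illustrative only, hence not compiled.
+ */
+#ifdef NOT_USED
+static bool
+qual_semantics(bool value, bool isnull)
+{
+	/* WHERE-clause style: an unknown (NULL) result does not pass */
+	return isnull ? false : value;
+}
+
+static bool
+check_semantics(bool value, bool isnull)
+{
+	/* CHECK-constraint style: an unknown (NULL) result is not a violation */
+	return isnull ? true : value;
+}
+#endif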
+
+/*
+ * ExecPartitionCheck --- check that tuple meets the partition constraint.
+ *
+ * Returns true if it meets the partition constraint. If the constraint
+ * fails and we're asked to emit an error, do so and don't return; otherwise
+ * return false.
+ */
+bool
+ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot,
+ EState *estate, bool emitError)
+{
+ ExprContext *econtext;
+ bool success;
+
+ /*
+ * If first time through, build expression state tree for the partition
+ * check expression. (In the corner case where the partition check
+ * expression is empty, ie there's a default partition and nothing else,
+ * we'll be fooled into executing this code each time through. But it's
+ * pretty darn cheap in that case, so we don't worry about it.)
+ */
+ if (resultRelInfo->ri_PartitionCheckExpr == NULL)
+ {
+ /*
+ * Ensure that the qual tree and prepared expression are in the
+ * query-lifespan context.
+ */
+ MemoryContext oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
+ List *qual = RelationGetPartitionQual(resultRelInfo->ri_RelationDesc);
+
+ resultRelInfo->ri_PartitionCheckExpr = ExecPrepareCheck(qual, estate);
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ /*
+ * We will use the EState's per-tuple context for evaluating constraint
+ * expressions (creating it if it's not already there).
+ */
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /*
+	 * As in the case of the catalogued constraints, we treat a NULL result as
+	 * success here, not a failure.
+ */
+ success = ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext);
+
+ /* if asked to emit error, don't actually return on failure */
+ if (!success && emitError)
+ ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
+
+ return success;
+}
+
+/*
+ * ExecPartitionCheckEmitError - Form and emit an error message after a failed
+ * partition constraint check.
+ */
+void
+ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot,
+ EState *estate)
+{
+ Oid root_relid;
+ TupleDesc tupdesc;
+ char *val_desc;
+ Bitmapset *modifiedCols;
+
+ /*
+ * If the tuple has been routed, it's been converted to the partition's
+ * rowtype, which might differ from the root table's. We must convert it
+ * back to the root table's rowtype so that val_desc in the error message
+ * matches the input tuple.
+ */
+ if (resultRelInfo->ri_RootResultRelInfo)
+ {
+ ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo;
+ TupleDesc old_tupdesc;
+ AttrMap *map;
+
+ root_relid = RelationGetRelid(rootrel->ri_RelationDesc);
+ tupdesc = RelationGetDescr(rootrel->ri_RelationDesc);
+
+ old_tupdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+ /* a reverse map */
+ map = build_attrmap_by_name_if_req(old_tupdesc, tupdesc);
+
+ /*
+ * Partition-specific slot's tupdesc can't be changed, so allocate a
+ * new one.
+ */
+ if (map != NULL)
+ slot = execute_attr_map_slot(map, slot,
+ MakeTupleTableSlot(tupdesc, &TTSOpsVirtual));
+ modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate),
+ ExecGetUpdatedCols(rootrel, estate));
+ }
+ else
+ {
+ root_relid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+ tupdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+ modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate),
+ ExecGetUpdatedCols(resultRelInfo, estate));
+ }
+
+ val_desc = ExecBuildSlotValueDescription(root_relid,
+ slot,
+ tupdesc,
+ modifiedCols,
+ 64);
+ ereport(ERROR,
+ (errcode(ERRCODE_CHECK_VIOLATION),
+ errmsg("new row for relation \"%s\" violates partition constraint",
+ RelationGetRelationName(resultRelInfo->ri_RelationDesc)),
+ val_desc ? errdetail("Failing row contains %s.", val_desc) : 0,
+ errtable(resultRelInfo->ri_RelationDesc)));
+}
+
+/*
+ * ExecConstraints - check constraints of the tuple in 'slot'
+ *
+ * This checks the traditional NOT NULL and check constraints.
+ *
+ * The partition constraint is *NOT* checked.
+ *
+ * Note: 'slot' contains the tuple to check the constraints of, which may
+ * have been converted from the original input tuple after tuple routing.
+ * 'resultRelInfo' is the final result relation, after tuple routing.
+ */
+void
+ExecConstraints(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot, EState *estate)
+{
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ TupleDesc tupdesc = RelationGetDescr(rel);
+ TupleConstr *constr = tupdesc->constr;
+ Bitmapset *modifiedCols;
+
+ Assert(constr); /* we should not be called otherwise */
+
+ if (constr->has_not_null)
+ {
+ int natts = tupdesc->natts;
+ int attrChk;
+
+ for (attrChk = 1; attrChk <= natts; attrChk++)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupdesc, attrChk - 1);
+
+ if (att->attnotnull && slot_attisnull(slot, attrChk))
+ {
+ char *val_desc;
+ Relation orig_rel = rel;
+ TupleDesc orig_tupdesc = RelationGetDescr(rel);
+
+ /*
+ * If the tuple has been routed, it's been converted to the
+ * partition's rowtype, which might differ from the root
+					 * table's.  We must convert it back to the root table's
+					 * rowtype so that the val_desc shown in the error message
+					 * matches the input tuple.
+ */
+ if (resultRelInfo->ri_RootResultRelInfo)
+ {
+ ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo;
+ AttrMap *map;
+
+ tupdesc = RelationGetDescr(rootrel->ri_RelationDesc);
+ /* a reverse map */
+ map = build_attrmap_by_name_if_req(orig_tupdesc,
+ tupdesc);
+
+ /*
+ * Partition-specific slot's tupdesc can't be changed, so
+ * allocate a new one.
+ */
+ if (map != NULL)
+ slot = execute_attr_map_slot(map, slot,
+ MakeTupleTableSlot(tupdesc, &TTSOpsVirtual));
+ modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate),
+ ExecGetUpdatedCols(rootrel, estate));
+ rel = rootrel->ri_RelationDesc;
+ }
+ else
+ modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate),
+ ExecGetUpdatedCols(resultRelInfo, estate));
+ val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel),
+ slot,
+ tupdesc,
+ modifiedCols,
+ 64);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_NOT_NULL_VIOLATION),
+ errmsg("null value in column \"%s\" of relation \"%s\" violates not-null constraint",
+ NameStr(att->attname),
+ RelationGetRelationName(orig_rel)),
+ val_desc ? errdetail("Failing row contains %s.", val_desc) : 0,
+ errtablecol(orig_rel, attrChk)));
+ }
+ }
+ }
+
+ if (rel->rd_rel->relchecks > 0)
+ {
+ const char *failed;
+
+ if ((failed = ExecRelCheck(resultRelInfo, slot, estate)) != NULL)
+ {
+ char *val_desc;
+ Relation orig_rel = rel;
+
+ /* See the comment above. */
+ if (resultRelInfo->ri_RootResultRelInfo)
+ {
+ ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo;
+ TupleDesc old_tupdesc = RelationGetDescr(rel);
+ AttrMap *map;
+
+ tupdesc = RelationGetDescr(rootrel->ri_RelationDesc);
+ /* a reverse map */
+ map = build_attrmap_by_name_if_req(old_tupdesc,
+ tupdesc);
+
+ /*
+ * Partition-specific slot's tupdesc can't be changed, so
+ * allocate a new one.
+ */
+ if (map != NULL)
+ slot = execute_attr_map_slot(map, slot,
+ MakeTupleTableSlot(tupdesc, &TTSOpsVirtual));
+ modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate),
+ ExecGetUpdatedCols(rootrel, estate));
+ rel = rootrel->ri_RelationDesc;
+ }
+ else
+ modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate),
+ ExecGetUpdatedCols(resultRelInfo, estate));
+ val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel),
+ slot,
+ tupdesc,
+ modifiedCols,
+ 64);
+ ereport(ERROR,
+ (errcode(ERRCODE_CHECK_VIOLATION),
+ errmsg("new row for relation \"%s\" violates check constraint \"%s\"",
+ RelationGetRelationName(orig_rel), failed),
+ val_desc ? errdetail("Failing row contains %s.", val_desc) : 0,
+ errtableconstraint(orig_rel, failed)));
+ }
+ }
+}
+
+/*
+ * ExecWithCheckOptions -- check that tuple satisfies any WITH CHECK OPTIONs
+ * of the specified kind.
+ *
+ * Note that this needs to be called multiple times to ensure that all kinds of
+ * WITH CHECK OPTIONs are handled (both those from views which have the WITH
+ * CHECK OPTION set and from row-level security policies). See ExecInsert()
+ * and ExecUpdate().
+ */
+void
+ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot, EState *estate)
+{
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ TupleDesc tupdesc = RelationGetDescr(rel);
+ ExprContext *econtext;
+ ListCell *l1,
+ *l2;
+
+ /*
+ * We will use the EState's per-tuple context for evaluating constraint
+ * expressions (creating it if it's not already there).
+ */
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* Check each of the constraints */
+ forboth(l1, resultRelInfo->ri_WithCheckOptions,
+ l2, resultRelInfo->ri_WithCheckOptionExprs)
+ {
+ WithCheckOption *wco = (WithCheckOption *) lfirst(l1);
+ ExprState *wcoExpr = (ExprState *) lfirst(l2);
+
+ /*
+ * Skip any WCOs which are not the kind we are looking for at this
+ * time.
+ */
+ if (wco->kind != kind)
+ continue;
+
+ /*
+ * WITH CHECK OPTION checks are intended to ensure that the new tuple
+ * is visible (in the case of a view) or that it passes the
+ * 'with-check' policy (in the case of row security). If the qual
+ * evaluates to NULL or FALSE, then the new tuple won't be included in
+ * the view or doesn't pass the 'with-check' policy for the table.
+ */
+ if (!ExecQual(wcoExpr, econtext))
+ {
+ char *val_desc;
+ Bitmapset *modifiedCols;
+
+ switch (wco->kind)
+ {
+ /*
+ * For WITH CHECK OPTIONs coming from views, we might be
+ * able to provide the details on the row, depending on
+ * the permissions on the relation (that is, if the user
+ * could view it directly anyway). For RLS violations, we
+ * don't include the data since we don't know if the user
+ * should be able to view the tuple as that depends on the
+ * USING policy.
+ */
+ case WCO_VIEW_CHECK:
+ /* See the comment in ExecConstraints(). */
+ if (resultRelInfo->ri_RootResultRelInfo)
+ {
+ ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo;
+ TupleDesc old_tupdesc = RelationGetDescr(rel);
+ AttrMap *map;
+
+ tupdesc = RelationGetDescr(rootrel->ri_RelationDesc);
+ /* a reverse map */
+ map = build_attrmap_by_name_if_req(old_tupdesc,
+ tupdesc);
+
+ /*
+ * Partition-specific slot's tupdesc can't be changed,
+ * so allocate a new one.
+ */
+ if (map != NULL)
+ slot = execute_attr_map_slot(map, slot,
+ MakeTupleTableSlot(tupdesc, &TTSOpsVirtual));
+
+ modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate),
+ ExecGetUpdatedCols(rootrel, estate));
+ rel = rootrel->ri_RelationDesc;
+ }
+ else
+ modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate),
+ ExecGetUpdatedCols(resultRelInfo, estate));
+ val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel),
+ slot,
+ tupdesc,
+ modifiedCols,
+ 64);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_WITH_CHECK_OPTION_VIOLATION),
+ errmsg("new row violates check option for view \"%s\"",
+ wco->relname),
+ val_desc ? errdetail("Failing row contains %s.",
+ val_desc) : 0));
+ break;
+ case WCO_RLS_INSERT_CHECK:
+ case WCO_RLS_UPDATE_CHECK:
+ if (wco->polname != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("new row violates row-level security policy \"%s\" for table \"%s\"",
+ wco->polname, wco->relname)));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("new row violates row-level security policy for table \"%s\"",
+ wco->relname)));
+ break;
+ case WCO_RLS_CONFLICT_CHECK:
+ if (wco->polname != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("new row violates row-level security policy \"%s\" (USING expression) for table \"%s\"",
+ wco->polname, wco->relname)));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("new row violates row-level security policy (USING expression) for table \"%s\"",
+ wco->relname)));
+ break;
+ default:
+ elog(ERROR, "unrecognized WCO kind: %u", wco->kind);
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * ExecBuildSlotValueDescription -- construct a string representing a tuple
+ *
+ * This is intentionally very similar to BuildIndexValueDescription, but
+ * unlike that function, we truncate long field values (to at most maxfieldlen
+ * bytes). That seems necessary here since heap field values could be very
+ * long, whereas index entries typically aren't so wide.
+ *
+ * Also, unlike the case with index entries, we need to be prepared to ignore
+ * dropped columns. We used to use the slot's tuple descriptor to decode the
+ * data, but the slot's descriptor doesn't identify dropped columns, so we
+ * now need to be passed the relation's descriptor.
+ *
+ * Note that, like BuildIndexValueDescription, if the user does not have
+ * permission to view any of the columns involved, a NULL is returned. Unlike
+ * BuildIndexValueDescription, if the user has access to view a subset of the
+ * columns involved, that subset will be returned with a key identifying which
+ * columns they are.
+ */
+static char *
+ExecBuildSlotValueDescription(Oid reloid,
+ TupleTableSlot *slot,
+ TupleDesc tupdesc,
+ Bitmapset *modifiedCols,
+ int maxfieldlen)
+{
+ StringInfoData buf;
+ StringInfoData collist;
+ bool write_comma = false;
+ bool write_comma_collist = false;
+ int i;
+ AclResult aclresult;
+ bool table_perm = false;
+ bool any_perm = false;
+
+ /*
+ * Check if RLS is enabled and should be active for the relation; if so,
+ * then don't return anything. Otherwise, go through normal permission
+ * checks.
+ */
+ if (check_enable_rls(reloid, InvalidOid, true) == RLS_ENABLED)
+ return NULL;
+
+ initStringInfo(&buf);
+
+ appendStringInfoChar(&buf, '(');
+
+ /*
+ * Check if the user has permissions to see the row. Table-level SELECT
+ * allows access to all columns. If the user does not have table-level
+ * SELECT then we check each column and include those the user has SELECT
+ * rights on. Additionally, we always include columns the user provided
+ * data for.
+ */
+ aclresult = pg_class_aclcheck(reloid, GetUserId(), ACL_SELECT);
+ if (aclresult != ACLCHECK_OK)
+ {
+ /* Set up the buffer for the column list */
+ initStringInfo(&collist);
+ appendStringInfoChar(&collist, '(');
+ }
+ else
+ table_perm = any_perm = true;
+
+ /* Make sure the tuple is fully deconstructed */
+ slot_getallattrs(slot);
+
+ for (i = 0; i < tupdesc->natts; i++)
+ {
+ bool column_perm = false;
+ char *val;
+ int vallen;
+ Form_pg_attribute att = TupleDescAttr(tupdesc, i);
+
+ /* ignore dropped columns */
+ if (att->attisdropped)
+ continue;
+
+ if (!table_perm)
+ {
+ /*
+ * No table-level SELECT, so need to make sure they either have
+ * SELECT rights on the column or that they have provided the data
+ * for the column. If not, omit this column from the error
+ * message.
+ */
+ aclresult = pg_attribute_aclcheck(reloid, att->attnum,
+ GetUserId(), ACL_SELECT);
+ if (bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber,
+ modifiedCols) || aclresult == ACLCHECK_OK)
+ {
+ column_perm = any_perm = true;
+
+ if (write_comma_collist)
+ appendStringInfoString(&collist, ", ");
+ else
+ write_comma_collist = true;
+
+ appendStringInfoString(&collist, NameStr(att->attname));
+ }
+ }
+
+ if (table_perm || column_perm)
+ {
+ if (slot->tts_isnull[i])
+ val = "null";
+ else
+ {
+ Oid foutoid;
+ bool typisvarlena;
+
+ getTypeOutputInfo(att->atttypid,
+ &foutoid, &typisvarlena);
+ val = OidOutputFunctionCall(foutoid, slot->tts_values[i]);
+ }
+
+ if (write_comma)
+ appendStringInfoString(&buf, ", ");
+ else
+ write_comma = true;
+
+ /* truncate if needed */
+ vallen = strlen(val);
+ if (vallen <= maxfieldlen)
+ appendBinaryStringInfo(&buf, val, vallen);
+ else
+ {
+ vallen = pg_mbcliplen(val, vallen, maxfieldlen);
+ appendBinaryStringInfo(&buf, val, vallen);
+ appendStringInfoString(&buf, "...");
+ }
+ }
+ }
+
+ /* If we end up with zero columns being returned, then return NULL. */
+ if (!any_perm)
+ return NULL;
+
+ appendStringInfoChar(&buf, ')');
+
+ if (!table_perm)
+ {
+ appendStringInfoString(&collist, ") = ");
+ appendBinaryStringInfo(&collist, buf.data, buf.len);
+
+ return collist.data;
+ }
+
+ return buf.data;
+}
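+
+/*
+ * For illustration only (values hypothetical): with table-level SELECT
+ * privilege the function above returns the whole row, e.g.
+ *
+ *		(1, Smith, null, 42000)
+ *
+ * Without it, only the columns the user may see (or supplied data for) are
+ * shown, prefixed by a key naming them, e.g.
+ *
+ *		(id, salary) = (1, 42000)
+ */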
+
+
+/*
+ * ExecUpdateLockMode -- find the appropriate UPDATE tuple lock mode for a
+ * given ResultRelInfo
+ */
+LockTupleMode
+ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo)
+{
+ Bitmapset *keyCols;
+ Bitmapset *updatedCols;
+
+ /*
+ * Compute lock mode to use. If columns that are part of the key have not
+ * been modified, then we can use a weaker lock, allowing for better
+ * concurrency.
+ */
+ updatedCols = ExecGetAllUpdatedCols(relinfo, estate);
+ keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc,
+ INDEX_ATTR_BITMAP_KEY);
+
+ if (bms_overlap(keyCols, updatedCols))
+ return LockTupleExclusive;
+
+ return LockTupleNoKeyExclusive;
+}
+
+/*
+ * ExecFindRowMark -- find the ExecRowMark struct for given rangetable index
+ *
+ * If no such struct, either return NULL or throw error depending on missing_ok
+ */
+ExecRowMark *
+ExecFindRowMark(EState *estate, Index rti, bool missing_ok)
+{
+ if (rti > 0 && rti <= estate->es_range_table_size &&
+ estate->es_rowmarks != NULL)
+ {
+ ExecRowMark *erm = estate->es_rowmarks[rti - 1];
+
+ if (erm)
+ return erm;
+ }
+ if (!missing_ok)
+ elog(ERROR, "failed to find ExecRowMark for rangetable index %u", rti);
+ return NULL;
+}
+
+/*
+ * ExecBuildAuxRowMark -- create an ExecAuxRowMark struct
+ *
+ * Inputs are the underlying ExecRowMark struct and the targetlist of the
+ * input plan node (not planstate node!). We need the latter to find out
+ * the column numbers of the resjunk columns.
+ */
+ExecAuxRowMark *
+ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist)
+{
+ ExecAuxRowMark *aerm = (ExecAuxRowMark *) palloc0(sizeof(ExecAuxRowMark));
+ char resname[32];
+
+ aerm->rowmark = erm;
+
+ /* Look up the resjunk columns associated with this rowmark */
+ if (erm->markType != ROW_MARK_COPY)
+ {
+ /* need ctid for all methods other than COPY */
+ snprintf(resname, sizeof(resname), "ctid%u", erm->rowmarkId);
+ aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist,
+ resname);
+ if (!AttributeNumberIsValid(aerm->ctidAttNo))
+ elog(ERROR, "could not find junk %s column", resname);
+ }
+ else
+ {
+ /* need wholerow if COPY */
+ snprintf(resname, sizeof(resname), "wholerow%u", erm->rowmarkId);
+ aerm->wholeAttNo = ExecFindJunkAttributeInTlist(targetlist,
+ resname);
+ if (!AttributeNumberIsValid(aerm->wholeAttNo))
+ elog(ERROR, "could not find junk %s column", resname);
+ }
+
+ /* if child rel, need tableoid */
+ if (erm->rti != erm->prti)
+ {
+ snprintf(resname, sizeof(resname), "tableoid%u", erm->rowmarkId);
+ aerm->toidAttNo = ExecFindJunkAttributeInTlist(targetlist,
+ resname);
+ if (!AttributeNumberIsValid(aerm->toidAttNo))
+ elog(ERROR, "could not find junk %s column", resname);
+ }
+
+ return aerm;
+}
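+
+/*
+ * Illustrative note (numbers hypothetical): for a rowmark with rowmarkId 3,
+ * the plan's targetlist carries a resjunk column named "ctid3" (or
+ * "wholerow3" for ROW_MARK_COPY), plus "tableoid3" when the rowmark belongs
+ * to a child relation, which is what the lookups above find.
+ */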
+
+
+/*
+ * EvalPlanQual logic --- recheck modified tuple(s) to see if we want to
+ * process the updated version under READ COMMITTED rules.
+ *
+ * See backend/executor/README for some info about how this works.
+ */
+
+
+/*
+ * Check the updated version of a tuple to see if we want to process it under
+ * READ COMMITTED rules.
+ *
+ * epqstate - state for EvalPlanQual rechecking
+ * relation - table containing tuple
+ * rti - rangetable index of table containing tuple
+ * inputslot - tuple for processing - this can be the slot from
+ *	EvalPlanQualSlot(), for increased efficiency.
+ *
+ * This tests whether the tuple in inputslot still matches the relevant
+ * quals.  For that result to be useful, typically the input tuple has to be
+ * the last row version (otherwise the result isn't particularly useful) and
+ * locked (otherwise the result might be out of date). That's typically
+ * achieved by using table_tuple_lock() with the
+ * TUPLE_LOCK_FLAG_FIND_LAST_VERSION flag.
+ *
+ * Returns a slot containing the new candidate update/delete tuple, or
+ * NULL if we determine we shouldn't process the row.
+ */
+TupleTableSlot *
+EvalPlanQual(EPQState *epqstate, Relation relation,
+ Index rti, TupleTableSlot *inputslot)
+{
+ TupleTableSlot *slot;
+ TupleTableSlot *testslot;
+
+ Assert(rti > 0);
+
+ /*
+ * Need to run a recheck subquery. Initialize or reinitialize EPQ state.
+ */
+ EvalPlanQualBegin(epqstate);
+
+ /*
+ * Callers will often use the EvalPlanQualSlot to store the tuple to avoid
+ * an unnecessary copy.
+ */
+ testslot = EvalPlanQualSlot(epqstate, relation, rti);
+ if (testslot != inputslot)
+ ExecCopySlot(testslot, inputslot);
+
+ /*
+ * Run the EPQ query. We assume it will return at most one tuple.
+ */
+ slot = EvalPlanQualNext(epqstate);
+
+ /*
+ * If we got a tuple, force the slot to materialize the tuple so that it
+ * is not dependent on any local state in the EPQ query (in particular,
+ * it's highly likely that the slot contains references to any pass-by-ref
+ * datums that may be present in copyTuple). As with the next step, this
+ * is to guard against early re-use of the EPQ query.
+ */
+ if (!TupIsNull(slot))
+ ExecMaterializeSlot(slot);
+
+ /*
+ * Clear out the test tuple. This is needed in case the EPQ query is
+ * re-used to test a tuple for a different relation. (Not clear that can
+ * really happen, but let's be safe.)
+ */
+ ExecClearTuple(testslot);
+
+ return slot;
+}
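+
+/*
+ * A sketch of the usual calling pattern, with hypothetical variable names and
+ * the surrounding locking logic elided: the caller obtains the EPQ slot, has
+ * the table AM place the locked, latest row version into it, and passes that
+ * same slot back in, so the ExecCopySlot() above is skipped.
+ *
+ *		TupleTableSlot *epqslot = EvalPlanQualSlot(epqstate, relation, rti);
+ *
+ *		... table_tuple_lock(..., epqslot, ...,
+ *							 TUPLE_LOCK_FLAG_FIND_LAST_VERSION, ...) ...
+ *
+ *		epqslot = EvalPlanQual(epqstate, relation, rti, epqslot);
+ *		if (TupIsNull(epqslot))
+ *			... the updated row no longer passes the quals; skip it ...
+ */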
+
+/*
+ * EvalPlanQualInit -- initialize during creation of a plan state node
+ * that might need to invoke EPQ processing.
+ *
+ * Note: subplan/auxrowmarks can be NULL/NIL if they will be set later
+ * with EvalPlanQualSetPlan.
+ */
+void
+EvalPlanQualInit(EPQState *epqstate, EState *parentestate,
+ Plan *subplan, List *auxrowmarks, int epqParam)
+{
+ Index rtsize = parentestate->es_range_table_size;
+
+ /* initialize data not changing over EPQState's lifetime */
+ epqstate->parentestate = parentestate;
+ epqstate->epqParam = epqParam;
+
+ /*
+ * Allocate space to reference a slot for each potential rti - do so now
+ * rather than in EvalPlanQualBegin(), as done for other dynamically
+ * allocated resources, so EvalPlanQualSlot() can be used to hold tuples
+ * that *may* need EPQ later, without forcing the overhead of
+ * EvalPlanQualBegin().
+ */
+ epqstate->tuple_table = NIL;
+ epqstate->relsubs_slot = (TupleTableSlot **)
+ palloc0(rtsize * sizeof(TupleTableSlot *));
+
+ /* ... and remember data that EvalPlanQualBegin will need */
+ epqstate->plan = subplan;
+ epqstate->arowMarks = auxrowmarks;
+
+ /* ... and mark the EPQ state inactive */
+ epqstate->origslot = NULL;
+ epqstate->recheckestate = NULL;
+ epqstate->recheckplanstate = NULL;
+ epqstate->relsubs_rowmark = NULL;
+ epqstate->relsubs_done = NULL;
+}
+
+/*
+ * EvalPlanQualSetPlan -- set or change subplan of an EPQState.
+ *
+ * We used to need this so that ModifyTable could deal with multiple subplans.
+ * It could now be refactored out of existence.
+ */
+void
+EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks)
+{
+ /* If we have a live EPQ query, shut it down */
+ EvalPlanQualEnd(epqstate);
+ /* And set/change the plan pointer */
+ epqstate->plan = subplan;
+ /* The rowmarks depend on the plan, too */
+ epqstate->arowMarks = auxrowmarks;
+}
+
+/*
+ * Return, and create if necessary, a slot for an EPQ test tuple.
+ *
+ * Note this only requires EvalPlanQualInit() to have been called,
+ * EvalPlanQualBegin() is not necessary.
+ */
+TupleTableSlot *
+EvalPlanQualSlot(EPQState *epqstate,
+ Relation relation, Index rti)
+{
+ TupleTableSlot **slot;
+
+ Assert(relation);
+ Assert(rti > 0 && rti <= epqstate->parentestate->es_range_table_size);
+ slot = &epqstate->relsubs_slot[rti - 1];
+
+ if (*slot == NULL)
+ {
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(epqstate->parentestate->es_query_cxt);
+ *slot = table_slot_create(relation, &epqstate->tuple_table);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ return *slot;
+}
+
+/*
+ * Fetch the current row value for a non-locked relation, identified by rti,
+ * that needs to be scanned by an EvalPlanQual operation. origslot must have
+ * been set to contain the current result row (top-level row) that we need to
+ * recheck. Returns true if a substitution tuple was found, false if not.
+ */
+bool
+EvalPlanQualFetchRowMark(EPQState *epqstate, Index rti, TupleTableSlot *slot)
+{
+ ExecAuxRowMark *earm = epqstate->relsubs_rowmark[rti - 1];
+ ExecRowMark *erm = earm->rowmark;
+ Datum datum;
+ bool isNull;
+
+ Assert(earm != NULL);
+ Assert(epqstate->origslot != NULL);
+
+ if (RowMarkRequiresRowShareLock(erm->markType))
+ elog(ERROR, "EvalPlanQual doesn't support locking rowmarks");
+
+ /* if child rel, must check whether it produced this row */
+ if (erm->rti != erm->prti)
+ {
+ Oid tableoid;
+
+ datum = ExecGetJunkAttribute(epqstate->origslot,
+ earm->toidAttNo,
+ &isNull);
+ /* non-locked rels could be on the inside of outer joins */
+ if (isNull)
+ return false;
+
+ tableoid = DatumGetObjectId(datum);
+
+ Assert(OidIsValid(erm->relid));
+ if (tableoid != erm->relid)
+ {
+ /* this child is inactive right now */
+ return false;
+ }
+ }
+
+ if (erm->markType == ROW_MARK_REFERENCE)
+ {
+ Assert(erm->relation != NULL);
+
+ /* fetch the tuple's ctid */
+ datum = ExecGetJunkAttribute(epqstate->origslot,
+ earm->ctidAttNo,
+ &isNull);
+ /* non-locked rels could be on the inside of outer joins */
+ if (isNull)
+ return false;
+
+ /* fetch requests on foreign tables must be passed to their FDW */
+ if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+ {
+ FdwRoutine *fdwroutine;
+ bool updated = false;
+
+ fdwroutine = GetFdwRoutineForRelation(erm->relation, false);
+ /* this should have been checked already, but let's be safe */
+ if (fdwroutine->RefetchForeignRow == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot lock rows in foreign table \"%s\"",
+ RelationGetRelationName(erm->relation))));
+
+ fdwroutine->RefetchForeignRow(epqstate->recheckestate,
+ erm,
+ datum,
+ slot,
+ &updated);
+ if (TupIsNull(slot))
+ elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck");
+
+ /*
+ * Ideally we'd insist on updated == false here, but that assumes
+ * that FDWs can track that exactly, which they might not be able
+ * to. So just ignore the flag.
+ */
+ return true;
+ }
+ else
+ {
+ /* ordinary table, fetch the tuple */
+ if (!table_tuple_fetch_row_version(erm->relation,
+ (ItemPointer) DatumGetPointer(datum),
+ SnapshotAny, slot))
+ elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck");
+ return true;
+ }
+ }
+ else
+ {
+ Assert(erm->markType == ROW_MARK_COPY);
+
+ /* fetch the whole-row Var for the relation */
+ datum = ExecGetJunkAttribute(epqstate->origslot,
+ earm->wholeAttNo,
+ &isNull);
+ /* non-locked rels could be on the inside of outer joins */
+ if (isNull)
+ return false;
+
+ ExecStoreHeapTupleDatum(datum, slot);
+ return true;
+ }
+}
+
+/*
+ * Fetch the next row (if any) from EvalPlanQual testing
+ *
+ * (In practice, there should never be more than one row...)
+ */
+TupleTableSlot *
+EvalPlanQualNext(EPQState *epqstate)
+{
+ MemoryContext oldcontext;
+ TupleTableSlot *slot;
+
+ oldcontext = MemoryContextSwitchTo(epqstate->recheckestate->es_query_cxt);
+ slot = ExecProcNode(epqstate->recheckplanstate);
+ MemoryContextSwitchTo(oldcontext);
+
+ return slot;
+}
+
+/*
+ * Initialize or reset an EvalPlanQual state tree
+ */
+void
+EvalPlanQualBegin(EPQState *epqstate)
+{
+ EState *parentestate = epqstate->parentestate;
+ EState *recheckestate = epqstate->recheckestate;
+
+ if (recheckestate == NULL)
+ {
+ /* First time through, so create a child EState */
+ EvalPlanQualStart(epqstate, epqstate->plan);
+ }
+ else
+ {
+ /*
+ * We already have a suitable child EPQ tree, so just reset it.
+ */
+ Index rtsize = parentestate->es_range_table_size;
+ PlanState *rcplanstate = epqstate->recheckplanstate;
+
+ MemSet(epqstate->relsubs_done, 0, rtsize * sizeof(bool));
+
+ /* Recopy current values of parent parameters */
+ if (parentestate->es_plannedstmt->paramExecTypes != NIL)
+ {
+ int i;
+
+ /*
+ * Force evaluation of any InitPlan outputs that could be needed
+ * by the subplan, just in case they got reset since
+ * EvalPlanQualStart (see comments therein).
+ */
+ ExecSetParamPlanMulti(rcplanstate->plan->extParam,
+ GetPerTupleExprContext(parentestate));
+
+ i = list_length(parentestate->es_plannedstmt->paramExecTypes);
+
+ while (--i >= 0)
+ {
+ /* copy value if any, but not execPlan link */
+ recheckestate->es_param_exec_vals[i].value =
+ parentestate->es_param_exec_vals[i].value;
+ recheckestate->es_param_exec_vals[i].isnull =
+ parentestate->es_param_exec_vals[i].isnull;
+ }
+ }
+
+ /*
+ * Mark child plan tree as needing rescan at all scan nodes. The
+ * first ExecProcNode will take care of actually doing the rescan.
+ */
+ rcplanstate->chgParam = bms_add_member(rcplanstate->chgParam,
+ epqstate->epqParam);
+ }
+}
+
+/*
+ * Start execution of an EvalPlanQual plan tree.
+ *
+ * This is a cut-down version of ExecutorStart(): we copy some state from
+ * the top-level estate rather than initializing it fresh.
+ */
+static void
+EvalPlanQualStart(EPQState *epqstate, Plan *planTree)
+{
+ EState *parentestate = epqstate->parentestate;
+ Index rtsize = parentestate->es_range_table_size;
+ EState *rcestate;
+ MemoryContext oldcontext;
+ ListCell *l;
+
+ epqstate->recheckestate = rcestate = CreateExecutorState();
+
+ oldcontext = MemoryContextSwitchTo(rcestate->es_query_cxt);
+
+ /* signal that this is an EState for executing EPQ */
+ rcestate->es_epq_active = epqstate;
+
+ /*
+ * Child EPQ EStates share the parent's copy of unchanging state such as
+ * the snapshot, rangetable, and external Param info. They need their own
+ * copies of local state, including a tuple table, es_param_exec_vals,
+ * result-rel info, etc.
+ */
+ rcestate->es_direction = ForwardScanDirection;
+ rcestate->es_snapshot = parentestate->es_snapshot;
+ rcestate->es_crosscheck_snapshot = parentestate->es_crosscheck_snapshot;
+ rcestate->es_range_table = parentestate->es_range_table;
+ rcestate->es_range_table_size = parentestate->es_range_table_size;
+ rcestate->es_relations = parentestate->es_relations;
+ rcestate->es_queryEnv = parentestate->es_queryEnv;
+ rcestate->es_rowmarks = parentestate->es_rowmarks;
+ rcestate->es_plannedstmt = parentestate->es_plannedstmt;
+ rcestate->es_junkFilter = parentestate->es_junkFilter;
+ rcestate->es_output_cid = parentestate->es_output_cid;
+
+ /*
+ * ResultRelInfos needed by subplans are initialized from scratch when the
+ * subplans themselves are initialized.
+ */
+ rcestate->es_result_relations = NULL;
+ /* es_trig_target_relations must NOT be copied */
+ rcestate->es_top_eflags = parentestate->es_top_eflags;
+ rcestate->es_instrument = parentestate->es_instrument;
+ /* es_auxmodifytables must NOT be copied */
+
+ /*
+ * The external param list is simply shared from parent. The internal
+ * param workspace has to be local state, but we copy the initial values
+ * from the parent, so as to have access to any param values that were
+ * already set from other parts of the parent's plan tree.
+ */
+ rcestate->es_param_list_info = parentestate->es_param_list_info;
+ if (parentestate->es_plannedstmt->paramExecTypes != NIL)
+ {
+ int i;
+
+ /*
+ * Force evaluation of any InitPlan outputs that could be needed by
+ * the subplan. (With more complexity, maybe we could postpone this
+ * till the subplan actually demands them, but it doesn't seem worth
+ * the trouble; this is a corner case already, since usually the
+ * InitPlans would have been evaluated before reaching EvalPlanQual.)
+ *
+ * This will not touch output params of InitPlans that occur somewhere
+ * within the subplan tree, only those that are attached to the
+ * ModifyTable node or above it and are referenced within the subplan.
+ * That's OK though, because the planner would only attach such
+ * InitPlans to a lower-level SubqueryScan node, and EPQ execution
+ * will not descend into a SubqueryScan.
+ *
+ * The EState's per-output-tuple econtext is sufficiently short-lived
+ * for this, since it should get reset before there is any chance of
+ * doing EvalPlanQual again.
+ */
+ ExecSetParamPlanMulti(planTree->extParam,
+ GetPerTupleExprContext(parentestate));
+
+ /* now make the internal param workspace ... */
+ i = list_length(parentestate->es_plannedstmt->paramExecTypes);
+ rcestate->es_param_exec_vals = (ParamExecData *)
+ palloc0(i * sizeof(ParamExecData));
+ /* ... and copy down all values, whether really needed or not */
+ while (--i >= 0)
+ {
+ /* copy value if any, but not execPlan link */
+ rcestate->es_param_exec_vals[i].value =
+ parentestate->es_param_exec_vals[i].value;
+ rcestate->es_param_exec_vals[i].isnull =
+ parentestate->es_param_exec_vals[i].isnull;
+ }
+ }
+
+ /*
+ * Initialize private state information for each SubPlan. We must do this
+ * before running ExecInitNode on the main query tree, since
+ * ExecInitSubPlan expects to be able to find these entries. Some of the
+ * SubPlans might not be used in the part of the plan tree we intend to
+ * run, but since it's not easy to tell which, we just initialize them
+ * all.
+ */
+ Assert(rcestate->es_subplanstates == NIL);
+ foreach(l, parentestate->es_plannedstmt->subplans)
+ {
+ Plan *subplan = (Plan *) lfirst(l);
+ PlanState *subplanstate;
+
+ subplanstate = ExecInitNode(subplan, rcestate, 0);
+ rcestate->es_subplanstates = lappend(rcestate->es_subplanstates,
+ subplanstate);
+ }
+
+ /*
+	 * Build an RTI-indexed array of rowmarks, so that
+	 * EvalPlanQualFetchRowMark() can efficiently access the rowmark to be
+	 * fetched.
+ */
+ epqstate->relsubs_rowmark = (ExecAuxRowMark **)
+ palloc0(rtsize * sizeof(ExecAuxRowMark *));
+ foreach(l, epqstate->arowMarks)
+ {
+ ExecAuxRowMark *earm = (ExecAuxRowMark *) lfirst(l);
+
+ epqstate->relsubs_rowmark[earm->rowmark->rti - 1] = earm;
+ }
+
+ /*
+ * Initialize per-relation EPQ tuple states to not-fetched.
+ */
+ epqstate->relsubs_done = (bool *)
+ palloc0(rtsize * sizeof(bool));
+
+ /*
+ * Initialize the private state information for all the nodes in the part
+ * of the plan tree we need to run. This opens files, allocates storage
+ * and leaves us ready to start processing tuples.
+ */
+ epqstate->recheckplanstate = ExecInitNode(planTree, rcestate, 0);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * EvalPlanQualEnd -- shut down at termination of parent plan state node,
+ * or if we are done with the current EPQ child.
+ *
+ * This is a cut-down version of ExecutorEnd(); basically we want to do most
+ * of the normal cleanup, but *not* close result relations (which we are
+ * just sharing from the outer query). We do, however, have to close any
+ * result and trigger target relations that got opened, since those are not
+ * shared. (There probably shouldn't be any of the latter, but just in
+ * case...)
+ */
+void
+EvalPlanQualEnd(EPQState *epqstate)
+{
+ EState *estate = epqstate->recheckestate;
+ Index rtsize;
+ MemoryContext oldcontext;
+ ListCell *l;
+
+ rtsize = epqstate->parentestate->es_range_table_size;
+
+ /*
+ * We may have a tuple table, even if EPQ wasn't started, because we allow
+ * use of EvalPlanQualSlot() without calling EvalPlanQualBegin().
+ */
+ if (epqstate->tuple_table != NIL)
+ {
+ memset(epqstate->relsubs_slot, 0,
+ rtsize * sizeof(TupleTableSlot *));
+ ExecResetTupleTable(epqstate->tuple_table, true);
+ epqstate->tuple_table = NIL;
+ }
+
+ /* EPQ wasn't started, nothing further to do */
+ if (estate == NULL)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ ExecEndNode(epqstate->recheckplanstate);
+
+ foreach(l, estate->es_subplanstates)
+ {
+ PlanState *subplanstate = (PlanState *) lfirst(l);
+
+ ExecEndNode(subplanstate);
+ }
+
+ /* throw away the per-estate tuple table, some node may have used it */
+ ExecResetTupleTable(estate->es_tupleTable, false);
+
+ /* Close any result and trigger target relations attached to this EState */
+ ExecCloseResultRelations(estate);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ FreeExecutorState(estate);
+
+ /* Mark EPQState idle */
+ epqstate->origslot = NULL;
+ epqstate->recheckestate = NULL;
+ epqstate->recheckplanstate = NULL;
+ epqstate->relsubs_rowmark = NULL;
+ epqstate->relsubs_done = NULL;
+}
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
new file mode 100644
index 0000000..f8a4a40
--- /dev/null
+++ b/src/backend/executor/execParallel.c
@@ -0,0 +1,1498 @@
+/*-------------------------------------------------------------------------
+ *
+ * execParallel.c
+ * Support routines for parallel execution.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * This file contains routines that are intended to support setting up,
+ * using, and tearing down a ParallelContext from within the PostgreSQL
+ * executor. The ParallelContext machinery will handle starting the
+ * workers and ensuring that their state generally matches that of the
+ * leader; see src/backend/access/transam/README.parallel for details.
+ * However, we must save and restore relevant executor state, such as
+ * any ParamListInfo associated with the query, buffer/WAL usage info, and
+ * the actual plan to be passed down to the worker.
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execParallel.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/execParallel.h"
+#include "executor/executor.h"
+#include "executor/nodeAgg.h"
+#include "executor/nodeAppend.h"
+#include "executor/nodeBitmapHeapscan.h"
+#include "executor/nodeCustom.h"
+#include "executor/nodeForeignscan.h"
+#include "executor/nodeHash.h"
+#include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
+#include "executor/nodeIndexonlyscan.h"
+#include "executor/nodeIndexscan.h"
+#include "executor/nodeMemoize.h"
+#include "executor/nodeSeqscan.h"
+#include "executor/nodeSort.h"
+#include "executor/nodeSubplan.h"
+#include "executor/tqueue.h"
+#include "jit/jit.h"
+#include "nodes/nodeFuncs.h"
+#include "pgstat.h"
+#include "storage/spin.h"
+#include "tcop/tcopprot.h"
+#include "utils/datum.h"
+#include "utils/dsa.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+
+/*
+ * Magic numbers for parallel executor communication. We use constants
+ * greater than any 32-bit integer here so that values < 2^32 can be used
+ * by individual parallel nodes to store their own state.
+ */
+#define PARALLEL_KEY_EXECUTOR_FIXED UINT64CONST(0xE000000000000001)
+#define PARALLEL_KEY_PLANNEDSTMT UINT64CONST(0xE000000000000002)
+#define PARALLEL_KEY_PARAMLISTINFO UINT64CONST(0xE000000000000003)
+#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xE000000000000004)
+#define PARALLEL_KEY_TUPLE_QUEUE UINT64CONST(0xE000000000000005)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000006)
+#define PARALLEL_KEY_DSA UINT64CONST(0xE000000000000007)
+#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xE000000000000008)
+#define PARALLEL_KEY_JIT_INSTRUMENTATION UINT64CONST(0xE000000000000009)
+#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xE00000000000000A)
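+
+/*
+ * Illustrative note: because every key above is larger than any 32-bit value,
+ * a parallel-aware plan node can publish its own shared state under a small
+ * key of its choosing (in practice its plan_node_id) without colliding with
+ * these executor-level entries.
+ */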
+
+#define PARALLEL_TUPLE_QUEUE_SIZE 65536
+
+/*
+ * Fixed-size random stuff that we need to pass to parallel workers.
+ */
+typedef struct FixedParallelExecutorState
+{
+ int64 tuples_needed; /* tuple bound, see ExecSetTupleBound */
+ dsa_pointer param_exec;
+ int eflags;
+ int jit_flags;
+} FixedParallelExecutorState;
+
+/*
+ * DSM structure for accumulating per-PlanState instrumentation.
+ *
+ * instrument_options: Same meaning here as in instrument.c.
+ *
+ * instrument_offset: Offset, relative to the start of this structure,
+ * of the first Instrumentation object. This will depend on the length of
+ * the plan_node_id array.
+ *
+ * num_workers: Number of workers.
+ *
+ * num_plan_nodes: Number of plan nodes.
+ *
+ * plan_node_id: Array of plan node IDs for which we are gathering instrumentation
+ * from parallel workers. The length of this array is given by num_plan_nodes.
+ */
+struct SharedExecutorInstrumentation
+{
+ int instrument_options;
+ int instrument_offset;
+ int num_workers;
+ int num_plan_nodes;
+ int plan_node_id[FLEXIBLE_ARRAY_MEMBER];
+ /* array of num_plan_nodes * num_workers Instrumentation objects follows */
+};
+#define GetInstrumentationArray(sei) \
+ (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
+ (Instrumentation *) (((char *) sei) + sei->instrument_offset))
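+
+/*
+ * A sketch of how one worker's Instrumentation for one plan node would be
+ * located, assuming the entries for a given plan node are stored
+ * consecutively for all of its workers; variable names are illustrative only:
+ *
+ *		Instrumentation *array = GetInstrumentationArray(sei);
+ *		Instrumentation *instr = &array[node_index * sei->num_workers + w];
+ *
+ * where node_index is the position of the node's plan_node_id within
+ * sei->plan_node_id[] and w is the worker number.
+ */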
+
+/* Context object for ExecParallelEstimate. */
+typedef struct ExecParallelEstimateContext
+{
+ ParallelContext *pcxt;
+ int nnodes;
+} ExecParallelEstimateContext;
+
+/* Context object for ExecParallelInitializeDSM. */
+typedef struct ExecParallelInitializeDSMContext
+{
+ ParallelContext *pcxt;
+ SharedExecutorInstrumentation *instrumentation;
+ int nnodes;
+} ExecParallelInitializeDSMContext;
+
+/* Helper functions that run in the parallel leader. */
+static char *ExecSerializePlan(Plan *plan, EState *estate);
+static bool ExecParallelEstimate(PlanState *node,
+ ExecParallelEstimateContext *e);
+static bool ExecParallelInitializeDSM(PlanState *node,
+ ExecParallelInitializeDSMContext *d);
+static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt,
+ bool reinitialize);
+static bool ExecParallelReInitializeDSM(PlanState *planstate,
+ ParallelContext *pcxt);
+static bool ExecParallelRetrieveInstrumentation(PlanState *planstate,
+ SharedExecutorInstrumentation *instrumentation);
+
+/* Helper function that runs in the parallel worker. */
+static DestReceiver *ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc);
+
+/*
+ * Create a serialized representation of the plan to be sent to each worker.
+ */
+static char *
+ExecSerializePlan(Plan *plan, EState *estate)
+{
+ PlannedStmt *pstmt;
+ ListCell *lc;
+
+ /* We can't scribble on the original plan, so make a copy. */
+ plan = copyObject(plan);
+
+ /*
+ * The worker will start its own copy of the executor, and that copy will
+ * insert a junk filter if the toplevel node has any resjunk entries. We
+ * don't want that to happen, because while resjunk columns shouldn't be
+ * sent back to the user, here the tuples are coming back to another
+ * backend which may very well need them. So mutate the target list
+ * accordingly. This is sort of a hack; there might be better ways to do
+ * this...
+ */
+ foreach(lc, plan->targetlist)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc);
+
+ tle->resjunk = false;
+ }
+
+ /*
+ * Create a dummy PlannedStmt. Most of the fields don't need to be valid
+ * for our purposes, but the worker will need at least a minimal
+ * PlannedStmt to start the executor.
+ */
+ pstmt = makeNode(PlannedStmt);
+ pstmt->commandType = CMD_SELECT;
+ pstmt->queryId = pgstat_get_my_query_id();
+ pstmt->hasReturning = false;
+ pstmt->hasModifyingCTE = false;
+ pstmt->canSetTag = true;
+ pstmt->transientPlan = false;
+ pstmt->dependsOnRole = false;
+ pstmt->parallelModeNeeded = false;
+ pstmt->planTree = plan;
+ pstmt->rtable = estate->es_range_table;
+ pstmt->resultRelations = NIL;
+ pstmt->appendRelations = NIL;
+
+ /*
+ * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list
+ * for unsafe ones (so that the list indexes of the safe ones are
+ * preserved). This positively ensures that the worker won't try to run,
+ * or even do ExecInitNode on, an unsafe subplan. That's important to
+ * protect, eg, non-parallel-aware FDWs from getting into trouble.
+ */
+ pstmt->subplans = NIL;
+ foreach(lc, estate->es_plannedstmt->subplans)
+ {
+ Plan *subplan = (Plan *) lfirst(lc);
+
+ if (subplan && !subplan->parallel_safe)
+ subplan = NULL;
+ pstmt->subplans = lappend(pstmt->subplans, subplan);
+ }
+
+ pstmt->rewindPlanIDs = NULL;
+ pstmt->rowMarks = NIL;
+ pstmt->relationOids = NIL;
+ pstmt->invalItems = NIL; /* workers can't replan anyway... */
+ pstmt->paramExecTypes = estate->es_plannedstmt->paramExecTypes;
+ pstmt->utilityStmt = NULL;
+ pstmt->stmt_location = -1;
+ pstmt->stmt_len = -1;
+
+ /* Return serialized copy of our dummy PlannedStmt. */
+ return nodeToString(pstmt);
+}
+
+/*
+ * Parallel-aware plan nodes (and occasionally others) may need some state
+ * which is shared across all parallel workers. Before we size the DSM, give
+ * them a chance to call shm_toc_estimate_chunk or shm_toc_estimate_keys on
+ * &pcxt->estimator.
+ *
+ * While we're at it, count the number of PlanState nodes in the tree, so
+ * we know how many Instrumentation structures we need.
+ */
+static bool
+ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
+{
+ if (planstate == NULL)
+ return false;
+
+ /* Count this node. */
+ e->nnodes++;
+
+ switch (nodeTag(planstate))
+ {
+ case T_SeqScanState:
+ if (planstate->plan->parallel_aware)
+ ExecSeqScanEstimate((SeqScanState *) planstate,
+ e->pcxt);
+ break;
+ case T_IndexScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexScanEstimate((IndexScanState *) planstate,
+ e->pcxt);
+ break;
+ case T_IndexOnlyScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexOnlyScanEstimate((IndexOnlyScanState *) planstate,
+ e->pcxt);
+ break;
+ case T_ForeignScanState:
+ if (planstate->plan->parallel_aware)
+ ExecForeignScanEstimate((ForeignScanState *) planstate,
+ e->pcxt);
+ break;
+ case T_AppendState:
+ if (planstate->plan->parallel_aware)
+ ExecAppendEstimate((AppendState *) planstate,
+ e->pcxt);
+ break;
+ case T_CustomScanState:
+ if (planstate->plan->parallel_aware)
+ ExecCustomScanEstimate((CustomScanState *) planstate,
+ e->pcxt);
+ break;
+ case T_BitmapHeapScanState:
+ if (planstate->plan->parallel_aware)
+ ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate,
+ e->pcxt);
+ break;
+ case T_HashJoinState:
+ if (planstate->plan->parallel_aware)
+ ExecHashJoinEstimate((HashJoinState *) planstate,
+ e->pcxt);
+ break;
+ case T_HashState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecHashEstimate((HashState *) planstate, e->pcxt);
+ break;
+ case T_SortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecSortEstimate((SortState *) planstate, e->pcxt);
+ break;
+ case T_IncrementalSortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt);
+ break;
+ case T_AggState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecAggEstimate((AggState *) planstate, e->pcxt);
+ break;
+ case T_MemoizeState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecMemoizeEstimate((MemoizeState *) planstate, e->pcxt);
+ break;
+ default:
+ break;
+ }
+
+ return planstate_tree_walker(planstate, ExecParallelEstimate, e);
+}
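+
+/*
+ * A minimal sketch of what a per-node Estimate callback dispatched above
+ * typically does: reserve one chunk and one toc key for the node's shared
+ * state.  The function and its size argument are hypothetical, so the block
+ * is not compiled.
+ */
+#ifdef NOT_USED
+static void
+ExecExampleScanEstimate(Size shared_state_size, ParallelContext *pcxt)
+{
+	shm_toc_estimate_chunk(&pcxt->estimator, shared_state_size);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+#endif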
+
+/*
+ * Estimate the amount of space required to serialize the indicated parameters.
+ */
+static Size
+EstimateParamExecSpace(EState *estate, Bitmapset *params)
+{
+ int paramid;
+ Size sz = sizeof(int);
+
+ paramid = -1;
+ while ((paramid = bms_next_member(params, paramid)) >= 0)
+ {
+ Oid typeOid;
+ int16 typLen;
+ bool typByVal;
+ ParamExecData *prm;
+
+ prm = &(estate->es_param_exec_vals[paramid]);
+ typeOid = list_nth_oid(estate->es_plannedstmt->paramExecTypes,
+ paramid);
+
+ sz = add_size(sz, sizeof(int)); /* space for paramid */
+
+ /* space for datum/isnull */
+ if (OidIsValid(typeOid))
+ get_typlenbyval(typeOid, &typLen, &typByVal);
+ else
+ {
+ /* If no type OID, assume by-value, like copyParamList does. */
+ typLen = sizeof(Datum);
+ typByVal = true;
+ }
+ sz = add_size(sz,
+ datumEstimateSpace(prm->value, prm->isnull,
+ typByVal, typLen));
+ }
+ return sz;
+}
+
+/*
+ * Serialize specified PARAM_EXEC parameters.
+ *
+ * We write the number of parameters first, as a 4-byte integer, and then
+ * write details for each parameter in turn. The details for each parameter
+ * consist of a 4-byte paramid (location of param in execution time internal
+ * parameter array) and then the datum as serialized by datumSerialize().
+ */
+static dsa_pointer
+SerializeParamExecParams(EState *estate, Bitmapset *params, dsa_area *area)
+{
+ Size size;
+ int nparams;
+ int paramid;
+ ParamExecData *prm;
+ dsa_pointer handle;
+ char *start_address;
+
+ /* Allocate enough space for the current parameter values. */
+ size = EstimateParamExecSpace(estate, params);
+ handle = dsa_allocate(area, size);
+ start_address = dsa_get_address(area, handle);
+
+ /* First write the number of parameters as a 4-byte integer. */
+ nparams = bms_num_members(params);
+ memcpy(start_address, &nparams, sizeof(int));
+ start_address += sizeof(int);
+
+ /* Write details for each parameter in turn. */
+ paramid = -1;
+ while ((paramid = bms_next_member(params, paramid)) >= 0)
+ {
+ Oid typeOid;
+ int16 typLen;
+ bool typByVal;
+
+ prm = &(estate->es_param_exec_vals[paramid]);
+ typeOid = list_nth_oid(estate->es_plannedstmt->paramExecTypes,
+ paramid);
+
+ /* Write paramid. */
+ memcpy(start_address, &paramid, sizeof(int));
+ start_address += sizeof(int);
+
+ /* Write datum/isnull */
+ if (OidIsValid(typeOid))
+ get_typlenbyval(typeOid, &typLen, &typByVal);
+ else
+ {
+ /* If no type OID, assume by-value, like copyParamList does. */
+ typLen = sizeof(Datum);
+ typByVal = true;
+ }
+ datumSerialize(prm->value, prm->isnull, typByVal, typLen,
+ &start_address);
+ }
+
+ return handle;
+}
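+
+/*
+ * Illustration of the resulting layout in the DSA chunk, for two parameters
+ * (paramids and datum contents are hypothetical):
+ *
+ *		int nparams = 2
+ *		int paramid = 0, then datum 0 as written by datumSerialize()
+ *		int paramid = 3, then datum 3 as written by datumSerialize()
+ *
+ * RestoreParamExecParams() below simply walks this layout back.
+ */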
+
+/*
+ * Restore specified PARAM_EXEC parameters.
+ */
+static void
+RestoreParamExecParams(char *start_address, EState *estate)
+{
+ int nparams;
+ int i;
+ int paramid;
+
+ memcpy(&nparams, start_address, sizeof(int));
+ start_address += sizeof(int);
+
+ for (i = 0; i < nparams; i++)
+ {
+ ParamExecData *prm;
+
+ /* Read paramid */
+ memcpy(&paramid, start_address, sizeof(int));
+ start_address += sizeof(int);
+ prm = &(estate->es_param_exec_vals[paramid]);
+
+ /* Read datum/isnull. */
+ prm->value = datumRestore(&start_address, &prm->isnull);
+ prm->execPlan = NULL;
+ }
+}
+
+/*
+ * Initialize the dynamic shared memory segment that will be used to control
+ * parallel execution.
+ */
+static bool
+ExecParallelInitializeDSM(PlanState *planstate,
+ ExecParallelInitializeDSMContext *d)
+{
+ if (planstate == NULL)
+ return false;
+
+ /* If instrumentation is enabled, initialize slot for this node. */
+ if (d->instrumentation != NULL)
+ d->instrumentation->plan_node_id[d->nnodes] =
+ planstate->plan->plan_node_id;
+
+ /* Count this node. */
+ d->nnodes++;
+
+ /*
+ * Call initializers for DSM-using plan nodes.
+ *
+ * Most plan nodes won't do anything here, but plan nodes that allocated
+ * DSM may need to initialize shared state in the DSM before parallel
+ * workers are launched. They can allocate the space they previously
+ * estimated using shm_toc_allocate, and add the keys they previously
+ * estimated using shm_toc_insert, in each case targeting pcxt->toc.
+ */
+ switch (nodeTag(planstate))
+ {
+ case T_SeqScanState:
+ if (planstate->plan->parallel_aware)
+ ExecSeqScanInitializeDSM((SeqScanState *) planstate,
+ d->pcxt);
+ break;
+ case T_IndexScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexScanInitializeDSM((IndexScanState *) planstate,
+ d->pcxt);
+ break;
+ case T_IndexOnlyScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexOnlyScanInitializeDSM((IndexOnlyScanState *) planstate,
+ d->pcxt);
+ break;
+ case T_ForeignScanState:
+ if (planstate->plan->parallel_aware)
+ ExecForeignScanInitializeDSM((ForeignScanState *) planstate,
+ d->pcxt);
+ break;
+ case T_AppendState:
+ if (planstate->plan->parallel_aware)
+ ExecAppendInitializeDSM((AppendState *) planstate,
+ d->pcxt);
+ break;
+ case T_CustomScanState:
+ if (planstate->plan->parallel_aware)
+ ExecCustomScanInitializeDSM((CustomScanState *) planstate,
+ d->pcxt);
+ break;
+ case T_BitmapHeapScanState:
+ if (planstate->plan->parallel_aware)
+ ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate,
+ d->pcxt);
+ break;
+ case T_HashJoinState:
+ if (planstate->plan->parallel_aware)
+ ExecHashJoinInitializeDSM((HashJoinState *) planstate,
+ d->pcxt);
+ break;
+ case T_HashState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecHashInitializeDSM((HashState *) planstate, d->pcxt);
+ break;
+ case T_SortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecSortInitializeDSM((SortState *) planstate, d->pcxt);
+ break;
+ case T_IncrementalSortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt);
+ break;
+ case T_AggState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecAggInitializeDSM((AggState *) planstate, d->pcxt);
+ break;
+ case T_MemoizeState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecMemoizeInitializeDSM((MemoizeState *) planstate, d->pcxt);
+ break;
+ default:
+ break;
+ }
+
+ return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d);
+}
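+
+/*
+ * A minimal sketch of what a per-node InitializeDSM callback dispatched above
+ * typically does: allocate the chunk it estimated earlier, initialize it, and
+ * publish it in the toc under the node's key.  The function, its arguments,
+ * and the zeroing stand-in for real initialization are hypothetical, so the
+ * block is not compiled.
+ */
+#ifdef NOT_USED
+static void
+ExecExampleScanInitializeDSM(PlanState *planstate, Size shared_state_size,
+							 ParallelContext *pcxt)
+{
+	void	   *shared_state;
+
+	shared_state = shm_toc_allocate(pcxt->toc, shared_state_size);
+	memset(shared_state, 0, shared_state_size);
+	shm_toc_insert(pcxt->toc, planstate->plan->plan_node_id, shared_state);
+}
+#endif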
+
+/*
+ * Set up the response queues for backend workers to return tuples
+ * to the main backend, and start the workers.
+ */
+static shm_mq_handle **
+ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize)
+{
+ shm_mq_handle **responseq;
+ char *tqueuespace;
+ int i;
+
+ /* Skip this if no workers. */
+ if (pcxt->nworkers == 0)
+ return NULL;
+
+ /* Allocate memory for shared memory queue handles. */
+ responseq = (shm_mq_handle **)
+ palloc(pcxt->nworkers * sizeof(shm_mq_handle *));
+
+ /*
+ * If not reinitializing, allocate space from the DSM for the queues;
+ * otherwise, find the already allocated space.
+ */
+ if (!reinitialize)
+ tqueuespace =
+ shm_toc_allocate(pcxt->toc,
+ mul_size(PARALLEL_TUPLE_QUEUE_SIZE,
+ pcxt->nworkers));
+ else
+ tqueuespace = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_TUPLE_QUEUE, false);
+
+ /* Create the queues, and become the receiver for each. */
+ for (i = 0; i < pcxt->nworkers; ++i)
+ {
+ shm_mq *mq;
+
+ mq = shm_mq_create(tqueuespace +
+ ((Size) i) * PARALLEL_TUPLE_QUEUE_SIZE,
+ (Size) PARALLEL_TUPLE_QUEUE_SIZE);
+
+ shm_mq_set_receiver(mq, MyProc);
+ responseq[i] = shm_mq_attach(mq, pcxt->seg, NULL);
+ }
+
+ /* Add array of queues to shm_toc, so others can find it. */
+ if (!reinitialize)
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLE_QUEUE, tqueuespace);
+
+ /* Return array of handles. */
+ return responseq;
+}
+
+/*
+ * Sets up the required infrastructure for backend workers to perform
+ * execution and return results to the main backend.
+ */
+ParallelExecutorInfo *
+ExecInitParallelPlan(PlanState *planstate, EState *estate,
+ Bitmapset *sendParams, int nworkers,
+ int64 tuples_needed)
+{
+ ParallelExecutorInfo *pei;
+ ParallelContext *pcxt;
+ ExecParallelEstimateContext e;
+ ExecParallelInitializeDSMContext d;
+ FixedParallelExecutorState *fpes;
+ char *pstmt_data;
+ char *pstmt_space;
+ char *paramlistinfo_space;
+ BufferUsage *bufusage_space;
+ WalUsage *walusage_space;
+ SharedExecutorInstrumentation *instrumentation = NULL;
+ SharedJitInstrumentation *jit_instrumentation = NULL;
+ int pstmt_len;
+ int paramlistinfo_len;
+ int instrumentation_len = 0;
+ int jit_instrumentation_len = 0;
+ int instrument_offset = 0;
+ Size dsa_minsize = dsa_minimum_size();
+ char *query_string;
+ int query_len;
+
+ /*
+ * Force any initplan outputs that we're going to pass to workers to be
+ * evaluated, if they weren't already.
+ *
+ * For simplicity, we use the EState's per-output-tuple ExprContext here.
+ * That risks intra-query memory leakage, since we might pass through here
+ * many times before that ExprContext gets reset; but ExecSetParamPlan
+ * doesn't normally leak any memory in the context (see its comments), so
+ * it doesn't seem worth complicating this function's API to pass it a
+ * shorter-lived ExprContext. This might need to change someday.
+ */
+ ExecSetParamPlanMulti(sendParams, GetPerTupleExprContext(estate));
+
+ /* Allocate object for return value. */
+ pei = palloc0(sizeof(ParallelExecutorInfo));
+ pei->finished = false;
+ pei->planstate = planstate;
+
+ /* Fix up and serialize plan to be sent to workers. */
+ pstmt_data = ExecSerializePlan(planstate->plan, estate);
+
+ /* Create a parallel context. */
+ pcxt = CreateParallelContext("postgres", "ParallelQueryMain", nworkers);
+ pei->pcxt = pcxt;
+
+ /*
+ * Before telling the parallel context to create a dynamic shared memory
+ * segment, we need to figure out how big it should be. Estimate space
+ * for the various things we need to store.
+ */
+
+ /* Estimate space for fixed-size state. */
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ sizeof(FixedParallelExecutorState));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Estimate space for query text. */
+ query_len = strlen(estate->es_sourceText);
+ shm_toc_estimate_chunk(&pcxt->estimator, query_len + 1);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Estimate space for serialized PlannedStmt. */
+ pstmt_len = strlen(pstmt_data) + 1;
+ shm_toc_estimate_chunk(&pcxt->estimator, pstmt_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Estimate space for serialized ParamListInfo. */
+ paramlistinfo_len = EstimateParamListSpace(estate->es_param_list_info);
+ shm_toc_estimate_chunk(&pcxt->estimator, paramlistinfo_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /*
+ * Estimate space for BufferUsage.
+ *
+ * If EXPLAIN is not in use and there are no extensions loaded that care,
+ * we could skip this. But we have no way of knowing whether anyone's
+ * looking at pgBufferUsage, so do it unconditionally.
+ */
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /*
+ * Same thing for WalUsage.
+ */
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(sizeof(WalUsage), pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Estimate space for tuple queues. */
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(PARALLEL_TUPLE_QUEUE_SIZE, pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /*
+ * Give parallel-aware nodes a chance to add to the estimates, and get a
+ * count of how many PlanState nodes there are.
+ */
+ e.pcxt = pcxt;
+ e.nnodes = 0;
+ ExecParallelEstimate(planstate, &e);
+
+ /* Estimate space for instrumentation, if required. */
+ if (estate->es_instrument)
+ {
+ instrumentation_len =
+ offsetof(SharedExecutorInstrumentation, plan_node_id) +
+ sizeof(int) * e.nnodes;
+ instrumentation_len = MAXALIGN(instrumentation_len);
+ instrument_offset = instrumentation_len;
+ instrumentation_len +=
+ mul_size(sizeof(Instrumentation),
+ mul_size(e.nnodes, nworkers));
+ shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Estimate space for JIT instrumentation, if required. */
+ if (estate->es_jit_flags != PGJIT_NONE)
+ {
+ jit_instrumentation_len =
+ offsetof(SharedJitInstrumentation, jit_instr) +
+ sizeof(JitInstrumentation) * nworkers;
+ shm_toc_estimate_chunk(&pcxt->estimator, jit_instrumentation_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ }
+ }
+
+ /* Estimate space for DSA area. */
+ shm_toc_estimate_chunk(&pcxt->estimator, dsa_minsize);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Everyone's had a chance to ask for space, so now create the DSM. */
+ InitializeParallelDSM(pcxt);
+
+ /*
+ * OK, now we have a dynamic shared memory segment, and it should be big
+ * enough to store all of the data we estimated we would want to put into
+ * it, plus whatever general stuff (not specifically executor-related) the
+ * ParallelContext itself needs to store there. None of the space we
+ * asked for has been allocated or initialized yet, though, so do that.
+ */
+
+ /* Store fixed-size state. */
+ fpes = shm_toc_allocate(pcxt->toc, sizeof(FixedParallelExecutorState));
+ fpes->tuples_needed = tuples_needed;
+ fpes->param_exec = InvalidDsaPointer;
+ fpes->eflags = estate->es_top_eflags;
+ fpes->jit_flags = estate->es_jit_flags;
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, fpes);
+
+ /* Store query string */
+ query_string = shm_toc_allocate(pcxt->toc, query_len + 1);
+ memcpy(query_string, estate->es_sourceText, query_len + 1);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, query_string);
+
+ /* Store serialized PlannedStmt. */
+ pstmt_space = shm_toc_allocate(pcxt->toc, pstmt_len);
+ memcpy(pstmt_space, pstmt_data, pstmt_len);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_PLANNEDSTMT, pstmt_space);
+
+ /* Store serialized ParamListInfo. */
+ paramlistinfo_space = shm_toc_allocate(pcxt->toc, paramlistinfo_len);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_PARAMLISTINFO, paramlistinfo_space);
+ SerializeParamList(estate->es_param_list_info, &paramlistinfo_space);
+
+ /* Allocate space for each worker's BufferUsage; no need to initialize. */
+ bufusage_space = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufusage_space);
+ pei->buffer_usage = bufusage_space;
+
+ /* Same for WalUsage. */
+ walusage_space = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(WalUsage), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage_space);
+ pei->wal_usage = walusage_space;
+
+ /* Set up the tuple queues that the workers will write into. */
+ pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false);
+
+ /* We don't need the TupleQueueReaders yet, though. */
+ pei->reader = NULL;
+
+ /*
+ * If instrumentation options were supplied, allocate space for the data.
+ * It only gets partially initialized here; the rest happens during
+ * ExecParallelInitializeDSM.
+ */
+ if (estate->es_instrument)
+ {
+ Instrumentation *instrument;
+ int i;
+
+ instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
+ instrumentation->instrument_options = estate->es_instrument;
+ instrumentation->instrument_offset = instrument_offset;
+ instrumentation->num_workers = nworkers;
+ instrumentation->num_plan_nodes = e.nnodes;
+ instrument = GetInstrumentationArray(instrumentation);
+ for (i = 0; i < nworkers * e.nnodes; ++i)
+ InstrInit(&instrument[i], estate->es_instrument);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
+ instrumentation);
+ pei->instrumentation = instrumentation;
+
+ if (estate->es_jit_flags != PGJIT_NONE)
+ {
+ jit_instrumentation = shm_toc_allocate(pcxt->toc,
+ jit_instrumentation_len);
+ jit_instrumentation->num_workers = nworkers;
+ memset(jit_instrumentation->jit_instr, 0,
+ sizeof(JitInstrumentation) * nworkers);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_JIT_INSTRUMENTATION,
+ jit_instrumentation);
+ pei->jit_instrumentation = jit_instrumentation;
+ }
+ }
+
+ /*
+ * Create a DSA area that can be used by the leader and all workers.
+ * (However, if we failed to create a DSM and are using private memory
+ * instead, then skip this.)
+ */
+ if (pcxt->seg != NULL)
+ {
+ char *area_space;
+
+ area_space = shm_toc_allocate(pcxt->toc, dsa_minsize);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_DSA, area_space);
+ pei->area = dsa_create_in_place(area_space, dsa_minsize,
+ LWTRANCHE_PARALLEL_QUERY_DSA,
+ pcxt->seg);
+
+ /*
+ * Serialize parameters, if any, using DSA storage. We don't dare use
+ * the main parallel query DSM for this because we might relaunch
+ * workers after the values have changed (and thus the amount of
+ * storage required has changed).
+ */
+ if (!bms_is_empty(sendParams))
+ {
+ pei->param_exec = SerializeParamExecParams(estate, sendParams,
+ pei->area);
+ fpes->param_exec = pei->param_exec;
+ }
+ }
+
+ /*
+ * Give parallel-aware nodes a chance to initialize their shared data.
+ * This also initializes the elements of instrumentation->ps_instrument,
+ * if it exists.
+ */
+ d.pcxt = pcxt;
+ d.instrumentation = instrumentation;
+ d.nnodes = 0;
+
+ /* Install our DSA area while initializing the plan. */
+ estate->es_query_dsa = pei->area;
+ ExecParallelInitializeDSM(planstate, &d);
+ estate->es_query_dsa = NULL;
+
+ /*
+ * Make sure that the world hasn't shifted under our feet. This could
+ * probably just be an Assert(), but let's be conservative for now.
+ */
+ if (e.nnodes != d.nnodes)
+ elog(ERROR, "inconsistent count of PlanState nodes");
+
+ /* OK, we're ready to rock and roll. */
+ return pei;
+}
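+
+/*
+ * Editorial sketch (not part of the upstream source): roughly how a leader
+ * node such as Gather drives this API, simplified from nodeGather.c.  Error
+ * handling and the tuple-reading loop are omitted.
+ *
+ *     pei = ExecInitParallelPlan(outerPlanState(node), estate,
+ *                                gather->initParam, nworkers, -1);
+ *     LaunchParallelWorkers(pei->pcxt);
+ *     ExecParallelCreateReaders(pei);
+ *     ... read tuples from pei->reader[] until exhausted ...
+ *     ExecParallelFinish(pei);    -- wait for workers, accumulate usage
+ *     ExecParallelCleanup(pei);   -- gather instrumentation, release DSM
+ */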
+
+/*
+ * Set up tuple queue readers to read the results of a parallel subplan.
+ *
+ * This is separate from ExecInitParallelPlan() because we can launch the
+ * worker processes and let them start doing something before we do this.
+ */
+void
+ExecParallelCreateReaders(ParallelExecutorInfo *pei)
+{
+ int nworkers = pei->pcxt->nworkers_launched;
+ int i;
+
+ Assert(pei->reader == NULL);
+
+ if (nworkers > 0)
+ {
+ pei->reader = (TupleQueueReader **)
+ palloc(nworkers * sizeof(TupleQueueReader *));
+
+ for (i = 0; i < nworkers; i++)
+ {
+ shm_mq_set_handle(pei->tqueue[i],
+ pei->pcxt->worker[i].bgwhandle);
+ pei->reader[i] = CreateTupleQueueReader(pei->tqueue[i]);
+ }
+ }
+}
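+
+/*
+ * Editorial sketch (not part of the upstream source): a minimal example of
+ * how the leader consumes one of the readers created above.  The real logic,
+ * including waiting and removal of finished readers, is in gather_readnext()
+ * in nodeGather.c.
+ *
+ *     bool         done;
+ *     MinimalTuple tup;
+ *
+ *     tup = TupleQueueReaderNext(pei->reader[i], true, &done);
+ *     if (done)
+ *         ... worker i has detached; stop polling its queue ...
+ *     else if (tup != NULL)
+ *         ... store tup into a slot and return it ...
+ */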
+
+/*
+ * Re-initialize the parallel executor shared memory state before launching
+ * a fresh batch of workers.
+ */
+void
+ExecParallelReinitialize(PlanState *planstate,
+ ParallelExecutorInfo *pei,
+ Bitmapset *sendParams)
+{
+ EState *estate = planstate->state;
+ FixedParallelExecutorState *fpes;
+
+ /* Old workers must already be shut down */
+ Assert(pei->finished);
+
+ /*
+ * Force any initplan outputs that we're going to pass to workers to be
+ * evaluated, if they weren't already (see comments in
+ * ExecInitParallelPlan).
+ */
+ ExecSetParamPlanMulti(sendParams, GetPerTupleExprContext(estate));
+
+ ReinitializeParallelDSM(pei->pcxt);
+ pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true);
+ pei->reader = NULL;
+ pei->finished = false;
+
+ fpes = shm_toc_lookup(pei->pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, false);
+
+ /* Free any serialized parameters from the last round. */
+ if (DsaPointerIsValid(fpes->param_exec))
+ {
+ dsa_free(pei->area, fpes->param_exec);
+ fpes->param_exec = InvalidDsaPointer;
+ }
+
+ /* Serialize current parameter values if required. */
+ if (!bms_is_empty(sendParams))
+ {
+ pei->param_exec = SerializeParamExecParams(estate, sendParams,
+ pei->area);
+ fpes->param_exec = pei->param_exec;
+ }
+
+ /* Traverse plan tree and let each child node reset associated state. */
+ estate->es_query_dsa = pei->area;
+ ExecParallelReInitializeDSM(planstate, pei->pcxt);
+ estate->es_query_dsa = NULL;
+}
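+
+/*
+ * Editorial sketch (not part of the upstream source): on a rescan, the
+ * existing ParallelContext is reused rather than rebuilt, roughly as in
+ * ExecReScanGather():
+ *
+ *     if (node->pei != NULL)
+ *         ExecParallelReinitialize(outerPlanState(node), node->pei,
+ *                                  gather->initParam);
+ */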
+
+/*
+ * Traverse plan tree to reinitialize per-node dynamic shared memory state
+ */
+static bool
+ExecParallelReInitializeDSM(PlanState *planstate,
+ ParallelContext *pcxt)
+{
+ if (planstate == NULL)
+ return false;
+
+ /*
+ * Call reinitializers for DSM-using plan nodes.
+ */
+ switch (nodeTag(planstate))
+ {
+ case T_SeqScanState:
+ if (planstate->plan->parallel_aware)
+ ExecSeqScanReInitializeDSM((SeqScanState *) planstate,
+ pcxt);
+ break;
+ case T_IndexScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexScanReInitializeDSM((IndexScanState *) planstate,
+ pcxt);
+ break;
+ case T_IndexOnlyScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexOnlyScanReInitializeDSM((IndexOnlyScanState *) planstate,
+ pcxt);
+ break;
+ case T_ForeignScanState:
+ if (planstate->plan->parallel_aware)
+ ExecForeignScanReInitializeDSM((ForeignScanState *) planstate,
+ pcxt);
+ break;
+ case T_AppendState:
+ if (planstate->plan->parallel_aware)
+ ExecAppendReInitializeDSM((AppendState *) planstate, pcxt);
+ break;
+ case T_CustomScanState:
+ if (planstate->plan->parallel_aware)
+ ExecCustomScanReInitializeDSM((CustomScanState *) planstate,
+ pcxt);
+ break;
+ case T_BitmapHeapScanState:
+ if (planstate->plan->parallel_aware)
+ ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate,
+ pcxt);
+ break;
+ case T_HashJoinState:
+ if (planstate->plan->parallel_aware)
+ ExecHashJoinReInitializeDSM((HashJoinState *) planstate,
+ pcxt);
+ break;
+ case T_HashState:
+ case T_SortState:
+ case T_IncrementalSortState:
+ case T_MemoizeState:
+ /* these nodes have DSM state, but no reinitialization is required */
+ break;
+ default:
+ break;
+ }
+
+ return planstate_tree_walker(planstate, ExecParallelReInitializeDSM, pcxt);
+}
+
+/*
+ * Copy instrumentation information about this node and its descendants from
+ * dynamic shared memory.
+ */
+static bool
+ExecParallelRetrieveInstrumentation(PlanState *planstate,
+ SharedExecutorInstrumentation *instrumentation)
+{
+ Instrumentation *instrument;
+ int i;
+ int n;
+ int ibytes;
+ int plan_node_id = planstate->plan->plan_node_id;
+ MemoryContext oldcontext;
+
+ /* Find the instrumentation for this node. */
+ for (i = 0; i < instrumentation->num_plan_nodes; ++i)
+ if (instrumentation->plan_node_id[i] == plan_node_id)
+ break;
+ if (i >= instrumentation->num_plan_nodes)
+ elog(ERROR, "plan node %d not found", plan_node_id);
+
+ /* Accumulate the statistics from all workers. */
+ instrument = GetInstrumentationArray(instrumentation);
+ instrument += i * instrumentation->num_workers;
+ for (n = 0; n < instrumentation->num_workers; ++n)
+ InstrAggNode(planstate->instrument, &instrument[n]);
+
+ /*
+ * Also store the per-worker detail.
+ *
+ * Worker instrumentation should be allocated in the same context as the
+ * regular instrumentation information, which is the per-query context.
+ * Switch into per-query memory context.
+ */
+ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt);
+ ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation));
+ planstate->worker_instrument =
+ palloc(ibytes + offsetof(WorkerInstrumentation, instrument));
+ MemoryContextSwitchTo(oldcontext);
+
+ planstate->worker_instrument->num_workers = instrumentation->num_workers;
+ memcpy(&planstate->worker_instrument->instrument, instrument, ibytes);
+
+ /* Perform any node-type-specific work that needs to be done. */
+ switch (nodeTag(planstate))
+ {
+ case T_SortState:
+ ExecSortRetrieveInstrumentation((SortState *) planstate);
+ break;
+ case T_IncrementalSortState:
+ ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate);
+ break;
+ case T_HashState:
+ ExecHashRetrieveInstrumentation((HashState *) planstate);
+ break;
+ case T_AggState:
+ ExecAggRetrieveInstrumentation((AggState *) planstate);
+ break;
+ case T_MemoizeState:
+ ExecMemoizeRetrieveInstrumentation((MemoizeState *) planstate);
+ break;
+ default:
+ break;
+ }
+
+ return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation,
+ instrumentation);
+}
+
+/*
+ * Add up the workers' JIT instrumentation from dynamic shared memory.
+ */
+static void
+ExecParallelRetrieveJitInstrumentation(PlanState *planstate,
+ SharedJitInstrumentation *shared_jit)
+{
+ JitInstrumentation *combined;
+ int ibytes;
+ int n;
+
+ /*
+ * Accumulate worker JIT instrumentation into the combined JIT
+ * instrumentation, allocating it if required.
+ */
+ if (!planstate->state->es_jit_worker_instr)
+ planstate->state->es_jit_worker_instr =
+ MemoryContextAllocZero(planstate->state->es_query_cxt, sizeof(JitInstrumentation));
+ combined = planstate->state->es_jit_worker_instr;
+
+ /* Accumulate all the workers' instrumentations. */
+ for (n = 0; n < shared_jit->num_workers; ++n)
+ InstrJitAgg(combined, &shared_jit->jit_instr[n]);
+
+ /*
+ * Store the per-worker detail.
+ *
+ * Similar to ExecParallelRetrieveInstrumentation(), allocate the
+ * instrumentation in per-query context.
+ */
+ ibytes = offsetof(SharedJitInstrumentation, jit_instr)
+ + mul_size(shared_jit->num_workers, sizeof(JitInstrumentation));
+ planstate->worker_jit_instrument =
+ MemoryContextAlloc(planstate->state->es_query_cxt, ibytes);
+
+ memcpy(planstate->worker_jit_instrument, shared_jit, ibytes);
+}
+
+/*
+ * Finish parallel execution. We wait for parallel workers to finish, and
+ * accumulate their buffer/WAL usage.
+ */
+void
+ExecParallelFinish(ParallelExecutorInfo *pei)
+{
+ int nworkers = pei->pcxt->nworkers_launched;
+ int i;
+
+ /* Make this be a no-op if called twice in a row. */
+ if (pei->finished)
+ return;
+
+ /*
+ * Detach from tuple queues ASAP, so that any still-active workers will
+ * notice that no further results are wanted.
+ */
+ if (pei->tqueue != NULL)
+ {
+ for (i = 0; i < nworkers; i++)
+ shm_mq_detach(pei->tqueue[i]);
+ pfree(pei->tqueue);
+ pei->tqueue = NULL;
+ }
+
+ /*
+ * While we're waiting for the workers to finish, let's get rid of the
+ * tuple queue readers. (Any other local cleanup could be done here too.)
+ */
+ if (pei->reader != NULL)
+ {
+ for (i = 0; i < nworkers; i++)
+ DestroyTupleQueueReader(pei->reader[i]);
+ pfree(pei->reader);
+ pei->reader = NULL;
+ }
+
+ /* Now wait for the workers to finish. */
+ WaitForParallelWorkersToFinish(pei->pcxt);
+
+ /*
+ * Next, accumulate buffer/WAL usage. (This must wait for the workers to
+ * finish, or we might get incomplete data.)
+ */
+ for (i = 0; i < nworkers; i++)
+ InstrAccumParallelQuery(&pei->buffer_usage[i], &pei->wal_usage[i]);
+
+ pei->finished = true;
+}
+
+/*
+ * Accumulate instrumentation, and then clean up whatever ParallelExecutorInfo
+ * resources still exist after ExecParallelFinish. We separate these
+ * routines because someone might want to examine the contents of the DSM
+ * after ExecParallelFinish and before calling this routine.
+ */
+void
+ExecParallelCleanup(ParallelExecutorInfo *pei)
+{
+ /* Accumulate instrumentation, if any. */
+ if (pei->instrumentation)
+ ExecParallelRetrieveInstrumentation(pei->planstate,
+ pei->instrumentation);
+
+ /* Accumulate JIT instrumentation, if any. */
+ if (pei->jit_instrumentation)
+ ExecParallelRetrieveJitInstrumentation(pei->planstate,
+ pei->jit_instrumentation);
+
+ /* Free any serialized parameters. */
+ if (DsaPointerIsValid(pei->param_exec))
+ {
+ dsa_free(pei->area, pei->param_exec);
+ pei->param_exec = InvalidDsaPointer;
+ }
+ if (pei->area != NULL)
+ {
+ dsa_detach(pei->area);
+ pei->area = NULL;
+ }
+ if (pei->pcxt != NULL)
+ {
+ DestroyParallelContext(pei->pcxt);
+ pei->pcxt = NULL;
+ }
+ pfree(pei);
+}
+
+/*
+ * Create a DestReceiver to write tuples we produce to the shm_mq designated
+ * for that purpose.
+ */
+static DestReceiver *
+ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc)
+{
+ char *mqspace;
+ shm_mq *mq;
+
+ mqspace = shm_toc_lookup(toc, PARALLEL_KEY_TUPLE_QUEUE, false);
+ mqspace += ParallelWorkerNumber * PARALLEL_TUPLE_QUEUE_SIZE;
+ mq = (shm_mq *) mqspace;
+ shm_mq_set_sender(mq, MyProc);
+ return CreateTupleQueueDestReceiver(shm_mq_attach(mq, seg, NULL));
+}
+
+/*
+ * Create a QueryDesc for the PlannedStmt we are to execute, and return it.
+ */
+static QueryDesc *
+ExecParallelGetQueryDesc(shm_toc *toc, DestReceiver *receiver,
+ int instrument_options)
+{
+ char *pstmtspace;
+ char *paramspace;
+ PlannedStmt *pstmt;
+ ParamListInfo paramLI;
+ char *queryString;
+
+ /* Get the query string from shared memory */
+ queryString = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false);
+
+ /* Reconstruct leader-supplied PlannedStmt. */
+ pstmtspace = shm_toc_lookup(toc, PARALLEL_KEY_PLANNEDSTMT, false);
+ pstmt = (PlannedStmt *) stringToNode(pstmtspace);
+
+ /* Reconstruct ParamListInfo. */
+ paramspace = shm_toc_lookup(toc, PARALLEL_KEY_PARAMLISTINFO, false);
+ paramLI = RestoreParamList(&paramspace);
+
+ /* Create a QueryDesc for the query. */
+ return CreateQueryDesc(pstmt,
+ queryString,
+ GetActiveSnapshot(), InvalidSnapshot,
+ receiver, paramLI, NULL, instrument_options);
+}
+
+/*
+ * Copy instrumentation information from this node and its descendants into
+ * dynamic shared memory, so that the parallel leader can retrieve it.
+ */
+static bool
+ExecParallelReportInstrumentation(PlanState *planstate,
+ SharedExecutorInstrumentation *instrumentation)
+{
+ int i;
+ int plan_node_id = planstate->plan->plan_node_id;
+ Instrumentation *instrument;
+
+ InstrEndLoop(planstate->instrument);
+
+ /*
+ * If we shuffled the plan_node_id values in ps_instrument into sorted
+ * order, we could use binary search here. This might matter someday if
+ * we're pushing down sufficiently large plan trees. For now, do it the
+ * slow, dumb way.
+ */
+ for (i = 0; i < instrumentation->num_plan_nodes; ++i)
+ if (instrumentation->plan_node_id[i] == plan_node_id)
+ break;
+ if (i >= instrumentation->num_plan_nodes)
+ elog(ERROR, "plan node %d not found", plan_node_id);
+
+ /*
+ * Add our statistics to the per-node, per-worker totals. It's possible
+ * that this could happen more than once if we relaunched workers.
+ */
+ instrument = GetInstrumentationArray(instrumentation);
+ instrument += i * instrumentation->num_workers;
+ Assert(IsParallelWorker());
+ Assert(ParallelWorkerNumber < instrumentation->num_workers);
+ InstrAggNode(&instrument[ParallelWorkerNumber], planstate->instrument);
+
+ return planstate_tree_walker(planstate, ExecParallelReportInstrumentation,
+ instrumentation);
+}
+
+/*
+ * Initialize the PlanState and its descendants with the information
+ * retrieved from shared memory. This has to be done once the PlanState
+ * is allocated and initialized by executor; that is, after ExecutorStart().
+ */
+static bool
+ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
+{
+ if (planstate == NULL)
+ return false;
+
+ switch (nodeTag(planstate))
+ {
+ case T_SeqScanState:
+ if (planstate->plan->parallel_aware)
+ ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt);
+ break;
+ case T_IndexScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexScanInitializeWorker((IndexScanState *) planstate,
+ pwcxt);
+ break;
+ case T_IndexOnlyScanState:
+ if (planstate->plan->parallel_aware)
+ ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate,
+ pwcxt);
+ break;
+ case T_ForeignScanState:
+ if (planstate->plan->parallel_aware)
+ ExecForeignScanInitializeWorker((ForeignScanState *) planstate,
+ pwcxt);
+ break;
+ case T_AppendState:
+ if (planstate->plan->parallel_aware)
+ ExecAppendInitializeWorker((AppendState *) planstate, pwcxt);
+ break;
+ case T_CustomScanState:
+ if (planstate->plan->parallel_aware)
+ ExecCustomScanInitializeWorker((CustomScanState *) planstate,
+ pwcxt);
+ break;
+ case T_BitmapHeapScanState:
+ if (planstate->plan->parallel_aware)
+ ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate,
+ pwcxt);
+ break;
+ case T_HashJoinState:
+ if (planstate->plan->parallel_aware)
+ ExecHashJoinInitializeWorker((HashJoinState *) planstate,
+ pwcxt);
+ break;
+ case T_HashState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecHashInitializeWorker((HashState *) planstate, pwcxt);
+ break;
+ case T_SortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecSortInitializeWorker((SortState *) planstate, pwcxt);
+ break;
+ case T_IncrementalSortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate,
+ pwcxt);
+ break;
+ case T_AggState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecAggInitializeWorker((AggState *) planstate, pwcxt);
+ break;
+ case T_MemoizeState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecMemoizeInitializeWorker((MemoizeState *) planstate, pwcxt);
+ break;
+ default:
+ break;
+ }
+
+ return planstate_tree_walker(planstate, ExecParallelInitializeWorker,
+ pwcxt);
+}
+
+/*
+ * Main entrypoint for parallel query worker processes.
+ *
+ * We reach this function from ParallelWorkerMain, so the setup necessary to
+ * create a sensible parallel environment has already been done;
+ * ParallelWorkerMain worries about stuff like the transaction state, combo
+ * CID mappings, and GUC values, so we don't need to deal with any of that
+ * here.
+ *
+ * Our job is to deal with concerns specific to the executor. The parallel
+ * group leader will have stored a serialized PlannedStmt, and it's our job
+ * to execute that plan and write the resulting tuples to the appropriate
+ * tuple queue. Various bits of supporting information that we need in order
+ * to do this are also stored in the dsm_segment and can be accessed through
+ * the shm_toc.
+ */
+void
+ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
+{
+ FixedParallelExecutorState *fpes;
+ BufferUsage *buffer_usage;
+ WalUsage *wal_usage;
+ DestReceiver *receiver;
+ QueryDesc *queryDesc;
+ SharedExecutorInstrumentation *instrumentation;
+ SharedJitInstrumentation *jit_instrumentation;
+ int instrument_options = 0;
+ void *area_space;
+ dsa_area *area;
+ ParallelWorkerContext pwcxt;
+
+ /* Get fixed-size state. */
+ fpes = shm_toc_lookup(toc, PARALLEL_KEY_EXECUTOR_FIXED, false);
+
+ /* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */
+ receiver = ExecParallelGetReceiver(seg, toc);
+ instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, true);
+ if (instrumentation != NULL)
+ instrument_options = instrumentation->instrument_options;
+ jit_instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_JIT_INSTRUMENTATION,
+ true);
+ queryDesc = ExecParallelGetQueryDesc(toc, receiver, instrument_options);
+
+ /* Setting debug_query_string for individual workers */
+ debug_query_string = queryDesc->sourceText;
+
+ /* Report workers' query and queryId for monitoring purposes */
+ pgstat_report_activity(STATE_RUNNING, debug_query_string);
+
+ /* Attach to the dynamic shared memory area. */
+ area_space = shm_toc_lookup(toc, PARALLEL_KEY_DSA, false);
+ area = dsa_attach_in_place(area_space, seg);
+
+ /* Start up the executor */
+ queryDesc->plannedstmt->jitFlags = fpes->jit_flags;
+ ExecutorStart(queryDesc, fpes->eflags);
+
+ /* Special executor initialization steps for parallel workers */
+ queryDesc->planstate->state->es_query_dsa = area;
+ if (DsaPointerIsValid(fpes->param_exec))
+ {
+ char *paramexec_space;
+
+ paramexec_space = dsa_get_address(area, fpes->param_exec);
+ RestoreParamExecParams(paramexec_space, queryDesc->estate);
+ }
+ pwcxt.toc = toc;
+ pwcxt.seg = seg;
+ ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt);
+
+ /* Pass down any tuple bound */
+ ExecSetTupleBound(fpes->tuples_needed, queryDesc->planstate);
+
+ /*
+ * Prepare to track buffer/WAL usage during query execution.
+ *
+ * We do this after starting up the executor to match what happens in the
+ * leader, which also doesn't count buffer accesses and WAL activity that
+ * occur during executor startup.
+ */
+ InstrStartParallelQuery();
+
+ /*
+ * Run the plan. If we specified a tuple bound, be careful not to demand
+ * more tuples than that.
+ */
+ ExecutorRun(queryDesc,
+ ForwardScanDirection,
+ fpes->tuples_needed < 0 ? (int64) 0 : fpes->tuples_needed,
+ true);
+
+ /* Shut down the executor */
+ ExecutorFinish(queryDesc);
+
+ /* Report buffer/WAL usage during parallel execution. */
+ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
+ wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
+ InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
+ &wal_usage[ParallelWorkerNumber]);
+
+ /* Report instrumentation data if any instrumentation options are set. */
+ if (instrumentation != NULL)
+ ExecParallelReportInstrumentation(queryDesc->planstate,
+ instrumentation);
+
+ /* Report JIT instrumentation data if any */
+ if (queryDesc->estate->es_jit && jit_instrumentation != NULL)
+ {
+ Assert(ParallelWorkerNumber < jit_instrumentation->num_workers);
+ jit_instrumentation->jit_instr[ParallelWorkerNumber] =
+ queryDesc->estate->es_jit->instr;
+ }
+
+ /* Must do this after capturing instrumentation. */
+ ExecutorEnd(queryDesc);
+
+ /* Cleanup. */
+ dsa_detach(area);
+ FreeQueryDesc(queryDesc);
+ receiver->rDestroy(receiver);
+}
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
new file mode 100644
index 0000000..606c920
--- /dev/null
+++ b/src/backend/executor/execPartition.c
@@ -0,0 +1,2107 @@
+/*-------------------------------------------------------------------------
+ *
+ * execPartition.c
+ * Support routines for partitioning.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execPartition.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/table.h"
+#include "access/tableam.h"
+#include "catalog/partition.h"
+#include "catalog/pg_inherits.h"
+#include "catalog/pg_type.h"
+#include "executor/execPartition.h"
+#include "executor/executor.h"
+#include "foreign/fdwapi.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "partitioning/partbounds.h"
+#include "partitioning/partdesc.h"
+#include "partitioning/partprune.h"
+#include "rewrite/rewriteManip.h"
+#include "utils/acl.h"
+#include "utils/lsyscache.h"
+#include "utils/partcache.h"
+#include "utils/rls.h"
+#include "utils/ruleutils.h"
+
+
+/*-----------------------
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
+ *
+ * partition_root
+ * The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ * Array of 'max_dispatch' elements containing a pointer to a
+ * PartitionDispatch object for every partitioned table touched by tuple
+ * routing. The entry for the target partitioned table is *always*
+ * present in the 0th element of this array. See comment for
+ * PartitionDispatchData->indexes for details on how this array is
+ * indexed.
+ *
+ * nonleaf_partitions
+ * Array of 'max_dispatch' elements containing pointers to fake
+ * ResultRelInfo objects for nonleaf partitions, useful for checking
+ * the partition constraint.
+ *
+ * num_dispatch
+ * The current number of items stored in the 'partition_dispatch_info'
+ * array. Also serves as the index of the next free array element for
+ * new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ * The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ * Array of 'max_partitions' elements containing a pointer to a
+ * ResultRelInfo for every leaf partition touched by tuple routing.
+ * Some of these are pointers to ResultRelInfos which are borrowed out of
+ * the owning ModifyTableState node. The remainder have been built
+ * especially for tuple routing. See comment for
+ * PartitionDispatchData->indexes for details on how this array is
+ * indexed.
+ *
+ * is_borrowed_rel
+ * Array of 'max_partitions' booleans recording whether a given entry
+ * in 'partitions' is a ResultRelInfo pointer borrowed from the owning
+ * ModifyTableState node, rather than being built here.
+ *
+ * num_partitions
+ * The current number of items stored in the 'partitions' array. Also
+ * serves as the index of the next free array element for new
+ * ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ * The current allocated size of the 'partitions' array.
+ *
+ * memcxt
+ * Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+struct PartitionTupleRouting
+{
+ Relation partition_root;
+ PartitionDispatch *partition_dispatch_info;
+ ResultRelInfo **nonleaf_partitions;
+ int num_dispatch;
+ int max_dispatch;
+ ResultRelInfo **partitions;
+ bool *is_borrowed_rel;
+ int num_partitions;
+ int max_partitions;
+ MemoryContext memcxt;
+};
+
+/*-----------------------
+ * PartitionDispatch - information about one partitioned table in a partition
+ * hierarchy required to route a tuple to any of its partitions. A
+ * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
+ * struct and stored inside its 'partition_dispatch_info' array.
+ *
+ * reldesc
+ * Relation descriptor of the table
+ *
+ * key
+ * Partition key information of the table
+ *
+ * keystate
+ * Execution state required for expressions in the partition key
+ *
+ * partdesc
+ * Partition descriptor of the table
+ *
+ * tupslot
+ * A standalone TupleTableSlot initialized with this table's tuple
+ * descriptor, or NULL if no tuple conversion from the parent's rowtype
+ * is required.
+ *
+ * tupmap
+ * TupleConversionMap to convert from the parent's rowtype to this table's
+ * rowtype (when extracting the partition key of a tuple just before
+ * routing it through this table). A NULL value is stored if no tuple
+ * conversion is required.
+ *
+ * indexes
+ * Array of partdesc->nparts elements. For leaf partitions the index
+ * corresponds to the partition's ResultRelInfo in the encapsulating
+ * PartitionTupleRouting's partitions array. For partitioned partitions,
+ * the index corresponds to the PartitionDispatch for it in its
+ * partition_dispatch_info array. -1 indicates we've not yet allocated
+ * anything in PartitionTupleRouting for the partition.
+ *-----------------------
+ */
+typedef struct PartitionDispatchData
+{
+ Relation reldesc;
+ PartitionKey key;
+ List *keystate; /* list of ExprState */
+ PartitionDesc partdesc;
+ TupleTableSlot *tupslot;
+ AttrMap *tupmap;
+ int indexes[FLEXIBLE_ARRAY_MEMBER];
+} PartitionDispatchData;
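+
+/*
+ * Editorial sketch (not part of the upstream source): condensed view of how
+ * the 'indexes' array above is navigated during routing; the complete logic
+ * is in ExecFindPartition() below.  A value of -1 means the corresponding
+ * ResultRelInfo or PartitionDispatch has not been built yet.
+ *
+ *     if (dispatch->partdesc->is_leaf[partidx])
+ *         rri = proute->partitions[dispatch->indexes[partidx]];
+ *     else
+ *         dispatch = proute->partition_dispatch_info[dispatch->indexes[partidx]];
+ */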
+
+
+static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
+ EState *estate, PartitionTupleRouting *proute,
+ PartitionDispatch dispatch,
+ ResultRelInfo *rootResultRelInfo,
+ int partidx);
+static void ExecInitRoutingInfo(ModifyTableState *mtstate,
+ EState *estate,
+ PartitionTupleRouting *proute,
+ PartitionDispatch dispatch,
+ ResultRelInfo *partRelInfo,
+ int partidx,
+ bool is_borrowed_rel);
+static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
+ PartitionTupleRouting *proute,
+ Oid partoid, PartitionDispatch parent_pd,
+ int partidx, ResultRelInfo *rootResultRelInfo);
+static void FormPartitionKeyDatum(PartitionDispatch pd,
+ TupleTableSlot *slot,
+ EState *estate,
+ Datum *values,
+ bool *isnull);
+static int get_partition_for_tuple(PartitionDispatch pd, Datum *values,
+ bool *isnull);
+static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
+ Datum *values,
+ bool *isnull,
+ int maxfieldlen);
+static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri);
+static void ExecInitPruningContext(PartitionPruneContext *context,
+ List *pruning_steps,
+ PartitionDesc partdesc,
+ PartitionKey partkey,
+ PlanState *planstate);
+static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
+ PartitionedRelPruningData *pprune,
+ bool initial_prune,
+ Bitmapset **validsubplans);
+
+
+/*
+ * ExecSetupPartitionTupleRouting - sets up information needed during
+ * tuple routing for partitioned tables, encapsulates it in
+ * PartitionTupleRouting, and returns it.
+ *
+ * Callers must use the returned PartitionTupleRouting during calls to
+ * ExecFindPartition(). The actual ResultRelInfo for a partition is only
+ * allocated when the partition is found for the first time.
+ *
+ * The current memory context is used to allocate this struct and all
+ * subsidiary structs that will be allocated from it later on. Typically
+ * it should be estate->es_query_cxt.
+ */
+PartitionTupleRouting *
+ExecSetupPartitionTupleRouting(EState *estate, Relation rel)
+{
+ PartitionTupleRouting *proute;
+
+ /*
+ * Here we attempt to expend as little effort as possible in setting up
+ * the PartitionTupleRouting. Each partition's ResultRelInfo is built on
+ * demand, only when we actually need to route a tuple to that partition.
+ * The reason for this is that a common case is for INSERT to insert a
+ * single tuple into a partitioned table and this must be fast.
+ */
+ proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
+ proute->partition_root = rel;
+ proute->memcxt = CurrentMemoryContext;
+ /* Rest of members initialized by zeroing */
+
+ /*
+ * Initialize this table's PartitionDispatch object. Here we pass in the
+ * parent as NULL as we don't need to care about any parent of the target
+ * partitioned table.
+ */
+ ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
+ NULL, 0, NULL);
+
+ return proute;
+}
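+
+/*
+ * Editorial sketch (not part of the upstream source): typical caller-side
+ * sequence for routing INSERTed tuples, simplified from the way
+ * nodeModifyTable.c and copyfrom.c use these routines:
+ *
+ *     proute = ExecSetupPartitionTupleRouting(estate, rel);
+ *     for each incoming tuple slot:
+ *         partRelInfo = ExecFindPartition(mtstate, rootResultRelInfo,
+ *                                         proute, slot, estate);
+ *         ... insert the (possibly converted) tuple into partRelInfo ...
+ *     ExecCleanupTupleRouting(mtstate, proute);
+ */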
+
+/*
+ * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
+ * the tuple contained in *slot should belong to.
+ *
+ * If the partition's ResultRelInfo does not yet exist in 'proute' then we set
+ * one up or reuse one from mtstate's resultRelInfo array. When reusing a
+ * ResultRelInfo from the mtstate we verify that the relation is a valid
+ * target for INSERTs and initialize tuple routing information.
+ *
+ * rootResultRelInfo is the relation named in the query.
+ *
+ * estate must be non-NULL; we'll need it to compute any expressions in the
+ * partition keys. Also, its per-tuple contexts are used as evaluation
+ * scratch space.
+ *
+ * If no leaf partition is found, this routine errors out with the appropriate
+ * error message. An error may also be raised if the found target partition
+ * is not a valid target for an INSERT.
+ */
+ResultRelInfo *
+ExecFindPartition(ModifyTableState *mtstate,
+ ResultRelInfo *rootResultRelInfo,
+ PartitionTupleRouting *proute,
+ TupleTableSlot *slot, EState *estate)
+{
+ PartitionDispatch *pd = proute->partition_dispatch_info;
+ Datum values[PARTITION_MAX_KEYS];
+ bool isnull[PARTITION_MAX_KEYS];
+ Relation rel;
+ PartitionDispatch dispatch;
+ PartitionDesc partdesc;
+ ExprContext *ecxt = GetPerTupleExprContext(estate);
+ TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
+ TupleTableSlot *rootslot = slot;
+ TupleTableSlot *myslot = NULL;
+ MemoryContext oldcxt;
+ ResultRelInfo *rri = NULL;
+
+ /* use per-tuple context here to avoid leaking memory */
+ oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+
+ /*
+ * First check the root table's partition constraint, if any. No point in
+ * routing the tuple if it doesn't belong in the root table itself.
+ */
+ if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition)
+ ExecPartitionCheck(rootResultRelInfo, slot, estate, true);
+
+ /* start with the root partitioned table */
+ dispatch = pd[0];
+ while (dispatch != NULL)
+ {
+ int partidx = -1;
+ bool is_leaf;
+
+ CHECK_FOR_INTERRUPTS();
+
+ rel = dispatch->reldesc;
+ partdesc = dispatch->partdesc;
+
+ /*
+ * Extract partition key from tuple. Expression evaluation machinery
+ * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
+ * point to the correct tuple slot. The slot might have changed from
+ * what was used for the parent table if the table of the current
+ * partitioning level has different tuple descriptor from the parent.
+ * So update ecxt_scantuple accordingly.
+ */
+ ecxt->ecxt_scantuple = slot;
+ FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);
+
+ /*
+ * If this partitioned table has no partitions or no partition for
+ * these values, error out.
+ */
+ if (partdesc->nparts == 0 ||
+ (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
+ {
+ char *val_desc;
+
+ val_desc = ExecBuildSlotPartitionKeyDescription(rel,
+ values, isnull, 64);
+ Assert(OidIsValid(RelationGetRelid(rel)));
+ ereport(ERROR,
+ (errcode(ERRCODE_CHECK_VIOLATION),
+ errmsg("no partition of relation \"%s\" found for row",
+ RelationGetRelationName(rel)),
+ val_desc ?
+ errdetail("Partition key of the failing row contains %s.",
+ val_desc) : 0,
+ errtable(rel)));
+ }
+
+ is_leaf = partdesc->is_leaf[partidx];
+ if (is_leaf)
+ {
+ /*
+ * We've reached the leaf -- hurray, we're done. Look to see if
+ * we've already got a ResultRelInfo for this partition.
+ */
+ if (likely(dispatch->indexes[partidx] >= 0))
+ {
+ /* ResultRelInfo already built */
+ Assert(dispatch->indexes[partidx] < proute->num_partitions);
+ rri = proute->partitions[dispatch->indexes[partidx]];
+ }
+ else
+ {
+ /*
+ * If the partition is known in the owning ModifyTableState
+ * node, we can re-use that ResultRelInfo instead of creating
+ * a new one with ExecInitPartitionInfo().
+ */
+ rri = ExecLookupResultRelByOid(mtstate,
+ partdesc->oids[partidx],
+ true, false);
+ if (rri)
+ {
+ /* Verify this ResultRelInfo allows INSERTs */
+ CheckValidResultRel(rri, CMD_INSERT);
+
+ /*
+ * Initialize information needed to insert this and
+ * subsequent tuples routed to this partition.
+ */
+ ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
+ rri, partidx, true);
+ }
+ else
+ {
+ /* We need to create a new one. */
+ rri = ExecInitPartitionInfo(mtstate, estate, proute,
+ dispatch,
+ rootResultRelInfo, partidx);
+ }
+ }
+ Assert(rri != NULL);
+
+ /* Signal to terminate the loop */
+ dispatch = NULL;
+ }
+ else
+ {
+ /*
+ * Partition is a sub-partitioned table; get the PartitionDispatch
+ */
+ if (likely(dispatch->indexes[partidx] >= 0))
+ {
+ /* Already built. */
+ Assert(dispatch->indexes[partidx] < proute->num_dispatch);
+
+ rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
+
+ /*
+ * Move down to the next partition level and search again
+ * until we find a leaf partition that matches this tuple
+ */
+ dispatch = pd[dispatch->indexes[partidx]];
+ }
+ else
+ {
+ /* Not yet built. Do that now. */
+ PartitionDispatch subdispatch;
+
+ /*
+ * Create the new PartitionDispatch. We pass the current one
+ * in as the parent PartitionDispatch
+ */
+ subdispatch = ExecInitPartitionDispatchInfo(estate,
+ proute,
+ partdesc->oids[partidx],
+ dispatch, partidx,
+ mtstate->rootResultRelInfo);
+ Assert(dispatch->indexes[partidx] >= 0 &&
+ dispatch->indexes[partidx] < proute->num_dispatch);
+
+ rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
+ dispatch = subdispatch;
+ }
+
+ /*
+ * Convert the tuple to the new parent's layout, if different from
+ * the previous parent.
+ */
+ if (dispatch->tupslot)
+ {
+ AttrMap *map = dispatch->tupmap;
+ TupleTableSlot *tempslot = myslot;
+
+ myslot = dispatch->tupslot;
+ slot = execute_attr_map_slot(map, slot, myslot);
+
+ if (tempslot != NULL)
+ ExecClearTuple(tempslot);
+ }
+ }
+
+ /*
+ * If this partition is the default one, we must check its partition
+ * constraint now, which may have changed concurrently due to
+ * partitions being added to the parent.
+ *
+ * (We do this here, and do not rely on ExecInsert doing it, because
+ * we don't want to miss doing it for non-leaf partitions.)
+ */
+ if (partidx == partdesc->boundinfo->default_index)
+ {
+ /*
+ * The tuple must match the partition's layout for the constraint
+ * expression to be evaluated successfully. If the partition is
+ * sub-partitioned, that would already be the case due to the code
+ * above, but for a leaf partition the tuple still matches the
+ * parent's layout.
+ *
+ * Note that we have a map to convert from root to current
+ * partition, but not from immediate parent to current partition.
+ * So if we have to convert, do it from the root slot; if not, use
+ * the root slot as-is.
+ */
+ if (is_leaf)
+ {
+ TupleConversionMap *map = rri->ri_RootToPartitionMap;
+
+ if (map)
+ slot = execute_attr_map_slot(map->attrMap, rootslot,
+ rri->ri_PartitionTupleSlot);
+ else
+ slot = rootslot;
+ }
+
+ ExecPartitionCheck(rri, slot, estate, true);
+ }
+ }
+
+ /* Release the tuple in the lowest parent's dedicated slot. */
+ if (myslot != NULL)
+ ExecClearTuple(myslot);
+ /* and restore ecxt's scantuple */
+ ecxt->ecxt_scantuple = ecxt_scantuple_saved;
+ MemoryContextSwitchTo(oldcxt);
+
+ return rri;
+}
+
+/*
+ * ExecInitPartitionInfo
+ * Lock the partition and initialize ResultRelInfo. Also setup other
+ * information for the partition and store it in the next empty slot in
+ * the proute->partitions array.
+ *
+ * Returns the ResultRelInfo
+ */
+static ResultRelInfo *
+ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
+ PartitionTupleRouting *proute,
+ PartitionDispatch dispatch,
+ ResultRelInfo *rootResultRelInfo,
+ int partidx)
+{
+ ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+ Oid partOid = dispatch->partdesc->oids[partidx];
+ Relation partrel;
+ int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
+ Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
+ ResultRelInfo *leaf_part_rri;
+ MemoryContext oldcxt;
+ AttrMap *part_attmap = NULL;
+ bool found_whole_row;
+
+ oldcxt = MemoryContextSwitchTo(proute->memcxt);
+
+ partrel = table_open(partOid, RowExclusiveLock);
+
+ leaf_part_rri = makeNode(ResultRelInfo);
+ InitResultRelInfo(leaf_part_rri,
+ partrel,
+ 0,
+ rootResultRelInfo,
+ estate->es_instrument);
+
+ /*
+ * Verify result relation is a valid target for an INSERT. An UPDATE of a
+ * partition-key becomes a DELETE+INSERT operation, so this check is still
+ * required when the operation is CMD_UPDATE.
+ */
+ CheckValidResultRel(leaf_part_rri, CMD_INSERT);
+
+ /*
+ * Open partition indices. The user may have asked to check for conflicts
+ * within this leaf partition and do "nothing" instead of throwing an
+ * error. Be prepared in that case by initializing the index information
+ * needed by ExecInsert() to perform speculative insertions.
+ */
+ if (partrel->rd_rel->relhasindex &&
+ leaf_part_rri->ri_IndexRelationDescs == NULL)
+ ExecOpenIndices(leaf_part_rri,
+ (node != NULL &&
+ node->onConflictAction != ONCONFLICT_NONE));
+
+ /*
+ * Build WITH CHECK OPTION constraints for the partition. Note that we
+ * didn't build the withCheckOptionList for partitions within the planner,
+ * but simple translation of varattnos will suffice. This only occurs for
+ * the INSERT case or in the case of UPDATE tuple routing where we didn't
+ * find a result rel to reuse.
+ */
+ if (node && node->withCheckOptionLists != NIL)
+ {
+ List *wcoList;
+ List *wcoExprs = NIL;
+ ListCell *ll;
+
+ /*
+ * In the case of INSERT on a partitioned table, there is only one
+ * plan. Likewise, there is only one WCO list, not one per partition.
+ * For UPDATE, there are as many WCO lists as there are plans.
+ */
+ Assert((node->operation == CMD_INSERT &&
+ list_length(node->withCheckOptionLists) == 1 &&
+ list_length(node->resultRelations) == 1) ||
+ (node->operation == CMD_UPDATE &&
+ list_length(node->withCheckOptionLists) ==
+ list_length(node->resultRelations)));
+
+ /*
+ * Use the WCO list of the first plan as a reference to calculate
+ * attno's for the WCO list of this partition. In the INSERT case,
+ * that refers to the root partitioned table, whereas in the UPDATE
+ * tuple routing case, that refers to the first partition in the
+ * mtstate->resultRelInfo array. In any case, both that relation and
+ * this partition should have the same columns, so we should be able
+ * to map attributes successfully.
+ */
+ wcoList = linitial(node->withCheckOptionLists);
+
+ /*
+ * Convert Vars in it to contain this partition's attribute numbers.
+ */
+ part_attmap =
+ build_attrmap_by_name(RelationGetDescr(partrel),
+ RelationGetDescr(firstResultRel));
+ wcoList = (List *)
+ map_variable_attnos((Node *) wcoList,
+ firstVarno, 0,
+ part_attmap,
+ RelationGetForm(partrel)->reltype,
+ &found_whole_row);
+ /* We ignore the value of found_whole_row. */
+
+ foreach(ll, wcoList)
+ {
+ WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll));
+ ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual),
+ &mtstate->ps);
+
+ wcoExprs = lappend(wcoExprs, wcoExpr);
+ }
+
+ leaf_part_rri->ri_WithCheckOptions = wcoList;
+ leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs;
+ }
+
+ /*
+ * Build the RETURNING projection for the partition. Note that we didn't
+ * build the returningList for partitions within the planner, but simple
+ * translation of varattnos will suffice. This only occurs for the INSERT
+ * case or in the case of UPDATE tuple routing where we didn't find a
+ * result rel to reuse.
+ */
+ if (node && node->returningLists != NIL)
+ {
+ TupleTableSlot *slot;
+ ExprContext *econtext;
+ List *returningList;
+
+ /* See the comment above for WCO lists. */
+ Assert((node->operation == CMD_INSERT &&
+ list_length(node->returningLists) == 1 &&
+ list_length(node->resultRelations) == 1) ||
+ (node->operation == CMD_UPDATE &&
+ list_length(node->returningLists) ==
+ list_length(node->resultRelations)));
+
+ /*
+ * Use the RETURNING list of the first plan as a reference to
+ * calculate attno's for the RETURNING list of this partition. See
+ * the comment above for WCO lists for more details on why this is
+ * okay.
+ */
+ returningList = linitial(node->returningLists);
+
+ /*
+ * Convert Vars in it to contain this partition's attribute numbers.
+ */
+ if (part_attmap == NULL)
+ part_attmap =
+ build_attrmap_by_name(RelationGetDescr(partrel),
+ RelationGetDescr(firstResultRel));
+ returningList = (List *)
+ map_variable_attnos((Node *) returningList,
+ firstVarno, 0,
+ part_attmap,
+ RelationGetForm(partrel)->reltype,
+ &found_whole_row);
+ /* We ignore the value of found_whole_row. */
+
+ leaf_part_rri->ri_returningList = returningList;
+
+ /*
+ * Initialize the projection itself.
+ *
+ * Use the slot and the expression context that would have been set up
+ * in ExecInitModifyTable() for projection's output.
+ */
+ Assert(mtstate->ps.ps_ResultTupleSlot != NULL);
+ slot = mtstate->ps.ps_ResultTupleSlot;
+ Assert(mtstate->ps.ps_ExprContext != NULL);
+ econtext = mtstate->ps.ps_ExprContext;
+ leaf_part_rri->ri_projectReturning =
+ ExecBuildProjectionInfo(returningList, econtext, slot,
+ &mtstate->ps, RelationGetDescr(partrel));
+ }
+
+ /* Set up information needed for routing tuples to the partition. */
+ ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
+ leaf_part_rri, partidx, false);
+
+ /*
+ * If there is an ON CONFLICT clause, initialize state for it.
+ */
+ if (node && node->onConflictAction != ONCONFLICT_NONE)
+ {
+ TupleDesc partrelDesc = RelationGetDescr(partrel);
+ ExprContext *econtext = mtstate->ps.ps_ExprContext;
+ ListCell *lc;
+ List *arbiterIndexes = NIL;
+
+ /*
+ * If there is a list of arbiter indexes, map it to a list of indexes
+ * in the partition. We do that by scanning the partition's index
+ * list and searching for ancestry relationships to each index in the
+ * ancestor table.
+ */
+ if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0)
+ {
+ List *childIdxs;
+
+ childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc);
+
+ foreach(lc, childIdxs)
+ {
+ Oid childIdx = lfirst_oid(lc);
+ List *ancestors;
+ ListCell *lc2;
+
+ ancestors = get_partition_ancestors(childIdx);
+ foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes)
+ {
+ if (list_member_oid(ancestors, lfirst_oid(lc2)))
+ arbiterIndexes = lappend_oid(arbiterIndexes, childIdx);
+ }
+ list_free(ancestors);
+ }
+ }
+
+ /*
+	 * If the resulting lists are of unequal length, something is wrong.
+ * (This shouldn't happen, since arbiter index selection should not
+ * pick up an invalid index.)
+ */
+ if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
+ list_length(arbiterIndexes))
+ elog(ERROR, "invalid arbiter index list");
+ leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;
+
+ /*
+ * In the DO UPDATE case, we have some more state to initialize.
+ */
+ if (node->onConflictAction == ONCONFLICT_UPDATE)
+ {
+ OnConflictSetState *onconfl = makeNode(OnConflictSetState);
+ TupleConversionMap *map;
+
+ map = leaf_part_rri->ri_RootToPartitionMap;
+
+ Assert(node->onConflictSet != NIL);
+ Assert(rootResultRelInfo->ri_onConflict != NULL);
+
+ leaf_part_rri->ri_onConflict = onconfl;
+
+ /*
+ * Need a separate existing slot for each partition, as the
+ * partition could be of a different AM, even if the tuple
+ * descriptors match.
+ */
+ onconfl->oc_Existing =
+ table_slot_create(leaf_part_rri->ri_RelationDesc,
+ &mtstate->ps.state->es_tupleTable);
+
+ /*
+ * If the partition's tuple descriptor matches exactly the root
+ * parent (the common case), we can re-use most of the parent's ON
+ * CONFLICT SET state, skipping a bunch of work. Otherwise, we
+ * need to create state specific to this partition.
+ */
+ if (map == NULL)
+ {
+ /*
+ * It's safe to reuse these from the partition root, as we
+ * only process one tuple at a time (therefore we won't
+ * overwrite needed data in slots), and the results of
+ * projections are independent of the underlying storage.
+			 * The projection and WHERE clause structures themselves don't
+			 * store state and are likewise independent of the underlying
+			 * storage.
+ */
+ onconfl->oc_ProjSlot =
+ rootResultRelInfo->ri_onConflict->oc_ProjSlot;
+ onconfl->oc_ProjInfo =
+ rootResultRelInfo->ri_onConflict->oc_ProjInfo;
+ onconfl->oc_WhereClause =
+ rootResultRelInfo->ri_onConflict->oc_WhereClause;
+ }
+ else
+ {
+ List *onconflset;
+ List *onconflcols;
+ bool found_whole_row;
+
+ /*
+ * Translate expressions in onConflictSet to account for
+ * different attribute numbers. For that, map partition
+ * varattnos twice: first to catch the EXCLUDED
+ * pseudo-relation (INNER_VAR), and second to handle the main
+ * target relation (firstVarno).
+ */
+ onconflset = copyObject(node->onConflictSet);
+ if (part_attmap == NULL)
+ part_attmap =
+ build_attrmap_by_name(RelationGetDescr(partrel),
+ RelationGetDescr(firstResultRel));
+ onconflset = (List *)
+ map_variable_attnos((Node *) onconflset,
+ INNER_VAR, 0,
+ part_attmap,
+ RelationGetForm(partrel)->reltype,
+ &found_whole_row);
+ /* We ignore the value of found_whole_row. */
+ onconflset = (List *)
+ map_variable_attnos((Node *) onconflset,
+ firstVarno, 0,
+ part_attmap,
+ RelationGetForm(partrel)->reltype,
+ &found_whole_row);
+ /* We ignore the value of found_whole_row. */
+
+ /* Finally, adjust the target colnos to match the partition. */
+ onconflcols = adjust_partition_colnos(node->onConflictCols,
+ leaf_part_rri);
+
+ /* create the tuple slot for the UPDATE SET projection */
+ onconfl->oc_ProjSlot =
+ table_slot_create(partrel,
+ &mtstate->ps.state->es_tupleTable);
+
+ /* build UPDATE SET projection state */
+ onconfl->oc_ProjInfo =
+ ExecBuildUpdateProjection(onconflset,
+ true,
+ onconflcols,
+ partrelDesc,
+ econtext,
+ onconfl->oc_ProjSlot,
+ &mtstate->ps);
+
+ /*
+ * If there is a WHERE clause, initialize state where it will
+ * be evaluated, mapping the attribute numbers appropriately.
+ * As with onConflictSet, we need to map partition varattnos
+ * to the partition's tupdesc.
+ */
+ if (node->onConflictWhere)
+ {
+ List *clause;
+
+ clause = copyObject((List *) node->onConflictWhere);
+ clause = (List *)
+ map_variable_attnos((Node *) clause,
+ INNER_VAR, 0,
+ part_attmap,
+ RelationGetForm(partrel)->reltype,
+ &found_whole_row);
+ /* We ignore the value of found_whole_row. */
+ clause = (List *)
+ map_variable_attnos((Node *) clause,
+ firstVarno, 0,
+ part_attmap,
+ RelationGetForm(partrel)->reltype,
+ &found_whole_row);
+ /* We ignore the value of found_whole_row. */
+ onconfl->oc_WhereClause =
+ ExecInitQual((List *) clause, &mtstate->ps);
+ }
+ }
+ }
+ }
+
+ /*
+ * Since we've just initialized this ResultRelInfo, it's not in any list
+ * attached to the estate as yet. Add it, so that it can be found later.
+ *
+ * Note that the entries in this list appear in no predetermined order,
+ * because partition result rels are initialized as and when they're
+ * needed.
+ */
+ MemoryContextSwitchTo(estate->es_query_cxt);
+ estate->es_tuple_routing_result_relations =
+ lappend(estate->es_tuple_routing_result_relations,
+ leaf_part_rri);
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return leaf_part_rri;
+}
+
+/*
+ * ExecInitRoutingInfo
+ * Set up information needed for translating tuples between root
+ * partitioned table format and partition format, and keep track of it
+ * in PartitionTupleRouting.
+ */
+static void
+ExecInitRoutingInfo(ModifyTableState *mtstate,
+ EState *estate,
+ PartitionTupleRouting *proute,
+ PartitionDispatch dispatch,
+ ResultRelInfo *partRelInfo,
+ int partidx,
+ bool is_borrowed_rel)
+{
+ ResultRelInfo *rootRelInfo = partRelInfo->ri_RootResultRelInfo;
+ MemoryContext oldcxt;
+ int rri_index;
+
+ oldcxt = MemoryContextSwitchTo(proute->memcxt);
+
+ /*
+ * Set up a tuple conversion map to convert a tuple routed to the
+ * partition from the parent's type to the partition's.
+ */
+ partRelInfo->ri_RootToPartitionMap =
+ convert_tuples_by_name(RelationGetDescr(rootRelInfo->ri_RelationDesc),
+ RelationGetDescr(partRelInfo->ri_RelationDesc));
+
+ /*
+ * If a partition has a different rowtype than the root parent, initialize
+ * a slot dedicated to storing this partition's tuples. The slot is used
+ * for various operations that are applied to tuples after routing, such
+ * as checking constraints.
+ */
+ if (partRelInfo->ri_RootToPartitionMap != NULL)
+ {
+ Relation partrel = partRelInfo->ri_RelationDesc;
+
+ /*
+ * Initialize the slot itself, setting its descriptor to this
+ * partition's TupleDesc; the TupleDesc reference will be released
+ * at the end of the command.
+ */
+ partRelInfo->ri_PartitionTupleSlot =
+ table_slot_create(partrel, &estate->es_tupleTable);
+ }
+ else
+ partRelInfo->ri_PartitionTupleSlot = NULL;
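+
+ /*
+ * For illustration (hypothetical definitions): if the root is declared as
+ * (a int, b text) while an attached partition was created standalone with
+ * columns (b text, a int), ri_RootToPartitionMap reorders the routed
+ * tuple's columns, and ri_PartitionTupleSlot holds the converted tuple in
+ * the partition's own rowtype.
+ */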
+
+ /*
+ * If the partition is a foreign table, let the FDW init itself for
+ * routing tuples to the partition.
+ */
+ if (partRelInfo->ri_FdwRoutine != NULL &&
+ partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
+ partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
+
+ /*
+ * Determine whether the FDW supports batch insert and, if so, the batch
+ * size (an FDW may support batching, but it may be disabled for the
+ * server/table or for this particular query).
+ *
+ * If the FDW does not support batching, we set the batch size to 1.
+ */
+ if (mtstate->operation == CMD_INSERT &&
+ partRelInfo->ri_FdwRoutine != NULL &&
+ partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
+ partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
+ partRelInfo->ri_BatchSize =
+ partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo);
+ else
+ partRelInfo->ri_BatchSize = 1;
+
+ Assert(partRelInfo->ri_BatchSize >= 1);
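+
+ /*
+ * For instance, postgres_fdw's GetForeignModifyBatchSize callback is
+ * understood to derive this from its batch_size foreign-table/server
+ * option and to report 1 when batching cannot be used for the query
+ * (e.g. when RETURNING is needed); other FDWs may apply their own rules.
+ */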
+
+ partRelInfo->ri_CopyMultiInsertBuffer = NULL;
+
+ /*
+ * Keep track of it in the PartitionTupleRouting->partitions array.
+ */
+ Assert(dispatch->indexes[partidx] == -1);
+
+ rri_index = proute->num_partitions++;
+
+ /* Allocate or enlarge the array, as needed */
+ if (proute->num_partitions >= proute->max_partitions)
+ {
+ if (proute->max_partitions == 0)
+ {
+ proute->max_partitions = 8;
+ proute->partitions = (ResultRelInfo **)
+ palloc(sizeof(ResultRelInfo *) * proute->max_partitions);
+ proute->is_borrowed_rel = (bool *)
+ palloc(sizeof(bool) * proute->max_partitions);
+ }
+ else
+ {
+ proute->max_partitions *= 2;
+ proute->partitions = (ResultRelInfo **)
+ repalloc(proute->partitions, sizeof(ResultRelInfo *) *
+ proute->max_partitions);
+ proute->is_borrowed_rel = (bool *)
+ repalloc(proute->is_borrowed_rel, sizeof(bool) *
+ proute->max_partitions);
+ }
+ }
+
+ proute->partitions[rri_index] = partRelInfo;
+ proute->is_borrowed_rel[rri_index] = is_borrowed_rel;
+ dispatch->indexes[partidx] = rri_index;
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * ExecInitPartitionDispatchInfo
+ * Lock the partitioned table (if not locked already), initialize a
+ * PartitionDispatch for it, and store that in the next available slot
+ * in the proute->partition_dispatch_info array. Also,
+ * record the index into this array in the parent_pd->indexes[] array in
+ * the partidx element so that we can properly retrieve the newly created
+ * PartitionDispatch later.
+ */
+static PartitionDispatch
+ExecInitPartitionDispatchInfo(EState *estate,
+ PartitionTupleRouting *proute, Oid partoid,
+ PartitionDispatch parent_pd, int partidx,
+ ResultRelInfo *rootResultRelInfo)
+{
+ Relation rel;
+ PartitionDesc partdesc;
+ PartitionDispatch pd;
+ int dispatchidx;
+ MemoryContext oldcxt;
+
+ /*
+ * For data modification, it is better that the executor does not include
+ * partitions being detached, except when running in snapshot-isolation
+ * mode. This means that a read-committed transaction immediately gets a
+ * "no partition for tuple" error when a tuple is inserted into a
+ * partition that's being detached concurrently, but a transaction in
+ * repeatable-read mode can still use such a partition.
+ */
+ if (estate->es_partition_directory == NULL)
+ estate->es_partition_directory =
+ CreatePartitionDirectory(estate->es_query_cxt,
+ !IsolationUsesXactSnapshot());
+
+ oldcxt = MemoryContextSwitchTo(proute->memcxt);
+
+ /*
+ * Only sub-partitioned tables need to be locked here. The root
+ * partitioned table will already have been locked as it's referenced in
+ * the query's rtable.
+ */
+ if (partoid != RelationGetRelid(proute->partition_root))
+ rel = table_open(partoid, RowExclusiveLock);
+ else
+ rel = proute->partition_root;
+ partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);
+
+ pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
+ partdesc->nparts * sizeof(int));
+ pd->reldesc = rel;
+ pd->key = RelationGetPartitionKey(rel);
+ pd->keystate = NIL;
+ pd->partdesc = partdesc;
+ if (parent_pd != NULL)
+ {
+ TupleDesc tupdesc = RelationGetDescr(rel);
+
+ /*
+ * For a sub-partitioned table whose column order differs from that of
+ * its direct parent partitioned table, we must store a tuple table slot
+ * initialized with its tuple descriptor and a tuple conversion map to
+ * convert a tuple from its parent's rowtype to its own. This is to
+ * make sure that we are looking at the correct row using the correct
+ * tuple descriptor when computing its partition key for tuple
+ * routing.
+ */
+ pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc),
+ tupdesc);
+ pd->tupslot = pd->tupmap ?
+ MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL;
+ }
+ else
+ {
+ /* Not required for the root partitioned table */
+ pd->tupmap = NULL;
+ pd->tupslot = NULL;
+ }
+
+ /*
+ * Initialize with -1 to signify that the corresponding partition's
+ * ResultRelInfo or PartitionDispatch has not been created yet.
+ */
+ memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);
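+
+ /*
+ * (Once filled in, each element of indexes[] refers either to an entry in
+ * proute->partitions, for a leaf partition, or to an entry in
+ * proute->partition_dispatch_info, for a sub-partitioned child; callers
+ * such as ExecFindPartition consult the partition descriptor's is_leaf[]
+ * flags to tell the two cases apart.)
+ */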
+
+ /* Track in PartitionTupleRouting for later use */
+ dispatchidx = proute->num_dispatch++;
+
+ /* Allocate or enlarge the array, as needed */
+ if (proute->num_dispatch >= proute->max_dispatch)
+ {
+ if (proute->max_dispatch == 0)
+ {
+ proute->max_dispatch = 4;
+ proute->partition_dispatch_info = (PartitionDispatch *)
+ palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
+ proute->nonleaf_partitions = (ResultRelInfo **)
+ palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
+ }
+ else
+ {
+ proute->max_dispatch *= 2;
+ proute->partition_dispatch_info = (PartitionDispatch *)
+ repalloc(proute->partition_dispatch_info,
+ sizeof(PartitionDispatch) * proute->max_dispatch);
+ proute->nonleaf_partitions = (ResultRelInfo **)
+ repalloc(proute->nonleaf_partitions,
+ sizeof(ResultRelInfo *) * proute->max_dispatch);
+ }
+ }
+ proute->partition_dispatch_info[dispatchidx] = pd;
+
+ /*
+ * If setting up a PartitionDispatch for a sub-partitioned table, we may
+ * also need a minimally valid ResultRelInfo for checking the partition
+ * constraint later; set that up now.
+ */
+ if (parent_pd)
+ {
+ ResultRelInfo *rri = makeNode(ResultRelInfo);
+
+ InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
+ proute->nonleaf_partitions[dispatchidx] = rri;
+ }
+ else
+ proute->nonleaf_partitions[dispatchidx] = NULL;
+
+ /*
+ * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
+ * install a downlink in the parent to allow quick descent.
+ */
+ if (parent_pd)
+ {
+ Assert(parent_pd->indexes[partidx] == -1);
+ parent_pd->indexes[partidx] = dispatchidx;
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return pd;
+}
+
+/*
+ * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
+ * routing.
+ *
+ * Close all the partitioned tables, leaf partitions, and their indices.
+ */
+void
+ExecCleanupTupleRouting(ModifyTableState *mtstate,
+ PartitionTupleRouting *proute)
+{
+ int i;
+
+ /*
+ * Remember, proute->partition_dispatch_info[0] corresponds to the root
+ * partitioned table, which we must not try to close, because it is the
+ * main target table of the query that will be closed by callers such as
+ * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root
+ * partitioned table.
+ */
+ for (i = 1; i < proute->num_dispatch; i++)
+ {
+ PartitionDispatch pd = proute->partition_dispatch_info[i];
+
+ table_close(pd->reldesc, NoLock);
+
+ if (pd->tupslot)
+ ExecDropSingleTupleTableSlot(pd->tupslot);
+ }
+
+ for (i = 0; i < proute->num_partitions; i++)
+ {
+ ResultRelInfo *resultRelInfo = proute->partitions[i];
+
+ /* Allow any FDWs to shut down */
+ if (resultRelInfo->ri_FdwRoutine != NULL &&
+ resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
+ resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
+ resultRelInfo);
+
+ /*
+ * Close it if it's not one of the result relations borrowed from the
+ * owning ModifyTableState; those will be closed by ExecEndPlan().
+ */
+ if (proute->is_borrowed_rel[i])
+ continue;
+
+ ExecCloseIndices(resultRelInfo);
+ table_close(resultRelInfo->ri_RelationDesc, NoLock);
+ }
+}
+
+/* ----------------
+ * FormPartitionKeyDatum
+ * Construct values[] and isnull[] arrays for the partition key
+ * of a tuple.
+ *
+ * pd Partition dispatch object of the partitioned table
+ * slot Heap tuple from which to extract partition key
+ * estate executor state for evaluating any partition key
+ * expressions (must be non-NULL)
+ * values Array of partition key Datums (output area)
+ * isnull Array of is-null indicators (output area)
+ *
+ * the ecxt_scantuple slot of estate's per-tuple expr context must point to
+ * the heap tuple passed in.
+ * ----------------
+ */
+static void
+FormPartitionKeyDatum(PartitionDispatch pd,
+ TupleTableSlot *slot,
+ EState *estate,
+ Datum *values,
+ bool *isnull)
+{
+ ListCell *partexpr_item;
+ int i;
+
+ if (pd->key->partexprs != NIL && pd->keystate == NIL)
+ {
+ /* Check caller has set up context correctly */
+ Assert(estate != NULL &&
+ GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
+
+ /* First time through, set up expression evaluation state */
+ pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
+ }
+
+ partexpr_item = list_head(pd->keystate);
+ for (i = 0; i < pd->key->partnatts; i++)
+ {
+ AttrNumber keycol = pd->key->partattrs[i];
+ Datum datum;
+ bool isNull;
+
+ if (keycol != 0)
+ {
+ /* Plain column; get the value directly from the heap tuple */
+ datum = slot_getattr(slot, keycol, &isNull);
+ }
+ else
+ {
+ /* Expression; need to evaluate it */
+ if (partexpr_item == NULL)
+ elog(ERROR, "wrong number of partition key expressions");
+ datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
+ GetPerTupleExprContext(estate),
+ &isNull);
+ partexpr_item = lnext(pd->keystate, partexpr_item);
+ }
+ values[i] = datum;
+ isnull[i] = isNull;
+ }
+
+ if (partexpr_item != NULL)
+ elog(ERROR, "wrong number of partition key expressions");
+}
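+
+/*
+ * For illustration (hypothetical definition): for a table declared
+ * PARTITION BY RANGE (a, (b + 0)), FormPartitionKeyDatum fetches values[0]
+ * directly from column "a" of the slot and computes values[1] by evaluating
+ * the expression b + 0 in the per-tuple expression context.
+ */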
+
+/*
+ * get_partition_for_tuple
+ * Finds the partition of the relation that accepts the partition key
+ * specified in values and isnull.
+ *
+ * The return value is the index of the partition (>= 0 and <
+ * partdesc->nparts) if one is found, or -1 if none is.
+ */
+static int
+get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
+{
+ int bound_offset;
+ int part_index = -1;
+ PartitionKey key = pd->key;
+ PartitionDesc partdesc = pd->partdesc;
+ PartitionBoundInfo boundinfo = partdesc->boundinfo;
+
+ /* Route as appropriate based on partitioning strategy. */
+ switch (key->strategy)
+ {
+ case PARTITION_STRATEGY_HASH:
+ {
+ uint64 rowHash;
+
+ rowHash = compute_partition_hash_value(key->partnatts,
+ key->partsupfunc,
+ key->partcollation,
+ values, isnull);
+
+ part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
+ }
+ break;
+
+ case PARTITION_STRATEGY_LIST:
+ if (isnull[0])
+ {
+ if (partition_bound_accepts_nulls(boundinfo))
+ part_index = boundinfo->null_index;
+ }
+ else
+ {
+ bool equal = false;
+
+ bound_offset = partition_list_bsearch(key->partsupfunc,
+ key->partcollation,
+ boundinfo,
+ values[0], &equal);
+ if (bound_offset >= 0 && equal)
+ part_index = boundinfo->indexes[bound_offset];
+ }
+ break;
+
+ case PARTITION_STRATEGY_RANGE:
+ {
+ bool equal = false,
+ range_partkey_has_null = false;
+ int i;
+
+ /*
+ * No range includes NULL, so this will be accepted by the
+ * default partition if there is one, and otherwise rejected.
+ */
+ for (i = 0; i < key->partnatts; i++)
+ {
+ if (isnull[i])
+ {
+ range_partkey_has_null = true;
+ break;
+ }
+ }
+
+ if (!range_partkey_has_null)
+ {
+ bound_offset = partition_range_datum_bsearch(key->partsupfunc,
+ key->partcollation,
+ boundinfo,
+ key->partnatts,
+ values,
+ &equal);
+
+ /*
+ * The bound at bound_offset is less than or equal to the
+ * tuple value, so the bound at offset+1 is the upper
+ * bound of the partition we're looking for, if there
+ * actually exists one.
+ */
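+
+ /*
+ * A worked example (hypothetical bounds): with range bounds
+ * {10, 20, 30} and a tuple value of 25, the bsearch returns the
+ * offset of bound 20, and indexes[offset + 1] identifies the
+ * partition covering [20, 30). If no partition covers that
+ * range, the indexes[] entry is -1 and the default partition
+ * (if any) is tried below.
+ */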
+ part_index = boundinfo->indexes[bound_offset + 1];
+ }
+ }
+ break;
+
+ default:
+ elog(ERROR, "unexpected partition strategy: %d",
+ (int) key->strategy);
+ }
+
+ /*
+ * part_index < 0 means we failed to find a partition of this parent. Use
+ * the default partition, if there is one.
+ */
+ if (part_index < 0)
+ part_index = boundinfo->default_index;
+
+ return part_index;
+}
+
+/*
+ * ExecBuildSlotPartitionKeyDescription
+ *
+ * This works very much like BuildIndexValueDescription() and is currently
+ * used for building error messages when ExecFindPartition() fails to find
+ * partition for a row.
+ */
+static char *
+ExecBuildSlotPartitionKeyDescription(Relation rel,
+ Datum *values,
+ bool *isnull,
+ int maxfieldlen)
+{
+ StringInfoData buf;
+ PartitionKey key = RelationGetPartitionKey(rel);
+ int partnatts = get_partition_natts(key);
+ int i;
+ Oid relid = RelationGetRelid(rel);
+ AclResult aclresult;
+
+ if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
+ return NULL;
+
+ /* If the user has table-level access, just go build the description. */
+ aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
+ if (aclresult != ACLCHECK_OK)
+ {
+ /*
+ * Step through the columns of the partition key and make sure the
+ * user has SELECT rights on all of them.
+ */
+ for (i = 0; i < partnatts; i++)
+ {
+ AttrNumber attnum = get_partition_col_attnum(key, i);
+
+ /*
+ * If this partition key column is an expression, we return no
+ * detail rather than try to figure out what column(s) the
+ * expression includes and if the user has SELECT rights on them.
+ */
+ if (attnum == InvalidAttrNumber ||
+ pg_attribute_aclcheck(relid, attnum, GetUserId(),
+ ACL_SELECT) != ACLCHECK_OK)
+ return NULL;
+ }
+ }
+
+ initStringInfo(&buf);
+ appendStringInfo(&buf, "(%s) = (",
+ pg_get_partkeydef_columns(relid, true));
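+ /* e.g., this might yield "(a, (b + 0)) = (42, null)" for a two-column key */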
+
+ for (i = 0; i < partnatts; i++)
+ {
+ char *val;
+ int vallen;
+
+ if (isnull[i])
+ val = "null";
+ else
+ {
+ Oid foutoid;
+ bool typisvarlena;
+
+ getTypeOutputInfo(get_partition_col_typid(key, i),
+ &foutoid, &typisvarlena);
+ val = OidOutputFunctionCall(foutoid, values[i]);
+ }
+
+ if (i > 0)
+ appendStringInfoString(&buf, ", ");
+
+ /* truncate if needed */
+ vallen = strlen(val);
+ if (vallen <= maxfieldlen)
+ appendBinaryStringInfo(&buf, val, vallen);
+ else
+ {
+ vallen = pg_mbcliplen(val, vallen, maxfieldlen);
+ appendBinaryStringInfo(&buf, val, vallen);
+ appendStringInfoString(&buf, "...");
+ }
+ }
+
+ appendStringInfoChar(&buf, ')');
+
+ return buf.data;
+}
+
+/*
+ * adjust_partition_colnos
+ * Adjust the list of UPDATE target column numbers to account for
+ * attribute differences between the parent and the partition.
+ */
+static List *
+adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri)
+{
+ List *new_colnos = NIL;
+ TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri);
+ AttrMap *attrMap;
+ ListCell *lc;
+
+ Assert(map != NULL); /* else we shouldn't be here */
+ attrMap = map->attrMap;
+
+ foreach(lc, colnos)
+ {
+ AttrNumber parentattrno = lfirst_int(lc);
+
+ if (parentattrno <= 0 ||
+ parentattrno > attrMap->maplen ||
+ attrMap->attnums[parentattrno - 1] == 0)
+ elog(ERROR, "unexpected attno %d in target column list",
+ parentattrno);
+ new_colnos = lappend_int(new_colnos,
+ attrMap->attnums[parentattrno - 1]);
+ }
+
+ return new_colnos;
+}
+
+/*-------------------------------------------------------------------------
+ * Run-Time Partition Pruning Support.
+ *
+ * The following series of functions exists to support the removal of unneeded
+ * subplans for queries against partitioned tables. The supporting functions
+ * here are designed to work with any plan type which supports an arbitrary
+ * number of subplans, e.g. Append, MergeAppend.
+ *
+ * When pruning involves comparison of a partition key to a constant, it's
+ * done by the planner. However, if we have a comparison to a stable but
+ * non-constant expression, that presents an opportunity for run-time
+ * pruning by the executor, allowing irrelevant partitions to be skipped
+ * dynamically.
+ *
+ * We must distinguish expressions containing PARAM_EXEC Params from
+ * expressions that don't contain those. Even though a PARAM_EXEC Param is
+ * considered to be a stable expression, it can change value from one plan
+ * node scan to the next during query execution. Stable comparison
+ * expressions that don't involve such Params allow partition pruning to be
+ * done once during executor startup. Expressions that do involve such Params
+ * require us to prune separately for each scan of the parent plan node.
+ *
+ * Note that pruning away unneeded subplans during executor startup has the
+ * added benefit of not having to initialize the unneeded subplans at all.
+ *
+ *
+ * Functions:
+ *
+ * ExecCreatePartitionPruneState:
+ * Creates the PartitionPruneState required by each of the two pruning
+ * functions. Details stored include how to map the partition index
+ * returned by the partition pruning code into subplan indexes.
+ *
+ * ExecFindInitialMatchingSubPlans:
+ * Returns indexes of matching subplans. Partition pruning is attempted
+ * without any evaluation of expressions containing PARAM_EXEC Params.
+ * This function must be called during executor startup for the parent
+ * plan before the subplans themselves are initialized. Subplans which
+ * are found not to match by this function must be removed from the
+ * plan's list of subplans during execution, as this function performs a
+ * remap of the partition index to subplan index map and the newly
+ * created map provides indexes only for subplans which remain after
+ * calling this function.
+ *
+ * ExecFindMatchingSubPlans:
+ * Returns indexes of matching subplans after evaluating all available
+ * expressions. This function can only be called during execution and
+ * must be called again each time the value of a Param listed in
+ * PartitionPruneState's 'execparamids' changes.
+ *-------------------------------------------------------------------------
+ */
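+
+/*
+ * An illustrative sketch (hypothetical schema): for
+ * SELECT * FROM parted WHERE partkey = $1
+ * a generic plan keeps one subplan per partition under an Append; the
+ * external parameter is stable for the whole execution, so initial pruning
+ * at executor startup can discard all but the matching subplan. If instead
+ * the comparison value is supplied by a PARAM_EXEC Param, e.g. on the inner
+ * side of a nested loop that is re-parameterized per outer row, pruning has
+ * to be repeated at each rescan via ExecFindMatchingSubPlans.
+ */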
+
+/*
+ * ExecCreatePartitionPruneState
+ * Build the data structure required for calling
+ * ExecFindInitialMatchingSubPlans and ExecFindMatchingSubPlans.
+ *
+ * 'planstate' is the parent plan node's execution state.
+ *
+ * 'partitionpruneinfo' is a PartitionPruneInfo as generated by
+ * make_partition_pruneinfo. Here we build a PartitionPruneState containing a
+ * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of
+ * partitionpruneinfo->prune_infos), each of which contains a
+ * PartitionedRelPruningData for each PartitionedRelPruneInfo appearing in
+ * that sublist. This two-level system is needed to keep from confusing the
+ * different hierarchies when a UNION ALL contains multiple partitioned tables
+ * as children. The data stored in each PartitionedRelPruningData can be
+ * re-used each time we re-evaluate which partitions match the pruning steps
+ * provided in each PartitionedRelPruneInfo.
+ */
+PartitionPruneState *
+ExecCreatePartitionPruneState(PlanState *planstate,
+ PartitionPruneInfo *partitionpruneinfo)
+{
+ EState *estate = planstate->state;
+ PartitionPruneState *prunestate;
+ int n_part_hierarchies;
+ ListCell *lc;
+ int i;
+
+ /* For data reading, the executor always includes detached partitions */
+ if (estate->es_partition_directory == NULL)
+ estate->es_partition_directory =
+ CreatePartitionDirectory(estate->es_query_cxt, false);
+
+ n_part_hierarchies = list_length(partitionpruneinfo->prune_infos);
+ Assert(n_part_hierarchies > 0);
+
+ /*
+ * Allocate the data structure
+ */
+ prunestate = (PartitionPruneState *)
+ palloc(offsetof(PartitionPruneState, partprunedata) +
+ sizeof(PartitionPruningData *) * n_part_hierarchies);
+
+ prunestate->execparamids = NULL;
+ /* other_subplans can change at runtime, so we need our own copy */
+ prunestate->other_subplans = bms_copy(partitionpruneinfo->other_subplans);
+ prunestate->do_initial_prune = false; /* may be set below */
+ prunestate->do_exec_prune = false; /* may be set below */
+ prunestate->num_partprunedata = n_part_hierarchies;
+
+ /*
+ * Create a short-term memory context which we'll use when making calls to
+ * the partition pruning functions. This avoids possible memory leaks,
+ * since the pruning functions call comparison functions that aren't under
+ * our control.
+ */
+ prunestate->prune_context =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "Partition Prune",
+ ALLOCSET_DEFAULT_SIZES);
+
+ i = 0;
+ foreach(lc, partitionpruneinfo->prune_infos)
+ {
+ List *partrelpruneinfos = lfirst_node(List, lc);
+ int npartrelpruneinfos = list_length(partrelpruneinfos);
+ PartitionPruningData *prunedata;
+ ListCell *lc2;
+ int j;
+
+ prunedata = (PartitionPruningData *)
+ palloc(offsetof(PartitionPruningData, partrelprunedata) +
+ npartrelpruneinfos * sizeof(PartitionedRelPruningData));
+ prunestate->partprunedata[i] = prunedata;
+ prunedata->num_partrelprunedata = npartrelpruneinfos;
+
+ j = 0;
+ foreach(lc2, partrelpruneinfos)
+ {
+ PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2);
+ PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
+ Relation partrel;
+ PartitionDesc partdesc;
+ PartitionKey partkey;
+
+ /*
+ * We can rely on the copies of the partitioned table's partition
+ * key and partition descriptor appearing in its relcache entry,
+ * because that entry will be held open and locked for the
+ * duration of this executor run.
+ */
+ partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex);
+ partkey = RelationGetPartitionKey(partrel);
+ partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
+ partrel);
+
+ /*
+ * Initialize the subplan_map and subpart_map.
+ *
+ * Because we request detached partitions to be included, and
+ * detaching waits for old transactions, it is safe to assume that
+ * no partitions have disappeared since this query was planned.
+ *
+ * However, new partitions may have been added.
+ */
+ Assert(partdesc->nparts >= pinfo->nparts);
+ pprune->nparts = partdesc->nparts;
+ pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts);
+ if (partdesc->nparts == pinfo->nparts)
+ {
+ /*
+ * There are no new partitions, so this is simple. We can
+ * simply point to the subpart_map from the plan, but we must
+ * copy the subplan_map since we may change it later.
+ */
+ pprune->subpart_map = pinfo->subpart_map;
+ memcpy(pprune->subplan_map, pinfo->subplan_map,
+ sizeof(int) * pinfo->nparts);
+
+ /*
+ * Double-check that the list of unpruned relations has not
+ * changed. (Pruned partitions are not in relid_map[].)
+ */
+#ifdef USE_ASSERT_CHECKING
+ for (int k = 0; k < pinfo->nparts; k++)
+ {
+ Assert(partdesc->oids[k] == pinfo->relid_map[k] ||
+ pinfo->subplan_map[k] == -1);
+ }
+#endif
+ }
+ else
+ {
+ int pd_idx = 0;
+ int pp_idx;
+
+ /*
+ * Some new partitions have appeared since plan time, and
+ * those are reflected in our PartitionDesc but were not
+ * present in the one used to construct subplan_map and
+ * subpart_map. So we must construct new and longer arrays
+ * where the partitions that were originally present map to
+ * the same sub-structures, and any added partitions map to
+ * -1, as if the new partitions had been pruned.
+ *
+ * Note: pinfo->relid_map[] may contain InvalidOid entries for
+ * partitions pruned by the planner. We cannot tell exactly
+ * which of the partdesc entries these correspond to, but we
+ * don't have to; just skip over them. The non-pruned
+ * relid_map entries, however, had better be a subset of the
+ * partdesc entries and in the same order.
+ */
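+
+ /*
+ * For illustration (hypothetical plan): if the plan knew partitions
+ * {p1, p2, p3} with subplan_map = {0, 1, 2} and a partition p1b has
+ * since been attached between p1 and p2, the rebuilt arrays become
+ * subplan_map = {0, -1, 1, 2}, i.e. the new partition is treated as
+ * already pruned.
+ */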
+ pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts);
+ for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++)
+ {
+ /* Skip any InvalidOid relid_map entries */
+ while (pd_idx < pinfo->nparts &&
+ !OidIsValid(pinfo->relid_map[pd_idx]))
+ pd_idx++;
+
+ if (pd_idx < pinfo->nparts &&
+ pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx])
+ {
+ /* match... */
+ pprune->subplan_map[pp_idx] =
+ pinfo->subplan_map[pd_idx];
+ pprune->subpart_map[pp_idx] =
+ pinfo->subpart_map[pd_idx];
+ pd_idx++;
+ }
+ else
+ {
+ /* this partdesc entry is not in the plan */
+ pprune->subplan_map[pp_idx] = -1;
+ pprune->subpart_map[pp_idx] = -1;
+ }
+ }
+
+ /*
+ * It might seem that we need to skip any trailing InvalidOid
+ * entries in pinfo->relid_map before checking that we scanned
+ * all of the relid_map. But we will have skipped them above,
+ * because they must correspond to some partdesc->oids
+ * entries; we just couldn't tell which.
+ */
+ if (pd_idx != pinfo->nparts)
+ elog(ERROR, "could not match partition child tables to plan elements");
+ }
+
+ /* present_parts is also subject to later modification */
+ pprune->present_parts = bms_copy(pinfo->present_parts);
+
+ /*
+ * Initialize pruning contexts as needed.
+ */
+ pprune->initial_pruning_steps = pinfo->initial_pruning_steps;
+ if (pinfo->initial_pruning_steps)
+ {
+ ExecInitPruningContext(&pprune->initial_context,
+ pinfo->initial_pruning_steps,
+ partdesc, partkey, planstate);
+ /* Record whether initial pruning is needed at any level */
+ prunestate->do_initial_prune = true;
+ }
+ pprune->exec_pruning_steps = pinfo->exec_pruning_steps;
+ if (pinfo->exec_pruning_steps)
+ {
+ ExecInitPruningContext(&pprune->exec_context,
+ pinfo->exec_pruning_steps,
+ partdesc, partkey, planstate);
+ /* Record whether exec pruning is needed at any level */
+ prunestate->do_exec_prune = true;
+ }
+
+ /*
+ * Accumulate the IDs of all PARAM_EXEC Params affecting the
+ * partitioning decisions at this plan node.
+ */
+ prunestate->execparamids = bms_add_members(prunestate->execparamids,
+ pinfo->execparamids);
+
+ j++;
+ }
+ i++;
+ }
+
+ return prunestate;
+}
+
+/*
+ * Initialize a PartitionPruneContext for the given list of pruning steps.
+ */
+static void
+ExecInitPruningContext(PartitionPruneContext *context,
+ List *pruning_steps,
+ PartitionDesc partdesc,
+ PartitionKey partkey,
+ PlanState *planstate)
+{
+ int n_steps;
+ int partnatts;
+ ListCell *lc;
+
+ n_steps = list_length(pruning_steps);
+
+ context->strategy = partkey->strategy;
+ context->partnatts = partnatts = partkey->partnatts;
+ context->nparts = partdesc->nparts;
+ context->boundinfo = partdesc->boundinfo;
+ context->partcollation = partkey->partcollation;
+ context->partsupfunc = partkey->partsupfunc;
+
+ /* We'll look up type-specific support functions as needed */
+ context->stepcmpfuncs = (FmgrInfo *)
+ palloc0(sizeof(FmgrInfo) * n_steps * partnatts);
+
+ context->ppccontext = CurrentMemoryContext;
+ context->planstate = planstate;
+
+ /* Initialize expression state for each expression we need */
+ context->exprstates = (ExprState **)
+ palloc0(sizeof(ExprState *) * n_steps * partnatts);
+ foreach(lc, pruning_steps)
+ {
+ PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
+ ListCell *lc2;
+ int keyno;
+
+ /* not needed for other step kinds */
+ if (!IsA(step, PartitionPruneStepOp))
+ continue;
+
+ Assert(list_length(step->exprs) <= partnatts);
+
+ keyno = 0;
+ foreach(lc2, step->exprs)
+ {
+ Expr *expr = (Expr *) lfirst(lc2);
+
+ /* not needed for Consts */
+ if (!IsA(expr, Const))
+ {
+ int stateidx = PruneCxtStateIdx(partnatts,
+ step->step.step_id,
+ keyno);
+
+ context->exprstates[stateidx] =
+ ExecInitExpr(expr, context->planstate);
+ }
+ keyno++;
+ }
+ }
+}
+
+/*
+ * ExecFindInitialMatchingSubPlans
+ * Identify the set of subplans that cannot be eliminated by initial
+ * pruning, disregarding any pruning constraints involving PARAM_EXEC
+ * Params.
+ *
+ * If additional pruning passes will be required (because of PARAM_EXEC
+ * Params), we must also update the translation data that allows conversion
+ * of partition indexes into subplan indexes to account for the unneeded
+ * subplans having been removed.
+ *
+ * Must only be called once per 'prunestate', and only if initial pruning
+ * is required.
+ *
+ * 'nsubplans' must be passed as the total number of unpruned subplans.
+ */
+Bitmapset *
+ExecFindInitialMatchingSubPlans(PartitionPruneState *prunestate, int nsubplans)
+{
+ Bitmapset *result = NULL;
+ MemoryContext oldcontext;
+ int i;
+
+ /* Caller error if we get here without do_initial_prune */
+ Assert(prunestate->do_initial_prune);
+
+ /*
+ * Switch to a temp context to avoid leaking memory in the executor's
+ * query-lifespan memory context.
+ */
+ oldcontext = MemoryContextSwitchTo(prunestate->prune_context);
+
+ /*
+ * For each hierarchy, do the pruning tests, and add nondeletable
+ * subplans' indexes to "result".
+ */
+ for (i = 0; i < prunestate->num_partprunedata; i++)
+ {
+ PartitionPruningData *prunedata;
+ PartitionedRelPruningData *pprune;
+
+ prunedata = prunestate->partprunedata[i];
+ pprune = &prunedata->partrelprunedata[0];
+
+ /* Perform pruning without using PARAM_EXEC Params */
+ find_matching_subplans_recurse(prunedata, pprune, true, &result);
+
+ /* Expression eval may have used space in node's ps_ExprContext too */
+ if (pprune->initial_pruning_steps)
+ ResetExprContext(pprune->initial_context.planstate->ps_ExprContext);
+ }
+
+ /* Add in any subplans that partition pruning didn't account for */
+ result = bms_add_members(result, prunestate->other_subplans);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /* Copy result out of the temp context before we reset it */
+ result = bms_copy(result);
+
+ MemoryContextReset(prunestate->prune_context);
+
+ /*
+ * If exec-time pruning is required and we pruned subplans above, then we
+ * must re-sequence the subplan indexes so that ExecFindMatchingSubPlans
+ * properly returns the indexes from the subplans which will remain after
+ * execution of this function.
+ *
+ * We can safely skip this when !do_exec_prune, even though that leaves
+ * invalid data in prunestate, because that data won't be consulted again
+ * (cf initial Assert in ExecFindMatchingSubPlans).
+ */
+ if (prunestate->do_exec_prune && bms_num_members(result) < nsubplans)
+ {
+ int *new_subplan_indexes;
+ Bitmapset *new_other_subplans;
+ int i;
+ int newidx;
+
+ /*
+ * First we must build a temporary array which maps old subplan
+ * indexes to new ones. For convenience of initialization, we use
+ * 1-based indexes in this array and leave pruned items as 0.
+ */
+ new_subplan_indexes = (int *) palloc0(sizeof(int) * nsubplans);
+ newidx = 1;
+ i = -1;
+ while ((i = bms_next_member(result, i)) >= 0)
+ {
+ Assert(i < nsubplans);
+ new_subplan_indexes[i] = newidx++;
+ }
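+
+ /*
+ * For example (hypothetical numbers): with nsubplans = 4 and
+ * result = {0, 2}, this yields new_subplan_indexes = {1, 0, 2, 0},
+ * so old subplan 2 becomes new subplan 1 (after the -1 adjustment
+ * applied below) and subplans 1 and 3 are marked as pruned.
+ */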
+
+ /*
+ * Now we can update each PartitionedRelPruneInfo's subplan_map with
+ * new subplan indexes. We must also recompute its present_parts
+ * bitmap.
+ */
+ for (i = 0; i < prunestate->num_partprunedata; i++)
+ {
+ PartitionPruningData *prunedata = prunestate->partprunedata[i];
+ int j;
+
+ /*
+ * Within each hierarchy, we perform this loop in back-to-front
+ * order so that we determine present_parts for the lowest-level
+ * partitioned tables first. This way we can tell whether a
+ * sub-partitioned table's partitions were entirely pruned so we
+ * can exclude it from the current level's present_parts.
+ */
+ for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--)
+ {
+ PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
+ int nparts = pprune->nparts;
+ int k;
+
+ /* We just rebuild present_parts from scratch */
+ bms_free(pprune->present_parts);
+ pprune->present_parts = NULL;
+
+ for (k = 0; k < nparts; k++)
+ {
+ int oldidx = pprune->subplan_map[k];
+ int subidx;
+
+ /*
+ * If this partition existed as a subplan then change the
+ * old subplan index to the new subplan index. The new
+ * index may become -1 if the partition was pruned above,
+ * or it may just come earlier in the subplan list due to
+ * some subplans being removed earlier in the list. If
+ * it's a subpartition, add it to present_parts unless
+ * it's entirely pruned.
+ */
+ if (oldidx >= 0)
+ {
+ Assert(oldidx < nsubplans);
+ pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1;
+
+ if (new_subplan_indexes[oldidx] > 0)
+ pprune->present_parts =
+ bms_add_member(pprune->present_parts, k);
+ }
+ else if ((subidx = pprune->subpart_map[k]) >= 0)
+ {
+ PartitionedRelPruningData *subprune;
+
+ subprune = &prunedata->partrelprunedata[subidx];
+
+ if (!bms_is_empty(subprune->present_parts))
+ pprune->present_parts =
+ bms_add_member(pprune->present_parts, k);
+ }
+ }
+ }
+ }
+
+ /*
+ * We must also recompute the other_subplans set, since indexes in it
+ * may change.
+ */
+ new_other_subplans = NULL;
+ i = -1;
+ while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0)
+ new_other_subplans = bms_add_member(new_other_subplans,
+ new_subplan_indexes[i] - 1);
+
+ bms_free(prunestate->other_subplans);
+ prunestate->other_subplans = new_other_subplans;
+
+ pfree(new_subplan_indexes);
+ }
+
+ return result;
+}
+
+/*
+ * ExecFindMatchingSubPlans
+ * Determine which subplans match the pruning steps detailed in
+ * 'prunestate' for the current comparison expression values.
+ *
+ * Here we assume we may evaluate PARAM_EXEC Params.
+ */
+Bitmapset *
+ExecFindMatchingSubPlans(PartitionPruneState *prunestate)
+{
+ Bitmapset *result = NULL;
+ MemoryContext oldcontext;
+ int i;
+
+ /*
+ * If !do_exec_prune, we've got problems because
+ * ExecFindInitialMatchingSubPlans will not have bothered to update
+ * prunestate for whatever pruning it did.
+ */
+ Assert(prunestate->do_exec_prune);
+
+ /*
+ * Switch to a temp context to avoid leaking memory in the executor's
+ * query-lifespan memory context.
+ */
+ oldcontext = MemoryContextSwitchTo(prunestate->prune_context);
+
+ /*
+ * For each hierarchy, do the pruning tests, and add nondeletable
+ * subplans' indexes to "result".
+ */
+ for (i = 0; i < prunestate->num_partprunedata; i++)
+ {
+ PartitionPruningData *prunedata;
+ PartitionedRelPruningData *pprune;
+
+ prunedata = prunestate->partprunedata[i];
+ pprune = &prunedata->partrelprunedata[0];
+
+ find_matching_subplans_recurse(prunedata, pprune, false, &result);
+
+ /* Expression eval may have used space in node's ps_ExprContext too */
+ if (pprune->exec_pruning_steps)
+ ResetExprContext(pprune->exec_context.planstate->ps_ExprContext);
+ }
+
+ /* Add in any subplans that partition pruning didn't account for */
+ result = bms_add_members(result, prunestate->other_subplans);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /* Copy result out of the temp context before we reset it */
+ result = bms_copy(result);
+
+ MemoryContextReset(prunestate->prune_context);
+
+ return result;
+}
+
+/*
+ * find_matching_subplans_recurse
+ * Recursive worker function for ExecFindMatchingSubPlans and
+ * ExecFindInitialMatchingSubPlans
+ *
+ * Adds valid (non-prunable) subplan IDs to *validsubplans
+ */
+static void
+find_matching_subplans_recurse(PartitionPruningData *prunedata,
+ PartitionedRelPruningData *pprune,
+ bool initial_prune,
+ Bitmapset **validsubplans)
+{
+ Bitmapset *partset;
+ int i;
+
+ /* Guard against stack overflow due to overly deep partition hierarchy. */
+ check_stack_depth();
+
+ /* Only prune if pruning would be useful at this level. */
+ if (initial_prune && pprune->initial_pruning_steps)
+ {
+ partset = get_matching_partitions(&pprune->initial_context,
+ pprune->initial_pruning_steps);
+ }
+ else if (!initial_prune && pprune->exec_pruning_steps)
+ {
+ partset = get_matching_partitions(&pprune->exec_context,
+ pprune->exec_pruning_steps);
+ }
+ else
+ {
+ /*
+ * If no pruning is to be done, just include all partitions at this
+ * level.
+ */
+ partset = pprune->present_parts;
+ }
+
+ /* Translate partset into subplan indexes */
+ i = -1;
+ while ((i = bms_next_member(partset, i)) >= 0)
+ {
+ if (pprune->subplan_map[i] >= 0)
+ *validsubplans = bms_add_member(*validsubplans,
+ pprune->subplan_map[i]);
+ else
+ {
+ int partidx = pprune->subpart_map[i];
+
+ if (partidx >= 0)
+ find_matching_subplans_recurse(prunedata,
+ &prunedata->partrelprunedata[partidx],
+ initial_prune, validsubplans);
+ else
+ {
+ /*
+ * We get here if the planner already pruned all the sub-
+ * partitions for this partition. Silently ignore this
+ * partition in this case. The end result is the same: we
+ * would have pruned all partitions just the same, but we
+ * don't have any pruning steps to execute to verify this.
+ */
+ }
+ }
+ }
+}
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
new file mode 100644
index 0000000..1752b9b
--- /dev/null
+++ b/src/backend/executor/execProcnode.c
@@ -0,0 +1,981 @@
+/*-------------------------------------------------------------------------
+ *
+ * execProcnode.c
+ * contains dispatch functions which call the appropriate "initialize",
+ * "get a tuple", and "cleanup" routines for the given node type.
+ * If the node has children, then it will presumably call ExecInitNode,
+ * ExecProcNode, or ExecEndNode on its subnodes and do the appropriate
+ * processing.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execProcnode.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * NOTES
+ * This used to be three files. It is now all combined into
+ * one file so that it is easier to keep the dispatch routines
+ * in sync when new nodes are added.
+ *
+ * EXAMPLE
+ * Suppose we want the age of the manager of the shoe department and
+ * the number of employees in that department. So we have the query:
+ *
+ * select DEPT.no_emps, EMP.age
+ * from DEPT, EMP
+ * where EMP.name = DEPT.mgr and
+ * DEPT.name = "shoe"
+ *
+ * Suppose the planner gives us the following plan:
+ *
+ * Nest Loop (DEPT.mgr = EMP.name)
+ * / \
+ * / \
+ * Seq Scan Seq Scan
+ * DEPT EMP
+ * (name = "shoe")
+ *
+ * ExecutorStart() is called first.
+ * It calls InitPlan() which calls ExecInitNode() on
+ * the root of the plan -- the nest loop node.
+ *
+ * * ExecInitNode() notices that it is looking at a nest loop and
+ * as the code below demonstrates, it calls ExecInitNestLoop().
+ * Eventually this calls ExecInitNode() on the right and left subplans
+ * and so forth until the entire plan is initialized. The result
+ * of ExecInitNode() is a plan state tree built with the same structure
+ * as the underlying plan tree.
+ *
+ * * Then when ExecutorRun() is called, it calls ExecutePlan() which calls
+ * ExecProcNode() repeatedly on the top node of the plan state tree.
+ * Each time this happens, ExecProcNode() will end up calling
+ * ExecNestLoop(), which calls ExecProcNode() on its subplans.
+ * Each of these subplans is a sequential scan so ExecSeqScan() is
+ * called. The slots returned by ExecSeqScan() may contain
+ * tuples which contain the attributes ExecNestLoop() uses to
+ * form the tuples it returns.
+ *
+ * * Eventually ExecSeqScan() stops returning tuples and the nest
+ * loop join ends. Lastly, ExecutorEnd() calls ExecEndNode() which
+ * calls ExecEndNestLoop() which in turn calls ExecEndNode() on
+ * its subplans which result in ExecEndSeqScan().
+ *
+ * This should show how the executor works by having
+ * ExecInitNode(), ExecProcNode() and ExecEndNode() dispatch
+ * their work to the appropriate node support routines which may
+ * in turn call these routines themselves on their subplans.
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeAgg.h"
+#include "executor/nodeAppend.h"
+#include "executor/nodeBitmapAnd.h"
+#include "executor/nodeBitmapHeapscan.h"
+#include "executor/nodeBitmapIndexscan.h"
+#include "executor/nodeBitmapOr.h"
+#include "executor/nodeCtescan.h"
+#include "executor/nodeCustom.h"
+#include "executor/nodeForeignscan.h"
+#include "executor/nodeFunctionscan.h"
+#include "executor/nodeGather.h"
+#include "executor/nodeGatherMerge.h"
+#include "executor/nodeGroup.h"
+#include "executor/nodeHash.h"
+#include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
+#include "executor/nodeIndexonlyscan.h"
+#include "executor/nodeIndexscan.h"
+#include "executor/nodeLimit.h"
+#include "executor/nodeLockRows.h"
+#include "executor/nodeMaterial.h"
+#include "executor/nodeMemoize.h"
+#include "executor/nodeMergeAppend.h"
+#include "executor/nodeMergejoin.h"
+#include "executor/nodeModifyTable.h"
+#include "executor/nodeNamedtuplestorescan.h"
+#include "executor/nodeNestloop.h"
+#include "executor/nodeProjectSet.h"
+#include "executor/nodeRecursiveunion.h"
+#include "executor/nodeResult.h"
+#include "executor/nodeSamplescan.h"
+#include "executor/nodeSeqscan.h"
+#include "executor/nodeSetOp.h"
+#include "executor/nodeSort.h"
+#include "executor/nodeSubplan.h"
+#include "executor/nodeSubqueryscan.h"
+#include "executor/nodeTableFuncscan.h"
+#include "executor/nodeTidrangescan.h"
+#include "executor/nodeTidscan.h"
+#include "executor/nodeUnique.h"
+#include "executor/nodeValuesscan.h"
+#include "executor/nodeWindowAgg.h"
+#include "executor/nodeWorktablescan.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+
+static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
+static TupleTableSlot *ExecProcNodeInstr(PlanState *node);
+
+
+/* ------------------------------------------------------------------------
+ * ExecInitNode
+ *
+ * Recursively initializes all the nodes in the plan tree rooted
+ * at 'node'.
+ *
+ * Inputs:
+ * 'node' is the current node of the plan produced by the query planner
+ * 'estate' is the shared execution state for the plan tree
+ * 'eflags' is a bitwise OR of flag bits described in executor.h
+ *
+ * Returns a PlanState node corresponding to the given Plan node.
+ * ------------------------------------------------------------------------
+ */
+PlanState *
+ExecInitNode(Plan *node, EState *estate, int eflags)
+{
+ PlanState *result;
+ List *subps;
+ ListCell *l;
+
+ /*
+ * do nothing when we get to the end of a leaf of the tree.
+ */
+ if (node == NULL)
+ return NULL;
+
+ /*
+ * Make sure there's enough stack available. Need to check here, in
+ * addition to ExecProcNode() (via ExecProcNodeFirst()), to ensure the
+ * stack isn't overrun while initializing the node tree.
+ */
+ check_stack_depth();
+
+ switch (nodeTag(node))
+ {
+ /*
+ * control nodes
+ */
+ case T_Result:
+ result = (PlanState *) ExecInitResult((Result *) node,
+ estate, eflags);
+ break;
+
+ case T_ProjectSet:
+ result = (PlanState *) ExecInitProjectSet((ProjectSet *) node,
+ estate, eflags);
+ break;
+
+ case T_ModifyTable:
+ result = (PlanState *) ExecInitModifyTable((ModifyTable *) node,
+ estate, eflags);
+ break;
+
+ case T_Append:
+ result = (PlanState *) ExecInitAppend((Append *) node,
+ estate, eflags);
+ break;
+
+ case T_MergeAppend:
+ result = (PlanState *) ExecInitMergeAppend((MergeAppend *) node,
+ estate, eflags);
+ break;
+
+ case T_RecursiveUnion:
+ result = (PlanState *) ExecInitRecursiveUnion((RecursiveUnion *) node,
+ estate, eflags);
+ break;
+
+ case T_BitmapAnd:
+ result = (PlanState *) ExecInitBitmapAnd((BitmapAnd *) node,
+ estate, eflags);
+ break;
+
+ case T_BitmapOr:
+ result = (PlanState *) ExecInitBitmapOr((BitmapOr *) node,
+ estate, eflags);
+ break;
+
+ /*
+ * scan nodes
+ */
+ case T_SeqScan:
+ result = (PlanState *) ExecInitSeqScan((SeqScan *) node,
+ estate, eflags);
+ break;
+
+ case T_SampleScan:
+ result = (PlanState *) ExecInitSampleScan((SampleScan *) node,
+ estate, eflags);
+ break;
+
+ case T_IndexScan:
+ result = (PlanState *) ExecInitIndexScan((IndexScan *) node,
+ estate, eflags);
+ break;
+
+ case T_IndexOnlyScan:
+ result = (PlanState *) ExecInitIndexOnlyScan((IndexOnlyScan *) node,
+ estate, eflags);
+ break;
+
+ case T_BitmapIndexScan:
+ result = (PlanState *) ExecInitBitmapIndexScan((BitmapIndexScan *) node,
+ estate, eflags);
+ break;
+
+ case T_BitmapHeapScan:
+ result = (PlanState *) ExecInitBitmapHeapScan((BitmapHeapScan *) node,
+ estate, eflags);
+ break;
+
+ case T_TidScan:
+ result = (PlanState *) ExecInitTidScan((TidScan *) node,
+ estate, eflags);
+ break;
+
+ case T_TidRangeScan:
+ result = (PlanState *) ExecInitTidRangeScan((TidRangeScan *) node,
+ estate, eflags);
+ break;
+
+ case T_SubqueryScan:
+ result = (PlanState *) ExecInitSubqueryScan((SubqueryScan *) node,
+ estate, eflags);
+ break;
+
+ case T_FunctionScan:
+ result = (PlanState *) ExecInitFunctionScan((FunctionScan *) node,
+ estate, eflags);
+ break;
+
+ case T_TableFuncScan:
+ result = (PlanState *) ExecInitTableFuncScan((TableFuncScan *) node,
+ estate, eflags);
+ break;
+
+ case T_ValuesScan:
+ result = (PlanState *) ExecInitValuesScan((ValuesScan *) node,
+ estate, eflags);
+ break;
+
+ case T_CteScan:
+ result = (PlanState *) ExecInitCteScan((CteScan *) node,
+ estate, eflags);
+ break;
+
+ case T_NamedTuplestoreScan:
+ result = (PlanState *) ExecInitNamedTuplestoreScan((NamedTuplestoreScan *) node,
+ estate, eflags);
+ break;
+
+ case T_WorkTableScan:
+ result = (PlanState *) ExecInitWorkTableScan((WorkTableScan *) node,
+ estate, eflags);
+ break;
+
+ case T_ForeignScan:
+ result = (PlanState *) ExecInitForeignScan((ForeignScan *) node,
+ estate, eflags);
+ break;
+
+ case T_CustomScan:
+ result = (PlanState *) ExecInitCustomScan((CustomScan *) node,
+ estate, eflags);
+ break;
+
+ /*
+ * join nodes
+ */
+ case T_NestLoop:
+ result = (PlanState *) ExecInitNestLoop((NestLoop *) node,
+ estate, eflags);
+ break;
+
+ case T_MergeJoin:
+ result = (PlanState *) ExecInitMergeJoin((MergeJoin *) node,
+ estate, eflags);
+ break;
+
+ case T_HashJoin:
+ result = (PlanState *) ExecInitHashJoin((HashJoin *) node,
+ estate, eflags);
+ break;
+
+ /*
+ * materialization nodes
+ */
+ case T_Material:
+ result = (PlanState *) ExecInitMaterial((Material *) node,
+ estate, eflags);
+ break;
+
+ case T_Sort:
+ result = (PlanState *) ExecInitSort((Sort *) node,
+ estate, eflags);
+ break;
+
+ case T_IncrementalSort:
+ result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node,
+ estate, eflags);
+ break;
+
+ case T_Memoize:
+ result = (PlanState *) ExecInitMemoize((Memoize *) node, estate,
+ eflags);
+ break;
+
+ case T_Group:
+ result = (PlanState *) ExecInitGroup((Group *) node,
+ estate, eflags);
+ break;
+
+ case T_Agg:
+ result = (PlanState *) ExecInitAgg((Agg *) node,
+ estate, eflags);
+ break;
+
+ case T_WindowAgg:
+ result = (PlanState *) ExecInitWindowAgg((WindowAgg *) node,
+ estate, eflags);
+ break;
+
+ case T_Unique:
+ result = (PlanState *) ExecInitUnique((Unique *) node,
+ estate, eflags);
+ break;
+
+ case T_Gather:
+ result = (PlanState *) ExecInitGather((Gather *) node,
+ estate, eflags);
+ break;
+
+ case T_GatherMerge:
+ result = (PlanState *) ExecInitGatherMerge((GatherMerge *) node,
+ estate, eflags);
+ break;
+
+ case T_Hash:
+ result = (PlanState *) ExecInitHash((Hash *) node,
+ estate, eflags);
+ break;
+
+ case T_SetOp:
+ result = (PlanState *) ExecInitSetOp((SetOp *) node,
+ estate, eflags);
+ break;
+
+ case T_LockRows:
+ result = (PlanState *) ExecInitLockRows((LockRows *) node,
+ estate, eflags);
+ break;
+
+ case T_Limit:
+ result = (PlanState *) ExecInitLimit((Limit *) node,
+ estate, eflags);
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
+ result = NULL; /* keep compiler quiet */
+ break;
+ }
+
+ ExecSetExecProcNode(result, result->ExecProcNode);
+
+ /*
+ * Initialize any initPlans present in this node. The planner put them in
+ * a separate list for us.
+ */
+ subps = NIL;
+ foreach(l, node->initPlan)
+ {
+ SubPlan *subplan = (SubPlan *) lfirst(l);
+ SubPlanState *sstate;
+
+ Assert(IsA(subplan, SubPlan));
+ sstate = ExecInitSubPlan(subplan, result);
+ subps = lappend(subps, sstate);
+ }
+ result->initPlan = subps;
+
+ /* Set up instrumentation for this node if requested */
+ if (estate->es_instrument)
+ result->instrument = InstrAlloc(1, estate->es_instrument,
+ result->async_capable);
+
+ return result;
+}
+
+
+/*
+ * If a node wants to change its ExecProcNode function after ExecInitNode()
+ * has finished, it should do so with this function. That way any wrapper
+ * functions can be reinstalled, without the node having to know how that
+ * works.
+ */
+void
+ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function)
+{
+ /*
+ * Add a wrapper around the ExecProcNode callback that checks stack depth
+ * during the first execution and maybe adds an instrumentation wrapper.
+ * When the callback is changed after execution has already begun, that
+ * means we'll superfluously execute ExecProcNodeFirst, but that seems OK.
+ */
+ node->ExecProcNodeReal = function;
+ node->ExecProcNode = ExecProcNodeFirst;
+}
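+
+/*
+ * A hypothetical usage sketch: a node whose ExecProcNode callback should
+ * change once some one-time setup has been performed could do
+ *
+ * ExecSetExecProcNode(&mystate->ps, ExecMyNodeSteadyState);
+ *
+ * where ExecMyNodeSteadyState is a made-up per-node routine; the wrapper
+ * installed here takes care of re-doing the stack-depth check and of
+ * re-adding any instrumentation wrapper around the new callback.
+ */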
+
+
+/*
+ * ExecProcNode wrapper that performs some one-time checks, before calling
+ * the relevant node method (possibly via an instrumentation wrapper).
+ */
+static TupleTableSlot *
+ExecProcNodeFirst(PlanState *node)
+{
+ /*
+ * Perform stack depth check during the first execution of the node. We
+ * only do so the first time round because it turns out to not be cheap on
+ * some common architectures (e.g., x86). This relies on the assumption
+ * that ExecProcNode calls for a given plan node will always be made at
+ * roughly the same stack depth.
+ */
+ check_stack_depth();
+
+ /*
+ * If instrumentation is required, change the wrapper to one that just
+ * does instrumentation. Otherwise we can dispense with all wrappers and
+ * have ExecProcNode() directly call the relevant function from now on.
+ */
+ if (node->instrument)
+ node->ExecProcNode = ExecProcNodeInstr;
+ else
+ node->ExecProcNode = node->ExecProcNodeReal;
+
+ return node->ExecProcNode(node);
+}
+
+
+/*
+ * ExecProcNode wrapper that performs instrumentation calls. By keeping
+ * this a separate function, we avoid overhead in the normal case where
+ * no instrumentation is wanted.
+ */
+static TupleTableSlot *
+ExecProcNodeInstr(PlanState *node)
+{
+ TupleTableSlot *result;
+
+ InstrStartNode(node->instrument);
+
+ result = node->ExecProcNodeReal(node);
+
+ InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
+
+ return result;
+}
+
+
+/* ----------------------------------------------------------------
+ * MultiExecProcNode
+ *
+ * Execute a node that doesn't return individual tuples
+ * (it might return a hashtable, bitmap, etc.). Caller should
+ * check it got back the expected kind of Node.
+ *
+ * This has essentially the same responsibilities as ExecProcNode,
+ * but it does not do InstrStartNode/InstrStopNode (mainly because
+ * it can't tell how many returned tuples to count). Each per-node
+ * function must provide its own instrumentation support.
+ * ----------------------------------------------------------------
+ */
+Node *
+MultiExecProcNode(PlanState *node)
+{
+ Node *result;
+
+ check_stack_depth();
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (node->chgParam != NULL) /* something changed */
+ ExecReScan(node); /* let ReScan handle this */
+
+ switch (nodeTag(node))
+ {
+ /*
+ * Only node types that actually support multiexec will be listed
+ */
+
+ case T_HashState:
+ result = MultiExecHash((HashState *) node);
+ break;
+
+ case T_BitmapIndexScanState:
+ result = MultiExecBitmapIndexScan((BitmapIndexScanState *) node);
+ break;
+
+ case T_BitmapAndState:
+ result = MultiExecBitmapAnd((BitmapAndState *) node);
+ break;
+
+ case T_BitmapOrState:
+ result = MultiExecBitmapOr((BitmapOrState *) node);
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
+ result = NULL;
+ break;
+ }
+
+ return result;
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecEndNode
+ *
+ * Recursively cleans up all the nodes in the plan rooted
+ * at 'node'.
+ *
+ * After this operation, the query plan cannot be processed any
+ * further. This should be called only after
+ * the query plan has been fully executed.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndNode(PlanState *node)
+{
+ /*
+ * do nothing when we get to the end of a leaf of the tree.
+ */
+ if (node == NULL)
+ return;
+
+ /*
+ * Make sure there's enough stack available. Need to check here, in
+ * addition to ExecProcNode() (via ExecProcNodeFirst()), because it's not
+ * guaranteed that ExecProcNode() is reached for all nodes.
+ */
+ check_stack_depth();
+
+ if (node->chgParam != NULL)
+ {
+ bms_free(node->chgParam);
+ node->chgParam = NULL;
+ }
+
+ switch (nodeTag(node))
+ {
+ /*
+ * control nodes
+ */
+ case T_ResultState:
+ ExecEndResult((ResultState *) node);
+ break;
+
+ case T_ProjectSetState:
+ ExecEndProjectSet((ProjectSetState *) node);
+ break;
+
+ case T_ModifyTableState:
+ ExecEndModifyTable((ModifyTableState *) node);
+ break;
+
+ case T_AppendState:
+ ExecEndAppend((AppendState *) node);
+ break;
+
+ case T_MergeAppendState:
+ ExecEndMergeAppend((MergeAppendState *) node);
+ break;
+
+ case T_RecursiveUnionState:
+ ExecEndRecursiveUnion((RecursiveUnionState *) node);
+ break;
+
+ case T_BitmapAndState:
+ ExecEndBitmapAnd((BitmapAndState *) node);
+ break;
+
+ case T_BitmapOrState:
+ ExecEndBitmapOr((BitmapOrState *) node);
+ break;
+
+ /*
+ * scan nodes
+ */
+ case T_SeqScanState:
+ ExecEndSeqScan((SeqScanState *) node);
+ break;
+
+ case T_SampleScanState:
+ ExecEndSampleScan((SampleScanState *) node);
+ break;
+
+ case T_GatherState:
+ ExecEndGather((GatherState *) node);
+ break;
+
+ case T_GatherMergeState:
+ ExecEndGatherMerge((GatherMergeState *) node);
+ break;
+
+ case T_IndexScanState:
+ ExecEndIndexScan((IndexScanState *) node);
+ break;
+
+ case T_IndexOnlyScanState:
+ ExecEndIndexOnlyScan((IndexOnlyScanState *) node);
+ break;
+
+ case T_BitmapIndexScanState:
+ ExecEndBitmapIndexScan((BitmapIndexScanState *) node);
+ break;
+
+ case T_BitmapHeapScanState:
+ ExecEndBitmapHeapScan((BitmapHeapScanState *) node);
+ break;
+
+ case T_TidScanState:
+ ExecEndTidScan((TidScanState *) node);
+ break;
+
+ case T_TidRangeScanState:
+ ExecEndTidRangeScan((TidRangeScanState *) node);
+ break;
+
+ case T_SubqueryScanState:
+ ExecEndSubqueryScan((SubqueryScanState *) node);
+ break;
+
+ case T_FunctionScanState:
+ ExecEndFunctionScan((FunctionScanState *) node);
+ break;
+
+ case T_TableFuncScanState:
+ ExecEndTableFuncScan((TableFuncScanState *) node);
+ break;
+
+ case T_ValuesScanState:
+ ExecEndValuesScan((ValuesScanState *) node);
+ break;
+
+ case T_CteScanState:
+ ExecEndCteScan((CteScanState *) node);
+ break;
+
+ case T_NamedTuplestoreScanState:
+ ExecEndNamedTuplestoreScan((NamedTuplestoreScanState *) node);
+ break;
+
+ case T_WorkTableScanState:
+ ExecEndWorkTableScan((WorkTableScanState *) node);
+ break;
+
+ case T_ForeignScanState:
+ ExecEndForeignScan((ForeignScanState *) node);
+ break;
+
+ case T_CustomScanState:
+ ExecEndCustomScan((CustomScanState *) node);
+ break;
+
+ /*
+ * join nodes
+ */
+ case T_NestLoopState:
+ ExecEndNestLoop((NestLoopState *) node);
+ break;
+
+ case T_MergeJoinState:
+ ExecEndMergeJoin((MergeJoinState *) node);
+ break;
+
+ case T_HashJoinState:
+ ExecEndHashJoin((HashJoinState *) node);
+ break;
+
+ /*
+ * materialization nodes
+ */
+ case T_MaterialState:
+ ExecEndMaterial((MaterialState *) node);
+ break;
+
+ case T_SortState:
+ ExecEndSort((SortState *) node);
+ break;
+
+ case T_IncrementalSortState:
+ ExecEndIncrementalSort((IncrementalSortState *) node);
+ break;
+
+ case T_MemoizeState:
+ ExecEndMemoize((MemoizeState *) node);
+ break;
+
+ case T_GroupState:
+ ExecEndGroup((GroupState *) node);
+ break;
+
+ case T_AggState:
+ ExecEndAgg((AggState *) node);
+ break;
+
+ case T_WindowAggState:
+ ExecEndWindowAgg((WindowAggState *) node);
+ break;
+
+ case T_UniqueState:
+ ExecEndUnique((UniqueState *) node);
+ break;
+
+ case T_HashState:
+ ExecEndHash((HashState *) node);
+ break;
+
+ case T_SetOpState:
+ ExecEndSetOp((SetOpState *) node);
+ break;
+
+ case T_LockRowsState:
+ ExecEndLockRows((LockRowsState *) node);
+ break;
+
+ case T_LimitState:
+ ExecEndLimit((LimitState *) node);
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
+ break;
+ }
+}
+
+/*
+ * ExecShutdownNode
+ *
+ * Give execution nodes a chance to stop asynchronous resource consumption
+ * and release any resources still held.
+ */
+bool
+ExecShutdownNode(PlanState *node)
+{
+ if (node == NULL)
+ return false;
+
+ check_stack_depth();
+
+ /*
+ * Treat the node as running while we shut it down, but only if it's run
+ * at least once already. We don't expect much CPU consumption during
+ * node shutdown, but in the case of Gather or Gather Merge, we may shut
+ * down workers at this stage. If so, their buffer usage will get
+ * propagated into pgBufferUsage at this point, and we want to make sure
+ * that it gets associated with the Gather node. We skip this if the node
+ * has never been executed, so as to avoid incorrectly making it appear
+ * that it has.
+ */
+ if (node->instrument && node->instrument->running)
+ InstrStartNode(node->instrument);
+
+ planstate_tree_walker(node, ExecShutdownNode, NULL);
+
+ switch (nodeTag(node))
+ {
+ case T_GatherState:
+ ExecShutdownGather((GatherState *) node);
+ break;
+ case T_ForeignScanState:
+ ExecShutdownForeignScan((ForeignScanState *) node);
+ break;
+ case T_CustomScanState:
+ ExecShutdownCustomScan((CustomScanState *) node);
+ break;
+ case T_GatherMergeState:
+ ExecShutdownGatherMerge((GatherMergeState *) node);
+ break;
+ case T_HashState:
+ ExecShutdownHash((HashState *) node);
+ break;
+ case T_HashJoinState:
+ ExecShutdownHashJoin((HashJoinState *) node);
+ break;
+ default:
+ break;
+ }
+
+ /* Stop the node if we started it above, reporting 0 tuples. */
+ if (node->instrument && node->instrument->running)
+ InstrStopNode(node->instrument, 0);
+
+ return false;
+}
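+
+/*
+ * Illustrative sketch (not part of the original file): the executor's main
+ * loop calls ExecShutdownNode() on the root of the plan tree as soon as it
+ * knows that no further tuples will be demanded, for example right after the
+ * last tuple has been fetched.  The guard macro and function name below are
+ * placeholders; call sites of this kind live in execMain.c.
+ */
+#ifdef EXEC_SHUTDOWN_NODE_EXAMPLE
+static void
+example_shutdown_after_last_tuple(PlanState *planstate, TupleTableSlot *slot)
+{
+	if (TupIsNull(slot))
+	{
+		/* Allow nodes to release or shut down asynchronous resources. */
+		(void) ExecShutdownNode(planstate);
+	}
+}
+#endif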
+
+/*
+ * ExecSetTupleBound
+ *
+ * Set a tuple bound for a planstate node. This lets child plan nodes
+ * optimize based on the knowledge that the maximum number of tuples that
+ * their parent will demand is limited. The tuple bound for a node may
+ * only be changed between scans (i.e., after node initialization or just
+ * before an ExecReScan call).
+ *
+ * Any negative tuples_needed value means "no limit", which should be the
+ * default assumption when this is not called at all for a particular node.
+ *
+ * Note: if this is called repeatedly on a plan tree, the exact same set
+ * of nodes must be updated with the new limit each time; be careful that
+ * only unchanging conditions are tested here.
+ */
+void
+ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
+{
+ /*
+ * Since this function recurses, in principle we should check stack depth
+ * here. In practice, it's probably pointless since the earlier node
+ * initialization tree traversal would surely have consumed more stack.
+ */
+
+ if (IsA(child_node, SortState))
+ {
+ /*
+ * If it is a Sort node, notify it that it can use bounded sort.
+ *
+ * Note: it is the responsibility of nodeSort.c to react properly to
+ * changes of these parameters. If we ever redesign this, it'd be a
+ * good idea to integrate this signaling with the parameter-change
+ * mechanism.
+ */
+ SortState *sortState = (SortState *) child_node;
+
+ if (tuples_needed < 0)
+ {
+ /* make sure flag gets reset if needed upon rescan */
+ sortState->bounded = false;
+ }
+ else
+ {
+ sortState->bounded = true;
+ sortState->bound = tuples_needed;
+ }
+ }
+ else if (IsA(child_node, IncrementalSortState))
+ {
+ /*
+ * If it is an IncrementalSort node, notify it that it can use bounded
+ * sort.
+ *
+ * Note: it is the responsibility of nodeIncrementalSort.c to react
+ * properly to changes of these parameters. If we ever redesign this,
+ * it'd be a good idea to integrate this signaling with the
+ * parameter-change mechanism.
+ */
+ IncrementalSortState *sortState = (IncrementalSortState *) child_node;
+
+ if (tuples_needed < 0)
+ {
+ /* make sure flag gets reset if needed upon rescan */
+ sortState->bounded = false;
+ }
+ else
+ {
+ sortState->bounded = true;
+ sortState->bound = tuples_needed;
+ }
+ }
+ else if (IsA(child_node, AppendState))
+ {
+ /*
+ * If it is an Append, we can apply the bound to any nodes that are
+		 * children of the Append, since the Append surely needs to read no
+		 * more than that many tuples from any one input.
+ */
+ AppendState *aState = (AppendState *) child_node;
+ int i;
+
+ for (i = 0; i < aState->as_nplans; i++)
+ ExecSetTupleBound(tuples_needed, aState->appendplans[i]);
+ }
+ else if (IsA(child_node, MergeAppendState))
+ {
+ /*
+ * If it is a MergeAppend, we can apply the bound to any nodes that
+		 * are children of the MergeAppend, since the MergeAppend surely
+		 * needs to read no more than that many tuples from any one input.
+ */
+ MergeAppendState *maState = (MergeAppendState *) child_node;
+ int i;
+
+ for (i = 0; i < maState->ms_nplans; i++)
+ ExecSetTupleBound(tuples_needed, maState->mergeplans[i]);
+ }
+ else if (IsA(child_node, ResultState))
+ {
+ /*
+ * Similarly, for a projecting Result, we can apply the bound to its
+ * child node.
+ *
+ * If Result supported qual checking, we'd have to punt on seeing a
+ * qual. Note that having a resconstantqual is not a showstopper: if
+ * that condition succeeds it affects nothing, while if it fails, no
+ * rows will be demanded from the Result child anyway.
+ */
+ if (outerPlanState(child_node))
+ ExecSetTupleBound(tuples_needed, outerPlanState(child_node));
+ }
+ else if (IsA(child_node, SubqueryScanState))
+ {
+ /*
+ * We can also descend through SubqueryScan, but only if it has no
+ * qual (otherwise it might discard rows).
+ */
+ SubqueryScanState *subqueryState = (SubqueryScanState *) child_node;
+
+ if (subqueryState->ss.ps.qual == NULL)
+ ExecSetTupleBound(tuples_needed, subqueryState->subplan);
+ }
+ else if (IsA(child_node, GatherState))
+ {
+ /*
+ * A Gather node can propagate the bound to its workers. As with
+ * MergeAppend, no one worker could possibly need to return more
+ * tuples than the Gather itself needs to.
+ *
+ * Note: As with Sort, the Gather node is responsible for reacting
+ * properly to changes to this parameter.
+ */
+ GatherState *gstate = (GatherState *) child_node;
+
+ gstate->tuples_needed = tuples_needed;
+
+ /* Also pass down the bound to our own copy of the child plan */
+ ExecSetTupleBound(tuples_needed, outerPlanState(child_node));
+ }
+ else if (IsA(child_node, GatherMergeState))
+ {
+ /* Same comments as for Gather */
+ GatherMergeState *gstate = (GatherMergeState *) child_node;
+
+ gstate->tuples_needed = tuples_needed;
+
+ ExecSetTupleBound(tuples_needed, outerPlanState(child_node));
+ }
+
+ /*
+ * In principle we could descend through any plan node type that is
+ * certain not to discard or combine input rows; but on seeing a node that
+ * can do that, we can't propagate the bound any further. For the moment
+ * it's unclear that any other cases are worth checking here.
+ */
+}
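+
+/*
+ * Illustrative sketch (not part of the original file): a Limit node
+ * typically propagates its bound to its child once the offset/count have
+ * been computed, roughly as below.  This is a simplified version of what
+ * nodeLimit.c does; WITH TIES and overflow handling are ignored, and the
+ * guard macro and function name are placeholders.
+ */
+#ifdef EXEC_SET_TUPLE_BOUND_EXAMPLE
+static void
+example_pass_down_bound(LimitState *node)
+{
+	/* a negative value means "no limit" */
+	int64		tuples_needed = node->noCount ? -1 : node->count + node->offset;
+
+	ExecSetTupleBound(tuples_needed, outerPlanState(node));
+}
+#endif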
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
new file mode 100644
index 0000000..1e285e0
--- /dev/null
+++ b/src/backend/executor/execReplication.c
@@ -0,0 +1,629 @@
+/*-------------------------------------------------------------------------
+ *
+ * execReplication.c
+ * miscellaneous executor routines for logical replication
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execReplication.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "commands/trigger.h"
+#include "executor/executor.h"
+#include "executor/nodeModifyTable.h"
+#include "nodes/nodeFuncs.h"
+#include "parser/parse_relation.h"
+#include "parser/parsetree.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/typcache.h"
+
+
+/*
+ * Set up a ScanKey for a search in the relation 'rel' for a tuple whose key
+ * values are taken from 'searchslot' (which matches 'rel', *NOT* idxrel!).
+ *
+ * Returns whether any column contains NULLs.
+ *
+ * This is not a generic routine; it expects idxrel to be the replica
+ * identity index (or primary key) of 'rel' and to meet all limitations
+ * associated with that.
+ */
+static bool
+build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel,
+ TupleTableSlot *searchslot)
+{
+ int attoff;
+ bool isnull;
+ Datum indclassDatum;
+ oidvector *opclass;
+ int2vector *indkey = &idxrel->rd_index->indkey;
+ bool hasnulls = false;
+
+ Assert(RelationGetReplicaIndex(rel) == RelationGetRelid(idxrel) ||
+ RelationGetPrimaryKeyIndex(rel) == RelationGetRelid(idxrel));
+
+ indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple,
+ Anum_pg_index_indclass, &isnull);
+ Assert(!isnull);
+ opclass = (oidvector *) DatumGetPointer(indclassDatum);
+
+ /* Build scankey for every attribute in the index. */
+ for (attoff = 0; attoff < IndexRelationGetNumberOfKeyAttributes(idxrel); attoff++)
+ {
+ Oid operator;
+ Oid opfamily;
+ RegProcedure regop;
+ int pkattno = attoff + 1;
+ int mainattno = indkey->values[attoff];
+ Oid optype = get_opclass_input_type(opclass->values[attoff]);
+
+ /*
+ * Load the operator info. We need this to get the equality operator
+ * function for the scan key.
+ */
+ opfamily = get_opclass_family(opclass->values[attoff]);
+
+ operator = get_opfamily_member(opfamily, optype,
+ optype,
+ BTEqualStrategyNumber);
+ if (!OidIsValid(operator))
+ elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+ BTEqualStrategyNumber, optype, optype, opfamily);
+
+ regop = get_opcode(operator);
+
+ /* Initialize the scankey. */
+ ScanKeyInit(&skey[attoff],
+ pkattno,
+ BTEqualStrategyNumber,
+ regop,
+ searchslot->tts_values[mainattno - 1]);
+
+ skey[attoff].sk_collation = idxrel->rd_indcollation[attoff];
+
+ /* Check for null value. */
+ if (searchslot->tts_isnull[mainattno - 1])
+ {
+ hasnulls = true;
+ skey[attoff].sk_flags |= SK_ISNULL;
+ }
+ }
+
+ return hasnulls;
+}
+
+/*
+ * Search the relation 'rel' for a tuple using the given index.
+ *
+ * If a matching tuple is found, lock it with lockmode, fill the slot with its
+ * contents, and return true. Return false otherwise.
+ */
+bool
+RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
+ LockTupleMode lockmode,
+ TupleTableSlot *searchslot,
+ TupleTableSlot *outslot)
+{
+ ScanKeyData skey[INDEX_MAX_KEYS];
+ IndexScanDesc scan;
+ SnapshotData snap;
+ TransactionId xwait;
+ Relation idxrel;
+ bool found;
+
+ /* Open the index. */
+ idxrel = index_open(idxoid, RowExclusiveLock);
+
+ /* Start an index scan. */
+ InitDirtySnapshot(snap);
+ scan = index_beginscan(rel, idxrel, &snap,
+ IndexRelationGetNumberOfKeyAttributes(idxrel),
+ 0);
+
+ /* Build scan key. */
+ build_replindex_scan_key(skey, rel, idxrel, searchslot);
+
+retry:
+ found = false;
+
+ index_rescan(scan, skey, IndexRelationGetNumberOfKeyAttributes(idxrel), NULL, 0);
+
+ /* Try to find the tuple */
+ if (index_getnext_slot(scan, ForwardScanDirection, outslot))
+ {
+ found = true;
+ ExecMaterializeSlot(outslot);
+
+ xwait = TransactionIdIsValid(snap.xmin) ?
+ snap.xmin : snap.xmax;
+
+ /*
+		 * If the tuple is locked, wait for the locking transaction to finish
+		 * and retry.
+ */
+ if (TransactionIdIsValid(xwait))
+ {
+ XactLockTableWait(xwait, NULL, NULL, XLTW_None);
+ goto retry;
+ }
+ }
+
+	/* Found the tuple; try to lock it in the requested lockmode. */
+ if (found)
+ {
+ TM_FailureData tmfd;
+ TM_Result res;
+
+ PushActiveSnapshot(GetLatestSnapshot());
+
+ res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(),
+ outslot,
+ GetCurrentCommandId(false),
+ lockmode,
+ LockWaitBlock,
+ 0 /* don't follow updates */ ,
+ &tmfd);
+
+ PopActiveSnapshot();
+
+ switch (res)
+ {
+ case TM_Ok:
+ break;
+ case TM_Updated:
+ /* XXX: Improve handling here */
+ if (ItemPointerIndicatesMovedPartitions(&tmfd.ctid))
+ ereport(LOG,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying")));
+ else
+ ereport(LOG,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("concurrent update, retrying")));
+ goto retry;
+ case TM_Deleted:
+ /* XXX: Improve handling here */
+ ereport(LOG,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("concurrent delete, retrying")));
+ goto retry;
+ case TM_Invisible:
+ elog(ERROR, "attempted to lock invisible tuple");
+ break;
+ default:
+ elog(ERROR, "unexpected table_tuple_lock status: %u", res);
+ break;
+ }
+ }
+
+ index_endscan(scan);
+
+ /* Don't release lock until commit. */
+ index_close(idxrel, NoLock);
+
+ return found;
+}
+
+/*
+ * Compare the tuples in the slots by checking if they have equal values.
+ */
+static bool
+tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2,
+ TypeCacheEntry **eq)
+{
+ int attrnum;
+
+ Assert(slot1->tts_tupleDescriptor->natts ==
+ slot2->tts_tupleDescriptor->natts);
+
+ slot_getallattrs(slot1);
+ slot_getallattrs(slot2);
+
+ /* Check equality of the attributes. */
+ for (attrnum = 0; attrnum < slot1->tts_tupleDescriptor->natts; attrnum++)
+ {
+ Form_pg_attribute att;
+ TypeCacheEntry *typentry;
+
+ /*
+		 * If one value is NULL and the other is not, then they are certainly
+		 * not equal.
+ */
+ if (slot1->tts_isnull[attrnum] != slot2->tts_isnull[attrnum])
+ return false;
+
+ /*
+ * If both are NULL, they can be considered equal.
+ */
+ if (slot1->tts_isnull[attrnum] || slot2->tts_isnull[attrnum])
+ continue;
+
+ att = TupleDescAttr(slot1->tts_tupleDescriptor, attrnum);
+
+ typentry = eq[attrnum];
+ if (typentry == NULL)
+ {
+ typentry = lookup_type_cache(att->atttypid,
+ TYPECACHE_EQ_OPR_FINFO);
+ if (!OidIsValid(typentry->eq_opr_finfo.fn_oid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_FUNCTION),
+ errmsg("could not identify an equality operator for type %s",
+ format_type_be(att->atttypid))));
+ eq[attrnum] = typentry;
+ }
+
+ if (!DatumGetBool(FunctionCall2Coll(&typentry->eq_opr_finfo,
+ att->attcollation,
+ slot1->tts_values[attrnum],
+ slot2->tts_values[attrnum])))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Search the relation 'rel' for a tuple using a sequential scan.
+ *
+ * If a matching tuple is found, lock it with lockmode, fill the slot with its
+ * contents, and return true. Return false otherwise.
+ *
+ * Note that this stops on the first matching tuple.
+ *
+ * This can obviously be quite slow on tables that have more than a few rows.
+ */
+bool
+RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode,
+ TupleTableSlot *searchslot, TupleTableSlot *outslot)
+{
+ TupleTableSlot *scanslot;
+ TableScanDesc scan;
+ SnapshotData snap;
+ TypeCacheEntry **eq;
+ TransactionId xwait;
+ bool found;
+ TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel);
+
+ Assert(equalTupleDescs(desc, outslot->tts_tupleDescriptor));
+
+ eq = palloc0(sizeof(*eq) * outslot->tts_tupleDescriptor->natts);
+
+ /* Start a heap scan. */
+ InitDirtySnapshot(snap);
+ scan = table_beginscan(rel, &snap, 0, NULL);
+ scanslot = table_slot_create(rel, NULL);
+
+retry:
+ found = false;
+
+ table_rescan(scan, NULL);
+
+ /* Try to find the tuple */
+ while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot))
+ {
+ if (!tuples_equal(scanslot, searchslot, eq))
+ continue;
+
+ found = true;
+ ExecCopySlot(outslot, scanslot);
+
+ xwait = TransactionIdIsValid(snap.xmin) ?
+ snap.xmin : snap.xmax;
+
+ /*
+		 * If the tuple is locked, wait for the locking transaction to finish
+		 * and retry.
+ */
+ if (TransactionIdIsValid(xwait))
+ {
+ XactLockTableWait(xwait, NULL, NULL, XLTW_None);
+ goto retry;
+ }
+
+ /* Found our tuple and it's not locked */
+ break;
+ }
+
+	/* Found the tuple; try to lock it in the requested lockmode. */
+ if (found)
+ {
+ TM_FailureData tmfd;
+ TM_Result res;
+
+ PushActiveSnapshot(GetLatestSnapshot());
+
+ res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(),
+ outslot,
+ GetCurrentCommandId(false),
+ lockmode,
+ LockWaitBlock,
+ 0 /* don't follow updates */ ,
+ &tmfd);
+
+ PopActiveSnapshot();
+
+ switch (res)
+ {
+ case TM_Ok:
+ break;
+ case TM_Updated:
+ /* XXX: Improve handling here */
+ if (ItemPointerIndicatesMovedPartitions(&tmfd.ctid))
+ ereport(LOG,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying")));
+ else
+ ereport(LOG,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("concurrent update, retrying")));
+ goto retry;
+ case TM_Deleted:
+ /* XXX: Improve handling here */
+ ereport(LOG,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("concurrent delete, retrying")));
+ goto retry;
+ case TM_Invisible:
+ elog(ERROR, "attempted to lock invisible tuple");
+ break;
+ default:
+ elog(ERROR, "unexpected table_tuple_lock status: %u", res);
+ break;
+ }
+ }
+
+ table_endscan(scan);
+ ExecDropSingleTupleTableSlot(scanslot);
+
+ return found;
+}
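+
+/*
+ * Illustrative sketch (not part of the original file): a caller such as the
+ * logical replication apply worker prefers the index-based lookup and only
+ * falls back to the sequential scan when the relation has neither a replica
+ * identity index nor a primary key.  Simplified; error handling and the
+ * REPLICA IDENTITY FULL bookkeeping are elided, and the guard macro and
+ * function name are placeholders.
+ */
+#ifdef REPL_TUPLE_LOOKUP_EXAMPLE
+static bool
+example_find_local_tuple(Relation localrel, TupleTableSlot *remoteslot,
+						 TupleTableSlot *localslot)
+{
+	Oid			idxoid = RelationGetReplicaIndex(localrel);
+
+	if (!OidIsValid(idxoid))
+		idxoid = RelationGetPrimaryKeyIndex(localrel);
+
+	if (OidIsValid(idxoid))
+		return RelationFindReplTupleByIndex(localrel, idxoid,
+											LockTupleExclusive,
+											remoteslot, localslot);
+
+	/* no usable index: fall back to a full sequential scan */
+	return RelationFindReplTupleSeq(localrel, LockTupleExclusive,
+									remoteslot, localslot);
+}
+#endif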
+
+/*
+ * Insert the tuple represented in the slot into the relation, update the
+ * indexes, and execute any constraints and per-row triggers.
+ *
+ * Caller is responsible for opening the indexes.
+ */
+void
+ExecSimpleRelationInsert(ResultRelInfo *resultRelInfo,
+ EState *estate, TupleTableSlot *slot)
+{
+ bool skip_tuple = false;
+ Relation rel = resultRelInfo->ri_RelationDesc;
+
+ /* For now we support only tables. */
+ Assert(rel->rd_rel->relkind == RELKIND_RELATION);
+
+ CheckCmdReplicaIdentity(rel, CMD_INSERT);
+
+ /* BEFORE ROW INSERT Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_insert_before_row)
+ {
+ if (!ExecBRInsertTriggers(estate, resultRelInfo, slot))
+ skip_tuple = true; /* "do nothing" */
+ }
+
+ if (!skip_tuple)
+ {
+ List *recheckIndexes = NIL;
+
+ /* Compute stored generated columns */
+ if (rel->rd_att->constr &&
+ rel->rd_att->constr->has_generated_stored)
+ ExecComputeStoredGenerated(resultRelInfo, estate, slot,
+ CMD_INSERT);
+
+ /* Check the constraints of the tuple */
+ if (rel->rd_att->constr)
+ ExecConstraints(resultRelInfo, slot, estate);
+ if (rel->rd_rel->relispartition)
+ ExecPartitionCheck(resultRelInfo, slot, estate, true);
+
+ /* OK, store the tuple and create index entries for it */
+ simple_table_tuple_insert(resultRelInfo->ri_RelationDesc, slot);
+
+ if (resultRelInfo->ri_NumIndices > 0)
+ recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
+ slot, estate, false, false,
+ NULL, NIL);
+
+ /* AFTER ROW INSERT Triggers */
+ ExecARInsertTriggers(estate, resultRelInfo, slot,
+ recheckIndexes, NULL);
+
+ /*
+ * XXX we should in theory pass a TransitionCaptureState object to the
+ * above to capture transition tuples, but after statement triggers
+ * don't actually get fired by replication yet anyway
+ */
+
+ list_free(recheckIndexes);
+ }
+}
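+
+/*
+ * Illustrative sketch (not part of the original file): because this routine
+ * does not open or close the indexes itself, a caller is expected to bracket
+ * it with ExecOpenIndices()/ExecCloseIndices(), roughly as below.  The guard
+ * macro and function name are placeholders; snapshot and trigger setup are
+ * assumed to have been done already.
+ */
+#ifdef SIMPLE_RELATION_INSERT_EXAMPLE
+static void
+example_apply_insert(ResultRelInfo *resultRelInfo, EState *estate,
+					 TupleTableSlot *slot)
+{
+	/* open indexes so the insert can maintain them */
+	ExecOpenIndices(resultRelInfo, false);
+
+	ExecSimpleRelationInsert(resultRelInfo, estate, slot);
+
+	ExecCloseIndices(resultRelInfo);
+}
+#endif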
+
+/*
+ * Find the searchslot tuple and update it with data in the slot,
+ * update the indexes, and execute any constraints and per-row triggers.
+ *
+ * Caller is responsible for opening the indexes.
+ */
+void
+ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo,
+ EState *estate, EPQState *epqstate,
+ TupleTableSlot *searchslot, TupleTableSlot *slot)
+{
+ bool skip_tuple = false;
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ ItemPointer tid = &(searchslot->tts_tid);
+
+ /* For now we support only tables. */
+ Assert(rel->rd_rel->relkind == RELKIND_RELATION);
+
+ CheckCmdReplicaIdentity(rel, CMD_UPDATE);
+
+ /* BEFORE ROW UPDATE Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_update_before_row)
+ {
+ if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo,
+ tid, NULL, slot))
+ skip_tuple = true; /* "do nothing" */
+ }
+
+ if (!skip_tuple)
+ {
+ List *recheckIndexes = NIL;
+ bool update_indexes;
+
+ /* Compute stored generated columns */
+ if (rel->rd_att->constr &&
+ rel->rd_att->constr->has_generated_stored)
+ ExecComputeStoredGenerated(resultRelInfo, estate, slot,
+ CMD_UPDATE);
+
+ /* Check the constraints of the tuple */
+ if (rel->rd_att->constr)
+ ExecConstraints(resultRelInfo, slot, estate);
+ if (rel->rd_rel->relispartition)
+ ExecPartitionCheck(resultRelInfo, slot, estate, true);
+
+ simple_table_tuple_update(rel, tid, slot, estate->es_snapshot,
+ &update_indexes);
+
+ if (resultRelInfo->ri_NumIndices > 0 && update_indexes)
+ recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
+ slot, estate, true, false,
+ NULL, NIL);
+
+ /* AFTER ROW UPDATE Triggers */
+ ExecARUpdateTriggers(estate, resultRelInfo,
+ tid, NULL, slot,
+ recheckIndexes, NULL);
+
+ list_free(recheckIndexes);
+ }
+}
+
+/*
+ * Find the searchslot tuple and delete it, and execute any constraints
+ * and per-row triggers.
+ *
+ * Caller is responsible for opening the indexes.
+ */
+void
+ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo,
+ EState *estate, EPQState *epqstate,
+ TupleTableSlot *searchslot)
+{
+ bool skip_tuple = false;
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ ItemPointer tid = &searchslot->tts_tid;
+
+ CheckCmdReplicaIdentity(rel, CMD_DELETE);
+
+ /* BEFORE ROW DELETE Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_delete_before_row)
+ {
+ skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
+ tid, NULL, NULL);
+
+ }
+
+ if (!skip_tuple)
+ {
+ /* OK, delete the tuple */
+ simple_table_tuple_delete(rel, tid, estate->es_snapshot);
+
+ /* AFTER ROW DELETE Triggers */
+ ExecARDeleteTriggers(estate, resultRelInfo,
+ tid, NULL, NULL);
+ }
+}
+
+/*
+ * Check if the command can be executed with the current replica identity.
+ */
+void
+CheckCmdReplicaIdentity(Relation rel, CmdType cmd)
+{
+ PublicationActions *pubactions;
+
+ /* We only need to do checks for UPDATE and DELETE. */
+ if (cmd != CMD_UPDATE && cmd != CMD_DELETE)
+ return;
+
+	/* If the relation has a replica identity, we are always good. */
+ if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL ||
+ OidIsValid(RelationGetReplicaIndex(rel)))
+ return;
+
+ /*
+	 * This is either UPDATE or DELETE, and there is no replica identity.
+	 *
+	 * Check if the table publishes updates or deletes.
+ */
+ pubactions = GetRelationPublicationActions(rel);
+ if (cmd == CMD_UPDATE && pubactions->pubupdate)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot update table \"%s\" because it does not have a replica identity and publishes updates",
+ RelationGetRelationName(rel)),
+ errhint("To enable updating the table, set REPLICA IDENTITY using ALTER TABLE.")));
+ else if (cmd == CMD_DELETE && pubactions->pubdelete)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot delete from table \"%s\" because it does not have a replica identity and publishes deletes",
+ RelationGetRelationName(rel)),
+ errhint("To enable deleting from the table, set REPLICA IDENTITY using ALTER TABLE.")));
+}
+
+
+/*
+ * Check if we support writing into the specific relkind.
+ *
+ * The nspname and relname are only needed for error reporting.
+ */
+void
+CheckSubscriptionRelkind(char relkind, const char *nspname,
+ const char *relname)
+{
+ /*
+ * Give a more specific error for foreign tables.
+ */
+ if (relkind == RELKIND_FOREIGN_TABLE)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot use relation \"%s.%s\" as logical replication target",
+ nspname, relname),
+ errdetail("\"%s.%s\" is a foreign table.",
+ nspname, relname)));
+
+ if (relkind != RELKIND_RELATION && relkind != RELKIND_PARTITIONED_TABLE)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot use relation \"%s.%s\" as logical replication target",
+ nspname, relname),
+ errdetail("\"%s.%s\" is not a table.",
+ nspname, relname)));
+}
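+
+/*
+ * Illustrative sketch (not part of the original file): a caller that only
+ * has the relation OID can look up the relkind and the names needed for
+ * error reporting via lsyscache, roughly as below.  The guard macro and
+ * function name are placeholders.
+ */
+#ifdef CHECK_SUBSCRIPTION_RELKIND_EXAMPLE
+static void
+example_check_target_relation(Oid relid)
+{
+	char	   *nspname = get_namespace_name(get_rel_namespace(relid));
+	char	   *relname = get_rel_name(relid);
+
+	CheckSubscriptionRelkind(get_rel_relkind(relid), nspname, relname);
+}
+#endif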
diff --git a/src/backend/executor/execSRF.c b/src/backend/executor/execSRF.c
new file mode 100644
index 0000000..545b6c1
--- /dev/null
+++ b/src/backend/executor/execSRF.c
@@ -0,0 +1,980 @@
+/*-------------------------------------------------------------------------
+ *
+ * execSRF.c
+ * Routines implementing the API for set-returning functions
+ *
+ * This file serves nodeFunctionscan.c and nodeProjectSet.c, providing
+ * common code for calling set-returning functions according to the
+ * ReturnSetInfo API.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execSRF.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "catalog/objectaccess.h"
+#include "executor/execdebug.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "parser/parse_coerce.h"
+#include "pgstat.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/typcache.h"
+
+
+/* static function decls */
+static void init_sexpr(Oid foid, Oid input_collation, Expr *node,
+ SetExprState *sexpr, PlanState *parent,
+ MemoryContext sexprCxt, bool allowSRF, bool needDescForSRF);
+static void ShutdownSetExpr(Datum arg);
+static void ExecEvalFuncArgs(FunctionCallInfo fcinfo,
+ List *argList, ExprContext *econtext);
+static void ExecPrepareTuplestoreResult(SetExprState *sexpr,
+ ExprContext *econtext,
+ Tuplestorestate *resultStore,
+ TupleDesc resultDesc);
+static void tupledesc_match(TupleDesc dst_tupdesc, TupleDesc src_tupdesc);
+
+
+/*
+ * Prepare function call in FROM (ROWS FROM) for execution.
+ *
+ * This is used by nodeFunctionscan.c.
+ */
+SetExprState *
+ExecInitTableFunctionResult(Expr *expr,
+ ExprContext *econtext, PlanState *parent)
+{
+ SetExprState *state = makeNode(SetExprState);
+
+ state->funcReturnsSet = false;
+ state->expr = expr;
+ state->func.fn_oid = InvalidOid;
+
+ /*
+ * Normally the passed expression tree will be a FuncExpr, since the
+ * grammar only allows a function call at the top level of a table
+	 * function reference.  However, if the function doesn't return a set, then
+ * the planner might have replaced the function call via constant-folding
+ * or inlining. So if we see any other kind of expression node, execute
+ * it via the general ExecEvalExpr() code. That code path will not
+ * support set-returning functions buried in the expression, though.
+ */
+ if (IsA(expr, FuncExpr))
+ {
+ FuncExpr *func = (FuncExpr *) expr;
+
+ state->funcReturnsSet = func->funcretset;
+ state->args = ExecInitExprList(func->args, parent);
+
+ init_sexpr(func->funcid, func->inputcollid, expr, state, parent,
+ econtext->ecxt_per_query_memory, func->funcretset, false);
+ }
+ else
+ {
+ state->elidedFuncState = ExecInitExpr(expr, parent);
+ }
+
+ return state;
+}
+
+/*
+ * ExecMakeTableFunctionResult
+ *
+ * Evaluate a table function, producing a materialized result in a Tuplestore
+ * object.
+ *
+ * This is used by nodeFunctionscan.c.
+ */
+Tuplestorestate *
+ExecMakeTableFunctionResult(SetExprState *setexpr,
+ ExprContext *econtext,
+ MemoryContext argContext,
+ TupleDesc expectedDesc,
+ bool randomAccess)
+{
+ Tuplestorestate *tupstore = NULL;
+ TupleDesc tupdesc = NULL;
+ Oid funcrettype;
+ bool returnsTuple;
+ bool returnsSet = false;
+ FunctionCallInfo fcinfo;
+ PgStat_FunctionCallUsage fcusage;
+ ReturnSetInfo rsinfo;
+ HeapTupleData tmptup;
+ MemoryContext callerContext;
+ bool first_time = true;
+
+ /*
+ * Execute per-tablefunc actions in appropriate context.
+ *
+ * The FunctionCallInfo needs to live across all the calls to a
+ * ValuePerCall function, so it can't be allocated in the per-tuple
+ * context. Similarly, the function arguments need to be evaluated in a
+ * context that is longer lived than the per-tuple context: The argument
+ * values would otherwise disappear when we reset that context in the
+ * inner loop. As the caller's CurrentMemoryContext is typically a
+ * query-lifespan context, we don't want to leak memory there. We require
+ * the caller to pass a separate memory context that can be used for this,
+ * and can be reset each time through to avoid bloat.
+ */
+ MemoryContextReset(argContext);
+ callerContext = MemoryContextSwitchTo(argContext);
+
+ funcrettype = exprType((Node *) setexpr->expr);
+
+ returnsTuple = type_is_rowtype(funcrettype);
+
+ /*
+ * Prepare a resultinfo node for communication. We always do this even if
+ * not expecting a set result, so that we can pass expectedDesc. In the
+ * generic-expression case, the expression doesn't actually get to see the
+ * resultinfo, but set it up anyway because we use some of the fields as
+ * our own state variables.
+ */
+ rsinfo.type = T_ReturnSetInfo;
+ rsinfo.econtext = econtext;
+ rsinfo.expectedDesc = expectedDesc;
+ rsinfo.allowedModes = (int) (SFRM_ValuePerCall | SFRM_Materialize | SFRM_Materialize_Preferred);
+ if (randomAccess)
+ rsinfo.allowedModes |= (int) SFRM_Materialize_Random;
+ rsinfo.returnMode = SFRM_ValuePerCall;
+ /* isDone is filled below */
+ rsinfo.setResult = NULL;
+ rsinfo.setDesc = NULL;
+
+ fcinfo = palloc(SizeForFunctionCallInfo(list_length(setexpr->args)));
+
+ /*
+	 * Normally the passed SetExprState will wrap a plain function call, since
+	 * the grammar only allows a function call at the top level of a table
+	 * function reference.  However, if the function doesn't return a set, then
+	 * the planner might have replaced the function call via constant-folding
+ * or inlining. So if we see any other kind of expression node, execute
+ * it via the general ExecEvalExpr() code; the only difference is that we
+ * don't get a chance to pass a special ReturnSetInfo to any functions
+ * buried in the expression.
+ */
+ if (!setexpr->elidedFuncState)
+ {
+ /*
+ * This path is similar to ExecMakeFunctionResultSet.
+ */
+ returnsSet = setexpr->funcReturnsSet;
+ InitFunctionCallInfoData(*fcinfo, &(setexpr->func),
+ list_length(setexpr->args),
+ setexpr->fcinfo->fncollation,
+ NULL, (Node *) &rsinfo);
+ /* evaluate the function's argument list */
+ Assert(CurrentMemoryContext == argContext);
+ ExecEvalFuncArgs(fcinfo, setexpr->args, econtext);
+
+ /*
+ * If function is strict, and there are any NULL arguments, skip
+ * calling the function and act like it returned NULL (or an empty
+ * set, in the returns-set case).
+ */
+ if (setexpr->func.fn_strict)
+ {
+ int i;
+
+ for (i = 0; i < fcinfo->nargs; i++)
+ {
+ if (fcinfo->args[i].isnull)
+ goto no_function_result;
+ }
+ }
+ }
+ else
+ {
+ /* Treat setexpr as a generic expression */
+ InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+ }
+
+ /*
+ * Switch to short-lived context for calling the function or expression.
+ */
+ MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ /*
+ * Loop to handle the ValuePerCall protocol (which is also the same
+ * behavior needed in the generic ExecEvalExpr path).
+ */
+ for (;;)
+ {
+ Datum result;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Reset per-tuple memory context before each call of the function or
+ * expression. This cleans up any local memory the function may leak
+ * when called.
+ */
+ ResetExprContext(econtext);
+
+ /* Call the function or expression one time */
+ if (!setexpr->elidedFuncState)
+ {
+ pgstat_init_function_usage(fcinfo, &fcusage);
+
+ fcinfo->isnull = false;
+ rsinfo.isDone = ExprSingleResult;
+ result = FunctionCallInvoke(fcinfo);
+
+ pgstat_end_function_usage(&fcusage,
+ rsinfo.isDone != ExprMultipleResult);
+ }
+ else
+ {
+ result =
+ ExecEvalExpr(setexpr->elidedFuncState, econtext, &fcinfo->isnull);
+ rsinfo.isDone = ExprSingleResult;
+ }
+
+ /* Which protocol does function want to use? */
+ if (rsinfo.returnMode == SFRM_ValuePerCall)
+ {
+ /*
+ * Check for end of result set.
+ */
+ if (rsinfo.isDone == ExprEndResult)
+ break;
+
+ /*
+ * If first time through, build tuplestore for result. For a
+ * scalar function result type, also make a suitable tupdesc.
+ */
+ if (first_time)
+ {
+ MemoryContext oldcontext =
+ MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ tupstore = tuplestore_begin_heap(randomAccess, false, work_mem);
+ rsinfo.setResult = tupstore;
+ if (!returnsTuple)
+ {
+ tupdesc = CreateTemplateTupleDesc(1);
+ TupleDescInitEntry(tupdesc,
+ (AttrNumber) 1,
+ "column",
+ funcrettype,
+ -1,
+ 0);
+ rsinfo.setDesc = tupdesc;
+ }
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /*
+ * Store current resultset item.
+ */
+ if (returnsTuple)
+ {
+ if (!fcinfo->isnull)
+ {
+ HeapTupleHeader td = DatumGetHeapTupleHeader(result);
+
+ if (tupdesc == NULL)
+ {
+ MemoryContext oldcontext =
+ MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ /*
+ * This is the first non-NULL result from the
+ * function. Use the type info embedded in the
+ * rowtype Datum to look up the needed tupdesc. Make
+ * a copy for the query.
+ */
+ tupdesc = lookup_rowtype_tupdesc_copy(HeapTupleHeaderGetTypeId(td),
+ HeapTupleHeaderGetTypMod(td));
+ rsinfo.setDesc = tupdesc;
+ MemoryContextSwitchTo(oldcontext);
+ }
+ else
+ {
+ /*
+ * Verify all later returned rows have same subtype;
+ * necessary in case the type is RECORD.
+ */
+ if (HeapTupleHeaderGetTypeId(td) != tupdesc->tdtypeid ||
+ HeapTupleHeaderGetTypMod(td) != tupdesc->tdtypmod)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("rows returned by function are not all of the same row type")));
+ }
+
+ /*
+ * tuplestore_puttuple needs a HeapTuple not a bare
+ * HeapTupleHeader, but it doesn't need all the fields.
+ */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(td);
+ tmptup.t_data = td;
+
+ tuplestore_puttuple(tupstore, &tmptup);
+ }
+ else
+ {
+ /*
+ * NULL result from a tuple-returning function; expand it
+ * to a row of all nulls. We rely on the expectedDesc to
+ * form such rows. (Note: this would be problematic if
+ * tuplestore_putvalues saved the tdtypeid/tdtypmod from
+ * the provided descriptor, since that might not match
+ * what we get from the function itself. But it doesn't.)
+ */
+ int natts = expectedDesc->natts;
+ bool *nullflags;
+
+ nullflags = (bool *) palloc(natts * sizeof(bool));
+ memset(nullflags, true, natts * sizeof(bool));
+ tuplestore_putvalues(tupstore, expectedDesc, NULL, nullflags);
+ }
+ }
+ else
+ {
+ /* Scalar-type case: just store the function result */
+ tuplestore_putvalues(tupstore, tupdesc, &result, &fcinfo->isnull);
+ }
+
+ /*
+ * Are we done?
+ */
+ if (rsinfo.isDone != ExprMultipleResult)
+ break;
+
+ /*
+ * Check that set-returning functions were properly declared.
+ * (Note: for historical reasons, we don't complain if a non-SRF
+ * returns ExprEndResult; that's treated as returning NULL.)
+ */
+ if (!returnsSet)
+ ereport(ERROR,
+ (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED),
+ errmsg("table-function protocol for value-per-call mode was not followed")));
+ }
+ else if (rsinfo.returnMode == SFRM_Materialize)
+ {
+ /* check we're on the same page as the function author */
+ if (!first_time || rsinfo.isDone != ExprSingleResult || !returnsSet)
+ ereport(ERROR,
+ (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED),
+ errmsg("table-function protocol for materialize mode was not followed")));
+ /* Done evaluating the set result */
+ break;
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED),
+ errmsg("unrecognized table-function returnMode: %d",
+ (int) rsinfo.returnMode)));
+
+ first_time = false;
+ }
+
+no_function_result:
+
+ /*
+ * If we got nothing from the function (ie, an empty-set or NULL result),
+ * we have to create the tuplestore to return, and if it's a
+ * non-set-returning function then insert a single all-nulls row. As
+ * above, we depend on the expectedDesc to manufacture the dummy row.
+ */
+ if (rsinfo.setResult == NULL)
+ {
+ MemoryContext oldcontext =
+ MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ tupstore = tuplestore_begin_heap(randomAccess, false, work_mem);
+ rsinfo.setResult = tupstore;
+ MemoryContextSwitchTo(oldcontext);
+
+ if (!returnsSet)
+ {
+ int natts = expectedDesc->natts;
+ bool *nullflags;
+
+ nullflags = (bool *) palloc(natts * sizeof(bool));
+ memset(nullflags, true, natts * sizeof(bool));
+ tuplestore_putvalues(tupstore, expectedDesc, NULL, nullflags);
+ }
+ }
+
+ /*
+ * If function provided a tupdesc, cross-check it. We only really need to
+ * do this for functions returning RECORD, but might as well do it always.
+ */
+ if (rsinfo.setDesc)
+ {
+ tupledesc_match(expectedDesc, rsinfo.setDesc);
+
+ /*
+ * If it is a dynamically-allocated TupleDesc, free it: it is
+ * typically allocated in a per-query context, so we must avoid
+ * leaking it across multiple usages.
+ */
+ if (rsinfo.setDesc->tdrefcount == -1)
+ FreeTupleDesc(rsinfo.setDesc);
+ }
+
+ MemoryContextSwitchTo(callerContext);
+
+ /* All done, pass back the tuplestore */
+ return rsinfo.setResult;
+}
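+
+/*
+ * Illustrative sketch (not part of the original file): a FunctionScan-style
+ * caller first builds a SetExprState for the function expression (usually in
+ * its ExecInit routine) and later materializes the whole result into a
+ * tuplestore.  The guard macro and function name are placeholders; the
+ * expected tuple descriptor and the argument memory context are assumed to
+ * have been set up by the caller.
+ */
+#ifdef MAKE_TABLE_FUNCTION_RESULT_EXAMPLE
+static Tuplestorestate *
+example_materialize_table_function(Expr *funcexpr, PlanState *parent,
+								   MemoryContext argContext,
+								   TupleDesc expectedDesc)
+{
+	ExprContext *econtext = parent->ps_ExprContext;
+	SetExprState *setexpr;
+
+	/* one-time setup of the set-expression state */
+	setexpr = ExecInitTableFunctionResult(funcexpr, econtext, parent);
+
+	/* run the function to completion and collect its rows */
+	return ExecMakeTableFunctionResult(setexpr, econtext, argContext,
+									   expectedDesc,
+									   false /* no random access needed */ );
+}
+#endif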
+
+
+/*
+ * Prepare targetlist SRF function call for execution.
+ *
+ * This is used by nodeProjectSet.c.
+ */
+SetExprState *
+ExecInitFunctionResultSet(Expr *expr,
+ ExprContext *econtext, PlanState *parent)
+{
+ SetExprState *state = makeNode(SetExprState);
+
+ state->funcReturnsSet = true;
+ state->expr = expr;
+ state->func.fn_oid = InvalidOid;
+
+ /*
+ * Initialize metadata. The expression node could be either a FuncExpr or
+ * an OpExpr.
+ */
+ if (IsA(expr, FuncExpr))
+ {
+ FuncExpr *func = (FuncExpr *) expr;
+
+ state->args = ExecInitExprList(func->args, parent);
+ init_sexpr(func->funcid, func->inputcollid, expr, state, parent,
+ econtext->ecxt_per_query_memory, true, true);
+ }
+ else if (IsA(expr, OpExpr))
+ {
+ OpExpr *op = (OpExpr *) expr;
+
+ state->args = ExecInitExprList(op->args, parent);
+ init_sexpr(op->opfuncid, op->inputcollid, expr, state, parent,
+ econtext->ecxt_per_query_memory, true, true);
+ }
+ else
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(expr));
+
+ /* shouldn't get here unless the selected function returns set */
+ Assert(state->func.fn_retset);
+
+ return state;
+}
+
+/*
+ * ExecMakeFunctionResultSet
+ *
+ * Evaluate the arguments to a set-returning function and then call the
+ * function itself. The argument expressions may not contain set-returning
+ * functions (the planner is supposed to have separated evaluation for those).
+ *
+ * This should be called in a short-lived (per-tuple) context; argContext
+ * needs to live until all rows have been returned (i.e. *isDone set to
+ * ExprEndResult or ExprSingleResult).
+ *
+ * This is used by nodeProjectSet.c.
+ */
+Datum
+ExecMakeFunctionResultSet(SetExprState *fcache,
+ ExprContext *econtext,
+ MemoryContext argContext,
+ bool *isNull,
+ ExprDoneCond *isDone)
+{
+ List *arguments;
+ Datum result;
+ FunctionCallInfo fcinfo;
+ PgStat_FunctionCallUsage fcusage;
+ ReturnSetInfo rsinfo;
+ bool callit;
+ int i;
+
+restart:
+
+ /* Guard against stack overflow due to overly complex expressions */
+ check_stack_depth();
+
+ /*
+ * If a previous call of the function returned a set result in the form of
+ * a tuplestore, continue reading rows from the tuplestore until it's
+ * empty.
+ */
+ if (fcache->funcResultStore)
+ {
+ TupleTableSlot *slot = fcache->funcResultSlot;
+ MemoryContext oldContext;
+ bool foundTup;
+
+ /*
+		 * Have to make sure the tuple in the slot lives long enough; otherwise
+ * clearing the slot could end up trying to free something already
+ * freed.
+ */
+ oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
+ foundTup = tuplestore_gettupleslot(fcache->funcResultStore, true, false,
+ fcache->funcResultSlot);
+ MemoryContextSwitchTo(oldContext);
+
+ if (foundTup)
+ {
+ *isDone = ExprMultipleResult;
+ if (fcache->funcReturnsTuple)
+ {
+ /* We must return the whole tuple as a Datum. */
+ *isNull = false;
+ return ExecFetchSlotHeapTupleDatum(fcache->funcResultSlot);
+ }
+ else
+ {
+ /* Extract the first column and return it as a scalar. */
+ return slot_getattr(fcache->funcResultSlot, 1, isNull);
+ }
+ }
+ /* Exhausted the tuplestore, so clean up */
+ tuplestore_end(fcache->funcResultStore);
+ fcache->funcResultStore = NULL;
+ *isDone = ExprEndResult;
+ *isNull = true;
+ return (Datum) 0;
+ }
+
+ /*
+ * arguments is a list of expressions to evaluate before passing to the
+ * function manager. We skip the evaluation if it was already done in the
+ * previous call (ie, we are continuing the evaluation of a set-valued
+ * function). Otherwise, collect the current argument values into fcinfo.
+ *
+ * The arguments have to live in a context that lives at least until all
+ * rows from this SRF have been returned, otherwise ValuePerCall SRFs
+ * would reference freed memory after the first returned row.
+ */
+ fcinfo = fcache->fcinfo;
+ arguments = fcache->args;
+ if (!fcache->setArgsValid)
+ {
+ MemoryContext oldContext = MemoryContextSwitchTo(argContext);
+
+ ExecEvalFuncArgs(fcinfo, arguments, econtext);
+ MemoryContextSwitchTo(oldContext);
+ }
+ else
+ {
+ /* Reset flag (we may set it again below) */
+ fcache->setArgsValid = false;
+ }
+
+ /*
+ * Now call the function, passing the evaluated parameter values.
+ */
+
+ /* Prepare a resultinfo node for communication. */
+ fcinfo->resultinfo = (Node *) &rsinfo;
+ rsinfo.type = T_ReturnSetInfo;
+ rsinfo.econtext = econtext;
+ rsinfo.expectedDesc = fcache->funcResultDesc;
+ rsinfo.allowedModes = (int) (SFRM_ValuePerCall | SFRM_Materialize);
+ /* note we do not set SFRM_Materialize_Random or _Preferred */
+ rsinfo.returnMode = SFRM_ValuePerCall;
+ /* isDone is filled below */
+ rsinfo.setResult = NULL;
+ rsinfo.setDesc = NULL;
+
+ /*
+ * If function is strict, and there are any NULL arguments, skip calling
+ * the function.
+ */
+ callit = true;
+ if (fcache->func.fn_strict)
+ {
+ for (i = 0; i < fcinfo->nargs; i++)
+ {
+ if (fcinfo->args[i].isnull)
+ {
+ callit = false;
+ break;
+ }
+ }
+ }
+
+ if (callit)
+ {
+ pgstat_init_function_usage(fcinfo, &fcusage);
+
+ fcinfo->isnull = false;
+ rsinfo.isDone = ExprSingleResult;
+ result = FunctionCallInvoke(fcinfo);
+ *isNull = fcinfo->isnull;
+ *isDone = rsinfo.isDone;
+
+ pgstat_end_function_usage(&fcusage,
+ rsinfo.isDone != ExprMultipleResult);
+ }
+ else
+ {
+ /* for a strict SRF, result for NULL is an empty set */
+ result = (Datum) 0;
+ *isNull = true;
+ *isDone = ExprEndResult;
+ }
+
+ /* Which protocol does function want to use? */
+ if (rsinfo.returnMode == SFRM_ValuePerCall)
+ {
+ if (*isDone != ExprEndResult)
+ {
+ /*
+ * Save the current argument values to re-use on the next call.
+ */
+ if (*isDone == ExprMultipleResult)
+ {
+ fcache->setArgsValid = true;
+ /* Register cleanup callback if we didn't already */
+ if (!fcache->shutdown_reg)
+ {
+ RegisterExprContextCallback(econtext,
+ ShutdownSetExpr,
+ PointerGetDatum(fcache));
+ fcache->shutdown_reg = true;
+ }
+ }
+ }
+ }
+ else if (rsinfo.returnMode == SFRM_Materialize)
+ {
+ /* check we're on the same page as the function author */
+ if (rsinfo.isDone != ExprSingleResult)
+ ereport(ERROR,
+ (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED),
+ errmsg("table-function protocol for materialize mode was not followed")));
+ if (rsinfo.setResult != NULL)
+ {
+ /* prepare to return values from the tuplestore */
+ ExecPrepareTuplestoreResult(fcache, econtext,
+ rsinfo.setResult,
+ rsinfo.setDesc);
+ /* loop back to top to start returning from tuplestore */
+ goto restart;
+ }
+ /* if setResult was left null, treat it as empty set */
+ *isDone = ExprEndResult;
+ *isNull = true;
+ result = (Datum) 0;
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED),
+ errmsg("unrecognized table-function returnMode: %d",
+ (int) rsinfo.returnMode)));
+
+ return result;
+}
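+
+/*
+ * Illustrative sketch (not part of the original file): a ProjectSet-style
+ * caller drives the ValuePerCall protocol by calling the function above
+ * repeatedly until isDone comes back as ExprEndResult.  The guard macro and
+ * function name are placeholders; a real caller would emit one projected row
+ * per iteration instead of merely counting them.
+ */
+#ifdef MAKE_FUNCTION_RESULT_SET_EXAMPLE
+static uint64
+example_drain_srf(SetExprState *fcache, ExprContext *econtext,
+				  MemoryContext argContext)
+{
+	uint64		nrows = 0;
+
+	for (;;)
+	{
+		Datum		value;
+		bool		isnull;
+		ExprDoneCond isdone;
+
+		value = ExecMakeFunctionResultSet(fcache, econtext, argContext,
+										  &isnull, &isdone);
+		if (isdone == ExprEndResult)
+			break;				/* set exhausted (or strict SRF saw a NULL) */
+
+		/* value/isnull hold one element of the set here */
+		(void) value;
+		nrows++;
+
+		if (isdone == ExprSingleResult)
+			break;				/* function returned exactly one row */
+	}
+
+	return nrows;
+}
+#endif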
+
+
+/*
+ * init_sexpr - initialize a SetExprState node during first use
+ */
+static void
+init_sexpr(Oid foid, Oid input_collation, Expr *node,
+ SetExprState *sexpr, PlanState *parent,
+ MemoryContext sexprCxt, bool allowSRF, bool needDescForSRF)
+{
+ AclResult aclresult;
+ size_t numargs = list_length(sexpr->args);
+
+ /* Check permission to call function */
+ aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid));
+ InvokeFunctionExecuteHook(foid);
+
+ /*
+ * Safety check on nargs. Under normal circumstances this should never
+	 * fail, as the parser should check sooner.  But possibly it might fail
+	 * if the server has been compiled with FUNC_MAX_ARGS smaller than some
+	 * functions declared in pg_proc?
+ */
+ if (list_length(sexpr->args) > FUNC_MAX_ARGS)
+ ereport(ERROR,
+ (errcode(ERRCODE_TOO_MANY_ARGUMENTS),
+ errmsg_plural("cannot pass more than %d argument to a function",
+ "cannot pass more than %d arguments to a function",
+ FUNC_MAX_ARGS,
+ FUNC_MAX_ARGS)));
+
+ /* Set up the primary fmgr lookup information */
+ fmgr_info_cxt(foid, &(sexpr->func), sexprCxt);
+ fmgr_info_set_expr((Node *) sexpr->expr, &(sexpr->func));
+
+ /* Initialize the function call parameter struct as well */
+ sexpr->fcinfo =
+ (FunctionCallInfo) palloc(SizeForFunctionCallInfo(numargs));
+ InitFunctionCallInfoData(*sexpr->fcinfo, &(sexpr->func),
+ numargs,
+ input_collation, NULL, NULL);
+
+ /* If function returns set, check if that's allowed by caller */
+ if (sexpr->func.fn_retset && !allowSRF)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set"),
+ parent ? executor_errposition(parent->state,
+ exprLocation((Node *) node)) : 0));
+
+ /* Otherwise, caller should have marked the sexpr correctly */
+ Assert(sexpr->func.fn_retset == sexpr->funcReturnsSet);
+
+ /* If function returns set, prepare expected tuple descriptor */
+ if (sexpr->func.fn_retset && needDescForSRF)
+ {
+ TypeFuncClass functypclass;
+ Oid funcrettype;
+ TupleDesc tupdesc;
+ MemoryContext oldcontext;
+
+ functypclass = get_expr_result_type(sexpr->func.fn_expr,
+ &funcrettype,
+ &tupdesc);
+
+ /* Must save tupdesc in sexpr's context */
+ oldcontext = MemoryContextSwitchTo(sexprCxt);
+
+ if (functypclass == TYPEFUNC_COMPOSITE ||
+ functypclass == TYPEFUNC_COMPOSITE_DOMAIN)
+ {
+ /* Composite data type, e.g. a table's row type */
+ Assert(tupdesc);
+ /* Must copy it out of typcache for safety */
+ sexpr->funcResultDesc = CreateTupleDescCopy(tupdesc);
+ sexpr->funcReturnsTuple = true;
+ }
+ else if (functypclass == TYPEFUNC_SCALAR)
+ {
+ /* Base data type, i.e. scalar */
+ tupdesc = CreateTemplateTupleDesc(1);
+ TupleDescInitEntry(tupdesc,
+ (AttrNumber) 1,
+ NULL,
+ funcrettype,
+ -1,
+ 0);
+ sexpr->funcResultDesc = tupdesc;
+ sexpr->funcReturnsTuple = false;
+ }
+ else if (functypclass == TYPEFUNC_RECORD)
+ {
+ /* This will work if function doesn't need an expectedDesc */
+ sexpr->funcResultDesc = NULL;
+ sexpr->funcReturnsTuple = true;
+ }
+ else
+ {
+ /* Else, we will fail if function needs an expectedDesc */
+ sexpr->funcResultDesc = NULL;
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+ else
+ sexpr->funcResultDesc = NULL;
+
+ /* Initialize additional state */
+ sexpr->funcResultStore = NULL;
+ sexpr->funcResultSlot = NULL;
+ sexpr->shutdown_reg = false;
+}
+
+/*
+ * callback function in case a SetExprState needs to be shut down before it
+ * has been run to completion
+ */
+static void
+ShutdownSetExpr(Datum arg)
+{
+ SetExprState *sexpr = castNode(SetExprState, DatumGetPointer(arg));
+
+ /* If we have a slot, make sure it's let go of any tuplestore pointer */
+ if (sexpr->funcResultSlot)
+ ExecClearTuple(sexpr->funcResultSlot);
+
+ /* Release any open tuplestore */
+ if (sexpr->funcResultStore)
+ tuplestore_end(sexpr->funcResultStore);
+ sexpr->funcResultStore = NULL;
+
+ /* Clear any active set-argument state */
+ sexpr->setArgsValid = false;
+
+ /* execUtils will deregister the callback... */
+ sexpr->shutdown_reg = false;
+}
+
+/*
+ * Evaluate arguments for a function.
+ */
+static void
+ExecEvalFuncArgs(FunctionCallInfo fcinfo,
+ List *argList,
+ ExprContext *econtext)
+{
+ int i;
+ ListCell *arg;
+
+ i = 0;
+ foreach(arg, argList)
+ {
+ ExprState *argstate = (ExprState *) lfirst(arg);
+
+ fcinfo->args[i].value = ExecEvalExpr(argstate,
+ econtext,
+ &fcinfo->args[i].isnull);
+ i++;
+ }
+
+ Assert(i == fcinfo->nargs);
+}
+
+/*
+ * ExecPrepareTuplestoreResult
+ *
+ * Subroutine for ExecMakeFunctionResultSet: prepare to extract rows from a
+ * tuplestore function result. We must set up a funcResultSlot (unless
+ * already done in a previous call cycle) and verify that the function
+ * returned the expected tuple descriptor.
+ */
+static void
+ExecPrepareTuplestoreResult(SetExprState *sexpr,
+ ExprContext *econtext,
+ Tuplestorestate *resultStore,
+ TupleDesc resultDesc)
+{
+ sexpr->funcResultStore = resultStore;
+
+ if (sexpr->funcResultSlot == NULL)
+ {
+ /* Create a slot so we can read data out of the tuplestore */
+ TupleDesc slotDesc;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(sexpr->func.fn_mcxt);
+
+ /*
+ * If we were not able to determine the result rowtype from context,
+ * and the function didn't return a tupdesc, we have to fail.
+ */
+ if (sexpr->funcResultDesc)
+ slotDesc = sexpr->funcResultDesc;
+ else if (resultDesc)
+ {
+ /* don't assume resultDesc is long-lived */
+ slotDesc = CreateTupleDescCopy(resultDesc);
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("function returning setof record called in "
+ "context that cannot accept type record")));
+ slotDesc = NULL; /* keep compiler quiet */
+ }
+
+ sexpr->funcResultSlot = MakeSingleTupleTableSlot(slotDesc,
+ &TTSOpsMinimalTuple);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /*
+ * If function provided a tupdesc, cross-check it. We only really need to
+ * do this for functions returning RECORD, but might as well do it always.
+ */
+ if (resultDesc)
+ {
+ if (sexpr->funcResultDesc)
+ tupledesc_match(sexpr->funcResultDesc, resultDesc);
+
+ /*
+ * If it is a dynamically-allocated TupleDesc, free it: it is
+ * typically allocated in a per-query context, so we must avoid
+ * leaking it across multiple usages.
+ */
+ if (resultDesc->tdrefcount == -1)
+ FreeTupleDesc(resultDesc);
+ }
+
+ /* Register cleanup callback if we didn't already */
+ if (!sexpr->shutdown_reg)
+ {
+ RegisterExprContextCallback(econtext,
+ ShutdownSetExpr,
+ PointerGetDatum(sexpr));
+ sexpr->shutdown_reg = true;
+ }
+}
+
+/*
+ * Check that function result tuple type (src_tupdesc) matches or can
+ * be considered to match what the query expects (dst_tupdesc). If
+ * they don't match, ereport.
+ *
+ * We really only care about number of attributes and data type.
+ * Also, we can ignore type mismatch on columns that are dropped in the
+ * destination type, so long as the physical storage matches. This is
+ * helpful in some cases involving out-of-date cached plans.
+ */
+static void
+tupledesc_match(TupleDesc dst_tupdesc, TupleDesc src_tupdesc)
+{
+ int i;
+
+ if (dst_tupdesc->natts != src_tupdesc->natts)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("function return row and query-specified return row do not match"),
+ errdetail_plural("Returned row contains %d attribute, but query expects %d.",
+ "Returned row contains %d attributes, but query expects %d.",
+ src_tupdesc->natts,
+ src_tupdesc->natts, dst_tupdesc->natts)));
+
+ for (i = 0; i < dst_tupdesc->natts; i++)
+ {
+ Form_pg_attribute dattr = TupleDescAttr(dst_tupdesc, i);
+ Form_pg_attribute sattr = TupleDescAttr(src_tupdesc, i);
+
+ if (IsBinaryCoercible(sattr->atttypid, dattr->atttypid))
+ continue; /* no worries */
+ if (!dattr->attisdropped)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("function return row and query-specified return row do not match"),
+ errdetail("Returned type %s at ordinal position %d, but query expects %s.",
+ format_type_be(sattr->atttypid),
+ i + 1,
+ format_type_be(dattr->atttypid))));
+
+ if (dattr->attlen != sattr->attlen ||
+ dattr->attalign != sattr->attalign)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("function return row and query-specified return row do not match"),
+ errdetail("Physical storage mismatch on dropped attribute at ordinal position %d.",
+ i + 1)));
+ }
+}
diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c
new file mode 100644
index 0000000..69ab345
--- /dev/null
+++ b/src/backend/executor/execScan.c
@@ -0,0 +1,342 @@
+/*-------------------------------------------------------------------------
+ *
+ * execScan.c
+ * This code provides support for generalized relation scans. ExecScan
+ * is passed a node and a pointer to a function to "do the right thing"
+ * and return a tuple from the relation. ExecScan then does the tedious
+ * stuff - checking the qualification and projecting the tuple
+ * appropriately.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execScan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+
+/*
+ * ExecScanFetch -- check interrupts & fetch next potential tuple
+ *
+ * This routine is concerned with substituting a test tuple if we are
+ * inside an EvalPlanQual recheck. If we aren't, just execute
+ * the access method's next-tuple routine.
+ */
+static inline TupleTableSlot *
+ExecScanFetch(ScanState *node,
+ ExecScanAccessMtd accessMtd,
+ ExecScanRecheckMtd recheckMtd)
+{
+ EState *estate = node->ps.state;
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (estate->es_epq_active != NULL)
+ {
+ EPQState *epqstate = estate->es_epq_active;
+
+ /*
+ * We are inside an EvalPlanQual recheck. Return the test tuple if
+ * one is available, after rechecking any access-method-specific
+ * conditions.
+ */
+ Index scanrelid = ((Scan *) node->ps.plan)->scanrelid;
+
+ if (scanrelid == 0)
+ {
+ /*
+ * This is a ForeignScan or CustomScan which has pushed down a
+ * join to the remote side. The recheck method is responsible not
+ * only for rechecking the scan/join quals but also for storing
+ * the correct tuple in the slot.
+ */
+
+ TupleTableSlot *slot = node->ss_ScanTupleSlot;
+
+ if (!(*recheckMtd) (node, slot))
+ ExecClearTuple(slot); /* would not be returned by scan */
+ return slot;
+ }
+ else if (epqstate->relsubs_done[scanrelid - 1])
+ {
+ /*
+ * Return empty slot, as we already performed an EPQ substitution
+ * for this relation.
+ */
+
+ TupleTableSlot *slot = node->ss_ScanTupleSlot;
+
+ /* Return empty slot, as we already returned a tuple */
+ return ExecClearTuple(slot);
+ }
+ else if (epqstate->relsubs_slot[scanrelid - 1] != NULL)
+ {
+ /*
+ * Return replacement tuple provided by the EPQ caller.
+ */
+
+ TupleTableSlot *slot = epqstate->relsubs_slot[scanrelid - 1];
+
+ Assert(epqstate->relsubs_rowmark[scanrelid - 1] == NULL);
+
+ /* Mark to remember that we shouldn't return more */
+ epqstate->relsubs_done[scanrelid - 1] = true;
+
+ /* Return empty slot if we haven't got a test tuple */
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* Check if it meets the access-method conditions */
+ if (!(*recheckMtd) (node, slot))
+ return ExecClearTuple(slot); /* would not be returned by
+ * scan */
+ return slot;
+ }
+ else if (epqstate->relsubs_rowmark[scanrelid - 1] != NULL)
+ {
+ /*
+ * Fetch and return replacement tuple using a non-locking rowmark.
+ */
+
+ TupleTableSlot *slot = node->ss_ScanTupleSlot;
+
+ /* Mark to remember that we shouldn't return more */
+ epqstate->relsubs_done[scanrelid - 1] = true;
+
+ if (!EvalPlanQualFetchRowMark(epqstate, scanrelid, slot))
+ return NULL;
+
+ /* Return empty slot if we haven't got a test tuple */
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* Check if it meets the access-method conditions */
+ if (!(*recheckMtd) (node, slot))
+ return ExecClearTuple(slot); /* would not be returned by
+ * scan */
+ return slot;
+ }
+ }
+
+ /*
+ * Run the node-type-specific access method function to get the next tuple
+ */
+ return (*accessMtd) (node);
+}
+
+/* ----------------------------------------------------------------
+ * ExecScan
+ *
+ * Scans the relation using the 'access method' indicated and
+ * returns the next qualifying tuple.
+ * The access method returns the next tuple and ExecScan() is
+ * responsible for checking the tuple returned against the qual-clause.
+ *
+ * A 'recheck method' must also be provided that can check an
+ * arbitrary tuple of the relation against any qual conditions
+ * that are implemented internal to the access method.
+ *
+ * Conditions:
+ * -- the "cursor" maintained by the AMI is positioned at the tuple
+ * returned previously.
+ *
+ * Initial States:
+ * -- the relation indicated is opened for scanning so that the
+ * "cursor" is positioned before the first qualifying tuple.
+ * ----------------------------------------------------------------
+ */
+TupleTableSlot *
+ExecScan(ScanState *node,
+ ExecScanAccessMtd accessMtd, /* function returning a tuple */
+ ExecScanRecheckMtd recheckMtd)
+{
+ ExprContext *econtext;
+ ExprState *qual;
+ ProjectionInfo *projInfo;
+
+ /*
+ * Fetch data from node
+ */
+ qual = node->ps.qual;
+ projInfo = node->ps.ps_ProjInfo;
+ econtext = node->ps.ps_ExprContext;
+
+ /* interrupt checks are in ExecScanFetch */
+
+ /*
+ * If we have neither a qual to check nor a projection to do, just skip
+ * all the overhead and return the raw scan tuple.
+ */
+ if (!qual && !projInfo)
+ {
+ ResetExprContext(econtext);
+ return ExecScanFetch(node, accessMtd, recheckMtd);
+ }
+
+ /*
+ * Reset per-tuple memory context to free any expression evaluation
+ * storage allocated in the previous tuple cycle.
+ */
+ ResetExprContext(econtext);
+
+ /*
+ * get a tuple from the access method. Loop until we obtain a tuple that
+ * passes the qualification.
+ */
+ for (;;)
+ {
+ TupleTableSlot *slot;
+
+ slot = ExecScanFetch(node, accessMtd, recheckMtd);
+
+ /*
+ * if the slot returned by the accessMtd contains NULL, then it means
+ * there is nothing more to scan so we just return an empty slot,
+ * being careful to use the projection result slot so it has correct
+ * tupleDesc.
+ */
+ if (TupIsNull(slot))
+ {
+ if (projInfo)
+ return ExecClearTuple(projInfo->pi_state.resultslot);
+ else
+ return slot;
+ }
+
+ /*
+ * place the current tuple into the expr context
+ */
+ econtext->ecxt_scantuple = slot;
+
+ /*
+ * check that the current tuple satisfies the qual-clause
+ *
+ * check for non-null qual here to avoid a function call to ExecQual()
+ * when the qual is null ... saves only a few cycles, but they add up
+ * ...
+ */
+ if (qual == NULL || ExecQual(qual, econtext))
+ {
+ /*
+ * Found a satisfactory scan tuple.
+ */
+ if (projInfo)
+ {
+ /*
+ * Form a projection tuple, store it in the result tuple slot
+ * and return it.
+ */
+ return ExecProject(projInfo);
+ }
+ else
+ {
+ /*
+ * Here, we aren't projecting, so just return scan tuple.
+ */
+ return slot;
+ }
+ }
+ else
+ InstrCountFiltered1(node, 1);
+
+ /*
+ * Tuple fails qual, so free per-tuple memory and try again.
+ */
+ ResetExprContext(econtext);
+ }
+}
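+
+/*
+ * Editor's note: the sketch below is illustrative only and not part of the
+ * upstream file.  It shows how a scan node type typically plugs into
+ * ExecScan(): supply an access method that fetches the next tuple into the
+ * scan slot, and a recheck method for any quals implemented inside the
+ * access method.  The Example* names are hypothetical; nodeSeqscan.c follows
+ * this same pattern with SeqNext() and SeqRecheck().  The access-method
+ * fetch itself is elided.
+ */
+#ifdef NOT_USED
+static TupleTableSlot *
+ExampleScanNext(ScanState *node)
+{
+	TupleTableSlot *slot = node->ss_ScanTupleSlot;
+
+	/* ... fetch the next tuple from the access method into "slot", or ... */
+	/* ... return ExecClearTuple(slot) once the scan is exhausted ...       */
+	return slot;
+}
+
+static bool
+ExampleScanRecheck(ScanState *node, TupleTableSlot *slot)
+{
+	/* no access-method-specific quals to recheck in this example */
+	return true;
+}
+
+static TupleTableSlot *
+ExecExampleScan(PlanState *pstate)
+{
+	ScanState  *node = (ScanState *) pstate;
+
+	return ExecScan(node,
+					(ExecScanAccessMtd) ExampleScanNext,
+					(ExecScanRecheckMtd) ExampleScanRecheck);
+}
+#endif							/* NOT_USED */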
+
+/*
+ * ExecAssignScanProjectionInfo
+ * Set up projection info for a scan node, if necessary.
+ *
+ * We can avoid a projection step if the requested tlist exactly matches
+ * the underlying tuple type. If so, we just set ps_ProjInfo to NULL.
+ * Note that this case occurs not only for simple "SELECT * FROM ...", but
+ * also in most cases where there are joins or other processing nodes above
+ * the scan node, because the planner will preferentially generate a matching
+ * tlist.
+ *
+ * The scan slot's descriptor must have been set already.
+ */
+void
+ExecAssignScanProjectionInfo(ScanState *node)
+{
+ Scan *scan = (Scan *) node->ps.plan;
+ TupleDesc tupdesc = node->ss_ScanTupleSlot->tts_tupleDescriptor;
+
+ ExecConditionalAssignProjectionInfo(&node->ps, tupdesc, scan->scanrelid);
+}
+
+/*
+ * ExecAssignScanProjectionInfoWithVarno
+ * As above, but caller can specify varno expected in Vars in the tlist.
+ */
+void
+ExecAssignScanProjectionInfoWithVarno(ScanState *node, Index varno)
+{
+ TupleDesc tupdesc = node->ss_ScanTupleSlot->tts_tupleDescriptor;
+
+ ExecConditionalAssignProjectionInfo(&node->ps, tupdesc, varno);
+}
+
+/*
+ * ExecScanReScan
+ *
+ * This must be called within the ReScan function of any plan node type
+ * that uses ExecScan().
+ */
+void
+ExecScanReScan(ScanState *node)
+{
+ EState *estate = node->ps.state;
+
+ /*
+ * We must clear the scan tuple so that observers (e.g., execCurrent.c)
+ * can tell that this plan node is not positioned on a tuple.
+ */
+ ExecClearTuple(node->ss_ScanTupleSlot);
+
+ /* Rescan EvalPlanQual tuple if we're inside an EvalPlanQual recheck */
+ if (estate->es_epq_active != NULL)
+ {
+ EPQState *epqstate = estate->es_epq_active;
+ Index scanrelid = ((Scan *) node->ps.plan)->scanrelid;
+
+ if (scanrelid > 0)
+ epqstate->relsubs_done[scanrelid - 1] = false;
+ else
+ {
+ Bitmapset *relids;
+ int rtindex = -1;
+
+ /*
+ * If an FDW or custom scan provider has replaced the join with a
+ * scan, there are multiple RTIs; reset the epqScanDone flag for
+ * all of them.
+ */
+ if (IsA(node->ps.plan, ForeignScan))
+ relids = ((ForeignScan *) node->ps.plan)->fs_relids;
+ else if (IsA(node->ps.plan, CustomScan))
+ relids = ((CustomScan *) node->ps.plan)->custom_relids;
+ else
+ elog(ERROR, "unexpected scan node: %d",
+ (int) nodeTag(node->ps.plan));
+
+ while ((rtindex = bms_next_member(relids, rtindex)) >= 0)
+ {
+ Assert(rtindex > 0);
+ epqstate->relsubs_done[rtindex - 1] = false;
+ }
+ }
+ }
+}
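+
+/*
+ * Editor's note: illustrative sketch, not part of the upstream file.  A
+ * node's ReScan routine resets whatever its access method needs and then
+ * calls ExecScanReScan() so that the EPQ bookkeeping handled above is
+ * cleared as well; ExecReScanSeqScan() in nodeSeqscan.c has this shape.
+ * The Example name is hypothetical.
+ */
+#ifdef NOT_USED
+static void
+ExecReScanExampleScan(ScanState *node)
+{
+	/* ... reset or restart the underlying access-method scan here ... */
+
+	ExecScanReScan(node);
+}
+#endif							/* NOT_USED */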
diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
new file mode 100644
index 0000000..5004b3b
--- /dev/null
+++ b/src/backend/executor/execTuples.c
@@ -0,0 +1,2339 @@
+/*-------------------------------------------------------------------------
+ *
+ * execTuples.c
+ * Routines dealing with TupleTableSlots. These are used for resource
+ * management associated with tuples (eg, releasing buffer pins for
+ * tuples in disk buffers, or freeing the memory occupied by transient
+ * tuples). Slots also provide access abstraction that lets us implement
+ * "virtual" tuples to reduce data-copying overhead.
+ *
+ * Routines dealing with the type information for tuples. Currently,
+ * the type information for a tuple is an array of FormData_pg_attribute.
+ * This information is needed by routines manipulating tuples
+ * (getattribute, formtuple, etc.).
+ *
+ *
+ * EXAMPLE OF HOW TABLE ROUTINES WORK
+ * Suppose we have a query such as SELECT emp.name FROM emp and we have
+ * a single SeqScan node in the query plan.
+ *
+ * At ExecutorStart()
+ * ----------------
+ *
+ * - ExecInitSeqScan() calls ExecInitScanTupleSlot() to construct a
+ *	   TupleTableSlot for the tuples returned by the access method, and
+ * ExecInitResultTypeTL() to define the node's return
+ * type. ExecAssignScanProjectionInfo() will, if necessary, create
+ * another TupleTableSlot for the tuples resulting from performing
+ * target list projections.
+ *
+ * During ExecutorRun()
+ * ----------------
+ * - SeqNext() calls ExecStoreBufferHeapTuple() to place the tuple
+ * returned by the access method into the scan tuple slot.
+ *
+ * - ExecSeqScan() (via ExecScan), if necessary, calls ExecProject(),
+ * putting the result of the projection in the result tuple slot. If
+ * not necessary, it directly returns the slot returned by SeqNext().
+ *
+ * - ExecutePlan() calls the output function.
+ *
+ * The important thing to watch in the executor code is how pointers
+ * to the slots containing tuples are passed instead of the tuples
+ * themselves. This facilitates the communication of related information
+ * (such as whether or not a tuple should be pfreed, what buffer contains
+ * this tuple, the tuple's tuple descriptor, etc). It also allows us
+ * to avoid physically constructing projection tuples in many cases.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execTuples.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heaptoast.h"
+#include "access/htup_details.h"
+#include "access/tupdesc_details.h"
+#include "catalog/pg_type.h"
+#include "funcapi.h"
+#include "nodes/nodeFuncs.h"
+#include "storage/bufmgr.h"
+#include "utils/builtins.h"
+#include "utils/expandeddatum.h"
+#include "utils/lsyscache.h"
+#include "utils/typcache.h"
+
+static TupleDesc ExecTypeFromTLInternal(List *targetList,
+ bool skipjunk);
+static pg_attribute_always_inline void slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp,
+ int natts);
+static inline void tts_buffer_heap_store_tuple(TupleTableSlot *slot,
+ HeapTuple tuple,
+ Buffer buffer,
+ bool transfer_pin);
+static void tts_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, bool shouldFree);
+
+
+const TupleTableSlotOps TTSOpsVirtual;
+const TupleTableSlotOps TTSOpsHeapTuple;
+const TupleTableSlotOps TTSOpsMinimalTuple;
+const TupleTableSlotOps TTSOpsBufferHeapTuple;
+
+
+/*
+ * TupleTableSlotOps implementations.
+ */
+
+/*
+ * TupleTableSlotOps implementation for VirtualTupleTableSlot.
+ */
+static void
+tts_virtual_init(TupleTableSlot *slot)
+{
+}
+
+static void
+tts_virtual_release(TupleTableSlot *slot)
+{
+}
+
+static void
+tts_virtual_clear(TupleTableSlot *slot)
+{
+ if (unlikely(TTS_SHOULDFREE(slot)))
+ {
+ VirtualTupleTableSlot *vslot = (VirtualTupleTableSlot *) slot;
+
+ pfree(vslot->data);
+ vslot->data = NULL;
+
+ slot->tts_flags &= ~TTS_FLAG_SHOULDFREE;
+ }
+
+ slot->tts_nvalid = 0;
+ slot->tts_flags |= TTS_FLAG_EMPTY;
+ ItemPointerSetInvalid(&slot->tts_tid);
+}
+
+/*
+ * VirtualTupleTableSlots always have fully populated tts_values and
+ * tts_isnull arrays. So this function should never be called.
+ */
+static void
+tts_virtual_getsomeattrs(TupleTableSlot *slot, int natts)
+{
+ elog(ERROR, "getsomeattrs is not required to be called on a virtual tuple table slot");
+}
+
+/*
+ * VirtualTupleTableSlots never provide system attributes (except those
+ * handled generically, such as tableoid). We generally shouldn't get
+ * here, but provide a user-friendly message if we do.
+ */
+static Datum
+tts_virtual_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull)
+{
+ Assert(!TTS_EMPTY(slot));
+
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot retrieve a system column in this context")));
+
+ return 0; /* silence compiler warnings */
+}
+
+/*
+ * To materialize a virtual slot all the datums that aren't passed by value
+ * have to be copied into the slot's memory context. To do so, compute the
+ * required size, and allocate enough memory to store all attributes. That's
+ * good for cache hit ratio, but more importantly requires only a single
+ * memory allocation and deallocation.
+ */
+static void
+tts_virtual_materialize(TupleTableSlot *slot)
+{
+ VirtualTupleTableSlot *vslot = (VirtualTupleTableSlot *) slot;
+ TupleDesc desc = slot->tts_tupleDescriptor;
+ Size sz = 0;
+ char *data;
+
+ /* already materialized */
+ if (TTS_SHOULDFREE(slot))
+ return;
+
+ /* compute size of memory required */
+ for (int natt = 0; natt < desc->natts; natt++)
+ {
+ Form_pg_attribute att = TupleDescAttr(desc, natt);
+ Datum val;
+
+ if (att->attbyval || slot->tts_isnull[natt])
+ continue;
+
+ val = slot->tts_values[natt];
+
+ if (att->attlen == -1 &&
+ VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val)))
+ {
+ /*
+ * We want to flatten the expanded value so that the materialized
+ * slot doesn't depend on it.
+ */
+ sz = att_align_nominal(sz, att->attalign);
+ sz += EOH_get_flat_size(DatumGetEOHP(val));
+ }
+ else
+ {
+ sz = att_align_nominal(sz, att->attalign);
+ sz = att_addlength_datum(sz, att->attlen, val);
+ }
+ }
+
+ /* all data is byval */
+ if (sz == 0)
+ return;
+
+ /* allocate memory */
+ vslot->data = data = MemoryContextAlloc(slot->tts_mcxt, sz);
+ slot->tts_flags |= TTS_FLAG_SHOULDFREE;
+
+ /* and copy all attributes into the pre-allocated space */
+ for (int natt = 0; natt < desc->natts; natt++)
+ {
+ Form_pg_attribute att = TupleDescAttr(desc, natt);
+ Datum val;
+
+ if (att->attbyval || slot->tts_isnull[natt])
+ continue;
+
+ val = slot->tts_values[natt];
+
+ if (att->attlen == -1 &&
+ VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val)))
+ {
+ Size data_length;
+
+ /*
+ * We want to flatten the expanded value so that the materialized
+ * slot doesn't depend on it.
+ */
+ ExpandedObjectHeader *eoh = DatumGetEOHP(val);
+
+ data = (char *) att_align_nominal(data,
+ att->attalign);
+ data_length = EOH_get_flat_size(eoh);
+ EOH_flatten_into(eoh, data, data_length);
+
+ slot->tts_values[natt] = PointerGetDatum(data);
+ data += data_length;
+ }
+ else
+ {
+ Size data_length = 0;
+
+ data = (char *) att_align_nominal(data, att->attalign);
+ data_length = att_addlength_datum(data_length, att->attlen, val);
+
+ memcpy(data, DatumGetPointer(val), data_length);
+
+ slot->tts_values[natt] = PointerGetDatum(data);
+ data += data_length;
+ }
+ }
+}
+
+static void
+tts_virtual_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot)
+{
+ TupleDesc srcdesc = srcslot->tts_tupleDescriptor;
+
+ Assert(srcdesc->natts <= dstslot->tts_tupleDescriptor->natts);
+
+ tts_virtual_clear(dstslot);
+
+ slot_getallattrs(srcslot);
+
+ for (int natt = 0; natt < srcdesc->natts; natt++)
+ {
+ dstslot->tts_values[natt] = srcslot->tts_values[natt];
+ dstslot->tts_isnull[natt] = srcslot->tts_isnull[natt];
+ }
+
+ dstslot->tts_nvalid = srcdesc->natts;
+ dstslot->tts_flags &= ~TTS_FLAG_EMPTY;
+
+ /* make sure storage doesn't depend on external memory */
+ tts_virtual_materialize(dstslot);
+}
+
+static HeapTuple
+tts_virtual_copy_heap_tuple(TupleTableSlot *slot)
+{
+ Assert(!TTS_EMPTY(slot));
+
+ return heap_form_tuple(slot->tts_tupleDescriptor,
+ slot->tts_values,
+ slot->tts_isnull);
+}
+
+static MinimalTuple
+tts_virtual_copy_minimal_tuple(TupleTableSlot *slot)
+{
+ Assert(!TTS_EMPTY(slot));
+
+ return heap_form_minimal_tuple(slot->tts_tupleDescriptor,
+ slot->tts_values,
+ slot->tts_isnull);
+}
+
+
+/*
+ * TupleTableSlotOps implementation for HeapTupleTableSlot.
+ */
+
+static void
+tts_heap_init(TupleTableSlot *slot)
+{
+}
+
+static void
+tts_heap_release(TupleTableSlot *slot)
+{
+}
+
+static void
+tts_heap_clear(TupleTableSlot *slot)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+
+ /* Free the memory for the heap tuple if it's allowed. */
+ if (TTS_SHOULDFREE(slot))
+ {
+ heap_freetuple(hslot->tuple);
+ slot->tts_flags &= ~TTS_FLAG_SHOULDFREE;
+ }
+
+ slot->tts_nvalid = 0;
+ slot->tts_flags |= TTS_FLAG_EMPTY;
+ ItemPointerSetInvalid(&slot->tts_tid);
+ hslot->off = 0;
+ hslot->tuple = NULL;
+}
+
+static void
+tts_heap_getsomeattrs(TupleTableSlot *slot, int natts)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ slot_deform_heap_tuple(slot, hslot->tuple, &hslot->off, natts);
+}
+
+static Datum
+tts_heap_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ /*
+ * In some code paths it's possible to get here with a non-materialized
+ * slot, in which case we can't retrieve system columns.
+ */
+ if (!hslot->tuple)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot retrieve a system column in this context")));
+
+ return heap_getsysattr(hslot->tuple, attnum,
+ slot->tts_tupleDescriptor, isnull);
+}
+
+static void
+tts_heap_materialize(TupleTableSlot *slot)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+ MemoryContext oldContext;
+
+ Assert(!TTS_EMPTY(slot));
+
+ /* If slot has its tuple already materialized, nothing to do. */
+ if (TTS_SHOULDFREE(slot))
+ return;
+
+ oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
+
+ /*
+ * Have to deform from scratch, otherwise tts_values[] entries could point
+ * into the non-materialized tuple (which might be gone when accessed).
+ */
+ slot->tts_nvalid = 0;
+ hslot->off = 0;
+
+ if (!hslot->tuple)
+ hslot->tuple = heap_form_tuple(slot->tts_tupleDescriptor,
+ slot->tts_values,
+ slot->tts_isnull);
+ else
+ {
+ /*
+ * The tuple contained in this slot is not allocated in the memory
+ * context of the given slot (else it would have TTS_SHOULDFREE set).
+ * Copy the tuple into the given slot's memory context.
+ */
+ hslot->tuple = heap_copytuple(hslot->tuple);
+ }
+
+ slot->tts_flags |= TTS_FLAG_SHOULDFREE;
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+static void
+tts_heap_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot)
+{
+ HeapTuple tuple;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(dstslot->tts_mcxt);
+ tuple = ExecCopySlotHeapTuple(srcslot);
+ MemoryContextSwitchTo(oldcontext);
+
+ ExecStoreHeapTuple(tuple, dstslot, true);
+}
+
+static HeapTuple
+tts_heap_get_heap_tuple(TupleTableSlot *slot)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+ if (!hslot->tuple)
+ tts_heap_materialize(slot);
+
+ return hslot->tuple;
+}
+
+static HeapTuple
+tts_heap_copy_heap_tuple(TupleTableSlot *slot)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+ if (!hslot->tuple)
+ tts_heap_materialize(slot);
+
+ return heap_copytuple(hslot->tuple);
+}
+
+static MinimalTuple
+tts_heap_copy_minimal_tuple(TupleTableSlot *slot)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+
+ if (!hslot->tuple)
+ tts_heap_materialize(slot);
+
+ return minimal_tuple_from_heap_tuple(hslot->tuple);
+}
+
+static void
+tts_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, bool shouldFree)
+{
+ HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
+
+ tts_heap_clear(slot);
+
+ slot->tts_nvalid = 0;
+ hslot->tuple = tuple;
+ hslot->off = 0;
+ slot->tts_flags &= ~(TTS_FLAG_EMPTY | TTS_FLAG_SHOULDFREE);
+ slot->tts_tid = tuple->t_self;
+
+ if (shouldFree)
+ slot->tts_flags |= TTS_FLAG_SHOULDFREE;
+}
+
+
+/*
+ * TupleTableSlotOps implementation for MinimalTupleTableSlot.
+ */
+
+static void
+tts_minimal_init(TupleTableSlot *slot)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+
+ /*
+ * Initialize the heap tuple pointer to access attributes of the minimal
+	 * tuple contained in the slot as if it were a heap tuple.
+ */
+ mslot->tuple = &mslot->minhdr;
+}
+
+static void
+tts_minimal_release(TupleTableSlot *slot)
+{
+}
+
+static void
+tts_minimal_clear(TupleTableSlot *slot)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+
+ if (TTS_SHOULDFREE(slot))
+ {
+ heap_free_minimal_tuple(mslot->mintuple);
+ slot->tts_flags &= ~TTS_FLAG_SHOULDFREE;
+ }
+
+ slot->tts_nvalid = 0;
+ slot->tts_flags |= TTS_FLAG_EMPTY;
+ ItemPointerSetInvalid(&slot->tts_tid);
+ mslot->off = 0;
+ mslot->mintuple = NULL;
+}
+
+static void
+tts_minimal_getsomeattrs(TupleTableSlot *slot, int natts)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ slot_deform_heap_tuple(slot, mslot->tuple, &mslot->off, natts);
+}
+
+static Datum
+tts_minimal_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull)
+{
+ Assert(!TTS_EMPTY(slot));
+
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot retrieve a system column in this context")));
+
+ return 0; /* silence compiler warnings */
+}
+
+static void
+tts_minimal_materialize(TupleTableSlot *slot)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+ MemoryContext oldContext;
+
+ Assert(!TTS_EMPTY(slot));
+
+ /* If slot has its tuple already materialized, nothing to do. */
+ if (TTS_SHOULDFREE(slot))
+ return;
+
+ oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
+
+ /*
+ * Have to deform from scratch, otherwise tts_values[] entries could point
+ * into the non-materialized tuple (which might be gone when accessed).
+ */
+ slot->tts_nvalid = 0;
+ mslot->off = 0;
+
+ if (!mslot->mintuple)
+ {
+ mslot->mintuple = heap_form_minimal_tuple(slot->tts_tupleDescriptor,
+ slot->tts_values,
+ slot->tts_isnull);
+ }
+ else
+ {
+ /*
+ * The minimal tuple contained in this slot is not allocated in the
+ * memory context of the given slot (else it would have TTS_SHOULDFREE
+ * set). Copy the minimal tuple into the given slot's memory context.
+ */
+ mslot->mintuple = heap_copy_minimal_tuple(mslot->mintuple);
+ }
+
+ slot->tts_flags |= TTS_FLAG_SHOULDFREE;
+
+ Assert(mslot->tuple == &mslot->minhdr);
+
+ mslot->minhdr.t_len = mslot->mintuple->t_len + MINIMAL_TUPLE_OFFSET;
+ mslot->minhdr.t_data = (HeapTupleHeader) ((char *) mslot->mintuple - MINIMAL_TUPLE_OFFSET);
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+static void
+tts_minimal_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot)
+{
+ MemoryContext oldcontext;
+ MinimalTuple mintuple;
+
+ oldcontext = MemoryContextSwitchTo(dstslot->tts_mcxt);
+ mintuple = ExecCopySlotMinimalTuple(srcslot);
+ MemoryContextSwitchTo(oldcontext);
+
+ ExecStoreMinimalTuple(mintuple, dstslot, true);
+}
+
+static MinimalTuple
+tts_minimal_get_minimal_tuple(TupleTableSlot *slot)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+
+ if (!mslot->mintuple)
+ tts_minimal_materialize(slot);
+
+ return mslot->mintuple;
+}
+
+static HeapTuple
+tts_minimal_copy_heap_tuple(TupleTableSlot *slot)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+
+ if (!mslot->mintuple)
+ tts_minimal_materialize(slot);
+
+ return heap_tuple_from_minimal_tuple(mslot->mintuple);
+}
+
+static MinimalTuple
+tts_minimal_copy_minimal_tuple(TupleTableSlot *slot)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+
+ if (!mslot->mintuple)
+ tts_minimal_materialize(slot);
+
+ return heap_copy_minimal_tuple(mslot->mintuple);
+}
+
+static void
+tts_minimal_store_tuple(TupleTableSlot *slot, MinimalTuple mtup, bool shouldFree)
+{
+ MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
+
+ tts_minimal_clear(slot);
+
+ Assert(!TTS_SHOULDFREE(slot));
+ Assert(TTS_EMPTY(slot));
+
+ slot->tts_flags &= ~TTS_FLAG_EMPTY;
+ slot->tts_nvalid = 0;
+ mslot->off = 0;
+
+ mslot->mintuple = mtup;
+ Assert(mslot->tuple == &mslot->minhdr);
+ mslot->minhdr.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET;
+ mslot->minhdr.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET);
+ /* no need to set t_self or t_tableOid since we won't allow access */
+
+ if (shouldFree)
+ slot->tts_flags |= TTS_FLAG_SHOULDFREE;
+}
+
+
+/*
+ * TupleTableSlotOps implementation for BufferHeapTupleTableSlot.
+ */
+
+static void
+tts_buffer_heap_init(TupleTableSlot *slot)
+{
+}
+
+static void
+tts_buffer_heap_release(TupleTableSlot *slot)
+{
+}
+
+static void
+tts_buffer_heap_clear(TupleTableSlot *slot)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ /*
+ * Free the memory for heap tuple if allowed. A tuple coming from buffer
+ * can never be freed. But we may have materialized a tuple from buffer.
+ * Such a tuple can be freed.
+ */
+ if (TTS_SHOULDFREE(slot))
+ {
+ /* We should have unpinned the buffer while materializing the tuple. */
+ Assert(!BufferIsValid(bslot->buffer));
+
+ heap_freetuple(bslot->base.tuple);
+ slot->tts_flags &= ~TTS_FLAG_SHOULDFREE;
+ }
+
+ if (BufferIsValid(bslot->buffer))
+ ReleaseBuffer(bslot->buffer);
+
+ slot->tts_nvalid = 0;
+ slot->tts_flags |= TTS_FLAG_EMPTY;
+ ItemPointerSetInvalid(&slot->tts_tid);
+ bslot->base.tuple = NULL;
+ bslot->base.off = 0;
+ bslot->buffer = InvalidBuffer;
+}
+
+static void
+tts_buffer_heap_getsomeattrs(TupleTableSlot *slot, int natts)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ slot_deform_heap_tuple(slot, bslot->base.tuple, &bslot->base.off, natts);
+}
+
+static Datum
+tts_buffer_heap_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ /*
+ * In some code paths it's possible to get here with a non-materialized
+ * slot, in which case we can't retrieve system columns.
+ */
+ if (!bslot->base.tuple)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot retrieve a system column in this context")));
+
+ return heap_getsysattr(bslot->base.tuple, attnum,
+ slot->tts_tupleDescriptor, isnull);
+}
+
+static void
+tts_buffer_heap_materialize(TupleTableSlot *slot)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+ MemoryContext oldContext;
+
+ Assert(!TTS_EMPTY(slot));
+
+ /* If slot has its tuple already materialized, nothing to do. */
+ if (TTS_SHOULDFREE(slot))
+ return;
+
+ oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
+
+ /*
+ * Have to deform from scratch, otherwise tts_values[] entries could point
+ * into the non-materialized tuple (which might be gone when accessed).
+ */
+ bslot->base.off = 0;
+ slot->tts_nvalid = 0;
+
+ if (!bslot->base.tuple)
+ {
+ /*
+ * Normally BufferHeapTupleTableSlot should have a tuple + buffer
+ * associated with it, unless it's materialized (which would've
+		 * returned above). But it can be useful to allow storing virtual
+		 * tuples in a buffer slot, and such a slot then also needs to be
+		 * materializable.
+ */
+ bslot->base.tuple = heap_form_tuple(slot->tts_tupleDescriptor,
+ slot->tts_values,
+ slot->tts_isnull);
+ }
+ else
+ {
+ bslot->base.tuple = heap_copytuple(bslot->base.tuple);
+
+ /*
+ * A heap tuple stored in a BufferHeapTupleTableSlot should have a
+ * buffer associated with it, unless it's materialized or virtual.
+ */
+ if (likely(BufferIsValid(bslot->buffer)))
+ ReleaseBuffer(bslot->buffer);
+ bslot->buffer = InvalidBuffer;
+ }
+
+ /*
+ * We don't set TTS_FLAG_SHOULDFREE until after releasing the buffer, if
+ * any. This avoids having a transient state that would fall foul of our
+ * assertions that a slot with TTS_FLAG_SHOULDFREE doesn't own a buffer.
+ * In the unlikely event that ReleaseBuffer() above errors out, we'd
+ * effectively leak the copied tuple, but that seems fairly harmless.
+ */
+ slot->tts_flags |= TTS_FLAG_SHOULDFREE;
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+static void
+tts_buffer_heap_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot)
+{
+ BufferHeapTupleTableSlot *bsrcslot = (BufferHeapTupleTableSlot *) srcslot;
+ BufferHeapTupleTableSlot *bdstslot = (BufferHeapTupleTableSlot *) dstslot;
+
+ /*
+ * If the source slot is of a different kind, or is a buffer slot that has
+ * been materialized / is virtual, make a new copy of the tuple. Otherwise
+ * make a new reference to the in-buffer tuple.
+ */
+ if (dstslot->tts_ops != srcslot->tts_ops ||
+ TTS_SHOULDFREE(srcslot) ||
+ !bsrcslot->base.tuple)
+ {
+ MemoryContext oldContext;
+
+ ExecClearTuple(dstslot);
+ dstslot->tts_flags &= ~TTS_FLAG_EMPTY;
+ oldContext = MemoryContextSwitchTo(dstslot->tts_mcxt);
+ bdstslot->base.tuple = ExecCopySlotHeapTuple(srcslot);
+ dstslot->tts_flags |= TTS_FLAG_SHOULDFREE;
+ MemoryContextSwitchTo(oldContext);
+ }
+ else
+ {
+ Assert(BufferIsValid(bsrcslot->buffer));
+
+ tts_buffer_heap_store_tuple(dstslot, bsrcslot->base.tuple,
+ bsrcslot->buffer, false);
+
+ /*
+ * The HeapTupleData portion of the source tuple might be shorter
+ * lived than the destination slot. Therefore copy the HeapTuple into
+ * our slot's tupdata, which is guaranteed to live long enough (but
+ * will still point into the buffer).
+ */
+ memcpy(&bdstslot->base.tupdata, bdstslot->base.tuple, sizeof(HeapTupleData));
+ bdstslot->base.tuple = &bdstslot->base.tupdata;
+ }
+}
+
+static HeapTuple
+tts_buffer_heap_get_heap_tuple(TupleTableSlot *slot)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ if (!bslot->base.tuple)
+ tts_buffer_heap_materialize(slot);
+
+ return bslot->base.tuple;
+}
+
+static HeapTuple
+tts_buffer_heap_copy_heap_tuple(TupleTableSlot *slot)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ if (!bslot->base.tuple)
+ tts_buffer_heap_materialize(slot);
+
+ return heap_copytuple(bslot->base.tuple);
+}
+
+static MinimalTuple
+tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ Assert(!TTS_EMPTY(slot));
+
+ if (!bslot->base.tuple)
+ tts_buffer_heap_materialize(slot);
+
+ return minimal_tuple_from_heap_tuple(bslot->base.tuple);
+}
+
+static inline void
+tts_buffer_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple,
+ Buffer buffer, bool transfer_pin)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ if (TTS_SHOULDFREE(slot))
+ {
+ /* materialized slot shouldn't have a buffer to release */
+ Assert(!BufferIsValid(bslot->buffer));
+
+ heap_freetuple(bslot->base.tuple);
+ slot->tts_flags &= ~TTS_FLAG_SHOULDFREE;
+ }
+
+ slot->tts_flags &= ~TTS_FLAG_EMPTY;
+ slot->tts_nvalid = 0;
+ bslot->base.tuple = tuple;
+ bslot->base.off = 0;
+ slot->tts_tid = tuple->t_self;
+
+ /*
+ * If tuple is on a disk page, keep the page pinned as long as we hold a
+ * pointer into it. We assume the caller already has such a pin. If
+ * transfer_pin is true, we'll transfer that pin to this slot, if not
+ * we'll pin it again ourselves.
+ *
+ * This is coded to optimize the case where the slot previously held a
+ * tuple on the same disk page: in that case releasing and re-acquiring
+ * the pin is a waste of cycles. This is a common situation during
+ * seqscans, so it's worth troubling over.
+ */
+ if (bslot->buffer != buffer)
+ {
+ if (BufferIsValid(bslot->buffer))
+ ReleaseBuffer(bslot->buffer);
+
+ bslot->buffer = buffer;
+
+ if (!transfer_pin && BufferIsValid(buffer))
+ IncrBufferRefCount(buffer);
+ }
+ else if (transfer_pin && BufferIsValid(buffer))
+ {
+ /*
+ * In transfer_pin mode the caller won't know about the same-page
+ * optimization, so we gotta release its pin.
+ */
+ ReleaseBuffer(buffer);
+ }
+}
+
+/*
+ * slot_deform_heap_tuple
+ * Given a TupleTableSlot, extract data from the slot's physical tuple
+ * into its Datum/isnull arrays. Data is extracted up through the
+ * natts'th column (caller must ensure this is a legal column number).
+ *
+ * This is essentially an incremental version of heap_deform_tuple:
+ * on each call we extract attributes up to the one needed, without
+ * re-computing information about previously extracted attributes.
+ * slot->tts_nvalid is the number of attributes already extracted.
+ *
+ * This is marked as always inline, so the different offp for different types
+ * of slots gets optimized away.
+ */
+static pg_attribute_always_inline void
+slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp,
+ int natts)
+{
+ TupleDesc tupleDesc = slot->tts_tupleDescriptor;
+ Datum *values = slot->tts_values;
+ bool *isnull = slot->tts_isnull;
+ HeapTupleHeader tup = tuple->t_data;
+ bool hasnulls = HeapTupleHasNulls(tuple);
+ int attnum;
+ char *tp; /* ptr to tuple data */
+ uint32 off; /* offset in tuple data */
+ bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */
+ bool slow; /* can we use/set attcacheoff? */
+
+ /* We can only fetch as many attributes as the tuple has. */
+ natts = Min(HeapTupleHeaderGetNatts(tuple->t_data), natts);
+
+ /*
+ * Check whether the first call for this tuple, and initialize or restore
+ * loop state.
+ */
+ attnum = slot->tts_nvalid;
+ if (attnum == 0)
+ {
+ /* Start from the first attribute */
+ off = 0;
+ slow = false;
+ }
+ else
+ {
+ /* Restore state from previous execution */
+ off = *offp;
+ slow = TTS_SLOW(slot);
+ }
+
+ tp = (char *) tup + tup->t_hoff;
+
+ for (; attnum < natts; attnum++)
+ {
+ Form_pg_attribute thisatt = TupleDescAttr(tupleDesc, attnum);
+
+ if (hasnulls && att_isnull(attnum, bp))
+ {
+ values[attnum] = (Datum) 0;
+ isnull[attnum] = true;
+ slow = true; /* can't use attcacheoff anymore */
+ continue;
+ }
+
+ isnull[attnum] = false;
+
+ if (!slow && thisatt->attcacheoff >= 0)
+ off = thisatt->attcacheoff;
+ else if (thisatt->attlen == -1)
+ {
+ /*
+ * We can only cache the offset for a varlena attribute if the
+ * offset is already suitably aligned, so that there would be no
+ * pad bytes in any case: then the offset will be valid for either
+ * an aligned or unaligned value.
+ */
+ if (!slow &&
+ off == att_align_nominal(off, thisatt->attalign))
+ thisatt->attcacheoff = off;
+ else
+ {
+ off = att_align_pointer(off, thisatt->attalign, -1,
+ tp + off);
+ slow = true;
+ }
+ }
+ else
+ {
+ /* not varlena, so safe to use att_align_nominal */
+ off = att_align_nominal(off, thisatt->attalign);
+
+ if (!slow)
+ thisatt->attcacheoff = off;
+ }
+
+ values[attnum] = fetchatt(thisatt, tp + off);
+
+ off = att_addlength_pointer(off, thisatt->attlen, tp + off);
+
+ if (thisatt->attlen <= 0)
+ slow = true; /* can't use attcacheoff anymore */
+ }
+
+ /*
+ * Save state for next execution
+ */
+ slot->tts_nvalid = attnum;
+ *offp = off;
+ if (slow)
+ slot->tts_flags |= TTS_FLAG_SLOW;
+ else
+ slot->tts_flags &= ~TTS_FLAG_SLOW;
+}
+
+
+const TupleTableSlotOps TTSOpsVirtual = {
+ .base_slot_size = sizeof(VirtualTupleTableSlot),
+ .init = tts_virtual_init,
+ .release = tts_virtual_release,
+ .clear = tts_virtual_clear,
+ .getsomeattrs = tts_virtual_getsomeattrs,
+ .getsysattr = tts_virtual_getsysattr,
+ .materialize = tts_virtual_materialize,
+ .copyslot = tts_virtual_copyslot,
+
+ /*
+ * A virtual tuple table slot can not "own" a heap tuple or a minimal
+ * tuple.
+ */
+ .get_heap_tuple = NULL,
+ .get_minimal_tuple = NULL,
+ .copy_heap_tuple = tts_virtual_copy_heap_tuple,
+ .copy_minimal_tuple = tts_virtual_copy_minimal_tuple
+};
+
+const TupleTableSlotOps TTSOpsHeapTuple = {
+ .base_slot_size = sizeof(HeapTupleTableSlot),
+ .init = tts_heap_init,
+ .release = tts_heap_release,
+ .clear = tts_heap_clear,
+ .getsomeattrs = tts_heap_getsomeattrs,
+ .getsysattr = tts_heap_getsysattr,
+ .materialize = tts_heap_materialize,
+ .copyslot = tts_heap_copyslot,
+ .get_heap_tuple = tts_heap_get_heap_tuple,
+
+ /* A heap tuple table slot can not "own" a minimal tuple. */
+ .get_minimal_tuple = NULL,
+ .copy_heap_tuple = tts_heap_copy_heap_tuple,
+ .copy_minimal_tuple = tts_heap_copy_minimal_tuple
+};
+
+const TupleTableSlotOps TTSOpsMinimalTuple = {
+ .base_slot_size = sizeof(MinimalTupleTableSlot),
+ .init = tts_minimal_init,
+ .release = tts_minimal_release,
+ .clear = tts_minimal_clear,
+ .getsomeattrs = tts_minimal_getsomeattrs,
+ .getsysattr = tts_minimal_getsysattr,
+ .materialize = tts_minimal_materialize,
+ .copyslot = tts_minimal_copyslot,
+
+ /* A minimal tuple table slot can not "own" a heap tuple. */
+ .get_heap_tuple = NULL,
+ .get_minimal_tuple = tts_minimal_get_minimal_tuple,
+ .copy_heap_tuple = tts_minimal_copy_heap_tuple,
+ .copy_minimal_tuple = tts_minimal_copy_minimal_tuple
+};
+
+const TupleTableSlotOps TTSOpsBufferHeapTuple = {
+ .base_slot_size = sizeof(BufferHeapTupleTableSlot),
+ .init = tts_buffer_heap_init,
+ .release = tts_buffer_heap_release,
+ .clear = tts_buffer_heap_clear,
+ .getsomeattrs = tts_buffer_heap_getsomeattrs,
+ .getsysattr = tts_buffer_heap_getsysattr,
+ .materialize = tts_buffer_heap_materialize,
+ .copyslot = tts_buffer_heap_copyslot,
+ .get_heap_tuple = tts_buffer_heap_get_heap_tuple,
+
+ /* A buffer heap tuple table slot can not "own" a minimal tuple. */
+ .get_minimal_tuple = NULL,
+ .copy_heap_tuple = tts_buffer_heap_copy_heap_tuple,
+ .copy_minimal_tuple = tts_buffer_heap_copy_minimal_tuple
+};
+
+
+/* ----------------------------------------------------------------
+ * tuple table create/delete functions
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * MakeTupleTableSlot
+ *
+ * Basic routine to make an empty TupleTableSlot of given
+ * TupleTableSlotType. If tupleDesc is specified the slot's descriptor is
+ * fixed for its lifetime, gaining some efficiency. If that's
+ * undesirable, pass NULL.
+ * --------------------------------
+ */
+TupleTableSlot *
+MakeTupleTableSlot(TupleDesc tupleDesc,
+ const TupleTableSlotOps *tts_ops)
+{
+ Size basesz,
+ allocsz;
+ TupleTableSlot *slot;
+
+ basesz = tts_ops->base_slot_size;
+
+ /*
+ * When a fixed descriptor is specified, we can reduce overhead by
+ * allocating the entire slot in one go.
+ */
+ if (tupleDesc)
+ allocsz = MAXALIGN(basesz) +
+ MAXALIGN(tupleDesc->natts * sizeof(Datum)) +
+ MAXALIGN(tupleDesc->natts * sizeof(bool));
+ else
+ allocsz = basesz;
+
+ slot = palloc0(allocsz);
+ /* const for optimization purposes, OK to modify at allocation time */
+ *((const TupleTableSlotOps **) &slot->tts_ops) = tts_ops;
+ slot->type = T_TupleTableSlot;
+ slot->tts_flags |= TTS_FLAG_EMPTY;
+ if (tupleDesc != NULL)
+ slot->tts_flags |= TTS_FLAG_FIXED;
+ slot->tts_tupleDescriptor = tupleDesc;
+ slot->tts_mcxt = CurrentMemoryContext;
+ slot->tts_nvalid = 0;
+
+ if (tupleDesc != NULL)
+ {
+ slot->tts_values = (Datum *)
+ (((char *) slot)
+ + MAXALIGN(basesz));
+ slot->tts_isnull = (bool *)
+ (((char *) slot)
+ + MAXALIGN(basesz)
+ + MAXALIGN(tupleDesc->natts * sizeof(Datum)));
+
+ PinTupleDesc(tupleDesc);
+ }
+
+ /*
+ * And allow slot type specific initialization.
+ */
+ slot->tts_ops->init(slot);
+
+ return slot;
+}
+
+/* --------------------------------
+ * ExecAllocTableSlot
+ *
+ * Create a tuple table slot within a tuple table (which is just a List).
+ * --------------------------------
+ */
+TupleTableSlot *
+ExecAllocTableSlot(List **tupleTable, TupleDesc desc,
+ const TupleTableSlotOps *tts_ops)
+{
+ TupleTableSlot *slot = MakeTupleTableSlot(desc, tts_ops);
+
+ *tupleTable = lappend(*tupleTable, slot);
+
+ return slot;
+}
+
+/* --------------------------------
+ * ExecResetTupleTable
+ *
+ * This releases any resources (buffer pins, tupdesc refcounts)
+ * held by the tuple table, and optionally releases the memory
+ * occupied by the tuple table data structure.
+ * It is expected that this routine be called by ExecEndPlan().
+ * --------------------------------
+ */
+void
+ExecResetTupleTable(List *tupleTable, /* tuple table */
+ bool shouldFree) /* true if we should free memory */
+{
+ ListCell *lc;
+
+ foreach(lc, tupleTable)
+ {
+ TupleTableSlot *slot = lfirst_node(TupleTableSlot, lc);
+
+ /* Always release resources and reset the slot to empty */
+ ExecClearTuple(slot);
+ slot->tts_ops->release(slot);
+ if (slot->tts_tupleDescriptor)
+ {
+ ReleaseTupleDesc(slot->tts_tupleDescriptor);
+ slot->tts_tupleDescriptor = NULL;
+ }
+
+ /* If shouldFree, release memory occupied by the slot itself */
+ if (shouldFree)
+ {
+ if (!TTS_FIXED(slot))
+ {
+ if (slot->tts_values)
+ pfree(slot->tts_values);
+ if (slot->tts_isnull)
+ pfree(slot->tts_isnull);
+ }
+ pfree(slot);
+ }
+ }
+
+ /* If shouldFree, release the list structure */
+ if (shouldFree)
+ list_free(tupleTable);
+}
+
+/* --------------------------------
+ * MakeSingleTupleTableSlot
+ *
+ * This is a convenience routine for operations that need a standalone
+ * TupleTableSlot not gotten from the main executor tuple table. It makes
+ * a single slot of given TupleTableSlotType and initializes it to use the
+ * given tuple descriptor.
+ * --------------------------------
+ */
+TupleTableSlot *
+MakeSingleTupleTableSlot(TupleDesc tupdesc,
+ const TupleTableSlotOps *tts_ops)
+{
+ TupleTableSlot *slot = MakeTupleTableSlot(tupdesc, tts_ops);
+
+ return slot;
+}
+
+/* --------------------------------
+ * ExecDropSingleTupleTableSlot
+ *
+ * Release a TupleTableSlot made with MakeSingleTupleTableSlot.
+ * DON'T use this on a slot that's part of a tuple table list!
+ * --------------------------------
+ */
+void
+ExecDropSingleTupleTableSlot(TupleTableSlot *slot)
+{
+ /* This should match ExecResetTupleTable's processing of one slot */
+ Assert(IsA(slot, TupleTableSlot));
+ ExecClearTuple(slot);
+ slot->tts_ops->release(slot);
+ if (slot->tts_tupleDescriptor)
+ ReleaseTupleDesc(slot->tts_tupleDescriptor);
+ if (!TTS_FIXED(slot))
+ {
+ if (slot->tts_values)
+ pfree(slot->tts_values);
+ if (slot->tts_isnull)
+ pfree(slot->tts_isnull);
+ }
+ pfree(slot);
+}
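+
+/*
+ * Editor's note: illustrative sketch, not part of the upstream file.  It
+ * shows a typical standalone-slot lifecycle: build a descriptor, make the
+ * slot, store a freshly formed heap tuple (letting the slot own it), then
+ * drop the slot.  The single-column layout is hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_single_slot_lifecycle(void)
+{
+	TupleDesc	tupdesc = CreateTemplateTupleDesc(1);
+	Datum		values[1];
+	bool		isnull[1];
+	HeapTuple	tuple;
+	TupleTableSlot *slot;
+
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "id", INT4OID, -1, 0);
+
+	slot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple);
+
+	values[0] = Int32GetDatum(42);
+	isnull[0] = false;
+	tuple = heap_form_tuple(tupdesc, values, isnull);
+
+	/* shouldFree = true: clearing or dropping the slot will free the tuple */
+	ExecStoreHeapTuple(tuple, slot, true);
+
+	/* ... use the slot ... */
+
+	ExecDropSingleTupleTableSlot(slot);
+	FreeTupleDesc(tupdesc);
+}
+#endif							/* NOT_USED */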
+
+
+/* ----------------------------------------------------------------
+ * tuple table slot accessor functions
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * ExecSetSlotDescriptor
+ *
+ * This function is used to set the tuple descriptor associated
+ * with the slot's tuple. The passed descriptor must have lifespan
+ * at least equal to the slot's. If it is a reference-counted descriptor
+ * then the reference count is incremented for as long as the slot holds
+ * a reference.
+ * --------------------------------
+ */
+void
+ExecSetSlotDescriptor(TupleTableSlot *slot, /* slot to change */
+ TupleDesc tupdesc) /* new tuple descriptor */
+{
+ Assert(!TTS_FIXED(slot));
+
+ /* For safety, make sure slot is empty before changing it */
+ ExecClearTuple(slot);
+
+ /*
+ * Release any old descriptor. Also release old Datum/isnull arrays if
+ * present (we don't bother to check if they could be re-used).
+ */
+ if (slot->tts_tupleDescriptor)
+ ReleaseTupleDesc(slot->tts_tupleDescriptor);
+
+ if (slot->tts_values)
+ pfree(slot->tts_values);
+ if (slot->tts_isnull)
+ pfree(slot->tts_isnull);
+
+ /*
+ * Install the new descriptor; if it's refcounted, bump its refcount.
+ */
+ slot->tts_tupleDescriptor = tupdesc;
+ PinTupleDesc(tupdesc);
+
+ /*
+ * Allocate Datum/isnull arrays of the appropriate size. These must have
+ * the same lifetime as the slot, so allocate in the slot's own context.
+ */
+ slot->tts_values = (Datum *)
+ MemoryContextAlloc(slot->tts_mcxt, tupdesc->natts * sizeof(Datum));
+ slot->tts_isnull = (bool *)
+ MemoryContextAlloc(slot->tts_mcxt, tupdesc->natts * sizeof(bool));
+}
+
+/* --------------------------------
+ * ExecStoreHeapTuple
+ *
+ * This function is used to store an on-the-fly physical tuple into a specified
+ * slot in the tuple table.
+ *
+ * tuple: tuple to store
+ * slot: TTSOpsHeapTuple type slot to store it in
+ * shouldFree: true if ExecClearTuple should pfree() the tuple
+ * when done with it
+ *
+ * shouldFree is normally set 'true' for tuples constructed on-the-fly. But it
+ * can be 'false' when the referenced tuple is held in a tuple table slot
+ * belonging to a lower-level executor Proc node. In this case the lower-level
+ * slot retains ownership and responsibility for eventually releasing the
+ * tuple. When this method is used, we must be certain that the upper-level
+ * Proc node will lose interest in the tuple sooner than the lower-level one
+ * does! If you're not certain, copy the lower-level tuple with heap_copytuple
+ * and let the upper-level table slot assume ownership of the copy!
+ *
+ * Return value is just the passed-in slot pointer.
+ *
+ * If the target slot is not guaranteed to be a TTSOpsHeapTuple type slot,
+ * use the more expensive ExecForceStoreHeapTuple().
+ * --------------------------------
+ */
+TupleTableSlot *
+ExecStoreHeapTuple(HeapTuple tuple,
+ TupleTableSlot *slot,
+ bool shouldFree)
+{
+ /*
+ * sanity checks
+ */
+ Assert(tuple != NULL);
+ Assert(slot != NULL);
+ Assert(slot->tts_tupleDescriptor != NULL);
+
+ if (unlikely(!TTS_IS_HEAPTUPLE(slot)))
+ elog(ERROR, "trying to store a heap tuple into wrong type of slot");
+ tts_heap_store_tuple(slot, tuple, shouldFree);
+
+ slot->tts_tableOid = tuple->t_tableOid;
+
+ return slot;
+}
+
+/* --------------------------------
+ * ExecStoreBufferHeapTuple
+ *
+ * This function is used to store an on-disk physical tuple from a buffer
+ * into a specified slot in the tuple table.
+ *
+ * tuple: tuple to store
+ * slot: TTSOpsBufferHeapTuple type slot to store it in
+ * buffer: disk buffer if tuple is in a disk page, else InvalidBuffer
+ *
+ * The tuple table code acquires a pin on the buffer which is held until the
+ * slot is cleared, so that the tuple won't go away on us.
+ *
+ * Return value is just the passed-in slot pointer.
+ *
+ * If the target slot is not guaranteed to be a TTSOpsBufferHeapTuple type
+ * slot, use the more expensive ExecForceStoreHeapTuple().
+ * --------------------------------
+ */
+TupleTableSlot *
+ExecStoreBufferHeapTuple(HeapTuple tuple,
+ TupleTableSlot *slot,
+ Buffer buffer)
+{
+ /*
+ * sanity checks
+ */
+ Assert(tuple != NULL);
+ Assert(slot != NULL);
+ Assert(slot->tts_tupleDescriptor != NULL);
+ Assert(BufferIsValid(buffer));
+
+ if (unlikely(!TTS_IS_BUFFERTUPLE(slot)))
+ elog(ERROR, "trying to store an on-disk heap tuple into wrong type of slot");
+ tts_buffer_heap_store_tuple(slot, tuple, buffer, false);
+
+ slot->tts_tableOid = tuple->t_tableOid;
+
+ return slot;
+}
+
+/*
+ * Like ExecStoreBufferHeapTuple, but transfer an existing pin from the caller
+ * to the slot, i.e. the caller doesn't need to, and may not, release the pin.
+ */
+TupleTableSlot *
+ExecStorePinnedBufferHeapTuple(HeapTuple tuple,
+ TupleTableSlot *slot,
+ Buffer buffer)
+{
+ /*
+ * sanity checks
+ */
+ Assert(tuple != NULL);
+ Assert(slot != NULL);
+ Assert(slot->tts_tupleDescriptor != NULL);
+ Assert(BufferIsValid(buffer));
+
+ if (unlikely(!TTS_IS_BUFFERTUPLE(slot)))
+ elog(ERROR, "trying to store an on-disk heap tuple into wrong type of slot");
+ tts_buffer_heap_store_tuple(slot, tuple, buffer, true);
+
+ slot->tts_tableOid = tuple->t_tableOid;
+
+ return slot;
+}
+
+/*
+ * Store a minimal tuple into TTSOpsMinimalTuple type slot.
+ *
+ * If the target slot is not guaranteed to be a TTSOpsMinimalTuple type slot,
+ * use the more expensive ExecForceStoreMinimalTuple().
+ */
+TupleTableSlot *
+ExecStoreMinimalTuple(MinimalTuple mtup,
+ TupleTableSlot *slot,
+ bool shouldFree)
+{
+ /*
+ * sanity checks
+ */
+ Assert(mtup != NULL);
+ Assert(slot != NULL);
+ Assert(slot->tts_tupleDescriptor != NULL);
+
+ if (unlikely(!TTS_IS_MINIMALTUPLE(slot)))
+ elog(ERROR, "trying to store a minimal tuple into wrong type of slot");
+ tts_minimal_store_tuple(slot, mtup, shouldFree);
+
+ return slot;
+}
+
+/*
+ * Store a HeapTuple into any kind of slot, performing conversion if
+ * necessary.
+ */
+void
+ExecForceStoreHeapTuple(HeapTuple tuple,
+ TupleTableSlot *slot,
+ bool shouldFree)
+{
+ if (TTS_IS_HEAPTUPLE(slot))
+ {
+ ExecStoreHeapTuple(tuple, slot, shouldFree);
+ }
+ else if (TTS_IS_BUFFERTUPLE(slot))
+ {
+ MemoryContext oldContext;
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+
+ ExecClearTuple(slot);
+ slot->tts_flags &= ~TTS_FLAG_EMPTY;
+ oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
+ bslot->base.tuple = heap_copytuple(tuple);
+ slot->tts_flags |= TTS_FLAG_SHOULDFREE;
+ MemoryContextSwitchTo(oldContext);
+
+ if (shouldFree)
+ pfree(tuple);
+ }
+ else
+ {
+ ExecClearTuple(slot);
+ heap_deform_tuple(tuple, slot->tts_tupleDescriptor,
+ slot->tts_values, slot->tts_isnull);
+ ExecStoreVirtualTuple(slot);
+
+ if (shouldFree)
+ {
+ ExecMaterializeSlot(slot);
+ pfree(tuple);
+ }
+ }
+}
+
+/*
+ * Store a MinimalTuple into any kind of slot, performing conversion if
+ * necessary.
+ */
+void
+ExecForceStoreMinimalTuple(MinimalTuple mtup,
+ TupleTableSlot *slot,
+ bool shouldFree)
+{
+ if (TTS_IS_MINIMALTUPLE(slot))
+ {
+ tts_minimal_store_tuple(slot, mtup, shouldFree);
+ }
+ else
+ {
+ HeapTupleData htup;
+
+ ExecClearTuple(slot);
+
+ htup.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET;
+ htup.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET);
+ heap_deform_tuple(&htup, slot->tts_tupleDescriptor,
+ slot->tts_values, slot->tts_isnull);
+ ExecStoreVirtualTuple(slot);
+
+ if (shouldFree)
+ {
+ ExecMaterializeSlot(slot);
+ pfree(mtup);
+ }
+ }
+}
+
+/* --------------------------------
+ * ExecStoreVirtualTuple
+ * Mark a slot as containing a virtual tuple.
+ *
+ * The protocol for loading a slot with virtual tuple data is:
+ * * Call ExecClearTuple to mark the slot empty.
+ * * Store data into the Datum/isnull arrays.
+ * * Call ExecStoreVirtualTuple to mark the slot valid.
+ * This is a bit unclean but it avoids one round of data copying.
+ * --------------------------------
+ */
+TupleTableSlot *
+ExecStoreVirtualTuple(TupleTableSlot *slot)
+{
+ /*
+ * sanity checks
+ */
+ Assert(slot != NULL);
+ Assert(slot->tts_tupleDescriptor != NULL);
+ Assert(TTS_EMPTY(slot));
+
+ slot->tts_flags &= ~TTS_FLAG_EMPTY;
+ slot->tts_nvalid = slot->tts_tupleDescriptor->natts;
+
+ return slot;
+}
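+
+/*
+ * Editor's note: illustrative sketch, not part of the upstream file.  The
+ * three-step protocol above, spelled out for a slot whose (hypothetical)
+ * descriptor has an int4 column followed by a text column.
+ */
+#ifdef NOT_USED
+static void
+example_store_virtual(TupleTableSlot *slot, int32 id, text *name)
+{
+	ExecClearTuple(slot);		/* step 1: mark the slot empty */
+
+	/* step 2: fill the Datum/isnull arrays */
+	slot->tts_values[0] = Int32GetDatum(id);
+	slot->tts_isnull[0] = false;
+	slot->tts_values[1] = PointerGetDatum(name);
+	slot->tts_isnull[1] = false;
+
+	ExecStoreVirtualTuple(slot);	/* step 3: mark the slot valid */
+}
+#endif							/* NOT_USED */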
+
+/* --------------------------------
+ * ExecStoreAllNullTuple
+ * Set up the slot to contain a null in every column.
+ *
+ * At first glance this might sound just like ExecClearTuple, but it's
+ * entirely different: the slot ends up full, not empty.
+ * --------------------------------
+ */
+TupleTableSlot *
+ExecStoreAllNullTuple(TupleTableSlot *slot)
+{
+ /*
+ * sanity checks
+ */
+ Assert(slot != NULL);
+ Assert(slot->tts_tupleDescriptor != NULL);
+
+ /* Clear any old contents */
+ ExecClearTuple(slot);
+
+ /*
+ * Fill all the columns of the virtual tuple with nulls
+ */
+ MemSet(slot->tts_values, 0,
+ slot->tts_tupleDescriptor->natts * sizeof(Datum));
+ memset(slot->tts_isnull, true,
+ slot->tts_tupleDescriptor->natts * sizeof(bool));
+
+ return ExecStoreVirtualTuple(slot);
+}
+
+/*
+ * Store a HeapTuple in datum form, into a slot. That always requires
+ * deforming it and storing it in virtual form.
+ *
+ * Until the slot is materialized, the contents of the slot depend on the
+ * datum.
+ */
+void
+ExecStoreHeapTupleDatum(Datum data, TupleTableSlot *slot)
+{
+ HeapTupleData tuple = {0};
+ HeapTupleHeader td;
+
+ td = DatumGetHeapTupleHeader(data);
+
+ tuple.t_len = HeapTupleHeaderGetDatumLength(td);
+ tuple.t_self = td->t_ctid;
+ tuple.t_data = td;
+
+ ExecClearTuple(slot);
+
+ heap_deform_tuple(&tuple, slot->tts_tupleDescriptor,
+ slot->tts_values, slot->tts_isnull);
+ ExecStoreVirtualTuple(slot);
+}
+
+/*
+ * ExecFetchSlotHeapTuple - fetch HeapTuple representing the slot's content
+ *
+ * The returned HeapTuple represents the slot's content as closely as
+ * possible.
+ *
+ * If materialize is true, the contents of the slots will be made independent
+ * from the underlying storage (i.e. all buffer pins are released, memory is
+ * allocated in the slot's context).
+ *
+ * If shouldFree is not-NULL it'll be set to true if the returned tuple has
+ * been allocated in the calling memory context, and must be freed by the
+ * caller (via explicit pfree() or a memory context reset).
+ *
+ * NB: If materialize is true, modifications of the returned tuple are
+ * allowed. But it depends on the type of the slot whether such modifications
+ * will also affect the slot's contents. While that is not the nicest
+ * behaviour, all such modifications are in the process of being removed.
+ */
+HeapTuple
+ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree)
+{
+ /*
+ * sanity checks
+ */
+ Assert(slot != NULL);
+ Assert(!TTS_EMPTY(slot));
+
+ /* Materialize the tuple so that the slot "owns" it, if requested. */
+ if (materialize)
+ slot->tts_ops->materialize(slot);
+
+ if (slot->tts_ops->get_heap_tuple == NULL)
+ {
+ if (shouldFree)
+ *shouldFree = true;
+ return slot->tts_ops->copy_heap_tuple(slot);
+ }
+ else
+ {
+ if (shouldFree)
+ *shouldFree = false;
+ return slot->tts_ops->get_heap_tuple(slot);
+ }
+}
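+
+/*
+ * Editor's note: illustrative sketch, not part of the upstream file.  It
+ * shows the shouldFree protocol for a caller that only needs read access
+ * and therefore does not ask for materialization.
+ */
+#ifdef NOT_USED
+static void
+example_fetch_heap_tuple(TupleTableSlot *slot)
+{
+	bool		shouldFree;
+	HeapTuple	tuple = ExecFetchSlotHeapTuple(slot, false, &shouldFree);
+
+	/* ... inspect the tuple; don't modify it, it may be the slot's own ... */
+
+	if (shouldFree)
+		heap_freetuple(tuple);
+}
+#endif							/* NOT_USED */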
+
+/* --------------------------------
+ * ExecFetchSlotMinimalTuple
+ * Fetch the slot's minimal physical tuple.
+ *
+ * If the given tuple table slot can hold a minimal tuple, indicated by a
+ * non-NULL get_minimal_tuple callback, the function returns the minimal
+ * tuple returned by that callback. It assumes that the minimal tuple
+ * returned by the callback is "owned" by the slot, i.e. the slot is
+ * responsible for freeing the memory consumed by the tuple. Hence it sets
+ * *shouldFree to false, indicating that the caller should not free the
+ * memory consumed by the minimal tuple. In this case the returned minimal
+ * tuple should be considered read-only.
+ *
+ * If that callback is not supported, it calls the copy_minimal_tuple
+ * callback, which is expected to return a copy of the minimal tuple
+ * representing the contents of the slot. In this case *shouldFree is set
+ * to true, indicating to the caller that it should free the memory
+ * consumed by the minimal tuple. In this case the returned minimal tuple
+ * may be written to by the caller.
+ * --------------------------------
+ */
+MinimalTuple
+ExecFetchSlotMinimalTuple(TupleTableSlot *slot,
+ bool *shouldFree)
+{
+ /*
+ * sanity checks
+ */
+ Assert(slot != NULL);
+ Assert(!TTS_EMPTY(slot));
+
+ if (slot->tts_ops->get_minimal_tuple)
+ {
+ if (shouldFree)
+ *shouldFree = false;
+ return slot->tts_ops->get_minimal_tuple(slot);
+ }
+ else
+ {
+ if (shouldFree)
+ *shouldFree = true;
+ return slot->tts_ops->copy_minimal_tuple(slot);
+ }
+}
+
+/* --------------------------------
+ * ExecFetchSlotHeapTupleDatum
+ * Fetch the slot's tuple as a composite-type Datum.
+ *
+ * The result is always freshly palloc'd in the caller's memory context.
+ * --------------------------------
+ */
+Datum
+ExecFetchSlotHeapTupleDatum(TupleTableSlot *slot)
+{
+ HeapTuple tup;
+ TupleDesc tupdesc;
+ bool shouldFree;
+ Datum ret;
+
+ /* Fetch slot's contents in regular-physical-tuple form */
+ tup = ExecFetchSlotHeapTuple(slot, false, &shouldFree);
+ tupdesc = slot->tts_tupleDescriptor;
+
+ /* Convert to Datum form */
+ ret = heap_copy_tuple_as_datum(tup, tupdesc);
+
+ if (shouldFree)
+ pfree(tup);
+
+ return ret;
+}
+
+/* ----------------------------------------------------------------
+ * convenience initialization routines
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * ExecInitResultTypeTL
+ *
+ * Initialize result type, using the plan node's targetlist.
+ * ----------------
+ */
+void
+ExecInitResultTypeTL(PlanState *planstate)
+{
+ TupleDesc tupDesc = ExecTypeFromTL(planstate->plan->targetlist);
+
+ planstate->ps_ResultTupleDesc = tupDesc;
+}
+
+/* --------------------------------
+ * ExecInit{Result,Scan,Extra}TupleSlot[TL]
+ *
+ * These are convenience routines to initialize the specified slot
+ * in nodes inheriting the appropriate state. ExecInitExtraTupleSlot
+ * is used for initializing special-purpose slots.
+ * --------------------------------
+ */
+
+/* ----------------
+ *		ExecInitResultSlot
+ *
+ * Initialize result tuple slot, using the tuple descriptor previously
+ * computed with ExecInitResultTypeTL().
+ * ----------------
+ */
+void
+ExecInitResultSlot(PlanState *planstate, const TupleTableSlotOps *tts_ops)
+{
+ TupleTableSlot *slot;
+
+ slot = ExecAllocTableSlot(&planstate->state->es_tupleTable,
+ planstate->ps_ResultTupleDesc, tts_ops);
+ planstate->ps_ResultTupleSlot = slot;
+
+ planstate->resultopsfixed = planstate->ps_ResultTupleDesc != NULL;
+ planstate->resultops = tts_ops;
+ planstate->resultopsset = true;
+}
+
+/* ----------------
+ * ExecInitResultTupleSlotTL
+ *
+ * Initialize result tuple slot, using the plan node's targetlist.
+ * ----------------
+ */
+void
+ExecInitResultTupleSlotTL(PlanState *planstate,
+ const TupleTableSlotOps *tts_ops)
+{
+ ExecInitResultTypeTL(planstate);
+ ExecInitResultSlot(planstate, tts_ops);
+}
+
+/* ----------------
+ * ExecInitScanTupleSlot
+ * ----------------
+ */
+void
+ExecInitScanTupleSlot(EState *estate, ScanState *scanstate,
+ TupleDesc tupledesc, const TupleTableSlotOps *tts_ops)
+{
+ scanstate->ss_ScanTupleSlot = ExecAllocTableSlot(&estate->es_tupleTable,
+ tupledesc, tts_ops);
+ scanstate->ps.scandesc = tupledesc;
+ scanstate->ps.scanopsfixed = tupledesc != NULL;
+ scanstate->ps.scanops = tts_ops;
+ scanstate->ps.scanopsset = true;
+}
+
+/* ----------------
+ * ExecInitExtraTupleSlot
+ *
+ * Return a newly created slot. If tupledesc is non-NULL the slot will have
+ * that as its fixed tupledesc. Otherwise the caller needs to use
+ * ExecSetSlotDescriptor() to set the descriptor before use.
+ * ----------------
+ */
+TupleTableSlot *
+ExecInitExtraTupleSlot(EState *estate,
+ TupleDesc tupledesc,
+ const TupleTableSlotOps *tts_ops)
+{
+ return ExecAllocTableSlot(&estate->es_tupleTable, tupledesc, tts_ops);
+}
+
+/* ----------------
+ * ExecInitNullTupleSlot
+ *
+ * Build a slot containing an all-nulls tuple of the given type.
+ * This is used as a substitute for an input tuple when performing an
+ * outer join.
+ * ----------------
+ */
+TupleTableSlot *
+ExecInitNullTupleSlot(EState *estate, TupleDesc tupType,
+ const TupleTableSlotOps *tts_ops)
+{
+ TupleTableSlot *slot = ExecInitExtraTupleSlot(estate, tupType, tts_ops);
+
+ return ExecStoreAllNullTuple(slot);
+}
+
+/* ---------------------------------------------------------------
+ * Routines for setting/accessing attributes in a slot.
+ * ---------------------------------------------------------------
+ */
+
+/*
+ * Fill in missing values for a TupleTableSlot.
+ *
+ * This is only exposed because it's needed for JIT compiled tuple
+ * deforming. That exception aside, there should be no callers outside of this
+ * file.
+ */
+void
+slot_getmissingattrs(TupleTableSlot *slot, int startAttNum, int lastAttNum)
+{
+ AttrMissing *attrmiss = NULL;
+
+ if (slot->tts_tupleDescriptor->constr)
+ attrmiss = slot->tts_tupleDescriptor->constr->missing;
+
+ if (!attrmiss)
+ {
+ /* no missing values array at all, so just fill everything in as NULL */
+ memset(slot->tts_values + startAttNum, 0,
+ (lastAttNum - startAttNum) * sizeof(Datum));
+ memset(slot->tts_isnull + startAttNum, 1,
+ (lastAttNum - startAttNum) * sizeof(bool));
+ }
+ else
+ {
+ int missattnum;
+
+ /* if there is a missing values array we must process them one by one */
+ for (missattnum = startAttNum;
+ missattnum < lastAttNum;
+ missattnum++)
+ {
+ slot->tts_values[missattnum] = attrmiss[missattnum].am_value;
+ slot->tts_isnull[missattnum] = !attrmiss[missattnum].am_present;
+ }
+ }
+}
+
+/*
+ * slot_getsomeattrs_int - workhorse for slot_getsomeattrs()
+ */
+void
+slot_getsomeattrs_int(TupleTableSlot *slot, int attnum)
+{
+ /* Check for caller errors */
+ Assert(slot->tts_nvalid < attnum); /* checked in slot_getsomeattrs */
+ Assert(attnum > 0);
+
+ if (unlikely(attnum > slot->tts_tupleDescriptor->natts))
+ elog(ERROR, "invalid attribute number %d", attnum);
+
+ /* Fetch as many attributes as possible from the underlying tuple. */
+ slot->tts_ops->getsomeattrs(slot, attnum);
+
+ /*
+ * If the underlying tuple doesn't have enough attributes, the tuple
+ * descriptor must supply the missing attributes (as NULLs or stored
+ * "missing" defaults).
+ */
+ if (unlikely(slot->tts_nvalid < attnum))
+ {
+ slot_getmissingattrs(slot, slot->tts_nvalid, attnum);
+ slot->tts_nvalid = attnum;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecTypeFromTL
+ *
+ * Generate a tuple descriptor for the result tuple of a targetlist.
+ * (A parse/plan tlist must be passed, not an ExprState tlist.)
+ * Note that resjunk columns, if any, are included in the result.
+ *
+ * Currently there are about 4 different places where we create
+ * TupleDescriptors. They should all be merged, or perhaps
+ * be rewritten to call BuildDesc().
+ * ----------------------------------------------------------------
+ */
+TupleDesc
+ExecTypeFromTL(List *targetList)
+{
+ return ExecTypeFromTLInternal(targetList, false);
+}
+
+/* ----------------------------------------------------------------
+ * ExecCleanTypeFromTL
+ *
+ * Same as above, but resjunk columns are omitted from the result.
+ * ----------------------------------------------------------------
+ */
+TupleDesc
+ExecCleanTypeFromTL(List *targetList)
+{
+ return ExecTypeFromTLInternal(targetList, true);
+}
+
+static TupleDesc
+ExecTypeFromTLInternal(List *targetList, bool skipjunk)
+{
+ TupleDesc typeInfo;
+ ListCell *l;
+ int len;
+ int cur_resno = 1;
+
+ if (skipjunk)
+ len = ExecCleanTargetListLength(targetList);
+ else
+ len = ExecTargetListLength(targetList);
+ typeInfo = CreateTemplateTupleDesc(len);
+
+ foreach(l, targetList)
+ {
+ TargetEntry *tle = lfirst(l);
+
+ if (skipjunk && tle->resjunk)
+ continue;
+ TupleDescInitEntry(typeInfo,
+ cur_resno,
+ tle->resname,
+ exprType((Node *) tle->expr),
+ exprTypmod((Node *) tle->expr),
+ 0);
+ TupleDescInitEntryCollation(typeInfo,
+ cur_resno,
+ exprCollation((Node *) tle->expr));
+ cur_resno++;
+ }
+
+ return typeInfo;
+}
+
+/*
+ * ExecTypeFromExprList - build a tuple descriptor from a list of Exprs
+ *
+ * This is roughly like ExecTypeFromTL, but we work from bare expressions
+ * not TargetEntrys. No names are attached to the tupledesc's columns.
+ */
+TupleDesc
+ExecTypeFromExprList(List *exprList)
+{
+ TupleDesc typeInfo;
+ ListCell *lc;
+ int cur_resno = 1;
+
+ typeInfo = CreateTemplateTupleDesc(list_length(exprList));
+
+ foreach(lc, exprList)
+ {
+ Node *e = lfirst(lc);
+
+ TupleDescInitEntry(typeInfo,
+ cur_resno,
+ NULL,
+ exprType(e),
+ exprTypmod(e),
+ 0);
+ TupleDescInitEntryCollation(typeInfo,
+ cur_resno,
+ exprCollation(e));
+ cur_resno++;
+ }
+
+ return typeInfo;
+}
+
+/*
+ * ExecTypeSetColNames - set column names in a RECORD TupleDesc
+ *
+ * Column names must be provided as an alias list (list of String nodes).
+ */
+void
+ExecTypeSetColNames(TupleDesc typeInfo, List *namesList)
+{
+ int colno = 0;
+ ListCell *lc;
+
+ /* It's only OK to change col names in a not-yet-blessed RECORD type */
+ Assert(typeInfo->tdtypeid == RECORDOID);
+ Assert(typeInfo->tdtypmod < 0);
+
+ foreach(lc, namesList)
+ {
+ char *cname = strVal(lfirst(lc));
+ Form_pg_attribute attr;
+
+ /* Guard against too-long names list (probably can't happen) */
+ if (colno >= typeInfo->natts)
+ break;
+ attr = TupleDescAttr(typeInfo, colno);
+ colno++;
+
+ /*
+ * Do nothing for empty aliases or dropped columns (these cases
+ * probably can't arise in RECORD types, either)
+ */
+ if (cname[0] == '\0' || attr->attisdropped)
+ continue;
+
+ /* OK, assign the column name */
+ namestrcpy(&(attr->attname), cname);
+ }
+}
+
+/*
+ * BlessTupleDesc - make a completed tuple descriptor useful for SRFs
+ *
+ * Rowtype Datums returned by a function must contain valid type information.
+ * This happens "for free" if the tupdesc came from a relcache entry, but
+ * not if we have manufactured a tupdesc for a transient RECORD datatype.
+ * In that case we have to notify typcache.c of the existence of the type.
+ */
+TupleDesc
+BlessTupleDesc(TupleDesc tupdesc)
+{
+ if (tupdesc->tdtypeid == RECORDOID &&
+ tupdesc->tdtypmod < 0)
+ assign_record_type_typmod(tupdesc);
+
+ return tupdesc; /* just for notational convenience */
+}
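/*
 * [Editor's note: illustrative sketch, not part of this patch.]
 *
 * Typical use of BlessTupleDesc() in a C function returning a composite
 * type: the tupdesc obtained from get_call_result_type() describes a
 * transient RECORD, so it must be blessed before a rowtype Datum is built
 * from it.  The function name "make_pair" and its two float8 columns are
 * hypothetical.
 */
#include "postgres.h"
#include "access/htup_details.h"
#include "funcapi.h"

PG_FUNCTION_INFO_V1(make_pair);

Datum
make_pair(PG_FUNCTION_ARGS)
{
    TupleDesc   tupdesc;
    Datum       values[2];
    bool        nulls[2] = {false, false};
    HeapTuple   tuple;

    /* Ask what rowtype the calling query expects us to return */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    /* Register the transient rowtype with typcache */
    tupdesc = BlessTupleDesc(tupdesc);

    values[0] = Float8GetDatum(1.0);
    values[1] = Float8GetDatum(2.0);

    tuple = heap_form_tuple(tupdesc, values, nulls);
    PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
}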
+
+/*
+ * TupleDescGetAttInMetadata - Build an AttInMetadata structure based on the
+ * supplied TupleDesc. AttInMetadata can be used in conjunction with C strings
+ * to produce a properly formed tuple.
+ */
+AttInMetadata *
+TupleDescGetAttInMetadata(TupleDesc tupdesc)
+{
+ int natts = tupdesc->natts;
+ int i;
+ Oid atttypeid;
+ Oid attinfuncid;
+ FmgrInfo *attinfuncinfo;
+ Oid *attioparams;
+ int32 *atttypmods;
+ AttInMetadata *attinmeta;
+
+ attinmeta = (AttInMetadata *) palloc(sizeof(AttInMetadata));
+
+ /* "Bless" the tupledesc so that we can make rowtype datums with it */
+ attinmeta->tupdesc = BlessTupleDesc(tupdesc);
+
+ /*
+ * Gather info needed later to call the "in" function for each attribute
+ */
+ attinfuncinfo = (FmgrInfo *) palloc0(natts * sizeof(FmgrInfo));
+ attioparams = (Oid *) palloc0(natts * sizeof(Oid));
+ atttypmods = (int32 *) palloc0(natts * sizeof(int32));
+
+ for (i = 0; i < natts; i++)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupdesc, i);
+
+ /* Ignore dropped attributes */
+ if (!att->attisdropped)
+ {
+ atttypeid = att->atttypid;
+ getTypeInputInfo(atttypeid, &attinfuncid, &attioparams[i]);
+ fmgr_info(attinfuncid, &attinfuncinfo[i]);
+ atttypmods[i] = att->atttypmod;
+ }
+ }
+ attinmeta->attinfuncs = attinfuncinfo;
+ attinmeta->attioparams = attioparams;
+ attinmeta->atttypmods = atttypmods;
+
+ return attinmeta;
+}
+
+/*
+ * BuildTupleFromCStrings - build a HeapTuple given user data in C string form.
+ * values is an array of C strings, one for each attribute of the return tuple.
+ * A NULL string pointer indicates we want to create a NULL field.
+ */
+HeapTuple
+BuildTupleFromCStrings(AttInMetadata *attinmeta, char **values)
+{
+ TupleDesc tupdesc = attinmeta->tupdesc;
+ int natts = tupdesc->natts;
+ Datum *dvalues;
+ bool *nulls;
+ int i;
+ HeapTuple tuple;
+
+ dvalues = (Datum *) palloc(natts * sizeof(Datum));
+ nulls = (bool *) palloc(natts * sizeof(bool));
+
+ /*
+ * Call the "in" function for each non-dropped attribute, even for nulls,
+ * to support domains.
+ */
+ for (i = 0; i < natts; i++)
+ {
+ if (!TupleDescAttr(tupdesc, i)->attisdropped)
+ {
+ /* Non-dropped attributes */
+ dvalues[i] = InputFunctionCall(&attinmeta->attinfuncs[i],
+ values[i],
+ attinmeta->attioparams[i],
+ attinmeta->atttypmods[i]);
+ if (values[i] != NULL)
+ nulls[i] = false;
+ else
+ nulls[i] = true;
+ }
+ else
+ {
+ /* Handle dropped attributes by setting to NULL */
+ dvalues[i] = (Datum) 0;
+ nulls[i] = true;
+ }
+ }
+
+ /*
+ * Form a tuple
+ */
+ tuple = heap_form_tuple(tupdesc, dvalues, nulls);
+
+ /*
+ * Release locally palloc'd space. XXX would probably be good to pfree
+ * values of pass-by-reference datums, as well.
+ */
+ pfree(dvalues);
+ pfree(nulls);
+
+ return tuple;
+}
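/*
 * [Editor's note: illustrative sketch, not part of this patch.]
 *
 * The usual pairing of TupleDescGetAttInMetadata() with
 * BuildTupleFromCStrings(): every column value is supplied in C-string
 * form and run through that column's type input function.  The function
 * name "text_pair" and its two-column result type are hypothetical.
 */
#include "postgres.h"
#include "funcapi.h"

PG_FUNCTION_INFO_V1(text_pair);

Datum
text_pair(PG_FUNCTION_ARGS)
{
    TupleDesc   tupdesc;
    AttInMetadata *attinmeta;
    char       *values[2];
    HeapTuple   tuple;

    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    /* TupleDescGetAttInMetadata() blesses the tupdesc for us */
    attinmeta = TupleDescGetAttInMetadata(tupdesc);

    values[0] = "42";           /* parsed by column 1's input function */
    values[1] = NULL;           /* NULL pointer yields a NULL field */

    tuple = BuildTupleFromCStrings(attinmeta, values);

    /* HeapTupleGetDatum() goes through HeapTupleHeaderGetDatum() below */
    PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
}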
+
+/*
+ * HeapTupleHeaderGetDatum - convert a HeapTupleHeader pointer to a Datum.
+ *
+ * This must *not* get applied to an on-disk tuple; the tuple should be
+ * freshly made by heap_form_tuple or some wrapper routine for it (such as
+ * BuildTupleFromCStrings). Be sure also that the tupledesc used to build
+ * the tuple has a properly "blessed" rowtype.
+ *
+ * Formerly this was a macro equivalent to PointerGetDatum, relying on the
+ * fact that heap_form_tuple fills in the appropriate tuple header fields
+ * for a composite Datum. However, we now require that composite Datums not
+ * contain any external TOAST pointers. We do not want heap_form_tuple itself
+ * to enforce that; more specifically, the rule applies only to actual Datums
+ * and not to HeapTuple structures. Therefore, HeapTupleHeaderGetDatum is
+ * now a function that detects whether there are externally-toasted fields
+ * and constructs a new tuple with inlined fields if so. We still need
+ * heap_form_tuple to insert the Datum header fields, because otherwise this
+ * code would have no way to obtain a tupledesc for the tuple.
+ *
+ * Note that if we do build a new tuple, it's palloc'd in the current
+ * memory context. Beware of code that changes context between the initial
+ * heap_form_tuple/etc call and calling HeapTuple(Header)GetDatum.
+ *
+ * For performance-critical callers, it could be worthwhile to take extra
+ * steps to ensure that there aren't TOAST pointers in the output of
+ * heap_form_tuple to begin with. It's likely however that the costs of the
+ * typcache lookup and tuple disassembly/reassembly are swamped by TOAST
+ * dereference costs, so that the benefits of such extra effort would be
+ * minimal.
+ *
+ * XXX it would likely be better to create wrapper functions that produce
+ * a composite Datum from the field values in one step. However, there's
+ * enough code using the existing APIs that we couldn't get rid of this
+ * hack anytime soon.
+ */
+Datum
+HeapTupleHeaderGetDatum(HeapTupleHeader tuple)
+{
+ Datum result;
+ TupleDesc tupDesc;
+
+ /* No work if there are no external TOAST pointers in the tuple */
+ if (!HeapTupleHeaderHasExternal(tuple))
+ return PointerGetDatum(tuple);
+
+ /* Use the type data saved by heap_form_tuple to look up the rowtype */
+ tupDesc = lookup_rowtype_tupdesc(HeapTupleHeaderGetTypeId(tuple),
+ HeapTupleHeaderGetTypMod(tuple));
+
+ /* And do the flattening */
+ result = toast_flatten_tuple_to_datum(tuple,
+ HeapTupleHeaderGetDatumLength(tuple),
+ tupDesc);
+
+ ReleaseTupleDesc(tupDesc);
+
+ return result;
+}
+
+
+/*
+ * Functions for sending tuples to the frontend (or other specified destination)
+ * as though they were a SELECT result. These are used by utility commands that
+ * need to project directly to the destination and don't need or want full
+ * table function capability. Currently used by EXPLAIN and SHOW ALL.
+ */
+TupOutputState *
+begin_tup_output_tupdesc(DestReceiver *dest,
+ TupleDesc tupdesc,
+ const TupleTableSlotOps *tts_ops)
+{
+ TupOutputState *tstate;
+
+ tstate = (TupOutputState *) palloc(sizeof(TupOutputState));
+
+ tstate->slot = MakeSingleTupleTableSlot(tupdesc, tts_ops);
+ tstate->dest = dest;
+
+ tstate->dest->rStartup(tstate->dest, (int) CMD_SELECT, tupdesc);
+
+ return tstate;
+}
+
+/*
+ * write a single tuple
+ */
+void
+do_tup_output(TupOutputState *tstate, Datum *values, bool *isnull)
+{
+ TupleTableSlot *slot = tstate->slot;
+ int natts = slot->tts_tupleDescriptor->natts;
+
+ /* make sure the slot is clear */
+ ExecClearTuple(slot);
+
+ /* insert data */
+ memcpy(slot->tts_values, values, natts * sizeof(Datum));
+ memcpy(slot->tts_isnull, isnull, natts * sizeof(bool));
+
+ /* mark slot as containing a virtual tuple */
+ ExecStoreVirtualTuple(slot);
+
+ /* send the tuple to the receiver */
+ (void) tstate->dest->receiveSlot(slot, tstate->dest);
+
+ /* clean up */
+ ExecClearTuple(slot);
+}
+
+/*
+ * write a chunk of text, breaking at newline characters
+ *
+ * Should only be used with a single-TEXT-attribute tupdesc.
+ */
+void
+do_text_output_multiline(TupOutputState *tstate, const char *txt)
+{
+ Datum values[1];
+ bool isnull[1] = {false};
+
+ while (*txt)
+ {
+ const char *eol;
+ int len;
+
+ eol = strchr(txt, '\n');
+ if (eol)
+ {
+ len = eol - txt;
+ eol++;
+ }
+ else
+ {
+ len = strlen(txt);
+ eol = txt + len;
+ }
+
+ values[0] = PointerGetDatum(cstring_to_text_with_len(txt, len));
+ do_tup_output(tstate, values, isnull);
+ pfree(DatumGetPointer(values[0]));
+ txt = eol;
+ }
+}
+
+void
+end_tup_output(TupOutputState *tstate)
+{
+ tstate->dest->rShutdown(tstate->dest);
+ /* note that destroying the dest is not ours to do */
+ ExecDropSingleTupleTableSlot(tstate->slot);
+ pfree(tstate);
+}
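/*
 * [Editor's note: illustrative sketch, not part of this patch.]
 *
 * How a utility command can use the tup-output helpers above to ship a
 * few lines of text to an already-established DestReceiver, much as
 * EXPLAIN does.  The function name "show_two_lines" is hypothetical; the
 * DestReceiver is assumed to come from the calling command.
 */
#include "postgres.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"

static void
show_two_lines(DestReceiver *dest)
{
    TupOutputState *tstate;
    TupleDesc   tupdesc;

    /* a single TEXT column, as do_text_output_multiline() requires */
    tupdesc = CreateTemplateTupleDesc(1);
    TupleDescInitEntry(tupdesc, (AttrNumber) 1, "INFO", TEXTOID, -1, 0);

    tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
    do_text_output_multiline(tstate, "first line\nsecond line");
    end_tup_output(tstate);
}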
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
new file mode 100644
index 0000000..ad11392
--- /dev/null
+++ b/src/backend/executor/execUtils.c
@@ -0,0 +1,1351 @@
+/*-------------------------------------------------------------------------
+ *
+ * execUtils.c
+ * miscellaneous executor utility routines
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/execUtils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * CreateExecutorState Create/delete executor working state
+ * FreeExecutorState
+ * CreateExprContext
+ * CreateStandaloneExprContext
+ * FreeExprContext
+ * ReScanExprContext
+ *
+ * ExecAssignExprContext Common code for plan node init routines.
+ * etc
+ *
+ * ExecOpenScanRelation Common code for scan node init routines.
+ *
+ * ExecInitRangeTable Set up executor's range-table-related data.
+ *
+ * ExecGetRangeTableRelation Fetch Relation for a rangetable entry.
+ *
+ * executor_errposition Report syntactic position of an error.
+ *
+ * RegisterExprContextCallback Register function shutdown callback
+ * UnregisterExprContextCallback Deregister function shutdown callback
+ *
+ * GetAttributeByName Runtime extraction of columns from tuples.
+ * GetAttributeByNum
+ *
+ * NOTES
+ * This file has traditionally been the place to stick misc.
+ * executor support stuff that doesn't really go anyplace else.
+ */
+
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "access/relscan.h"
+#include "access/table.h"
+#include "access/tableam.h"
+#include "access/transam.h"
+#include "executor/executor.h"
+#include "executor/execPartition.h"
+#include "jit/jit.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "parser/parsetree.h"
+#include "partitioning/partdesc.h"
+#include "storage/lmgr.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/typcache.h"
+
+
+static bool tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc);
+static void ShutdownExprContext(ExprContext *econtext, bool isCommit);
+
+
+/* ----------------------------------------------------------------
+ * Executor state and memory management functions
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * CreateExecutorState
+ *
+ * Create and initialize an EState node, which is the root of
+ * working storage for an entire Executor invocation.
+ *
+ * Principally, this creates the per-query memory context that will be
+ * used to hold all working data that lives till the end of the query.
+ * Note that the per-query context will become a child of the caller's
+ * CurrentMemoryContext.
+ * ----------------
+ */
+EState *
+CreateExecutorState(void)
+{
+ EState *estate;
+ MemoryContext qcontext;
+ MemoryContext oldcontext;
+
+ /*
+ * Create the per-query context for this Executor run.
+ */
+ qcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "ExecutorState",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Make the EState node within the per-query context. This way, we don't
+ * need a separate pfree() operation for it at shutdown.
+ */
+ oldcontext = MemoryContextSwitchTo(qcontext);
+
+ estate = makeNode(EState);
+
+ /*
+ * Initialize all fields of the Executor State structure
+ */
+ estate->es_direction = ForwardScanDirection;
+ estate->es_snapshot = InvalidSnapshot; /* caller must initialize this */
+ estate->es_crosscheck_snapshot = InvalidSnapshot; /* no crosscheck */
+ estate->es_range_table = NIL;
+ estate->es_range_table_size = 0;
+ estate->es_relations = NULL;
+ estate->es_rowmarks = NULL;
+ estate->es_plannedstmt = NULL;
+
+ estate->es_junkFilter = NULL;
+
+ estate->es_output_cid = (CommandId) 0;
+
+ estate->es_result_relations = NULL;
+ estate->es_opened_result_relations = NIL;
+ estate->es_tuple_routing_result_relations = NIL;
+ estate->es_trig_target_relations = NIL;
+
+ estate->es_param_list_info = NULL;
+ estate->es_param_exec_vals = NULL;
+
+ estate->es_queryEnv = NULL;
+
+ estate->es_query_cxt = qcontext;
+
+ estate->es_tupleTable = NIL;
+
+ estate->es_processed = 0;
+
+ estate->es_top_eflags = 0;
+ estate->es_instrument = 0;
+ estate->es_finished = false;
+
+ estate->es_exprcontexts = NIL;
+
+ estate->es_subplanstates = NIL;
+
+ estate->es_auxmodifytables = NIL;
+
+ estate->es_per_tuple_exprcontext = NULL;
+
+ estate->es_sourceText = NULL;
+
+ estate->es_use_parallel_mode = false;
+
+ estate->es_jit_flags = 0;
+ estate->es_jit = NULL;
+
+ /*
+ * Return the executor state structure
+ */
+ MemoryContextSwitchTo(oldcontext);
+
+ return estate;
+}
+
+/* ----------------
+ * FreeExecutorState
+ *
+ * Release an EState along with all remaining working storage.
+ *
+ * Note: this is not responsible for releasing non-memory resources, such as
+ * open relations or buffer pins. But it will shut down any still-active
+ * ExprContexts within the EState and deallocate associated JITed expressions.
+ * That is sufficient cleanup for situations where the EState has only been
+ * used for expression evaluation, and not to run a complete Plan.
+ *
+ * This can be called in any memory context ... so long as it's not one
+ * of the ones to be freed.
+ * ----------------
+ */
+void
+FreeExecutorState(EState *estate)
+{
+ /*
+ * Shut down and free any remaining ExprContexts. We do this explicitly
+ * to ensure that any remaining shutdown callbacks get called (since they
+ * might need to release resources that aren't simply memory within the
+ * per-query memory context).
+ */
+ while (estate->es_exprcontexts)
+ {
+ /*
+ * XXX: seems there ought to be a faster way to implement this than
+ * repeated list_delete(), no?
+ */
+ FreeExprContext((ExprContext *) linitial(estate->es_exprcontexts),
+ true);
+ /* FreeExprContext removed the list link for us */
+ }
+
+ /* release JIT context, if allocated */
+ if (estate->es_jit)
+ {
+ jit_release_context(estate->es_jit);
+ estate->es_jit = NULL;
+ }
+
+ /* release partition directory, if allocated */
+ if (estate->es_partition_directory)
+ {
+ DestroyPartitionDirectory(estate->es_partition_directory);
+ estate->es_partition_directory = NULL;
+ }
+
+ /*
+ * Free the per-query memory context, thereby releasing all working
+ * memory, including the EState node itself.
+ */
+ MemoryContextDelete(estate->es_query_cxt);
+}
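/*
 * [Editor's note: illustrative sketch, not part of this patch.]
 *
 * The common standalone pattern around CreateExecutorState() /
 * FreeExecutorState(): evaluate a single boolean expression against a
 * tuple without running a full plan.  "expr" and "slot" are assumed to
 * be supplied by the caller; the helper's name is hypothetical.
 */
#include "postgres.h"
#include "executor/executor.h"

static bool
expr_is_true_for(Expr *expr, TupleTableSlot *slot)
{
    EState     *estate = CreateExecutorState();
    ExprContext *econtext = GetPerTupleExprContext(estate);
    ExprState  *exprstate = ExecPrepareExpr(expr, estate);
    Datum       result;
    bool        isnull;

    econtext->ecxt_scantuple = slot;
    result = ExecEvalExprSwitchContext(exprstate, econtext, &isnull);

    /* shuts down the ExprContext(s) and frees all per-query memory */
    FreeExecutorState(estate);

    /* a boolean is pass-by-value, so it survives the free above */
    return !isnull && DatumGetBool(result);
}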
+
+/*
+ * Internal implementation for CreateExprContext() and CreateWorkExprContext()
+ * that allows control over the AllocSet parameters.
+ */
+static ExprContext *
+CreateExprContextInternal(EState *estate, Size minContextSize,
+ Size initBlockSize, Size maxBlockSize)
+{
+ ExprContext *econtext;
+ MemoryContext oldcontext;
+
+ /* Create the ExprContext node within the per-query memory context */
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ econtext = makeNode(ExprContext);
+
+ /* Initialize fields of ExprContext */
+ econtext->ecxt_scantuple = NULL;
+ econtext->ecxt_innertuple = NULL;
+ econtext->ecxt_outertuple = NULL;
+
+ econtext->ecxt_per_query_memory = estate->es_query_cxt;
+
+ /*
+ * Create working memory for expression evaluation in this context.
+ */
+ econtext->ecxt_per_tuple_memory =
+ AllocSetContextCreate(estate->es_query_cxt,
+ "ExprContext",
+ minContextSize,
+ initBlockSize,
+ maxBlockSize);
+
+ econtext->ecxt_param_exec_vals = estate->es_param_exec_vals;
+ econtext->ecxt_param_list_info = estate->es_param_list_info;
+
+ econtext->ecxt_aggvalues = NULL;
+ econtext->ecxt_aggnulls = NULL;
+
+ econtext->caseValue_datum = (Datum) 0;
+ econtext->caseValue_isNull = true;
+
+ econtext->domainValue_datum = (Datum) 0;
+ econtext->domainValue_isNull = true;
+
+ econtext->ecxt_estate = estate;
+
+ econtext->ecxt_callbacks = NULL;
+
+ /*
+ * Link the ExprContext into the EState to ensure it is shut down when the
+ * EState is freed. Because we use lcons(), shutdowns will occur in
+ * reverse order of creation, which may not be essential but can't hurt.
+ */
+ estate->es_exprcontexts = lcons(econtext, estate->es_exprcontexts);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return econtext;
+}
+
+/* ----------------
+ * CreateExprContext
+ *
+ * Create a context for expression evaluation within an EState.
+ *
+ * An executor run may require multiple ExprContexts (we usually make one
+ * for each Plan node, and a separate one for per-output-tuple processing
+ * such as constraint checking). Each ExprContext has its own "per-tuple"
+ * memory context.
+ *
+ * Note we make no assumption about the caller's memory context.
+ * ----------------
+ */
+ExprContext *
+CreateExprContext(EState *estate)
+{
+ return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_SIZES);
+}
+
+
+/* ----------------
+ * CreateWorkExprContext
+ *
+ * Like CreateExprContext, but specifies the AllocSet sizes to be reasonable
+ * in proportion to work_mem. If the maximum block allocation size is too
+ * large, it's easy to skip right past work_mem with a single allocation.
+ * ----------------
+ */
+ExprContext *
+CreateWorkExprContext(EState *estate)
+{
+ Size minContextSize = ALLOCSET_DEFAULT_MINSIZE;
+ Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+ Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
+
+ /* choose the maxBlockSize to be no larger than 1/16 of work_mem */
+ while (16 * maxBlockSize > work_mem * 1024L)
+ maxBlockSize >>= 1;
+
+ if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
+ maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+
+ return CreateExprContextInternal(estate, minContextSize,
+ initBlockSize, maxBlockSize);
+}
+
+/* ----------------
+ * CreateStandaloneExprContext
+ *
+ * Create a context for standalone expression evaluation.
+ *
+ * An ExprContext made this way can be used for evaluation of expressions
+ * that contain no Params, subplans, or Var references (it might work to
+ * put tuple references into the scantuple field, but it seems unwise).
+ *
+ * The ExprContext struct is allocated in the caller's current memory
+ * context, which also becomes its "per query" context.
+ *
+ * It is caller's responsibility to free the ExprContext when done,
+ * or at least ensure that any shutdown callbacks have been called
+ * (ReScanExprContext() is suitable). Otherwise, non-memory resources
+ * might be leaked.
+ * ----------------
+ */
+ExprContext *
+CreateStandaloneExprContext(void)
+{
+ ExprContext *econtext;
+
+ /* Create the ExprContext node within the caller's memory context */
+ econtext = makeNode(ExprContext);
+
+ /* Initialize fields of ExprContext */
+ econtext->ecxt_scantuple = NULL;
+ econtext->ecxt_innertuple = NULL;
+ econtext->ecxt_outertuple = NULL;
+
+ econtext->ecxt_per_query_memory = CurrentMemoryContext;
+
+ /*
+ * Create working memory for expression evaluation in this context.
+ */
+ econtext->ecxt_per_tuple_memory =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "ExprContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ econtext->ecxt_param_exec_vals = NULL;
+ econtext->ecxt_param_list_info = NULL;
+
+ econtext->ecxt_aggvalues = NULL;
+ econtext->ecxt_aggnulls = NULL;
+
+ econtext->caseValue_datum = (Datum) 0;
+ econtext->caseValue_isNull = true;
+
+ econtext->domainValue_datum = (Datum) 0;
+ econtext->domainValue_isNull = true;
+
+ econtext->ecxt_estate = NULL;
+
+ econtext->ecxt_callbacks = NULL;
+
+ return econtext;
+}
+
+/* ----------------
+ * FreeExprContext
+ *
+ * Free an expression context, including calling any remaining
+ * shutdown callbacks.
+ *
+ * Since we free the temporary context used for expression evaluation,
+ * any previously computed pass-by-reference expression result will go away!
+ *
+ * If isCommit is false, we are being called in error cleanup, and should
+ * not call callbacks but only release memory. (It might be better to call
+ * the callbacks and pass the isCommit flag to them, but that would require
+ * more invasive code changes than currently seems justified.)
+ *
+ * Note we make no assumption about the caller's memory context.
+ * ----------------
+ */
+void
+FreeExprContext(ExprContext *econtext, bool isCommit)
+{
+ EState *estate;
+
+ /* Call any registered callbacks */
+ ShutdownExprContext(econtext, isCommit);
+ /* And clean up the memory used */
+ MemoryContextDelete(econtext->ecxt_per_tuple_memory);
+ /* Unlink self from owning EState, if any */
+ estate = econtext->ecxt_estate;
+ if (estate)
+ estate->es_exprcontexts = list_delete_ptr(estate->es_exprcontexts,
+ econtext);
+ /* And delete the ExprContext node */
+ pfree(econtext);
+}
+
+/*
+ * ReScanExprContext
+ *
+ * Reset an expression context in preparation for a rescan of its
+ * plan node. This requires calling any registered shutdown callbacks,
+ * since any partially complete set-returning-functions must be canceled.
+ *
+ * Note we make no assumption about the caller's memory context.
+ */
+void
+ReScanExprContext(ExprContext *econtext)
+{
+ /* Call any registered callbacks */
+ ShutdownExprContext(econtext, true);
+ /* And clean up the memory used */
+ MemoryContextReset(econtext->ecxt_per_tuple_memory);
+}
+
+/*
+ * Build a per-output-tuple ExprContext for an EState.
+ *
+ * This is normally invoked via GetPerTupleExprContext() macro,
+ * not directly.
+ */
+ExprContext *
+MakePerTupleExprContext(EState *estate)
+{
+ if (estate->es_per_tuple_exprcontext == NULL)
+ estate->es_per_tuple_exprcontext = CreateExprContext(estate);
+
+ return estate->es_per_tuple_exprcontext;
+}
+
+
+/* ----------------------------------------------------------------
+ * miscellaneous node-init support functions
+ *
+ * Note: all of these are expected to be called with CurrentMemoryContext
+ * equal to the per-query memory context.
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * ExecAssignExprContext
+ *
+ * This initializes the ps_ExprContext field. It is only necessary
+ * to do this for nodes which use ExecQual or ExecProject
+ * because those routines require an econtext. Other nodes that
+ * don't have to evaluate expressions don't need to do this.
+ * ----------------
+ */
+void
+ExecAssignExprContext(EState *estate, PlanState *planstate)
+{
+ planstate->ps_ExprContext = CreateExprContext(estate);
+}
+
+/* ----------------
+ * ExecGetResultType
+ * ----------------
+ */
+TupleDesc
+ExecGetResultType(PlanState *planstate)
+{
+ return planstate->ps_ResultTupleDesc;
+}
+
+/*
+ * ExecGetResultSlotOps - information about node's type of result slot
+ */
+const TupleTableSlotOps *
+ExecGetResultSlotOps(PlanState *planstate, bool *isfixed)
+{
+ if (planstate->resultopsset && planstate->resultops)
+ {
+ if (isfixed)
+ *isfixed = planstate->resultopsfixed;
+ return planstate->resultops;
+ }
+
+ if (isfixed)
+ {
+ if (planstate->resultopsset)
+ *isfixed = planstate->resultopsfixed;
+ else if (planstate->ps_ResultTupleSlot)
+ *isfixed = TTS_FIXED(planstate->ps_ResultTupleSlot);
+ else
+ *isfixed = false;
+ }
+
+ if (!planstate->ps_ResultTupleSlot)
+ return &TTSOpsVirtual;
+
+ return planstate->ps_ResultTupleSlot->tts_ops;
+}
+
+
+/* ----------------
+ * ExecAssignProjectionInfo
+ *
+ * forms the projection information from the node's targetlist
+ *
+ * Notes for inputDesc are same as for ExecBuildProjectionInfo: supply it
+ * for a relation-scan node, can pass NULL for upper-level nodes
+ * ----------------
+ */
+void
+ExecAssignProjectionInfo(PlanState *planstate,
+ TupleDesc inputDesc)
+{
+ planstate->ps_ProjInfo =
+ ExecBuildProjectionInfo(planstate->plan->targetlist,
+ planstate->ps_ExprContext,
+ planstate->ps_ResultTupleSlot,
+ planstate,
+ inputDesc);
+}
+
+
+/* ----------------
+ * ExecConditionalAssignProjectionInfo
+ *
+ * as ExecAssignProjectionInfo, but store NULL rather than building projection
+ * info if no projection is required
+ * ----------------
+ */
+void
+ExecConditionalAssignProjectionInfo(PlanState *planstate, TupleDesc inputDesc,
+ Index varno)
+{
+ if (tlist_matches_tupdesc(planstate,
+ planstate->plan->targetlist,
+ varno,
+ inputDesc))
+ {
+ planstate->ps_ProjInfo = NULL;
+ planstate->resultopsset = planstate->scanopsset;
+ planstate->resultopsfixed = planstate->scanopsfixed;
+ planstate->resultops = planstate->scanops;
+ }
+ else
+ {
+ if (!planstate->ps_ResultTupleSlot)
+ {
+ ExecInitResultSlot(planstate, &TTSOpsVirtual);
+ planstate->resultops = &TTSOpsVirtual;
+ planstate->resultopsfixed = true;
+ planstate->resultopsset = true;
+ }
+ ExecAssignProjectionInfo(planstate, inputDesc);
+ }
+}
+
+static bool
+tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc)
+{
+ int numattrs = tupdesc->natts;
+ int attrno;
+ ListCell *tlist_item = list_head(tlist);
+
+ /* Check the tlist attributes */
+ for (attrno = 1; attrno <= numattrs; attrno++)
+ {
+ Form_pg_attribute att_tup = TupleDescAttr(tupdesc, attrno - 1);
+ Var *var;
+
+ if (tlist_item == NULL)
+ return false; /* tlist too short */
+ var = (Var *) ((TargetEntry *) lfirst(tlist_item))->expr;
+ if (!var || !IsA(var, Var))
+ return false; /* tlist item not a Var */
+ /* if these Asserts fail, planner messed up */
+ Assert(var->varno == varno);
+ Assert(var->varlevelsup == 0);
+ if (var->varattno != attrno)
+ return false; /* out of order */
+ if (att_tup->attisdropped)
+ return false; /* table contains dropped columns */
+ if (att_tup->atthasmissing)
+ return false; /* table contains cols with missing values */
+
+ /*
+ * Note: usually the Var's type should match the tupdesc exactly, but
+ * in situations involving unions of columns that have different
+ * typmods, the Var may have come from above the union and hence have
+ * typmod -1. This is a legitimate situation since the Var still
+ * describes the column, just not as exactly as the tupdesc does. We
+ * could change the planner to prevent it, but it'd then insert
+ * projection steps just to convert from specific typmod to typmod -1,
+ * which is pretty silly.
+ */
+ if (var->vartype != att_tup->atttypid ||
+ (var->vartypmod != att_tup->atttypmod &&
+ var->vartypmod != -1))
+ return false; /* type mismatch */
+
+ tlist_item = lnext(tlist, tlist_item);
+ }
+
+ if (tlist_item)
+ return false; /* tlist too long */
+
+ return true;
+}
+
+/* ----------------
+ * ExecFreeExprContext
+ *
+ * A plan node's ExprContext should be freed explicitly during executor
+ * shutdown because there may be shutdown callbacks to call. (Other resources
+ * made by the above routines, such as projection info, don't need to be freed
+ * explicitly because they're just memory in the per-query memory context.)
+ *
+ * However ... there is no particular need to do it during ExecEndNode,
+ * because FreeExecutorState will free any remaining ExprContexts within
+ * the EState. Letting FreeExecutorState do it allows the ExprContexts to
+ * be freed in reverse order of creation, rather than order of creation as
+ * will happen if we delete them here, which saves O(N^2) work in the list
+ * cleanup inside FreeExprContext.
+ * ----------------
+ */
+void
+ExecFreeExprContext(PlanState *planstate)
+{
+ /*
+ * Per above discussion, don't actually delete the ExprContext. We do
+ * unlink it from the plan node, though.
+ */
+ planstate->ps_ExprContext = NULL;
+}
+
+
+/* ----------------------------------------------------------------
+ * Scan node support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * ExecAssignScanType
+ * ----------------
+ */
+void
+ExecAssignScanType(ScanState *scanstate, TupleDesc tupDesc)
+{
+ TupleTableSlot *slot = scanstate->ss_ScanTupleSlot;
+
+ ExecSetSlotDescriptor(slot, tupDesc);
+}
+
+/* ----------------
+ * ExecCreateScanSlotFromOuterPlan
+ * ----------------
+ */
+void
+ExecCreateScanSlotFromOuterPlan(EState *estate,
+ ScanState *scanstate,
+ const TupleTableSlotOps *tts_ops)
+{
+ PlanState *outerPlan;
+ TupleDesc tupDesc;
+
+ outerPlan = outerPlanState(scanstate);
+ tupDesc = ExecGetResultType(outerPlan);
+
+ ExecInitScanTupleSlot(estate, scanstate, tupDesc, tts_ops);
+}
+
+/* ----------------------------------------------------------------
+ * ExecRelationIsTargetRelation
+ *
+ * Detect whether a relation (identified by rangetable index)
+ * is one of the target relations of the query.
+ *
+ * Note: This is currently no longer used in core. We keep it around
+ * because FDWs may wish to use it to determine if their foreign table
+ * is a target relation.
+ * ----------------------------------------------------------------
+ */
+bool
+ExecRelationIsTargetRelation(EState *estate, Index scanrelid)
+{
+ return list_member_int(estate->es_plannedstmt->resultRelations, scanrelid);
+}
+
+/* ----------------------------------------------------------------
+ * ExecOpenScanRelation
+ *
+ * Open the heap relation to be scanned by a base-level scan plan node.
+ * This should be called during the node's ExecInit routine.
+ * ----------------------------------------------------------------
+ */
+Relation
+ExecOpenScanRelation(EState *estate, Index scanrelid, int eflags)
+{
+ Relation rel;
+
+ /* Open the relation. */
+ rel = ExecGetRangeTableRelation(estate, scanrelid);
+
+ /*
+ * Complain if we're attempting a scan of an unscannable relation, except
+ * when the query won't actually be run. This is a slightly klugy place
+ * to do this, perhaps, but there is no better place.
+ */
+ if ((eflags & (EXEC_FLAG_EXPLAIN_ONLY | EXEC_FLAG_WITH_NO_DATA)) == 0 &&
+ !RelationIsScannable(rel))
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("materialized view \"%s\" has not been populated",
+ RelationGetRelationName(rel)),
+ errhint("Use the REFRESH MATERIALIZED VIEW command.")));
+
+ return rel;
+}
+
+/*
+ * ExecInitRangeTable
+ * Set up executor's range-table-related data
+ *
+ * In addition to the range table proper, initialize arrays that are
+ * indexed by rangetable index.
+ */
+void
+ExecInitRangeTable(EState *estate, List *rangeTable)
+{
+ /* Remember the range table List as-is */
+ estate->es_range_table = rangeTable;
+
+ /* Set size of associated arrays */
+ estate->es_range_table_size = list_length(rangeTable);
+
+ /*
+ * Allocate an array to store an open Relation corresponding to each
+ * rangetable entry, and initialize entries to NULL. Relations are opened
+ * and stored here as needed.
+ */
+ estate->es_relations = (Relation *)
+ palloc0(estate->es_range_table_size * sizeof(Relation));
+
+ /*
+ * es_result_relations and es_rowmarks are also parallel to
+ * es_range_table, but are allocated only if needed.
+ */
+ estate->es_result_relations = NULL;
+ estate->es_rowmarks = NULL;
+}
+
+/*
+ * ExecGetRangeTableRelation
+ * Open the Relation for a range table entry, if not already done
+ *
+ * The Relations will be closed again in ExecEndPlan().
+ */
+Relation
+ExecGetRangeTableRelation(EState *estate, Index rti)
+{
+ Relation rel;
+
+ Assert(rti > 0 && rti <= estate->es_range_table_size);
+
+ rel = estate->es_relations[rti - 1];
+ if (rel == NULL)
+ {
+ /* First time through, so open the relation */
+ RangeTblEntry *rte = exec_rt_fetch(rti, estate);
+
+ Assert(rte->rtekind == RTE_RELATION);
+
+ if (!IsParallelWorker())
+ {
+ /*
+ * In a normal query, we should already have the appropriate lock,
+ * but verify that through an Assert. Since there's already an
+ * Assert inside table_open that insists on holding some lock, it
+ * seems sufficient to check this only when rellockmode is higher
+ * than the minimum.
+ */
+ rel = table_open(rte->relid, NoLock);
+ Assert(rte->rellockmode == AccessShareLock ||
+ CheckRelationLockedByMe(rel, rte->rellockmode, false));
+ }
+ else
+ {
+ /*
+ * If we are a parallel worker, we need to obtain our own local
+ * lock on the relation. This ensures sane behavior in case the
+ * parent process exits before we do.
+ */
+ rel = table_open(rte->relid, rte->rellockmode);
+ }
+
+ estate->es_relations[rti - 1] = rel;
+ }
+
+ return rel;
+}
+
+/*
+ * ExecInitResultRelation
+ * Open relation given by the passed-in RT index and fill its
+ * ResultRelInfo node
+ *
+ * Here, we also save the ResultRelInfo in estate->es_result_relations array
+ * such that it can be accessed later using the RT index.
+ */
+void
+ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo,
+ Index rti)
+{
+ Relation resultRelationDesc;
+
+ resultRelationDesc = ExecGetRangeTableRelation(estate, rti);
+ InitResultRelInfo(resultRelInfo,
+ resultRelationDesc,
+ rti,
+ NULL,
+ estate->es_instrument);
+
+ if (estate->es_result_relations == NULL)
+ estate->es_result_relations = (ResultRelInfo **)
+ palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *));
+ estate->es_result_relations[rti - 1] = resultRelInfo;
+
+ /*
+ * Saving it in this list lets us avoid needlessly traversing the whole
+ * array when only a few of its entries are possibly non-NULL.
+ */
+ estate->es_opened_result_relations =
+ lappend(estate->es_opened_result_relations, resultRelInfo);
+}
+
+/*
+ * UpdateChangedParamSet
+ * Add changed parameters to a plan node's chgParam set
+ */
+void
+UpdateChangedParamSet(PlanState *node, Bitmapset *newchg)
+{
+ Bitmapset *parmset;
+
+ /*
+ * The plan node only depends on params listed in its allParam set. Don't
+ * include anything else into its chgParam set.
+ */
+ parmset = bms_intersect(node->plan->allParam, newchg);
+
+ /*
+ * Keep node->chgParam == NULL if there's not actually any members; this
+ * allows the simplest possible tests in executor node files.
+ */
+ if (!bms_is_empty(parmset))
+ node->chgParam = bms_join(node->chgParam, parmset);
+ else
+ bms_free(parmset);
+}
+
+/*
+ * executor_errposition
+ * Report an execution-time cursor position, if possible.
+ *
+ * This is expected to be used within an ereport() call. The return value
+ * is a dummy (always 0, in fact).
+ *
+ * The locations stored in parsetrees are byte offsets into the source string.
+ * We have to convert them to 1-based character indexes for reporting to
+ * clients. (We do things this way to avoid unnecessary overhead in the
+ * normal non-error case: computing character indexes would be much more
+ * expensive than storing token offsets.)
+ */
+int
+executor_errposition(EState *estate, int location)
+{
+ int pos;
+
+ /* No-op if location was not provided */
+ if (location < 0)
+ return 0;
+ /* Can't do anything if source text is not available */
+ if (estate == NULL || estate->es_sourceText == NULL)
+ return 0;
+ /* Convert offset to character number */
+ pos = pg_mbstrlen_with_len(estate->es_sourceText, location) + 1;
+ /* And pass it to the ereport mechanism */
+ return errposition(pos);
+}
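/*
 * [Editor's note: illustrative sketch, not part of this patch.]
 *
 * executor_errposition() is meant to appear as an extra field inside an
 * ereport() call, so the client receives a cursor position pointing at
 * the offending expression.  "estate" and "expr" are assumed to be in
 * scope at the error site; the errcode and message are placeholders.
 */
    ereport(ERROR,
            (errcode(ERRCODE_DATA_EXCEPTION),
             errmsg("could not evaluate expression"),
             executor_errposition(estate, exprLocation((Node *) expr))));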
+
+/*
+ * Register a shutdown callback in an ExprContext.
+ *
+ * Shutdown callbacks will be called (in reverse order of registration)
+ * when the ExprContext is deleted or rescanned. This provides a hook
+ * for functions called in the context to do any cleanup needed --- it's
+ * particularly useful for functions returning sets. Note that the
+ * callback will *not* be called in the event that execution is aborted
+ * by an error.
+ */
+void
+RegisterExprContextCallback(ExprContext *econtext,
+ ExprContextCallbackFunction function,
+ Datum arg)
+{
+ ExprContext_CB *ecxt_callback;
+
+ /* Save the info in appropriate memory context */
+ ecxt_callback = (ExprContext_CB *)
+ MemoryContextAlloc(econtext->ecxt_per_query_memory,
+ sizeof(ExprContext_CB));
+
+ ecxt_callback->function = function;
+ ecxt_callback->arg = arg;
+
+ /* link to front of list for appropriate execution order */
+ ecxt_callback->next = econtext->ecxt_callbacks;
+ econtext->ecxt_callbacks = ecxt_callback;
+}
+
+/*
+ * Deregister a shutdown callback in an ExprContext.
+ *
+ * Any list entries matching the function and arg will be removed.
+ * This can be used if it's no longer necessary to call the callback.
+ */
+void
+UnregisterExprContextCallback(ExprContext *econtext,
+ ExprContextCallbackFunction function,
+ Datum arg)
+{
+ ExprContext_CB **prev_callback;
+ ExprContext_CB *ecxt_callback;
+
+ prev_callback = &econtext->ecxt_callbacks;
+
+ while ((ecxt_callback = *prev_callback) != NULL)
+ {
+ if (ecxt_callback->function == function && ecxt_callback->arg == arg)
+ {
+ *prev_callback = ecxt_callback->next;
+ pfree(ecxt_callback);
+ }
+ else
+ prev_callback = &ecxt_callback->next;
+ }
+}
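/*
 * [Editor's note: illustrative sketch, not part of this patch.]
 *
 * Registering a shutdown callback so that a non-memory resource (here a
 * file opened with AllocateFile) is released when the ExprContext is
 * rescanned or deleted.  The struct, callback, and helper functions are
 * all hypothetical; the Register/Unregister signatures are as above.
 */
#include "postgres.h"
#include "executor/executor.h"
#include "storage/fd.h"

typedef struct my_file_state
{
    FILE       *file;           /* opened with AllocateFile() */
} my_file_state;

static void
close_my_file(Datum arg)
{
    my_file_state *state = (my_file_state *) DatumGetPointer(arg);

    if (state->file != NULL)
    {
        FreeFile(state->file);
        state->file = NULL;
    }
}

static void
arrange_file_cleanup(ExprContext *econtext, my_file_state *state)
{
    /* callbacks run in reverse registration order at rescan or deletion */
    RegisterExprContextCallback(econtext, close_my_file,
                                PointerGetDatum(state));
}

static void
cancel_file_cleanup(ExprContext *econtext, my_file_state *state)
{
    /* once cleanup has already been done, drop the callback again */
    UnregisterExprContextCallback(econtext, close_my_file,
                                  PointerGetDatum(state));
}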
+
+/*
+ * Call all the shutdown callbacks registered in an ExprContext.
+ *
+ * The callback list is emptied (important in case this is only a rescan
+ * reset, and not deletion of the ExprContext).
+ *
+ * If isCommit is false, just clean the callback list but don't call 'em.
+ * (See comment for FreeExprContext.)
+ */
+static void
+ShutdownExprContext(ExprContext *econtext, bool isCommit)
+{
+ ExprContext_CB *ecxt_callback;
+ MemoryContext oldcontext;
+
+ /* Fast path in normal case where there's nothing to do. */
+ if (econtext->ecxt_callbacks == NULL)
+ return;
+
+ /*
+ * Call the callbacks in econtext's per-tuple context. This ensures that
+ * any memory they might leak will get cleaned up.
+ */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ /*
+ * Call each callback function in reverse registration order.
+ */
+ while ((ecxt_callback = econtext->ecxt_callbacks) != NULL)
+ {
+ econtext->ecxt_callbacks = ecxt_callback->next;
+ if (isCommit)
+ ecxt_callback->function(ecxt_callback->arg);
+ pfree(ecxt_callback);
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * GetAttributeByName
+ * GetAttributeByNum
+ *
+ * These functions return the value of the requested attribute
+ * out of the given tuple Datum.
+ * C functions which take a tuple as an argument are expected
+ * to use these. Ex: overpaid(EMP) might call GetAttributeByNum().
+ * Note: these are actually rather slow because they do a typcache
+ * lookup on each call.
+ */
+Datum
+GetAttributeByName(HeapTupleHeader tuple, const char *attname, bool *isNull)
+{
+ AttrNumber attrno;
+ Datum result;
+ Oid tupType;
+ int32 tupTypmod;
+ TupleDesc tupDesc;
+ HeapTupleData tmptup;
+ int i;
+
+ if (attname == NULL)
+ elog(ERROR, "invalid attribute name");
+
+ if (isNull == NULL)
+ elog(ERROR, "a NULL isNull pointer was passed");
+
+ if (tuple == NULL)
+ {
+ /* Kinda bogus but compatible with old behavior... */
+ *isNull = true;
+ return (Datum) 0;
+ }
+
+ tupType = HeapTupleHeaderGetTypeId(tuple);
+ tupTypmod = HeapTupleHeaderGetTypMod(tuple);
+ tupDesc = lookup_rowtype_tupdesc(tupType, tupTypmod);
+
+ attrno = InvalidAttrNumber;
+ for (i = 0; i < tupDesc->natts; i++)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupDesc, i);
+
+ if (namestrcmp(&(att->attname), attname) == 0)
+ {
+ attrno = att->attnum;
+ break;
+ }
+ }
+
+ if (attrno == InvalidAttrNumber)
+ elog(ERROR, "attribute \"%s\" does not exist", attname);
+
+ /*
+ * heap_getattr needs a HeapTuple not a bare HeapTupleHeader. We set all
+ * the fields in the struct just in case user tries to inspect system
+ * columns.
+ */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple);
+ ItemPointerSetInvalid(&(tmptup.t_self));
+ tmptup.t_tableOid = InvalidOid;
+ tmptup.t_data = tuple;
+
+ result = heap_getattr(&tmptup,
+ attrno,
+ tupDesc,
+ isNull);
+
+ ReleaseTupleDesc(tupDesc);
+
+ return result;
+}
+
+Datum
+GetAttributeByNum(HeapTupleHeader tuple,
+ AttrNumber attrno,
+ bool *isNull)
+{
+ Datum result;
+ Oid tupType;
+ int32 tupTypmod;
+ TupleDesc tupDesc;
+ HeapTupleData tmptup;
+
+ if (!AttributeNumberIsValid(attrno))
+ elog(ERROR, "invalid attribute number %d", attrno);
+
+ if (isNull == NULL)
+ elog(ERROR, "a NULL isNull pointer was passed");
+
+ if (tuple == NULL)
+ {
+ /* Kinda bogus but compatible with old behavior... */
+ *isNull = true;
+ return (Datum) 0;
+ }
+
+ tupType = HeapTupleHeaderGetTypeId(tuple);
+ tupTypmod = HeapTupleHeaderGetTypMod(tuple);
+ tupDesc = lookup_rowtype_tupdesc(tupType, tupTypmod);
+
+ /*
+ * heap_getattr needs a HeapTuple not a bare HeapTupleHeader. We set all
+ * the fields in the struct just in case user tries to inspect system
+ * columns.
+ */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple);
+ ItemPointerSetInvalid(&(tmptup.t_self));
+ tmptup.t_tableOid = InvalidOid;
+ tmptup.t_data = tuple;
+
+ result = heap_getattr(&tmptup,
+ attrno,
+ tupDesc,
+ isNull);
+
+ ReleaseTupleDesc(tupDesc);
+
+ return result;
+}
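/*
 * [Editor's note: illustrative sketch, not part of this patch.]
 *
 * The classic use of GetAttributeByName() from a C function taking a
 * composite-type argument, along the lines of the overpaid(EMP) example
 * mentioned above; the "salary" column name is hypothetical.
 */
#include "postgres.h"
#include "fmgr.h"
#include "executor/executor.h"

PG_FUNCTION_INFO_V1(c_overpaid);

Datum
c_overpaid(PG_FUNCTION_ARGS)
{
    HeapTupleHeader t = PG_GETARG_HEAPTUPLEHEADER(0);
    int32       limit = PG_GETARG_INT32(1);
    bool        isnull;
    Datum       salary;

    salary = GetAttributeByName(t, "salary", &isnull);
    if (isnull)
        PG_RETURN_BOOL(false);

    PG_RETURN_BOOL(DatumGetInt32(salary) > limit);
}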
+
+/*
+ * Number of items in a tlist (including any resjunk items!)
+ */
+int
+ExecTargetListLength(List *targetlist)
+{
+ /* This used to be more complex, but fjoins are dead */
+ return list_length(targetlist);
+}
+
+/*
+ * Number of items in a tlist, not including any resjunk items
+ */
+int
+ExecCleanTargetListLength(List *targetlist)
+{
+ int len = 0;
+ ListCell *tl;
+
+ foreach(tl, targetlist)
+ {
+ TargetEntry *curTle = lfirst_node(TargetEntry, tl);
+
+ if (!curTle->resjunk)
+ len++;
+ }
+ return len;
+}
+
+/*
+ * Return a relInfo's tuple slot for a trigger's OLD tuples.
+ */
+TupleTableSlot *
+ExecGetTriggerOldSlot(EState *estate, ResultRelInfo *relInfo)
+{
+ if (relInfo->ri_TrigOldSlot == NULL)
+ {
+ Relation rel = relInfo->ri_RelationDesc;
+ MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ relInfo->ri_TrigOldSlot =
+ ExecInitExtraTupleSlot(estate,
+ RelationGetDescr(rel),
+ table_slot_callbacks(rel));
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ return relInfo->ri_TrigOldSlot;
+}
+
+/*
+ * Return a relInfo's tuple slot for a trigger's NEW tuples.
+ */
+TupleTableSlot *
+ExecGetTriggerNewSlot(EState *estate, ResultRelInfo *relInfo)
+{
+ if (relInfo->ri_TrigNewSlot == NULL)
+ {
+ Relation rel = relInfo->ri_RelationDesc;
+ MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ relInfo->ri_TrigNewSlot =
+ ExecInitExtraTupleSlot(estate,
+ RelationGetDescr(rel),
+ table_slot_callbacks(rel));
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ return relInfo->ri_TrigNewSlot;
+}
+
+/*
+ * Return a relInfo's tuple slot for processing returning tuples.
+ */
+TupleTableSlot *
+ExecGetReturningSlot(EState *estate, ResultRelInfo *relInfo)
+{
+ if (relInfo->ri_ReturningSlot == NULL)
+ {
+ Relation rel = relInfo->ri_RelationDesc;
+ MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ relInfo->ri_ReturningSlot =
+ ExecInitExtraTupleSlot(estate,
+ RelationGetDescr(rel),
+ table_slot_callbacks(rel));
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ return relInfo->ri_ReturningSlot;
+}
+
+/*
+ * Return the map needed to convert given child result relation's tuples to
+ * the rowtype of the query's main target ("root") relation. Note that a
+ * NULL result is valid and means that no conversion is needed.
+ */
+TupleConversionMap *
+ExecGetChildToRootMap(ResultRelInfo *resultRelInfo)
+{
+ /* If we didn't already do so, compute the map for this child. */
+ if (!resultRelInfo->ri_ChildToRootMapValid)
+ {
+ ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo;
+
+ if (rootRelInfo)
+ resultRelInfo->ri_ChildToRootMap =
+ convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc),
+ RelationGetDescr(rootRelInfo->ri_RelationDesc));
+ else /* this isn't a child result rel */
+ resultRelInfo->ri_ChildToRootMap = NULL;
+
+ resultRelInfo->ri_ChildToRootMapValid = true;
+ }
+
+ return resultRelInfo->ri_ChildToRootMap;
+}
+
+/* Return a bitmap representing columns being inserted */
+Bitmapset *
+ExecGetInsertedCols(ResultRelInfo *relinfo, EState *estate)
+{
+ /*
+ * The columns are stored in the range table entry. If this ResultRelInfo
+ * represents a partition routing target, and doesn't have an entry of its
+ * own in the range table, fetch the parent's RTE and map the columns to
+ * the order they are in the partition.
+ */
+ if (relinfo->ri_RangeTableIndex != 0)
+ {
+ RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate);
+
+ return rte->insertedCols;
+ }
+ else if (relinfo->ri_RootResultRelInfo)
+ {
+ ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo;
+ RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate);
+
+ if (relinfo->ri_RootToPartitionMap != NULL)
+ return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap,
+ rte->insertedCols);
+ else
+ return rte->insertedCols;
+ }
+ else
+ {
+ /*
+ * The relation isn't in the range table and it isn't a partition
+ * routing target. This ResultRelInfo must've been created only for
+ * firing triggers and the relation is not being inserted into. (See
+ * ExecGetTriggerResultRel.)
+ */
+ return NULL;
+ }
+}
+
+/* Return a bitmap representing columns being updated */
+Bitmapset *
+ExecGetUpdatedCols(ResultRelInfo *relinfo, EState *estate)
+{
+ /* see ExecGetInsertedCols() */
+ if (relinfo->ri_RangeTableIndex != 0)
+ {
+ RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate);
+
+ return rte->updatedCols;
+ }
+ else if (relinfo->ri_RootResultRelInfo)
+ {
+ ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo;
+ RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate);
+
+ if (relinfo->ri_RootToPartitionMap != NULL)
+ return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap,
+ rte->updatedCols);
+ else
+ return rte->updatedCols;
+ }
+ else
+ return NULL;
+}
+
+/* Return a bitmap representing generated columns being updated */
+Bitmapset *
+ExecGetExtraUpdatedCols(ResultRelInfo *relinfo, EState *estate)
+{
+ /* see ExecGetInsertedCols() */
+ if (relinfo->ri_RangeTableIndex != 0)
+ {
+ RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate);
+
+ return rte->extraUpdatedCols;
+ }
+ else if (relinfo->ri_RootResultRelInfo)
+ {
+ ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo;
+ RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate);
+
+ if (relinfo->ri_RootToPartitionMap != NULL)
+ return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap,
+ rte->extraUpdatedCols);
+ else
+ return rte->extraUpdatedCols;
+ }
+ else
+ return NULL;
+}
+
+/* Return columns being updated, including generated columns */
+Bitmapset *
+ExecGetAllUpdatedCols(ResultRelInfo *relinfo, EState *estate)
+{
+ return bms_union(ExecGetUpdatedCols(relinfo, estate),
+ ExecGetExtraUpdatedCols(relinfo, estate));
+}
diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c
new file mode 100644
index 0000000..296e54e
--- /dev/null
+++ b/src/backend/executor/functions.c
@@ -0,0 +1,2103 @@
+/*-------------------------------------------------------------------------
+ *
+ * functions.c
+ * Execution of SQL-language functions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/functions.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/xact.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "executor/functions.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "parser/parse_coerce.h"
+#include "parser/parse_collate.h"
+#include "parser/parse_func.h"
+#include "rewrite/rewriteHandler.h"
+#include "storage/proc.h"
+#include "tcop/utility.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+
+
+/*
+ * Specialized DestReceiver for collecting query output in a SQL function
+ */
+typedef struct
+{
+ DestReceiver pub; /* publicly-known function pointers */
+ Tuplestorestate *tstore; /* where to put result tuples */
+ MemoryContext cxt; /* context containing tstore */
+ JunkFilter *filter; /* filter to convert tuple type */
+} DR_sqlfunction;
+
+/*
+ * We have an execution_state record for each query in a function. Each
+ * record contains a plantree for its query. If the query is currently in
+ * F_EXEC_RUN state then there's a QueryDesc too.
+ *
+ * The "next" fields chain together all the execution_state records generated
+ * from a single original parsetree. (There will only be more than one in
+ * case of rule expansion of the original parsetree.)
+ */
+typedef enum
+{
+ F_EXEC_START, F_EXEC_RUN, F_EXEC_DONE
+} ExecStatus;
+
+typedef struct execution_state
+{
+ struct execution_state *next;
+ ExecStatus status;
+ bool setsResult; /* true if this query produces func's result */
+ bool lazyEval; /* true if should fetch one row at a time */
+ PlannedStmt *stmt; /* plan for this query */
+ QueryDesc *qd; /* null unless status == RUN */
+} execution_state;
+
+
+/*
+ * An SQLFunctionCache record is built during the first call,
+ * and linked to from the fn_extra field of the FmgrInfo struct.
+ *
+ * Note that currently this has only the lifespan of the calling query.
+ * Someday we should rewrite this code to use plancache.c to save parse/plan
+ * results for longer than that.
+ *
+ * Physically, though, the data has the lifespan of the FmgrInfo that's used
+ * to call the function, and there are cases (particularly with indexes)
+ * where the FmgrInfo might survive across transactions. We cannot assume
+ * that the parse/plan trees are good for longer than the (sub)transaction in
+ * which parsing was done, so we must mark the record with the LXID/subxid of
+ * its creation time, and regenerate everything if that's obsolete. To avoid
+ * memory leakage when we do have to regenerate things, all the data is kept
+ * in a sub-context of the FmgrInfo's fn_mcxt.
+ */
+typedef struct
+{
+ char *fname; /* function name (for error msgs) */
+ char *src; /* function body text (for error msgs) */
+
+ SQLFunctionParseInfoPtr pinfo; /* data for parser callback hooks */
+
+ Oid rettype; /* actual return type */
+ int16 typlen; /* length of the return type */
+ bool typbyval; /* true if return type is pass by value */
+ bool returnsSet; /* true if returning multiple rows */
+ bool returnsTuple; /* true if returning whole tuple result */
+ bool shutdown_reg; /* true if registered shutdown callback */
+ bool readonly_func; /* true to run in "read only" mode */
+ bool lazyEval; /* true if using lazyEval for result query */
+
+ ParamListInfo paramLI; /* Param list representing current args */
+
+ Tuplestorestate *tstore; /* where we accumulate result tuples */
+
+ JunkFilter *junkFilter; /* will be NULL if function returns VOID */
+
+ /*
+ * func_state is a List of execution_state records, each of which is the
+ * first for its original parsetree, with any additional records chained
+ * to it via the "next" fields. This sublist structure is needed to keep
+ * track of where the original query boundaries are.
+ */
+ List *func_state;
+
+ MemoryContext fcontext; /* memory context holding this struct and all
+ * subsidiary data */
+
+ LocalTransactionId lxid; /* lxid in which cache was made */
+ SubTransactionId subxid; /* subxid in which cache was made */
+} SQLFunctionCache;
+
+typedef SQLFunctionCache *SQLFunctionCachePtr;
+
+
+/* non-export function prototypes */
+static Node *sql_fn_param_ref(ParseState *pstate, ParamRef *pref);
+static Node *sql_fn_post_column_ref(ParseState *pstate,
+ ColumnRef *cref, Node *var);
+static Node *sql_fn_make_param(SQLFunctionParseInfoPtr pinfo,
+ int paramno, int location);
+static Node *sql_fn_resolve_param_name(SQLFunctionParseInfoPtr pinfo,
+ const char *paramname, int location);
+static List *init_execution_state(List *queryTree_list,
+ SQLFunctionCachePtr fcache,
+ bool lazyEvalOK);
+static void init_sql_fcache(FunctionCallInfo fcinfo, Oid collation, bool lazyEvalOK);
+static void postquel_start(execution_state *es, SQLFunctionCachePtr fcache);
+static bool postquel_getnext(execution_state *es, SQLFunctionCachePtr fcache);
+static void postquel_end(execution_state *es);
+static void postquel_sub_params(SQLFunctionCachePtr fcache,
+ FunctionCallInfo fcinfo);
+static Datum postquel_get_single_result(TupleTableSlot *slot,
+ FunctionCallInfo fcinfo,
+ SQLFunctionCachePtr fcache,
+ MemoryContext resultcontext);
+static void sql_exec_error_callback(void *arg);
+static void ShutdownSQLFunction(Datum arg);
+static bool coerce_fn_result_column(TargetEntry *src_tle,
+ Oid res_type, int32 res_typmod,
+ bool tlist_is_modifiable,
+ List **upper_tlist,
+ bool *upper_tlist_nontrivial);
+static void sqlfunction_startup(DestReceiver *self, int operation, TupleDesc typeinfo);
+static bool sqlfunction_receive(TupleTableSlot *slot, DestReceiver *self);
+static void sqlfunction_shutdown(DestReceiver *self);
+static void sqlfunction_destroy(DestReceiver *self);
+
+
+/*
+ * Prepare the SQLFunctionParseInfo struct for parsing a SQL function body
+ *
+ * This includes resolving actual types of polymorphic arguments.
+ *
+ * call_expr can be passed as NULL, but then we will fail if there are any
+ * polymorphic arguments.
+ */
+SQLFunctionParseInfoPtr
+prepare_sql_fn_parse_info(HeapTuple procedureTuple,
+ Node *call_expr,
+ Oid inputCollation)
+{
+ SQLFunctionParseInfoPtr pinfo;
+ Form_pg_proc procedureStruct = (Form_pg_proc) GETSTRUCT(procedureTuple);
+ int nargs;
+
+ pinfo = (SQLFunctionParseInfoPtr) palloc0(sizeof(SQLFunctionParseInfo));
+
+ /* Function's name (only) can be used to qualify argument names */
+ pinfo->fname = pstrdup(NameStr(procedureStruct->proname));
+
+ /* Save the function's input collation */
+ pinfo->collation = inputCollation;
+
+ /*
+ * Copy input argument types from the pg_proc entry, then resolve any
+ * polymorphic types.
+ */
+ pinfo->nargs = nargs = procedureStruct->pronargs;
+ if (nargs > 0)
+ {
+ Oid *argOidVect;
+ int argnum;
+
+ argOidVect = (Oid *) palloc(nargs * sizeof(Oid));
+ memcpy(argOidVect,
+ procedureStruct->proargtypes.values,
+ nargs * sizeof(Oid));
+
+ for (argnum = 0; argnum < nargs; argnum++)
+ {
+ Oid argtype = argOidVect[argnum];
+
+ if (IsPolymorphicType(argtype))
+ {
+ argtype = get_call_expr_argtype(call_expr, argnum);
+ if (argtype == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("could not determine actual type of argument declared %s",
+ format_type_be(argOidVect[argnum]))));
+ argOidVect[argnum] = argtype;
+ }
+ }
+
+ pinfo->argtypes = argOidVect;
+ }
+
+ /*
+ * Collect names of arguments, too, if any
+ */
+ if (nargs > 0)
+ {
+ Datum proargnames;
+ Datum proargmodes;
+ int n_arg_names;
+ bool isNull;
+
+ proargnames = SysCacheGetAttr(PROCNAMEARGSNSP, procedureTuple,
+ Anum_pg_proc_proargnames,
+ &isNull);
+ if (isNull)
+ proargnames = PointerGetDatum(NULL); /* just to be sure */
+
+ proargmodes = SysCacheGetAttr(PROCNAMEARGSNSP, procedureTuple,
+ Anum_pg_proc_proargmodes,
+ &isNull);
+ if (isNull)
+ proargmodes = PointerGetDatum(NULL); /* just to be sure */
+
+ n_arg_names = get_func_input_arg_names(proargnames, proargmodes,
+ &pinfo->argnames);
+
+ /* Paranoia: ignore the result if too few array entries */
+ if (n_arg_names < nargs)
+ pinfo->argnames = NULL;
+ }
+ else
+ pinfo->argnames = NULL;
+
+ return pinfo;
+}
+
+/*
+ * Parser setup hook for parsing a SQL function body.
+ */
+void
+sql_fn_parser_setup(struct ParseState *pstate, SQLFunctionParseInfoPtr pinfo)
+{
+ pstate->p_pre_columnref_hook = NULL;
+ pstate->p_post_columnref_hook = sql_fn_post_column_ref;
+ pstate->p_paramref_hook = sql_fn_param_ref;
+ /* no need to use p_coerce_param_hook */
+ pstate->p_ref_hook_state = (void *) pinfo;
+}
+
+/*
+ * sql_fn_post_column_ref parser callback for ColumnRefs
+ */
+static Node *
+sql_fn_post_column_ref(ParseState *pstate, ColumnRef *cref, Node *var)
+{
+ SQLFunctionParseInfoPtr pinfo = (SQLFunctionParseInfoPtr) pstate->p_ref_hook_state;
+ int nnames;
+ Node *field1;
+ Node *subfield = NULL;
+ const char *name1;
+ const char *name2 = NULL;
+ Node *param;
+
+ /*
+ * Never override a table-column reference. This corresponds to
+ * considering the parameter names to appear in a scope outside the
+ * individual SQL commands, which is what we want.
+ */
+ if (var != NULL)
+ return NULL;
+
+ /*----------
+ * The allowed syntaxes are:
+ *
+ * A A = parameter name
+ * A.B A = function name, B = parameter name
+ * OR: A = record-typed parameter name, B = field name
+ * (the first possibility takes precedence)
+ * A.B.C A = function name, B = record-typed parameter name,
+ * C = field name
+ * A.* Whole-row reference to composite parameter A.
+ * A.B.* Same, with A = function name, B = parameter name
+ *
+ * Here, it's sufficient to ignore the "*" in the last two cases --- the
+ * main parser will take care of expanding the whole-row reference.
+ *----------
+ */
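+ /*
+ * A hypothetical illustration: given CREATE TYPE complex AS (r float8,
+ * i float8) and CREATE FUNCTION cmag(z complex) ..., a body reference
+ * "cmag.z.r" uses the A.B.C form (function, parameter, field), "z.r"
+ * uses the A.B form (parameter, field), and "z" alone names the whole
+ * parameter.
+ */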
+ nnames = list_length(cref->fields);
+
+ if (nnames > 3)
+ return NULL;
+
+ if (IsA(llast(cref->fields), A_Star))
+ nnames--;
+
+ field1 = (Node *) linitial(cref->fields);
+ Assert(IsA(field1, String));
+ name1 = strVal(field1);
+ if (nnames > 1)
+ {
+ subfield = (Node *) lsecond(cref->fields);
+ Assert(IsA(subfield, String));
+ name2 = strVal(subfield);
+ }
+
+ if (nnames == 3)
+ {
+ /*
+ * Three-part name: if the first part doesn't match the function name,
+ * we can fail immediately. Otherwise, look up the second part, and
+ * take the third part to be a field reference.
+ */
+ if (strcmp(name1, pinfo->fname) != 0)
+ return NULL;
+
+ param = sql_fn_resolve_param_name(pinfo, name2, cref->location);
+
+ subfield = (Node *) lthird(cref->fields);
+ Assert(IsA(subfield, String));
+ }
+ else if (nnames == 2 && strcmp(name1, pinfo->fname) == 0)
+ {
+ /*
+ * Two-part name with first part matching function name: first see if
+ * second part matches any parameter name.
+ */
+ param = sql_fn_resolve_param_name(pinfo, name2, cref->location);
+
+ if (param)
+ {
+ /* Yes, so this is a parameter reference, no subfield */
+ subfield = NULL;
+ }
+ else
+ {
+ /* No, so try to match as parameter name and subfield */
+ param = sql_fn_resolve_param_name(pinfo, name1, cref->location);
+ }
+ }
+ else
+ {
+ /* Single name, or parameter name followed by subfield */
+ param = sql_fn_resolve_param_name(pinfo, name1, cref->location);
+ }
+
+ if (!param)
+ return NULL; /* No match */
+
+ if (subfield)
+ {
+ /*
+ * Must be a reference to a field of a composite parameter; otherwise
+ * ParseFuncOrColumn will return NULL, and we'll fail back at the
+ * caller.
+ */
+ param = ParseFuncOrColumn(pstate,
+ list_make1(subfield),
+ list_make1(param),
+ pstate->p_last_srf,
+ NULL,
+ false,
+ cref->location);
+ }
+
+ return param;
+}
+
+/*
+ * sql_fn_param_ref parser callback for ParamRefs ($n symbols)
+ */
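+/*
+ * Hypothetical example: in CREATE FUNCTION add_one(integer) RETURNS integer
+ * AS 'SELECT $1 + 1' LANGUAGE SQL, the "$1" in the body arrives here as a
+ * ParamRef with number 1 and is converted into a PARAM_EXTERN Param of type
+ * integer via sql_fn_make_param().
+ */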
+static Node *
+sql_fn_param_ref(ParseState *pstate, ParamRef *pref)
+{
+ SQLFunctionParseInfoPtr pinfo = (SQLFunctionParseInfoPtr) pstate->p_ref_hook_state;
+ int paramno = pref->number;
+
+ /* Check parameter number is valid */
+ if (paramno <= 0 || paramno > pinfo->nargs)
+ return NULL; /* unknown parameter number */
+
+ return sql_fn_make_param(pinfo, paramno, pref->location);
+}
+
+/*
+ * sql_fn_make_param construct a Param node for the given paramno
+ */
+static Node *
+sql_fn_make_param(SQLFunctionParseInfoPtr pinfo,
+ int paramno, int location)
+{
+ Param *param;
+
+ param = makeNode(Param);
+ param->paramkind = PARAM_EXTERN;
+ param->paramid = paramno;
+ param->paramtype = pinfo->argtypes[paramno - 1];
+ param->paramtypmod = -1;
+ param->paramcollid = get_typcollation(param->paramtype);
+ param->location = location;
+
+ /*
+ * If we have a function input collation, allow it to override the
+ * type-derived collation for parameter symbols. (XXX perhaps this should
+ * not happen if the type collation is not default?)
+ */
+ if (OidIsValid(pinfo->collation) && OidIsValid(param->paramcollid))
+ param->paramcollid = pinfo->collation;
+
+ return (Node *) param;
+}
+
+/*
+ * Search for a function parameter of the given name; if there is one,
+ * construct and return a Param node for it. If not, return NULL.
+ * Helper function for sql_fn_post_column_ref.
+ */
+static Node *
+sql_fn_resolve_param_name(SQLFunctionParseInfoPtr pinfo,
+ const char *paramname, int location)
+{
+ int i;
+
+ if (pinfo->argnames == NULL)
+ return NULL;
+
+ for (i = 0; i < pinfo->nargs; i++)
+ {
+ if (pinfo->argnames[i] && strcmp(pinfo->argnames[i], paramname) == 0)
+ return sql_fn_make_param(pinfo, i + 1, location);
+ }
+
+ return NULL;
+}
+
+/*
+ * Set up the per-query execution_state records for a SQL function.
+ *
+ * The input is a List of Lists of parsed and rewritten, but not planned,
+ * querytrees. The sublist structure denotes the original query boundaries.
+ */
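+/*
+ * For instance (hypothetical), a body with two statements yields a
+ * two-element outer list; if a rewrite rule expands the second statement
+ * into several queries, those all remain within the second sublist.
+ */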
+static List *
+init_execution_state(List *queryTree_list,
+ SQLFunctionCachePtr fcache,
+ bool lazyEvalOK)
+{
+ List *eslist = NIL;
+ execution_state *lasttages = NULL;
+ ListCell *lc1;
+
+ foreach(lc1, queryTree_list)
+ {
+ List *qtlist = lfirst_node(List, lc1);
+ execution_state *firstes = NULL;
+ execution_state *preves = NULL;
+ ListCell *lc2;
+
+ foreach(lc2, qtlist)
+ {
+ Query *queryTree = lfirst_node(Query, lc2);
+ PlannedStmt *stmt;
+ execution_state *newes;
+
+ /* Plan the query if needed */
+ if (queryTree->commandType == CMD_UTILITY)
+ {
+ /* Utility commands require no planning. */
+ stmt = makeNode(PlannedStmt);
+ stmt->commandType = CMD_UTILITY;
+ stmt->canSetTag = queryTree->canSetTag;
+ stmt->utilityStmt = queryTree->utilityStmt;
+ stmt->stmt_location = queryTree->stmt_location;
+ stmt->stmt_len = queryTree->stmt_len;
+ }
+ else
+ stmt = pg_plan_query(queryTree,
+ fcache->src,
+ CURSOR_OPT_PARALLEL_OK,
+ NULL);
+
+ /*
+ * Precheck all commands for validity in a function. This should
+ * generally match the restrictions spi.c applies.
+ */
+ if (stmt->commandType == CMD_UTILITY)
+ {
+ if (IsA(stmt->utilityStmt, CopyStmt) &&
+ ((CopyStmt *) stmt->utilityStmt)->filename == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot COPY to/from client in an SQL function")));
+
+ if (IsA(stmt->utilityStmt, TransactionStmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ /* translator: %s is a SQL statement name */
+ errmsg("%s is not allowed in an SQL function",
+ CreateCommandName(stmt->utilityStmt))));
+ }
+
+ if (fcache->readonly_func && !CommandIsReadOnly(stmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ /* translator: %s is a SQL statement name */
+ errmsg("%s is not allowed in a non-volatile function",
+ CreateCommandName((Node *) stmt))));
+
+ /* OK, build the execution_state for this query */
+ newes = (execution_state *) palloc(sizeof(execution_state));
+ if (preves)
+ preves->next = newes;
+ else
+ firstes = newes;
+
+ newes->next = NULL;
+ newes->status = F_EXEC_START;
+ newes->setsResult = false; /* might change below */
+ newes->lazyEval = false; /* might change below */
+ newes->stmt = stmt;
+ newes->qd = NULL;
+
+ if (queryTree->canSetTag)
+ lasttages = newes;
+
+ preves = newes;
+ }
+
+ eslist = lappend(eslist, firstes);
+ }
+
+ /*
+ * Mark the last canSetTag query as delivering the function result; then,
+ * if it is a plain SELECT, mark it for lazy evaluation. If it's not a
+ * SELECT we must always run it to completion.
+ *
+ * Note: at some point we might add additional criteria for whether to use
+ * lazy eval. However, we should prefer to use it whenever the function
+ * doesn't return set, since fetching more than one row is useless in that
+ * case.
+ *
+ * Note: don't set setsResult if the function returns VOID, as evidenced
+ * by not having made a junkfilter. This ensures we'll throw away any
+ * output from the last statement in such a function.
+ */
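+ /*
+ * For example (hypothetical), a set-returning function ending in a plain
+ * "SELECT * FROM some_table" can use lazyEval and fetch rows on demand,
+ * whereas an "INSERT ... RETURNING" that supplies the result is always
+ * run to completion first.
+ */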
+ if (lasttages && fcache->junkFilter)
+ {
+ lasttages->setsResult = true;
+ if (lazyEvalOK &&
+ lasttages->stmt->commandType == CMD_SELECT &&
+ !lasttages->stmt->hasModifyingCTE)
+ fcache->lazyEval = lasttages->lazyEval = true;
+ }
+
+ return eslist;
+}
+
+/*
+ * Initialize the SQLFunctionCache for a SQL function
+ */
+static void
+init_sql_fcache(FunctionCallInfo fcinfo, Oid collation, bool lazyEvalOK)
+{
+ FmgrInfo *finfo = fcinfo->flinfo;
+ Oid foid = finfo->fn_oid;
+ MemoryContext fcontext;
+ MemoryContext oldcontext;
+ Oid rettype;
+ TupleDesc rettupdesc;
+ HeapTuple procedureTuple;
+ Form_pg_proc procedureStruct;
+ SQLFunctionCachePtr fcache;
+ List *queryTree_list;
+ List *resulttlist;
+ ListCell *lc;
+ Datum tmp;
+ bool isNull;
+
+ /*
+ * Create memory context that holds all the SQLFunctionCache data. It
+ * must be a child of whatever context holds the FmgrInfo.
+ */
+ fcontext = AllocSetContextCreate(finfo->fn_mcxt,
+ "SQL function",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldcontext = MemoryContextSwitchTo(fcontext);
+
+ /*
+ * Create the struct proper, link it to fcontext and fn_extra. Once this
+ * is done, we'll be able to recover the memory after failure, even if the
+ * FmgrInfo is long-lived.
+ */
+ fcache = (SQLFunctionCachePtr) palloc0(sizeof(SQLFunctionCache));
+ fcache->fcontext = fcontext;
+ finfo->fn_extra = (void *) fcache;
+
+ /*
+ * get the procedure tuple corresponding to the given function Oid
+ */
+ procedureTuple = SearchSysCache1(PROCOID, ObjectIdGetDatum(foid));
+ if (!HeapTupleIsValid(procedureTuple))
+ elog(ERROR, "cache lookup failed for function %u", foid);
+ procedureStruct = (Form_pg_proc) GETSTRUCT(procedureTuple);
+
+ /*
+ * copy function name immediately for use by error reporting callback, and
+ * for use as memory context identifier
+ */
+ fcache->fname = pstrdup(NameStr(procedureStruct->proname));
+ MemoryContextSetIdentifier(fcontext, fcache->fname);
+
+ /*
+ * Resolve any polymorphism, obtaining the actual result type, and the
+ * corresponding tupdesc if it's a rowtype.
+ */
+ (void) get_call_result_type(fcinfo, &rettype, &rettupdesc);
+
+ fcache->rettype = rettype;
+
+ /* Fetch the typlen and byval info for the result type */
+ get_typlenbyval(rettype, &fcache->typlen, &fcache->typbyval);
+
+ /* Remember whether we're returning setof something */
+ fcache->returnsSet = procedureStruct->proretset;
+
+ /* Remember if function is STABLE/IMMUTABLE */
+ fcache->readonly_func =
+ (procedureStruct->provolatile != PROVOLATILE_VOLATILE);
+
+ /*
+ * We need the actual argument types to pass to the parser. Also make
+ * sure that parameter symbols are considered to have the function's
+ * resolved input collation.
+ */
+ fcache->pinfo = prepare_sql_fn_parse_info(procedureTuple,
+ finfo->fn_expr,
+ collation);
+
+ /*
+ * And of course we need the function body text.
+ */
+ tmp = SysCacheGetAttr(PROCOID,
+ procedureTuple,
+ Anum_pg_proc_prosrc,
+ &isNull);
+ if (isNull)
+ elog(ERROR, "null prosrc for function %u", foid);
+ fcache->src = TextDatumGetCString(tmp);
+
+ /* If we have prosqlbody, pay attention to that, not prosrc. */
+ tmp = SysCacheGetAttr(PROCOID,
+ procedureTuple,
+ Anum_pg_proc_prosqlbody,
+ &isNull);
+
+ /*
+ * Parse and rewrite the queries in the function text. Use sublists to
+ * keep track of the original query boundaries.
+ *
+ * Note: since parsing and planning is done in fcontext, we will generate
+ * a lot of cruft that lives as long as the fcache does. This is annoying
+ * but we'll not worry about it until the module is rewritten to use
+ * plancache.c.
+ */
+ queryTree_list = NIL;
+ if (!isNull)
+ {
+ Node *n;
+ List *stored_query_list;
+
+ n = stringToNode(TextDatumGetCString(tmp));
+ if (IsA(n, List))
+ stored_query_list = linitial_node(List, castNode(List, n));
+ else
+ stored_query_list = list_make1(n);
+
+ foreach(lc, stored_query_list)
+ {
+ Query *parsetree = lfirst_node(Query, lc);
+ List *queryTree_sublist;
+
+ AcquireRewriteLocks(parsetree, true, false);
+ queryTree_sublist = pg_rewrite_query(parsetree);
+ queryTree_list = lappend(queryTree_list, queryTree_sublist);
+ }
+ }
+ else
+ {
+ List *raw_parsetree_list;
+
+ raw_parsetree_list = pg_parse_query(fcache->src);
+
+ foreach(lc, raw_parsetree_list)
+ {
+ RawStmt *parsetree = lfirst_node(RawStmt, lc);
+ List *queryTree_sublist;
+
+ queryTree_sublist = pg_analyze_and_rewrite_params(parsetree,
+ fcache->src,
+ (ParserSetupHook) sql_fn_parser_setup,
+ fcache->pinfo,
+ NULL);
+ queryTree_list = lappend(queryTree_list, queryTree_sublist);
+ }
+ }
+
+ /*
+ * Check that there are no statements we don't want to allow.
+ */
+ check_sql_fn_statements(queryTree_list);
+
+ /*
+ * Check that the function returns the type it claims to. Although in
+ * simple cases this was already done when the function was defined, we
+ * have to recheck because database objects used in the function's queries
+ * might have changed type. We'd have to recheck anyway if the function
+ * had any polymorphic arguments. Moreover, check_sql_fn_retval takes
+ * care of injecting any required column type coercions. (But we don't
+ * ask it to insert nulls for dropped columns; the junkfilter handles
+ * that.)
+ *
+ * Note: we set fcache->returnsTuple according to whether we are returning
+ * the whole tuple result or just a single column. In the latter case we
+ * clear returnsTuple because we need not act differently from the scalar
+ * result case, even if it's a rowtype column. (However, we have to force
+ * lazy eval mode in that case; otherwise we'd need extra code to expand
+ * the rowtype column into multiple columns, since we have no way to
+ * notify the caller that it should do that.)
+ */
+ fcache->returnsTuple = check_sql_fn_retval(queryTree_list,
+ rettype,
+ rettupdesc,
+ false,
+ &resulttlist);
+
+ /*
+ * Construct a JunkFilter we can use to coerce the returned rowtype to the
+ * desired form, unless the result type is VOID, in which case there's
+ * nothing to coerce to. (XXX Frequently, the JunkFilter isn't doing
+ * anything very interesting, but much of this module expects it to be
+ * there anyway.)
+ */
+ if (rettype != VOIDOID)
+ {
+ TupleTableSlot *slot = MakeSingleTupleTableSlot(NULL,
+ &TTSOpsMinimalTuple);
+
+ /*
+ * If the result is composite, *and* we are returning the whole tuple
+ * result, we need to insert nulls for any dropped columns. In the
+ * single-column-result case, there might be dropped columns within
+ * the composite column value, but it's not our problem here. There
+ * should be no resjunk entries in resulttlist, so in the second case
+ * the JunkFilter is certainly a no-op.
+ */
+ if (rettupdesc && fcache->returnsTuple)
+ fcache->junkFilter = ExecInitJunkFilterConversion(resulttlist,
+ rettupdesc,
+ slot);
+ else
+ fcache->junkFilter = ExecInitJunkFilter(resulttlist, slot);
+ }
+
+ if (fcache->returnsTuple)
+ {
+ /* Make sure output rowtype is properly blessed */
+ BlessTupleDesc(fcache->junkFilter->jf_resultSlot->tts_tupleDescriptor);
+ }
+ else if (fcache->returnsSet && type_is_rowtype(fcache->rettype))
+ {
+ /*
+ * Returning rowtype as if it were scalar --- materialize won't work.
+ * Right now it's sufficient to override any caller preference for
+ * materialize mode, but to add more smarts in init_execution_state
+ * about this, we'd probably need a three-way flag instead of bool.
+ */
+ lazyEvalOK = true;
+ }
+
+ /* Finally, plan the queries */
+ fcache->func_state = init_execution_state(queryTree_list,
+ fcache,
+ lazyEvalOK);
+
+ /* Mark fcache with time of creation to show it's valid */
+ fcache->lxid = MyProc->lxid;
+ fcache->subxid = GetCurrentSubTransactionId();
+
+ ReleaseSysCache(procedureTuple);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/* Start up execution of one execution_state node */
+static void
+postquel_start(execution_state *es, SQLFunctionCachePtr fcache)
+{
+ DestReceiver *dest;
+
+ Assert(es->qd == NULL);
+
+ /* Caller should have ensured a suitable snapshot is active */
+ Assert(ActiveSnapshotSet());
+
+ /*
+ * If this query produces the function result, send its output to the
+ * tuplestore; else discard any output.
+ */
+ if (es->setsResult)
+ {
+ DR_sqlfunction *myState;
+
+ dest = CreateDestReceiver(DestSQLFunction);
+ /* pass down the needed info to the dest receiver routines */
+ myState = (DR_sqlfunction *) dest;
+ Assert(myState->pub.mydest == DestSQLFunction);
+ myState->tstore = fcache->tstore;
+ myState->cxt = CurrentMemoryContext;
+ myState->filter = fcache->junkFilter;
+ }
+ else
+ dest = None_Receiver;
+
+ es->qd = CreateQueryDesc(es->stmt,
+ fcache->src,
+ GetActiveSnapshot(),
+ InvalidSnapshot,
+ dest,
+ fcache->paramLI,
+ es->qd ? es->qd->queryEnv : NULL,
+ 0);
+
+ /* Utility commands don't need Executor. */
+ if (es->qd->operation != CMD_UTILITY)
+ {
+ /*
+ * In lazyEval mode, do not let the executor set up an AfterTrigger
+ * context. This is necessary, not just an optimization, because we
+ * mustn't exit from the function execution with a stacked
+ * AfterTrigger level still active. We are careful not to select
+ * lazyEval mode for any statement that could possibly queue triggers.
+ */
+ int eflags;
+
+ if (es->lazyEval)
+ eflags = EXEC_FLAG_SKIP_TRIGGERS;
+ else
+ eflags = 0; /* default run-to-completion flags */
+ ExecutorStart(es->qd, eflags);
+ }
+
+ es->status = F_EXEC_RUN;
+}
+
+/* Run one execution_state; either to completion or to first result row */
+/* Returns true if we ran to completion */
+static bool
+postquel_getnext(execution_state *es, SQLFunctionCachePtr fcache)
+{
+ bool result;
+
+ if (es->qd->operation == CMD_UTILITY)
+ {
+ ProcessUtility(es->qd->plannedstmt,
+ fcache->src,
+ false,
+ PROCESS_UTILITY_QUERY,
+ es->qd->params,
+ es->qd->queryEnv,
+ es->qd->dest,
+ NULL);
+ result = true; /* never stops early */
+ }
+ else
+ {
+ /* Run regular commands to completion unless lazyEval */
+ uint64 count = (es->lazyEval) ? 1 : 0;
+
+ ExecutorRun(es->qd, ForwardScanDirection, count, !fcache->returnsSet || !es->lazyEval);
+
+ /*
+ * If we requested run to completion OR there was no tuple returned,
+ * command must be complete.
+ */
+ result = (count == 0 || es->qd->estate->es_processed == 0);
+ }
+
+ return result;
+}
+
+/* Shut down execution of one execution_state node */
+static void
+postquel_end(execution_state *es)
+{
+ /* mark status done to ensure we don't do ExecutorEnd twice */
+ es->status = F_EXEC_DONE;
+
+ /* Utility commands don't need Executor. */
+ if (es->qd->operation != CMD_UTILITY)
+ {
+ ExecutorFinish(es->qd);
+ ExecutorEnd(es->qd);
+ }
+
+ es->qd->dest->rDestroy(es->qd->dest);
+
+ FreeQueryDesc(es->qd);
+ es->qd = NULL;
+}
+
+/* Build ParamListInfo array representing current arguments */
+static void
+postquel_sub_params(SQLFunctionCachePtr fcache,
+ FunctionCallInfo fcinfo)
+{
+ int nargs = fcinfo->nargs;
+
+ if (nargs > 0)
+ {
+ ParamListInfo paramLI;
+
+ if (fcache->paramLI == NULL)
+ {
+ paramLI = makeParamList(nargs);
+ fcache->paramLI = paramLI;
+ }
+ else
+ {
+ paramLI = fcache->paramLI;
+ Assert(paramLI->numParams == nargs);
+ }
+
+ for (int i = 0; i < nargs; i++)
+ {
+ ParamExternData *prm = &paramLI->params[i];
+
+ prm->value = fcinfo->args[i].value;
+ prm->isnull = fcinfo->args[i].isnull;
+ prm->pflags = 0;
+ prm->ptype = fcache->pinfo->argtypes[i];
+ }
+ }
+ else
+ fcache->paramLI = NULL;
+}
+
+/*
+ * Extract the SQL function's value from a single result row. This is used
+ * both for scalar (non-set) functions and for each row of a lazy-eval set
+ * result.
+ */
+static Datum
+postquel_get_single_result(TupleTableSlot *slot,
+ FunctionCallInfo fcinfo,
+ SQLFunctionCachePtr fcache,
+ MemoryContext resultcontext)
+{
+ Datum value;
+ MemoryContext oldcontext;
+
+ /*
+ * Set up to return the function value. For pass-by-reference datatypes,
+ * be sure to allocate the result in resultcontext, not the current memory
+ * context (which has query lifespan). We can't leave the data in the
+ * TupleTableSlot because we intend to clear the slot before returning.
+ */
+ oldcontext = MemoryContextSwitchTo(resultcontext);
+
+ if (fcache->returnsTuple)
+ {
+ /* We must return the whole tuple as a Datum. */
+ fcinfo->isnull = false;
+ value = ExecFetchSlotHeapTupleDatum(slot);
+ }
+ else
+ {
+ /*
+ * Returning a scalar, which we have to extract from the first column
+ * of the SELECT result, and then copy into result context if needed.
+ */
+ value = slot_getattr(slot, 1, &(fcinfo->isnull));
+
+ if (!fcinfo->isnull)
+ value = datumCopy(value, fcache->typbyval, fcache->typlen);
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return value;
+}
+
+/*
+ * fmgr_sql: function call manager for SQL functions
+ */
+Datum
+fmgr_sql(PG_FUNCTION_ARGS)
+{
+ SQLFunctionCachePtr fcache;
+ ErrorContextCallback sqlerrcontext;
+ MemoryContext oldcontext;
+ bool randomAccess;
+ bool lazyEvalOK;
+ bool is_first;
+ bool pushed_snapshot;
+ execution_state *es;
+ TupleTableSlot *slot;
+ Datum result;
+ List *eslist;
+ ListCell *eslc;
+
+ /*
+ * Setup error traceback support for ereport()
+ */
+ sqlerrcontext.callback = sql_exec_error_callback;
+ sqlerrcontext.arg = fcinfo->flinfo;
+ sqlerrcontext.previous = error_context_stack;
+ error_context_stack = &sqlerrcontext;
+
+ /* Check call context */
+ if (fcinfo->flinfo->fn_retset)
+ {
+ ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
+
+ /*
+ * For simplicity, we require callers to support both set eval modes.
+ * There are cases where we must use one or must use the other, and
+ * it's not really worthwhile to postpone the check till we know. But
+ * note we do not require caller to provide an expectedDesc.
+ */
+ if (!rsi || !IsA(rsi, ReturnSetInfo) ||
+ (rsi->allowedModes & SFRM_ValuePerCall) == 0 ||
+ (rsi->allowedModes & SFRM_Materialize) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ randomAccess = rsi->allowedModes & SFRM_Materialize_Random;
+ lazyEvalOK = !(rsi->allowedModes & SFRM_Materialize_Preferred);
+ }
+ else
+ {
+ randomAccess = false;
+ lazyEvalOK = true;
+ }
+
+ /*
+ * Initialize fcache (build plans) if first time through; or re-initialize
+ * if the cache is stale.
+ */
+ fcache = (SQLFunctionCachePtr) fcinfo->flinfo->fn_extra;
+
+ if (fcache != NULL)
+ {
+ if (fcache->lxid != MyProc->lxid ||
+ !SubTransactionIsActive(fcache->subxid))
+ {
+ /* It's stale; unlink and delete */
+ fcinfo->flinfo->fn_extra = NULL;
+ MemoryContextDelete(fcache->fcontext);
+ fcache = NULL;
+ }
+ }
+
+ if (fcache == NULL)
+ {
+ init_sql_fcache(fcinfo, PG_GET_COLLATION(), lazyEvalOK);
+ fcache = (SQLFunctionCachePtr) fcinfo->flinfo->fn_extra;
+ }
+
+ /*
+ * Switch to context in which the fcache lives. This ensures that our
+ * tuplestore etc will have sufficient lifetime. The sub-executor is
+ * responsible for deleting per-tuple information. (XXX in the case of a
+ * long-lived FmgrInfo, this policy represents more memory leakage, but
+ * it's not entirely clear where to keep stuff instead.)
+ */
+ oldcontext = MemoryContextSwitchTo(fcache->fcontext);
+
+ /*
+ * Find first unfinished query in function, and note whether it's the
+ * first query.
+ */
+ eslist = fcache->func_state;
+ es = NULL;
+ is_first = true;
+ foreach(eslc, eslist)
+ {
+ es = (execution_state *) lfirst(eslc);
+
+ while (es && es->status == F_EXEC_DONE)
+ {
+ is_first = false;
+ es = es->next;
+ }
+
+ if (es)
+ break;
+ }
+
+ /*
+ * Convert params to appropriate format if starting a fresh execution. (If
+ * continuing execution, we can re-use prior params.)
+ */
+ if (is_first && es && es->status == F_EXEC_START)
+ postquel_sub_params(fcache, fcinfo);
+
+ /*
+ * Build tuplestore to hold results, if we don't have one already. Note
+ * it's in the query-lifespan context.
+ */
+ if (!fcache->tstore)
+ fcache->tstore = tuplestore_begin_heap(randomAccess, false, work_mem);
+
+ /*
+ * Execute each command in the function one after another until we either
+ * run out of commands or get a result row from a lazily-evaluated SELECT.
+ *
+ * Notes about snapshot management:
+ *
+ * In a read-only function, we just use the surrounding query's snapshot.
+ *
+ * In a non-read-only function, we rely on the fact that we'll never
+ * suspend execution between queries of the function: the only reason to
+ * suspend execution before completion is if we are returning a row from a
+ * lazily-evaluated SELECT. So, when first entering this loop, we'll
+ * either start a new query (and push a fresh snapshot) or re-establish
+ * the active snapshot from the existing query descriptor. If we need to
+ * start a new query in a subsequent execution of the loop, either we need
+ * a fresh snapshot (and pushed_snapshot is false) or the existing
+ * snapshot is on the active stack and we can just bump its command ID.
+ */
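+ /*
+ * For instance (hypothetical), in a volatile function whose body is
+ * "INSERT INTO t VALUES (1); SELECT count(*) FROM t;", the command
+ * counter increment taken before starting the SELECT makes the freshly
+ * inserted row visible to it.
+ */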
+ pushed_snapshot = false;
+ while (es)
+ {
+ bool completed;
+
+ if (es->status == F_EXEC_START)
+ {
+ /*
+ * If not read-only, be sure to advance the command counter for
+ * each command, so that all work to date in this transaction is
+ * visible. Take a new snapshot if we don't have one yet,
+ * otherwise just bump the command ID in the existing snapshot.
+ */
+ if (!fcache->readonly_func)
+ {
+ CommandCounterIncrement();
+ if (!pushed_snapshot)
+ {
+ PushActiveSnapshot(GetTransactionSnapshot());
+ pushed_snapshot = true;
+ }
+ else
+ UpdateActiveSnapshotCommandId();
+ }
+
+ postquel_start(es, fcache);
+ }
+ else if (!fcache->readonly_func && !pushed_snapshot)
+ {
+ /* Re-establish active snapshot when re-entering function */
+ PushActiveSnapshot(es->qd->snapshot);
+ pushed_snapshot = true;
+ }
+
+ completed = postquel_getnext(es, fcache);
+
+ /*
+ * If we ran the command to completion, we can shut it down now. Any
+ * row(s) we need to return are safely stashed in the tuplestore, and
+ * we want to be sure that, for example, AFTER triggers get fired
+ * before we return anything. Also, if the function doesn't return
+ * set, we can shut it down anyway because it must be a SELECT and we
+ * don't care about fetching any more result rows.
+ */
+ if (completed || !fcache->returnsSet)
+ postquel_end(es);
+
+ /*
+ * Break from loop if we didn't shut down (implying we got a
+ * lazily-evaluated row). Otherwise we'll press on till the whole
+ * function is done, relying on the tuplestore to keep hold of the
+ * data to eventually be returned. This is necessary since an
+ * INSERT/UPDATE/DELETE RETURNING that sets the result might be
+ * followed by additional rule-inserted commands, and we want to
+ * finish doing all those commands before we return anything.
+ */
+ if (es->status != F_EXEC_DONE)
+ break;
+
+ /*
+ * Advance to next execution_state, which might be in the next list.
+ */
+ es = es->next;
+ while (!es)
+ {
+ eslc = lnext(eslist, eslc);
+ if (!eslc)
+ break; /* end of function */
+
+ es = (execution_state *) lfirst(eslc);
+
+ /*
+ * Flush the current snapshot so that we will take a new one for
+ * the new query list. This ensures that new snapshots are taken at
+ * original-query boundaries, matching the behavior of interactive
+ * execution.
+ */
+ if (pushed_snapshot)
+ {
+ PopActiveSnapshot();
+ pushed_snapshot = false;
+ }
+ }
+ }
+
+ /*
+ * The tuplestore now contains whatever row(s) we are supposed to return.
+ */
+ if (fcache->returnsSet)
+ {
+ ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
+
+ if (es)
+ {
+ /*
+ * If we stopped short of being done, we must have a lazy-eval
+ * row.
+ */
+ Assert(es->lazyEval);
+ /* Re-use the junkfilter's output slot to fetch back the tuple */
+ Assert(fcache->junkFilter);
+ slot = fcache->junkFilter->jf_resultSlot;
+ if (!tuplestore_gettupleslot(fcache->tstore, true, false, slot))
+ elog(ERROR, "failed to fetch lazy-eval tuple");
+ /* Extract the result as a datum, and copy out from the slot */
+ result = postquel_get_single_result(slot, fcinfo,
+ fcache, oldcontext);
+ /* Clear the tuplestore, but keep it for next time */
+ /* NB: this might delete the slot's content, but we don't care */
+ tuplestore_clear(fcache->tstore);
+
+ /*
+ * Let caller know we're not finished.
+ */
+ rsi->isDone = ExprMultipleResult;
+
+ /*
+ * Ensure we will get shut down cleanly if the exprcontext is not
+ * run to completion.
+ */
+ if (!fcache->shutdown_reg)
+ {
+ RegisterExprContextCallback(rsi->econtext,
+ ShutdownSQLFunction,
+ PointerGetDatum(fcache));
+ fcache->shutdown_reg = true;
+ }
+ }
+ else if (fcache->lazyEval)
+ {
+ /*
+ * We are done with a lazy evaluation. Clean up.
+ */
+ tuplestore_clear(fcache->tstore);
+
+ /*
+ * Let caller know we're finished.
+ */
+ rsi->isDone = ExprEndResult;
+
+ fcinfo->isnull = true;
+ result = (Datum) 0;
+
+ /* Deregister shutdown callback, if we made one */
+ if (fcache->shutdown_reg)
+ {
+ UnregisterExprContextCallback(rsi->econtext,
+ ShutdownSQLFunction,
+ PointerGetDatum(fcache));
+ fcache->shutdown_reg = false;
+ }
+ }
+ else
+ {
+ /*
+ * We are done with a non-lazy evaluation. Return whatever is in
+ * the tuplestore. (It is now caller's responsibility to free the
+ * tuplestore when done.)
+ */
+ rsi->returnMode = SFRM_Materialize;
+ rsi->setResult = fcache->tstore;
+ fcache->tstore = NULL;
+ /* must copy desc because execSRF.c will free it */
+ if (fcache->junkFilter)
+ rsi->setDesc = CreateTupleDescCopy(fcache->junkFilter->jf_cleanTupType);
+
+ fcinfo->isnull = true;
+ result = (Datum) 0;
+
+ /* Deregister shutdown callback, if we made one */
+ if (fcache->shutdown_reg)
+ {
+ UnregisterExprContextCallback(rsi->econtext,
+ ShutdownSQLFunction,
+ PointerGetDatum(fcache));
+ fcache->shutdown_reg = false;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * Non-set function. If we got a row, return it; else return NULL.
+ */
+ if (fcache->junkFilter)
+ {
+ /* Re-use the junkfilter's output slot to fetch back the tuple */
+ slot = fcache->junkFilter->jf_resultSlot;
+ if (tuplestore_gettupleslot(fcache->tstore, true, false, slot))
+ result = postquel_get_single_result(slot, fcinfo,
+ fcache, oldcontext);
+ else
+ {
+ fcinfo->isnull = true;
+ result = (Datum) 0;
+ }
+ }
+ else
+ {
+ /* Should only get here for VOID functions and procedures */
+ Assert(fcache->rettype == VOIDOID);
+ fcinfo->isnull = true;
+ result = (Datum) 0;
+ }
+
+ /* Clear the tuplestore, but keep it for next time */
+ tuplestore_clear(fcache->tstore);
+ }
+
+ /* Pop snapshot if we have pushed one */
+ if (pushed_snapshot)
+ PopActiveSnapshot();
+
+ /*
+ * If we've gone through every command in the function, we are done. Reset
+ * the execution states to start over again on next call.
+ */
+ if (es == NULL)
+ {
+ foreach(eslc, fcache->func_state)
+ {
+ es = (execution_state *) lfirst(eslc);
+ while (es)
+ {
+ es->status = F_EXEC_START;
+ es = es->next;
+ }
+ }
+ }
+
+ error_context_stack = sqlerrcontext.previous;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return result;
+}
+
+
+/*
+ * error context callback to let us supply a call-stack traceback
+ */
+static void
+sql_exec_error_callback(void *arg)
+{
+ FmgrInfo *flinfo = (FmgrInfo *) arg;
+ SQLFunctionCachePtr fcache = (SQLFunctionCachePtr) flinfo->fn_extra;
+ int syntaxerrposition;
+
+ /*
+ * We can do nothing useful if init_sql_fcache() didn't get as far as
+ * saving the function name
+ */
+ if (fcache == NULL || fcache->fname == NULL)
+ return;
+
+ /*
+ * If there is a syntax error position, convert to internal syntax error
+ */
+ syntaxerrposition = geterrposition();
+ if (syntaxerrposition > 0 && fcache->src != NULL)
+ {
+ errposition(0);
+ internalerrposition(syntaxerrposition);
+ internalerrquery(fcache->src);
+ }
+
+ /*
+ * Try to determine where in the function we failed. If there is a query
+ * with non-null QueryDesc, finger it. (We check this rather than looking
+ * for F_EXEC_RUN state, so that errors during ExecutorStart or
+ * ExecutorEnd are blamed on the appropriate query; see postquel_start and
+ * postquel_end.)
+ */
+ if (fcache->func_state)
+ {
+ execution_state *es;
+ int query_num;
+ ListCell *lc;
+
+ es = NULL;
+ query_num = 1;
+ foreach(lc, fcache->func_state)
+ {
+ es = (execution_state *) lfirst(lc);
+ while (es)
+ {
+ if (es->qd)
+ {
+ errcontext("SQL function \"%s\" statement %d",
+ fcache->fname, query_num);
+ break;
+ }
+ es = es->next;
+ }
+ if (es)
+ break;
+ query_num++;
+ }
+ if (es == NULL)
+ {
+ /*
+ * couldn't identify a running query; might be function entry,
+ * function exit, or between queries.
+ */
+ errcontext("SQL function \"%s\"", fcache->fname);
+ }
+ }
+ else
+ {
+ /*
+ * Assume we failed during init_sql_fcache(). (It's possible that the
+ * function actually has an empty body, but in that case we may as
+ * well report all errors as being "during startup".)
+ */
+ errcontext("SQL function \"%s\" during startup", fcache->fname);
+ }
+}
+
+
+/*
+ * callback function in case a function-returning-set needs to be shut down
+ * before it has been run to completion
+ */
+static void
+ShutdownSQLFunction(Datum arg)
+{
+ SQLFunctionCachePtr fcache = (SQLFunctionCachePtr) DatumGetPointer(arg);
+ execution_state *es;
+ ListCell *lc;
+
+ foreach(lc, fcache->func_state)
+ {
+ es = (execution_state *) lfirst(lc);
+ while (es)
+ {
+ /* Shut down anything still running */
+ if (es->status == F_EXEC_RUN)
+ {
+ /* Re-establish active snapshot for any called functions */
+ if (!fcache->readonly_func)
+ PushActiveSnapshot(es->qd->snapshot);
+
+ postquel_end(es);
+
+ if (!fcache->readonly_func)
+ PopActiveSnapshot();
+ }
+
+ /* Reset states to START in case we're called again */
+ es->status = F_EXEC_START;
+ es = es->next;
+ }
+ }
+
+ /* Release tuplestore if we have one */
+ if (fcache->tstore)
+ tuplestore_end(fcache->tstore);
+ fcache->tstore = NULL;
+
+ /* execUtils will deregister the callback... */
+ fcache->shutdown_reg = false;
+}
+
+/*
+ * check_sql_fn_statements
+ *
+ * Check statements in an SQL function. Error out if there is anything that
+ * is not acceptable.
+ */
+void
+check_sql_fn_statements(List *queryTreeLists)
+{
+ ListCell *lc;
+
+ /* We are given a list of sublists of Queries */
+ foreach(lc, queryTreeLists)
+ {
+ List *sublist = lfirst_node(List, lc);
+ ListCell *lc2;
+
+ foreach(lc2, sublist)
+ {
+ Query *query = lfirst_node(Query, lc2);
+
+ /*
+ * Disallow calling procedures with output arguments. The current
+ * implementation would just throw the output values away, unless
+ * the statement is the last one. Per SQL standard, we should
+ * assign the output values by name. By disallowing this here, we
+ * preserve an opportunity for future improvement.
+ */
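+ /*
+ * For example (hypothetical), a SQL-language function whose body
+ * contains "CALL p(...)" where p is declared with OUT or INOUT
+ * parameters is rejected here.
+ */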
+ if (query->commandType == CMD_UTILITY &&
+ IsA(query->utilityStmt, CallStmt))
+ {
+ CallStmt *stmt = (CallStmt *) query->utilityStmt;
+
+ if (stmt->outargs != NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("calling procedures with output arguments is not supported in SQL functions")));
+ }
+ }
+ }
+}
+
+/*
+ * check_sql_fn_retval()
+ * Check return value of a list of lists of sql parse trees.
+ *
+ * The return value of a sql function is the value returned by the last
+ * canSetTag query in the function. We do some ad-hoc type checking and
+ * coercion here to ensure that the function returns what it's supposed to.
+ * Note that we may actually modify the last query to make it match!
+ *
+ * This function returns true if the sql function returns the entire tuple
+ * result of its final statement, or false if it returns just the first column
+ * result of that statement. It throws an error if the final statement doesn't
+ * return the right type at all.
+ *
+ * Note that because we allow "SELECT rowtype_expression", the result can be
+ * false even when the declared function return type is a rowtype.
+ *
+ * For a polymorphic function the passed rettype must be the actual resolved
+ * output type of the function. (This means we can't check the type during
+ * function definition of a polymorphic function.) If we do see a polymorphic
+ * rettype we'll throw an error, saying it is not a supported rettype.
+ *
+ * If the function returns composite, the passed rettupdesc should describe
+ * the expected output. If rettupdesc is NULL, we can't verify that the
+ * output matches; that should only happen in fmgr_sql_validator(), or when
+ * the function returns RECORD and the caller doesn't actually care which
+ * composite type it is.
+ *
+ * (Typically, rettype and rettupdesc are computed by get_call_result_type
+ * or a sibling function.)
+ *
+ * In addition to coercing individual output columns, we can modify the
+ * output to include dummy NULL columns for any dropped columns appearing
+ * in rettupdesc. This is done only if the caller asks for it.
+ *
+ * If resultTargetList isn't NULL, then *resultTargetList is set to the
+ * targetlist that defines the final statement's result. Exception: if the
+ * function is defined to return VOID then *resultTargetList is set to NIL.
+ */
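+/*
+ * A hypothetical illustration: for a function declared RETURNS foo (a table
+ * or composite type), a final "SELECT * FROM foo ..." makes this return
+ * true (whole-tuple result), while "SELECT f FROM foo f ..." returns the
+ * rowtype as a single column expression and makes this return false.
+ */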
+bool
+check_sql_fn_retval(List *queryTreeLists,
+ Oid rettype, TupleDesc rettupdesc,
+ bool insertDroppedCols,
+ List **resultTargetList)
+{
+ bool is_tuple_result = false;
+ Query *parse;
+ ListCell *parse_cell;
+ List *tlist;
+ int tlistlen;
+ bool tlist_is_modifiable;
+ char fn_typtype;
+ List *upper_tlist = NIL;
+ bool upper_tlist_nontrivial = false;
+ ListCell *lc;
+
+ if (resultTargetList)
+ *resultTargetList = NIL; /* initialize in case of VOID result */
+
+ /*
+ * If it's declared to return VOID, we don't care what's in the function.
+ * (This takes care of the procedure case, as well.)
+ */
+ if (rettype == VOIDOID)
+ return false;
+
+ /*
+ * Find the last canSetTag query in the function body (which is presented
+ * to us as a list of sublists of Query nodes). This isn't necessarily
+ * the last parsetree, because rule rewriting can insert queries after
+ * what the user wrote. Note that it might not even be in the last
+ * sublist, for example if the last query rewrites to DO INSTEAD NOTHING.
+ * (It might not be unreasonable to throw an error in such a case, but
+ * this is the historical behavior and it doesn't seem worth changing.)
+ */
+ parse = NULL;
+ parse_cell = NULL;
+ foreach(lc, queryTreeLists)
+ {
+ List *sublist = lfirst_node(List, lc);
+ ListCell *lc2;
+
+ foreach(lc2, sublist)
+ {
+ Query *q = lfirst_node(Query, lc2);
+
+ if (q->canSetTag)
+ {
+ parse = q;
+ parse_cell = lc2;
+ }
+ }
+ }
+
+ /*
+ * If it's a plain SELECT, it returns whatever the targetlist says.
+ * Otherwise, if it's INSERT/UPDATE/DELETE with RETURNING, it returns
+ * that. Otherwise, the function return type must be VOID.
+ *
+ * Note: eventually replace this test with QueryReturnsTuples? We'd need
+ * a more general method of determining the output type, though. Also, it
+ * seems too dangerous to consider FETCH or EXECUTE as returning a
+ * determinable rowtype, since they depend on relatively short-lived
+ * entities.
+ */
+ if (parse &&
+ parse->commandType == CMD_SELECT)
+ {
+ tlist = parse->targetList;
+ /* tlist is modifiable unless it's a dummy in a setop query */
+ tlist_is_modifiable = (parse->setOperations == NULL);
+ }
+ else if (parse &&
+ (parse->commandType == CMD_INSERT ||
+ parse->commandType == CMD_UPDATE ||
+ parse->commandType == CMD_DELETE) &&
+ parse->returningList)
+ {
+ tlist = parse->returningList;
+ /* returningList can always be modified */
+ tlist_is_modifiable = true;
+ }
+ else
+ {
+ /* Empty function body, or last statement is a utility command */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("return type mismatch in function declared to return %s",
+ format_type_be(rettype)),
+ errdetail("Function's final statement must be SELECT or INSERT/UPDATE/DELETE RETURNING.")));
+ return false; /* keep compiler quiet */
+ }
+
+ /*
+ * OK, check that the targetlist returns something matching the declared
+ * type, and modify it if necessary. If possible, we insert any coercion
+ * steps right into the final statement's targetlist. However, that might
+ * risk changes in the statement's semantics --- we can't safely change
+ * the output type of a grouping column, for instance. In such cases we
+ * handle coercions by inserting an extra level of Query that effectively
+ * just does a projection.
+ */
+
+ /*
+ * Count the non-junk entries in the result targetlist.
+ */
+ tlistlen = ExecCleanTargetListLength(tlist);
+
+ fn_typtype = get_typtype(rettype);
+
+ if (fn_typtype == TYPTYPE_BASE ||
+ fn_typtype == TYPTYPE_DOMAIN ||
+ fn_typtype == TYPTYPE_ENUM ||
+ fn_typtype == TYPTYPE_RANGE ||
+ fn_typtype == TYPTYPE_MULTIRANGE)
+ {
+ /*
+ * For scalar-type returns, the target list must have exactly one
+ * non-junk entry, and its type must be coercible to rettype.
+ */
+ TargetEntry *tle;
+
+ if (tlistlen != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("return type mismatch in function declared to return %s",
+ format_type_be(rettype)),
+ errdetail("Final statement must return exactly one column.")));
+
+ /* We assume here that non-junk TLEs must come first in tlists */
+ tle = (TargetEntry *) linitial(tlist);
+ Assert(!tle->resjunk);
+
+ if (!coerce_fn_result_column(tle, rettype, -1,
+ tlist_is_modifiable,
+ &upper_tlist,
+ &upper_tlist_nontrivial))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("return type mismatch in function declared to return %s",
+ format_type_be(rettype)),
+ errdetail("Actual return type is %s.",
+ format_type_be(exprType((Node *) tle->expr)))));
+ }
+ else if (fn_typtype == TYPTYPE_COMPOSITE || rettype == RECORDOID)
+ {
+ /*
+ * Returns a rowtype.
+ *
+ * Note that we will not consider a domain over composite to be a
+ * "rowtype" return type; it goes through the scalar case above. This
+ * is because we only provide column-by-column implicit casting, and
+ * will not cast the complete record result. So the only way to
+ * produce a domain-over-composite result is to compute it as an
+ * explicit single-column result. The single-composite-column code
+ * path just below could handle such cases, but it won't be reached.
+ */
+ int tupnatts; /* physical number of columns in tuple */
+ int tuplogcols; /* # of nondeleted columns in tuple */
+ int colindex; /* physical column index */
+
+ /*
+ * If the target list has one non-junk entry, and that expression has
+ * or can be coerced to the declared return type, take it as the
+ * result. This allows, for example, 'SELECT func2()', where func2
+ * has the same composite return type as the function that's calling
+ * it. This provision creates some ambiguity --- maybe the expression
+ * was meant to be the lone field of the composite result --- but it
+ * works well enough as long as we don't get too enthusiastic about
+ * inventing coercions from scalar to composite types.
+ *
+ * XXX Note that if rettype is RECORD and the expression is of a named
+ * composite type, or vice versa, this coercion will succeed, whether
+ * or not the record type really matches. For the moment we rely on
+ * runtime type checking to catch any discrepancy, but it'd be nice to
+ * do better at parse time.
+ */
+ if (tlistlen == 1)
+ {
+ TargetEntry *tle = (TargetEntry *) linitial(tlist);
+
+ Assert(!tle->resjunk);
+ if (coerce_fn_result_column(tle, rettype, -1,
+ tlist_is_modifiable,
+ &upper_tlist,
+ &upper_tlist_nontrivial))
+ {
+ /* Note that we're NOT setting is_tuple_result */
+ goto tlist_coercion_finished;
+ }
+ }
+
+ /*
+ * If the caller didn't provide an expected tupdesc, we can't do any
+ * further checking. Assume we're returning the whole tuple.
+ */
+ if (rettupdesc == NULL)
+ {
+ /* Return tlist if requested */
+ if (resultTargetList)
+ *resultTargetList = tlist;
+ return true;
+ }
+
+ /*
+ * Verify that the targetlist matches the return tuple type. We scan
+ * the non-resjunk columns, and coerce them if necessary to match the
+ * datatypes of the non-deleted attributes. For deleted attributes,
+ * insert NULL result columns if the caller asked for that.
+ */
+ tupnatts = rettupdesc->natts;
+ tuplogcols = 0; /* we'll count nondeleted cols as we go */
+ colindex = 0;
+
+ foreach(lc, tlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+ Form_pg_attribute attr;
+
+ /* resjunk columns can simply be ignored */
+ if (tle->resjunk)
+ continue;
+
+ do
+ {
+ colindex++;
+ if (colindex > tupnatts)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("return type mismatch in function declared to return %s",
+ format_type_be(rettype)),
+ errdetail("Final statement returns too many columns.")));
+ attr = TupleDescAttr(rettupdesc, colindex - 1);
+ if (attr->attisdropped && insertDroppedCols)
+ {
+ Expr *null_expr;
+
+ /* The type of the null we insert isn't important */
+ null_expr = (Expr *) makeConst(INT4OID,
+ -1,
+ InvalidOid,
+ sizeof(int32),
+ (Datum) 0,
+ true, /* isnull */
+ true /* byval */ );
+ upper_tlist = lappend(upper_tlist,
+ makeTargetEntry(null_expr,
+ list_length(upper_tlist) + 1,
+ NULL,
+ false));
+ upper_tlist_nontrivial = true;
+ }
+ } while (attr->attisdropped);
+ tuplogcols++;
+
+ if (!coerce_fn_result_column(tle,
+ attr->atttypid, attr->atttypmod,
+ tlist_is_modifiable,
+ &upper_tlist,
+ &upper_tlist_nontrivial))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("return type mismatch in function declared to return %s",
+ format_type_be(rettype)),
+ errdetail("Final statement returns %s instead of %s at column %d.",
+ format_type_be(exprType((Node *) tle->expr)),
+ format_type_be(attr->atttypid),
+ tuplogcols)));
+ }
+
+ /* remaining columns in rettupdesc had better all be dropped */
+ for (colindex++; colindex <= tupnatts; colindex++)
+ {
+ if (!TupleDescAttr(rettupdesc, colindex - 1)->attisdropped)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("return type mismatch in function declared to return %s",
+ format_type_be(rettype)),
+ errdetail("Final statement returns too few columns.")));
+ if (insertDroppedCols)
+ {
+ Expr *null_expr;
+
+ /* The type of the null we insert isn't important */
+ null_expr = (Expr *) makeConst(INT4OID,
+ -1,
+ InvalidOid,
+ sizeof(int32),
+ (Datum) 0,
+ true, /* isnull */
+ true /* byval */ );
+ upper_tlist = lappend(upper_tlist,
+ makeTargetEntry(null_expr,
+ list_length(upper_tlist) + 1,
+ NULL,
+ false));
+ upper_tlist_nontrivial = true;
+ }
+ }
+
+ /* Report that we are returning entire tuple result */
+ is_tuple_result = true;
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("return type %s is not supported for SQL functions",
+ format_type_be(rettype))));
+
+tlist_coercion_finished:
+
+ /*
+ * If necessary, modify the final Query by injecting an extra Query level
+ * that just performs a projection. (It'd be dubious to do this to a
+ * non-SELECT query, but we never have to; RETURNING lists can always be
+ * modified in-place.)
+ */
+ if (upper_tlist_nontrivial)
+ {
+ Query *newquery;
+ List *colnames;
+ RangeTblEntry *rte;
+ RangeTblRef *rtr;
+
+ Assert(parse->commandType == CMD_SELECT);
+
+ /* Most of the upper Query struct can be left as zeroes/nulls */
+ newquery = makeNode(Query);
+ newquery->commandType = CMD_SELECT;
+ newquery->querySource = parse->querySource;
+ newquery->canSetTag = true;
+ newquery->targetList = upper_tlist;
+
+ /* We need a moderately realistic colnames list for the subquery RTE */
+ colnames = NIL;
+ foreach(lc, parse->targetList)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+ if (tle->resjunk)
+ continue;
+ colnames = lappend(colnames,
+ makeString(tle->resname ? tle->resname : ""));
+ }
+
+ /* Build a suitable RTE for the subquery */
+ rte = makeNode(RangeTblEntry);
+ rte->rtekind = RTE_SUBQUERY;
+ rte->subquery = parse;
+ rte->eref = rte->alias = makeAlias("*SELECT*", colnames);
+ rte->lateral = false;
+ rte->inh = false;
+ rte->inFromCl = true;
+ newquery->rtable = list_make1(rte);
+
+ rtr = makeNode(RangeTblRef);
+ rtr->rtindex = 1;
+ newquery->jointree = makeFromExpr(list_make1(rtr), NULL);
+
+ /* Replace original query in the correct element of the query list */
+ lfirst(parse_cell) = newquery;
+ }
+
+ /* Return tlist (possibly modified) if requested */
+ if (resultTargetList)
+ *resultTargetList = upper_tlist;
+
+ return is_tuple_result;
+}
+
+/*
+ * Process one function result column for check_sql_fn_retval
+ *
+ * Coerce the output value to the required type/typmod, and add a column
+ * to *upper_tlist for it. Set *upper_tlist_nontrivial to true if we
+ * add an upper tlist item that's not just a Var.
+ *
+ * Returns true if OK, false if could not coerce to required type
+ * (in which case, no changes have been made)
+ */
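+/*
+ * For example (hypothetical), if the declared result column is numeric but
+ * the query produces an integer expression, an assignment-level implicit
+ * cast to numeric is injected; if no assignment cast exists, we return
+ * false and the caller raises a return-type mismatch error.
+ */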
+static bool
+coerce_fn_result_column(TargetEntry *src_tle,
+ Oid res_type,
+ int32 res_typmod,
+ bool tlist_is_modifiable,
+ List **upper_tlist,
+ bool *upper_tlist_nontrivial)
+{
+ TargetEntry *new_tle;
+ Expr *new_tle_expr;
+ Node *cast_result;
+
+ /*
+ * If the TLE has a sortgroupref marking, don't change it, as it probably
+ * is referenced by ORDER BY, DISTINCT, etc, and changing its type would
+ * break query semantics. Otherwise, it's safe to modify in-place unless
+ * the query as a whole has issues with that.
+ */
+ if (tlist_is_modifiable && src_tle->ressortgroupref == 0)
+ {
+ /* OK to modify src_tle in place, if necessary */
+ cast_result = coerce_to_target_type(NULL,
+ (Node *) src_tle->expr,
+ exprType((Node *) src_tle->expr),
+ res_type, res_typmod,
+ COERCION_ASSIGNMENT,
+ COERCE_IMPLICIT_CAST,
+ -1);
+ if (cast_result == NULL)
+ return false;
+ assign_expr_collations(NULL, cast_result);
+ src_tle->expr = (Expr *) cast_result;
+ /* Make a Var referencing the possibly-modified TLE */
+ new_tle_expr = (Expr *) makeVarFromTargetEntry(1, src_tle);
+ }
+ else
+ {
+ /* Any casting must happen in the upper tlist */
+ Var *var = makeVarFromTargetEntry(1, src_tle);
+
+ cast_result = coerce_to_target_type(NULL,
+ (Node *) var,
+ var->vartype,
+ res_type, res_typmod,
+ COERCION_ASSIGNMENT,
+ COERCE_IMPLICIT_CAST,
+ -1);
+ if (cast_result == NULL)
+ return false;
+ assign_expr_collations(NULL, cast_result);
+ /* Did the coercion actually do anything? */
+ if (cast_result != (Node *) var)
+ *upper_tlist_nontrivial = true;
+ new_tle_expr = (Expr *) cast_result;
+ }
+ new_tle = makeTargetEntry(new_tle_expr,
+ list_length(*upper_tlist) + 1,
+ src_tle->resname, false);
+ *upper_tlist = lappend(*upper_tlist, new_tle);
+ return true;
+}
+
+
+/*
+ * CreateSQLFunctionDestReceiver -- create a suitable DestReceiver object
+ */
+DestReceiver *
+CreateSQLFunctionDestReceiver(void)
+{
+ DR_sqlfunction *self = (DR_sqlfunction *) palloc0(sizeof(DR_sqlfunction));
+
+ self->pub.receiveSlot = sqlfunction_receive;
+ self->pub.rStartup = sqlfunction_startup;
+ self->pub.rShutdown = sqlfunction_shutdown;
+ self->pub.rDestroy = sqlfunction_destroy;
+ self->pub.mydest = DestSQLFunction;
+
+ /* private fields will be set by postquel_start */
+
+ return (DestReceiver *) self;
+}
+
+/*
+ * sqlfunction_startup --- executor startup
+ */
+static void
+sqlfunction_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
+{
+ /* no-op */
+}
+
+/*
+ * sqlfunction_receive --- receive one tuple
+ */
+static bool
+sqlfunction_receive(TupleTableSlot *slot, DestReceiver *self)
+{
+ DR_sqlfunction *myState = (DR_sqlfunction *) self;
+
+ /* Filter tuple as needed */
+ slot = ExecFilterJunk(myState->filter, slot);
+
+ /* Store the filtered tuple into the tuplestore */
+ tuplestore_puttupleslot(myState->tstore, slot);
+
+ return true;
+}
+
+/*
+ * sqlfunction_shutdown --- executor end
+ */
+static void
+sqlfunction_shutdown(DestReceiver *self)
+{
+ /* no-op */
+}
+
+/*
+ * sqlfunction_destroy --- release DestReceiver object
+ */
+static void
+sqlfunction_destroy(DestReceiver *self)
+{
+ pfree(self);
+}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
new file mode 100644
index 0000000..2b106d8
--- /dev/null
+++ b/src/backend/executor/instrument.c
@@ -0,0 +1,279 @@
+/*-------------------------------------------------------------------------
+ *
+ * instrument.c
+ * functions for instrumentation of plan execution
+ *
+ *
+ * Copyright (c) 2001-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/executor/instrument.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "executor/instrument.h"
+
+BufferUsage pgBufferUsage;
+static BufferUsage save_pgBufferUsage;
+WalUsage pgWalUsage;
+static WalUsage save_pgWalUsage;
+
+static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
+static void WalUsageAdd(WalUsage *dst, WalUsage *add);
+
+
+/* Allocate new instrumentation structure(s) */
+Instrumentation *
+InstrAlloc(int n, int instrument_options, bool async_mode)
+{
+ Instrumentation *instr;
+
+ /* initialize all fields to zeroes, then modify as needed */
+ instr = palloc0(n * sizeof(Instrumentation));
+ if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL))
+ {
+ bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
+ bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
+ bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
+ int i;
+
+ for (i = 0; i < n; i++)
+ {
+ instr[i].need_bufusage = need_buffers;
+ instr[i].need_walusage = need_wal;
+ instr[i].need_timer = need_timer;
+ instr[i].async_mode = async_mode;
+ }
+ }
+
+ return instr;
+}
+
+/* Initialize a pre-allocated instrumentation structure. */
+void
+InstrInit(Instrumentation *instr, int instrument_options)
+{
+ memset(instr, 0, sizeof(Instrumentation));
+ instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
+ instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
+ instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
+}
+
+/* Entry to a plan node */
+void
+InstrStartNode(Instrumentation *instr)
+{
+ if (instr->need_timer &&
+ !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
+ elog(ERROR, "InstrStartNode called twice in a row");
+
+ /* save buffer usage totals at node entry, if needed */
+ if (instr->need_bufusage)
+ instr->bufusage_start = pgBufferUsage;
+
+ if (instr->need_walusage)
+ instr->walusage_start = pgWalUsage;
+}
+
+/* Exit from a plan node */
+void
+InstrStopNode(Instrumentation *instr, double nTuples)
+{
+ double save_tuplecount = instr->tuplecount;
+ instr_time endtime;
+
+ /* count the returned tuples */
+ instr->tuplecount += nTuples;
+
+ /* let's update the time only if the timer was requested */
+ if (instr->need_timer)
+ {
+ if (INSTR_TIME_IS_ZERO(instr->starttime))
+ elog(ERROR, "InstrStopNode called without start");
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
+
+ INSTR_TIME_SET_ZERO(instr->starttime);
+ }
+
+ /* Add delta of buffer usage since entry to node's totals */
+ if (instr->need_bufusage)
+ BufferUsageAccumDiff(&instr->bufusage,
+ &pgBufferUsage, &instr->bufusage_start);
+
+ if (instr->need_walusage)
+ WalUsageAccumDiff(&instr->walusage,
+ &pgWalUsage, &instr->walusage_start);
+
+ /* Is this the first tuple of this cycle? */
+ if (!instr->running)
+ {
+ instr->running = true;
+ instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
+ }
+ else
+ {
+ /*
+ * In async mode, if the plan node hadn't emitted any tuples before,
+ * this might be the first tuple.
+ */
+ if (instr->async_mode && save_tuplecount < 1.0)
+ instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
+ }
+}
+
+/* Update tuple count */
+void
+InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
+{
+ /* count the returned tuples */
+ instr->tuplecount += nTuples;
+}
+
+/* Finish a run cycle for a plan node */
+void
+InstrEndLoop(Instrumentation *instr)
+{
+ double totaltime;
+
+ /* Skip if nothing has happened, or already shut down */
+ if (!instr->running)
+ return;
+
+ if (!INSTR_TIME_IS_ZERO(instr->starttime))
+ elog(ERROR, "InstrEndLoop called on running node");
+
+ /* Accumulate per-cycle statistics into totals */
+ totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
+
+ instr->startup += instr->firsttuple;
+ instr->total += totaltime;
+ instr->ntuples += instr->tuplecount;
+ instr->nloops += 1;
+
+ /* Reset for next cycle (if any) */
+ instr->running = false;
+ INSTR_TIME_SET_ZERO(instr->starttime);
+ INSTR_TIME_SET_ZERO(instr->counter);
+ instr->firsttuple = 0;
+ instr->tuplecount = 0;
+}
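+
+/*
+ * Sketch of the typical per-node calling pattern for this API (roughly what
+ * the instrumentation wrapper in execProcnode.c does; exact call sites may
+ * differ in detail):
+ *
+ *		node->instrument = InstrAlloc(1, estate->es_instrument, false);
+ *		...
+ *		InstrStartNode(node->instrument);
+ *		slot = node->ExecProcNodeReal(node);
+ *		InstrStopNode(node->instrument, TupIsNull(slot) ? 0.0 : 1.0);
+ *		...
+ *		InstrEndLoop(node->instrument);		-- once per completed cycle,
+ *										   before the totals are read out
+ */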
+
+/* aggregate instrumentation information */
+void
+InstrAggNode(Instrumentation *dst, Instrumentation *add)
+{
+ if (!dst->running && add->running)
+ {
+ dst->running = true;
+ dst->firsttuple = add->firsttuple;
+ }
+ else if (dst->running && add->running && dst->firsttuple > add->firsttuple)
+ dst->firsttuple = add->firsttuple;
+
+ INSTR_TIME_ADD(dst->counter, add->counter);
+
+ dst->tuplecount += add->tuplecount;
+ dst->startup += add->startup;
+ dst->total += add->total;
+ dst->ntuples += add->ntuples;
+ dst->ntuples2 += add->ntuples2;
+ dst->nloops += add->nloops;
+ dst->nfiltered1 += add->nfiltered1;
+ dst->nfiltered2 += add->nfiltered2;
+
+ /* Add delta of buffer usage since entry to node's totals */
+ if (dst->need_bufusage)
+ BufferUsageAdd(&dst->bufusage, &add->bufusage);
+
+ if (dst->need_walusage)
+ WalUsageAdd(&dst->walusage, &add->walusage);
+}
+
+/* note current values during parallel executor startup */
+void
+InstrStartParallelQuery(void)
+{
+ save_pgBufferUsage = pgBufferUsage;
+ save_pgWalUsage = pgWalUsage;
+}
+
+/* report usage after parallel executor shutdown */
+void
+InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+{
+ memset(bufusage, 0, sizeof(BufferUsage));
+ BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage);
+ memset(walusage, 0, sizeof(WalUsage));
+ WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage);
+}
+
+/* accumulate work done by workers in leader's stats */
+void
+InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+{
+ BufferUsageAdd(&pgBufferUsage, bufusage);
+ WalUsageAdd(&pgWalUsage, walusage);
+}
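+
+/*
+ * How the three parallel-query helpers above fit together (the actual
+ * plumbing lives in execParallel.c; shared-memory details are omitted, and
+ * "worker_bufusage"/"worker_walusage" are placeholder names for the
+ * worker-indexed arrays kept in dynamic shared memory):
+ *
+ *	in each worker:	InstrStartParallelQuery();
+ *					... execute the plan ...
+ *					InstrEndParallelQuery(&worker_bufusage[i], &worker_walusage[i]);
+ *
+ *	in the leader:	for each worker i:
+ *						InstrAccumParallelQuery(&worker_bufusage[i], &worker_walusage[i]);
+ */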
+
+/* dst += add */
+static void
+BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
+{
+ dst->shared_blks_hit += add->shared_blks_hit;
+ dst->shared_blks_read += add->shared_blks_read;
+ dst->shared_blks_dirtied += add->shared_blks_dirtied;
+ dst->shared_blks_written += add->shared_blks_written;
+ dst->local_blks_hit += add->local_blks_hit;
+ dst->local_blks_read += add->local_blks_read;
+ dst->local_blks_dirtied += add->local_blks_dirtied;
+ dst->local_blks_written += add->local_blks_written;
+ dst->temp_blks_read += add->temp_blks_read;
+ dst->temp_blks_written += add->temp_blks_written;
+ INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
+ INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
+}
+
+/* dst += add - sub */
+void
+BufferUsageAccumDiff(BufferUsage *dst,
+ const BufferUsage *add,
+ const BufferUsage *sub)
+{
+ dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
+ dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
+ dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied;
+ dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
+ dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit;
+ dst->local_blks_read += add->local_blks_read - sub->local_blks_read;
+ dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied;
+ dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
+ dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
+ dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
+ INSTR_TIME_ACCUM_DIFF(dst->blk_read_time,
+ add->blk_read_time, sub->blk_read_time);
+ INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
+ add->blk_write_time, sub->blk_write_time);
+}
+
+/* helper functions for WAL usage accumulation */
+static void
+WalUsageAdd(WalUsage *dst, WalUsage *add)
+{
+ dst->wal_bytes += add->wal_bytes;
+ dst->wal_records += add->wal_records;
+ dst->wal_fpi += add->wal_fpi;
+}
+
+void
+WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
+{
+ dst->wal_bytes += add->wal_bytes - sub->wal_bytes;
+ dst->wal_records += add->wal_records - sub->wal_records;
+ dst->wal_fpi += add->wal_fpi - sub->wal_fpi;
+}
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
new file mode 100644
index 0000000..31609c6
--- /dev/null
+++ b/src/backend/executor/nodeAgg.c
@@ -0,0 +1,4829 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeAgg.c
+ * Routines to handle aggregate nodes.
+ *
+ * ExecAgg normally evaluates each aggregate in the following steps:
+ *
+ * transvalue = initcond
+ * foreach input_tuple do
+ * transvalue = transfunc(transvalue, input_value(s))
+ * result = finalfunc(transvalue, direct_argument(s))
+ *
+ * If a finalfunc is not supplied then the result is just the ending
+ * value of transvalue.
+ *
+ * Other behaviors can be selected by the "aggsplit" mode, which exists
+ * to support partial aggregation. It is possible to:
+ * * Skip running the finalfunc, so that the output is always the
+ * final transvalue state.
+ * * Substitute the combinefunc for the transfunc, so that transvalue
+ * states (propagated up from a child partial-aggregation step) are merged
+ * rather than processing raw input rows. (The statements below about
+ * the transfunc apply equally to the combinefunc, when it's selected.)
+ * * Apply the serializefunc to the output values (this only makes sense
+ * when skipping the finalfunc, since the serializefunc works on the
+ * transvalue data type).
+ * * Apply the deserializefunc to the input values (this only makes sense
+ * when using the combinefunc, for similar reasons).
+ * It is the planner's responsibility to connect up Agg nodes using these
+ * alternate behaviors in a way that makes sense, with partial aggregation
+ * results being fed to nodes that expect them.
+ *
+ * If a normal aggregate call specifies DISTINCT or ORDER BY, we sort the
+ * input tuples and eliminate duplicates (if required) before performing
+ * the above-depicted process. (However, we don't do that for ordered-set
+ * aggregates; their "ORDER BY" inputs are ordinary aggregate arguments
+ * so far as this module is concerned.) Note that partial aggregation
+ * is not supported in these cases, since we couldn't ensure global
+ * ordering or distinctness of the inputs.
+ *
+ * If transfunc is marked "strict" in pg_proc and initcond is NULL,
+ * then the first non-NULL input_value is assigned directly to transvalue,
+ * and transfunc isn't applied until the second non-NULL input_value.
+ * The agg's first input type and transtype must be the same in this case!
+ *
+ * If transfunc is marked "strict" then NULL input_values are skipped,
+ * keeping the previous transvalue. If transfunc is not strict then it
+ * is called for every input tuple and must deal with NULL initcond
+ * or NULL input_values for itself.
+ *
+ * If finalfunc is marked "strict" then it is not called when the
+ * ending transvalue is NULL, instead a NULL result is created
+ * automatically (this is just the usual handling of strict functions,
+ * of course). A non-strict finalfunc can make its own choice of
+ * what to return for a NULL ending transvalue.
+ *
+ * Ordered-set aggregates are treated specially in one other way: we
+ * evaluate any "direct" arguments and pass them to the finalfunc along
+ * with the transition value.
+ *
+ * A finalfunc can have additional arguments beyond the transvalue and
+ * any "direct" arguments, corresponding to the input arguments of the
+ * aggregate. These are always just passed as NULL. Such arguments may be
+ * needed to allow resolution of a polymorphic aggregate's result type.
+ *
+ * We compute aggregate input expressions and run the transition functions
+ * in a temporary econtext (aggstate->tmpcontext). This is reset at least
+ * once per input tuple, so when the transvalue datatype is
+ * pass-by-reference, we have to be careful to copy it into a longer-lived
+ * memory context, and free the prior value to avoid memory leakage. We
+ * store transvalues in another set of econtexts, aggstate->aggcontexts
+ * (one per grouping set, see below), which are also used for the hashtable
+ * structures in AGG_HASHED mode. These econtexts are rescanned, not just
+ * reset, at group boundaries so that aggregate transition functions can
+ * register shutdown callbacks via AggRegisterCallback.
+ *
+ * The node's regular econtext (aggstate->ss.ps.ps_ExprContext) is used to
+ * run finalize functions and compute the output tuple; this context can be
+ * reset once per output tuple.
+ *
+ * The executor's AggState node is passed as the fmgr "context" value in
+ * all transfunc and finalfunc calls. It is not recommended that the
+ * transition functions look at the AggState node directly, but they can
+ * use AggCheckCallContext() to verify that they are being called by
+ * nodeAgg.c (and not as ordinary SQL functions). The main reason a
+ * transition function might want to know this is so that it can avoid
+ * palloc'ing a fixed-size pass-by-ref transition value on every call:
+ * it can instead just scribble on and return its left input. Ordinarily
+ * it is completely forbidden for functions to modify pass-by-ref inputs,
+ * but in the aggregate case we know the left input is either the initial
+ * transition value or a previous function result, and in either case its
+ * value need not be preserved. See int8inc() for an example. Notice that
+ * the EEOP_AGG_PLAIN_TRANS step is coded to avoid a data copy step when
+ * the previous transition value pointer is returned. It is also possible
+ * to avoid repeated data copying when the transition value is an expanded
+ * object: to do that, the transition function must take care to return
+ * an expanded object that is in a child context of the memory context
+ * returned by AggCheckCallContext(). Also, some transition functions want
+ * to store working state in addition to the nominal transition value; they
+ * can use the memory context returned by AggCheckCallContext() to do that.
+ *
+ * Note: AggCheckCallContext() is available as of PostgreSQL 9.0. The
+ * AggState is available as context in earlier releases (back to 8.1),
+ * but direct examination of the node is needed to use it before 9.0.
+ *
+ * As of 9.4, aggregate transition functions can also use AggGetAggref()
+ * to get hold of the Aggref expression node for their aggregate call.
+ * This is mainly intended for ordered-set aggregates, which are not
+ * supported as window functions. (A regular aggregate function would
+ * need some fallback logic to use this, since there's no Aggref node
+ * for a window function.)
+ *
+ * Grouping sets:
+ *
+ * A list of grouping sets which is structurally equivalent to a ROLLUP
+ * clause (e.g. (a,b,c), (a,b), (a)) can be processed in a single pass over
+ * ordered data. We do this by keeping a separate set of transition values
+ * for each grouping set being concurrently processed; for each input tuple
+ * we update them all, and on group boundaries we reset those states
+ * (starting at the front of the list) whose grouping values have changed
+ * (the list of grouping sets is ordered from most specific to least
+ * specific).
+ *
+ * Where more complex grouping sets are used, we break them down into
+ * "phases", where each phase has a different sort order (except phase 0
+ * which is reserved for hashing). During each phase but the last, the
+ * input tuples are additionally stored in a tuplesort which is keyed to the
+ * next phase's sort order; during each phase but the first, the input
+ * tuples are drawn from the previously sorted data. (The sorting of the
+ * data for the first phase is handled by the planner, as it might be
+ * satisfied by underlying nodes.)
+ *
+ * Hashing can be mixed with sorted grouping. To do this, we have an
+ * AGG_MIXED strategy that populates the hashtables during the first sorted
+ * phase, and switches to reading them out after completing all sort phases.
+ * We can also support AGG_HASHED with multiple hash tables and no sorting
+ * at all.
+ *
+ * From the perspective of aggregate transition and final functions, the
+ * only issue regarding grouping sets is this: a single call site (flinfo)
+ * of an aggregate function may be used for updating several different
+ * transition values in turn. So the function must not cache in the flinfo
+ * anything which logically belongs as part of the transition value (most
+ * importantly, the memory context in which the transition value exists).
+ * The support API functions (AggCheckCallContext, AggRegisterCallback) are
+ * sensitive to the grouping set for which the aggregate function is
+ * currently being called.
+ *
+ * Plan structure:
+ *
+ * What we get from the planner is actually one "real" Agg node which is
+ * part of the plan tree proper, but which optionally has an additional list
+ * of Agg nodes hung off the side via the "chain" field. This is because an
+ * Agg node happens to be a convenient representation of all the data we
+ * need for grouping sets.
+ *
+ * For many purposes, we treat the "real" node as if it were just the first
+ * node in the chain. The chain must be ordered such that hashed entries
+ * come before sorted/plain entries; the real node is marked AGG_MIXED if
+ * there are both types present (in which case the real node describes one
+ * of the hashed groupings, other AGG_HASHED nodes may optionally follow in
+ * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If
+ * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
+ * nodes must be of the same type; if it is AGG_PLAIN, there can be no
+ * chained nodes.
+ *
+ * We collect all hashed nodes into a single "phase", numbered 0, and create
+ * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
+ * Phase 0 is allocated even if there are no hashes, but remains unused in
+ * that case.
+ *
+ * AGG_HASHED nodes actually refer to only a single grouping set each,
+ * because for each hashed grouping we need a separate grpColIdx and
+ * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of
+ * grouping sets that share a sort order. Each AGG_SORTED node other than
+ * the first one has an associated Sort node which describes the sort order
+ * to be used; the first sorted node takes its input from the outer subtree,
+ * which the planner has already arranged to provide ordered data.
+ *
+ * Memory and ExprContext usage:
+ *
+ * Because we're accumulating aggregate values across input rows, we need to
+ * use more memory contexts than just simple input/output tuple contexts.
+ * In fact, for a rollup, we need a separate context for each grouping set
+ * so that we can reset the inner (finer-grained) aggregates on their group
+ * boundaries while continuing to accumulate values for outer
+ * (coarser-grained) groupings. On top of this, we might be simultaneously
+ * populating hashtables; however, we only need one context for all the
+ * hashtables.
+ *
+ * So we create an array, aggcontexts, with an ExprContext for each grouping
+ * set in the largest rollup that we're going to process, and use the
+ * per-tuple memory context of those ExprContexts to store the aggregate
+ * transition values. hashcontext is the single context created to support
+ * all hash tables.
+ *
+ * Spilling To Disk
+ *
+ * When performing hash aggregation, if the hash table memory exceeds the
+ * limit (see hash_agg_check_limits()), we enter "spill mode". In spill
+ * mode, we advance the transition states only for groups already in the
+ * hash table. For tuples that would need to create new hash table
+ * entries (and initialize new transition states), we instead spill them to
+ * disk to be processed later. The tuples are spilled in a partitioned
+ * manner, so that subsequent batches are smaller and less likely to exceed
+ * hash_mem (if a batch does exceed hash_mem, it must be spilled
+ * recursively).
+ *
+ * Spilled data is written to logical tapes. These provide better control
+ * over memory usage, disk space, and the number of files than if we were
+ * to use a BufFile for each spill.
+ *
+ * Note that it's possible for transition states to start small but then
+ * grow very large; for instance in the case of ARRAY_AGG. In such cases,
+ * it's still possible to significantly exceed hash_mem. We try to avoid
+ * this situation by estimating what will fit in the available memory, and
+ * imposing a limit on the number of groups separately from the amount of
+ * memory consumed.
+ *
+ * Transition / Combine function invocation:
+ *
+ * For performance reasons transition functions, including combine
+ * functions, aren't invoked one-by-one from nodeAgg.c after computing
+ * arguments using the expression evaluation engine. Instead
+ * ExecBuildAggTrans() builds one large expression that does both argument
+ * evaluation and transition function invocation. That avoids performance
+ * issues due to repeated uses of expression evaluation, complications due
+ * to filter expressions having to be evaluated early, and allows the
+ * entire expression to be JIT compiled into one native function.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeAgg.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/parallel.h"
+#include "catalog/objectaccess.h"
+#include "catalog/pg_aggregate.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "common/hashfn.h"
+#include "executor/execExpr.h"
+#include "executor/executor.h"
+#include "executor/nodeAgg.h"
+#include "lib/hyperloglog.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
+#include "parser/parse_agg.h"
+#include "parser/parse_coerce.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/dynahash.h"
+#include "utils/expandeddatum.h"
+#include "utils/logtape.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/syscache.h"
+#include "utils/tuplesort.h"
+
+/*
+ * Control how many partitions are created when spilling HashAgg to
+ * disk.
+ *
+ * HASHAGG_PARTITION_FACTOR is multiplied by the estimated number of
+ * partitions needed such that each partition will fit in memory. The factor
+ * is set higher than one because there's not a high cost to having a few too
+ * many partitions, and it makes it less likely that a partition will need to
+ * be spilled recursively. Another benefit of having more, smaller partitions
+ * is that small hash tables may perform better than large ones due to memory
+ * caching effects.
+ *
+ * We also specify a min and max number of partitions per spill. Too few might
+ * mean a lot of wasted I/O from repeated spilling of the same tuples. Too
+ * many will result in lots of memory wasted buffering the spill files (which
+ * could instead be spent on a larger hash table).
+ */
+#define HASHAGG_PARTITION_FACTOR 1.50
+#define HASHAGG_MIN_PARTITIONS 4
+#define HASHAGG_MAX_PARTITIONS 1024
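+
+/*
+ * Rough worked example: spilling an estimated 1 GB worth of groups under a
+ * 64 MB memory limit targets about (1024 / 64) * 1.50 = 24 partitions;
+ * hash_choose_num_partitions() then rounds the count to a power of two and
+ * clamps it to the [HASHAGG_MIN_PARTITIONS, HASHAGG_MAX_PARTITIONS] range,
+ * subject also to a cap on the memory used by the per-partition write
+ * buffers.
+ */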
+
+/*
+ * For reading from tapes, the buffer size must be a multiple of
+ * BLCKSZ. Larger values help when reading from multiple tapes concurrently,
+ * but that doesn't happen in HashAgg, so we simply use BLCKSZ. Writing to a
+ * tape always uses a buffer of size BLCKSZ.
+ */
+#define HASHAGG_READ_BUFFER_SIZE BLCKSZ
+#define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ
+
+/*
+ * HyperLogLog is used for estimating the cardinality of the spilled tuples in
+ * a given partition. 5 bits corresponds to a size of about 32 bytes and a
+ * worst-case error of around 18%. That's effective enough to choose a
+ * reasonable number of partitions when recursing.
+ */
+#define HASHAGG_HLL_BIT_WIDTH 5
+
+/*
+ * Estimate chunk overhead as a constant 16 bytes. XXX: should this be
+ * improved?
+ */
+#define CHUNKHDRSZ 16
+
+/*
+ * Track all tapes needed for a HashAgg that spills. We don't know the maximum
+ * number of tapes needed at the start of the algorithm (because it can
+ * recurse), so one tape set is allocated and extended as needed for new
+ * tapes. Once a particular tape has been fully read, it is rewound for write
+ * mode and put on the free list.
+ *
+ * Tapes' buffers can take up substantial memory when many tapes are open at
+ * once. We only need one tape open at a time in read mode (using a buffer
+ * that's a multiple of BLCKSZ); but we need one tape open in write mode (each
+ * requiring a buffer of size BLCKSZ) for each partition.
+ */
+typedef struct HashTapeInfo
+{
+ LogicalTapeSet *tapeset;
+ int ntapes;
+ int *freetapes;
+ int nfreetapes;
+ int freetapes_alloc;
+} HashTapeInfo;
+
+/*
+ * Represents partitioned spill data for a single hashtable. Contains the
+ * necessary information to route tuples to the correct partition, and to
+ * transform the spilled data into new batches.
+ *
+ * The high bits are used for partition selection (when recursing, we ignore
+ * the bits that have already been used for partition selection at an earlier
+ * level).
+ */
+typedef struct HashAggSpill
+{
+ LogicalTapeSet *tapeset; /* borrowed reference to tape set */
+ int npartitions; /* number of partitions */
+ int *partitions; /* spill partition tape numbers */
+ int64 *ntuples; /* number of tuples in each partition */
+ uint32 mask; /* mask to find partition from hash value */
+ int shift; /* after masking, shift by this amount */
+ hyperLogLogState *hll_card; /* cardinality estimate for contents */
+} HashAggSpill;
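+
+/*
+ * Partition selection is conceptually a mask-and-shift on the group's hash
+ * value:
+ *
+ *		partition = (hashvalue & spill->mask) >> spill->shift;
+ *
+ * With 4 partitions and no bits used yet, the mask covers the top two bits
+ * of the 32-bit hash and the shift brings them down to 0..3; a recursive
+ * spill then consumes the next-lower bits, as tracked by used_bits in
+ * HashAggBatch below.
+ */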
+
+/*
+ * Represents work to be done for one pass of hash aggregation (with only one
+ * grouping set).
+ *
+ * Also tracks the bits of the hash already used for partition selection by
+ * earlier iterations, so that this batch can use new bits. If all bits have
+ * already been used, no partitioning will be done (any spilled data will go
+ * to a single output tape).
+ */
+typedef struct HashAggBatch
+{
+ int setno; /* grouping set */
+ int used_bits; /* number of bits of hash already used */
+ LogicalTapeSet *tapeset; /* borrowed reference to tape set */
+ int input_tapenum; /* input partition tape */
+ int64 input_tuples; /* number of tuples in this batch */
+ double input_card; /* estimated group cardinality */
+} HashAggBatch;
+
+/* used to find referenced colnos */
+typedef struct FindColsContext
+{
+ bool is_aggref; /* is under an aggref */
+ Bitmapset *aggregated; /* column references under an aggref */
+ Bitmapset *unaggregated; /* other column references */
+} FindColsContext;
+
+static void select_current_set(AggState *aggstate, int setno, bool is_hash);
+static void initialize_phase(AggState *aggstate, int newphase);
+static TupleTableSlot *fetch_input_tuple(AggState *aggstate);
+static void initialize_aggregates(AggState *aggstate,
+ AggStatePerGroup *pergroups,
+ int numReset);
+static void advance_transition_function(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroupstate);
+static void advance_aggregates(AggState *aggstate);
+static void process_ordered_aggregate_single(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroupstate);
+static void process_ordered_aggregate_multi(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroupstate);
+static void finalize_aggregate(AggState *aggstate,
+ AggStatePerAgg peragg,
+ AggStatePerGroup pergroupstate,
+ Datum *resultVal, bool *resultIsNull);
+static void finalize_partialaggregate(AggState *aggstate,
+ AggStatePerAgg peragg,
+ AggStatePerGroup pergroupstate,
+ Datum *resultVal, bool *resultIsNull);
+static inline void prepare_hash_slot(AggStatePerHash perhash,
+ TupleTableSlot *inputslot,
+ TupleTableSlot *hashslot);
+static void prepare_projection_slot(AggState *aggstate,
+ TupleTableSlot *slot,
+ int currentSet);
+static void finalize_aggregates(AggState *aggstate,
+ AggStatePerAgg peragg,
+ AggStatePerGroup pergroup);
+static TupleTableSlot *project_aggregates(AggState *aggstate);
+static void find_cols(AggState *aggstate, Bitmapset **aggregated,
+ Bitmapset **unaggregated);
+static bool find_cols_walker(Node *node, FindColsContext *context);
+static void build_hash_tables(AggState *aggstate);
+static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
+static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
+ bool nullcheck);
+static long hash_choose_num_buckets(double hashentrysize,
+ long estimated_nbuckets,
+ Size memory);
+static int hash_choose_num_partitions(double input_groups,
+ double hashentrysize,
+ int used_bits,
+ int *log2_npartitions);
+static void initialize_hash_entry(AggState *aggstate,
+ TupleHashTable hashtable,
+ TupleHashEntry entry);
+static void lookup_hash_entries(AggState *aggstate);
+static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
+static void agg_fill_hash_table(AggState *aggstate);
+static bool agg_refill_hash_table(AggState *aggstate);
+static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate);
+static TupleTableSlot *agg_retrieve_hash_table_in_memory(AggState *aggstate);
+static void hash_agg_check_limits(AggState *aggstate);
+static void hash_agg_enter_spill_mode(AggState *aggstate);
+static void hash_agg_update_metrics(AggState *aggstate, bool from_tape,
+ int npartitions);
+static void hashagg_finish_initial_spills(AggState *aggstate);
+static void hashagg_reset_spill_state(AggState *aggstate);
+static HashAggBatch *hashagg_batch_new(LogicalTapeSet *tapeset,
+ int input_tapenum, int setno,
+ int64 input_tuples, double input_card,
+ int used_bits);
+static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp);
+static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo,
+ int used_bits, double input_groups,
+ double hashentrysize);
+static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
+ TupleTableSlot *slot, uint32 hash);
+static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill,
+ int setno);
+static void hashagg_tapeinfo_init(AggState *aggstate);
+static void hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *dest,
+ int ndest);
+static void hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum);
+static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
+static void build_pertrans_for_aggref(AggStatePerTrans pertrans,
+ AggState *aggstate, EState *estate,
+ Aggref *aggref, Oid aggtransfn, Oid aggtranstype,
+ Oid aggserialfn, Oid aggdeserialfn,
+ Datum initValue, bool initValueIsNull,
+ Oid *inputTypes, int numArguments);
+
+
+/*
+ * Select the current grouping set; affects current_set and
+ * curaggcontext.
+ */
+static void
+select_current_set(AggState *aggstate, int setno, bool is_hash)
+{
+ /*
+ * When changing this, also adapt ExecAggPlainTransByVal() and
+ * ExecAggPlainTransByRef().
+ */
+ if (is_hash)
+ aggstate->curaggcontext = aggstate->hashcontext;
+ else
+ aggstate->curaggcontext = aggstate->aggcontexts[setno];
+
+ aggstate->current_set = setno;
+}
+
+/*
+ * Switch to phase "newphase", which must either be 0 or 1 (to reset) or
+ * current_phase + 1. Juggle the tuplesorts accordingly.
+ *
+ * Phase 0 is for hashing, which we currently handle last in the AGG_MIXED
+ * case, so when entering phase 0, all we need to do is drop open sorts.
+ */
+static void
+initialize_phase(AggState *aggstate, int newphase)
+{
+ Assert(newphase <= 1 || newphase == aggstate->current_phase + 1);
+
+ /*
+ * Whatever the previous state, we're now done with whatever input
+ * tuplesort was in use.
+ */
+ if (aggstate->sort_in)
+ {
+ tuplesort_end(aggstate->sort_in);
+ aggstate->sort_in = NULL;
+ }
+
+ if (newphase <= 1)
+ {
+ /*
+ * Discard any existing output tuplesort.
+ */
+ if (aggstate->sort_out)
+ {
+ tuplesort_end(aggstate->sort_out);
+ aggstate->sort_out = NULL;
+ }
+ }
+ else
+ {
+ /*
+ * The old output tuplesort becomes the new input one, and this is the
+ * right time to actually sort it.
+ */
+ aggstate->sort_in = aggstate->sort_out;
+ aggstate->sort_out = NULL;
+ Assert(aggstate->sort_in);
+ tuplesort_performsort(aggstate->sort_in);
+ }
+
+ /*
+ * If this isn't the last phase, we need to sort appropriately for the
+ * next phase in sequence.
+ */
+ if (newphase > 0 && newphase < aggstate->numphases - 1)
+ {
+ Sort *sortnode = aggstate->phases[newphase + 1].sortnode;
+ PlanState *outerNode = outerPlanState(aggstate);
+ TupleDesc tupDesc = ExecGetResultType(outerNode);
+
+ aggstate->sort_out = tuplesort_begin_heap(tupDesc,
+ sortnode->numCols,
+ sortnode->sortColIdx,
+ sortnode->sortOperators,
+ sortnode->collations,
+ sortnode->nullsFirst,
+ work_mem,
+ NULL, false);
+ }
+
+ aggstate->current_phase = newphase;
+ aggstate->phase = &aggstate->phases[newphase];
+}
+
+/*
+ * Fetch a tuple from either the outer plan (for phase 1) or from the sorter
+ * populated by the previous phase. Copy it to the sorter for the next phase
+ * if any.
+ *
+ * Callers cannot rely on the memory for the tuple in the returned slot
+ * remaining valid past any subsequently fetched tuple.
+ */
+static TupleTableSlot *
+fetch_input_tuple(AggState *aggstate)
+{
+ TupleTableSlot *slot;
+
+ if (aggstate->sort_in)
+ {
+ /* make sure we check for interrupts in either path through here */
+ CHECK_FOR_INTERRUPTS();
+ if (!tuplesort_gettupleslot(aggstate->sort_in, true, false,
+ aggstate->sort_slot, NULL))
+ return NULL;
+ slot = aggstate->sort_slot;
+ }
+ else
+ slot = ExecProcNode(outerPlanState(aggstate));
+
+ if (!TupIsNull(slot) && aggstate->sort_out)
+ tuplesort_puttupleslot(aggstate->sort_out, slot);
+
+ return slot;
+}
+
+/*
+ * (Re)Initialize an individual aggregate.
+ *
+ * This function handles only one grouping set, already set in
+ * aggstate->current_set.
+ *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static void
+initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans,
+ AggStatePerGroup pergroupstate)
+{
+ /*
+ * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
+ */
+ if (pertrans->numSortCols > 0)
+ {
+ /*
+ * In case of a rescan, there might be an uncompleted sort operation;
+ * clean it up if so.
+ */
+ if (pertrans->sortstates[aggstate->current_set])
+ tuplesort_end(pertrans->sortstates[aggstate->current_set]);
+
+ /*
+ * We use a plain Datum sorter when there's a single input column;
+ * otherwise sort the full tuple. (See comments for
+ * process_ordered_aggregate_single.)
+ */
+ if (pertrans->numInputs == 1)
+ {
+ Form_pg_attribute attr = TupleDescAttr(pertrans->sortdesc, 0);
+
+ pertrans->sortstates[aggstate->current_set] =
+ tuplesort_begin_datum(attr->atttypid,
+ pertrans->sortOperators[0],
+ pertrans->sortCollations[0],
+ pertrans->sortNullsFirst[0],
+ work_mem, NULL, false);
+ }
+ else
+ pertrans->sortstates[aggstate->current_set] =
+ tuplesort_begin_heap(pertrans->sortdesc,
+ pertrans->numSortCols,
+ pertrans->sortColIdx,
+ pertrans->sortOperators,
+ pertrans->sortCollations,
+ pertrans->sortNullsFirst,
+ work_mem, NULL, false);
+ }
+
+ /*
+ * (Re)set transValue to the initial value.
+ *
+ * Note that when the initial value is pass-by-ref, we must copy it (into
+ * the aggcontext) since we will pfree the transValue later.
+ */
+ if (pertrans->initValueIsNull)
+ pergroupstate->transValue = pertrans->initValue;
+ else
+ {
+ MemoryContext oldContext;
+
+ oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory);
+ pergroupstate->transValue = datumCopy(pertrans->initValue,
+ pertrans->transtypeByVal,
+ pertrans->transtypeLen);
+ MemoryContextSwitchTo(oldContext);
+ }
+ pergroupstate->transValueIsNull = pertrans->initValueIsNull;
+
+ /*
+ * If the initial value for the transition state doesn't exist in the
+ * pg_aggregate table then we will let the first non-NULL value returned
+ * from the outer procNode become the initial value. (This is useful for
+ * aggregates like max() and min().) The noTransValue flag signals that we
+ * still need to do this.
+ */
+ pergroupstate->noTransValue = pertrans->initValueIsNull;
+}
+
+/*
+ * Initialize all aggregate transition states for a new group of input values.
+ *
+ * If there are multiple grouping sets, we initialize only the first numReset
+ * of them (the grouping sets are ordered so that the most specific one, which
+ * is reset most often, is first). As a convenience, if numReset is 0, we
+ * reinitialize all sets.
+ *
+ * NB: This cannot be used for hash aggregates, as for those the grouping set
+ * number has to be specified from further up.
+ *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static void
+initialize_aggregates(AggState *aggstate,
+ AggStatePerGroup *pergroups,
+ int numReset)
+{
+ int transno;
+ int numGroupingSets = Max(aggstate->phase->numsets, 1);
+ int setno = 0;
+ int numTrans = aggstate->numtrans;
+ AggStatePerTrans transstates = aggstate->pertrans;
+
+ if (numReset == 0)
+ numReset = numGroupingSets;
+
+ for (setno = 0; setno < numReset; setno++)
+ {
+ AggStatePerGroup pergroup = pergroups[setno];
+
+ select_current_set(aggstate, setno, false);
+
+ for (transno = 0; transno < numTrans; transno++)
+ {
+ AggStatePerTrans pertrans = &transstates[transno];
+ AggStatePerGroup pergroupstate = &pergroup[transno];
+
+ initialize_aggregate(aggstate, pertrans, pergroupstate);
+ }
+ }
+}
+
+/*
+ * Given new input value(s), advance the transition function of one aggregate
+ * state within one grouping set only (already set in aggstate->current_set)
+ *
+ * The new values (and null flags) have been preloaded into argument positions
+ * 1 and up in pertrans->transfn_fcinfo, so that we needn't copy them again to
+ * pass to the transition function. We also expect that the static fields of
+ * the fcinfo are already initialized; that was done by ExecInitAgg().
+ *
+ * It doesn't matter which memory context this is called in.
+ */
+static void
+advance_transition_function(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroupstate)
+{
+ FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
+ MemoryContext oldContext;
+ Datum newVal;
+
+ if (pertrans->transfn.fn_strict)
+ {
+ /*
+ * For a strict transfn, nothing happens when there's a NULL input; we
+ * just keep the prior transValue.
+ */
+ int numTransInputs = pertrans->numTransInputs;
+ int i;
+
+ for (i = 1; i <= numTransInputs; i++)
+ {
+ if (fcinfo->args[i].isnull)
+ return;
+ }
+ if (pergroupstate->noTransValue)
+ {
+ /*
+ * transValue has not been initialized. This is the first non-NULL
+ * input value. We use it as the initial value for transValue. (We
+ * already checked that the agg's input type is binary-compatible
+ * with its transtype, so straight copy here is OK.)
+ *
+ * We must copy the datum into aggcontext if it is pass-by-ref. We
+ * do not need to pfree the old transValue, since it's NULL.
+ */
+ oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory);
+ pergroupstate->transValue = datumCopy(fcinfo->args[1].value,
+ pertrans->transtypeByVal,
+ pertrans->transtypeLen);
+ pergroupstate->transValueIsNull = false;
+ pergroupstate->noTransValue = false;
+ MemoryContextSwitchTo(oldContext);
+ return;
+ }
+ if (pergroupstate->transValueIsNull)
+ {
+ /*
+ * Don't call a strict function with NULL inputs. Note it is
+ * possible to get here despite the above tests, if the transfn is
+ * strict *and* returned a NULL on a prior cycle. If that happens
+ * we will propagate the NULL all the way to the end.
+ */
+ return;
+ }
+ }
+
+ /* We run the transition functions in per-input-tuple memory context */
+ oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);
+
+ /* set up aggstate->curpertrans for AggGetAggref() */
+ aggstate->curpertrans = pertrans;
+
+ /*
+ * OK to call the transition function
+ */
+ fcinfo->args[0].value = pergroupstate->transValue;
+ fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
+ fcinfo->isnull = false; /* just in case transfn doesn't set it */
+
+ newVal = FunctionCallInvoke(fcinfo);
+
+ aggstate->curpertrans = NULL;
+
+ /*
+ * If pass-by-ref datatype, must copy the new value into aggcontext and
+ * free the prior transValue. But if transfn returned a pointer to its
+ * first input, we don't need to do anything. Also, if transfn returned a
+ * pointer to a R/W expanded object that is already a child of the
+ * aggcontext, assume we can adopt that value without copying it.
+ *
+ * It's safe to compare newVal with pergroup->transValue without regard
+ * for either being NULL, because ExecAggTransReparent() takes care to set
+ * transValue to 0 when NULL. Otherwise we could end up accidentally not
+ * reparenting, when the transValue has the same numerical value as
+ * newValue, despite being NULL. This is a somewhat hot path, making it
+ * undesirable to instead solve this with another branch for the common
+ * case of the transition function returning its (modified) input
+ * argument.
+ */
+ if (!pertrans->transtypeByVal &&
+ DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue))
+ newVal = ExecAggTransReparent(aggstate, pertrans,
+ newVal, fcinfo->isnull,
+ pergroupstate->transValue,
+ pergroupstate->transValueIsNull);
+
+ pergroupstate->transValue = newVal;
+ pergroupstate->transValueIsNull = fcinfo->isnull;
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * Advance each aggregate transition state for one input tuple. The input
+ * tuple has been stored in tmpcontext->ecxt_outertuple, so that it is
+ * accessible to ExecEvalExpr.
+ *
+ * We have two sets of transition states to handle: one for sorted aggregation
+ * and one for hashed; we do them both here, to avoid multiple evaluation of
+ * the inputs.
+ *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static void
+advance_aggregates(AggState *aggstate)
+{
+ bool dummynull;
+
+ ExecEvalExprSwitchContext(aggstate->phase->evaltrans,
+ aggstate->tmpcontext,
+ &dummynull);
+}
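+
+/*
+ * Conceptually, evaltrans performs, for each transition state of the
+ * currently selected grouping set(s), the equivalent of
+ *
+ *		<evaluate the aggregate arguments into pertrans->transfn_fcinfo>
+ *		advance_transition_function(aggstate, pertrans, pergroupstate);
+ *
+ * but compiled by ExecBuildAggTrans() into one large expression (see the
+ * file header comment), which may additionally be JIT compiled.
+ */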
+
+/*
+ * Run the transition function for a DISTINCT or ORDER BY aggregate
+ * with only one input. This is called after we have completed
+ * entering all the input values into the sort object. We complete the
+ * sort, read out the values in sorted order, and run the transition
+ * function on each value (applying DISTINCT if appropriate).
+ *
+ * Note that the strictness of the transition function was checked when
+ * entering the values into the sort, so we don't check it again here;
+ * we just apply standard SQL DISTINCT logic.
+ *
+ * The one-input case is handled separately from the multi-input case
+ * for performance reasons: for single by-value inputs, such as the
+ * common case of count(distinct id), the tuplesort_getdatum code path
+ * is around 300% faster. (The speedup for by-reference types is less
+ * but still noticeable.)
+ *
+ * This function handles only one grouping set (already set in
+ * aggstate->current_set).
+ *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static void
+process_ordered_aggregate_single(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroupstate)
+{
+ Datum oldVal = (Datum) 0;
+ bool oldIsNull = true;
+ bool haveOldVal = false;
+ MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory;
+ MemoryContext oldContext;
+ bool isDistinct = (pertrans->numDistinctCols > 0);
+ Datum newAbbrevVal = (Datum) 0;
+ Datum oldAbbrevVal = (Datum) 0;
+ FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
+ Datum *newVal;
+ bool *isNull;
+
+ Assert(pertrans->numDistinctCols < 2);
+
+ tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
+
+ /* Load the column into argument 1 (arg 0 will be transition value) */
+ newVal = &fcinfo->args[1].value;
+ isNull = &fcinfo->args[1].isnull;
+
+ /*
+ * Note: if input type is pass-by-ref, the datums returned by the sort are
+ * freshly palloc'd in the per-query context, so we must be careful to
+ * pfree them when they are no longer needed.
+ */
+
+ while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set],
+ true, newVal, isNull, &newAbbrevVal))
+ {
+ /*
+ * Clear and select the working context for evaluation of the equality
+ * function and transition function.
+ */
+ MemoryContextReset(workcontext);
+ oldContext = MemoryContextSwitchTo(workcontext);
+
+ /*
+ * If DISTINCT mode, and not distinct from prior, skip it.
+ */
+ if (isDistinct &&
+ haveOldVal &&
+ ((oldIsNull && *isNull) ||
+ (!oldIsNull && !*isNull &&
+ oldAbbrevVal == newAbbrevVal &&
+ DatumGetBool(FunctionCall2Coll(&pertrans->equalfnOne,
+ pertrans->aggCollation,
+ oldVal, *newVal)))))
+ {
+ /* equal to prior, so forget this one */
+ if (!pertrans->inputtypeByVal && !*isNull)
+ pfree(DatumGetPointer(*newVal));
+ }
+ else
+ {
+ advance_transition_function(aggstate, pertrans, pergroupstate);
+ /* forget the old value, if any */
+ if (!oldIsNull && !pertrans->inputtypeByVal)
+ pfree(DatumGetPointer(oldVal));
+ /* and remember the new one for subsequent equality checks */
+ oldVal = *newVal;
+ oldAbbrevVal = newAbbrevVal;
+ oldIsNull = *isNull;
+ haveOldVal = true;
+ }
+
+ MemoryContextSwitchTo(oldContext);
+ }
+
+ if (!oldIsNull && !pertrans->inputtypeByVal)
+ pfree(DatumGetPointer(oldVal));
+
+ tuplesort_end(pertrans->sortstates[aggstate->current_set]);
+ pertrans->sortstates[aggstate->current_set] = NULL;
+}
+
+/*
+ * Run the transition function for a DISTINCT or ORDER BY aggregate
+ * with more than one input. This is called after we have completed
+ * entering all the input values into the sort object. We complete the
+ * sort, read out the values in sorted order, and run the transition
+ * function on each value (applying DISTINCT if appropriate).
+ *
+ * This function handles only one grouping set (already set in
+ * aggstate->current_set).
+ *
+ * When called, CurrentMemoryContext should be the per-query context.
+ */
+static void
+process_ordered_aggregate_multi(AggState *aggstate,
+ AggStatePerTrans pertrans,
+ AggStatePerGroup pergroupstate)
+{
+ ExprContext *tmpcontext = aggstate->tmpcontext;
+ FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
+ TupleTableSlot *slot1 = pertrans->sortslot;
+ TupleTableSlot *slot2 = pertrans->uniqslot;
+ int numTransInputs = pertrans->numTransInputs;
+ int numDistinctCols = pertrans->numDistinctCols;
+ Datum newAbbrevVal = (Datum) 0;
+ Datum oldAbbrevVal = (Datum) 0;
+ bool haveOldValue = false;
+ TupleTableSlot *save = aggstate->tmpcontext->ecxt_outertuple;
+ int i;
+
+ tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
+
+ ExecClearTuple(slot1);
+ if (slot2)
+ ExecClearTuple(slot2);
+
+ while (tuplesort_gettupleslot(pertrans->sortstates[aggstate->current_set],
+ true, true, slot1, &newAbbrevVal))
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ tmpcontext->ecxt_outertuple = slot1;
+ tmpcontext->ecxt_innertuple = slot2;
+
+ if (numDistinctCols == 0 ||
+ !haveOldValue ||
+ newAbbrevVal != oldAbbrevVal ||
+ !ExecQual(pertrans->equalfnMulti, tmpcontext))
+ {
+ /*
+ * Extract the first numTransInputs columns as datums to pass to
+ * the transfn.
+ */
+ slot_getsomeattrs(slot1, numTransInputs);
+
+ /* Load values into fcinfo */
+ /* Start from 1, since the 0th arg will be the transition value */
+ for (i = 0; i < numTransInputs; i++)
+ {
+ fcinfo->args[i + 1].value = slot1->tts_values[i];
+ fcinfo->args[i + 1].isnull = slot1->tts_isnull[i];
+ }
+
+ advance_transition_function(aggstate, pertrans, pergroupstate);
+
+ if (numDistinctCols > 0)
+ {
+ /* swap the slot pointers to retain the current tuple */
+ TupleTableSlot *tmpslot = slot2;
+
+ slot2 = slot1;
+ slot1 = tmpslot;
+ /* avoid ExecQual() calls by reusing abbreviated keys */
+ oldAbbrevVal = newAbbrevVal;
+ haveOldValue = true;
+ }
+ }
+
+ /* Reset context each time */
+ ResetExprContext(tmpcontext);
+
+ ExecClearTuple(slot1);
+ }
+
+ if (slot2)
+ ExecClearTuple(slot2);
+
+ tuplesort_end(pertrans->sortstates[aggstate->current_set]);
+ pertrans->sortstates[aggstate->current_set] = NULL;
+
+ /* restore previous slot, potentially in use for grouping sets */
+ tmpcontext->ecxt_outertuple = save;
+}
+
+/*
+ * Compute the final value of one aggregate.
+ *
+ * This function handles only one grouping set (already set in
+ * aggstate->current_set).
+ *
+ * The finalfn will be run, and the result delivered, in the
+ * output-tuple context; caller's CurrentMemoryContext does not matter.
+ *
+ * The finalfn uses the state as set in the transno. This also might be
+ * being used by another aggregate function, so it's important that we do
+ * nothing destructive here.
+ */
+static void
+finalize_aggregate(AggState *aggstate,
+ AggStatePerAgg peragg,
+ AggStatePerGroup pergroupstate,
+ Datum *resultVal, bool *resultIsNull)
+{
+ LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
+ bool anynull = false;
+ MemoryContext oldContext;
+ int i;
+ ListCell *lc;
+ AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
+
+ oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
+
+ /*
+ * Evaluate any direct arguments. We do this even if there's no finalfn
+ * (which is unlikely anyway), so that side-effects happen as expected.
+ * The direct arguments go into arg positions 1 and up, leaving position 0
+ * for the transition state value.
+ */
+ i = 1;
+ foreach(lc, peragg->aggdirectargs)
+ {
+ ExprState *expr = (ExprState *) lfirst(lc);
+
+ fcinfo->args[i].value = ExecEvalExpr(expr,
+ aggstate->ss.ps.ps_ExprContext,
+ &fcinfo->args[i].isnull);
+ anynull |= fcinfo->args[i].isnull;
+ i++;
+ }
+
+ /*
+ * Apply the agg's finalfn if one is provided, else return transValue.
+ */
+ if (OidIsValid(peragg->finalfn_oid))
+ {
+ int numFinalArgs = peragg->numFinalArgs;
+
+ /* set up aggstate->curperagg for AggGetAggref() */
+ aggstate->curperagg = peragg;
+
+ InitFunctionCallInfoData(*fcinfo, &peragg->finalfn,
+ numFinalArgs,
+ pertrans->aggCollation,
+ (void *) aggstate, NULL);
+
+ /* Fill in the transition state value */
+ fcinfo->args[0].value =
+ MakeExpandedObjectReadOnly(pergroupstate->transValue,
+ pergroupstate->transValueIsNull,
+ pertrans->transtypeLen);
+ fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
+ anynull |= pergroupstate->transValueIsNull;
+
+ /* Fill any remaining argument positions with nulls */
+ for (; i < numFinalArgs; i++)
+ {
+ fcinfo->args[i].value = (Datum) 0;
+ fcinfo->args[i].isnull = true;
+ anynull = true;
+ }
+
+ if (fcinfo->flinfo->fn_strict && anynull)
+ {
+ /* don't call a strict function with NULL inputs */
+ *resultVal = (Datum) 0;
+ *resultIsNull = true;
+ }
+ else
+ {
+ *resultVal = FunctionCallInvoke(fcinfo);
+ *resultIsNull = fcinfo->isnull;
+ }
+ aggstate->curperagg = NULL;
+ }
+ else
+ {
+ /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
+ *resultVal = pergroupstate->transValue;
+ *resultIsNull = pergroupstate->transValueIsNull;
+ }
+
+ /*
+ * If result is pass-by-ref, make sure it is in the right context.
+ */
+ if (!peragg->resulttypeByVal && !*resultIsNull &&
+ !MemoryContextContains(CurrentMemoryContext,
+ DatumGetPointer(*resultVal)))
+ *resultVal = datumCopy(*resultVal,
+ peragg->resulttypeByVal,
+ peragg->resulttypeLen);
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * Compute the output value of one partial aggregate.
+ *
+ * The serialization function will be run, and the result delivered, in the
+ * output-tuple context; caller's CurrentMemoryContext does not matter.
+ */
+static void
+finalize_partialaggregate(AggState *aggstate,
+ AggStatePerAgg peragg,
+ AggStatePerGroup pergroupstate,
+ Datum *resultVal, bool *resultIsNull)
+{
+ AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
+ MemoryContext oldContext;
+
+ oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
+
+ /*
+ * serialfn_oid will be set if we must serialize the transvalue before
+ * returning it
+ */
+ if (OidIsValid(pertrans->serialfn_oid))
+ {
+ /* Don't call a strict serialization function with NULL input. */
+ if (pertrans->serialfn.fn_strict && pergroupstate->transValueIsNull)
+ {
+ *resultVal = (Datum) 0;
+ *resultIsNull = true;
+ }
+ else
+ {
+ FunctionCallInfo fcinfo = pertrans->serialfn_fcinfo;
+
+ fcinfo->args[0].value =
+ MakeExpandedObjectReadOnly(pergroupstate->transValue,
+ pergroupstate->transValueIsNull,
+ pertrans->transtypeLen);
+ fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
+ fcinfo->isnull = false;
+
+ *resultVal = FunctionCallInvoke(fcinfo);
+ *resultIsNull = fcinfo->isnull;
+ }
+ }
+ else
+ {
+ /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
+ *resultVal = pergroupstate->transValue;
+ *resultIsNull = pergroupstate->transValueIsNull;
+ }
+
+ /* If result is pass-by-ref, make sure it is in the right context. */
+ if (!peragg->resulttypeByVal && !*resultIsNull &&
+ !MemoryContextContains(CurrentMemoryContext,
+ DatumGetPointer(*resultVal)))
+ *resultVal = datumCopy(*resultVal,
+ peragg->resulttypeByVal,
+ peragg->resulttypeLen);
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * Extract the attributes that make up the grouping key into the
+ * hashslot. This is necessary to compute the hash or perform a lookup.
+ */
+static inline void
+prepare_hash_slot(AggStatePerHash perhash,
+ TupleTableSlot *inputslot,
+ TupleTableSlot *hashslot)
+{
+ int i;
+
+ /* transfer just the needed columns into hashslot */
+ slot_getsomeattrs(inputslot, perhash->largestGrpColIdx);
+ ExecClearTuple(hashslot);
+
+ for (i = 0; i < perhash->numhashGrpCols; i++)
+ {
+ int varNumber = perhash->hashGrpColIdxInput[i] - 1;
+
+ hashslot->tts_values[i] = inputslot->tts_values[varNumber];
+ hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber];
+ }
+ ExecStoreVirtualTuple(hashslot);
+}
+
+/*
+ * Prepare to finalize and project based on the specified representative tuple
+ * slot and grouping set.
+ *
+ * In the specified tuple slot, force to null all attributes that should be
+ * read as null in the context of the current grouping set. Also stash the
+ * current group bitmap where GroupingExpr can get at it.
+ *
+ * This relies on three conditions:
+ *
+ * 1) Nothing is ever going to try and extract the whole tuple from this slot,
+ * only reference it in evaluations, which will only access individual
+ * attributes.
+ *
+ * 2) No system columns are going to need to be nulled. (If a system column is
+ * referenced in a group clause, it is actually projected in the outer plan
+ * tlist.)
+ *
+ * 3) Within a given phase, we never need to recover the value of an attribute
+ * once it has been set to null.
+ *
+ * Poking into the slot this way is a bit ugly, but the consensus is that the
+ * alternative was worse.
+ */
+static void
+prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
+{
+ if (aggstate->phase->grouped_cols)
+ {
+ Bitmapset *grouped_cols = aggstate->phase->grouped_cols[currentSet];
+
+ aggstate->grouped_cols = grouped_cols;
+
+ if (TTS_EMPTY(slot))
+ {
+ /*
+ * Force all values to be NULL if working on an empty input tuple
+ * (i.e. an empty grouping set for which no input rows were
+ * supplied).
+ */
+ ExecStoreAllNullTuple(slot);
+ }
+ else if (aggstate->all_grouped_cols)
+ {
+ ListCell *lc;
+
+ /* all_grouped_cols is arranged in desc order */
+ slot_getsomeattrs(slot, linitial_int(aggstate->all_grouped_cols));
+
+ foreach(lc, aggstate->all_grouped_cols)
+ {
+ int attnum = lfirst_int(lc);
+
+ if (!bms_is_member(attnum, grouped_cols))
+ slot->tts_isnull[attnum - 1] = true;
+ }
+ }
+ }
+}
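+
+/*
+ * For example, with GROUP BY GROUPING SETS ((a, b), (a)): while emitting
+ * rows for the (a) set, attribute b is forced to NULL in the representative
+ * tuple above, and grouped_cols contains only a's attribute number, so
+ * GROUPING(b) evaluates to 1 for those rows.
+ */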
+
+/*
+ * Compute the final value of all aggregates for one group.
+ *
+ * This function handles only one grouping set at a time, which the caller must
+ * have selected. It's also the caller's responsibility to adjust the supplied
+ * pergroup parameter to point to the current set's transvalues.
+ *
+ * Results are stored in the output econtext aggvalues/aggnulls.
+ */
+static void
+finalize_aggregates(AggState *aggstate,
+ AggStatePerAgg peraggs,
+ AggStatePerGroup pergroup)
+{
+ ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
+ Datum *aggvalues = econtext->ecxt_aggvalues;
+ bool *aggnulls = econtext->ecxt_aggnulls;
+ int aggno;
+ int transno;
+
+ /*
+ * If there were any DISTINCT and/or ORDER BY aggregates, sort their
+ * inputs and run the transition functions.
+ */
+ for (transno = 0; transno < aggstate->numtrans; transno++)
+ {
+ AggStatePerTrans pertrans = &aggstate->pertrans[transno];
+ AggStatePerGroup pergroupstate;
+
+ pergroupstate = &pergroup[transno];
+
+ if (pertrans->numSortCols > 0)
+ {
+ Assert(aggstate->aggstrategy != AGG_HASHED &&
+ aggstate->aggstrategy != AGG_MIXED);
+
+ if (pertrans->numInputs == 1)
+ process_ordered_aggregate_single(aggstate,
+ pertrans,
+ pergroupstate);
+ else
+ process_ordered_aggregate_multi(aggstate,
+ pertrans,
+ pergroupstate);
+ }
+ }
+
+ /*
+ * Run the final functions.
+ */
+ for (aggno = 0; aggno < aggstate->numaggs; aggno++)
+ {
+ AggStatePerAgg peragg = &peraggs[aggno];
+ int transno = peragg->transno;
+ AggStatePerGroup pergroupstate;
+
+ pergroupstate = &pergroup[transno];
+
+ if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
+ finalize_partialaggregate(aggstate, peragg, pergroupstate,
+ &aggvalues[aggno], &aggnulls[aggno]);
+ else
+ finalize_aggregate(aggstate, peragg, pergroupstate,
+ &aggvalues[aggno], &aggnulls[aggno]);
+ }
+}
+
+/*
+ * Project the result of a group (whose aggs have already been calculated by
+ * finalize_aggregates). Returns the result slot, or NULL if no row is
+ * projected (suppressed by qual).
+ */
+static TupleTableSlot *
+project_aggregates(AggState *aggstate)
+{
+ ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
+
+ /*
+ * Check the qual (HAVING clause); if the group does not match, ignore it.
+ */
+ if (ExecQual(aggstate->ss.ps.qual, econtext))
+ {
+ /*
+ * Form and return projection tuple using the aggregate results and
+ * the representative input tuple.
+ */
+ return ExecProject(aggstate->ss.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered1(aggstate, 1);
+
+ return NULL;
+}
+
+/*
+ * Find input-tuple columns that are needed, dividing them into
+ * aggregated and unaggregated sets.
+ */
+static void
+find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
+{
+ Agg *agg = (Agg *) aggstate->ss.ps.plan;
+ FindColsContext context;
+
+ context.is_aggref = false;
+ context.aggregated = NULL;
+ context.unaggregated = NULL;
+
+ /* Examine tlist and quals */
+ (void) find_cols_walker((Node *) agg->plan.targetlist, &context);
+ (void) find_cols_walker((Node *) agg->plan.qual, &context);
+
+ /* In some cases, grouping columns will not appear in the tlist */
+ for (int i = 0; i < agg->numCols; i++)
+ context.unaggregated = bms_add_member(context.unaggregated,
+ agg->grpColIdx[i]);
+
+ *aggregated = context.aggregated;
+ *unaggregated = context.unaggregated;
+}
+
+static bool
+find_cols_walker(Node *node, FindColsContext *context)
+{
+ if (node == NULL)
+ return false;
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) node;
+
+ /* setrefs.c should have set the varno to OUTER_VAR */
+ Assert(var->varno == OUTER_VAR);
+ Assert(var->varlevelsup == 0);
+ if (context->is_aggref)
+ context->aggregated = bms_add_member(context->aggregated,
+ var->varattno);
+ else
+ context->unaggregated = bms_add_member(context->unaggregated,
+ var->varattno);
+ return false;
+ }
+ if (IsA(node, Aggref))
+ {
+ Assert(!context->is_aggref);
+ context->is_aggref = true;
+ expression_tree_walker(node, find_cols_walker, (void *) context);
+ context->is_aggref = false;
+ return false;
+ }
+ return expression_tree_walker(node, find_cols_walker,
+ (void *) context);
+}
+
+/*
+ * (Re-)initialize the hash table(s) to empty.
+ *
+ * To implement hashed aggregation, we need a hashtable that stores a
+ * representative tuple and an array of AggStatePerGroup structs for each
+ * distinct set of GROUP BY column values. We compute the hash key from the
+ * GROUP BY columns. The per-group data is allocated in lookup_hash_entry(),
+ * for each entry.
+ *
+ * We have a separate hashtable and associated perhash data structure for each
+ * grouping set for which we're doing hashing.
+ *
+ * The contents of the hash tables always live in the hashcontext's per-tuple
+ * memory context (there is only one of these for all tables together, since
+ * they are all reset at the same time).
+ */
+static void
+build_hash_tables(AggState *aggstate)
+{
+ int setno;
+
+ for (setno = 0; setno < aggstate->num_hashes; ++setno)
+ {
+ AggStatePerHash perhash = &aggstate->perhash[setno];
+ long nbuckets;
+ Size memory;
+
+ if (perhash->hashtable != NULL)
+ {
+ ResetTupleHashTable(perhash->hashtable);
+ continue;
+ }
+
+ Assert(perhash->aggnode->numGroups > 0);
+
+ memory = aggstate->hash_mem_limit / aggstate->num_hashes;
+
+ /* choose reasonable number of buckets per hashtable */
+ nbuckets = hash_choose_num_buckets(aggstate->hashentrysize,
+ perhash->aggnode->numGroups,
+ memory);
+
+ build_hash_table(aggstate, setno, nbuckets);
+ }
+
+ aggstate->hash_ngroups_current = 0;
+}
+
+/*
+ * Build a single hashtable for this grouping set.
+ */
+static void
+build_hash_table(AggState *aggstate, int setno, long nbuckets)
+{
+ AggStatePerHash perhash = &aggstate->perhash[setno];
+ MemoryContext metacxt = aggstate->hash_metacxt;
+ MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
+ MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
+ Size additionalsize;
+
+ Assert(aggstate->aggstrategy == AGG_HASHED ||
+ aggstate->aggstrategy == AGG_MIXED);
+
+ /*
+ * Used to make sure initial hash table allocation does not exceed
+ * hash_mem. Note that the estimate does not include space for
+ * pass-by-reference transition data values, nor for the representative
+ * tuple of each group.
+ */
+ additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);
+
+ perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps,
+ perhash->hashslot->tts_tupleDescriptor,
+ perhash->numCols,
+ perhash->hashGrpColIdxHash,
+ perhash->eqfuncoids,
+ perhash->hashfunctions,
+ perhash->aggnode->grpCollations,
+ nbuckets,
+ additionalsize,
+ metacxt,
+ hashcxt,
+ tmpcxt,
+ DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
+}
+
+/*
+ * Compute columns that actually need to be stored in hashtable entries. The
+ * incoming tuples from the child plan node will contain grouping columns,
+ * other columns referenced in our targetlist and qual, columns used to
+ * compute the aggregate functions, and perhaps just junk columns we don't use
+ * at all. Only columns of the first two types need to be stored in the
+ * hashtable, and getting rid of the others can make the table entries
+ * significantly smaller. The hashtable only contains the relevant columns,
+ * and is packed/unpacked in lookup_hash_entry() / agg_retrieve_hash_table()
+ * into the format of the normal input descriptor.
+ *
+ * Additional columns, beyond the grouping columns themselves, come from two
+ * sources: first, functionally dependent columns that we don't need to group
+ * by; and second, ctids for row-marks.
+ *
+ * To eliminate duplicates, we build a bitmapset of the needed columns, and
+ * then build an array of the columns included in the hashtable. We might
+ * still have duplicates if the passed-in grpColIdx has them, which can happen
+ * in edge cases from semijoins/distinct; these can't always be removed,
+ * because it's not certain that the duplicate cols will be using the same
+ * hash function.
+ *
+ * Note that the array is preserved over ExecReScanAgg, so we allocate it in
+ * the per-query context (unlike the hash table itself).
+ */
+static void
+find_hash_columns(AggState *aggstate)
+{
+ Bitmapset *base_colnos;
+ Bitmapset *aggregated_colnos;
+ TupleDesc scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
+ List *outerTlist = outerPlanState(aggstate)->plan->targetlist;
+ int numHashes = aggstate->num_hashes;
+ EState *estate = aggstate->ss.ps.state;
+ int j;
+
+ /* Find Vars that will be needed in tlist and qual */
+ find_cols(aggstate, &aggregated_colnos, &base_colnos);
+ aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos);
+ aggstate->max_colno_needed = 0;
+ aggstate->all_cols_needed = true;
+
+ for (int i = 0; i < scanDesc->natts; i++)
+ {
+ int colno = i + 1;
+
+ if (bms_is_member(colno, aggstate->colnos_needed))
+ aggstate->max_colno_needed = colno;
+ else
+ aggstate->all_cols_needed = false;
+ }
+
+ for (j = 0; j < numHashes; ++j)
+ {
+ AggStatePerHash perhash = &aggstate->perhash[j];
+ Bitmapset *colnos = bms_copy(base_colnos);
+ AttrNumber *grpColIdx = perhash->aggnode->grpColIdx;
+ List *hashTlist = NIL;
+ TupleDesc hashDesc;
+ int maxCols;
+ int i;
+
+ perhash->largestGrpColIdx = 0;
+
+ /*
+ * If we're doing grouping sets, then some Vars might be referenced in
+ * tlist/qual for the benefit of other grouping sets, but not needed
+ * when hashing; i.e. prepare_projection_slot will null them out, so
+ * there'd be no point storing them. Use prepare_projection_slot's
+ * logic to determine which.
+ */
+ if (aggstate->phases[0].grouped_cols)
+ {
+ Bitmapset *grouped_cols = aggstate->phases[0].grouped_cols[j];
+ ListCell *lc;
+
+ foreach(lc, aggstate->all_grouped_cols)
+ {
+ int attnum = lfirst_int(lc);
+
+ if (!bms_is_member(attnum, grouped_cols))
+ colnos = bms_del_member(colnos, attnum);
+ }
+ }
+
+ /*
+ * Compute maximum number of input columns accounting for possible
+ * duplications in the grpColIdx array, which can happen in some edge
+ * cases where HashAggregate was generated as part of a semijoin or a
+ * DISTINCT.
+ */
+ maxCols = bms_num_members(colnos) + perhash->numCols;
+
+ perhash->hashGrpColIdxInput =
+ palloc(maxCols * sizeof(AttrNumber));
+ perhash->hashGrpColIdxHash =
+ palloc(perhash->numCols * sizeof(AttrNumber));
+
+ /* Add all the grouping columns to colnos */
+ for (i = 0; i < perhash->numCols; i++)
+ colnos = bms_add_member(colnos, grpColIdx[i]);
+
+ /*
+ * First build mapping for columns directly hashed. These are the
+ * first, because they'll be accessed when computing hash values and
+ * comparing tuples for exact matches. We also build simple mapping
+ * for execGrouping, so it knows where to find the to-be-hashed /
+ * compared columns in the input.
+ */
+ for (i = 0; i < perhash->numCols; i++)
+ {
+ perhash->hashGrpColIdxInput[i] = grpColIdx[i];
+ perhash->hashGrpColIdxHash[i] = i + 1;
+ perhash->numhashGrpCols++;
+ /* delete already mapped columns */
+ bms_del_member(colnos, grpColIdx[i]);
+ }
+
+ /* and add the remaining columns */
+ while ((i = bms_first_member(colnos)) >= 0)
+ {
+ perhash->hashGrpColIdxInput[perhash->numhashGrpCols] = i;
+ perhash->numhashGrpCols++;
+ }
+
+ /* and build a tuple descriptor for the hashtable */
+ for (i = 0; i < perhash->numhashGrpCols; i++)
+ {
+ int varNumber = perhash->hashGrpColIdxInput[i] - 1;
+
+ hashTlist = lappend(hashTlist, list_nth(outerTlist, varNumber));
+ perhash->largestGrpColIdx =
+ Max(varNumber + 1, perhash->largestGrpColIdx);
+ }
+
+ hashDesc = ExecTypeFromTL(hashTlist);
+
+ execTuplesHashPrepare(perhash->numCols,
+ perhash->aggnode->grpOperators,
+ &perhash->eqfuncoids,
+ &perhash->hashfunctions);
+ perhash->hashslot =
+ ExecAllocTableSlot(&estate->es_tupleTable, hashDesc,
+ &TTSOpsMinimalTuple);
+
+ list_free(hashTlist);
+ bms_free(colnos);
+ }
+
+ bms_free(base_colnos);
+}
+
+/*
+ * Estimate per-hash-table-entry overhead.
+ */
+Size
+hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
+{
+ Size tupleChunkSize;
+ Size pergroupChunkSize;
+ Size transitionChunkSize;
+ Size tupleSize = (MAXALIGN(SizeofMinimalTupleHeader) +
+ tupleWidth);
+ Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
+
+ tupleChunkSize = CHUNKHDRSZ + tupleSize;
+
+ if (pergroupSize > 0)
+ pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
+ else
+ pergroupChunkSize = 0;
+
+ if (transitionSpace > 0)
+ transitionChunkSize = CHUNKHDRSZ + transitionSpace;
+ else
+ transitionChunkSize = 0;
+
+ return
+ sizeof(TupleHashEntryData) +
+ tupleChunkSize +
+ pergroupChunkSize +
+ transitionChunkSize;
+}
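+
+/*
+ * Editor's illustration (not part of the original file): a minimal sketch of
+ * how a caller might use hash_agg_entry_size() to turn the per-entry
+ * estimate into a rough table-size figure. The argument values and the
+ * example_hash_table_bytes name are assumptions made up for the example.
+ */
+#ifdef NOT_USED
+static Size
+example_hash_table_bytes(void)
+{
+ /* 2 transition states, ~40-byte grouping keys, no by-ref transition space */
+ Size per_entry = hash_agg_entry_size(2, 40, 0);
+
+ /* rough estimate of table memory for ~1000 distinct groups */
+ return per_entry * 1000;
+}
+#endif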
+
+/*
+ * hashagg_recompile_expressions()
+ *
+ * Identifies the right phase, compiles the right expression given the
+ * arguments, and then sets phase->evalfunc to that expression.
+ *
+ * Different versions of the compiled expression are needed depending on
+ * whether hash aggregation has spilled or not, and whether it's reading from
+ * the outer plan or a tape. Before spilling to disk, the expression reads
+ * from the outer plan and does not need to perform a NULL check. After
+ * HashAgg begins to spill, new groups will not be created in the hash table,
+ * and the AggStatePerGroup array may be NULL; therefore we need to add a null
+ * pointer check to the expression. Then, when reading spilled data from a
+ * tape, we change the outer slot type to be a fixed minimal tuple slot.
+ *
+ * It would be wasteful to recompile every time, so cache the compiled
+ * expressions in the AggStatePerPhase, and reuse when appropriate.
+ */
+static void
+hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
+{
+ AggStatePerPhase phase;
+ int i = minslot ? 1 : 0;
+ int j = nullcheck ? 1 : 0;
+
+ Assert(aggstate->aggstrategy == AGG_HASHED ||
+ aggstate->aggstrategy == AGG_MIXED);
+
+ if (aggstate->aggstrategy == AGG_HASHED)
+ phase = &aggstate->phases[0];
+ else /* AGG_MIXED */
+ phase = &aggstate->phases[1];
+
+ if (phase->evaltrans_cache[i][j] == NULL)
+ {
+ const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops;
+ bool outerfixed = aggstate->ss.ps.outeropsfixed;
+ bool dohash = true;
+ bool dosort = false;
+
+ /*
+ * If minslot is true, that means we are processing a spilled batch
+ * (inside agg_refill_hash_table()), and we must not advance the
+ * sorted grouping sets.
+ */
+ if (aggstate->aggstrategy == AGG_MIXED && !minslot)
+ dosort = true;
+
+ /* temporarily change the outerops while compiling the expression */
+ if (minslot)
+ {
+ aggstate->ss.ps.outerops = &TTSOpsMinimalTuple;
+ aggstate->ss.ps.outeropsfixed = true;
+ }
+
+ phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase,
+ dosort, dohash,
+ nullcheck);
+
+ /* change back */
+ aggstate->ss.ps.outerops = outerops;
+ aggstate->ss.ps.outeropsfixed = outerfixed;
+ }
+
+ phase->evaltrans = phase->evaltrans_cache[i][j];
+}
+
+/*
+ * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
+ * number of partitions we expect to create (if we do spill).
+ *
+ * There are two limits: a memory limit, and also an ngroups limit. The
+ * ngroups limit becomes important when we expect transition values to grow
+ * substantially larger than the initial value.
+ */
+void
+hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
+ Size *mem_limit, uint64 *ngroups_limit,
+ int *num_partitions)
+{
+ int npartitions;
+ Size partition_mem;
+ Size hash_mem_limit = get_hash_memory_limit();
+
+ /* if not expected to spill, use all of hash_mem */
+ if (input_groups * hashentrysize <= hash_mem_limit)
+ {
+ if (num_partitions != NULL)
+ *num_partitions = 0;
+ *mem_limit = hash_mem_limit;
+ *ngroups_limit = hash_mem_limit / hashentrysize;
+ return;
+ }
+
+ /*
+ * Calculate expected memory requirements for spilling, which is the size
+ * of the buffers needed for all the tapes that need to be open at once.
+ * Then, subtract that from the memory available for holding hash tables.
+ */
+ npartitions = hash_choose_num_partitions(input_groups,
+ hashentrysize,
+ used_bits,
+ NULL);
+ if (num_partitions != NULL)
+ *num_partitions = npartitions;
+
+ partition_mem =
+ HASHAGG_READ_BUFFER_SIZE +
+ HASHAGG_WRITE_BUFFER_SIZE * npartitions;
+
+ /*
+ * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
+ * minimum number of partitions, so we aren't going to dramatically exceed
+ * hash_mem anyway.
+ */
+ if (hash_mem_limit > 4 * partition_mem)
+ *mem_limit = hash_mem_limit - partition_mem;
+ else
+ *mem_limit = hash_mem_limit * 0.75;
+
+ if (*mem_limit > hashentrysize)
+ *ngroups_limit = *mem_limit / hashentrysize;
+ else
+ *ngroups_limit = 1;
+}
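+
+/*
+ * Editor's illustration (not part of the original file): a hedged sketch of
+ * calling hash_agg_set_limits() with made-up estimates, as the executor does
+ * when (re)building a hash table. If input_groups * hashentrysize fits in
+ * hash_mem, num_partitions comes back 0 and the full limit is usable;
+ * otherwise buffer space for the spill tapes is reserved first. The function
+ * name and numbers are assumptions for the example only.
+ */
+#ifdef NOT_USED
+static void
+example_set_limits(void)
+{
+ Size mem_limit;
+ uint64 ngroups_limit;
+ int npartitions;
+
+ /* assumed estimates: 100-byte entries, 100,000 input groups, no used bits */
+ hash_agg_set_limits(100.0, 100000.0, 0,
+ &mem_limit, &ngroups_limit, &npartitions);
+
+ /* mem_limit / ngroups_limit now hold the thresholds that trigger spilling */
+}
+#endif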
+
+/*
+ * hash_agg_check_limits
+ *
+ * After adding a new group to the hash table, check whether we need to enter
+ * spill mode. Allocations may happen without adding new groups (for instance,
+ * if the transition state size grows), so this check is imperfect.
+ */
+static void
+hash_agg_check_limits(AggState *aggstate)
+{
+ uint64 ngroups = aggstate->hash_ngroups_current;
+ Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
+ true);
+ Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+ true);
+
+ /*
+ * Don't spill unless there's at least one group in the hash table so we
+ * can be sure to make progress even in edge cases.
+ */
+ if (aggstate->hash_ngroups_current > 0 &&
+ (meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
+ ngroups > aggstate->hash_ngroups_limit))
+ {
+ hash_agg_enter_spill_mode(aggstate);
+ }
+}
+
+/*
+ * Enter "spill mode", meaning that no new groups are added to any of the hash
+ * tables. Tuples that would create a new group are instead spilled, and
+ * processed later.
+ */
+static void
+hash_agg_enter_spill_mode(AggState *aggstate)
+{
+ aggstate->hash_spill_mode = true;
+ hashagg_recompile_expressions(aggstate, aggstate->table_filled, true);
+
+ if (!aggstate->hash_ever_spilled)
+ {
+ Assert(aggstate->hash_tapeinfo == NULL);
+ Assert(aggstate->hash_spills == NULL);
+
+ aggstate->hash_ever_spilled = true;
+
+ hashagg_tapeinfo_init(aggstate);
+
+ aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes);
+
+ for (int setno = 0; setno < aggstate->num_hashes; setno++)
+ {
+ AggStatePerHash perhash = &aggstate->perhash[setno];
+ HashAggSpill *spill = &aggstate->hash_spills[setno];
+
+ hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0,
+ perhash->aggnode->numGroups,
+ aggstate->hashentrysize);
+ }
+ }
+}
+
+/*
+ * Update metrics after filling the hash table.
+ *
+ * If reading from the outer plan, from_tape should be false; if reading from
+ * another tape, from_tape should be true.
+ */
+static void
+hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
+{
+ Size meta_mem;
+ Size hashkey_mem;
+ Size buffer_mem;
+ Size total_mem;
+
+ if (aggstate->aggstrategy != AGG_MIXED &&
+ aggstate->aggstrategy != AGG_HASHED)
+ return;
+
+ /* memory for the hash table itself */
+ meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
+
+ /* memory for the group keys and transition states */
+ hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
+
+ /* memory for read/write tape buffers, if spilled */
+ buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE;
+ if (from_tape)
+ buffer_mem += HASHAGG_READ_BUFFER_SIZE;
+
+ /* update peak mem */
+ total_mem = meta_mem + hashkey_mem + buffer_mem;
+ if (total_mem > aggstate->hash_mem_peak)
+ aggstate->hash_mem_peak = total_mem;
+
+ /* update disk usage */
+ if (aggstate->hash_tapeinfo != NULL)
+ {
+ uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeinfo->tapeset) * (BLCKSZ / 1024);
+
+ if (aggstate->hash_disk_used < disk_used)
+ aggstate->hash_disk_used = disk_used;
+ }
+
+ /* update hashentrysize estimate based on contents */
+ if (aggstate->hash_ngroups_current > 0)
+ {
+ aggstate->hashentrysize =
+ sizeof(TupleHashEntryData) +
+ (hashkey_mem / (double) aggstate->hash_ngroups_current);
+ }
+}
+
+/*
+ * Choose a reasonable number of buckets for the initial hash table size.
+ */
+static long
+hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory)
+{
+ long max_nbuckets;
+ long nbuckets = ngroups;
+
+ max_nbuckets = memory / hashentrysize;
+
+ /*
+ * Underestimating is better than overestimating. Too many buckets crowd
+ * out space for group keys and transition state values.
+ */
+ max_nbuckets >>= 1;
+
+ if (nbuckets > max_nbuckets)
+ nbuckets = max_nbuckets;
+
+ return Max(nbuckets, 1);
+}
+
+/*
+ * Determine the number of partitions to create when spilling, which will
+ * always be a power of two. If log2_npartitions is non-NULL, set
+ * *log2_npartitions to the log2() of the number of partitions.
+ */
+static int
+hash_choose_num_partitions(double input_groups, double hashentrysize,
+ int used_bits, int *log2_npartitions)
+{
+ Size hash_mem_limit = get_hash_memory_limit();
+ double partition_limit;
+ double mem_wanted;
+ double dpartitions;
+ int npartitions;
+ int partition_bits;
+
+ /*
+ * Avoid creating so many partitions that the memory requirements of the
+ * open partition files are greater than 1/4 of hash_mem.
+ */
+ partition_limit =
+ (hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+ HASHAGG_WRITE_BUFFER_SIZE;
+
+ mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
+
+ /* make enough partitions so that each one is likely to fit in memory */
+ dpartitions = 1 + (mem_wanted / hash_mem_limit);
+
+ if (dpartitions > partition_limit)
+ dpartitions = partition_limit;
+
+ if (dpartitions < HASHAGG_MIN_PARTITIONS)
+ dpartitions = HASHAGG_MIN_PARTITIONS;
+ if (dpartitions > HASHAGG_MAX_PARTITIONS)
+ dpartitions = HASHAGG_MAX_PARTITIONS;
+
+ /* HASHAGG_MAX_PARTITIONS limit makes this safe */
+ npartitions = (int) dpartitions;
+
+ /* ceil(log2(npartitions)) */
+ partition_bits = my_log2(npartitions);
+
+ /* make sure that we don't exhaust the hash bits */
+ if (partition_bits + used_bits >= 32)
+ partition_bits = 32 - used_bits;
+
+ if (log2_npartitions != NULL)
+ *log2_npartitions = partition_bits;
+
+ /* number of partitions will be a power of two */
+ npartitions = 1 << partition_bits;
+
+ return npartitions;
+}
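+
+/*
+ * Editor's illustration (not part of the original file): the rounding step
+ * above in isolation. A raw estimate of e.g. 6 partitions becomes 8, and
+ * partition_bits is clamped so that used_bits + partition_bits never exceeds
+ * the 32 bits of hash available. The function name is made up for the
+ * example.
+ */
+#ifdef NOT_USED
+static int
+example_round_partitions(int raw_partitions, int used_bits)
+{
+ int partition_bits = my_log2(raw_partitions); /* ceil(log2(n)) */
+
+ if (partition_bits + used_bits >= 32)
+ partition_bits = 32 - used_bits;
+
+ return 1 << partition_bits;
+}
+#endif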
+
+/*
+ * Initialize a freshly-created TupleHashEntry.
+ */
+static void
+initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable,
+ TupleHashEntry entry)
+{
+ AggStatePerGroup pergroup;
+ int transno;
+
+ aggstate->hash_ngroups_current++;
+ hash_agg_check_limits(aggstate);
+
+ /* no need to allocate or initialize per-group state */
+ if (aggstate->numtrans == 0)
+ return;
+
+ pergroup = (AggStatePerGroup)
+ MemoryContextAlloc(hashtable->tablecxt,
+ sizeof(AggStatePerGroupData) * aggstate->numtrans);
+
+ entry->additional = pergroup;
+
+ /*
+ * Initialize aggregates for the new tuple group; lookup_hash_entries()
+ * has already selected the relevant grouping set.
+ */
+ for (transno = 0; transno < aggstate->numtrans; transno++)
+ {
+ AggStatePerTrans pertrans = &aggstate->pertrans[transno];
+ AggStatePerGroup pergroupstate = &pergroup[transno];
+
+ initialize_aggregate(aggstate, pertrans, pergroupstate);
+ }
+}
+
+/*
+ * Look up hash entries for the current tuple in all hashed grouping sets.
+ *
+ * Be aware that the hash table lookups here can reset the tmpcontext.
+ *
+ * Some entries may be left NULL if we are in "spill mode". The same tuple
+ * will belong to different groups for each grouping set, so it may match a group
+ * already in memory for one set and match a group not in memory for another
+ * set. When in "spill mode", the tuple will be spilled for each grouping set
+ * where it doesn't match a group in memory.
+ *
+ * NB: It's possible to spill the same tuple for several different grouping
+ * sets. This may seem wasteful, but it's actually a trade-off: if we spill
+ * the tuple multiple times for multiple grouping sets, it can be partitioned
+ * for each grouping set, making the refilling of the hash table very
+ * efficient.
+ */
+static void
+lookup_hash_entries(AggState *aggstate)
+{
+ AggStatePerGroup *pergroup = aggstate->hash_pergroup;
+ TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple;
+ int setno;
+
+ for (setno = 0; setno < aggstate->num_hashes; setno++)
+ {
+ AggStatePerHash perhash = &aggstate->perhash[setno];
+ TupleHashTable hashtable = perhash->hashtable;
+ TupleTableSlot *hashslot = perhash->hashslot;
+ TupleHashEntry entry;
+ uint32 hash;
+ bool isnew = false;
+ bool *p_isnew;
+
+ /* if hash table already spilled, don't create new entries */
+ p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
+
+ select_current_set(aggstate, setno, true);
+ prepare_hash_slot(perhash,
+ outerslot,
+ hashslot);
+
+ entry = LookupTupleHashEntry(hashtable, hashslot,
+ p_isnew, &hash);
+
+ if (entry != NULL)
+ {
+ if (isnew)
+ initialize_hash_entry(aggstate, hashtable, entry);
+ pergroup[setno] = entry->additional;
+ }
+ else
+ {
+ HashAggSpill *spill = &aggstate->hash_spills[setno];
+ TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;
+
+ if (spill->partitions == NULL)
+ hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0,
+ perhash->aggnode->numGroups,
+ aggstate->hashentrysize);
+
+ hashagg_spill_tuple(aggstate, spill, slot, hash);
+ pergroup[setno] = NULL;
+ }
+ }
+}
+
+/*
+ * ExecAgg -
+ *
+ * ExecAgg receives tuples from its outer subplan and aggregates over
+ * the appropriate attribute for each aggregate function use (Aggref
+ * node) appearing in the targetlist or qual of the node. The number
+ * of tuples to aggregate over depends on whether grouped or plain
+ * aggregation is selected. In grouped aggregation, we produce a result
+ * row for each group; in plain aggregation there's a single result row
+ * for the whole query. In either case, the value of each aggregate is
+ * stored in the expression context to be used when ExecProject evaluates
+ * the result tuple.
+ */
+static TupleTableSlot *
+ExecAgg(PlanState *pstate)
+{
+ AggState *node = castNode(AggState, pstate);
+ TupleTableSlot *result = NULL;
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!node->agg_done)
+ {
+ /* Dispatch based on strategy */
+ switch (node->phase->aggstrategy)
+ {
+ case AGG_HASHED:
+ if (!node->table_filled)
+ agg_fill_hash_table(node);
+ /* FALLTHROUGH */
+ case AGG_MIXED:
+ result = agg_retrieve_hash_table(node);
+ break;
+ case AGG_PLAIN:
+ case AGG_SORTED:
+ result = agg_retrieve_direct(node);
+ break;
+ }
+
+ if (!TupIsNull(result))
+ return result;
+ }
+
+ return NULL;
+}
+
+/*
+ * ExecAgg for non-hashed case
+ */
+static TupleTableSlot *
+agg_retrieve_direct(AggState *aggstate)
+{
+ Agg *node = aggstate->phase->aggnode;
+ ExprContext *econtext;
+ ExprContext *tmpcontext;
+ AggStatePerAgg peragg;
+ AggStatePerGroup *pergroups;
+ TupleTableSlot *outerslot;
+ TupleTableSlot *firstSlot;
+ TupleTableSlot *result;
+ bool hasGroupingSets = aggstate->phase->numsets > 0;
+ int numGroupingSets = Max(aggstate->phase->numsets, 1);
+ int currentSet;
+ int nextSetSize;
+ int numReset;
+ int i;
+
+ /*
+ * get state info from node
+ *
+ * econtext is the per-output-tuple expression context
+ *
+ * tmpcontext is the per-input-tuple expression context
+ */
+ econtext = aggstate->ss.ps.ps_ExprContext;
+ tmpcontext = aggstate->tmpcontext;
+
+ peragg = aggstate->peragg;
+ pergroups = aggstate->pergroups;
+ firstSlot = aggstate->ss.ss_ScanTupleSlot;
+
+ /*
+ * We loop retrieving groups until we find one matching
+ * aggstate->ss.ps.qual
+ *
+ * For grouping sets, we have the invariant that aggstate->projected_set
+ * is either -1 (initial call) or the index (starting from 0) in
+ * gset_lengths for the group we just completed (either by projecting a
+ * row or by discarding it in the qual).
+ */
+ while (!aggstate->agg_done)
+ {
+ /*
+ * Clear the per-output-tuple context for each group, as well as
+ * aggcontext (which contains any pass-by-ref transvalues of the old
+ * group). Some aggregate functions store working state in child
+ * contexts; those now get reset automatically without us needing to
+ * do anything special.
+ *
+ * We use ReScanExprContext not just ResetExprContext because we want
+ * any registered shutdown callbacks to be called. That allows
+ * aggregate functions to ensure they've cleaned up any non-memory
+ * resources.
+ */
+ ReScanExprContext(econtext);
+
+ /*
+ * Determine how many grouping sets need to be reset at this boundary.
+ */
+ if (aggstate->projected_set >= 0 &&
+ aggstate->projected_set < numGroupingSets)
+ numReset = aggstate->projected_set + 1;
+ else
+ numReset = numGroupingSets;
+
+ /*
+ * numReset can change on a phase boundary, but that's OK; we want to
+ * reset the contexts used in _this_ phase, and later, after possibly
+ * changing phase, initialize the right number of aggregates for the
+ * _new_ phase.
+ */
+
+ for (i = 0; i < numReset; i++)
+ {
+ ReScanExprContext(aggstate->aggcontexts[i]);
+ }
+
+ /*
+ * Check if input is complete and there are no more groups to project
+ * in this phase; move to next phase or mark as done.
+ */
+ if (aggstate->input_done == true &&
+ aggstate->projected_set >= (numGroupingSets - 1))
+ {
+ if (aggstate->current_phase < aggstate->numphases - 1)
+ {
+ initialize_phase(aggstate, aggstate->current_phase + 1);
+ aggstate->input_done = false;
+ aggstate->projected_set = -1;
+ numGroupingSets = Max(aggstate->phase->numsets, 1);
+ node = aggstate->phase->aggnode;
+ numReset = numGroupingSets;
+ }
+ else if (aggstate->aggstrategy == AGG_MIXED)
+ {
+ /*
+ * Mixed mode; we've output all the grouped stuff and have
+ * full hashtables, so switch to outputting those.
+ */
+ initialize_phase(aggstate, 0);
+ aggstate->table_filled = true;
+ ResetTupleHashIterator(aggstate->perhash[0].hashtable,
+ &aggstate->perhash[0].hashiter);
+ select_current_set(aggstate, 0, true);
+ return agg_retrieve_hash_table(aggstate);
+ }
+ else
+ {
+ aggstate->agg_done = true;
+ break;
+ }
+ }
+
+ /*
+ * Get the number of columns in the next grouping set after the last
+ * projected one (if any). This is the number of columns to compare to
+ * see if we reached the boundary of that set too.
+ */
+ if (aggstate->projected_set >= 0 &&
+ aggstate->projected_set < (numGroupingSets - 1))
+ nextSetSize = aggstate->phase->gset_lengths[aggstate->projected_set + 1];
+ else
+ nextSetSize = 0;
+
+ /*----------
+ * If a subgroup for the current grouping set is present, project it.
+ *
+ * We have a new group if:
+ * - we're out of input but haven't projected all grouping sets
+ * (checked above)
+ * OR
+ * - we already projected a row that wasn't from the last grouping
+ * set
+ * AND
+ * - the next grouping set has at least one grouping column (since
+ * empty grouping sets project only once input is exhausted)
+ * AND
+ * - the previous and pending rows differ on the grouping columns
+ * of the next grouping set
+ *----------
+ */
+ tmpcontext->ecxt_innertuple = econtext->ecxt_outertuple;
+ if (aggstate->input_done ||
+ (node->aggstrategy != AGG_PLAIN &&
+ aggstate->projected_set != -1 &&
+ aggstate->projected_set < (numGroupingSets - 1) &&
+ nextSetSize > 0 &&
+ !ExecQualAndReset(aggstate->phase->eqfunctions[nextSetSize - 1],
+ tmpcontext)))
+ {
+ aggstate->projected_set += 1;
+
+ Assert(aggstate->projected_set < numGroupingSets);
+ Assert(nextSetSize > 0 || aggstate->input_done);
+ }
+ else
+ {
+ /*
+ * We no longer care what group we just projected, the next
+ * projection will always be the first (or only) grouping set
+ * (unless the input proves to be empty).
+ */
+ aggstate->projected_set = 0;
+
+ /*
+ * If we don't already have the first tuple of the new group,
+ * fetch it from the outer plan.
+ */
+ if (aggstate->grp_firstTuple == NULL)
+ {
+ outerslot = fetch_input_tuple(aggstate);
+ if (!TupIsNull(outerslot))
+ {
+ /*
+ * Make a copy of the first input tuple; we will use this
+ * for comparisons (in group mode) and for projection.
+ */
+ aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
+ }
+ else
+ {
+ /* outer plan produced no tuples at all */
+ if (hasGroupingSets)
+ {
+ /*
+ * If there was no input at all, we need to project
+ * rows only if there are grouping sets of size 0.
+ * Note that this implies that there can't be any
+ * references to ungrouped Vars, which would otherwise
+ * cause issues with the empty output slot.
+ *
+ * XXX: This is no longer true; we currently deal with
+ * this in finalize_aggregates().
+ */
+ aggstate->input_done = true;
+
+ while (aggstate->phase->gset_lengths[aggstate->projected_set] > 0)
+ {
+ aggstate->projected_set += 1;
+ if (aggstate->projected_set >= numGroupingSets)
+ {
+ /*
+ * We can't set agg_done here because we might
+ * have more phases to do, even though the
+ * input is empty. So we need to restart the
+ * whole outer loop.
+ */
+ break;
+ }
+ }
+
+ if (aggstate->projected_set >= numGroupingSets)
+ continue;
+ }
+ else
+ {
+ aggstate->agg_done = true;
+ /* If we are grouping, we produce no rows at all for empty input */
+ if (node->aggstrategy != AGG_PLAIN)
+ return NULL;
+ }
+ }
+ }
+
+ /*
+ * Initialize working state for a new input tuple group.
+ */
+ initialize_aggregates(aggstate, pergroups, numReset);
+
+ if (aggstate->grp_firstTuple != NULL)
+ {
+ /*
+ * Store the copied first input tuple in the tuple table slot
+ * reserved for it. The tuple will be deleted when it is
+ * cleared from the slot.
+ */
+ ExecForceStoreHeapTuple(aggstate->grp_firstTuple,
+ firstSlot, true);
+ aggstate->grp_firstTuple = NULL; /* don't keep two pointers */
+
+ /* set up for first advance_aggregates call */
+ tmpcontext->ecxt_outertuple = firstSlot;
+
+ /*
+ * Process each outer-plan tuple, and then fetch the next one,
+ * until we exhaust the outer plan or cross a group boundary.
+ */
+ for (;;)
+ {
+ /*
+ * During phase 1 only of a mixed agg, we need to update
+ * hashtables as well in advance_aggregates.
+ */
+ if (aggstate->aggstrategy == AGG_MIXED &&
+ aggstate->current_phase == 1)
+ {
+ lookup_hash_entries(aggstate);
+ }
+
+ /* Advance the aggregates (or combine functions) */
+ advance_aggregates(aggstate);
+
+ /* Reset per-input-tuple context after each tuple */
+ ResetExprContext(tmpcontext);
+
+ outerslot = fetch_input_tuple(aggstate);
+ if (TupIsNull(outerslot))
+ {
+ /* no more outer-plan tuples available */
+
+ /* if we built hash tables, finalize any spills */
+ if (aggstate->aggstrategy == AGG_MIXED &&
+ aggstate->current_phase == 1)
+ hashagg_finish_initial_spills(aggstate);
+
+ if (hasGroupingSets)
+ {
+ aggstate->input_done = true;
+ break;
+ }
+ else
+ {
+ aggstate->agg_done = true;
+ break;
+ }
+ }
+ /* set up for next advance_aggregates call */
+ tmpcontext->ecxt_outertuple = outerslot;
+
+ /*
+ * If we are grouping, check whether we've crossed a group
+ * boundary.
+ */
+ if (node->aggstrategy != AGG_PLAIN)
+ {
+ tmpcontext->ecxt_innertuple = firstSlot;
+ if (!ExecQual(aggstate->phase->eqfunctions[node->numCols - 1],
+ tmpcontext))
+ {
+ aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Use the representative input tuple for any references to
+ * non-aggregated input columns in aggregate direct args, the node
+ * qual, and the tlist. (If we are not grouping, and there are no
+ * input rows at all, we will come here with an empty firstSlot
+ * ... but if not grouping, there can't be any references to
+ * non-aggregated input columns, so no problem.)
+ */
+ econtext->ecxt_outertuple = firstSlot;
+ }
+
+ Assert(aggstate->projected_set >= 0);
+
+ currentSet = aggstate->projected_set;
+
+ prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet);
+
+ select_current_set(aggstate, currentSet, false);
+
+ finalize_aggregates(aggstate,
+ peragg,
+ pergroups[currentSet]);
+
+ /*
+ * If there's no row to project right now, we must continue rather
+ * than returning a null since there might be more groups.
+ */
+ result = project_aggregates(aggstate);
+ if (result)
+ return result;
+ }
+
+ /* No more groups */
+ return NULL;
+}
+
+/*
+ * ExecAgg for hashed case: read input and build hash table
+ */
+static void
+agg_fill_hash_table(AggState *aggstate)
+{
+ TupleTableSlot *outerslot;
+ ExprContext *tmpcontext = aggstate->tmpcontext;
+
+ /*
+ * Process each outer-plan tuple, and then fetch the next one, until we
+ * exhaust the outer plan.
+ */
+ for (;;)
+ {
+ outerslot = fetch_input_tuple(aggstate);
+ if (TupIsNull(outerslot))
+ break;
+
+ /* set up for lookup_hash_entries and advance_aggregates */
+ tmpcontext->ecxt_outertuple = outerslot;
+
+ /* Find or build hashtable entries */
+ lookup_hash_entries(aggstate);
+
+ /* Advance the aggregates (or combine functions) */
+ advance_aggregates(aggstate);
+
+ /*
+ * Reset per-input-tuple context after each tuple, but note that the
+ * hash lookups do this too
+ */
+ ResetExprContext(aggstate->tmpcontext);
+ }
+
+ /* finalize spills, if any */
+ hashagg_finish_initial_spills(aggstate);
+
+ aggstate->table_filled = true;
+ /* Initialize to walk the first hash table */
+ select_current_set(aggstate, 0, true);
+ ResetTupleHashIterator(aggstate->perhash[0].hashtable,
+ &aggstate->perhash[0].hashiter);
+}
+
+/*
+ * If any data was spilled during hash aggregation, reset the hash table and
+ * reprocess one batch of spilled data. After reprocessing a batch, the hash
+ * table will again contain data, ready to be consumed by
+ * agg_retrieve_hash_table_in_memory().
+ *
+ * Should only be called after all in-memory hash table entries have been
+ * finalized and emitted.
+ *
+ * Return false when input is exhausted and there's no more work to be done;
+ * otherwise return true.
+ */
+static bool
+agg_refill_hash_table(AggState *aggstate)
+{
+ HashAggBatch *batch;
+ AggStatePerHash perhash;
+ HashAggSpill spill;
+ HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
+ bool spill_initialized = false;
+
+ if (aggstate->hash_batches == NIL)
+ return false;
+
+ /* hash_batches is a stack, with the top item at the end of the list */
+ batch = llast(aggstate->hash_batches);
+ aggstate->hash_batches = list_delete_last(aggstate->hash_batches);
+
+ hash_agg_set_limits(aggstate->hashentrysize, batch->input_card,
+ batch->used_bits, &aggstate->hash_mem_limit,
+ &aggstate->hash_ngroups_limit, NULL);
+
+ /*
+ * Each batch only processes one grouping set; set the rest to NULL so
+ * that advance_aggregates() knows to ignore them. We don't touch
+ * pergroups for sorted grouping sets here, because they will be needed if
+ * we rescan later. The expressions for sorted grouping sets will not be
+ * evaluated after we recompile anyway.
+ */
+ MemSet(aggstate->hash_pergroup, 0,
+ sizeof(AggStatePerGroup) * aggstate->num_hashes);
+
+ /* free memory and reset hash tables */
+ ReScanExprContext(aggstate->hashcontext);
+ for (int setno = 0; setno < aggstate->num_hashes; setno++)
+ ResetTupleHashTable(aggstate->perhash[setno].hashtable);
+
+ aggstate->hash_ngroups_current = 0;
+
+ /*
+ * In AGG_MIXED mode, hash aggregation happens in phase 1 and the output
+ * happens in phase 0. So, we switch to phase 1 when processing a batch,
+ * and back to phase 0 after the batch is done.
+ */
+ Assert(aggstate->current_phase == 0);
+ if (aggstate->phase->aggstrategy == AGG_MIXED)
+ {
+ aggstate->current_phase = 1;
+ aggstate->phase = &aggstate->phases[aggstate->current_phase];
+ }
+
+ select_current_set(aggstate, batch->setno, true);
+
+ perhash = &aggstate->perhash[aggstate->current_set];
+
+ /*
+ * Spilled tuples are always read back as MinimalTuples, which may be
+ * different from the outer plan, so recompile the aggregate expressions.
+ *
+ * We still need the NULL check, because we are only processing one
+ * grouping set at a time and the rest will be NULL.
+ */
+ hashagg_recompile_expressions(aggstate, true, true);
+
+ for (;;)
+ {
+ TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
+ TupleTableSlot *hashslot = perhash->hashslot;
+ TupleHashEntry entry;
+ MinimalTuple tuple;
+ uint32 hash;
+ bool isnew = false;
+ bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
+
+ CHECK_FOR_INTERRUPTS();
+
+ tuple = hashagg_batch_read(batch, &hash);
+ if (tuple == NULL)
+ break;
+
+ ExecStoreMinimalTuple(tuple, spillslot, true);
+ aggstate->tmpcontext->ecxt_outertuple = spillslot;
+
+ prepare_hash_slot(perhash,
+ aggstate->tmpcontext->ecxt_outertuple,
+ hashslot);
+ entry = LookupTupleHashEntryHash(
+ perhash->hashtable, hashslot, p_isnew, hash);
+
+ if (entry != NULL)
+ {
+ if (isnew)
+ initialize_hash_entry(aggstate, perhash->hashtable, entry);
+ aggstate->hash_pergroup[batch->setno] = entry->additional;
+ advance_aggregates(aggstate);
+ }
+ else
+ {
+ if (!spill_initialized)
+ {
+ /*
+ * Avoid initializing the spill until we actually need it so
+ * that we don't assign tapes that will never be used.
+ */
+ spill_initialized = true;
+ hashagg_spill_init(&spill, tapeinfo, batch->used_bits,
+ batch->input_card, aggstate->hashentrysize);
+ }
+ /* no memory for a new group, spill */
+ hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
+
+ aggstate->hash_pergroup[batch->setno] = NULL;
+ }
+
+ /*
+ * Reset per-input-tuple context after each tuple, but note that the
+ * hash lookups do this too
+ */
+ ResetExprContext(aggstate->tmpcontext);
+ }
+
+ hashagg_tapeinfo_release(tapeinfo, batch->input_tapenum);
+
+ /* change back to phase 0 */
+ aggstate->current_phase = 0;
+ aggstate->phase = &aggstate->phases[aggstate->current_phase];
+
+ if (spill_initialized)
+ {
+ hashagg_spill_finish(aggstate, &spill, batch->setno);
+ hash_agg_update_metrics(aggstate, true, spill.npartitions);
+ }
+ else
+ hash_agg_update_metrics(aggstate, true, 0);
+
+ aggstate->hash_spill_mode = false;
+
+ /* prepare to walk the first hash table */
+ select_current_set(aggstate, batch->setno, true);
+ ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable,
+ &aggstate->perhash[batch->setno].hashiter);
+
+ pfree(batch);
+
+ return true;
+}
+
+/*
+ * ExecAgg for hashed case: retrieving groups from hash table
+ *
+ * After exhausting in-memory tuples, also try refilling the hash table using
+ * previously-spilled tuples. Only returns NULL after all in-memory and
+ * spilled tuples are exhausted.
+ */
+static TupleTableSlot *
+agg_retrieve_hash_table(AggState *aggstate)
+{
+ TupleTableSlot *result = NULL;
+
+ while (result == NULL)
+ {
+ result = agg_retrieve_hash_table_in_memory(aggstate);
+ if (result == NULL)
+ {
+ if (!agg_refill_hash_table(aggstate))
+ {
+ aggstate->agg_done = true;
+ break;
+ }
+ }
+ }
+
+ return result;
+}
+
+/*
+ * Retrieve the groups from the in-memory hash tables without considering any
+ * spilled tuples.
+ */
+static TupleTableSlot *
+agg_retrieve_hash_table_in_memory(AggState *aggstate)
+{
+ ExprContext *econtext;
+ AggStatePerAgg peragg;
+ AggStatePerGroup pergroup;
+ TupleHashEntryData *entry;
+ TupleTableSlot *firstSlot;
+ TupleTableSlot *result;
+ AggStatePerHash perhash;
+
+ /*
+ * get state info from node.
+ *
+ * econtext is the per-output-tuple expression context.
+ */
+ econtext = aggstate->ss.ps.ps_ExprContext;
+ peragg = aggstate->peragg;
+ firstSlot = aggstate->ss.ss_ScanTupleSlot;
+
+ /*
+ * Note that perhash (and therefore anything accessed through it) can
+ * change inside the loop, as we change between grouping sets.
+ */
+ perhash = &aggstate->perhash[aggstate->current_set];
+
+ /*
+ * We loop retrieving groups until we find one satisfying
+ * aggstate->ss.ps.qual
+ */
+ for (;;)
+ {
+ TupleTableSlot *hashslot = perhash->hashslot;
+ int i;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Find the next entry in the hash table
+ */
+ entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter);
+ if (entry == NULL)
+ {
+ int nextset = aggstate->current_set + 1;
+
+ if (nextset < aggstate->num_hashes)
+ {
+ /*
+ * Switch to next grouping set, reinitialize, and restart the
+ * loop.
+ */
+ select_current_set(aggstate, nextset, true);
+
+ perhash = &aggstate->perhash[aggstate->current_set];
+
+ ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter);
+
+ continue;
+ }
+ else
+ {
+ return NULL;
+ }
+ }
+
+ /*
+ * Clear the per-output-tuple context for each group
+ *
+ * We intentionally don't use ReScanExprContext here; if any aggs have
+ * registered shutdown callbacks, they mustn't be called yet, since we
+ * might not be done with that agg.
+ */
+ ResetExprContext(econtext);
+
+ /*
+ * Transform representative tuple back into one with the right
+ * columns.
+ */
+ ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
+ slot_getallattrs(hashslot);
+
+ ExecClearTuple(firstSlot);
+ memset(firstSlot->tts_isnull, true,
+ firstSlot->tts_tupleDescriptor->natts * sizeof(bool));
+
+ for (i = 0; i < perhash->numhashGrpCols; i++)
+ {
+ int varNumber = perhash->hashGrpColIdxInput[i] - 1;
+
+ firstSlot->tts_values[varNumber] = hashslot->tts_values[i];
+ firstSlot->tts_isnull[varNumber] = hashslot->tts_isnull[i];
+ }
+ ExecStoreVirtualTuple(firstSlot);
+
+ pergroup = (AggStatePerGroup) entry->additional;
+
+ /*
+ * Use the representative input tuple for any references to
+ * non-aggregated input columns in the qual and tlist.
+ */
+ econtext->ecxt_outertuple = firstSlot;
+
+ prepare_projection_slot(aggstate,
+ econtext->ecxt_outertuple,
+ aggstate->current_set);
+
+ finalize_aggregates(aggstate, peragg, pergroup);
+
+ result = project_aggregates(aggstate);
+ if (result)
+ return result;
+ }
+
+ /* No more groups */
+ return NULL;
+}
+
+/*
+ * Initialize HashTapeInfo
+ */
+static void
+hashagg_tapeinfo_init(AggState *aggstate)
+{
+ HashTapeInfo *tapeinfo = palloc(sizeof(HashTapeInfo));
+ int init_tapes = 16; /* expanded dynamically */
+
+ tapeinfo->tapeset = LogicalTapeSetCreate(init_tapes, true, NULL, NULL, -1);
+ tapeinfo->ntapes = init_tapes;
+ tapeinfo->nfreetapes = init_tapes;
+ tapeinfo->freetapes_alloc = init_tapes;
+ tapeinfo->freetapes = palloc(init_tapes * sizeof(int));
+ for (int i = 0; i < init_tapes; i++)
+ tapeinfo->freetapes[i] = i;
+
+ aggstate->hash_tapeinfo = tapeinfo;
+}
+
+/*
+ * Assign unused tapes to spill partitions, extending the tape set if
+ * necessary.
+ */
+static void
+hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *partitions,
+ int npartitions)
+{
+ int partidx = 0;
+
+ /* use free tapes if available */
+ while (partidx < npartitions && tapeinfo->nfreetapes > 0)
+ partitions[partidx++] = tapeinfo->freetapes[--tapeinfo->nfreetapes];
+
+ if (partidx < npartitions)
+ {
+ LogicalTapeSetExtend(tapeinfo->tapeset, npartitions - partidx);
+
+ while (partidx < npartitions)
+ partitions[partidx++] = tapeinfo->ntapes++;
+ }
+}
+
+/*
+ * After a tape has already been written to and then read, this function
+ * rewinds it for writing and adds it to the free list.
+ */
+static void
+hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum)
+{
+ /* rewinding frees the buffer while not in use */
+ LogicalTapeRewindForWrite(tapeinfo->tapeset, tapenum);
+ if (tapeinfo->freetapes_alloc == tapeinfo->nfreetapes)
+ {
+ tapeinfo->freetapes_alloc <<= 1;
+ tapeinfo->freetapes = repalloc(tapeinfo->freetapes,
+ tapeinfo->freetapes_alloc * sizeof(int));
+ }
+ tapeinfo->freetapes[tapeinfo->nfreetapes++] = tapenum;
+}
+
+/*
+ * hashagg_spill_init
+ *
+ * Called after we determined that spilling is necessary. Chooses the number
+ * of partitions to create, and initializes them.
+ */
+static void
+hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits,
+ double input_groups, double hashentrysize)
+{
+ int npartitions;
+ int partition_bits;
+
+ npartitions = hash_choose_num_partitions(input_groups, hashentrysize,
+ used_bits, &partition_bits);
+
+ spill->partitions = palloc0(sizeof(int) * npartitions);
+ spill->ntuples = palloc0(sizeof(int64) * npartitions);
+ spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions);
+
+ hashagg_tapeinfo_assign(tapeinfo, spill->partitions, npartitions);
+
+ spill->tapeset = tapeinfo->tapeset;
+ spill->shift = 32 - used_bits - partition_bits;
+ spill->mask = (npartitions - 1) << spill->shift;
+ spill->npartitions = npartitions;
+
+ for (int i = 0; i < npartitions; i++)
+ initHyperLogLog(&spill->hll_card[i], HASHAGG_HLL_BIT_WIDTH);
+}
+
+/*
+ * hashagg_spill_tuple
+ *
+ * No room for new groups in the hash table. Save for later in the appropriate
+ * partition.
+ */
+static Size
+hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
+ TupleTableSlot *inputslot, uint32 hash)
+{
+ LogicalTapeSet *tapeset = spill->tapeset;
+ TupleTableSlot *spillslot;
+ int partition;
+ MinimalTuple tuple;
+ int tapenum;
+ int total_written = 0;
+ bool shouldFree;
+
+ Assert(spill->partitions != NULL);
+
+ /* spill only attributes that we actually need */
+ if (!aggstate->all_cols_needed)
+ {
+ spillslot = aggstate->hash_spill_wslot;
+ slot_getsomeattrs(inputslot, aggstate->max_colno_needed);
+ ExecClearTuple(spillslot);
+ for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++)
+ {
+ if (bms_is_member(i + 1, aggstate->colnos_needed))
+ {
+ spillslot->tts_values[i] = inputslot->tts_values[i];
+ spillslot->tts_isnull[i] = inputslot->tts_isnull[i];
+ }
+ else
+ spillslot->tts_isnull[i] = true;
+ }
+ ExecStoreVirtualTuple(spillslot);
+ }
+ else
+ spillslot = inputslot;
+
+ tuple = ExecFetchSlotMinimalTuple(spillslot, &shouldFree);
+
+ partition = (hash & spill->mask) >> spill->shift;
+ spill->ntuples[partition]++;
+
+ /*
+ * All hash values destined for a given partition have some bits in
+ * common, which causes bad HLL cardinality estimates. Hash the hash to
+ * get a more uniform distribution.
+ */
+ addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash));
+
+ tapenum = spill->partitions[partition];
+
+ LogicalTapeWrite(tapeset, tapenum, (void *) &hash, sizeof(uint32));
+ total_written += sizeof(uint32);
+
+ LogicalTapeWrite(tapeset, tapenum, (void *) tuple, tuple->t_len);
+ total_written += tuple->t_len;
+
+ if (shouldFree)
+ pfree(tuple);
+
+ return total_written;
+}
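+
+/*
+ * Editor's illustration (not part of the original file): how the mask/shift
+ * computed in hashagg_spill_init() selects a partition. With used_bits = 0
+ * and npartitions = 4, shift is 30 and the mask covers the top two hash
+ * bits, leaving lower bits for any recursive spill. npartitions is assumed
+ * to be a power of two, as the real code guarantees, and the function name
+ * is made up for the example.
+ */
+#ifdef NOT_USED
+static int
+example_partition_for_hash(uint32 hash, int npartitions, int used_bits)
+{
+ int partition_bits = my_log2(npartitions);
+ int shift = 32 - used_bits - partition_bits;
+ uint32 mask = ((uint32) npartitions - 1) << shift;
+
+ return (hash & mask) >> shift;
+}
+#endif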
+
+/*
+ * hashagg_batch_new
+ *
+ * Construct a HashAggBatch item, which represents one iteration of HashAgg to
+ * be done.
+ */
+static HashAggBatch *
+hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno,
+ int64 input_tuples, double input_card, int used_bits)
+{
+ HashAggBatch *batch = palloc0(sizeof(HashAggBatch));
+
+ batch->setno = setno;
+ batch->used_bits = used_bits;
+ batch->tapeset = tapeset;
+ batch->input_tapenum = tapenum;
+ batch->input_tuples = input_tuples;
+ batch->input_card = input_card;
+
+ return batch;
+}
+
+/*
+ * hashagg_batch_read
+ * read the next tuple from a batch's tape. Return NULL if no more.
+ */
+static MinimalTuple
+hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
+{
+ LogicalTapeSet *tapeset = batch->tapeset;
+ int tapenum = batch->input_tapenum;
+ MinimalTuple tuple;
+ uint32 t_len;
+ size_t nread;
+ uint32 hash;
+
+ nread = LogicalTapeRead(tapeset, tapenum, &hash, sizeof(uint32));
+ if (nread == 0)
+ return NULL;
+ if (nread != sizeof(uint32))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
+ tapenum, sizeof(uint32), nread)));
+ if (hashp != NULL)
+ *hashp = hash;
+
+ nread = LogicalTapeRead(tapeset, tapenum, &t_len, sizeof(t_len));
+ if (nread != sizeof(uint32))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
+ tapenum, sizeof(uint32), nread)));
+
+ tuple = (MinimalTuple) palloc(t_len);
+ tuple->t_len = t_len;
+
+ nread = LogicalTapeRead(tapeset, tapenum,
+ (void *) ((char *) tuple + sizeof(uint32)),
+ t_len - sizeof(uint32));
+ if (nread != t_len - sizeof(uint32))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes",
+ tapenum, t_len - sizeof(uint32), nread)));
+
+ return tuple;
+}
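+
+/*
+ * Editor's illustration (not part of the original file): the spill-file
+ * record format written by hashagg_spill_tuple() and parsed above is just a
+ * uint32 hash followed by the MinimalTuple, whose own leading field is its
+ * total length (t_len). This sketch shows the same framing against a plain
+ * stdio stream; the real code uses LogicalTapeRead/LogicalTapeWrite, not
+ * FILE pointers, and the function name is made up for the example.
+ */
+#ifdef NOT_USED
+static MinimalTuple
+example_read_record(FILE *fp, uint32 *hashp)
+{
+ uint32 t_len;
+ MinimalTuple tuple;
+
+ if (fread(hashp, sizeof(uint32), 1, fp) != 1)
+ return NULL; /* no more records */
+
+ if (fread(&t_len, sizeof(uint32), 1, fp) != 1)
+ elog(ERROR, "unexpected EOF");
+
+ tuple = (MinimalTuple) palloc(t_len);
+ tuple->t_len = t_len;
+
+ if (fread((char *) tuple + sizeof(uint32), t_len - sizeof(uint32), 1, fp) != 1)
+ elog(ERROR, "unexpected EOF");
+
+ return tuple;
+}
+#endif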
+
+/*
+ * hashagg_finish_initial_spills
+ *
+ * After the initial pass over the input has completed, tuples may have been
+ * spilled to disk. If so, turn the spilled partitions into new batches that
+ * must later be executed.
+ */
+static void
+hashagg_finish_initial_spills(AggState *aggstate)
+{
+ int setno;
+ int total_npartitions = 0;
+
+ if (aggstate->hash_spills != NULL)
+ {
+ for (setno = 0; setno < aggstate->num_hashes; setno++)
+ {
+ HashAggSpill *spill = &aggstate->hash_spills[setno];
+
+ total_npartitions += spill->npartitions;
+ hashagg_spill_finish(aggstate, spill, setno);
+ }
+
+ /*
+ * We're not processing tuples from outer plan any more; only
+ * processing batches of spilled tuples. The initial spill structures
+ * are no longer needed.
+ */
+ pfree(aggstate->hash_spills);
+ aggstate->hash_spills = NULL;
+ }
+
+ hash_agg_update_metrics(aggstate, false, total_npartitions);
+ aggstate->hash_spill_mode = false;
+}
+
+/*
+ * hashagg_spill_finish
+ *
+ * Transform spill partitions into new batches.
+ */
+static void
+hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
+{
+ int i;
+ int used_bits = 32 - spill->shift;
+
+ if (spill->npartitions == 0)
+ return; /* didn't spill */
+
+ for (i = 0; i < spill->npartitions; i++)
+ {
+ LogicalTapeSet *tapeset = aggstate->hash_tapeinfo->tapeset;
+ int tapenum = spill->partitions[i];
+ HashAggBatch *new_batch;
+ double cardinality;
+
+ /* if the partition is empty, don't create a new batch of work */
+ if (spill->ntuples[i] == 0)
+ continue;
+
+ cardinality = estimateHyperLogLog(&spill->hll_card[i]);
+ freeHyperLogLog(&spill->hll_card[i]);
+
+ /* rewinding frees the buffer while not in use */
+ LogicalTapeRewindForRead(tapeset, tapenum,
+ HASHAGG_READ_BUFFER_SIZE);
+
+ new_batch = hashagg_batch_new(tapeset, tapenum, setno,
+ spill->ntuples[i], cardinality,
+ used_bits);
+ aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch);
+ aggstate->hash_batches_used++;
+ }
+
+ pfree(spill->ntuples);
+ pfree(spill->hll_card);
+ pfree(spill->partitions);
+}
+
+/*
+ * Free resources related to a spilled HashAgg.
+ */
+static void
+hashagg_reset_spill_state(AggState *aggstate)
+{
+ /* free spills from initial pass */
+ if (aggstate->hash_spills != NULL)
+ {
+ int setno;
+
+ for (setno = 0; setno < aggstate->num_hashes; setno++)
+ {
+ HashAggSpill *spill = &aggstate->hash_spills[setno];
+
+ pfree(spill->ntuples);
+ pfree(spill->partitions);
+ }
+ pfree(aggstate->hash_spills);
+ aggstate->hash_spills = NULL;
+ }
+
+ /* free batches */
+ list_free_deep(aggstate->hash_batches);
+ aggstate->hash_batches = NIL;
+
+ /* close tape set */
+ if (aggstate->hash_tapeinfo != NULL)
+ {
+ HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo;
+
+ LogicalTapeSetClose(tapeinfo->tapeset);
+ pfree(tapeinfo->freetapes);
+ pfree(tapeinfo);
+ aggstate->hash_tapeinfo = NULL;
+ }
+}
+
+
+/* -----------------
+ * ExecInitAgg
+ *
+ * Creates the run-time information for the agg node produced by the
+ * planner and initializes its outer subtree.
+ *
+ * -----------------
+ */
+AggState *
+ExecInitAgg(Agg *node, EState *estate, int eflags)
+{
+ AggState *aggstate;
+ AggStatePerAgg peraggs;
+ AggStatePerTrans pertransstates;
+ AggStatePerGroup *pergroups;
+ Plan *outerPlan;
+ ExprContext *econtext;
+ TupleDesc scanDesc;
+ int max_aggno;
+ int max_transno;
+ int numaggrefs;
+ int numaggs;
+ int numtrans;
+ int phase;
+ int phaseidx;
+ ListCell *l;
+ Bitmapset *all_grouped_cols = NULL;
+ int numGroupingSets = 1;
+ int numPhases;
+ int numHashes;
+ int i = 0;
+ int j = 0;
+ bool use_hashing = (node->aggstrategy == AGG_HASHED ||
+ node->aggstrategy == AGG_MIXED);
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ aggstate = makeNode(AggState);
+ aggstate->ss.ps.plan = (Plan *) node;
+ aggstate->ss.ps.state = estate;
+ aggstate->ss.ps.ExecProcNode = ExecAgg;
+
+ aggstate->aggs = NIL;
+ aggstate->numaggs = 0;
+ aggstate->numtrans = 0;
+ aggstate->aggstrategy = node->aggstrategy;
+ aggstate->aggsplit = node->aggsplit;
+ aggstate->maxsets = 0;
+ aggstate->projected_set = -1;
+ aggstate->current_set = 0;
+ aggstate->peragg = NULL;
+ aggstate->pertrans = NULL;
+ aggstate->curperagg = NULL;
+ aggstate->curpertrans = NULL;
+ aggstate->input_done = false;
+ aggstate->agg_done = false;
+ aggstate->pergroups = NULL;
+ aggstate->grp_firstTuple = NULL;
+ aggstate->sort_in = NULL;
+ aggstate->sort_out = NULL;
+
+ /*
+ * phases[0] always exists, but is dummy in sorted/plain mode
+ */
+ numPhases = (use_hashing ? 1 : 2);
+ numHashes = (use_hashing ? 1 : 0);
+
+ /*
+ * Calculate the maximum number of grouping sets in any phase; this
+ * determines the size of some allocations. Also calculate the number of
+ * phases, since all hashed/mixed nodes contribute to only a single phase.
+ */
+ if (node->groupingSets)
+ {
+ numGroupingSets = list_length(node->groupingSets);
+
+ foreach(l, node->chain)
+ {
+ Agg *agg = lfirst(l);
+
+ numGroupingSets = Max(numGroupingSets,
+ list_length(agg->groupingSets));
+
+ /*
+ * additional AGG_HASHED aggs become part of phase 0, but all
+ * others add an extra phase.
+ */
+ if (agg->aggstrategy != AGG_HASHED)
+ ++numPhases;
+ else
+ ++numHashes;
+ }
+ }
+
+ aggstate->maxsets = numGroupingSets;
+ aggstate->numphases = numPhases;
+
+ aggstate->aggcontexts = (ExprContext **)
+ palloc0(sizeof(ExprContext *) * numGroupingSets);
+
+ /*
+ * Create expression contexts. We need three or more, one for
+ * per-input-tuple processing, one for per-output-tuple processing, one
+ * for all the hashtables, and one for each grouping set. The per-tuple
+ * memory context of the per-grouping-set ExprContexts (aggcontexts)
+ * replaces the standalone memory context formerly used to hold transition
+ * values. We cheat a little by using ExecAssignExprContext() to build
+ * all of them.
+ *
+ * NOTE: the details of what is stored in aggcontexts and what is stored
+ * in the regular per-query memory context are driven by a simple
+ * decision: we want to reset the aggcontext at group boundaries (if not
+ * hashing) and in ExecReScanAgg to recover no-longer-wanted space.
+ */
+ ExecAssignExprContext(estate, &aggstate->ss.ps);
+ aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext;
+
+ for (i = 0; i < numGroupingSets; ++i)
+ {
+ ExecAssignExprContext(estate, &aggstate->ss.ps);
+ aggstate->aggcontexts[i] = aggstate->ss.ps.ps_ExprContext;
+ }
+
+ if (use_hashing)
+ aggstate->hashcontext = CreateWorkExprContext(estate);
+
+ ExecAssignExprContext(estate, &aggstate->ss.ps);
+
+ /*
+ * Initialize child nodes.
+ *
+ * If we are doing a hashed aggregation then the child plan does not need
+ * to handle REWIND efficiently; see ExecReScanAgg.
+ */
+ if (node->aggstrategy == AGG_HASHED)
+ eflags &= ~EXEC_FLAG_REWIND;
+ outerPlan = outerPlan(node);
+ outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags);
+
+ /*
+ * initialize source tuple type.
+ */
+ aggstate->ss.ps.outerops =
+ ExecGetResultSlotOps(outerPlanState(&aggstate->ss),
+ &aggstate->ss.ps.outeropsfixed);
+ aggstate->ss.ps.outeropsset = true;
+
+ ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss,
+ aggstate->ss.ps.outerops);
+ scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
+
+ /*
+ * If there are more than two phases (including a potential dummy phase
+ * 0), input will be resorted using tuplesort. Need a slot for that.
+ */
+ if (numPhases > 2)
+ {
+ aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+
+ /*
+ * The output of the tuplesort, and the output from the outer child
+ * might not use the same type of slot. In most cases the child will
+ * be a Sort, and thus return a TTSOpsMinimalTuple type slot - but the
+ * input can also be presorted due to an index, in which case it could be
+ * a different type of slot.
+ *
+ * XXX: For efficiency it would be good to instead/additionally
+ * generate expressions with corresponding settings of outerops* for
+ * the individual phases - deforming is often a bottleneck for
+ * aggregations with lots of rows per group. If there's multiple
+ * sorts, we know that all but the first use TTSOpsMinimalTuple (via
+ * the nodeAgg.c internal tuplesort).
+ */
+ if (aggstate->ss.ps.outeropsfixed &&
+ aggstate->ss.ps.outerops != &TTSOpsMinimalTuple)
+ aggstate->ss.ps.outeropsfixed = false;
+ }
+
+ /*
+ * Initialize result type, slot and projection.
+ */
+ ExecInitResultTupleSlotTL(&aggstate->ss.ps, &TTSOpsVirtual);
+ ExecAssignProjectionInfo(&aggstate->ss.ps, NULL);
+
+ /*
+ * initialize child expressions
+ *
+ * We expect the parser to have checked that no aggs contain other agg
+ * calls in their arguments (and just to be sure, we verify it again while
+ * initializing the plan node). This would make no sense under SQL
+	 * semantics, and it's forbidden by the spec.  Because that restriction
+	 * holds, we don't need to worry about evaluating the aggs in any
+	 * particular order.
+ *
+ * Note: execExpr.c finds Aggrefs for us, and adds them to aggstate->aggs.
+ * Aggrefs in the qual are found here; Aggrefs in the targetlist are found
+ * during ExecAssignProjectionInfo, above.
+ */
+ aggstate->ss.ps.qual =
+ ExecInitQual(node->plan.qual, (PlanState *) aggstate);
+
+ /*
+ * We should now have found all Aggrefs in the targetlist and quals.
+ */
+ numaggrefs = list_length(aggstate->aggs);
+ max_aggno = -1;
+ max_transno = -1;
+ foreach(l, aggstate->aggs)
+ {
+ Aggref *aggref = (Aggref *) lfirst(l);
+
+ max_aggno = Max(max_aggno, aggref->aggno);
+ max_transno = Max(max_transno, aggref->aggtransno);
+ }
+ numaggs = max_aggno + 1;
+ numtrans = max_transno + 1;
+
+ /*
+ * For each phase, prepare grouping set data and fmgr lookup data for
+ * compare functions. Accumulate all_grouped_cols in passing.
+ */
+ aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData));
+
+ aggstate->num_hashes = numHashes;
+ if (numHashes)
+ {
+ aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes);
+ aggstate->phases[0].numsets = 0;
+ aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int));
+ aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *));
+ }
+
+ phase = 0;
+ for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx)
+ {
+ Agg *aggnode;
+ Sort *sortnode;
+
+ if (phaseidx > 0)
+ {
+ aggnode = list_nth_node(Agg, node->chain, phaseidx - 1);
+ sortnode = castNode(Sort, aggnode->plan.lefttree);
+ }
+ else
+ {
+ aggnode = node;
+ sortnode = NULL;
+ }
+
+ Assert(phase <= 1 || sortnode);
+
+ if (aggnode->aggstrategy == AGG_HASHED
+ || aggnode->aggstrategy == AGG_MIXED)
+ {
+ AggStatePerPhase phasedata = &aggstate->phases[0];
+ AggStatePerHash perhash;
+ Bitmapset *cols = NULL;
+
+ Assert(phase == 0);
+ i = phasedata->numsets++;
+ perhash = &aggstate->perhash[i];
+
+ /* phase 0 always points to the "real" Agg in the hash case */
+ phasedata->aggnode = node;
+ phasedata->aggstrategy = node->aggstrategy;
+
+ /* but the actual Agg node representing this hash is saved here */
+ perhash->aggnode = aggnode;
+
+ phasedata->gset_lengths[i] = perhash->numCols = aggnode->numCols;
+
+ for (j = 0; j < aggnode->numCols; ++j)
+ cols = bms_add_member(cols, aggnode->grpColIdx[j]);
+
+ phasedata->grouped_cols[i] = cols;
+
+ all_grouped_cols = bms_add_members(all_grouped_cols, cols);
+ continue;
+ }
+ else
+ {
+ AggStatePerPhase phasedata = &aggstate->phases[++phase];
+ int num_sets;
+
+ phasedata->numsets = num_sets = list_length(aggnode->groupingSets);
+
+ if (num_sets)
+ {
+ phasedata->gset_lengths = palloc(num_sets * sizeof(int));
+ phasedata->grouped_cols = palloc(num_sets * sizeof(Bitmapset *));
+
+ i = 0;
+ foreach(l, aggnode->groupingSets)
+ {
+ int current_length = list_length(lfirst(l));
+ Bitmapset *cols = NULL;
+
+ /* planner forces this to be correct */
+ for (j = 0; j < current_length; ++j)
+ cols = bms_add_member(cols, aggnode->grpColIdx[j]);
+
+ phasedata->grouped_cols[i] = cols;
+ phasedata->gset_lengths[i] = current_length;
+
+ ++i;
+ }
+
+ all_grouped_cols = bms_add_members(all_grouped_cols,
+ phasedata->grouped_cols[0]);
+ }
+ else
+ {
+ Assert(phaseidx == 0);
+
+ phasedata->gset_lengths = NULL;
+ phasedata->grouped_cols = NULL;
+ }
+
+ /*
+ * If we are grouping, precompute fmgr lookup data for inner loop.
+ */
+ if (aggnode->aggstrategy == AGG_SORTED)
+ {
+ int i = 0;
+
+ Assert(aggnode->numCols > 0);
+
+ /*
+ * Build a separate function for each subset of columns that
+ * need to be compared.
+ */
+ phasedata->eqfunctions =
+ (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *));
+
+ /* for each grouping set */
+ for (i = 0; i < phasedata->numsets; i++)
+ {
+ int length = phasedata->gset_lengths[i];
+
+ if (phasedata->eqfunctions[length - 1] != NULL)
+ continue;
+
+ phasedata->eqfunctions[length - 1] =
+ execTuplesMatchPrepare(scanDesc,
+ length,
+ aggnode->grpColIdx,
+ aggnode->grpOperators,
+ aggnode->grpCollations,
+ (PlanState *) aggstate);
+ }
+
+ /* and for all grouped columns, unless already computed */
+ if (phasedata->eqfunctions[aggnode->numCols - 1] == NULL)
+ {
+ phasedata->eqfunctions[aggnode->numCols - 1] =
+ execTuplesMatchPrepare(scanDesc,
+ aggnode->numCols,
+ aggnode->grpColIdx,
+ aggnode->grpOperators,
+ aggnode->grpCollations,
+ (PlanState *) aggstate);
+ }
+ }
+
+ phasedata->aggnode = aggnode;
+ phasedata->aggstrategy = aggnode->aggstrategy;
+ phasedata->sortnode = sortnode;
+ }
+ }
+
+ /*
+ * Convert all_grouped_cols to a descending-order list.
+ */
+ i = -1;
+ while ((i = bms_next_member(all_grouped_cols, i)) >= 0)
+ aggstate->all_grouped_cols = lcons_int(i, aggstate->all_grouped_cols);
+
+ /*
+ * Set up aggregate-result storage in the output expr context, and also
+ * allocate my private per-agg working storage
+ */
+ econtext = aggstate->ss.ps.ps_ExprContext;
+ econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs);
+ econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs);
+
+ peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs);
+ pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans);
+
+ aggstate->peragg = peraggs;
+ aggstate->pertrans = pertransstates;
+
+
+ aggstate->all_pergroups =
+ (AggStatePerGroup *) palloc0(sizeof(AggStatePerGroup)
+ * (numGroupingSets + numHashes));
+ pergroups = aggstate->all_pergroups;
+
+ if (node->aggstrategy != AGG_HASHED)
+ {
+ for (i = 0; i < numGroupingSets; i++)
+ {
+ pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData)
+ * numaggs);
+ }
+
+ aggstate->pergroups = pergroups;
+ pergroups += numGroupingSets;
+ }
+
+ /*
+ * Hashing can only appear in the initial phase.
+ */
+ if (use_hashing)
+ {
+ Plan *outerplan = outerPlan(node);
+ uint64 totalGroups = 0;
+ int i;
+
+ aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
+ "HashAgg meta context",
+ ALLOCSET_DEFAULT_SIZES);
+ aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+ aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsVirtual);
+
+ /* this is an array of pointers, not structures */
+ aggstate->hash_pergroup = pergroups;
+
+ aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans,
+ outerplan->plan_width,
+ node->transitionSpace);
+
+ /*
+ * Consider all of the grouping sets together when setting the limits
+ * and estimating the number of partitions. This can be inaccurate
+ * when there is more than one grouping set, but should still be
+ * reasonable.
+ */
+ for (i = 0; i < aggstate->num_hashes; i++)
+ totalGroups += aggstate->perhash[i].aggnode->numGroups;
+
+ hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0,
+ &aggstate->hash_mem_limit,
+ &aggstate->hash_ngroups_limit,
+ &aggstate->hash_planned_partitions);
+ find_hash_columns(aggstate);
+
+ /* Skip massive memory allocation if we are just doing EXPLAIN */
+ if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ build_hash_tables(aggstate);
+
+ aggstate->table_filled = false;
+
+		/* Initialize this to 1, meaning nothing has spilled yet */
+ aggstate->hash_batches_used = 1;
+ }
+
+ /*
+ * Initialize current phase-dependent values to initial phase. The initial
+ * phase is 1 (first sort pass) for all strategies that use sorting (if
+ * hashing is being done too, then phase 0 is processed last); but if only
+ * hashing is being done, then phase 0 is all there is.
+ */
+ if (node->aggstrategy == AGG_HASHED)
+ {
+ aggstate->current_phase = 0;
+ initialize_phase(aggstate, 0);
+ select_current_set(aggstate, 0, true);
+ }
+ else
+ {
+ aggstate->current_phase = 1;
+ initialize_phase(aggstate, 1);
+ select_current_set(aggstate, 0, false);
+ }
+
+ /*
+ * Perform lookups of aggregate function info, and initialize the
+ * unchanging fields of the per-agg and per-trans data.
+ */
+ foreach(l, aggstate->aggs)
+ {
+ Aggref *aggref = lfirst(l);
+ AggStatePerAgg peragg;
+ AggStatePerTrans pertrans;
+ Oid inputTypes[FUNC_MAX_ARGS];
+ int numArguments;
+ int numDirectArgs;
+ HeapTuple aggTuple;
+ Form_pg_aggregate aggform;
+ AclResult aclresult;
+ Oid finalfn_oid;
+ Oid serialfn_oid,
+ deserialfn_oid;
+ Oid aggOwner;
+ Expr *finalfnexpr;
+ Oid aggtranstype;
+
+ /* Planner should have assigned aggregate to correct level */
+ Assert(aggref->agglevelsup == 0);
+ /* ... and the split mode should match */
+ Assert(aggref->aggsplit == aggstate->aggsplit);
+
+ peragg = &peraggs[aggref->aggno];
+
+ /* Check if we initialized the state for this aggregate already. */
+ if (peragg->aggref != NULL)
+ continue;
+
+ peragg->aggref = aggref;
+ peragg->transno = aggref->aggtransno;
+
+ /* Fetch the pg_aggregate row */
+ aggTuple = SearchSysCache1(AGGFNOID,
+ ObjectIdGetDatum(aggref->aggfnoid));
+ if (!HeapTupleIsValid(aggTuple))
+ elog(ERROR, "cache lookup failed for aggregate %u",
+ aggref->aggfnoid);
+ aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
+
+ /* Check permission to call aggregate function */
+ aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(),
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_AGGREGATE,
+ get_func_name(aggref->aggfnoid));
+ InvokeFunctionExecuteHook(aggref->aggfnoid);
+
+ /* planner recorded transition state type in the Aggref itself */
+ aggtranstype = aggref->aggtranstype;
+ Assert(OidIsValid(aggtranstype));
+
+ /* Final function only required if we're finalizing the aggregates */
+ if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
+ peragg->finalfn_oid = finalfn_oid = InvalidOid;
+ else
+ peragg->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
+
+ serialfn_oid = InvalidOid;
+ deserialfn_oid = InvalidOid;
+
+ /*
+ * Check if serialization/deserialization is required. We only do it
+ * for aggregates that have transtype INTERNAL.
+ */
+ if (aggtranstype == INTERNALOID)
+ {
+ /*
+ * The planner should only have generated a serialize agg node if
+ * every aggregate with an INTERNAL state has a serialization
+ * function. Verify that.
+ */
+ if (DO_AGGSPLIT_SERIALIZE(aggstate->aggsplit))
+ {
+ /* serialization only valid when not running finalfn */
+ Assert(DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
+
+ if (!OidIsValid(aggform->aggserialfn))
+ elog(ERROR, "serialfunc not provided for serialization aggregation");
+ serialfn_oid = aggform->aggserialfn;
+ }
+
+ /* Likewise for deserialization functions */
+ if (DO_AGGSPLIT_DESERIALIZE(aggstate->aggsplit))
+ {
+ /* deserialization only valid when combining states */
+ Assert(DO_AGGSPLIT_COMBINE(aggstate->aggsplit));
+
+ if (!OidIsValid(aggform->aggdeserialfn))
+ elog(ERROR, "deserialfunc not provided for deserialization aggregation");
+ deserialfn_oid = aggform->aggdeserialfn;
+ }
+ }
+
+ /* Check that aggregate owner has permission to call component fns */
+ {
+ HeapTuple procTuple;
+
+ procTuple = SearchSysCache1(PROCOID,
+ ObjectIdGetDatum(aggref->aggfnoid));
+ if (!HeapTupleIsValid(procTuple))
+ elog(ERROR, "cache lookup failed for function %u",
+ aggref->aggfnoid);
+ aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
+ ReleaseSysCache(procTuple);
+
+ if (OidIsValid(finalfn_oid))
+ {
+ aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner,
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(finalfn_oid));
+ InvokeFunctionExecuteHook(finalfn_oid);
+ }
+ if (OidIsValid(serialfn_oid))
+ {
+ aclresult = pg_proc_aclcheck(serialfn_oid, aggOwner,
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(serialfn_oid));
+ InvokeFunctionExecuteHook(serialfn_oid);
+ }
+ if (OidIsValid(deserialfn_oid))
+ {
+ aclresult = pg_proc_aclcheck(deserialfn_oid, aggOwner,
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(deserialfn_oid));
+ InvokeFunctionExecuteHook(deserialfn_oid);
+ }
+ }
+
+ /*
+ * Get actual datatypes of the (nominal) aggregate inputs. These
+ * could be different from the agg's declared input types, when the
+ * agg accepts ANY or a polymorphic type.
+ */
+ numArguments = get_aggregate_argtypes(aggref, inputTypes);
+
+ /* Count the "direct" arguments, if any */
+ numDirectArgs = list_length(aggref->aggdirectargs);
+
+ /* Detect how many arguments to pass to the finalfn */
+ if (aggform->aggfinalextra)
+ peragg->numFinalArgs = numArguments + 1;
+ else
+ peragg->numFinalArgs = numDirectArgs + 1;
+
+ /* Initialize any direct-argument expressions */
+ peragg->aggdirectargs = ExecInitExprList(aggref->aggdirectargs,
+ (PlanState *) aggstate);
+
+ /*
+ * build expression trees using actual argument & result types for the
+ * finalfn, if it exists and is required.
+ */
+ if (OidIsValid(finalfn_oid))
+ {
+ build_aggregate_finalfn_expr(inputTypes,
+ peragg->numFinalArgs,
+ aggtranstype,
+ aggref->aggtype,
+ aggref->inputcollid,
+ finalfn_oid,
+ &finalfnexpr);
+ fmgr_info(finalfn_oid, &peragg->finalfn);
+ fmgr_info_set_expr((Node *) finalfnexpr, &peragg->finalfn);
+ }
+
+ /* get info about the output value's datatype */
+ get_typlenbyval(aggref->aggtype,
+ &peragg->resulttypeLen,
+ &peragg->resulttypeByVal);
+
+ /*
+ * Build working state for invoking the transition function, if we
+ * haven't done it already.
+ */
+ pertrans = &pertransstates[aggref->aggtransno];
+ if (pertrans->aggref == NULL)
+ {
+ Datum textInitVal;
+ Datum initValue;
+ bool initValueIsNull;
+ Oid transfn_oid;
+
+ /*
+ * If this aggregation is performing state combines, then instead
+ * of using the transition function, we'll use the combine
+ * function
+ */
+ if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
+ {
+ transfn_oid = aggform->aggcombinefn;
+
+ /* If not set then the planner messed up */
+ if (!OidIsValid(transfn_oid))
+ elog(ERROR, "combinefn not set for aggregate function");
+ }
+ else
+ transfn_oid = aggform->aggtransfn;
+
+ aclresult = pg_proc_aclcheck(transfn_oid, aggOwner,
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(transfn_oid));
+ InvokeFunctionExecuteHook(transfn_oid);
+
+ /*
+ * initval is potentially null, so don't try to access it as a
+ * struct field. Must do it the hard way with SysCacheGetAttr.
+ */
+ textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
+ Anum_pg_aggregate_agginitval,
+ &initValueIsNull);
+ if (initValueIsNull)
+ initValue = (Datum) 0;
+ else
+ initValue = GetAggInitVal(textInitVal, aggtranstype);
+
+ build_pertrans_for_aggref(pertrans, aggstate, estate,
+ aggref, transfn_oid, aggtranstype,
+ serialfn_oid, deserialfn_oid,
+ initValue, initValueIsNull,
+ inputTypes, numArguments);
+ }
+ else
+ pertrans->aggshared = true;
+ ReleaseSysCache(aggTuple);
+ }
+
+ /*
+	 * Update aggstate->numaggs to be the number of unique aggregates found.
+	 * Also set aggstate->numtrans to the number of unique transition states
+	 * found.
+ */
+ aggstate->numaggs = numaggs;
+ aggstate->numtrans = numtrans;
+
+ /*
+ * Last, check whether any more aggregates got added onto the node while
+ * we processed the expressions for the aggregate arguments (including not
+ * only the regular arguments and FILTER expressions handled immediately
+ * above, but any direct arguments we might've handled earlier). If so,
+ * we have nested aggregate functions, which is semantically nonsensical,
+ * so complain. (This should have been caught by the parser, so we don't
+ * need to work hard on a helpful error message; but we defend against it
+ * here anyway, just to be sure.)
+ */
+ if (numaggrefs != list_length(aggstate->aggs))
+ ereport(ERROR,
+ (errcode(ERRCODE_GROUPING_ERROR),
+ errmsg("aggregate function calls cannot be nested")));
+
+ /*
+ * Build expressions doing all the transition work at once. We build a
+ * different one for each phase, as the number of transition function
+	 * different one for each phase, as the number of transition function
+	 * invocations can differ between phases.  Note this'll work both for
+ * phase in the latter case).
+ */
+ for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++)
+ {
+ AggStatePerPhase phase = &aggstate->phases[phaseidx];
+ bool dohash = false;
+ bool dosort = false;
+
+ /* phase 0 doesn't necessarily exist */
+ if (!phase->aggnode)
+ continue;
+
+ if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 1)
+ {
+ /*
+			 * Phase one, and only phase one, in a mixed agg performs both
+			 * sort-based and hash-based aggregation.
+ */
+ dohash = true;
+ dosort = true;
+ }
+ else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0)
+ {
+ /*
+ * No need to compute a transition function for an AGG_MIXED phase
+ * 0 - the contents of the hashtables will have been computed
+ * during phase 1.
+ */
+ continue;
+ }
+ else if (phase->aggstrategy == AGG_PLAIN ||
+ phase->aggstrategy == AGG_SORTED)
+ {
+ dohash = false;
+ dosort = true;
+ }
+ else if (phase->aggstrategy == AGG_HASHED)
+ {
+ dohash = true;
+ dosort = false;
+ }
+ else
+ Assert(false);
+
+ phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash,
+ false);
+
+ /* cache compiled expression for outer slot without NULL check */
+ phase->evaltrans_cache[0][0] = phase->evaltrans;
+ }
+
+ return aggstate;
+}
+
+/*
+ * Build the state needed to calculate a state value for an aggregate.
+ *
+ * This initializes all the fields in 'pertrans'. 'aggref' is the aggregate
+ * to initialize the state for. 'aggtransfn', 'aggtranstype', and the rest
+ * of the arguments could be calculated from 'aggref', but the caller has
+ * calculated them already, so might as well pass them.
+ */
+static void
+build_pertrans_for_aggref(AggStatePerTrans pertrans,
+ AggState *aggstate, EState *estate,
+ Aggref *aggref,
+ Oid aggtransfn, Oid aggtranstype,
+ Oid aggserialfn, Oid aggdeserialfn,
+ Datum initValue, bool initValueIsNull,
+ Oid *inputTypes, int numArguments)
+{
+ int numGroupingSets = Max(aggstate->maxsets, 1);
+ Expr *serialfnexpr = NULL;
+ Expr *deserialfnexpr = NULL;
+ ListCell *lc;
+ int numInputs;
+ int numDirectArgs;
+ List *sortlist;
+ int numSortCols;
+ int numDistinctCols;
+ int i;
+
+ /* Begin filling in the pertrans data */
+ pertrans->aggref = aggref;
+ pertrans->aggshared = false;
+ pertrans->aggCollation = aggref->inputcollid;
+ pertrans->transfn_oid = aggtransfn;
+ pertrans->serialfn_oid = aggserialfn;
+ pertrans->deserialfn_oid = aggdeserialfn;
+ pertrans->initValue = initValue;
+ pertrans->initValueIsNull = initValueIsNull;
+
+ /* Count the "direct" arguments, if any */
+ numDirectArgs = list_length(aggref->aggdirectargs);
+
+ /* Count the number of aggregated input columns */
+ pertrans->numInputs = numInputs = list_length(aggref->args);
+
+ pertrans->aggtranstype = aggtranstype;
+
+ /*
+ * When combining states, we have no use at all for the aggregate
+ * function's transfn. Instead we use the combinefn. In this case, the
+ * transfn and transfn_oid fields of pertrans refer to the combine
+ * function rather than the transition function.
+ */
+ if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
+ {
+ Expr *combinefnexpr;
+ size_t numTransArgs;
+
+ /*
+		 * When combining there's only one input, the to-be-combined
+ * transition value from below (this node's transition value is
+ * counted separately).
+ */
+ pertrans->numTransInputs = 1;
+
+ /* account for the current transition state */
+ numTransArgs = pertrans->numTransInputs + 1;
+
+ build_aggregate_combinefn_expr(aggtranstype,
+ aggref->inputcollid,
+ aggtransfn,
+ &combinefnexpr);
+ fmgr_info(aggtransfn, &pertrans->transfn);
+ fmgr_info_set_expr((Node *) combinefnexpr, &pertrans->transfn);
+
+ pertrans->transfn_fcinfo =
+ (FunctionCallInfo) palloc(SizeForFunctionCallInfo(2));
+ InitFunctionCallInfoData(*pertrans->transfn_fcinfo,
+ &pertrans->transfn,
+ numTransArgs,
+ pertrans->aggCollation,
+ (void *) aggstate, NULL);
+
+ /*
+ * Ensure that a combine function to combine INTERNAL states is not
+ * strict. This should have been checked during CREATE AGGREGATE, but
+ * the strict property could have been changed since then.
+ */
+ if (pertrans->transfn.fn_strict && aggtranstype == INTERNALOID)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("combine function with transition type %s must not be declared STRICT",
+ format_type_be(aggtranstype))));
+ }
+ else
+ {
+ Expr *transfnexpr;
+ size_t numTransArgs;
+
+ /* Detect how many arguments to pass to the transfn */
+ if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
+ pertrans->numTransInputs = numInputs;
+ else
+ pertrans->numTransInputs = numArguments;
+
+ /* account for the current transition state */
+ numTransArgs = pertrans->numTransInputs + 1;
+
+ /*
+ * Set up infrastructure for calling the transfn. Note that
+ * invtransfn is not needed here.
+ */
+ build_aggregate_transfn_expr(inputTypes,
+ numArguments,
+ numDirectArgs,
+ aggref->aggvariadic,
+ aggtranstype,
+ aggref->inputcollid,
+ aggtransfn,
+ InvalidOid,
+ &transfnexpr,
+ NULL);
+ fmgr_info(aggtransfn, &pertrans->transfn);
+ fmgr_info_set_expr((Node *) transfnexpr, &pertrans->transfn);
+
+ pertrans->transfn_fcinfo =
+ (FunctionCallInfo) palloc(SizeForFunctionCallInfo(numTransArgs));
+ InitFunctionCallInfoData(*pertrans->transfn_fcinfo,
+ &pertrans->transfn,
+ numTransArgs,
+ pertrans->aggCollation,
+ (void *) aggstate, NULL);
+
+ /*
+ * If the transfn is strict and the initval is NULL, make sure input
+ * type and transtype are the same (or at least binary-compatible), so
+ * that it's OK to use the first aggregated input value as the initial
+ * transValue. This should have been checked at agg definition time,
+ * but we must check again in case the transfn's strictness property
+ * has been changed.
+ */
+ if (pertrans->transfn.fn_strict && pertrans->initValueIsNull)
+ {
+ if (numArguments <= numDirectArgs ||
+ !IsBinaryCoercible(inputTypes[numDirectArgs],
+ aggtranstype))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("aggregate %u needs to have compatible input type and transition type",
+ aggref->aggfnoid)));
+ }
+ }
+
+ /* get info about the state value's datatype */
+ get_typlenbyval(aggtranstype,
+ &pertrans->transtypeLen,
+ &pertrans->transtypeByVal);
+
+ if (OidIsValid(aggserialfn))
+ {
+ build_aggregate_serialfn_expr(aggserialfn,
+ &serialfnexpr);
+ fmgr_info(aggserialfn, &pertrans->serialfn);
+ fmgr_info_set_expr((Node *) serialfnexpr, &pertrans->serialfn);
+
+ pertrans->serialfn_fcinfo =
+ (FunctionCallInfo) palloc(SizeForFunctionCallInfo(1));
+ InitFunctionCallInfoData(*pertrans->serialfn_fcinfo,
+ &pertrans->serialfn,
+ 1,
+ InvalidOid,
+ (void *) aggstate, NULL);
+ }
+
+ if (OidIsValid(aggdeserialfn))
+ {
+ build_aggregate_deserialfn_expr(aggdeserialfn,
+ &deserialfnexpr);
+ fmgr_info(aggdeserialfn, &pertrans->deserialfn);
+ fmgr_info_set_expr((Node *) deserialfnexpr, &pertrans->deserialfn);
+
+ pertrans->deserialfn_fcinfo =
+ (FunctionCallInfo) palloc(SizeForFunctionCallInfo(2));
+ InitFunctionCallInfoData(*pertrans->deserialfn_fcinfo,
+ &pertrans->deserialfn,
+ 2,
+ InvalidOid,
+ (void *) aggstate, NULL);
+
+ }
+
+ /*
+ * If we're doing either DISTINCT or ORDER BY for a plain agg, then we
+ * have a list of SortGroupClause nodes; fish out the data in them and
+ * stick them into arrays. We ignore ORDER BY for an ordered-set agg,
+ * however; the agg's transfn and finalfn are responsible for that.
+ *
+ * Note that by construction, if there is a DISTINCT clause then the ORDER
+ * BY clause is a prefix of it (see transformDistinctClause).
+ */
+ if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
+ {
+ sortlist = NIL;
+ numSortCols = numDistinctCols = 0;
+ }
+ else if (aggref->aggdistinct)
+ {
+ sortlist = aggref->aggdistinct;
+ numSortCols = numDistinctCols = list_length(sortlist);
+ Assert(numSortCols >= list_length(aggref->aggorder));
+ }
+ else
+ {
+ sortlist = aggref->aggorder;
+ numSortCols = list_length(sortlist);
+ numDistinctCols = 0;
+ }
+
+ pertrans->numSortCols = numSortCols;
+ pertrans->numDistinctCols = numDistinctCols;
+
+ /*
+ * If we have either sorting or filtering to do, create a tupledesc and
+ * slot corresponding to the aggregated inputs (including sort
+ * expressions) of the agg.
+ */
+ if (numSortCols > 0 || aggref->aggfilter)
+ {
+ pertrans->sortdesc = ExecTypeFromTL(aggref->args);
+ pertrans->sortslot =
+ ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
+ &TTSOpsMinimalTuple);
+ }
+
+ if (numSortCols > 0)
+ {
+ /*
+ * We don't implement DISTINCT or ORDER BY aggs in the HASHED case
+ * (yet)
+ */
+ Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED);
+
+ /* If we have only one input, we need its len/byval info. */
+ if (numInputs == 1)
+ {
+ get_typlenbyval(inputTypes[numDirectArgs],
+ &pertrans->inputtypeLen,
+ &pertrans->inputtypeByVal);
+ }
+ else if (numDistinctCols > 0)
+ {
+ /* we will need an extra slot to store prior values */
+ pertrans->uniqslot =
+ ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
+ &TTSOpsMinimalTuple);
+ }
+
+ /* Extract the sort information for use later */
+ pertrans->sortColIdx =
+ (AttrNumber *) palloc(numSortCols * sizeof(AttrNumber));
+ pertrans->sortOperators =
+ (Oid *) palloc(numSortCols * sizeof(Oid));
+ pertrans->sortCollations =
+ (Oid *) palloc(numSortCols * sizeof(Oid));
+ pertrans->sortNullsFirst =
+ (bool *) palloc(numSortCols * sizeof(bool));
+
+ i = 0;
+ foreach(lc, sortlist)
+ {
+ SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc);
+ TargetEntry *tle = get_sortgroupclause_tle(sortcl, aggref->args);
+
+ /* the parser should have made sure of this */
+ Assert(OidIsValid(sortcl->sortop));
+
+ pertrans->sortColIdx[i] = tle->resno;
+ pertrans->sortOperators[i] = sortcl->sortop;
+ pertrans->sortCollations[i] = exprCollation((Node *) tle->expr);
+ pertrans->sortNullsFirst[i] = sortcl->nulls_first;
+ i++;
+ }
+ Assert(i == numSortCols);
+ }
+
+ if (aggref->aggdistinct)
+ {
+ Oid *ops;
+
+ Assert(numArguments > 0);
+ Assert(list_length(aggref->aggdistinct) == numDistinctCols);
+
+ ops = palloc(numDistinctCols * sizeof(Oid));
+
+ i = 0;
+ foreach(lc, aggref->aggdistinct)
+ ops[i++] = ((SortGroupClause *) lfirst(lc))->eqop;
+
+ /* lookup / build the necessary comparators */
+ if (numDistinctCols == 1)
+ fmgr_info(get_opcode(ops[0]), &pertrans->equalfnOne);
+ else
+ pertrans->equalfnMulti =
+ execTuplesMatchPrepare(pertrans->sortdesc,
+ numDistinctCols,
+ pertrans->sortColIdx,
+ ops,
+ pertrans->sortCollations,
+ &aggstate->ss.ps);
+ pfree(ops);
+ }
+
+ pertrans->sortstates = (Tuplesortstate **)
+ palloc0(sizeof(Tuplesortstate *) * numGroupingSets);
+}
+
+
+static Datum
+GetAggInitVal(Datum textInitVal, Oid transtype)
+{
+ Oid typinput,
+ typioparam;
+ char *strInitVal;
+ Datum initVal;
+
+ getTypeInputInfo(transtype, &typinput, &typioparam);
+ strInitVal = TextDatumGetCString(textInitVal);
+ initVal = OidInputFunctionCall(typinput, strInitVal,
+ typioparam, -1);
+ pfree(strInitVal);
+ return initVal;
+}
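+
+/*
+ * Worked example (illustration only; the PG_AGG_EXAMPLES guard and function
+ * name are hypothetical): for an aggregate whose transition type is int8 and
+ * whose pg_aggregate.agginitval is the text "0", GetAggInitVal just runs the
+ * int8 input function on that string, yielding Int64GetDatum(0) as the
+ * starting transition value.
+ */
+#ifdef PG_AGG_EXAMPLES
+static Datum
+example_int8_initval(void)
+{
+	Datum		textInitVal = CStringGetTextDatum("0");
+
+	return GetAggInitVal(textInitVal, INT8OID);
+}
+#endif							/* PG_AGG_EXAMPLES */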
+
+void
+ExecEndAgg(AggState *node)
+{
+ PlanState *outerPlan;
+ int transno;
+ int numGroupingSets = Max(node->maxsets, 1);
+ int setno;
+
+ /*
+ * When ending a parallel worker, copy the statistics gathered by the
+ * worker back into shared memory so that it can be picked up by the main
+ * process to report in EXPLAIN ANALYZE.
+ */
+ if (node->shared_info && IsParallelWorker())
+ {
+ AggregateInstrumentation *si;
+
+ Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
+ si = &node->shared_info->sinstrument[ParallelWorkerNumber];
+ si->hash_batches_used = node->hash_batches_used;
+ si->hash_disk_used = node->hash_disk_used;
+ si->hash_mem_peak = node->hash_mem_peak;
+ }
+
+ /* Make sure we have closed any open tuplesorts */
+
+ if (node->sort_in)
+ tuplesort_end(node->sort_in);
+ if (node->sort_out)
+ tuplesort_end(node->sort_out);
+
+ hashagg_reset_spill_state(node);
+
+ if (node->hash_metacxt != NULL)
+ {
+ MemoryContextDelete(node->hash_metacxt);
+ node->hash_metacxt = NULL;
+ }
+
+ for (transno = 0; transno < node->numtrans; transno++)
+ {
+ AggStatePerTrans pertrans = &node->pertrans[transno];
+
+ for (setno = 0; setno < numGroupingSets; setno++)
+ {
+ if (pertrans->sortstates[setno])
+ tuplesort_end(pertrans->sortstates[setno]);
+ }
+ }
+
+ /* And ensure any agg shutdown callbacks have been called */
+ for (setno = 0; setno < numGroupingSets; setno++)
+ ReScanExprContext(node->aggcontexts[setno]);
+ if (node->hashcontext)
+ ReScanExprContext(node->hashcontext);
+
+ /*
+ * We don't actually free any ExprContexts here (see comment in
+ * ExecFreeExprContext), just unlinking the output one from the plan node
+ * suffices.
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /* clean up tuple table */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ outerPlan = outerPlanState(node);
+ ExecEndNode(outerPlan);
+}
+
+void
+ExecReScanAgg(AggState *node)
+{
+ ExprContext *econtext = node->ss.ps.ps_ExprContext;
+ PlanState *outerPlan = outerPlanState(node);
+ Agg *aggnode = (Agg *) node->ss.ps.plan;
+ int transno;
+ int numGroupingSets = Max(node->maxsets, 1);
+ int setno;
+
+ node->agg_done = false;
+
+ if (node->aggstrategy == AGG_HASHED)
+ {
+ /*
+ * In the hashed case, if we haven't yet built the hash table then we
+ * can just return; nothing done yet, so nothing to undo. If subnode's
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else no reason to re-scan it at all.
+ */
+ if (!node->table_filled)
+ return;
+
+ /*
+ * If we do have the hash table, and it never spilled, and the subplan
+ * does not have any parameter changes, and none of our own parameter
+ * changes affect input expressions of the aggregated functions, then
+ * we can just rescan the existing hash table; no need to build it
+ * again.
+ */
+ if (outerPlan->chgParam == NULL && !node->hash_ever_spilled &&
+ !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams))
+ {
+ ResetTupleHashIterator(node->perhash[0].hashtable,
+ &node->perhash[0].hashiter);
+ select_current_set(node, 0, true);
+ return;
+ }
+ }
+
+ /* Make sure we have closed any open tuplesorts */
+ for (transno = 0; transno < node->numtrans; transno++)
+ {
+ for (setno = 0; setno < numGroupingSets; setno++)
+ {
+ AggStatePerTrans pertrans = &node->pertrans[transno];
+
+ if (pertrans->sortstates[setno])
+ {
+ tuplesort_end(pertrans->sortstates[setno]);
+ pertrans->sortstates[setno] = NULL;
+ }
+ }
+ }
+
+ /*
+ * We don't need to ReScanExprContext the output tuple context here;
+ * ExecReScan already did it. But we do need to reset our per-grouping-set
+ * contexts, which may have transvalues stored in them. (We use rescan
+ * rather than just reset because transfns may have registered callbacks
+ * that need to be run now.) For the AGG_HASHED case, see below.
+ */
+
+ for (setno = 0; setno < numGroupingSets; setno++)
+ {
+ ReScanExprContext(node->aggcontexts[setno]);
+ }
+
+ /* Release first tuple of group, if we have made a copy */
+ if (node->grp_firstTuple != NULL)
+ {
+ heap_freetuple(node->grp_firstTuple);
+ node->grp_firstTuple = NULL;
+ }
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /* Forget current agg values */
+ MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs);
+ MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs);
+
+ /*
+ * With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of
+ * the hashcontext. This used to be an issue, but now, resetting a context
+ * automatically deletes sub-contexts too.
+ */
+ if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED)
+ {
+ hashagg_reset_spill_state(node);
+
+ node->hash_ever_spilled = false;
+ node->hash_spill_mode = false;
+ node->hash_ngroups_current = 0;
+
+ ReScanExprContext(node->hashcontext);
+ /* Rebuild an empty hash table */
+ build_hash_tables(node);
+ node->table_filled = false;
+ /* iterator will be reset when the table is filled */
+
+ hashagg_recompile_expressions(node, false, false);
+ }
+
+ if (node->aggstrategy != AGG_HASHED)
+ {
+ /*
+ * Reset the per-group state (in particular, mark transvalues null)
+ */
+ for (setno = 0; setno < numGroupingSets; setno++)
+ {
+ MemSet(node->pergroups[setno], 0,
+ sizeof(AggStatePerGroupData) * node->numaggs);
+ }
+
+ /* reset to phase 1 */
+ initialize_phase(node, 1);
+
+ node->input_done = false;
+ node->projected_set = -1;
+ }
+
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
+
+
+/***********************************************************************
+ * API exposed to aggregate functions
+ ***********************************************************************/
+
+
+/*
+ * AggCheckCallContext - test if a SQL function is being called as an aggregate
+ *
+ * The transition and/or final functions of an aggregate may want to verify
+ * that they are being called as aggregates, rather than as plain SQL
+ * functions. They should use this function to do so. The return value
+ * is nonzero if being called as an aggregate, or zero if not. (Specific
+ * nonzero values are AGG_CONTEXT_AGGREGATE or AGG_CONTEXT_WINDOW, but more
+ * values could conceivably appear in future.)
+ *
+ * If aggcontext isn't NULL, the function also stores at *aggcontext the
+ * identity of the memory context that aggregate transition values are being
+ * stored in. Note that the same aggregate call site (flinfo) may be called
+ * interleaved on different transition values in different contexts, so it's
+ * not kosher to cache aggcontext under fn_extra. It is, however, kosher to
+ * cache it in the transvalue itself (for internal-type transvalues).
+ */
+int
+AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext)
+{
+ if (fcinfo->context && IsA(fcinfo->context, AggState))
+ {
+ if (aggcontext)
+ {
+ AggState *aggstate = ((AggState *) fcinfo->context);
+ ExprContext *cxt = aggstate->curaggcontext;
+
+ *aggcontext = cxt->ecxt_per_tuple_memory;
+ }
+ return AGG_CONTEXT_AGGREGATE;
+ }
+ if (fcinfo->context && IsA(fcinfo->context, WindowAggState))
+ {
+ if (aggcontext)
+ *aggcontext = ((WindowAggState *) fcinfo->context)->curaggcontext;
+ return AGG_CONTEXT_WINDOW;
+ }
+
+ /* this is just to prevent "uninitialized variable" warnings */
+ if (aggcontext)
+ *aggcontext = NULL;
+ return 0;
+}
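+
+/*
+ * Usage sketch, not part of the executor: a hypothetical transition function
+ * for a float8-averaging aggregate, showing the intended use of
+ * AggCheckCallContext.  The MyAvgState struct, the function name and the
+ * PG_AGG_EXAMPLES guard are illustrative assumptions only.  The point is
+ * that the transition state must be allocated in the aggregate context
+ * returned here, not in the short-lived calling context.
+ */
+#ifdef PG_AGG_EXAMPLES
+typedef struct MyAvgState
+{
+	double		sum;
+	int64		count;
+} MyAvgState;
+
+Datum
+my_avg_transfn(PG_FUNCTION_ARGS)
+{
+	MemoryContext aggcontext;
+	MyAvgState *state;
+
+	/* insist on being called as an aggregate transition function */
+	if (!AggCheckCallContext(fcinfo, &aggcontext))
+		elog(ERROR, "my_avg_transfn called in non-aggregate context");
+
+	if (PG_ARGISNULL(0))
+	{
+		/* first row of the group: allocate state in the long-lived context */
+		state = (MyAvgState *) MemoryContextAllocZero(aggcontext,
+													  sizeof(MyAvgState));
+	}
+	else
+		state = (MyAvgState *) PG_GETARG_POINTER(0);
+
+	if (!PG_ARGISNULL(1))
+	{
+		state->sum += PG_GETARG_FLOAT8(1);
+		state->count++;
+	}
+
+	PG_RETURN_POINTER(state);
+}
+#endif							/* PG_AGG_EXAMPLES */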
+
+/*
+ * AggGetAggref - allow an aggregate support function to get its Aggref
+ *
+ * If the function is being called as an aggregate support function,
+ * return the Aggref node for the aggregate call. Otherwise, return NULL.
+ *
+ * Aggregates sharing the same inputs and transition functions can get
+ * merged into a single transition calculation. If the transition function
+ * calls AggGetAggref, it will get one of the Aggrefs for which it is
+ * executing. It must therefore not pay attention to the Aggref fields that
+ * relate to the final function, as those are indeterminate. But if a final
+ * function calls AggGetAggref, it will get a precise result.
+ *
+ * Note that if an aggregate is being used as a window function, this will
+ * return NULL. We could provide a similar function to return the relevant
+ * WindowFunc node in such cases, but it's not needed yet.
+ */
+Aggref *
+AggGetAggref(FunctionCallInfo fcinfo)
+{
+ if (fcinfo->context && IsA(fcinfo->context, AggState))
+ {
+ AggState *aggstate = (AggState *) fcinfo->context;
+ AggStatePerAgg curperagg;
+ AggStatePerTrans curpertrans;
+
+ /* check curperagg (valid when in a final function) */
+ curperagg = aggstate->curperagg;
+
+ if (curperagg)
+ return curperagg->aggref;
+
+ /* check curpertrans (valid when in a transition function) */
+ curpertrans = aggstate->curpertrans;
+
+ if (curpertrans)
+ return curpertrans->aggref;
+ }
+ return NULL;
+}
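+
+/*
+ * Usage sketch (hypothetical support function, illustration only): a final
+ * function that consults its Aggref to learn how many direct
+ * (non-aggregated) arguments the call supplied, as an ordered-set aggregate
+ * might.  Per the comment above, a final function, unlike a transition
+ * function, may rely on all fields of the Aggref it gets back.
+ */
+#ifdef PG_AGG_EXAMPLES
+Datum
+example_finalfn_with_aggref(PG_FUNCTION_ARGS)
+{
+	Aggref	   *aggref = AggGetAggref(fcinfo);
+	int			ndirectargs;
+
+	if (aggref == NULL)
+		elog(ERROR, "example_finalfn_with_aggref called in non-aggregate context");
+
+	/* e.g. decide how to interpret the remaining arguments */
+	ndirectargs = list_length(aggref->aggdirectargs);
+
+	PG_RETURN_INT32(ndirectargs);
+}
+#endif							/* PG_AGG_EXAMPLES */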
+
+/*
+ * AggGetTempMemoryContext - fetch short-term memory context for aggregates
+ *
+ * This is useful in agg final functions; the context returned is one that
+ * the final function can safely reset as desired. This isn't useful for
+ * transition functions, since the context returned MAY (we don't promise)
+ * be the same as the context those are called in.
+ *
+ * As above, this is currently not useful for aggs called as window functions.
+ */
+MemoryContext
+AggGetTempMemoryContext(FunctionCallInfo fcinfo)
+{
+ if (fcinfo->context && IsA(fcinfo->context, AggState))
+ {
+ AggState *aggstate = (AggState *) fcinfo->context;
+
+ return aggstate->tmpcontext->ecxt_per_tuple_memory;
+ }
+ return NULL;
+}
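+
+/*
+ * Usage sketch (hypothetical helper, illustration only): a final function
+ * can do its throwaway work in the short-term context returned here and
+ * reset that context when done, rather than leaking scratch allocations
+ * into longer-lived memory.
+ */
+#ifdef PG_AGG_EXAMPLES
+static void
+example_finalfn_scratch(FunctionCallInfo fcinfo)
+{
+	MemoryContext tmpcontext = AggGetTempMemoryContext(fcinfo);
+	MemoryContext oldcontext;
+
+	if (tmpcontext == NULL)
+		elog(ERROR, "not called as an aggregate");
+
+	oldcontext = MemoryContextSwitchTo(tmpcontext);
+	/* ... palloc scratch data and compute with it here ... */
+	MemoryContextSwitchTo(oldcontext);
+
+	/* a final function may safely discard everything it put there */
+	MemoryContextReset(tmpcontext);
+}
+#endif							/* PG_AGG_EXAMPLES */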
+
+/*
+ * AggStateIsShared - find out whether transition state is shared
+ *
+ * If the function is being called as an aggregate support function,
+ * return true if the aggregate's transition state is shared across
+ * multiple aggregates, false if it is not.
+ *
+ * Returns true if not called as an aggregate support function.
+ * This is intended as a conservative answer, ie "no you'd better not
+ * scribble on your input". In particular, will return true if the
+ * aggregate is being used as a window function, which is a scenario
+ * in which changing the transition state is a bad idea. We might
+ * want to refine the behavior for the window case in future.
+ */
+bool
+AggStateIsShared(FunctionCallInfo fcinfo)
+{
+ if (fcinfo->context && IsA(fcinfo->context, AggState))
+ {
+ AggState *aggstate = (AggState *) fcinfo->context;
+ AggStatePerAgg curperagg;
+ AggStatePerTrans curpertrans;
+
+ /* check curperagg (valid when in a final function) */
+ curperagg = aggstate->curperagg;
+
+ if (curperagg)
+ return aggstate->pertrans[curperagg->transno].aggshared;
+
+ /* check curpertrans (valid when in a transition function) */
+ curpertrans = aggstate->curpertrans;
+
+ if (curpertrans)
+ return curpertrans->aggshared;
+ }
+ return true;
+}
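+
+/*
+ * Usage sketch (hypothetical final function, illustration only): an
+ * aggregate whose transition state is a List of integers sorts that list in
+ * place only when it owns the state exclusively; when the state is shared,
+ * it sorts a private copy instead.
+ */
+#ifdef PG_AGG_EXAMPLES
+Datum
+example_intlist_min_finalfn(PG_FUNCTION_ARGS)
+{
+	List	   *state;
+
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+	state = (List *) PG_GETARG_POINTER(0);
+	if (state == NIL)
+		PG_RETURN_NULL();
+
+	/* destructive sorting is only safe on a state we own exclusively */
+	if (AggStateIsShared(fcinfo))
+		state = list_copy(state);
+	list_sort(state, list_int_cmp);
+
+	PG_RETURN_INT32(linitial_int(state));
+}
+#endif							/* PG_AGG_EXAMPLES */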
+
+/*
+ * AggRegisterCallback - register a cleanup callback for an aggregate
+ *
+ * This is useful for aggs to register shutdown callbacks, which will ensure
+ * that non-memory resources are freed. The callback will occur just before
+ * the associated aggcontext (as returned by AggCheckCallContext) is reset,
+ * either between groups or as a result of rescanning the query. The callback
+ * will NOT be called on error paths. The typical use-case is for freeing of
+ * tuplestores or tuplesorts maintained in aggcontext, or pins held by slots
+ * created by the agg functions. (The callback will not be called until after
+ * the result of the finalfn is no longer needed, so it's safe for the finalfn
+ * to return data that will be freed by the callback.)
+ *
+ * As above, this is currently not useful for aggs called as window functions.
+ */
+void
+AggRegisterCallback(FunctionCallInfo fcinfo,
+ ExprContextCallbackFunction func,
+ Datum arg)
+{
+ if (fcinfo->context && IsA(fcinfo->context, AggState))
+ {
+ AggState *aggstate = (AggState *) fcinfo->context;
+ ExprContext *cxt = aggstate->curaggcontext;
+
+ RegisterExprContextCallback(cxt, func, arg);
+
+ return;
+ }
+ elog(ERROR, "aggregate function cannot register a callback in this context");
+}
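+
+/*
+ * Usage sketch (hypothetical aggregate, illustration only): a transition
+ * function that keeps a Tuplesortstate inside its state registers a
+ * shutdown callback so the sort's resources (including any temp files) are
+ * released when the aggcontext is reset between groups or at rescan.
+ */
+#ifdef PG_AGG_EXAMPLES
+static void
+example_sortagg_shutdown(Datum arg)
+{
+	Tuplesortstate *sortstate = (Tuplesortstate *) DatumGetPointer(arg);
+
+	tuplesort_end(sortstate);
+}
+
+static void
+example_sortagg_register_cleanup(FunctionCallInfo fcinfo,
+								 Tuplesortstate *sortstate)
+{
+	/* errors out if we are not being called as an aggregate */
+	AggRegisterCallback(fcinfo,
+						example_sortagg_shutdown,
+						PointerGetDatum(sortstate));
+}
+#endif							/* PG_AGG_EXAMPLES */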
+
+
+/* ----------------------------------------------------------------
+ * Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ *		ExecAggEstimate
+ *
+ *		Estimate space required to propagate aggregate statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAggEstimate(AggState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
+ size = add_size(size, offsetof(SharedAggInfo, sinstrument));
+ shm_toc_estimate_chunk(&pcxt->estimator, size);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecAggInitializeDSM
+ *
+ * Initialize DSM space for aggregate statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = offsetof(SharedAggInfo, sinstrument)
+ + pcxt->nworkers * sizeof(AggregateInstrumentation);
+ node->shared_info = shm_toc_allocate(pcxt->toc, size);
+ /* ensure any unfilled slots will contain zeroes */
+ memset(node->shared_info, 0, size);
+ node->shared_info->num_workers = pcxt->nworkers;
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+ node->shared_info);
+}
+
+/* ----------------------------------------------------------------
+ * ExecAggInitializeWorker
+ *
+ * Attach worker to DSM space for aggregate statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt)
+{
+ node->shared_info =
+ shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+}
+
+/* ----------------------------------------------------------------
+ * ExecAggRetrieveInstrumentation
+ *
+ * Transfer aggregate statistics from DSM to private memory.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAggRetrieveInstrumentation(AggState *node)
+{
+ Size size;
+ SharedAggInfo *si;
+
+ if (node->shared_info == NULL)
+ return;
+
+ size = offsetof(SharedAggInfo, sinstrument)
+ + node->shared_info->num_workers * sizeof(AggregateInstrumentation);
+ si = palloc(size);
+ memcpy(si, node->shared_info, size);
+ node->shared_info = si;
+}
diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c
new file mode 100644
index 0000000..6a2daa6
--- /dev/null
+++ b/src/backend/executor/nodeAppend.c
@@ -0,0 +1,1186 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeAppend.c
+ * routines to handle append nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeAppend.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/* INTERFACE ROUTINES
+ * ExecInitAppend - initialize the append node
+ * ExecAppend - retrieve the next tuple from the node
+ * ExecEndAppend - shut down the append node
+ * ExecReScanAppend - rescan the append node
+ *
+ * NOTES
+ * Each append node contains a list of one or more subplans which
+ * must be iteratively processed (forwards or backwards).
+ * Tuples are retrieved by executing the 'whichplan'th subplan
+ * until the subplan stops returning tuples, at which point that
+ * plan is shut down and the next started up.
+ *
+ * Append nodes don't make use of their left and right
+ * subtrees, rather they maintain a list of subplans so
+ * a typical append node looks like this in the plan tree:
+ *
+ * ...
+ * /
+ * Append -------+------+------+--- nil
+ * / \ | | |
+ * nil nil ... ... ...
+ * subplans
+ *
+ * Append nodes are currently used for unions, and to support
+ * inheritance queries, where several relations need to be scanned.
+ * For example, in our standard person/student/employee/student-emp
+ * example, where student and employee inherit from person
+ * and student-emp inherits from student and employee, the
+ * query:
+ *
+ * select name from person
+ *
+ * generates the plan:
+ *
+ * |
+ * Append -------+-------+--------+--------+
+ * / \ | | | |
+ * nil nil Scan Scan Scan Scan
+ * | | | |
+ * person employee student student-emp
+ */
+
+#include "postgres.h"
+
+#include "executor/execAsync.h"
+#include "executor/execdebug.h"
+#include "executor/execPartition.h"
+#include "executor/nodeAppend.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/latch.h"
+
+/* Shared state for parallel-aware Append. */
+struct ParallelAppendState
+{
+ LWLock pa_lock; /* mutual exclusion to choose next subplan */
+ int pa_next_plan; /* next plan to choose by any worker */
+
+ /*
+ * pa_finished[i] should be true if no more workers should select subplan
+	 * i.  For a non-partial plan, this should be set to true as soon as a
+ * worker selects the plan; for a partial plan, it remains false until
+ * some worker executes the plan to completion.
+ */
+ bool pa_finished[FLEXIBLE_ARRAY_MEMBER];
+};
+
+#define INVALID_SUBPLAN_INDEX -1
+#define EVENT_BUFFER_SIZE 16
+
+static TupleTableSlot *ExecAppend(PlanState *pstate);
+static bool choose_next_subplan_locally(AppendState *node);
+static bool choose_next_subplan_for_leader(AppendState *node);
+static bool choose_next_subplan_for_worker(AppendState *node);
+static void mark_invalid_subplans_as_finished(AppendState *node);
+static void ExecAppendAsyncBegin(AppendState *node);
+static bool ExecAppendAsyncGetNext(AppendState *node, TupleTableSlot **result);
+static bool ExecAppendAsyncRequest(AppendState *node, TupleTableSlot **result);
+static void ExecAppendAsyncEventWait(AppendState *node);
+static void classify_matching_subplans(AppendState *node);
+
+/* ----------------------------------------------------------------
+ * ExecInitAppend
+ *
+ * Begin all of the subscans of the append node.
+ *
+ * (This is potentially wasteful, since the entire result of the
+ * append node may not be scanned, but this way all of the
+ * structures get allocated in the executor's top level memory
+ * block instead of that of the call to ExecAppend.)
+ * ----------------------------------------------------------------
+ */
+AppendState *
+ExecInitAppend(Append *node, EState *estate, int eflags)
+{
+ AppendState *appendstate = makeNode(AppendState);
+ PlanState **appendplanstates;
+ Bitmapset *validsubplans;
+ Bitmapset *asyncplans;
+ int nplans;
+ int nasyncplans;
+ int firstvalid;
+ int i,
+ j;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & EXEC_FLAG_MARK));
+
+ /*
+ * create new AppendState for our append node
+ */
+ appendstate->ps.plan = (Plan *) node;
+ appendstate->ps.state = estate;
+ appendstate->ps.ExecProcNode = ExecAppend;
+
+ /* Let choose_next_subplan_* function handle setting the first subplan */
+ appendstate->as_whichplan = INVALID_SUBPLAN_INDEX;
+ appendstate->as_syncdone = false;
+ appendstate->as_begun = false;
+
+ /* If run-time partition pruning is enabled, then set that up now */
+ if (node->part_prune_info != NULL)
+ {
+ PartitionPruneState *prunestate;
+
+ /* We may need an expression context to evaluate partition exprs */
+ ExecAssignExprContext(estate, &appendstate->ps);
+
+ /* Create the working data structure for pruning. */
+ prunestate = ExecCreatePartitionPruneState(&appendstate->ps,
+ node->part_prune_info);
+ appendstate->as_prune_state = prunestate;
+
+ /* Perform an initial partition prune, if required. */
+ if (prunestate->do_initial_prune)
+ {
+ /* Determine which subplans survive initial pruning */
+ validsubplans = ExecFindInitialMatchingSubPlans(prunestate,
+ list_length(node->appendplans));
+
+ nplans = bms_num_members(validsubplans);
+ }
+ else
+ {
+ /* We'll need to initialize all subplans */
+ nplans = list_length(node->appendplans);
+ Assert(nplans > 0);
+ validsubplans = bms_add_range(NULL, 0, nplans - 1);
+ }
+
+ /*
+ * When no run-time pruning is required and there's at least one
+ * subplan, we can fill as_valid_subplans immediately, preventing
+ * later calls to ExecFindMatchingSubPlans.
+ */
+ if (!prunestate->do_exec_prune && nplans > 0)
+ appendstate->as_valid_subplans = bms_add_range(NULL, 0, nplans - 1);
+ }
+ else
+ {
+ nplans = list_length(node->appendplans);
+
+ /*
+ * When run-time partition pruning is not enabled we can just mark all
+ * subplans as valid; they must also all be initialized.
+ */
+ Assert(nplans > 0);
+ appendstate->as_valid_subplans = validsubplans =
+ bms_add_range(NULL, 0, nplans - 1);
+ appendstate->as_prune_state = NULL;
+ }
+
+ /*
+ * Initialize result tuple type and slot.
+ */
+ ExecInitResultTupleSlotTL(&appendstate->ps, &TTSOpsVirtual);
+
+ /* node returns slots from each of its subnodes, therefore not fixed */
+ appendstate->ps.resultopsset = true;
+ appendstate->ps.resultopsfixed = false;
+
+ appendplanstates = (PlanState **) palloc(nplans *
+ sizeof(PlanState *));
+
+ /*
+ * call ExecInitNode on each of the valid plans to be executed and save
+ * the results into the appendplanstates array.
+ *
+ * While at it, find out the first valid partial plan.
+ */
+ j = 0;
+ asyncplans = NULL;
+ nasyncplans = 0;
+ firstvalid = nplans;
+ i = -1;
+ while ((i = bms_next_member(validsubplans, i)) >= 0)
+ {
+ Plan *initNode = (Plan *) list_nth(node->appendplans, i);
+
+ /*
+ * Record async subplans. When executing EvalPlanQual, we treat them
+ * as sync ones; don't do this when initializing an EvalPlanQual plan
+ * tree.
+ */
+ if (initNode->async_capable && estate->es_epq_active == NULL)
+ {
+ asyncplans = bms_add_member(asyncplans, j);
+ nasyncplans++;
+ }
+
+ /*
+ * Record the lowest appendplans index which is a valid partial plan.
+ */
+ if (i >= node->first_partial_plan && j < firstvalid)
+ firstvalid = j;
+
+ appendplanstates[j++] = ExecInitNode(initNode, estate, eflags);
+ }
+
+ appendstate->as_first_partial_plan = firstvalid;
+ appendstate->appendplans = appendplanstates;
+ appendstate->as_nplans = nplans;
+
+ /* Initialize async state */
+ appendstate->as_asyncplans = asyncplans;
+ appendstate->as_nasyncplans = nasyncplans;
+ appendstate->as_asyncrequests = NULL;
+ appendstate->as_asyncresults = NULL;
+ appendstate->as_nasyncresults = 0;
+ appendstate->as_nasyncremain = 0;
+ appendstate->as_needrequest = NULL;
+ appendstate->as_eventset = NULL;
+ appendstate->as_valid_asyncplans = NULL;
+
+ if (nasyncplans > 0)
+ {
+ appendstate->as_asyncrequests = (AsyncRequest **)
+ palloc0(nplans * sizeof(AsyncRequest *));
+
+ i = -1;
+ while ((i = bms_next_member(asyncplans, i)) >= 0)
+ {
+ AsyncRequest *areq;
+
+ areq = palloc(sizeof(AsyncRequest));
+ areq->requestor = (PlanState *) appendstate;
+ areq->requestee = appendplanstates[i];
+ areq->request_index = i;
+ areq->callback_pending = false;
+ areq->request_complete = false;
+ areq->result = NULL;
+
+ appendstate->as_asyncrequests[i] = areq;
+ }
+
+ appendstate->as_asyncresults = (TupleTableSlot **)
+ palloc0(nasyncplans * sizeof(TupleTableSlot *));
+
+ if (appendstate->as_valid_subplans != NULL)
+ classify_matching_subplans(appendstate);
+ }
+
+ /*
+ * Miscellaneous initialization
+ */
+
+ appendstate->ps.ps_ProjInfo = NULL;
+
+ /* For parallel query, this will be overridden later. */
+ appendstate->choose_next_subplan = choose_next_subplan_locally;
+
+ return appendstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecAppend
+ *
+ * Handles iteration over multiple subplans.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecAppend(PlanState *pstate)
+{
+ AppendState *node = castNode(AppendState, pstate);
+ TupleTableSlot *result;
+
+ /*
+ * If this is the first call after Init or ReScan, we need to do the
+ * initialization work.
+ */
+ if (!node->as_begun)
+ {
+ Assert(node->as_whichplan == INVALID_SUBPLAN_INDEX);
+ Assert(!node->as_syncdone);
+
+ /* Nothing to do if there are no subplans */
+ if (node->as_nplans == 0)
+ return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ /* If there are any async subplans, begin executing them. */
+ if (node->as_nasyncplans > 0)
+ ExecAppendAsyncBegin(node);
+
+ /*
+ * If no sync subplan has been chosen, we must choose one before
+ * proceeding.
+ */
+ if (!node->choose_next_subplan(node) && node->as_nasyncremain == 0)
+ return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ Assert(node->as_syncdone ||
+ (node->as_whichplan >= 0 &&
+ node->as_whichplan < node->as_nplans));
+
+ /* And we're initialized. */
+ node->as_begun = true;
+ }
+
+ for (;;)
+ {
+ PlanState *subnode;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * try to get a tuple from an async subplan if any
+ */
+ if (node->as_syncdone || !bms_is_empty(node->as_needrequest))
+ {
+ if (ExecAppendAsyncGetNext(node, &result))
+ return result;
+ Assert(!node->as_syncdone);
+ Assert(bms_is_empty(node->as_needrequest));
+ }
+
+ /*
+ * figure out which sync subplan we are currently processing
+ */
+ Assert(node->as_whichplan >= 0 && node->as_whichplan < node->as_nplans);
+ subnode = node->appendplans[node->as_whichplan];
+
+ /*
+ * get a tuple from the subplan
+ */
+ result = ExecProcNode(subnode);
+
+ if (!TupIsNull(result))
+ {
+ /*
+ * If the subplan gave us something then return it as-is. We do
+ * NOT make use of the result slot that was set up in
+ * ExecInitAppend; there's no need for it.
+ */
+ return result;
+ }
+
+ /*
+ * wait or poll for async events if any. We do this before checking
+ * for the end of iteration, because it might drain the remaining
+ * async subplans.
+ */
+ if (node->as_nasyncremain > 0)
+ ExecAppendAsyncEventWait(node);
+
+ /* choose new sync subplan; if no sync/async subplans, we're done */
+ if (!node->choose_next_subplan(node) && node->as_nasyncremain == 0)
+ return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndAppend
+ *
+ * Shuts down the subscans of the append node.
+ *
+ * Returns nothing of interest.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndAppend(AppendState *node)
+{
+ PlanState **appendplans;
+ int nplans;
+ int i;
+
+ /*
+ * get information from the node
+ */
+ appendplans = node->appendplans;
+ nplans = node->as_nplans;
+
+ /*
+ * shut down each of the subscans
+ */
+ for (i = 0; i < nplans; i++)
+ ExecEndNode(appendplans[i]);
+}
+
+void
+ExecReScanAppend(AppendState *node)
+{
+ int nasyncplans = node->as_nasyncplans;
+ int i;
+
+ /*
+ * If any PARAM_EXEC Params used in pruning expressions have changed, then
+ * we'd better unset the valid subplans so that they are reselected for
+ * the new parameter values.
+ */
+ if (node->as_prune_state &&
+ bms_overlap(node->ps.chgParam,
+ node->as_prune_state->execparamids))
+ {
+ bms_free(node->as_valid_subplans);
+ node->as_valid_subplans = NULL;
+ if (nasyncplans > 0)
+ {
+ bms_free(node->as_valid_asyncplans);
+ node->as_valid_asyncplans = NULL;
+ }
+ }
+
+ for (i = 0; i < node->as_nplans; i++)
+ {
+ PlanState *subnode = node->appendplans[i];
+
+ /*
+ * ExecReScan doesn't know about my subplans, so I have to do
+ * changed-parameter signaling myself.
+ */
+ if (node->ps.chgParam != NULL)
+ UpdateChangedParamSet(subnode, node->ps.chgParam);
+
+ /*
+		 * If chgParam of subnode is not null then the plan will be re-scanned
+		 * by the first ExecProcNode or by the first ExecAsyncRequest.
+ */
+ if (subnode->chgParam == NULL)
+ ExecReScan(subnode);
+ }
+
+ /* Reset async state */
+ if (nasyncplans > 0)
+ {
+ i = -1;
+ while ((i = bms_next_member(node->as_asyncplans, i)) >= 0)
+ {
+ AsyncRequest *areq = node->as_asyncrequests[i];
+
+ areq->callback_pending = false;
+ areq->request_complete = false;
+ areq->result = NULL;
+ }
+
+ node->as_nasyncresults = 0;
+ node->as_nasyncremain = 0;
+ bms_free(node->as_needrequest);
+ node->as_needrequest = NULL;
+ }
+
+ /* Let choose_next_subplan_* function handle setting the first subplan */
+ node->as_whichplan = INVALID_SUBPLAN_INDEX;
+ node->as_syncdone = false;
+ node->as_begun = false;
+}
+
+/* ----------------------------------------------------------------
+ * Parallel Append Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecAppendEstimate
+ *
+ * Compute the amount of space we'll need in the parallel
+ * query DSM, and inform pcxt->estimator about our needs.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAppendEstimate(AppendState *node,
+ ParallelContext *pcxt)
+{
+ node->pstate_len =
+ add_size(offsetof(ParallelAppendState, pa_finished),
+ sizeof(bool) * node->as_nplans);
+
+ shm_toc_estimate_chunk(&pcxt->estimator, node->pstate_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecAppendInitializeDSM
+ *
+ * Set up shared state for Parallel Append.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAppendInitializeDSM(AppendState *node,
+ ParallelContext *pcxt)
+{
+ ParallelAppendState *pstate;
+
+ pstate = shm_toc_allocate(pcxt->toc, node->pstate_len);
+ memset(pstate, 0, node->pstate_len);
+ LWLockInitialize(&pstate->pa_lock, LWTRANCHE_PARALLEL_APPEND);
+ shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, pstate);
+
+ node->as_pstate = pstate;
+ node->choose_next_subplan = choose_next_subplan_for_leader;
+}
+
+/* ----------------------------------------------------------------
+ * ExecAppendReInitializeDSM
+ *
+ * Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAppendReInitializeDSM(AppendState *node, ParallelContext *pcxt)
+{
+ ParallelAppendState *pstate = node->as_pstate;
+
+ pstate->pa_next_plan = 0;
+ memset(pstate->pa_finished, 0, sizeof(bool) * node->as_nplans);
+}
+
+/* ----------------------------------------------------------------
+ * ExecAppendInitializeWorker
+ *
+ * Copy relevant information from TOC into planstate, and initialize
+ * whatever is required to choose and execute the optimal subplan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAppendInitializeWorker(AppendState *node, ParallelWorkerContext *pwcxt)
+{
+ node->as_pstate = shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false);
+ node->choose_next_subplan = choose_next_subplan_for_worker;
+}
+
+/* ----------------------------------------------------------------
+ * choose_next_subplan_locally
+ *
+ * Choose next sync subplan for a non-parallel-aware Append,
+ * returning false if there are no more.
+ * ----------------------------------------------------------------
+ */
+static bool
+choose_next_subplan_locally(AppendState *node)
+{
+ int whichplan = node->as_whichplan;
+ int nextplan;
+
+ /* We should never be called when there are no subplans */
+ Assert(node->as_nplans > 0);
+
+ /* Nothing to do if syncdone */
+ if (node->as_syncdone)
+ return false;
+
+ /*
+	 * If this is the first call, have the bms member function choose the
+	 * first valid sync subplan by initializing whichplan to -1.  If there
+	 * happen to be no valid sync subplans, the bms member function will
+	 * handle that by returning a negative number, which allows us to exit
+	 * and return false.
+ */
+ if (whichplan == INVALID_SUBPLAN_INDEX)
+ {
+ if (node->as_nasyncplans > 0)
+ {
+ /* We'd have filled as_valid_subplans already */
+ Assert(node->as_valid_subplans);
+ }
+ else if (node->as_valid_subplans == NULL)
+ node->as_valid_subplans =
+ ExecFindMatchingSubPlans(node->as_prune_state);
+
+ whichplan = -1;
+ }
+
+ /* Ensure whichplan is within the expected range */
+ Assert(whichplan >= -1 && whichplan <= node->as_nplans);
+
+ if (ScanDirectionIsForward(node->ps.state->es_direction))
+ nextplan = bms_next_member(node->as_valid_subplans, whichplan);
+ else
+ nextplan = bms_prev_member(node->as_valid_subplans, whichplan);
+
+ if (nextplan < 0)
+ {
+ /* Set as_syncdone if in async mode */
+ if (node->as_nasyncplans > 0)
+ node->as_syncdone = true;
+ return false;
+ }
+
+ node->as_whichplan = nextplan;
+
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * choose_next_subplan_for_leader
+ *
+ * Try to pick a plan which doesn't commit us to doing much
+ * work locally, so that as much work as possible is done in
+ * the workers. Cheapest subplans are at the end.
+ * ----------------------------------------------------------------
+ */
+static bool
+choose_next_subplan_for_leader(AppendState *node)
+{
+ ParallelAppendState *pstate = node->as_pstate;
+
+ /* Backward scan is not supported by parallel-aware plans */
+ Assert(ScanDirectionIsForward(node->ps.state->es_direction));
+
+ /* We should never be called when there are no subplans */
+ Assert(node->as_nplans > 0);
+
+ LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE);
+
+ if (node->as_whichplan != INVALID_SUBPLAN_INDEX)
+ {
+ /* Mark just-completed subplan as finished. */
+ node->as_pstate->pa_finished[node->as_whichplan] = true;
+ }
+ else
+ {
+ /* Start with last subplan. */
+ node->as_whichplan = node->as_nplans - 1;
+
+ /*
+ * If we've yet to determine the valid subplans then do so now. If
+ * run-time pruning is disabled then the valid subplans will always be
+ * set to all subplans.
+ */
+ if (node->as_valid_subplans == NULL)
+ {
+ node->as_valid_subplans =
+ ExecFindMatchingSubPlans(node->as_prune_state);
+
+ /*
+ * Mark each invalid plan as finished to allow the loop below to
+ * select the first valid subplan.
+ */
+ mark_invalid_subplans_as_finished(node);
+ }
+ }
+
+ /* Loop until we find a subplan to execute. */
+ while (pstate->pa_finished[node->as_whichplan])
+ {
+ if (node->as_whichplan == 0)
+ {
+ pstate->pa_next_plan = INVALID_SUBPLAN_INDEX;
+ node->as_whichplan = INVALID_SUBPLAN_INDEX;
+ LWLockRelease(&pstate->pa_lock);
+ return false;
+ }
+
+ /*
+ * We needn't pay attention to as_valid_subplans here as all invalid
+ * plans have been marked as finished.
+ */
+ node->as_whichplan--;
+ }
+
+ /* If non-partial, immediately mark as finished. */
+ if (node->as_whichplan < node->as_first_partial_plan)
+ node->as_pstate->pa_finished[node->as_whichplan] = true;
+
+ LWLockRelease(&pstate->pa_lock);
+
+ return true;
+}
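+
+/*
+ * A standalone sketch of the leader's selection rule above, using a plain
+ * array of finished flags instead of the shared state and LWLock: start
+ * from the cheapest (last) subplan and walk backward, skipping anything
+ * already finished.  Names here are illustrative, not the executor's own.
+ */
+#include <stdbool.h>
+
+static int
+sketch_leader_pick(const bool *finished, int nplans)
+{
+    for (int i = nplans - 1; i >= 0; i--)
+    {
+        if (!finished[i])
+            return i;           /* leader takes the cheapest unfinished plan */
+    }
+    return -1;                  /* nothing left to do */
+}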
+
+/* ----------------------------------------------------------------
+ * choose_next_subplan_for_worker
+ *
+ * Choose next subplan for a parallel-aware Append, returning
+ * false if there are no more.
+ *
+ * We start from the first plan and advance through the list;
+ * when we get back to the end, we loop back to the first
+ * partial plan. This assigns the non-partial plans first in
+ * order of descending cost and then spreads out the workers
+ * as evenly as possible across the remaining partial plans.
+ * ----------------------------------------------------------------
+ */
+static bool
+choose_next_subplan_for_worker(AppendState *node)
+{
+ ParallelAppendState *pstate = node->as_pstate;
+
+ /* Backward scan is not supported by parallel-aware plans */
+ Assert(ScanDirectionIsForward(node->ps.state->es_direction));
+
+ /* We should never be called when there are no subplans */
+ Assert(node->as_nplans > 0);
+
+ LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE);
+
+ /* Mark just-completed subplan as finished. */
+ if (node->as_whichplan != INVALID_SUBPLAN_INDEX)
+ node->as_pstate->pa_finished[node->as_whichplan] = true;
+
+ /*
+ * If we've yet to determine the valid subplans then do so now. If
+ * run-time pruning is disabled then the valid subplans will always be set
+ * to all subplans.
+ */
+ else if (node->as_valid_subplans == NULL)
+ {
+ node->as_valid_subplans =
+ ExecFindMatchingSubPlans(node->as_prune_state);
+ mark_invalid_subplans_as_finished(node);
+ }
+
+ /* If all the plans are already done, we have nothing to do */
+ if (pstate->pa_next_plan == INVALID_SUBPLAN_INDEX)
+ {
+ LWLockRelease(&pstate->pa_lock);
+ return false;
+ }
+
+ /* Save the plan from which we are starting the search. */
+ node->as_whichplan = pstate->pa_next_plan;
+
+ /* Loop until we find a valid subplan to execute. */
+ while (pstate->pa_finished[pstate->pa_next_plan])
+ {
+ int nextplan;
+
+ nextplan = bms_next_member(node->as_valid_subplans,
+ pstate->pa_next_plan);
+ if (nextplan >= 0)
+ {
+ /* Advance to the next valid plan. */
+ pstate->pa_next_plan = nextplan;
+ }
+ else if (node->as_whichplan > node->as_first_partial_plan)
+ {
+ /*
+ * Try looping back to the first valid partial plan, if there is
+ * one. If there isn't, arrange to bail out below.
+ */
+ nextplan = bms_next_member(node->as_valid_subplans,
+ node->as_first_partial_plan - 1);
+ pstate->pa_next_plan =
+ nextplan < 0 ? node->as_whichplan : nextplan;
+ }
+ else
+ {
+ /*
+ * At last plan, and either there are no partial plans or we've
+ * tried them all. Arrange to bail out.
+ */
+ pstate->pa_next_plan = node->as_whichplan;
+ }
+
+ if (pstate->pa_next_plan == node->as_whichplan)
+ {
+ /* We've tried everything! */
+ pstate->pa_next_plan = INVALID_SUBPLAN_INDEX;
+ LWLockRelease(&pstate->pa_lock);
+ return false;
+ }
+ }
+
+ /* Pick the plan we found, and advance pa_next_plan one more time. */
+ node->as_whichplan = pstate->pa_next_plan;
+ pstate->pa_next_plan = bms_next_member(node->as_valid_subplans,
+ pstate->pa_next_plan);
+
+ /*
+ * If there are no more valid plans then try setting the next plan to the
+ * first valid partial plan.
+ */
+ if (pstate->pa_next_plan < 0)
+ {
+ int nextplan = bms_next_member(node->as_valid_subplans,
+ node->as_first_partial_plan - 1);
+
+ if (nextplan >= 0)
+ pstate->pa_next_plan = nextplan;
+ else
+ {
+ /*
+ * There are no valid partial plans, and we already chose the last
+ * non-partial plan; so flag that there's nothing more for our
+ * fellow workers to do.
+ */
+ pstate->pa_next_plan = INVALID_SUBPLAN_INDEX;
+ }
+ }
+
+ /* If non-partial, immediately mark as finished. */
+ if (node->as_whichplan < node->as_first_partial_plan)
+ node->as_pstate->pa_finished[node->as_whichplan] = true;
+
+ LWLockRelease(&pstate->pa_lock);
+
+ return true;
+}
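+
+/*
+ * A compact standalone sketch of the wrap-around search described above,
+ * over a plain array rather than the shared Bitmapset/LWLock machinery.
+ * Here -1 stands in for INVALID_SUBPLAN_INDEX, first_partial is the index
+ * of the first partial plan, and all names are illustrative only.
+ */
+#include <stdbool.h>
+
+static int
+sketch_worker_pick(bool *finished, int nplans, int first_partial,
+                   int *next_plan)
+{
+    int     i = *next_plan;
+    int     tries = 0;
+
+    if (i < 0)
+        return -1;              /* everything has already been handed out */
+
+    /* scan at most nplans slots, wrapping into the partial plans */
+    while (tries++ < nplans)
+    {
+        if (!finished[i])
+        {
+            /* hand out plan i; the next search starts just after it */
+            *next_plan = i + 1;
+            if (*next_plan >= nplans)
+                *next_plan = (first_partial < nplans) ? first_partial : -1;
+            if (i < first_partial)
+                finished[i] = true; /* non-partial plans go to one worker only */
+            return i;
+        }
+        i++;
+        if (i >= nplans)
+        {
+            if (first_partial >= nplans)
+                break;          /* no partial plans to wrap into */
+            i = first_partial;
+        }
+    }
+
+    *next_plan = -1;            /* nothing left for anyone */
+    return -1;
+}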
+
+/*
+ * mark_invalid_subplans_as_finished
+ * Marks the ParallelAppendState's pa_finished as true for each invalid
+ * subplan.
+ *
+ * This function should only be called for parallel Append with run-time
+ * pruning enabled.
+ */
+static void
+mark_invalid_subplans_as_finished(AppendState *node)
+{
+ int i;
+
+ /* Only valid to call this while in parallel Append mode */
+ Assert(node->as_pstate);
+
+ /* Shouldn't have been called when run-time pruning is not enabled */
+ Assert(node->as_prune_state);
+
+ /* Nothing to do if all plans are valid */
+ if (bms_num_members(node->as_valid_subplans) == node->as_nplans)
+ return;
+
+ /* Mark all non-valid plans as finished */
+ for (i = 0; i < node->as_nplans; i++)
+ {
+ if (!bms_is_member(i, node->as_valid_subplans))
+ node->as_pstate->pa_finished[i] = true;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * Asynchronous Append Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecAppendAsyncBegin
+ *
+ *		Begin executing designated async-capable subplans.
+ * ----------------------------------------------------------------
+ */
+static void
+ExecAppendAsyncBegin(AppendState *node)
+{
+ int i;
+
+ /* Backward scan is not supported by async-aware Appends. */
+ Assert(ScanDirectionIsForward(node->ps.state->es_direction));
+
+ /* We should never be called when there are no subplans */
+ Assert(node->as_nplans > 0);
+
+ /* We should never be called when there are no async subplans. */
+ Assert(node->as_nasyncplans > 0);
+
+ /* If we've yet to determine the valid subplans then do so now. */
+ if (node->as_valid_subplans == NULL)
+ {
+ node->as_valid_subplans =
+ ExecFindMatchingSubPlans(node->as_prune_state);
+
+ classify_matching_subplans(node);
+ }
+
+ /* Initialize state variables. */
+ node->as_syncdone = bms_is_empty(node->as_valid_subplans);
+ node->as_nasyncremain = bms_num_members(node->as_valid_asyncplans);
+
+ /* Nothing to do if there are no valid async subplans. */
+ if (node->as_nasyncremain == 0)
+ return;
+
+ /* Make a request for each of the valid async subplans. */
+ i = -1;
+ while ((i = bms_next_member(node->as_valid_asyncplans, i)) >= 0)
+ {
+ AsyncRequest *areq = node->as_asyncrequests[i];
+
+ Assert(areq->request_index == i);
+ Assert(!areq->callback_pending);
+
+ /* Do the actual work. */
+ ExecAsyncRequest(areq);
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecAppendAsyncGetNext
+ *
+ * Get the next tuple from any of the asynchronous subplans.
+ * ----------------------------------------------------------------
+ */
+static bool
+ExecAppendAsyncGetNext(AppendState *node, TupleTableSlot **result)
+{
+ *result = NULL;
+
+ /* We should never be called when there are no valid async subplans. */
+ Assert(node->as_nasyncremain > 0);
+
+ /* Request a tuple asynchronously. */
+ if (ExecAppendAsyncRequest(node, result))
+ return true;
+
+ while (node->as_nasyncremain > 0)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /* Wait or poll for async events. */
+ ExecAppendAsyncEventWait(node);
+
+ /* Request a tuple asynchronously. */
+ if (ExecAppendAsyncRequest(node, result))
+ return true;
+
+ /* Break from loop if there's any sync subplan that isn't complete. */
+ if (!node->as_syncdone)
+ break;
+ }
+
+ /*
+ * If all sync subplans are complete, we're totally done scanning the
+ * given node. Otherwise, we're done with the asynchronous stuff but must
+ * continue scanning the sync subplans.
+ */
+ if (node->as_syncdone)
+ {
+ Assert(node->as_nasyncremain == 0);
+ *result = ExecClearTuple(node->ps.ps_ResultTupleSlot);
+ return true;
+ }
+
+ return false;
+}
+
+/* ----------------------------------------------------------------
+ * ExecAppendAsyncRequest
+ *
+ * Request a tuple asynchronously.
+ * ----------------------------------------------------------------
+ */
+static bool
+ExecAppendAsyncRequest(AppendState *node, TupleTableSlot **result)
+{
+ Bitmapset *needrequest;
+ int i;
+
+ /* Nothing to do if there are no async subplans needing a new request. */
+ if (bms_is_empty(node->as_needrequest))
+ {
+ Assert(node->as_nasyncresults == 0);
+ return false;
+ }
+
+ /*
+ * If there are any asynchronously-generated results that have not yet
+ * been returned, we have nothing to do; just return one of them.
+ */
+ if (node->as_nasyncresults > 0)
+ {
+ --node->as_nasyncresults;
+ *result = node->as_asyncresults[node->as_nasyncresults];
+ return true;
+ }
+
+ /* Make a new request for each of the async subplans that need it. */
+ needrequest = node->as_needrequest;
+ node->as_needrequest = NULL;
+ i = -1;
+ while ((i = bms_next_member(needrequest, i)) >= 0)
+ {
+ AsyncRequest *areq = node->as_asyncrequests[i];
+
+ /* Do the actual work. */
+ ExecAsyncRequest(areq);
+ }
+ bms_free(needrequest);
+
+ /* Return one of the asynchronously-generated results if any. */
+ if (node->as_nasyncresults > 0)
+ {
+ --node->as_nasyncresults;
+ *result = node->as_asyncresults[node->as_nasyncresults];
+ return true;
+ }
+
+ return false;
+}
+
+/* ----------------------------------------------------------------
+ * ExecAppendAsyncEventWait
+ *
+ * Wait or poll for file descriptor events and fire callbacks.
+ * ----------------------------------------------------------------
+ */
+static void
+ExecAppendAsyncEventWait(AppendState *node)
+{
+ int nevents = node->as_nasyncplans + 1;
+ long timeout = node->as_syncdone ? -1 : 0;
+ WaitEvent occurred_event[EVENT_BUFFER_SIZE];
+ int noccurred;
+ int i;
+
+ /* We should never be called when there are no valid async subplans. */
+ Assert(node->as_nasyncremain > 0);
+
+ node->as_eventset = CreateWaitEventSet(CurrentMemoryContext, nevents);
+ AddWaitEventToSet(node->as_eventset, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
+
+ /* Give each waiting subplan a chance to add an event. */
+ i = -1;
+ while ((i = bms_next_member(node->as_asyncplans, i)) >= 0)
+ {
+ AsyncRequest *areq = node->as_asyncrequests[i];
+
+ if (areq->callback_pending)
+ ExecAsyncConfigureWait(areq);
+ }
+
+ /*
+ * No need for further processing if there are no configured events other
+ * than the postmaster death event.
+ */
+ if (GetNumRegisteredWaitEvents(node->as_eventset) == 1)
+ {
+ FreeWaitEventSet(node->as_eventset);
+ node->as_eventset = NULL;
+ return;
+ }
+
+ /* We wait on at most EVENT_BUFFER_SIZE events. */
+ if (nevents > EVENT_BUFFER_SIZE)
+ nevents = EVENT_BUFFER_SIZE;
+
+ /*
+ * If the timeout is -1, wait until at least one event occurs. If the
+ * timeout is 0, poll for events, but do not wait at all.
+ */
+ noccurred = WaitEventSetWait(node->as_eventset, timeout, occurred_event,
+ nevents, WAIT_EVENT_APPEND_READY);
+ FreeWaitEventSet(node->as_eventset);
+ node->as_eventset = NULL;
+ if (noccurred == 0)
+ return;
+
+ /* Deliver notifications. */
+ for (i = 0; i < noccurred; i++)
+ {
+ WaitEvent *w = &occurred_event[i];
+
+ /*
+ * Each waiting subplan should have registered its wait event with
+ * user_data pointing back to its AsyncRequest.
+ */
+ if ((w->events & WL_SOCKET_READABLE) != 0)
+ {
+ AsyncRequest *areq = (AsyncRequest *) w->user_data;
+
+ if (areq->callback_pending)
+ {
+ /*
+ * Mark it as no longer needing a callback. We must do this
+ * before dispatching the callback in case the callback resets
+ * the flag.
+ */
+ areq->callback_pending = false;
+
+ /* Do the actual work. */
+ ExecAsyncNotify(areq);
+ }
+ }
+ }
+}
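+
+/*
+ * The timeout convention used above (-1 = block until an event, 0 = poll
+ * and return immediately) mirrors poll(2).  A minimal standalone
+ * illustration on a single hypothetical descriptor; none of these names
+ * belong to the executor.
+ */
+#include <poll.h>
+#include <stdbool.h>
+
+static int
+sketch_wait_for_fd(int fd, bool have_sync_work)
+{
+    struct pollfd pfd = {.fd = fd, .events = POLLIN};
+    /* block indefinitely only if there is no synchronous work to fall back on */
+    int         timeout = have_sync_work ? 0 : -1;
+
+    return poll(&pfd, 1, timeout);  /* >0 readable, 0 timed out, <0 error */
+}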
+
+/* ----------------------------------------------------------------
+ * ExecAsyncAppendResponse
+ *
+ * Receive a response from an asynchronous request we made.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncAppendResponse(AsyncRequest *areq)
+{
+ AppendState *node = (AppendState *) areq->requestor;
+ TupleTableSlot *slot = areq->result;
+
+ /* The result should be a TupleTableSlot or NULL. */
+ Assert(slot == NULL || IsA(slot, TupleTableSlot));
+
+ /* Nothing to do if the request is pending. */
+ if (!areq->request_complete)
+ {
+ /* The request would have been pending for a callback. */
+ Assert(areq->callback_pending);
+ return;
+ }
+
+ /* If the result is NULL or an empty slot, there's nothing more to do. */
+ if (TupIsNull(slot))
+ {
+ /* The ending subplan wouldn't have been pending for a callback. */
+ Assert(!areq->callback_pending);
+ --node->as_nasyncremain;
+ return;
+ }
+
+ /* Save result so we can return it. */
+ Assert(node->as_nasyncresults < node->as_nasyncplans);
+ node->as_asyncresults[node->as_nasyncresults++] = slot;
+
+ /*
+ * Mark the subplan that returned a result as ready for a new request. We
+ * don't launch another one here immediately because it might complete.
+ */
+ node->as_needrequest = bms_add_member(node->as_needrequest,
+ areq->request_index);
+}
+
+/* ----------------------------------------------------------------
+ * classify_matching_subplans
+ *
+ * Classify the node's as_valid_subplans into sync ones and
+ * async ones, adjust it to contain sync ones only, and save
+ * async ones in the node's as_valid_asyncplans.
+ * ----------------------------------------------------------------
+ */
+static void
+classify_matching_subplans(AppendState *node)
+{
+ Bitmapset *valid_asyncplans;
+
+ Assert(node->as_valid_asyncplans == NULL);
+
+ /* Nothing to do if there are no valid subplans. */
+ if (bms_is_empty(node->as_valid_subplans))
+ {
+ node->as_syncdone = true;
+ node->as_nasyncremain = 0;
+ return;
+ }
+
+ /* Nothing to do if there are no valid async subplans. */
+ if (!bms_overlap(node->as_valid_subplans, node->as_asyncplans))
+ {
+ node->as_nasyncremain = 0;
+ return;
+ }
+
+ /* Get valid async subplans. */
+ valid_asyncplans = bms_copy(node->as_asyncplans);
+ valid_asyncplans = bms_int_members(valid_asyncplans,
+ node->as_valid_subplans);
+
+ /* Adjust the valid subplans to contain sync subplans only. */
+ node->as_valid_subplans = bms_del_members(node->as_valid_subplans,
+ valid_asyncplans);
+
+ /* Save valid async subplans. */
+ node->as_valid_asyncplans = valid_asyncplans;
+}
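+
+/*
+ * The split performed above, sketched with plain 64-bit masks instead of
+ * Bitmapsets: the async part is the intersection of the valid set with the
+ * async set, and the sync part is whatever remains.  Purely illustrative;
+ * none of these names exist in the executor.
+ */
+#include <stdint.h>
+
+static void
+sketch_classify(uint64_t valid, uint64_t async_mask,
+                uint64_t *valid_sync, uint64_t *valid_async)
+{
+    *valid_async = valid & async_mask;  /* like bms_int_members() */
+    *valid_sync = valid & ~async_mask;  /* like bms_del_members() */
+}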
diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c
new file mode 100644
index 0000000..a8d7b1e
--- /dev/null
+++ b/src/backend/executor/nodeBitmapAnd.c
@@ -0,0 +1,223 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeBitmapAnd.c
+ * routines to handle BitmapAnd nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeBitmapAnd.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/* INTERFACE ROUTINES
+ * ExecInitBitmapAnd - initialize the BitmapAnd node
+ * MultiExecBitmapAnd - retrieve the result bitmap from the node
+ * ExecEndBitmapAnd - shut down the BitmapAnd node
+ * ExecReScanBitmapAnd - rescan the BitmapAnd node
+ *
+ * NOTES
+ * BitmapAnd nodes don't make use of their left and right
+ * subtrees, rather they maintain a list of subplans,
+ * much like Append nodes. The logic is much simpler than
+ * Append, however, since we needn't cope with forward/backward
+ * execution.
+ */
+
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeBitmapAnd.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecBitmapAnd
+ *
+ * stub for pro forma compliance
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecBitmapAnd(PlanState *pstate)
+{
+ elog(ERROR, "BitmapAnd node does not support ExecProcNode call convention");
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitBitmapAnd
+ *
+ * Begin all of the subscans of the BitmapAnd node.
+ * ----------------------------------------------------------------
+ */
+BitmapAndState *
+ExecInitBitmapAnd(BitmapAnd *node, EState *estate, int eflags)
+{
+ BitmapAndState *bitmapandstate = makeNode(BitmapAndState);
+ PlanState **bitmapplanstates;
+ int nplans;
+ int i;
+ ListCell *l;
+ Plan *initNode;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * Set up empty vector of subplan states
+ */
+ nplans = list_length(node->bitmapplans);
+
+ bitmapplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *));
+
+ /*
+ * create new BitmapAndState for our BitmapAnd node
+ */
+ bitmapandstate->ps.plan = (Plan *) node;
+ bitmapandstate->ps.state = estate;
+ bitmapandstate->ps.ExecProcNode = ExecBitmapAnd;
+ bitmapandstate->bitmapplans = bitmapplanstates;
+ bitmapandstate->nplans = nplans;
+
+ /*
+ * call ExecInitNode on each of the plans to be executed and save the
+ * results into the array "bitmapplanstates".
+ */
+ i = 0;
+ foreach(l, node->bitmapplans)
+ {
+ initNode = (Plan *) lfirst(l);
+ bitmapplanstates[i] = ExecInitNode(initNode, estate, eflags);
+ i++;
+ }
+
+ /*
+ * Miscellaneous initialization
+ *
+ * BitmapAnd plans don't have expression contexts because they never call
+ * ExecQual or ExecProject. They don't need any tuple slots either.
+ */
+
+ return bitmapandstate;
+}
+
+/* ----------------------------------------------------------------
+ * MultiExecBitmapAnd
+ * ----------------------------------------------------------------
+ */
+Node *
+MultiExecBitmapAnd(BitmapAndState *node)
+{
+ PlanState **bitmapplans;
+ int nplans;
+ int i;
+ TIDBitmap *result = NULL;
+
+ /* must provide our own instrumentation support */
+ if (node->ps.instrument)
+ InstrStartNode(node->ps.instrument);
+
+ /*
+ * get information from the node
+ */
+ bitmapplans = node->bitmapplans;
+ nplans = node->nplans;
+
+ /*
+ * Scan all the subplans and AND their result bitmaps
+ */
+ for (i = 0; i < nplans; i++)
+ {
+ PlanState *subnode = bitmapplans[i];
+ TIDBitmap *subresult;
+
+ subresult = (TIDBitmap *) MultiExecProcNode(subnode);
+
+ if (!subresult || !IsA(subresult, TIDBitmap))
+ elog(ERROR, "unrecognized result from subplan");
+
+ if (result == NULL)
+ result = subresult; /* first subplan */
+ else
+ {
+ tbm_intersect(result, subresult);
+ tbm_free(subresult);
+ }
+
+ /*
+ * If at any stage we have a completely empty bitmap, we can fall out
+ * without evaluating the remaining subplans, since ANDing them can no
+ * longer change the result. (Note: the fact that indxpath.c orders
+ * the subplans by selectivity should make this case more likely to
+ * occur.)
+ */
+ if (tbm_is_empty(result))
+ break;
+ }
+
+ if (result == NULL)
+ elog(ERROR, "BitmapAnd doesn't support zero inputs");
+
+ /* must provide our own instrumentation support */
+ if (node->ps.instrument)
+ InstrStopNode(node->ps.instrument, 0 /* XXX */ );
+
+ return (Node *) result;
+}
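+
+/*
+ * A standalone sketch of the AND-with-early-exit loop above, over flat
+ * arrays of bitmap words rather than TIDBitmaps.  Assumes ninputs >= 1;
+ * all names are illustrative only.  Returns true if the intersection is
+ * non-empty.
+ */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+static bool
+sketch_bitmap_and(uint64_t *result, uint64_t *inputs[], int ninputs,
+                  size_t nwords)
+{
+    for (int i = 0; i < ninputs; i++)
+    {
+        bool        any = false;
+
+        for (size_t w = 0; w < nwords; w++)
+        {
+            /* first input seeds the result; the rest are ANDed into it */
+            result[w] = (i == 0) ? inputs[0][w] : (result[w] & inputs[i][w]);
+            if (result[w] != 0)
+                any = true;
+        }
+
+        /*
+         * Once the intersection is empty, evaluating further inputs cannot
+         * change the outcome, so stop early -- the same shortcut as above.
+         */
+        if (!any)
+            return false;
+    }
+    return true;
+}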
+
+/* ----------------------------------------------------------------
+ * ExecEndBitmapAnd
+ *
+ * Shuts down the subscans of the BitmapAnd node.
+ *
+ * Returns nothing of interest.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndBitmapAnd(BitmapAndState *node)
+{
+ PlanState **bitmapplans;
+ int nplans;
+ int i;
+
+ /*
+ * get information from the node
+ */
+ bitmapplans = node->bitmapplans;
+ nplans = node->nplans;
+
+ /*
+ * shut down each of the subscans (that we've initialized)
+ */
+ for (i = 0; i < nplans; i++)
+ {
+ if (bitmapplans[i])
+ ExecEndNode(bitmapplans[i]);
+ }
+}
+
+void
+ExecReScanBitmapAnd(BitmapAndState *node)
+{
+ int i;
+
+ for (i = 0; i < node->nplans; i++)
+ {
+ PlanState *subnode = node->bitmapplans[i];
+
+ /*
+ * ExecReScan doesn't know about my subplans, so I have to do
+ * changed-parameter signaling myself.
+ */
+ if (node->ps.chgParam != NULL)
+ UpdateChangedParamSet(subnode, node->ps.chgParam);
+
+ /*
+		 * If chgParam of subnode is not null then the plan will be
+		 * re-scanned by the first ExecProcNode.
+ */
+ if (subnode->chgParam == NULL)
+ ExecReScan(subnode);
+ }
+}
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
new file mode 100644
index 0000000..2db1914
--- /dev/null
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -0,0 +1,954 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeBitmapHeapscan.c
+ * Routines to support bitmapped scans of relations
+ *
+ * NOTE: it is critical that this plan type only be used with MVCC-compliant
+ * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
+ * special snapshots). The reason is that since index and heap scans are
+ * decoupled, there can be no assurance that the index tuple prompting a
+ * visit to a particular heap TID still exists when the visit is made.
+ * Therefore the tuple might not exist anymore either (which is OK because
+ * heap_fetch will cope) --- but worse, the tuple slot could have been
+ * re-used for a newer tuple. With an MVCC snapshot the newer tuple is
+ * certain to fail the time qual and so it will not be mistakenly returned,
+ * but with anything else we might return a tuple that doesn't meet the
+ * required index qual conditions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeBitmapHeapscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecBitmapHeapScan scans a relation using bitmap info
+ * ExecBitmapHeapNext workhorse for above
+ * ExecInitBitmapHeapScan creates and initializes state info.
+ * ExecReScanBitmapHeapScan prepares to rescan the plan.
+ * ExecEndBitmapHeapScan releases all storage.
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/transam.h"
+#include "access/visibilitymap.h"
+#include "executor/execdebug.h"
+#include "executor/nodeBitmapHeapscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/predicate.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "utils/spccache.h"
+
+static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
+static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
+static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
+ TBMIterateResult *tbmres);
+static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
+static inline void BitmapPrefetch(BitmapHeapScanState *node,
+ TableScanDesc scan);
+static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);
+
+
+/* ----------------------------------------------------------------
+ * BitmapHeapNext
+ *
+ * Retrieve next tuple from the BitmapHeapScan node's currentRelation
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+BitmapHeapNext(BitmapHeapScanState *node)
+{
+ ExprContext *econtext;
+ TableScanDesc scan;
+ TIDBitmap *tbm;
+ TBMIterator *tbmiterator = NULL;
+ TBMSharedIterator *shared_tbmiterator = NULL;
+ TBMIterateResult *tbmres;
+ TupleTableSlot *slot;
+ ParallelBitmapHeapState *pstate = node->pstate;
+ dsa_area *dsa = node->ss.ps.state->es_query_dsa;
+
+ /*
+ * extract necessary information from index scan node
+ */
+ econtext = node->ss.ps.ps_ExprContext;
+ slot = node->ss.ss_ScanTupleSlot;
+ scan = node->ss.ss_currentScanDesc;
+ tbm = node->tbm;
+ if (pstate == NULL)
+ tbmiterator = node->tbmiterator;
+ else
+ shared_tbmiterator = node->shared_tbmiterator;
+ tbmres = node->tbmres;
+
+ /*
+ * If we haven't yet performed the underlying index scan, do it, and begin
+ * the iteration over the bitmap.
+ *
+ * For prefetching, we use *two* iterators, one for the pages we are
+ * actually scanning and another that runs ahead of the first for
+ * prefetching. node->prefetch_pages tracks exactly how many pages ahead
+ * the prefetch iterator is. Also, node->prefetch_target tracks the
+ * desired prefetch distance, which starts small and increases up to the
+ * node->prefetch_maximum. This is to avoid doing a lot of prefetching in
+ * a scan that stops after a few tuples because of a LIMIT.
+ */
+ if (!node->initialized)
+ {
+ if (!pstate)
+ {
+ tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+
+ if (!tbm || !IsA(tbm, TIDBitmap))
+ elog(ERROR, "unrecognized result from subplan");
+
+ node->tbm = tbm;
+ node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
+ node->tbmres = tbmres = NULL;
+
+#ifdef USE_PREFETCH
+ if (node->prefetch_maximum > 0)
+ {
+ node->prefetch_iterator = tbm_begin_iterate(tbm);
+ node->prefetch_pages = 0;
+ node->prefetch_target = -1;
+ }
+#endif /* USE_PREFETCH */
+ }
+ else
+ {
+ /*
+ * The leader will immediately come out of the function, but
+			 * others will be blocked until the leader populates the TBM and wakes
+ * them up.
+ */
+ if (BitmapShouldInitializeSharedState(pstate))
+ {
+ tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+ if (!tbm || !IsA(tbm, TIDBitmap))
+ elog(ERROR, "unrecognized result from subplan");
+
+ node->tbm = tbm;
+
+ /*
+ * Prepare to iterate over the TBM. This will return the
+ * dsa_pointer of the iterator state which will be used by
+ * multiple processes to iterate jointly.
+ */
+ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
+#ifdef USE_PREFETCH
+ if (node->prefetch_maximum > 0)
+ {
+ pstate->prefetch_iterator =
+ tbm_prepare_shared_iterate(tbm);
+
+ /*
+					 * We don't need the mutex here as we haven't yet woken up
+ * others.
+ */
+ pstate->prefetch_pages = 0;
+ pstate->prefetch_target = -1;
+ }
+#endif
+
+ /* We have initialized the shared state so wake up others. */
+ BitmapDoneInitializingSharedState(pstate);
+ }
+
+ /* Allocate a private iterator and attach the shared state to it */
+ node->shared_tbmiterator = shared_tbmiterator =
+ tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
+ node->tbmres = tbmres = NULL;
+
+#ifdef USE_PREFETCH
+ if (node->prefetch_maximum > 0)
+ {
+ node->shared_prefetch_iterator =
+ tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
+ }
+#endif /* USE_PREFETCH */
+ }
+ node->initialized = true;
+ }
+
+ for (;;)
+ {
+ bool skip_fetch;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Get next page of results if needed
+ */
+ if (tbmres == NULL)
+ {
+ if (!pstate)
+ node->tbmres = tbmres = tbm_iterate(tbmiterator);
+ else
+ node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
+ if (tbmres == NULL)
+ {
+ /* no more entries in the bitmap */
+ break;
+ }
+
+ BitmapAdjustPrefetchIterator(node, tbmres);
+
+ /*
+ * We can skip fetching the heap page if we don't need any fields
+ * from the heap, and the bitmap entries don't need rechecking,
+ * and all tuples on the page are visible to our transaction.
+ *
+			 * XXX: It's a layering violation that we do these checks above
+			 * tableam; they should probably be moved below it at some point.
+ */
+ skip_fetch = (node->can_skip_fetch &&
+ !tbmres->recheck &&
+ VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+ tbmres->blockno,
+ &node->vmbuffer));
+
+ if (skip_fetch)
+ {
+ /* can't be lossy in the skip_fetch case */
+ Assert(tbmres->ntuples >= 0);
+
+ /*
+ * The number of tuples on this page is put into
+ * node->return_empty_tuples.
+ */
+ node->return_empty_tuples = tbmres->ntuples;
+ }
+ else if (!table_scan_bitmap_next_block(scan, tbmres))
+ {
+ /* AM doesn't think this block is valid, skip */
+ continue;
+ }
+
+ if (tbmres->ntuples >= 0)
+ node->exact_pages++;
+ else
+ node->lossy_pages++;
+
+ /* Adjust the prefetch target */
+ BitmapAdjustPrefetchTarget(node);
+ }
+ else
+ {
+ /*
+ * Continuing in previously obtained page.
+ */
+
+#ifdef USE_PREFETCH
+
+ /*
+ * Try to prefetch at least a few pages even before we get to the
+ * second page if we don't stop reading after the first tuple.
+ */
+ if (!pstate)
+ {
+ if (node->prefetch_target < node->prefetch_maximum)
+ node->prefetch_target++;
+ }
+ else if (pstate->prefetch_target < node->prefetch_maximum)
+ {
+ /* take spinlock while updating shared state */
+ SpinLockAcquire(&pstate->mutex);
+ if (pstate->prefetch_target < node->prefetch_maximum)
+ pstate->prefetch_target++;
+ SpinLockRelease(&pstate->mutex);
+ }
+#endif /* USE_PREFETCH */
+ }
+
+ /*
+ * We issue prefetch requests *after* fetching the current page to try
+ * to avoid having prefetching interfere with the main I/O. Also, this
+ * should happen only when we have determined there is still something
+ * to do on the current page, else we may uselessly prefetch the same
+ * page we are just about to request for real.
+ *
+		 * XXX: It's a layering violation that we do these checks above
+		 * tableam; they should probably be moved below it at some point.
+ */
+ BitmapPrefetch(node, scan);
+
+ if (node->return_empty_tuples > 0)
+ {
+ /*
+ * If we don't have to fetch the tuple, just return nulls.
+ */
+ ExecStoreAllNullTuple(slot);
+
+ if (--node->return_empty_tuples == 0)
+ {
+ /* no more tuples to return in the next round */
+ node->tbmres = tbmres = NULL;
+ }
+ }
+ else
+ {
+ /*
+ * Attempt to fetch tuple from AM.
+ */
+ if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
+ {
+ /* nothing more to look at on this page */
+ node->tbmres = tbmres = NULL;
+ continue;
+ }
+
+ /*
+ * If we are using lossy info, we have to recheck the qual
+ * conditions at every tuple.
+ */
+ if (tbmres->recheck)
+ {
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->bitmapqualorig, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ ExecClearTuple(slot);
+ continue;
+ }
+ }
+ }
+
+ /* OK to return this tuple */
+ return slot;
+ }
+
+ /*
+	 * if we get here it means we are at the end of the scan.
+ */
+ return ExecClearTuple(slot);
+}
+
+/*
+ * BitmapDoneInitializingSharedState - Shared state is initialized
+ *
+ * By this time the leader has already populated the TBM and initialized the
+ * shared state so wake up other processes.
+ */
+static inline void
+BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
+{
+ SpinLockAcquire(&pstate->mutex);
+ pstate->state = BM_FINISHED;
+ SpinLockRelease(&pstate->mutex);
+ ConditionVariableBroadcast(&pstate->cv);
+}
+
+/*
+ * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
+ */
+static inline void
+BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
+ TBMIterateResult *tbmres)
+{
+#ifdef USE_PREFETCH
+ ParallelBitmapHeapState *pstate = node->pstate;
+
+ if (pstate == NULL)
+ {
+ TBMIterator *prefetch_iterator = node->prefetch_iterator;
+
+ if (node->prefetch_pages > 0)
+ {
+ /* The main iterator has closed the distance by one page */
+ node->prefetch_pages--;
+ }
+ else if (prefetch_iterator)
+ {
+ /* Do not let the prefetch iterator get behind the main one */
+ TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+
+ if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
+ elog(ERROR, "prefetch and main iterators are out of sync");
+ }
+ return;
+ }
+
+ if (node->prefetch_maximum > 0)
+ {
+ TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
+
+ SpinLockAcquire(&pstate->mutex);
+ if (pstate->prefetch_pages > 0)
+ {
+ pstate->prefetch_pages--;
+ SpinLockRelease(&pstate->mutex);
+ }
+ else
+ {
+ /* Release the mutex before iterating */
+ SpinLockRelease(&pstate->mutex);
+
+ /*
+			 * In shared mode, we cannot ensure that the current blockno of
+			 * the main iterator and that of the prefetch iterator are the
+			 * same.  It's possible that whatever blockno we are prefetching
+			 * will be processed by another process.  Therefore, we don't
+			 * validate the blockno here as we do in the non-parallel case.
+ */
+ if (prefetch_iterator)
+ tbm_shared_iterate(prefetch_iterator);
+ }
+ }
+#endif /* USE_PREFETCH */
+}
+
+/*
+ * BitmapAdjustPrefetchTarget - Adjust the prefetch target
+ *
+ * Increase prefetch target if it's not yet at the max. Note that
+ * we will increase it to zero after fetching the very first
+ * page/tuple, then to one after the second tuple is fetched, then
+ * it doubles as later pages are fetched.
+ */
+static inline void
+BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
+{
+#ifdef USE_PREFETCH
+ ParallelBitmapHeapState *pstate = node->pstate;
+
+ if (pstate == NULL)
+ {
+ if (node->prefetch_target >= node->prefetch_maximum)
+ /* don't increase any further */ ;
+ else if (node->prefetch_target >= node->prefetch_maximum / 2)
+ node->prefetch_target = node->prefetch_maximum;
+ else if (node->prefetch_target > 0)
+ node->prefetch_target *= 2;
+ else
+ node->prefetch_target++;
+ return;
+ }
+
+ /* Do an unlocked check first to save spinlock acquisitions. */
+ if (pstate->prefetch_target < node->prefetch_maximum)
+ {
+ SpinLockAcquire(&pstate->mutex);
+ if (pstate->prefetch_target >= node->prefetch_maximum)
+ /* don't increase any further */ ;
+ else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
+ pstate->prefetch_target = node->prefetch_maximum;
+ else if (pstate->prefetch_target > 0)
+ pstate->prefetch_target *= 2;
+ else
+ pstate->prefetch_target++;
+ SpinLockRelease(&pstate->mutex);
+ }
+#endif /* USE_PREFETCH */
+}
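+
+/*
+ * The ramp-up rule above in isolation: starting from -1 the target climbs
+ * to 0, then 1, then doubles, and jumps straight to the maximum once it
+ * reaches half of it.  Illustrative only; not the executor's own helper.
+ */
+static int
+sketch_ramp_prefetch_target(int target, int maximum)
+{
+    if (target >= maximum)
+        return target;          /* already at the cap */
+    if (target >= maximum / 2)
+        return maximum;
+    if (target > 0)
+        return target * 2;
+    return target + 1;          /* -1 -> 0 -> 1 */
+}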
+
+/*
+ * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
+ */
+static inline void
+BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
+{
+#ifdef USE_PREFETCH
+ ParallelBitmapHeapState *pstate = node->pstate;
+
+ if (pstate == NULL)
+ {
+ TBMIterator *prefetch_iterator = node->prefetch_iterator;
+
+ if (prefetch_iterator)
+ {
+ while (node->prefetch_pages < node->prefetch_target)
+ {
+ TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+ bool skip_fetch;
+
+ if (tbmpre == NULL)
+ {
+ /* No more pages to prefetch */
+ tbm_end_iterate(prefetch_iterator);
+ node->prefetch_iterator = NULL;
+ break;
+ }
+ node->prefetch_pages++;
+
+ /*
+ * If we expect not to have to actually read this heap page,
+ * skip this prefetch call, but continue to run the prefetch
+ * logic normally. (Would it be better not to increment
+ * prefetch_pages?)
+ *
+ * This depends on the assumption that the index AM will
+ * report the same recheck flag for this future heap page as
+					 * it did for the current heap page, which is not a certainty
+ * but is true in many cases.
+ */
+ skip_fetch = (node->can_skip_fetch &&
+ (node->tbmres ? !node->tbmres->recheck : false) &&
+ VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+ tbmpre->blockno,
+ &node->pvmbuffer));
+
+ if (!skip_fetch)
+ PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
+ }
+ }
+
+ return;
+ }
+
+ if (pstate->prefetch_pages < pstate->prefetch_target)
+ {
+ TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
+
+ if (prefetch_iterator)
+ {
+ while (1)
+ {
+ TBMIterateResult *tbmpre;
+ bool do_prefetch = false;
+ bool skip_fetch;
+
+ /*
+ * Recheck under the mutex. If some other process has already
+				 * done enough prefetching then we need not do anything.
+ */
+ SpinLockAcquire(&pstate->mutex);
+ if (pstate->prefetch_pages < pstate->prefetch_target)
+ {
+ pstate->prefetch_pages++;
+ do_prefetch = true;
+ }
+ SpinLockRelease(&pstate->mutex);
+
+ if (!do_prefetch)
+ return;
+
+ tbmpre = tbm_shared_iterate(prefetch_iterator);
+ if (tbmpre == NULL)
+ {
+ /* No more pages to prefetch */
+ tbm_end_shared_iterate(prefetch_iterator);
+ node->shared_prefetch_iterator = NULL;
+ break;
+ }
+
+ /* As above, skip prefetch if we expect not to need page */
+ skip_fetch = (node->can_skip_fetch &&
+ (node->tbmres ? !node->tbmres->recheck : false) &&
+ VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+ tbmpre->blockno,
+ &node->pvmbuffer));
+
+ if (!skip_fetch)
+ PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
+ }
+ }
+ }
+#endif /* USE_PREFETCH */
+}
+
+/*
+ * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
+{
+ ExprContext *econtext;
+
+ /*
+ * extract necessary information from index scan node
+ */
+ econtext = node->ss.ps.ps_ExprContext;
+
+ /* Does the tuple meet the original qual conditions? */
+ econtext->ecxt_scantuple = slot;
+ return ExecQualAndReset(node->bitmapqualorig, econtext);
+}
+
+/* ----------------------------------------------------------------
+ * ExecBitmapHeapScan(node)
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecBitmapHeapScan(PlanState *pstate)
+{
+ BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) BitmapHeapNext,
+ (ExecScanRecheckMtd) BitmapHeapRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanBitmapHeapScan(node)
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ /* rescan to release any page pin */
+ table_rescan(node->ss.ss_currentScanDesc, NULL);
+
+ /* release bitmaps and buffers if any */
+ if (node->tbmiterator)
+ tbm_end_iterate(node->tbmiterator);
+ if (node->prefetch_iterator)
+ tbm_end_iterate(node->prefetch_iterator);
+ if (node->shared_tbmiterator)
+ tbm_end_shared_iterate(node->shared_tbmiterator);
+ if (node->shared_prefetch_iterator)
+ tbm_end_shared_iterate(node->shared_prefetch_iterator);
+ if (node->tbm)
+ tbm_free(node->tbm);
+ if (node->vmbuffer != InvalidBuffer)
+ ReleaseBuffer(node->vmbuffer);
+ if (node->pvmbuffer != InvalidBuffer)
+ ReleaseBuffer(node->pvmbuffer);
+ node->tbm = NULL;
+ node->tbmiterator = NULL;
+ node->tbmres = NULL;
+ node->prefetch_iterator = NULL;
+ node->initialized = false;
+ node->shared_tbmiterator = NULL;
+ node->shared_prefetch_iterator = NULL;
+ node->vmbuffer = InvalidBuffer;
+ node->pvmbuffer = InvalidBuffer;
+
+ ExecScanReScan(&node->ss);
+
+ /*
+	 * if chgParam of subnode is not null then the plan will be
+	 * re-scanned by the first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndBitmapHeapScan
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndBitmapHeapScan(BitmapHeapScanState *node)
+{
+ TableScanDesc scanDesc;
+
+ /*
+ * extract information from the node
+ */
+ scanDesc = node->ss.ss_currentScanDesc;
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clear out tuple table slots
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * close down subplans
+ */
+ ExecEndNode(outerPlanState(node));
+
+ /*
+ * release bitmaps and buffers if any
+ */
+ if (node->tbmiterator)
+ tbm_end_iterate(node->tbmiterator);
+ if (node->prefetch_iterator)
+ tbm_end_iterate(node->prefetch_iterator);
+ if (node->tbm)
+ tbm_free(node->tbm);
+ if (node->shared_tbmiterator)
+ tbm_end_shared_iterate(node->shared_tbmiterator);
+ if (node->shared_prefetch_iterator)
+ tbm_end_shared_iterate(node->shared_prefetch_iterator);
+ if (node->vmbuffer != InvalidBuffer)
+ ReleaseBuffer(node->vmbuffer);
+ if (node->pvmbuffer != InvalidBuffer)
+ ReleaseBuffer(node->pvmbuffer);
+
+ /*
+ * close heap scan
+ */
+ table_endscan(scanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitBitmapHeapScan
+ *
+ * Initializes the scan's state information.
+ * ----------------------------------------------------------------
+ */
+BitmapHeapScanState *
+ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
+{
+ BitmapHeapScanState *scanstate;
+ Relation currentRelation;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * Assert caller didn't ask for an unsafe snapshot --- see comments at
+ * head of file.
+ */
+ Assert(IsMVCCSnapshot(estate->es_snapshot));
+
+ /*
+ * create state structure
+ */
+ scanstate = makeNode(BitmapHeapScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
+
+ scanstate->tbm = NULL;
+ scanstate->tbmiterator = NULL;
+ scanstate->tbmres = NULL;
+ scanstate->return_empty_tuples = 0;
+ scanstate->vmbuffer = InvalidBuffer;
+ scanstate->pvmbuffer = InvalidBuffer;
+ scanstate->exact_pages = 0;
+ scanstate->lossy_pages = 0;
+ scanstate->prefetch_iterator = NULL;
+ scanstate->prefetch_pages = 0;
+ scanstate->prefetch_target = 0;
+ scanstate->pscan_len = 0;
+ scanstate->initialized = false;
+ scanstate->shared_tbmiterator = NULL;
+ scanstate->shared_prefetch_iterator = NULL;
+ scanstate->pstate = NULL;
+
+ /*
+ * We can potentially skip fetching heap pages if we do not need any
+ * columns of the table, either for checking non-indexable quals or for
+ * returning data. This test is a bit simplistic, as it checks the
+ * stronger condition that there's no qual or return tlist at all. But in
+ * most cases it's probably not worth working harder than that.
+ */
+ scanstate->can_skip_fetch = (node->scan.plan.qual == NIL &&
+ node->scan.plan.targetlist == NIL);
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * open the scan relation
+ */
+ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
+
+ /*
+ * initialize child nodes
+ */
+ outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * get the scan type from the relation descriptor.
+ */
+ ExecInitScanTupleSlot(estate, &scanstate->ss,
+ RelationGetDescr(currentRelation),
+ table_slot_callbacks(currentRelation));
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+ scanstate->bitmapqualorig =
+ ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);
+
+ /*
+ * Maximum number of prefetches for the tablespace if configured,
+ * otherwise the current value of the effective_io_concurrency GUC.
+ */
+ scanstate->prefetch_maximum =
+ get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
+
+ scanstate->ss.ss_currentRelation = currentRelation;
+
+ scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation,
+ estate->es_snapshot,
+ 0,
+ NULL);
+
+ /*
+ * all done.
+ */
+ return scanstate;
+}
+
+/*----------------
+ * BitmapShouldInitializeSharedState
+ *
+ * The first process to come here and see the state as BM_INITIAL
+ * will become the leader for the parallel bitmap scan and will be
+ * responsible for populating the TIDBitmap. The other processes will
+ * be blocked by the condition variable until the leader wakes them up.
+ * ---------------
+ */
+static bool
+BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
+{
+ SharedBitmapState state;
+
+ while (1)
+ {
+ SpinLockAcquire(&pstate->mutex);
+ state = pstate->state;
+ if (pstate->state == BM_INITIAL)
+ pstate->state = BM_INPROGRESS;
+ SpinLockRelease(&pstate->mutex);
+
+ /* Exit if bitmap is done, or if we're the leader. */
+ if (state != BM_INPROGRESS)
+ break;
+
+ /* Wait for the leader to wake us up. */
+ ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
+ }
+
+ ConditionVariableCancelSleep();
+
+ return (state == BM_INITIAL);
+}
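+
+/*
+ * The leader-election step of the state machine above, sketched with a C11
+ * atomic compare-and-swap in place of the spinlock; the condition-variable
+ * sleep/wakeup part is omitted.  Illustrative only.
+ */
+#include <stdatomic.h>
+#include <stdbool.h>
+
+enum sketch_state
+{
+    SK_INITIAL,
+    SK_INPROGRESS,
+    SK_FINISHED
+};
+
+static bool
+sketch_should_initialize(_Atomic int *state)
+{
+    int         expected = SK_INITIAL;
+
+    /* exactly one caller wins the transition INITIAL -> INPROGRESS */
+    return atomic_compare_exchange_strong(state, &expected, SK_INPROGRESS);
+}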
+
+/* ----------------------------------------------------------------
+ * ExecBitmapHeapEstimate
+ *
+ * Compute the amount of space we'll need in the parallel
+ * query DSM, and inform pcxt->estimator about our needs.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapHeapEstimate(BitmapHeapScanState *node,
+ ParallelContext *pcxt)
+{
+ EState *estate = node->ss.ps.state;
+
+ node->pscan_len = add_size(offsetof(ParallelBitmapHeapState,
+ phs_snapshot_data),
+ EstimateSnapshotSpace(estate->es_snapshot));
+
+ shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecBitmapHeapInitializeDSM
+ *
+ * Set up a parallel bitmap heap scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
+ ParallelContext *pcxt)
+{
+ ParallelBitmapHeapState *pstate;
+ EState *estate = node->ss.ps.state;
+ dsa_area *dsa = node->ss.ps.state->es_query_dsa;
+
+ /* If there's no DSA, there are no workers; initialize nothing. */
+ if (dsa == NULL)
+ return;
+
+ pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);
+
+ pstate->tbmiterator = 0;
+ pstate->prefetch_iterator = 0;
+
+ /* Initialize the mutex */
+ SpinLockInit(&pstate->mutex);
+ pstate->prefetch_pages = 0;
+ pstate->prefetch_target = 0;
+ pstate->state = BM_INITIAL;
+
+ ConditionVariableInit(&pstate->cv);
+ SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);
+
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
+ node->pstate = pstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecBitmapHeapReInitializeDSM
+ *
+ * Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
+ ParallelContext *pcxt)
+{
+ ParallelBitmapHeapState *pstate = node->pstate;
+ dsa_area *dsa = node->ss.ps.state->es_query_dsa;
+
+ /* If there's no DSA, there are no workers; do nothing. */
+ if (dsa == NULL)
+ return;
+
+ pstate->state = BM_INITIAL;
+
+ if (DsaPointerIsValid(pstate->tbmiterator))
+ tbm_free_shared_area(dsa, pstate->tbmiterator);
+
+ if (DsaPointerIsValid(pstate->prefetch_iterator))
+ tbm_free_shared_area(dsa, pstate->prefetch_iterator);
+
+ pstate->tbmiterator = InvalidDsaPointer;
+ pstate->prefetch_iterator = InvalidDsaPointer;
+}
+
+/* ----------------------------------------------------------------
+ * ExecBitmapHeapInitializeWorker
+ *
+ * Copy relevant information from TOC into planstate.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node,
+ ParallelWorkerContext *pwcxt)
+{
+ ParallelBitmapHeapState *pstate;
+ Snapshot snapshot;
+
+ Assert(node->ss.ps.state->es_query_dsa != NULL);
+
+ pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+ node->pstate = pstate;
+
+ snapshot = RestoreSnapshot(pstate->phs_snapshot_data);
+ table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
+}
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
new file mode 100644
index 0000000..48c2036
--- /dev/null
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -0,0 +1,330 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeBitmapIndexscan.c
+ * Routines to support bitmapped index scans of relations
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeBitmapIndexscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * MultiExecBitmapIndexScan scans a relation using index.
+ * ExecInitBitmapIndexScan creates and initializes state info.
+ * ExecReScanBitmapIndexScan prepares to rescan the plan.
+ * ExecEndBitmapIndexScan releases all storage.
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "executor/execdebug.h"
+#include "executor/nodeBitmapIndexscan.h"
+#include "executor/nodeIndexscan.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecBitmapIndexScan
+ *
+ * stub for pro forma compliance
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecBitmapIndexScan(PlanState *pstate)
+{
+ elog(ERROR, "BitmapIndexScan node does not support ExecProcNode call convention");
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * MultiExecBitmapIndexScan(node)
+ * ----------------------------------------------------------------
+ */
+Node *
+MultiExecBitmapIndexScan(BitmapIndexScanState *node)
+{
+ TIDBitmap *tbm;
+ IndexScanDesc scandesc;
+ double nTuples = 0;
+ bool doscan;
+
+ /* must provide our own instrumentation support */
+ if (node->ss.ps.instrument)
+ InstrStartNode(node->ss.ps.instrument);
+
+ /*
+ * extract necessary information from index scan node
+ */
+ scandesc = node->biss_ScanDesc;
+
+ /*
+ * If we have runtime keys and they've not already been set up, do it now.
+ * Array keys are also treated as runtime keys; note that if ExecReScan
+ * returns with biss_RuntimeKeysReady still false, then there is an empty
+ * array key so we should do nothing.
+ */
+ if (!node->biss_RuntimeKeysReady &&
+ (node->biss_NumRuntimeKeys != 0 || node->biss_NumArrayKeys != 0))
+ {
+ ExecReScan((PlanState *) node);
+ doscan = node->biss_RuntimeKeysReady;
+ }
+ else
+ doscan = true;
+
+ /*
+ * Prepare the result bitmap. Normally we just create a new one to pass
+ * back; however, our parent node is allowed to store a pre-made one into
+ * node->biss_result, in which case we just OR our tuple IDs into the
+ * existing bitmap. (This saves needing explicit UNION steps.)
+ */
+ if (node->biss_result)
+ {
+ tbm = node->biss_result;
+ node->biss_result = NULL; /* reset for next time */
+ }
+ else
+ {
+ /* XXX should we use less than work_mem for this? */
+ tbm = tbm_create(work_mem * 1024L,
+ ((BitmapIndexScan *) node->ss.ps.plan)->isshared ?
+ node->ss.ps.state->es_query_dsa : NULL);
+ }
+
+ /*
+ * Get TIDs from index and insert into bitmap
+ */
+ while (doscan)
+ {
+ nTuples += (double) index_getbitmap(scandesc, tbm);
+
+ CHECK_FOR_INTERRUPTS();
+
+ doscan = ExecIndexAdvanceArrayKeys(node->biss_ArrayKeys,
+ node->biss_NumArrayKeys);
+ if (doscan) /* reset index scan */
+ index_rescan(node->biss_ScanDesc,
+ node->biss_ScanKeys, node->biss_NumScanKeys,
+ NULL, 0);
+ }
+
+ /* must provide our own instrumentation support */
+ if (node->ss.ps.instrument)
+ InstrStopNode(node->ss.ps.instrument, nTuples);
+
+ return (Node *) tbm;
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanBitmapIndexScan(node)
+ *
+ * Recalculates the values of any scan keys whose value depends on
+ * information known at runtime, then rescans the indexed relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanBitmapIndexScan(BitmapIndexScanState *node)
+{
+ ExprContext *econtext = node->biss_RuntimeContext;
+
+ /*
+ * Reset the runtime-key context so we don't leak memory as each outer
+ * tuple is scanned. Note this assumes that we will recalculate *all*
+ * runtime keys on each call.
+ */
+ if (econtext)
+ ResetExprContext(econtext);
+
+ /*
+ * If we are doing runtime key calculations (ie, any of the index key
+ * values weren't simple Consts), compute the new key values.
+ *
+ * Array keys are also treated as runtime keys; note that if we return
+ * with biss_RuntimeKeysReady still false, then there is an empty array
+ * key so no index scan is needed.
+ */
+ if (node->biss_NumRuntimeKeys != 0)
+ ExecIndexEvalRuntimeKeys(econtext,
+ node->biss_RuntimeKeys,
+ node->biss_NumRuntimeKeys);
+ if (node->biss_NumArrayKeys != 0)
+ node->biss_RuntimeKeysReady =
+ ExecIndexEvalArrayKeys(econtext,
+ node->biss_ArrayKeys,
+ node->biss_NumArrayKeys);
+ else
+ node->biss_RuntimeKeysReady = true;
+
+ /* reset index scan */
+ if (node->biss_RuntimeKeysReady)
+ index_rescan(node->biss_ScanDesc,
+ node->biss_ScanKeys, node->biss_NumScanKeys,
+ NULL, 0);
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndBitmapIndexScan
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndBitmapIndexScan(BitmapIndexScanState *node)
+{
+ Relation indexRelationDesc;
+ IndexScanDesc indexScanDesc;
+
+ /*
+ * extract information from the node
+ */
+ indexRelationDesc = node->biss_RelationDesc;
+ indexScanDesc = node->biss_ScanDesc;
+
+ /*
+ * Free the exprcontext ... now dead code, see ExecFreeExprContext
+ */
+#ifdef NOT_USED
+ if (node->biss_RuntimeContext)
+ FreeExprContext(node->biss_RuntimeContext, true);
+#endif
+
+ /*
+ * close the index relation (no-op if we didn't open it)
+ */
+ if (indexScanDesc)
+ index_endscan(indexScanDesc);
+ if (indexRelationDesc)
+ index_close(indexRelationDesc, NoLock);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitBitmapIndexScan
+ *
+ * Initializes the index scan's state information.
+ * ----------------------------------------------------------------
+ */
+BitmapIndexScanState *
+ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
+{
+ BitmapIndexScanState *indexstate;
+ LOCKMODE lockmode;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ indexstate = makeNode(BitmapIndexScanState);
+ indexstate->ss.ps.plan = (Plan *) node;
+ indexstate->ss.ps.state = estate;
+ indexstate->ss.ps.ExecProcNode = ExecBitmapIndexScan;
+
+ /* normally we don't make the result bitmap till runtime */
+ indexstate->biss_result = NULL;
+
+ /*
+ * We do not open or lock the base relation here. We assume that an
+ * ancestor BitmapHeapScan node is holding AccessShareLock (or better) on
+ * the heap relation throughout the execution of the plan tree.
+ */
+
+ indexstate->ss.ss_currentRelation = NULL;
+ indexstate->ss.ss_currentScanDesc = NULL;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * We do not need a standard exprcontext for this node, though we may
+ * decide below to create a runtime-key exprcontext
+ */
+
+ /*
+ * initialize child expressions
+ *
+ * We don't need to initialize targetlist or qual since neither are used.
+ *
+ * Note: we don't initialize all of the indexqual expression, only the
+ * sub-parts corresponding to runtime keys (see below).
+ */
+
+ /*
+ * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
+ * here. This allows an index-advisor plugin to EXPLAIN a plan containing
+ * references to nonexistent indexes.
+ */
+ if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
+ return indexstate;
+
+ /* Open the index relation. */
+ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
+ indexstate->biss_RelationDesc = index_open(node->indexid, lockmode);
+
+ /*
+ * Initialize index-specific scan state
+ */
+ indexstate->biss_RuntimeKeysReady = false;
+ indexstate->biss_RuntimeKeys = NULL;
+ indexstate->biss_NumRuntimeKeys = 0;
+
+ /*
+ * build the index scan keys from the index qualification
+ */
+ ExecIndexBuildScanKeys((PlanState *) indexstate,
+ indexstate->biss_RelationDesc,
+ node->indexqual,
+ false,
+ &indexstate->biss_ScanKeys,
+ &indexstate->biss_NumScanKeys,
+ &indexstate->biss_RuntimeKeys,
+ &indexstate->biss_NumRuntimeKeys,
+ &indexstate->biss_ArrayKeys,
+ &indexstate->biss_NumArrayKeys);
+
+ /*
+ * If we have runtime keys or array keys, we need an ExprContext to
+ * evaluate them. We could just create a "standard" plan node exprcontext,
+ * but to keep the code looking similar to nodeIndexscan.c, it seems
+ * better to stick with the approach of using a separate ExprContext.
+ */
+ if (indexstate->biss_NumRuntimeKeys != 0 ||
+ indexstate->biss_NumArrayKeys != 0)
+ {
+ ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
+
+ ExecAssignExprContext(estate, &indexstate->ss.ps);
+ indexstate->biss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
+ indexstate->ss.ps.ps_ExprContext = stdecontext;
+ }
+ else
+ {
+ indexstate->biss_RuntimeContext = NULL;
+ }
+
+ /*
+ * Initialize scan descriptor.
+ */
+ indexstate->biss_ScanDesc =
+ index_beginscan_bitmap(indexstate->biss_RelationDesc,
+ estate->es_snapshot,
+ indexstate->biss_NumScanKeys);
+
+ /*
+ * If no run-time keys to calculate, go ahead and pass the scankeys to the
+ * index AM.
+ */
+ if (indexstate->biss_NumRuntimeKeys == 0 &&
+ indexstate->biss_NumArrayKeys == 0)
+ index_rescan(indexstate->biss_ScanDesc,
+ indexstate->biss_ScanKeys, indexstate->biss_NumScanKeys,
+ NULL, 0);
+
+ /*
+ * all done.
+ */
+ return indexstate;
+}
diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c
new file mode 100644
index 0000000..4a8c01d
--- /dev/null
+++ b/src/backend/executor/nodeBitmapOr.c
@@ -0,0 +1,241 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeBitmapOr.c
+ * routines to handle BitmapOr nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeBitmapOr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/* INTERFACE ROUTINES
+ * ExecInitBitmapOr - initialize the BitmapOr node
+ * MultiExecBitmapOr - retrieve the result bitmap from the node
+ * ExecEndBitmapOr - shut down the BitmapOr node
+ * ExecReScanBitmapOr - rescan the BitmapOr node
+ *
+ * NOTES
+ * BitmapOr nodes don't make use of their left and right
+ * subtrees, rather they maintain a list of subplans,
+ * much like Append nodes. The logic is much simpler than
+ * Append, however, since we needn't cope with forward/backward
+ * execution.
+ */
+
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeBitmapOr.h"
+#include "miscadmin.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecBitmapOr
+ *
+ * stub for pro forma compliance
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecBitmapOr(PlanState *pstate)
+{
+ elog(ERROR, "BitmapOr node does not support ExecProcNode call convention");
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitBitmapOr
+ *
+ * Begin all of the subscans of the BitmapOr node.
+ * ----------------------------------------------------------------
+ */
+BitmapOrState *
+ExecInitBitmapOr(BitmapOr *node, EState *estate, int eflags)
+{
+ BitmapOrState *bitmaporstate = makeNode(BitmapOrState);
+ PlanState **bitmapplanstates;
+ int nplans;
+ int i;
+ ListCell *l;
+ Plan *initNode;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * Set up empty vector of subplan states
+ */
+ nplans = list_length(node->bitmapplans);
+
+ bitmapplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *));
+
+ /*
+ * create new BitmapOrState for our BitmapOr node
+ */
+ bitmaporstate->ps.plan = (Plan *) node;
+ bitmaporstate->ps.state = estate;
+ bitmaporstate->ps.ExecProcNode = ExecBitmapOr;
+ bitmaporstate->bitmapplans = bitmapplanstates;
+ bitmaporstate->nplans = nplans;
+
+ /*
+ * call ExecInitNode on each of the plans to be executed and save the
+ * results into the array "bitmapplanstates".
+ */
+ i = 0;
+ foreach(l, node->bitmapplans)
+ {
+ initNode = (Plan *) lfirst(l);
+ bitmapplanstates[i] = ExecInitNode(initNode, estate, eflags);
+ i++;
+ }
+
+ /*
+ * Miscellaneous initialization
+ *
+ * BitmapOr plans don't have expression contexts because they never call
+ * ExecQual or ExecProject. They don't need any tuple slots either.
+ */
+
+ return bitmaporstate;
+}
+
+/* ----------------------------------------------------------------
+ * MultiExecBitmapOr
+ * ----------------------------------------------------------------
+ */
+Node *
+MultiExecBitmapOr(BitmapOrState *node)
+{
+ PlanState **bitmapplans;
+ int nplans;
+ int i;
+ TIDBitmap *result = NULL;
+
+ /* must provide our own instrumentation support */
+ if (node->ps.instrument)
+ InstrStartNode(node->ps.instrument);
+
+ /*
+ * get information from the node
+ */
+ bitmapplans = node->bitmapplans;
+ nplans = node->nplans;
+
+ /*
+ * Scan all the subplans and OR their result bitmaps
+ */
+ for (i = 0; i < nplans; i++)
+ {
+ PlanState *subnode = bitmapplans[i];
+ TIDBitmap *subresult;
+
+ /*
+ * We can special-case BitmapIndexScan children to avoid an explicit
+ * tbm_union step for each child: just pass down the current result
+ * bitmap and let the child OR directly into it.
+ */
+ if (IsA(subnode, BitmapIndexScanState))
+ {
+ if (result == NULL) /* first subplan */
+ {
+ /* XXX should we use less than work_mem for this? */
+ result = tbm_create(work_mem * 1024L,
+ ((BitmapOr *) node->ps.plan)->isshared ?
+ node->ps.state->es_query_dsa : NULL);
+ }
+
+ ((BitmapIndexScanState *) subnode)->biss_result = result;
+
+ subresult = (TIDBitmap *) MultiExecProcNode(subnode);
+
+ if (subresult != result)
+ elog(ERROR, "unrecognized result from subplan");
+ }
+ else
+ {
+ /* standard implementation */
+ subresult = (TIDBitmap *) MultiExecProcNode(subnode);
+
+ if (!subresult || !IsA(subresult, TIDBitmap))
+ elog(ERROR, "unrecognized result from subplan");
+
+ if (result == NULL)
+ result = subresult; /* first subplan */
+ else
+ {
+ tbm_union(result, subresult);
+ tbm_free(subresult);
+ }
+ }
+ }
+
+ /* We could return an empty result set here? */
+ if (result == NULL)
+ elog(ERROR, "BitmapOr doesn't support zero inputs");
+
+ /* must provide our own instrumentation support */
+ if (node->ps.instrument)
+ InstrStopNode(node->ps.instrument, 0 /* XXX */ );
+
+ return (Node *) result;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndBitmapOr
+ *
+ * Shuts down the subscans of the BitmapOr node.
+ *
+ * Returns nothing of interest.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndBitmapOr(BitmapOrState *node)
+{
+ PlanState **bitmapplans;
+ int nplans;
+ int i;
+
+ /*
+ * get information from the node
+ */
+ bitmapplans = node->bitmapplans;
+ nplans = node->nplans;
+
+ /*
+ * shut down each of the subscans (that we've initialized)
+ */
+ for (i = 0; i < nplans; i++)
+ {
+ if (bitmapplans[i])
+ ExecEndNode(bitmapplans[i]);
+ }
+}
+
+void
+ExecReScanBitmapOr(BitmapOrState *node)
+{
+ int i;
+
+ for (i = 0; i < node->nplans; i++)
+ {
+ PlanState *subnode = node->bitmapplans[i];
+
+ /*
+ * ExecReScan doesn't know about my subplans, so I have to do
+ * changed-parameter signaling myself.
+ */
+ if (node->ps.chgParam != NULL)
+ UpdateChangedParamSet(subnode, node->ps.chgParam);
+
+ /*
+ * If chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (subnode->chgParam == NULL)
+ ExecReScan(subnode);
+ }
+}
diff --git a/src/backend/executor/nodeCtescan.c b/src/backend/executor/nodeCtescan.c
new file mode 100644
index 0000000..9c2b08d
--- /dev/null
+++ b/src/backend/executor/nodeCtescan.c
@@ -0,0 +1,351 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeCtescan.c
+ * routines to handle CteScan nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeCtescan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeCtescan.h"
+#include "miscadmin.h"
+
+static TupleTableSlot *CteScanNext(CteScanState *node);
+
+/* ----------------------------------------------------------------
+ * CteScanNext
+ *
+ * This is a workhorse for ExecCteScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+CteScanNext(CteScanState *node)
+{
+ EState *estate;
+ ScanDirection dir;
+ bool forward;
+ Tuplestorestate *tuplestorestate;
+ bool eof_tuplestore;
+ TupleTableSlot *slot;
+
+ /*
+ * get state info from node
+ */
+ estate = node->ss.ps.state;
+ dir = estate->es_direction;
+ forward = ScanDirectionIsForward(dir);
+ tuplestorestate = node->leader->cte_table;
+ tuplestore_select_read_pointer(tuplestorestate, node->readptr);
+ slot = node->ss.ss_ScanTupleSlot;
+
+ /*
+ * If we are not at the end of the tuplestore, or are going backwards, try
+ * to fetch a tuple from tuplestore.
+ */
+ eof_tuplestore = tuplestore_ateof(tuplestorestate);
+
+ if (!forward && eof_tuplestore)
+ {
+ if (!node->leader->eof_cte)
+ {
+ /*
+ * When reversing direction at tuplestore EOF, the first
+ * gettupleslot call will fetch the last-added tuple; but we want
+ * to return the one before that, if possible. So do an extra
+ * fetch.
+ */
+ if (!tuplestore_advance(tuplestorestate, forward))
+ return NULL; /* the tuplestore must be empty */
+ }
+ eof_tuplestore = false;
+ }
+
+ /*
+ * If we can fetch another tuple from the tuplestore, return it.
+ *
+ * Note: we have to use copy=true in the tuplestore_gettupleslot call,
+ * because we are sharing the tuplestore with other nodes that might write
+ * into the tuplestore before we get called again.
+ */
+ if (!eof_tuplestore)
+ {
+ if (tuplestore_gettupleslot(tuplestorestate, forward, true, slot))
+ return slot;
+ if (forward)
+ eof_tuplestore = true;
+ }
+
+ /*
+ * If necessary, try to fetch another row from the CTE query.
+ *
+ * Note: the eof_cte state variable exists to short-circuit further calls
+ * of the CTE plan. It's not optional, unfortunately, because some plan
+ * node types are not robust about being called again when they've already
+ * returned NULL.
+ */
+ if (eof_tuplestore && !node->leader->eof_cte)
+ {
+ TupleTableSlot *cteslot;
+
+ /*
+ * We can only get here with forward==true, so no need to worry about
+ * which direction the subplan will go.
+ */
+ cteslot = ExecProcNode(node->cteplanstate);
+ if (TupIsNull(cteslot))
+ {
+ node->leader->eof_cte = true;
+ return NULL;
+ }
+
+ /*
+ * There are corner cases where the subplan could change which
+ * tuplestore read pointer is active, so be sure to reselect ours
+ * before storing the tuple we got.
+ */
+ tuplestore_select_read_pointer(tuplestorestate, node->readptr);
+
+ /*
+ * Append a copy of the returned tuple to tuplestore. NOTE: because
+ * our read pointer is certainly in EOF state, its read position will
+ * move forward over the added tuple. This is what we want. Also,
+ * any other readers will *not* move past the new tuple, which is what
+ * they want.
+ */
+ tuplestore_puttupleslot(tuplestorestate, cteslot);
+
+ /*
+ * We MUST copy the CTE query's output tuple into our own slot. This
+ * is because other CteScan nodes might advance the CTE query before
+ * we are called again, and our output tuple must stay stable over
+ * that.
+ */
+ return ExecCopySlot(slot, cteslot);
+ }
+
+ /*
+ * Nothing left ...
+ */
+ return ExecClearTuple(slot);
+}
+
+/*
+ * CteScanRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+CteScanRecheck(CteScanState *node, TupleTableSlot *slot)
+{
+ /* nothing to check */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecCteScan(node)
+ *
+ * Scans the CTE sequentially and returns the next qualifying tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecCteScan(PlanState *pstate)
+{
+ CteScanState *node = castNode(CteScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) CteScanNext,
+ (ExecScanRecheckMtd) CteScanRecheck);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecInitCteScan
+ * ----------------------------------------------------------------
+ */
+CteScanState *
+ExecInitCteScan(CteScan *node, EState *estate, int eflags)
+{
+ CteScanState *scanstate;
+ ParamExecData *prmdata;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & EXEC_FLAG_MARK));
+
+ /*
+ * For the moment we have to force the tuplestore to allow REWIND, because
+ * we might be asked to rescan the CTE even though upper levels didn't
+ * tell us to be prepared to do it efficiently. Annoying, since this
+ * prevents truncation of the tuplestore. XXX FIXME
+ *
+ * Note: if we are in an EPQ recheck plan tree, it's likely that no access
+ * to the tuplestore is needed at all, making this even more annoying.
+ * It's not worth improving that as long as all the read pointers would
+ * have REWIND anyway, but if we ever improve this logic then that aspect
+ * should be considered too.
+ */
+ eflags |= EXEC_FLAG_REWIND;
+
+ /*
+ * CteScan should not have any children.
+ */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create new CteScanState for node
+ */
+ scanstate = makeNode(CteScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecCteScan;
+ scanstate->eflags = eflags;
+ scanstate->cte_table = NULL;
+ scanstate->eof_cte = false;
+
+ /*
+ * Find the already-initialized plan for the CTE query.
+ */
+ scanstate->cteplanstate = (PlanState *) list_nth(estate->es_subplanstates,
+ node->ctePlanId - 1);
+
+ /*
+ * The Param slot associated with the CTE query is used to hold a pointer
+ * to the CteScanState of the first CteScan node that initializes for this
+ * CTE. This node will be the one that holds the shared state for all the
+ * CTEs, particularly the shared tuplestore.
+ */
+ prmdata = &(estate->es_param_exec_vals[node->cteParam]);
+ Assert(prmdata->execPlan == NULL);
+ Assert(!prmdata->isnull);
+ scanstate->leader = castNode(CteScanState, DatumGetPointer(prmdata->value));
+ if (scanstate->leader == NULL)
+ {
+ /* I am the leader */
+ prmdata->value = PointerGetDatum(scanstate);
+ scanstate->leader = scanstate;
+ scanstate->cte_table = tuplestore_begin_heap(true, false, work_mem);
+ tuplestore_set_eflags(scanstate->cte_table, scanstate->eflags);
+ scanstate->readptr = 0;
+ }
+ else
+ {
+ /* Not the leader */
+ /* Create my own read pointer, and ensure it is at start */
+ scanstate->readptr =
+ tuplestore_alloc_read_pointer(scanstate->leader->cte_table,
+ scanstate->eflags);
+ tuplestore_select_read_pointer(scanstate->leader->cte_table,
+ scanstate->readptr);
+ tuplestore_rescan(scanstate->leader->cte_table);
+ }
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * The scan tuple type (ie, the rowtype we expect to find in the work
+ * table) is the same as the result rowtype of the CTE query.
+ */
+ ExecInitScanTupleSlot(estate, &scanstate->ss,
+ ExecGetResultType(scanstate->cteplanstate),
+ &TTSOpsMinimalTuple);
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+
+ return scanstate;
+}
+
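+/*
+ * A minimal sketch of the shared-tuplestore scheme set up above, using
+ * only the generic tuplestore read-pointer API: the leader creates the
+ * store (read pointer 0 comes with it) and every other CteScan allocates
+ * its own pointer and positions it at the start. The function name is
+ * hypothetical and the block is compiled out.
+ */
+#ifdef NOT_USED
+static void
+example_cte_read_pointers(void)
+{
+ Tuplestorestate *ts;
+ int myptr;
+
+ /* leader: shared store with random access; read pointer 0 is built in */
+ ts = tuplestore_begin_heap(true, false, work_mem);
+
+ /* follower: allocate a private pointer and make sure it is at the start */
+ myptr = tuplestore_alloc_read_pointer(ts, EXEC_FLAG_REWIND);
+ tuplestore_select_read_pointer(ts, myptr);
+ tuplestore_rescan(ts);
+
+ tuplestore_end(ts);
+}
+#endif
+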
+/* ----------------------------------------------------------------
+ * ExecEndCteScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndCteScan(CteScanState *node)
+{
+ /*
+ * Free exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * If I am the leader, free the tuplestore.
+ */
+ if (node->leader == node)
+ {
+ tuplestore_end(node->cte_table);
+ node->cte_table = NULL;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanCteScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanCteScan(CteScanState *node)
+{
+ Tuplestorestate *tuplestorestate = node->leader->cte_table;
+
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ ExecScanReScan(&node->ss);
+
+ /*
+ * Clear the tuplestore if a new scan of the underlying CTE is required.
+ * This implicitly resets all the tuplestore's read pointers. Note that
+ * multiple CTE nodes might redundantly clear the tuplestore; that's OK,
+ * and not unduly expensive. We'll stop taking this path as soon as
+ * somebody has attempted to read something from the underlying CTE
+ * (thereby causing its chgParam to be cleared).
+ */
+ if (node->leader->cteplanstate->chgParam != NULL)
+ {
+ tuplestore_clear(tuplestorestate);
+ node->leader->eof_cte = false;
+ }
+ else
+ {
+ /*
+ * Else, just rewind my own pointer. Either the underlying CTE
+ * doesn't need a rescan (and we can re-read what's in the tuplestore
+ * now), or somebody else already took care of it.
+ */
+ tuplestore_select_read_pointer(tuplestorestate, node->readptr);
+ tuplestore_rescan(tuplestorestate);
+ }
+}
diff --git a/src/backend/executor/nodeCustom.c b/src/backend/executor/nodeCustom.c
new file mode 100644
index 0000000..c82060e
--- /dev/null
+++ b/src/backend/executor/nodeCustom.c
@@ -0,0 +1,228 @@
+/* ------------------------------------------------------------------------
+ *
+ * nodeCustom.c
+ * Routines to handle execution of custom scan node
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * ------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "executor/executor.h"
+#include "executor/nodeCustom.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "nodes/extensible.h"
+#include "nodes/plannodes.h"
+#include "parser/parsetree.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+static TupleTableSlot *ExecCustomScan(PlanState *pstate);
+
+
+CustomScanState *
+ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags)
+{
+ CustomScanState *css;
+ Relation scan_rel = NULL;
+ Index scanrelid = cscan->scan.scanrelid;
+ Index tlistvarno;
+
+ /*
+ * Allocate the CustomScanState object. We let the custom scan provider
+ * do the palloc, in case it wants to make a larger object that embeds
+ * CustomScanState as the first field. It must set the node tag and the
+ * methods field correctly at this time. Other standard fields should be
+ * set to zero.
+ */
+ css = castNode(CustomScanState,
+ cscan->methods->CreateCustomScanState(cscan));
+
+ /* ensure flags is filled correctly */
+ css->flags = cscan->flags;
+
+ /* fill up fields of ScanState */
+ css->ss.ps.plan = &cscan->scan.plan;
+ css->ss.ps.state = estate;
+ css->ss.ps.ExecProcNode = ExecCustomScan;
+
+ /* create expression context for node */
+ ExecAssignExprContext(estate, &css->ss.ps);
+
+ /*
+ * open the scan relation, if any
+ */
+ if (scanrelid > 0)
+ {
+ scan_rel = ExecOpenScanRelation(estate, scanrelid, eflags);
+ css->ss.ss_currentRelation = scan_rel;
+ }
+
+ /*
+ * Determine the scan tuple type. If the custom scan provider provided a
+ * targetlist describing the scan tuples, use that; else use base
+ * relation's rowtype.
+ */
+ if (cscan->custom_scan_tlist != NIL || scan_rel == NULL)
+ {
+ TupleDesc scan_tupdesc;
+
+ scan_tupdesc = ExecTypeFromTL(cscan->custom_scan_tlist);
+ ExecInitScanTupleSlot(estate, &css->ss, scan_tupdesc, &TTSOpsVirtual);
+ /* Node's targetlist will contain Vars with varno = INDEX_VAR */
+ tlistvarno = INDEX_VAR;
+ }
+ else
+ {
+ ExecInitScanTupleSlot(estate, &css->ss, RelationGetDescr(scan_rel),
+ &TTSOpsVirtual);
+ /* Node's targetlist will contain Vars with varno = scanrelid */
+ tlistvarno = scanrelid;
+ }
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTupleSlotTL(&css->ss.ps, &TTSOpsVirtual);
+ ExecAssignScanProjectionInfoWithVarno(&css->ss, tlistvarno);
+
+ /* initialize child expressions */
+ css->ss.ps.qual =
+ ExecInitQual(cscan->scan.plan.qual, (PlanState *) css);
+
+ /*
+ * The callback of custom-scan provider applies the final initialization
+ * of the custom-scan-state node according to its logic.
+ */
+ css->methods->BeginCustomScan(css, estate, eflags);
+
+ return css;
+}
+
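+/*
+ * A minimal sketch of the CustomExecMethods table a scan provider supplies
+ * (hypothetical names, compiled out). ExecCustomScan and the other entry
+ * points below simply dispatch through these pointers; the mark/restore,
+ * parallel, and shutdown callbacks are optional and may be left NULL.
+ */
+#ifdef NOT_USED
+static void example_begin(CustomScanState *node, EState *estate, int eflags);
+static TupleTableSlot *example_exec(CustomScanState *node);
+static void example_end(CustomScanState *node);
+static void example_rescan(CustomScanState *node);
+
+static const CustomExecMethods example_exec_methods = {
+ .CustomName = "ExampleScan",
+ .BeginCustomScan = example_begin,
+ .ExecCustomScan = example_exec,
+ .EndCustomScan = example_end,
+ .ReScanCustomScan = example_rescan
+};
+#endif
+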
+static TupleTableSlot *
+ExecCustomScan(PlanState *pstate)
+{
+ CustomScanState *node = castNode(CustomScanState, pstate);
+
+ CHECK_FOR_INTERRUPTS();
+
+ Assert(node->methods->ExecCustomScan != NULL);
+ return node->methods->ExecCustomScan(node);
+}
+
+void
+ExecEndCustomScan(CustomScanState *node)
+{
+ Assert(node->methods->EndCustomScan != NULL);
+ node->methods->EndCustomScan(node);
+
+ /* Free the exprcontext */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /* Clean out the tuple table */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+}
+
+void
+ExecReScanCustomScan(CustomScanState *node)
+{
+ Assert(node->methods->ReScanCustomScan != NULL);
+ node->methods->ReScanCustomScan(node);
+}
+
+void
+ExecCustomMarkPos(CustomScanState *node)
+{
+ if (!node->methods->MarkPosCustomScan)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("custom scan \"%s\" does not support MarkPos",
+ node->methods->CustomName)));
+ node->methods->MarkPosCustomScan(node);
+}
+
+void
+ExecCustomRestrPos(CustomScanState *node)
+{
+ if (!node->methods->RestrPosCustomScan)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("custom scan \"%s\" does not support MarkPos",
+ node->methods->CustomName)));
+ node->methods->RestrPosCustomScan(node);
+}
+
+void
+ExecCustomScanEstimate(CustomScanState *node, ParallelContext *pcxt)
+{
+ const CustomExecMethods *methods = node->methods;
+
+ if (methods->EstimateDSMCustomScan)
+ {
+ node->pscan_len = methods->EstimateDSMCustomScan(node, pcxt);
+ shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ }
+}
+
+void
+ExecCustomScanInitializeDSM(CustomScanState *node, ParallelContext *pcxt)
+{
+ const CustomExecMethods *methods = node->methods;
+
+ if (methods->InitializeDSMCustomScan)
+ {
+ int plan_node_id = node->ss.ps.plan->plan_node_id;
+ void *coordinate;
+
+ coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len);
+ methods->InitializeDSMCustomScan(node, pcxt, coordinate);
+ shm_toc_insert(pcxt->toc, plan_node_id, coordinate);
+ }
+}
+
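+/*
+ * A sketch of the provider-side callbacks driven by the two functions
+ * above, assuming the provider wants nothing more than one shared int64
+ * counter in dynamic shared memory (hypothetical names, compiled out).
+ */
+#ifdef NOT_USED
+static Size
+example_estimate_dsm(CustomScanState *node, ParallelContext *pcxt)
+{
+ return sizeof(int64);
+}
+
+static void
+example_initialize_dsm(CustomScanState *node, ParallelContext *pcxt,
+ void *coordinate)
+{
+ /* workers later find this chunk via shm_toc_lookup on the plan node id */
+ *(int64 *) coordinate = 0;
+}
+#endif
+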
+void
+ExecCustomScanReInitializeDSM(CustomScanState *node, ParallelContext *pcxt)
+{
+ const CustomExecMethods *methods = node->methods;
+
+ if (methods->ReInitializeDSMCustomScan)
+ {
+ int plan_node_id = node->ss.ps.plan->plan_node_id;
+ void *coordinate;
+
+ coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false);
+ methods->ReInitializeDSMCustomScan(node, pcxt, coordinate);
+ }
+}
+
+void
+ExecCustomScanInitializeWorker(CustomScanState *node,
+ ParallelWorkerContext *pwcxt)
+{
+ const CustomExecMethods *methods = node->methods;
+
+ if (methods->InitializeWorkerCustomScan)
+ {
+ int plan_node_id = node->ss.ps.plan->plan_node_id;
+ void *coordinate;
+
+ coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false);
+ methods->InitializeWorkerCustomScan(node, pwcxt->toc, coordinate);
+ }
+}
+
+void
+ExecShutdownCustomScan(CustomScanState *node)
+{
+ const CustomExecMethods *methods = node->methods;
+
+ if (methods->ShutdownCustomScan)
+ methods->ShutdownCustomScan(node);
+}
diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c
new file mode 100644
index 0000000..d27849a
--- /dev/null
+++ b/src/backend/executor/nodeForeignscan.c
@@ -0,0 +1,504 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeForeignscan.c
+ * Routines to support scans of foreign tables
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeForeignscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ *
+ * ExecForeignScan scans a foreign table.
+ * ExecInitForeignScan creates and initializes state info.
+ * ExecReScanForeignScan rescans the foreign relation.
+ * ExecEndForeignScan releases any resources allocated.
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeForeignscan.h"
+#include "foreign/fdwapi.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+static TupleTableSlot *ForeignNext(ForeignScanState *node);
+static bool ForeignRecheck(ForeignScanState *node, TupleTableSlot *slot);
+
+
+/* ----------------------------------------------------------------
+ * ForeignNext
+ *
+ * This is a workhorse for ExecForeignScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ForeignNext(ForeignScanState *node)
+{
+ TupleTableSlot *slot;
+ ForeignScan *plan = (ForeignScan *) node->ss.ps.plan;
+ ExprContext *econtext = node->ss.ps.ps_ExprContext;
+ MemoryContext oldcontext;
+
+ /* Call the Iterate function in short-lived context */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+ if (plan->operation != CMD_SELECT)
+ {
+ /*
+ * direct modifications cannot be re-evaluated, so shouldn't get here
+ * during EvalPlanQual processing
+ */
+ Assert(node->ss.ps.state->es_epq_active == NULL);
+
+ slot = node->fdwroutine->IterateDirectModify(node);
+ }
+ else
+ slot = node->fdwroutine->IterateForeignScan(node);
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * Insert valid value into tableoid, the only actually-useful system
+ * column.
+ */
+ if (plan->fsSystemCol && !TupIsNull(slot))
+ slot->tts_tableOid = RelationGetRelid(node->ss.ss_currentRelation);
+
+ return slot;
+}
+
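+/*
+ * A minimal sketch of an IterateForeignScan callback as invoked above,
+ * assuming FDW-private state that just counts rows remaining (the names
+ * are hypothetical and the block is compiled out). Returning an empty
+ * scan slot ends the scan.
+ */
+#ifdef NOT_USED
+typedef struct ExampleFdwState
+{
+ int64 rows_left;
+} ExampleFdwState;
+
+static TupleTableSlot *
+example_iterate_foreign_scan(ForeignScanState *node)
+{
+ ExampleFdwState *fstate = (ExampleFdwState *) node->fdw_state;
+ TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
+
+ ExecClearTuple(slot);
+ if (fstate->rows_left-- <= 0)
+ return slot; /* empty slot signals end of scan */
+
+ /* for illustration, return a tuple of all NULLs */
+ memset(slot->tts_isnull, true, slot->tts_tupleDescriptor->natts * sizeof(bool));
+ ExecStoreVirtualTuple(slot);
+ return slot;
+}
+#endif
+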
+/*
+ * ForeignRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+ForeignRecheck(ForeignScanState *node, TupleTableSlot *slot)
+{
+ FdwRoutine *fdwroutine = node->fdwroutine;
+ ExprContext *econtext;
+
+ /*
+ * extract necessary information from foreign scan node
+ */
+ econtext = node->ss.ps.ps_ExprContext;
+
+ /* Does the tuple meet the remote qual condition? */
+ econtext->ecxt_scantuple = slot;
+
+ ResetExprContext(econtext);
+
+ /*
+ * If an outer join is pushed down, RecheckForeignScan may need to store a
+ * different tuple in the slot, because a different set of columns may go
+ * to NULL upon recheck. Otherwise, it shouldn't need to change the slot
+ * contents, just return true or false to indicate whether the quals still
+ * pass. For simple cases, setting fdw_recheck_quals may be easier than
+ * providing this callback.
+ */
+ if (fdwroutine->RecheckForeignScan &&
+ !fdwroutine->RecheckForeignScan(node, slot))
+ return false;
+
+ return ExecQual(node->fdw_recheck_quals, econtext);
+}
+
+/* ----------------------------------------------------------------
+ * ExecForeignScan(node)
+ *
+ * Fetches the next tuple from the FDW, checks local quals, and
+ * returns it.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecForeignScan(PlanState *pstate)
+{
+ ForeignScanState *node = castNode(ForeignScanState, pstate);
+ ForeignScan *plan = (ForeignScan *) node->ss.ps.plan;
+ EState *estate = node->ss.ps.state;
+
+ /*
+ * Ignore direct modifications when EvalPlanQual is active --- they are
+ * irrelevant for EvalPlanQual rechecking
+ */
+ if (estate->es_epq_active != NULL && plan->operation != CMD_SELECT)
+ return NULL;
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) ForeignNext,
+ (ExecScanRecheckMtd) ForeignRecheck);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecInitForeignScan
+ * ----------------------------------------------------------------
+ */
+ForeignScanState *
+ExecInitForeignScan(ForeignScan *node, EState *estate, int eflags)
+{
+ ForeignScanState *scanstate;
+ Relation currentRelation = NULL;
+ Index scanrelid = node->scan.scanrelid;
+ Index tlistvarno;
+ FdwRoutine *fdwroutine;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ scanstate = makeNode(ForeignScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecForeignScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * open the scan relation, if any; also acquire function pointers from the
+ * FDW's handler
+ */
+ if (scanrelid > 0)
+ {
+ currentRelation = ExecOpenScanRelation(estate, scanrelid, eflags);
+ scanstate->ss.ss_currentRelation = currentRelation;
+ fdwroutine = GetFdwRoutineForRelation(currentRelation, true);
+ }
+ else
+ {
+ /* We can't use the relcache, so get fdwroutine the hard way */
+ fdwroutine = GetFdwRoutineByServerId(node->fs_server);
+ }
+
+ /*
+ * Determine the scan tuple type. If the FDW provided a targetlist
+ * describing the scan tuples, use that; else use base relation's rowtype.
+ */
+ if (node->fdw_scan_tlist != NIL || currentRelation == NULL)
+ {
+ TupleDesc scan_tupdesc;
+
+ scan_tupdesc = ExecTypeFromTL(node->fdw_scan_tlist);
+ ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc,
+ &TTSOpsHeapTuple);
+ /* Node's targetlist will contain Vars with varno = INDEX_VAR */
+ tlistvarno = INDEX_VAR;
+ }
+ else
+ {
+ TupleDesc scan_tupdesc;
+
+ /* don't trust FDWs to return tuples fulfilling NOT NULL constraints */
+ scan_tupdesc = CreateTupleDescCopy(RelationGetDescr(currentRelation));
+ ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc,
+ &TTSOpsHeapTuple);
+ /* Node's targetlist will contain Vars with varno = scanrelid */
+ tlistvarno = scanrelid;
+ }
+
+ /* Don't know what an FDW might return */
+ scanstate->ss.ps.scanopsfixed = false;
+ scanstate->ss.ps.scanopsset = true;
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfoWithVarno(&scanstate->ss, tlistvarno);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+ scanstate->fdw_recheck_quals =
+ ExecInitQual(node->fdw_recheck_quals, (PlanState *) scanstate);
+
+ /*
+ * Determine whether to scan the foreign relation asynchronously or not;
+ * this has to be kept in sync with the code in ExecInitAppend().
+ */
+ scanstate->ss.ps.async_capable = (((Plan *) node)->async_capable &&
+ estate->es_epq_active == NULL);
+
+ /*
+ * Initialize FDW-related state.
+ */
+ scanstate->fdwroutine = fdwroutine;
+ scanstate->fdw_state = NULL;
+
+ /*
+ * For the FDW's convenience, look up the modification target relation's
+ * ResultRelInfo. The ModifyTable node should have initialized it for us,
+ * see ExecInitModifyTable.
+ *
+ * Don't try to look up the ResultRelInfo when EvalPlanQual is active,
+ * though. Direct modifications cannot be re-evaluated as part of
+ * EvalPlanQual. The lookup wouldn't work anyway because during
+ * EvalPlanQual processing, EvalPlanQual only initializes the subtree
+ * under the ModifyTable, and doesn't run ExecInitModifyTable.
+ */
+ if (node->resultRelation > 0 && estate->es_epq_active == NULL)
+ {
+ if (estate->es_result_relations == NULL ||
+ estate->es_result_relations[node->resultRelation - 1] == NULL)
+ {
+ elog(ERROR, "result relation not initialized");
+ }
+ scanstate->resultRelInfo = estate->es_result_relations[node->resultRelation - 1];
+ }
+
+ /* Initialize any outer plan. */
+ if (outerPlan(node))
+ outerPlanState(scanstate) =
+ ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * Tell the FDW to initialize the scan.
+ */
+ if (node->operation != CMD_SELECT)
+ {
+ /*
+ * Direct modifications cannot be re-evaluated by EvalPlanQual, so
+ * don't bother preparing the FDW.
+ *
+ * In case of an inherited UPDATE/DELETE with foreign targets there
+ * can be direct-modify ForeignScan nodes in the EvalPlanQual subtree,
+ * so we need to ignore such ForeignScan nodes during EvalPlanQual
+ * processing. See also ExecForeignScan/ExecReScanForeignScan.
+ */
+ if (estate->es_epq_active == NULL)
+ fdwroutine->BeginDirectModify(scanstate, eflags);
+ }
+ else
+ fdwroutine->BeginForeignScan(scanstate, eflags);
+
+ return scanstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndForeignScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndForeignScan(ForeignScanState *node)
+{
+ ForeignScan *plan = (ForeignScan *) node->ss.ps.plan;
+ EState *estate = node->ss.ps.state;
+
+ /* Let the FDW shut down */
+ if (plan->operation != CMD_SELECT)
+ {
+ if (estate->es_epq_active == NULL)
+ node->fdwroutine->EndDirectModify(node);
+ }
+ else
+ node->fdwroutine->EndForeignScan(node);
+
+ /* Shut down any outer plan. */
+ if (outerPlanState(node))
+ ExecEndNode(outerPlanState(node));
+
+ /* Free the exprcontext */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /* clean out the tuple table */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanForeignScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanForeignScan(ForeignScanState *node)
+{
+ ForeignScan *plan = (ForeignScan *) node->ss.ps.plan;
+ EState *estate = node->ss.ps.state;
+ PlanState *outerPlan = outerPlanState(node);
+
+ /*
+ * Ignore direct modifications when EvalPlanQual is active --- they are
+ * irrelevant for EvalPlanQual rechecking
+ */
+ if (estate->es_epq_active != NULL && plan->operation != CMD_SELECT)
+ return;
+
+ node->fdwroutine->ReScanForeignScan(node);
+
+ /*
+ * If chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode. outerPlan may also be NULL, in which case there is
+ * nothing to rescan at all.
+ */
+ if (outerPlan != NULL && outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+
+ ExecScanReScan(&node->ss);
+}
+
+/* ----------------------------------------------------------------
+ * ExecForeignScanEstimate
+ *
+ * Estimates the size of the parallel coordination information, if any
+ * ----------------------------------------------------------------
+ */
+void
+ExecForeignScanEstimate(ForeignScanState *node, ParallelContext *pcxt)
+{
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ if (fdwroutine->EstimateDSMForeignScan)
+ {
+ node->pscan_len = fdwroutine->EstimateDSMForeignScan(node, pcxt);
+ shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecForeignScanInitializeDSM
+ *
+ * Initialize the parallel coordination information
+ * ----------------------------------------------------------------
+ */
+void
+ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt)
+{
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ if (fdwroutine->InitializeDSMForeignScan)
+ {
+ int plan_node_id = node->ss.ps.plan->plan_node_id;
+ void *coordinate;
+
+ coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len);
+ fdwroutine->InitializeDSMForeignScan(node, pcxt, coordinate);
+ shm_toc_insert(pcxt->toc, plan_node_id, coordinate);
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecForeignScanReInitializeDSM
+ *
+ * Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecForeignScanReInitializeDSM(ForeignScanState *node, ParallelContext *pcxt)
+{
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ if (fdwroutine->ReInitializeDSMForeignScan)
+ {
+ int plan_node_id = node->ss.ps.plan->plan_node_id;
+ void *coordinate;
+
+ coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false);
+ fdwroutine->ReInitializeDSMForeignScan(node, pcxt, coordinate);
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecForeignScanInitializeWorker
+ *
+ * Initialization according to the parallel coordination information
+ * ----------------------------------------------------------------
+ */
+void
+ExecForeignScanInitializeWorker(ForeignScanState *node,
+ ParallelWorkerContext *pwcxt)
+{
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ if (fdwroutine->InitializeWorkerForeignScan)
+ {
+ int plan_node_id = node->ss.ps.plan->plan_node_id;
+ void *coordinate;
+
+ coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false);
+ fdwroutine->InitializeWorkerForeignScan(node, pwcxt->toc, coordinate);
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecShutdownForeignScan
+ *
+ * Gives the FDW a chance to stop asynchronous resource consumption
+ * and release any resources still held.
+ * ----------------------------------------------------------------
+ */
+void
+ExecShutdownForeignScan(ForeignScanState *node)
+{
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ if (fdwroutine->ShutdownForeignScan)
+ fdwroutine->ShutdownForeignScan(node);
+}
+
+/* ----------------------------------------------------------------
+ * ExecAsyncForeignScanRequest
+ *
+ * Asynchronously request a tuple from a designated async-capable node
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanRequest(AsyncRequest *areq)
+{
+ ForeignScanState *node = (ForeignScanState *) areq->requestee;
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ Assert(fdwroutine->ForeignAsyncRequest != NULL);
+ fdwroutine->ForeignAsyncRequest(areq);
+}
+
+/* ----------------------------------------------------------------
+ * ExecAsyncForeignScanConfigureWait
+ *
+ * In async mode, configure for a wait
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanConfigureWait(AsyncRequest *areq)
+{
+ ForeignScanState *node = (ForeignScanState *) areq->requestee;
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ Assert(fdwroutine->ForeignAsyncConfigureWait != NULL);
+ fdwroutine->ForeignAsyncConfigureWait(areq);
+}
+
+/* ----------------------------------------------------------------
+ * ExecAsyncForeignScanNotify
+ *
+ * Callback invoked when a relevant event has occurred
+ * ----------------------------------------------------------------
+ */
+void
+ExecAsyncForeignScanNotify(AsyncRequest *areq)
+{
+ ForeignScanState *node = (ForeignScanState *) areq->requestee;
+ FdwRoutine *fdwroutine = node->fdwroutine;
+
+ Assert(fdwroutine->ForeignAsyncNotify != NULL);
+ fdwroutine->ForeignAsyncNotify(areq);
+}
diff --git a/src/backend/executor/nodeFunctionscan.c b/src/backend/executor/nodeFunctionscan.c
new file mode 100644
index 0000000..b31b2b2
--- /dev/null
+++ b/src/backend/executor/nodeFunctionscan.c
@@ -0,0 +1,620 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeFunctionscan.c
+ * Support routines for scanning RangeFunctions (functions in rangetable).
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeFunctionscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecFunctionScan scans a function.
+ * ExecFunctionNext retrieve next tuple in sequential order.
+ * ExecInitFunctionScan creates and initializes a functionscan node.
+ * ExecEndFunctionScan releases any storage allocated.
+ * ExecReScanFunctionScan rescans the function
+ */
+#include "postgres.h"
+
+#include "catalog/pg_type.h"
+#include "executor/nodeFunctionscan.h"
+#include "funcapi.h"
+#include "nodes/nodeFuncs.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Runtime data for each function being scanned.
+ */
+typedef struct FunctionScanPerFuncState
+{
+ SetExprState *setexpr; /* state of the expression being evaluated */
+ TupleDesc tupdesc; /* desc of the function result type */
+ int colcount; /* expected number of result columns */
+ Tuplestorestate *tstore; /* holds the function result set */
+ int64 rowcount; /* # of rows in result set, -1 if not known */
+ TupleTableSlot *func_slot; /* function result slot (or NULL) */
+} FunctionScanPerFuncState;
+
+static TupleTableSlot *FunctionNext(FunctionScanState *node);
+
+
+/* ----------------------------------------------------------------
+ * Scan Support
+ * ----------------------------------------------------------------
+ */
+/* ----------------------------------------------------------------
+ * FunctionNext
+ *
+ * This is a workhorse for ExecFunctionScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+FunctionNext(FunctionScanState *node)
+{
+ EState *estate;
+ ScanDirection direction;
+ TupleTableSlot *scanslot;
+ bool alldone;
+ int64 oldpos;
+ int funcno;
+ int att;
+
+ /*
+ * get information from the estate and scan state
+ */
+ estate = node->ss.ps.state;
+ direction = estate->es_direction;
+ scanslot = node->ss.ss_ScanTupleSlot;
+
+ if (node->simple)
+ {
+ /*
+ * Fast path for the trivial case: the function return type and scan
+ * result type are the same, so we fetch the function result straight
+ * into the scan result slot. No need to update ordinality or
+ * rowcounts either.
+ */
+ Tuplestorestate *tstore = node->funcstates[0].tstore;
+
+ /*
+ * If first time through, read all tuples from function and put them
+ * in a tuplestore. Subsequent calls just fetch tuples from
+ * tuplestore.
+ */
+ if (tstore == NULL)
+ {
+ node->funcstates[0].tstore = tstore =
+ ExecMakeTableFunctionResult(node->funcstates[0].setexpr,
+ node->ss.ps.ps_ExprContext,
+ node->argcontext,
+ node->funcstates[0].tupdesc,
+ node->eflags & EXEC_FLAG_BACKWARD);
+
+ /*
+ * paranoia - cope if the function, which may have constructed the
+ * tuplestore itself, didn't leave it pointing at the start. This
+ * call is fast, so the overhead shouldn't be an issue.
+ */
+ tuplestore_rescan(tstore);
+ }
+
+ /*
+ * Get the next tuple from tuplestore.
+ */
+ (void) tuplestore_gettupleslot(tstore,
+ ScanDirectionIsForward(direction),
+ false,
+ scanslot);
+ return scanslot;
+ }
+
+ /*
+ * Increment or decrement ordinal counter before checking for end-of-data,
+ * so that we can move off either end of the result by 1 (and no more than
+ * 1) without losing correct count. See PortalRunSelect for why we can
+ * assume that we won't be called repeatedly in the end-of-data state.
+ */
+ oldpos = node->ordinal;
+ if (ScanDirectionIsForward(direction))
+ node->ordinal++;
+ else
+ node->ordinal--;
+
+ /*
+ * Main loop over functions.
+ *
+ * We fetch the function results into func_slots (which match the function
+ * return types), and then copy the values to scanslot (which matches the
+ * scan result type), setting the ordinal column (if any) as well.
+ */
+ ExecClearTuple(scanslot);
+ att = 0;
+ alldone = true;
+ for (funcno = 0; funcno < node->nfuncs; funcno++)
+ {
+ FunctionScanPerFuncState *fs = &node->funcstates[funcno];
+ int i;
+
+ /*
+ * If first time through, read all tuples from function and put them
+ * in a tuplestore. Subsequent calls just fetch tuples from
+ * tuplestore.
+ */
+ if (fs->tstore == NULL)
+ {
+ fs->tstore =
+ ExecMakeTableFunctionResult(fs->setexpr,
+ node->ss.ps.ps_ExprContext,
+ node->argcontext,
+ fs->tupdesc,
+ node->eflags & EXEC_FLAG_BACKWARD);
+
+ /*
+ * paranoia - cope if the function, which may have constructed the
+ * tuplestore itself, didn't leave it pointing at the start. This
+ * call is fast, so the overhead shouldn't be an issue.
+ */
+ tuplestore_rescan(fs->tstore);
+ }
+
+ /*
+ * Get the next tuple from tuplestore.
+ *
+ * If we have a rowcount for the function, and we know the previous
+ * read position was out of bounds, don't try the read. This allows
+ * backward scan to work when there are mixed row counts present.
+ */
+ if (fs->rowcount != -1 && fs->rowcount < oldpos)
+ ExecClearTuple(fs->func_slot);
+ else
+ (void) tuplestore_gettupleslot(fs->tstore,
+ ScanDirectionIsForward(direction),
+ false,
+ fs->func_slot);
+
+ if (TupIsNull(fs->func_slot))
+ {
+ /*
+ * If we ran out of data for this function in the forward
+ * direction then we now know how many rows it returned. We need
+ * to know this in order to handle backwards scans. The row count
+ * we store is actually 1+ the actual number, because we have to
+ * position the tuplestore 1 off its end sometimes.
+ */
+ if (ScanDirectionIsForward(direction) && fs->rowcount == -1)
+ fs->rowcount = node->ordinal;
+
+ /*
+ * populate the result cols with nulls
+ */
+ for (i = 0; i < fs->colcount; i++)
+ {
+ scanslot->tts_values[att] = (Datum) 0;
+ scanslot->tts_isnull[att] = true;
+ att++;
+ }
+ }
+ else
+ {
+ /*
+ * we have a result, so just copy it to the result cols.
+ */
+ slot_getallattrs(fs->func_slot);
+
+ for (i = 0; i < fs->colcount; i++)
+ {
+ scanslot->tts_values[att] = fs->func_slot->tts_values[i];
+ scanslot->tts_isnull[att] = fs->func_slot->tts_isnull[i];
+ att++;
+ }
+
+ /*
+ * We're not done until every function result is exhausted; we pad
+ * the shorter results with nulls until then.
+ */
+ alldone = false;
+ }
+ }
+
+ /*
+ * ordinal col is always last, per spec.
+ */
+ if (node->ordinality)
+ {
+ scanslot->tts_values[att] = Int64GetDatumFast(node->ordinal);
+ scanslot->tts_isnull[att] = false;
+ }
+
+ /*
+ * If alldone, we just return the previously-cleared scanslot. Otherwise,
+ * finish creating the virtual tuple.
+ */
+ if (!alldone)
+ ExecStoreVirtualTuple(scanslot);
+
+ return scanslot;
+}
+
+/*
+ * FunctionRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+FunctionRecheck(FunctionScanState *node, TupleTableSlot *slot)
+{
+ /* nothing to check */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecFunctionScan(node)
+ *
+ * Scans the function sequentially and returns the next qualifying
+ * tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecFunctionScan(PlanState *pstate)
+{
+ FunctionScanState *node = castNode(FunctionScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) FunctionNext,
+ (ExecScanRecheckMtd) FunctionRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitFunctionScan
+ * ----------------------------------------------------------------
+ */
+FunctionScanState *
+ExecInitFunctionScan(FunctionScan *node, EState *estate, int eflags)
+{
+ FunctionScanState *scanstate;
+ int nfuncs = list_length(node->functions);
+ TupleDesc scan_tupdesc;
+ int i,
+ natts;
+ ListCell *lc;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & EXEC_FLAG_MARK));
+
+ /*
+ * FunctionScan should not have any children.
+ */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create new ScanState for node
+ */
+ scanstate = makeNode(FunctionScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecFunctionScan;
+ scanstate->eflags = eflags;
+
+ /*
+ * are we adding an ordinality column?
+ */
+ scanstate->ordinality = node->funcordinality;
+
+ scanstate->nfuncs = nfuncs;
+ if (nfuncs == 1 && !node->funcordinality)
+ scanstate->simple = true;
+ else
+ scanstate->simple = false;
+
+ /*
+ * Ordinal 0 represents the "before the first row" position.
+ *
+ * We need to track ordinal position even when not adding an ordinality
+ * column to the result, in order to handle backwards scanning properly
+ * with multiple functions with different result sizes. (We can't position
+ * any individual function's tuplestore any more than 1 place beyond its
+ * end, so when scanning backwards, we need to know when to start
+ * including the function in the scan again.)
+ */
+ scanstate->ordinal = 0;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ scanstate->funcstates = palloc(nfuncs * sizeof(FunctionScanPerFuncState));
+
+ natts = 0;
+ i = 0;
+ foreach(lc, node->functions)
+ {
+ RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
+ Node *funcexpr = rtfunc->funcexpr;
+ int colcount = rtfunc->funccolcount;
+ FunctionScanPerFuncState *fs = &scanstate->funcstates[i];
+ TypeFuncClass functypclass;
+ Oid funcrettype;
+ TupleDesc tupdesc;
+
+ fs->setexpr =
+ ExecInitTableFunctionResult((Expr *) funcexpr,
+ scanstate->ss.ps.ps_ExprContext,
+ &scanstate->ss.ps);
+
+ /*
+ * Don't allocate the tuplestores; the actual calls to the functions
+ * do that. NULL means that we have not called the function yet (or
+ * need to call it again after a rescan).
+ */
+ fs->tstore = NULL;
+ fs->rowcount = -1;
+
+ /*
+ * Now determine if the function returns a simple or composite type,
+ * and build an appropriate tupdesc. Note that in the composite case,
+ * the function may now return more columns than it did when the plan
+ * was made; we have to ignore any columns beyond "colcount".
+ */
+ functypclass = get_expr_result_type(funcexpr,
+ &funcrettype,
+ &tupdesc);
+
+ if (functypclass == TYPEFUNC_COMPOSITE ||
+ functypclass == TYPEFUNC_COMPOSITE_DOMAIN)
+ {
+ /* Composite data type, e.g. a table's row type */
+ Assert(tupdesc);
+ Assert(tupdesc->natts >= colcount);
+ /* Must copy it out of typcache for safety */
+ tupdesc = CreateTupleDescCopy(tupdesc);
+ }
+ else if (functypclass == TYPEFUNC_SCALAR)
+ {
+ /* Base data type, i.e. scalar */
+ tupdesc = CreateTemplateTupleDesc(1);
+ TupleDescInitEntry(tupdesc,
+ (AttrNumber) 1,
+ NULL, /* don't care about the name here */
+ funcrettype,
+ -1,
+ 0);
+ TupleDescInitEntryCollation(tupdesc,
+ (AttrNumber) 1,
+ exprCollation(funcexpr));
+ }
+ else if (functypclass == TYPEFUNC_RECORD)
+ {
+ tupdesc = BuildDescFromLists(rtfunc->funccolnames,
+ rtfunc->funccoltypes,
+ rtfunc->funccoltypmods,
+ rtfunc->funccolcollations);
+
+ /*
+ * For RECORD results, make sure a typmod has been assigned. (The
+ * function should do this for itself, but let's cover things in
+ * case it doesn't.)
+ */
+ BlessTupleDesc(tupdesc);
+ }
+ else
+ {
+ /* crummy error message, but parser should have caught this */
+ elog(ERROR, "function in FROM has unsupported return type");
+ }
+
+ fs->tupdesc = tupdesc;
+ fs->colcount = colcount;
+
+ /*
+ * We only need separate slots for the function results if we are
+ * doing ordinality or multiple functions; otherwise, we'll fetch
+ * function results directly into the scan slot.
+ */
+ if (!scanstate->simple)
+ {
+ fs->func_slot = ExecInitExtraTupleSlot(estate, fs->tupdesc,
+ &TTSOpsMinimalTuple);
+ }
+ else
+ fs->func_slot = NULL;
+
+ natts += colcount;
+ i++;
+ }
+
+ /*
+ * Create the combined TupleDesc
+ *
+ * If there is just one function without ordinality, the scan result
+ * tupdesc is the same as the function result tupdesc --- except that we
+ * may stuff new names into it below, so drop any rowtype label.
+ */
+ if (scanstate->simple)
+ {
+ scan_tupdesc = CreateTupleDescCopy(scanstate->funcstates[0].tupdesc);
+ scan_tupdesc->tdtypeid = RECORDOID;
+ scan_tupdesc->tdtypmod = -1;
+ }
+ else
+ {
+ AttrNumber attno = 0;
+
+ if (node->funcordinality)
+ natts++;
+
+ scan_tupdesc = CreateTemplateTupleDesc(natts);
+
+ for (i = 0; i < nfuncs; i++)
+ {
+ TupleDesc tupdesc = scanstate->funcstates[i].tupdesc;
+ int colcount = scanstate->funcstates[i].colcount;
+ int j;
+
+ for (j = 1; j <= colcount; j++)
+ TupleDescCopyEntry(scan_tupdesc, ++attno, tupdesc, j);
+ }
+
+ /* If doing ordinality, add a column of type "bigint" at the end */
+ if (node->funcordinality)
+ {
+ TupleDescInitEntry(scan_tupdesc,
+ ++attno,
+ NULL, /* don't care about the name here */
+ INT8OID,
+ -1,
+ 0);
+ }
+
+ Assert(attno == natts);
+ }
+
+ /*
+ * Initialize scan slot and type.
+ */
+ ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc,
+ &TTSOpsMinimalTuple);
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+
+ /*
+ * Create a memory context that ExecMakeTableFunctionResult can use to
+ * evaluate function arguments in. We can't use the per-tuple context for
+ * this because it gets reset too often; but we don't want to leak
+ * evaluation results into the query-lifespan context either. We just
+ * need one context, because we evaluate each function separately.
+ */
+ scanstate->argcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "Table function arguments",
+ ALLOCSET_DEFAULT_SIZES);
+
+ return scanstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndFunctionScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndFunctionScan(FunctionScanState *node)
+{
+ int i;
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * Release slots and tuplestore resources
+ */
+ for (i = 0; i < node->nfuncs; i++)
+ {
+ FunctionScanPerFuncState *fs = &node->funcstates[i];
+
+ if (fs->func_slot)
+ ExecClearTuple(fs->func_slot);
+
+ if (fs->tstore != NULL)
+ {
+ tuplestore_end(node->funcstates[i].tstore);
+ fs->tstore = NULL;
+ }
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanFunctionScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanFunctionScan(FunctionScanState *node)
+{
+ FunctionScan *scan = (FunctionScan *) node->ss.ps.plan;
+ int i;
+ Bitmapset *chgparam = node->ss.ps.chgParam;
+
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ for (i = 0; i < node->nfuncs; i++)
+ {
+ FunctionScanPerFuncState *fs = &node->funcstates[i];
+
+ if (fs->func_slot)
+ ExecClearTuple(fs->func_slot);
+ }
+
+ ExecScanReScan(&node->ss);
+
+ /*
+ * Here we have a choice whether to drop the tuplestores (and recompute
+ * the function outputs) or just rescan them. We must recompute if an
+ * expression contains changed parameters, else we rescan.
+ *
+ * XXX maybe we should recompute if the function is volatile? But in
+ * general the executor doesn't conditionalize its actions on that.
+ */
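+
+ /*
+ * Editorial illustration, not part of the upstream source: in a query
+ * such as
+ *
+ * SELECT * FROM t, LATERAL generate_series(1, t.n) AS g(i);
+ *
+ * (t being a hypothetical table with an integer column n), t.n is
+ * normally passed down as an executor parameter, so a rescan for a new
+ * outer row finds that parameter in chgParam, drops the tuplestore
+ * below, and recomputes the function; a call with no outer dependency is
+ * simply rewound instead.
+ */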
+ if (chgparam)
+ {
+ ListCell *lc;
+
+ i = 0;
+ foreach(lc, scan->functions)
+ {
+ RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
+
+ if (bms_overlap(chgparam, rtfunc->funcparams))
+ {
+ if (node->funcstates[i].tstore != NULL)
+ {
+ tuplestore_end(node->funcstates[i].tstore);
+ node->funcstates[i].tstore = NULL;
+ }
+ node->funcstates[i].rowcount = -1;
+ }
+ i++;
+ }
+ }
+
+ /* Reset ordinality counter */
+ node->ordinal = 0;
+
+ /* Make sure we rewind any remaining tuplestores */
+ for (i = 0; i < node->nfuncs; i++)
+ {
+ if (node->funcstates[i].tstore != NULL)
+ tuplestore_rescan(node->funcstates[i].tstore);
+ }
+}
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
new file mode 100644
index 0000000..734142b
--- /dev/null
+++ b/src/backend/executor/nodeGather.c
@@ -0,0 +1,477 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeGather.c
+ * Support routines for scanning a plan via multiple workers.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * A Gather executor launches parallel workers to run multiple copies of a
+ * plan. It can also run the plan itself, if the workers are not available
+ * or have not started up yet. It then merges all of the results it produces
+ * and the results from the workers into a single output stream. Therefore,
+ * it will normally be used with a plan where running multiple copies of the
+ * same plan does not produce duplicate output, such as parallel-aware
+ * SeqScan.
+ *
+ * Alternatively, a Gather node can be configured to use just one worker
+ * with the single-copy flag set. In that case, the Gather node runs the
+ * plan in a single worker and does not execute the plan itself; it simply
+ * returns whatever tuples the worker produced. If a worker cannot be
+ * obtained, the Gather node falls back to running the plan itself and
+ * returning the results. Therefore, a plan used with a single-copy Gather
+ * node need not be parallel-aware.
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeGather.c
+ *
+ *-------------------------------------------------------------------------
+ */
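+
+/*
+ * Editorial illustration, not part of the upstream header: a typical plan
+ * shape in which Gather appears is
+ *
+ * Gather
+ * Workers Planned: 2
+ * -> Parallel Seq Scan on foo
+ * Filter: (x > 10)
+ *
+ * (foo being a hypothetical table), where each participant scans a
+ * disjoint subset of foo's blocks, so merging the streams yields every
+ * qualifying row exactly once.
+ */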
+
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "access/xact.h"
+#include "executor/execdebug.h"
+#include "executor/execParallel.h"
+#include "executor/nodeGather.h"
+#include "executor/nodeSubplan.h"
+#include "executor/tqueue.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "pgstat.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+static TupleTableSlot *ExecGather(PlanState *pstate);
+static TupleTableSlot *gather_getnext(GatherState *gatherstate);
+static MinimalTuple gather_readnext(GatherState *gatherstate);
+static void ExecShutdownGatherWorkers(GatherState *node);
+
+
+/* ----------------------------------------------------------------
+ * ExecInitGather
+ * ----------------------------------------------------------------
+ */
+GatherState *
+ExecInitGather(Gather *node, EState *estate, int eflags)
+{
+ GatherState *gatherstate;
+ Plan *outerNode;
+ TupleDesc tupDesc;
+
+ /* Gather node doesn't have innerPlan node. */
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create state structure
+ */
+ gatherstate = makeNode(GatherState);
+ gatherstate->ps.plan = (Plan *) node;
+ gatherstate->ps.state = estate;
+ gatherstate->ps.ExecProcNode = ExecGather;
+
+ gatherstate->initialized = false;
+ gatherstate->need_to_scan_locally =
+ !node->single_copy && parallel_leader_participation;
+ gatherstate->tuples_needed = -1;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &gatherstate->ps);
+
+ /*
+ * now initialize outer plan
+ */
+ outerNode = outerPlan(node);
+ outerPlanState(gatherstate) = ExecInitNode(outerNode, estate, eflags);
+ tupDesc = ExecGetResultType(outerPlanState(gatherstate));
+
+ /*
+ * Leader may access ExecProcNode result directly (if
+ * need_to_scan_locally), or from workers via tuple queue. So we can't
+ * trivially rely on the slot type being fixed for expressions evaluated
+ * within this node.
+ */
+ gatherstate->ps.outeropsset = true;
+ gatherstate->ps.outeropsfixed = false;
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&gatherstate->ps);
+ ExecConditionalAssignProjectionInfo(&gatherstate->ps, tupDesc, OUTER_VAR);
+
+ /*
+ * Without projections result slot type is not trivially known, see
+ * comment above.
+ */
+ if (gatherstate->ps.ps_ProjInfo == NULL)
+ {
+ gatherstate->ps.resultopsset = true;
+ gatherstate->ps.resultopsfixed = false;
+ }
+
+ /*
+ * Initialize funnel slot to same tuple descriptor as outer plan.
+ */
+ gatherstate->funnel_slot = ExecInitExtraTupleSlot(estate, tupDesc,
+ &TTSOpsMinimalTuple);
+
+ /*
+ * Gather doesn't support checking a qual (it's always more efficient to
+ * do it in the child node).
+ */
+ Assert(!node->plan.qual);
+
+ return gatherstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecGather(node)
+ *
+ * Scans the relation via multiple workers and returns
+ * the next qualifying tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecGather(PlanState *pstate)
+{
+ GatherState *node = castNode(GatherState, pstate);
+ TupleTableSlot *slot;
+ ExprContext *econtext;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Initialize the parallel context and workers on first execution. We do
+ * this on first execution rather than during node initialization, as it
+ * needs to allocate a large dynamic segment, so it is better to do it
+ * only if it is really needed.
+ */
+ if (!node->initialized)
+ {
+ EState *estate = node->ps.state;
+ Gather *gather = (Gather *) node->ps.plan;
+
+ /*
+ * Sometimes we might have to run without parallelism; but if parallel
+ * mode is active then we can try to fire up some workers.
+ */
+ if (gather->num_workers > 0 && estate->es_use_parallel_mode)
+ {
+ ParallelContext *pcxt;
+
+ /* Initialize, or re-initialize, shared state needed by workers. */
+ if (!node->pei)
+ node->pei = ExecInitParallelPlan(node->ps.lefttree,
+ estate,
+ gather->initParam,
+ gather->num_workers,
+ node->tuples_needed);
+ else
+ ExecParallelReinitialize(node->ps.lefttree,
+ node->pei,
+ gather->initParam);
+
+ /*
+ * Register backend workers. We might not get as many as we
+ * requested, or indeed any at all.
+ */
+ pcxt = node->pei->pcxt;
+ LaunchParallelWorkers(pcxt);
+ /* We save # workers launched for the benefit of EXPLAIN */
+ node->nworkers_launched = pcxt->nworkers_launched;
+
+ /* Set up tuple queue readers to read the results. */
+ if (pcxt->nworkers_launched > 0)
+ {
+ ExecParallelCreateReaders(node->pei);
+ /* Make a working array showing the active readers */
+ node->nreaders = pcxt->nworkers_launched;
+ node->reader = (TupleQueueReader **)
+ palloc(node->nreaders * sizeof(TupleQueueReader *));
+ memcpy(node->reader, node->pei->reader,
+ node->nreaders * sizeof(TupleQueueReader *));
+ }
+ else
+ {
+ /* No workers? Then never mind. */
+ node->nreaders = 0;
+ node->reader = NULL;
+ }
+ node->nextreader = 0;
+ }
+
+ /*
+ * Run the plan locally if we have no workers, or if leader participation
+ * is enabled and this is not a single-copy Gather.
+ */
+ node->need_to_scan_locally = (node->nreaders == 0)
+ || (!gather->single_copy && parallel_leader_participation);
+ node->initialized = true;
+ }
+
+ /*
+ * Reset per-tuple memory context to free any expression evaluation
+ * storage allocated in the previous tuple cycle.
+ */
+ econtext = node->ps.ps_ExprContext;
+ ResetExprContext(econtext);
+
+ /*
+ * Get next tuple, either from one of our workers, or by running the plan
+ * ourselves.
+ */
+ slot = gather_getnext(node);
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* If no projection is required, we're done. */
+ if (node->ps.ps_ProjInfo == NULL)
+ return slot;
+
+ /*
+ * Form the result tuple using ExecProject(), and return it.
+ */
+ econtext->ecxt_outertuple = slot;
+ return ExecProject(node->ps.ps_ProjInfo);
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndGather
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndGather(GatherState *node)
+{
+ ExecEndNode(outerPlanState(node)); /* let children clean up first */
+ ExecShutdownGather(node);
+ ExecFreeExprContext(&node->ps);
+ if (node->ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+}
+
+/*
+ * Read the next tuple. We might fetch a tuple from one of the tuple queues
+ * using gather_readnext, or if no tuple queue contains a tuple and the
+ * single_copy flag is not set, we might generate one locally instead.
+ */
+static TupleTableSlot *
+gather_getnext(GatherState *gatherstate)
+{
+ PlanState *outerPlan = outerPlanState(gatherstate);
+ TupleTableSlot *outerTupleSlot;
+ TupleTableSlot *fslot = gatherstate->funnel_slot;
+ MinimalTuple tup;
+
+ while (gatherstate->nreaders > 0 || gatherstate->need_to_scan_locally)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ if (gatherstate->nreaders > 0)
+ {
+ tup = gather_readnext(gatherstate);
+
+ if (HeapTupleIsValid(tup))
+ {
+ ExecStoreMinimalTuple(tup, /* tuple to store */
+ fslot, /* slot to store the tuple */
+ false); /* don't pfree tuple */
+ return fslot;
+ }
+ }
+
+ if (gatherstate->need_to_scan_locally)
+ {
+ EState *estate = gatherstate->ps.state;
+
+ /* Install our DSA area while executing the plan. */
+ estate->es_query_dsa =
+ gatherstate->pei ? gatherstate->pei->area : NULL;
+ outerTupleSlot = ExecProcNode(outerPlan);
+ estate->es_query_dsa = NULL;
+
+ if (!TupIsNull(outerTupleSlot))
+ return outerTupleSlot;
+
+ gatherstate->need_to_scan_locally = false;
+ }
+ }
+
+ return ExecClearTuple(fslot);
+}
+
+/*
+ * Attempt to read a tuple from one of our parallel workers.
+ */
+static MinimalTuple
+gather_readnext(GatherState *gatherstate)
+{
+ int nvisited = 0;
+
+ for (;;)
+ {
+ TupleQueueReader *reader;
+ MinimalTuple tup;
+ bool readerdone;
+
+ /* Check for async events, particularly messages from workers. */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Attempt to read a tuple, but don't block if none is available.
+ *
+ * Note that TupleQueueReaderNext will just return NULL for a worker
+ * which fails to initialize. We'll treat that worker as having
+ * produced no tuples; WaitForParallelWorkersToFinish will error out
+ * when we get there.
+ */
+ Assert(gatherstate->nextreader < gatherstate->nreaders);
+ reader = gatherstate->reader[gatherstate->nextreader];
+ tup = TupleQueueReaderNext(reader, true, &readerdone);
+
+ /*
+ * If this reader is done, remove it from our working array of active
+ * readers. If all readers are done, we're outta here.
+ */
+ if (readerdone)
+ {
+ Assert(!tup);
+ --gatherstate->nreaders;
+ if (gatherstate->nreaders == 0)
+ {
+ ExecShutdownGatherWorkers(gatherstate);
+ return NULL;
+ }
+ memmove(&gatherstate->reader[gatherstate->nextreader],
+ &gatherstate->reader[gatherstate->nextreader + 1],
+ sizeof(TupleQueueReader *)
+ * (gatherstate->nreaders - gatherstate->nextreader));
+ if (gatherstate->nextreader >= gatherstate->nreaders)
+ gatherstate->nextreader = 0;
+ continue;
+ }
+
+ /* If we got a tuple, return it. */
+ if (tup)
+ return tup;
+
+ /*
+ * Advance nextreader pointer in round-robin fashion. Note that we
+ * only reach this code if we weren't able to get a tuple from the
+ * current worker. We used to advance the nextreader pointer after
+ * every tuple, but it turns out to be much more efficient to keep
+ * reading from the same queue until that would require blocking.
+ */
+ gatherstate->nextreader++;
+ if (gatherstate->nextreader >= gatherstate->nreaders)
+ gatherstate->nextreader = 0;
+
+ /* Have we visited every (surviving) TupleQueueReader? */
+ nvisited++;
+ if (nvisited >= gatherstate->nreaders)
+ {
+ /*
+ * If (still) running plan locally, return NULL so caller can
+ * generate another tuple from the local copy of the plan.
+ */
+ if (gatherstate->need_to_scan_locally)
+ return NULL;
+
+ /* Nothing to do except wait for developments. */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_EXECUTE_GATHER);
+ ResetLatch(MyLatch);
+ nvisited = 0;
+ }
+ }
+}
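+
+/*
+ * Editorial note, not part of the upstream source: with, say, three active
+ * readers, the loop above keeps draining the current reader's queue until
+ * reading would block, then advances round-robin; only after a full
+ * circuit with no tuple (nvisited >= nreaders) does it fall back to the
+ * leader's local scan or wait on the process latch.
+ */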
+
+/* ----------------------------------------------------------------
+ * ExecShutdownGatherWorkers
+ *
+ * Stop all the parallel workers.
+ * ----------------------------------------------------------------
+ */
+static void
+ExecShutdownGatherWorkers(GatherState *node)
+{
+ if (node->pei != NULL)
+ ExecParallelFinish(node->pei);
+
+ /* Flush local copy of reader array */
+ if (node->reader)
+ pfree(node->reader);
+ node->reader = NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecShutdownGather
+ *
+ * Destroy the setup for parallel workers including parallel context.
+ * ----------------------------------------------------------------
+ */
+void
+ExecShutdownGather(GatherState *node)
+{
+ ExecShutdownGatherWorkers(node);
+
+ /* Now destroy the parallel context. */
+ if (node->pei != NULL)
+ {
+ ExecParallelCleanup(node->pei);
+ node->pei = NULL;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * Join Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecReScanGather
+ *
+ * Prepare to re-scan the result of a Gather.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanGather(GatherState *node)
+{
+ Gather *gather = (Gather *) node->ps.plan;
+ PlanState *outerPlan = outerPlanState(node);
+
+ /* Make sure any existing workers are gracefully shut down */
+ ExecShutdownGatherWorkers(node);
+
+ /* Mark node so that shared state will be rebuilt at next call */
+ node->initialized = false;
+
+ /*
+ * Set child node's chgParam to tell it that the next scan might deliver a
+ * different set of rows within the leader process. (The overall rowset
+ * shouldn't change, but the leader process's subset might; hence nodes
+ * between here and the parallel table scan node mustn't optimize on the
+ * assumption of an unchanging rowset.)
+ */
+ if (gather->rescan_param >= 0)
+ outerPlan->chgParam = bms_add_member(outerPlan->chgParam,
+ gather->rescan_param);
+
+ /*
+ * If chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode. Note: because this does nothing if we have a
+ * rescan_param, it's currently guaranteed that parallel-aware child nodes
+ * will not see a ReScan call until after they get a ReInitializeDSM call.
+ * That ordering might not be something to rely on, though. A good rule
+ * of thumb is that ReInitializeDSM should reset only shared state, ReScan
+ * should reset only local state, and anything that depends on both of
+ * those steps being finished must wait until the first ExecProcNode call.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c
new file mode 100644
index 0000000..03f02a1
--- /dev/null
+++ b/src/backend/executor/nodeGatherMerge.c
@@ -0,0 +1,789 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeGatherMerge.c
+ * Scan a plan in multiple workers, and do order-preserving merge.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeGatherMerge.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "access/xact.h"
+#include "executor/execdebug.h"
+#include "executor/execParallel.h"
+#include "executor/nodeGatherMerge.h"
+#include "executor/nodeSubplan.h"
+#include "executor/tqueue.h"
+#include "lib/binaryheap.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/*
+ * When we read tuples from workers, it's a good idea to read several at once
+ * for efficiency when possible: this minimizes context-switching overhead.
+ * But reading too many at a time wastes memory without improving performance.
+ * We'll read up to MAX_TUPLE_STORE tuples (in addition to the first one).
+ */
+#define MAX_TUPLE_STORE 10
+
+/*
+ * Pending-tuple array for each worker. This holds additional tuples that
+ * we were able to fetch from the worker, but can't process yet. In addition,
+ * this struct holds the "done" flag indicating the worker is known to have
+ * no more tuples. (We do not use this struct for the leader; we don't keep
+ * any pending tuples for the leader, and the need_to_scan_locally flag serves
+ * as its "done" indicator.)
+ */
+typedef struct GMReaderTupleBuffer
+{
+ MinimalTuple *tuple; /* array of length MAX_TUPLE_STORE */
+ int nTuples; /* number of tuples currently stored */
+ int readCounter; /* index of next tuple to extract */
+ bool done; /* true if reader is known exhausted */
+} GMReaderTupleBuffer;
+
+static TupleTableSlot *ExecGatherMerge(PlanState *pstate);
+static int32 heap_compare_slots(Datum a, Datum b, void *arg);
+static TupleTableSlot *gather_merge_getnext(GatherMergeState *gm_state);
+static MinimalTuple gm_readnext_tuple(GatherMergeState *gm_state, int nreader,
+ bool nowait, bool *done);
+static void ExecShutdownGatherMergeWorkers(GatherMergeState *node);
+static void gather_merge_setup(GatherMergeState *gm_state);
+static void gather_merge_init(GatherMergeState *gm_state);
+static void gather_merge_clear_tuples(GatherMergeState *gm_state);
+static bool gather_merge_readnext(GatherMergeState *gm_state, int reader,
+ bool nowait);
+static void load_tuple_array(GatherMergeState *gm_state, int reader);
+
+/* ----------------------------------------------------------------
+ * ExecInitGatherMerge
+ * ----------------------------------------------------------------
+ */
+GatherMergeState *
+ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags)
+{
+ GatherMergeState *gm_state;
+ Plan *outerNode;
+ TupleDesc tupDesc;
+
+ /* Gather merge node doesn't have innerPlan node. */
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create state structure
+ */
+ gm_state = makeNode(GatherMergeState);
+ gm_state->ps.plan = (Plan *) node;
+ gm_state->ps.state = estate;
+ gm_state->ps.ExecProcNode = ExecGatherMerge;
+
+ gm_state->initialized = false;
+ gm_state->gm_initialized = false;
+ gm_state->tuples_needed = -1;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &gm_state->ps);
+
+ /*
+ * GatherMerge doesn't support checking a qual (it's always more efficient
+ * to do it in the child node).
+ */
+ Assert(!node->plan.qual);
+
+ /*
+ * now initialize outer plan
+ */
+ outerNode = outerPlan(node);
+ outerPlanState(gm_state) = ExecInitNode(outerNode, estate, eflags);
+
+ /*
+ * Leader may access ExecProcNode result directly (if
+ * need_to_scan_locally), or from workers via tuple queue. So we can't
+ * trivially rely on the slot type being fixed for expressions evaluated
+ * within this node.
+ */
+ gm_state->ps.outeropsset = true;
+ gm_state->ps.outeropsfixed = false;
+
+ /*
+ * Store the tuple descriptor into gather merge state, so we can use it
+ * while initializing the gather merge slots.
+ */
+ tupDesc = ExecGetResultType(outerPlanState(gm_state));
+ gm_state->tupDesc = tupDesc;
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&gm_state->ps);
+ ExecConditionalAssignProjectionInfo(&gm_state->ps, tupDesc, OUTER_VAR);
+
+ /*
+ * Without projections result slot type is not trivially known, see
+ * comment above.
+ */
+ if (gm_state->ps.ps_ProjInfo == NULL)
+ {
+ gm_state->ps.resultopsset = true;
+ gm_state->ps.resultopsfixed = false;
+ }
+
+ /*
+ * initialize sort-key information
+ */
+ if (node->numCols)
+ {
+ int i;
+
+ gm_state->gm_nkeys = node->numCols;
+ gm_state->gm_sortkeys =
+ palloc0(sizeof(SortSupportData) * node->numCols);
+
+ for (i = 0; i < node->numCols; i++)
+ {
+ SortSupport sortKey = gm_state->gm_sortkeys + i;
+
+ sortKey->ssup_cxt = CurrentMemoryContext;
+ sortKey->ssup_collation = node->collations[i];
+ sortKey->ssup_nulls_first = node->nullsFirst[i];
+ sortKey->ssup_attno = node->sortColIdx[i];
+
+ /*
+ * We don't perform abbreviated key conversion here, for the same
+ * reasons that it isn't used in MergeAppend
+ */
+ sortKey->abbreviate = false;
+
+ PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
+ }
+ }
+
+ /* Now allocate the workspace for gather merge */
+ gather_merge_setup(gm_state);
+
+ return gm_state;
+}
+
+/* ----------------------------------------------------------------
+ * ExecGatherMerge(node)
+ *
+ * Scans the relation via multiple workers and returns
+ * the next qualifying tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecGatherMerge(PlanState *pstate)
+{
+ GatherMergeState *node = castNode(GatherMergeState, pstate);
+ TupleTableSlot *slot;
+ ExprContext *econtext;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * As with Gather, we don't launch workers until this node is actually
+ * executed.
+ */
+ if (!node->initialized)
+ {
+ EState *estate = node->ps.state;
+ GatherMerge *gm = castNode(GatherMerge, node->ps.plan);
+
+ /*
+ * Sometimes we might have to run without parallelism; but if parallel
+ * mode is active then we can try to fire up some workers.
+ */
+ if (gm->num_workers > 0 && estate->es_use_parallel_mode)
+ {
+ ParallelContext *pcxt;
+
+ /* Initialize, or re-initialize, shared state needed by workers. */
+ if (!node->pei)
+ node->pei = ExecInitParallelPlan(node->ps.lefttree,
+ estate,
+ gm->initParam,
+ gm->num_workers,
+ node->tuples_needed);
+ else
+ ExecParallelReinitialize(node->ps.lefttree,
+ node->pei,
+ gm->initParam);
+
+ /* Try to launch workers. */
+ pcxt = node->pei->pcxt;
+ LaunchParallelWorkers(pcxt);
+ /* We save # workers launched for the benefit of EXPLAIN */
+ node->nworkers_launched = pcxt->nworkers_launched;
+
+ /* Set up tuple queue readers to read the results. */
+ if (pcxt->nworkers_launched > 0)
+ {
+ ExecParallelCreateReaders(node->pei);
+ /* Make a working array showing the active readers */
+ node->nreaders = pcxt->nworkers_launched;
+ node->reader = (TupleQueueReader **)
+ palloc(node->nreaders * sizeof(TupleQueueReader *));
+ memcpy(node->reader, node->pei->reader,
+ node->nreaders * sizeof(TupleQueueReader *));
+ }
+ else
+ {
+ /* No workers? Then never mind. */
+ node->nreaders = 0;
+ node->reader = NULL;
+ }
+ }
+
+ /* Allow the leader to participate if that's enabled, or if we have no workers */
+ if (parallel_leader_participation || node->nreaders == 0)
+ node->need_to_scan_locally = true;
+ node->initialized = true;
+ }
+
+ /*
+ * Reset per-tuple memory context to free any expression evaluation
+ * storage allocated in the previous tuple cycle.
+ */
+ econtext = node->ps.ps_ExprContext;
+ ResetExprContext(econtext);
+
+ /*
+ * Get next tuple, either from one of our workers, or by running the plan
+ * ourselves.
+ */
+ slot = gather_merge_getnext(node);
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* If no projection is required, we're done. */
+ if (node->ps.ps_ProjInfo == NULL)
+ return slot;
+
+ /*
+ * Form the result tuple using ExecProject(), and return it.
+ */
+ econtext->ecxt_outertuple = slot;
+ return ExecProject(node->ps.ps_ProjInfo);
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndGatherMerge
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndGatherMerge(GatherMergeState *node)
+{
+ ExecEndNode(outerPlanState(node)); /* let children clean up first */
+ ExecShutdownGatherMerge(node);
+ ExecFreeExprContext(&node->ps);
+ if (node->ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecShutdownGatherMerge
+ *
+ * Destroy the setup for parallel workers including parallel context.
+ * ----------------------------------------------------------------
+ */
+void
+ExecShutdownGatherMerge(GatherMergeState *node)
+{
+ ExecShutdownGatherMergeWorkers(node);
+
+ /* Now destroy the parallel context. */
+ if (node->pei != NULL)
+ {
+ ExecParallelCleanup(node->pei);
+ node->pei = NULL;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecShutdownGatherMergeWorkers
+ *
+ * Stop all the parallel workers.
+ * ----------------------------------------------------------------
+ */
+static void
+ExecShutdownGatherMergeWorkers(GatherMergeState *node)
+{
+ if (node->pei != NULL)
+ ExecParallelFinish(node->pei);
+
+ /* Flush local copy of reader array */
+ if (node->reader)
+ pfree(node->reader);
+ node->reader = NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanGatherMerge
+ *
+ * Prepare to re-scan the result of a GatherMerge.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanGatherMerge(GatherMergeState *node)
+{
+ GatherMerge *gm = (GatherMerge *) node->ps.plan;
+ PlanState *outerPlan = outerPlanState(node);
+
+ /* Make sure any existing workers are gracefully shut down */
+ ExecShutdownGatherMergeWorkers(node);
+
+ /* Free any unused tuples, so we don't leak memory across rescans */
+ gather_merge_clear_tuples(node);
+
+ /* Mark node so that shared state will be rebuilt at next call */
+ node->initialized = false;
+ node->gm_initialized = false;
+
+ /*
+ * Set child node's chgParam to tell it that the next scan might deliver a
+ * different set of rows within the leader process. (The overall rowset
+ * shouldn't change, but the leader process's subset might; hence nodes
+ * between here and the parallel table scan node mustn't optimize on the
+ * assumption of an unchanging rowset.)
+ */
+ if (gm->rescan_param >= 0)
+ outerPlan->chgParam = bms_add_member(outerPlan->chgParam,
+ gm->rescan_param);
+
+ /*
+ * If chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode. Note: because this does nothing if we have a
+ * rescan_param, it's currently guaranteed that parallel-aware child nodes
+ * will not see a ReScan call until after they get a ReInitializeDSM call.
+ * That ordering might not be something to rely on, though. A good rule
+ * of thumb is that ReInitializeDSM should reset only shared state, ReScan
+ * should reset only local state, and anything that depends on both of
+ * those steps being finished must wait until the first ExecProcNode call.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
+
+/*
+ * Set up the data structures that we'll need for Gather Merge.
+ *
+ * We allocate these once on the basis of gm->num_workers, which is an
+ * upper bound for the number of workers we'll actually have. During
+ * a rescan, we reset the structures to empty. This approach makes it
+ * easy to avoid leaking memory across rescans.
+ *
+ * In the gm_slots[] array, index 0 is for the leader, and indexes 1 to n
+ * are for workers. The values placed into gm_heap correspond to indexes
+ * in gm_slots[]. The gm_tuple_buffers[] array, however, is indexed from
+ * 0 to n-1; it has no entry for the leader.
+ */
+static void
+gather_merge_setup(GatherMergeState *gm_state)
+{
+ GatherMerge *gm = castNode(GatherMerge, gm_state->ps.plan);
+ int nreaders = gm->num_workers;
+ int i;
+
+ /*
+ * Allocate gm_slots for the number of workers + one more slot for leader.
+ * Slot 0 is always for the leader. Leader always calls ExecProcNode() to
+ * read the tuple, and then stores it directly into its gm_slots entry.
+ * For other slots, code below will call ExecInitExtraTupleSlot() to
+ * create a slot for the worker's results. Note that during any single
+ * scan, we might have fewer than num_workers available workers, in which
+ * case the extra array entries go unused.
+ */
+ gm_state->gm_slots = (TupleTableSlot **)
+ palloc0((nreaders + 1) * sizeof(TupleTableSlot *));
+
+ /* Allocate the tuple slot and tuple array for each worker */
+ gm_state->gm_tuple_buffers = (GMReaderTupleBuffer *)
+ palloc0(nreaders * sizeof(GMReaderTupleBuffer));
+
+ for (i = 0; i < nreaders; i++)
+ {
+ /* Allocate the tuple array with length MAX_TUPLE_STORE */
+ gm_state->gm_tuple_buffers[i].tuple =
+ (MinimalTuple *) palloc0(sizeof(MinimalTuple) * MAX_TUPLE_STORE);
+
+ /* Initialize tuple slot for worker */
+ gm_state->gm_slots[i + 1] =
+ ExecInitExtraTupleSlot(gm_state->ps.state, gm_state->tupDesc,
+ &TTSOpsMinimalTuple);
+ }
+
+ /* Allocate the resources for the merge */
+ gm_state->gm_heap = binaryheap_allocate(nreaders + 1,
+ heap_compare_slots,
+ gm_state);
+}
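+
+/*
+ * Editorial illustration, not part of the upstream source: with
+ * gm->num_workers = 2, gather_merge_setup leaves
+ *
+ * gm_slots[0] leader slot, filled directly by ExecProcNode
+ * gm_slots[1..2] per-worker slots using TTSOpsMinimalTuple
+ * gm_tuple_buffers[0..1] pending-tuple arrays for workers 1 and 2
+ *
+ * and gm_heap sized to hold up to three entries, one per participant.
+ */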
+
+/*
+ * Initialize the Gather Merge.
+ *
+ * Reset data structures to ensure they're empty. Then pull at least one
+ * tuple from leader + each worker (or set its "done" indicator), and set up
+ * the heap.
+ */
+static void
+gather_merge_init(GatherMergeState *gm_state)
+{
+ int nreaders = gm_state->nreaders;
+ bool nowait = true;
+ int i;
+
+ /* Assert that gather_merge_setup made enough space */
+ Assert(nreaders <= castNode(GatherMerge, gm_state->ps.plan)->num_workers);
+
+ /* Reset leader's tuple slot to empty */
+ gm_state->gm_slots[0] = NULL;
+
+ /* Reset the tuple slot and tuple array for each worker */
+ for (i = 0; i < nreaders; i++)
+ {
+ /* Reset tuple array to empty */
+ gm_state->gm_tuple_buffers[i].nTuples = 0;
+ gm_state->gm_tuple_buffers[i].readCounter = 0;
+ /* Reset done flag to not-done */
+ gm_state->gm_tuple_buffers[i].done = false;
+ /* Ensure output slot is empty */
+ ExecClearTuple(gm_state->gm_slots[i + 1]);
+ }
+
+ /* Reset binary heap to empty */
+ binaryheap_reset(gm_state->gm_heap);
+
+ /*
+ * First, try to read a tuple from each worker (including leader) in
+ * nowait mode. After this, if not all workers were able to produce a
+ * tuple (or a "done" indication), then re-read from remaining workers,
+ * this time using wait mode. Add all live readers (those producing at
+ * least one tuple) to the heap.
+ */
+reread:
+ for (i = 0; i <= nreaders; i++)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /* skip this source if already known done */
+ if ((i == 0) ? gm_state->need_to_scan_locally :
+ !gm_state->gm_tuple_buffers[i - 1].done)
+ {
+ if (TupIsNull(gm_state->gm_slots[i]))
+ {
+ /* Don't have a tuple yet, try to get one */
+ if (gather_merge_readnext(gm_state, i, nowait))
+ binaryheap_add_unordered(gm_state->gm_heap,
+ Int32GetDatum(i));
+ }
+ else
+ {
+ /*
+ * We already got at least one tuple from this worker, but
+ * might as well see if it has any more ready by now.
+ */
+ load_tuple_array(gm_state, i);
+ }
+ }
+ }
+
+ /* need not recheck leader, since nowait doesn't matter for it */
+ for (i = 1; i <= nreaders; i++)
+ {
+ if (!gm_state->gm_tuple_buffers[i - 1].done &&
+ TupIsNull(gm_state->gm_slots[i]))
+ {
+ nowait = false;
+ goto reread;
+ }
+ }
+
+ /* Now heapify the heap. */
+ binaryheap_build(gm_state->gm_heap);
+
+ gm_state->gm_initialized = true;
+}
+
+/*
+ * Clear out the tuple table slot, and any unused pending tuples,
+ * for each gather merge input.
+ */
+static void
+gather_merge_clear_tuples(GatherMergeState *gm_state)
+{
+ int i;
+
+ for (i = 0; i < gm_state->nreaders; i++)
+ {
+ GMReaderTupleBuffer *tuple_buffer = &gm_state->gm_tuple_buffers[i];
+
+ while (tuple_buffer->readCounter < tuple_buffer->nTuples)
+ pfree(tuple_buffer->tuple[tuple_buffer->readCounter++]);
+
+ ExecClearTuple(gm_state->gm_slots[i + 1]);
+ }
+}
+
+/*
+ * Read the next tuple for gather merge.
+ *
+ * Fetch the sorted tuple out of the heap.
+ */
+static TupleTableSlot *
+gather_merge_getnext(GatherMergeState *gm_state)
+{
+ int i;
+
+ if (!gm_state->gm_initialized)
+ {
+ /*
+ * First time through: pull the first tuple from each participant, and
+ * set up the heap.
+ */
+ gather_merge_init(gm_state);
+ }
+ else
+ {
+ /*
+ * Otherwise, pull the next tuple from whichever participant we
+ * returned from last time, and reinsert that participant's index into
+ * the heap, because it might now compare differently against the
+ * other elements of the heap.
+ */
+ i = DatumGetInt32(binaryheap_first(gm_state->gm_heap));
+
+ if (gather_merge_readnext(gm_state, i, false))
+ binaryheap_replace_first(gm_state->gm_heap, Int32GetDatum(i));
+ else
+ {
+ /* reader exhausted, remove it from heap */
+ (void) binaryheap_remove_first(gm_state->gm_heap);
+ }
+ }
+
+ if (binaryheap_empty(gm_state->gm_heap))
+ {
+ /* All the queues are exhausted, and so is the heap */
+ gather_merge_clear_tuples(gm_state);
+ return NULL;
+ }
+ else
+ {
+ /* Return next tuple from whichever participant has the leading one */
+ i = DatumGetInt32(binaryheap_first(gm_state->gm_heap));
+ return gm_state->gm_slots[i];
+ }
+}
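+
+/*
+ * Editorial illustration, not part of the upstream source: if two workers
+ * produce the sorted streams (1, 4, 7) and (2, 5) while the leader
+ * produces (3, 6), gather_merge_init seeds the heap with 1, 2 and 3;
+ * each subsequent gather_merge_getnext call pops the smallest entry and
+ * refills from the same participant, yielding 1, 2, 3, 4, 5, 6, 7.
+ */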
+
+/*
+ * Read tuple(s) for given reader in nowait mode, and load into its tuple
+ * array, until we have MAX_TUPLE_STORE of them or would have to block.
+ */
+static void
+load_tuple_array(GatherMergeState *gm_state, int reader)
+{
+ GMReaderTupleBuffer *tuple_buffer;
+ int i;
+
+ /* Don't do anything if this is the leader. */
+ if (reader == 0)
+ return;
+
+ tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1];
+
+ /* If there's nothing in the array, reset the counters to zero. */
+ if (tuple_buffer->nTuples == tuple_buffer->readCounter)
+ tuple_buffer->nTuples = tuple_buffer->readCounter = 0;
+
+ /* Try to fill additional slots in the array. */
+ for (i = tuple_buffer->nTuples; i < MAX_TUPLE_STORE; i++)
+ {
+ MinimalTuple tuple;
+
+ tuple = gm_readnext_tuple(gm_state,
+ reader,
+ true,
+ &tuple_buffer->done);
+ if (!tuple)
+ break;
+ tuple_buffer->tuple[i] = tuple;
+ tuple_buffer->nTuples++;
+ }
+}
+
+/*
+ * Store the next tuple for a given reader into the appropriate slot.
+ *
+ * Returns true if successful, false if not (either reader is exhausted,
+ * or we didn't want to wait for a tuple). Sets done flag if reader
+ * is found to be exhausted.
+ */
+static bool
+gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait)
+{
+ GMReaderTupleBuffer *tuple_buffer;
+ MinimalTuple tup;
+
+ /*
+ * If we're being asked to generate a tuple from the leader, then we just
+ * call ExecProcNode as normal to produce one.
+ */
+ if (reader == 0)
+ {
+ if (gm_state->need_to_scan_locally)
+ {
+ PlanState *outerPlan = outerPlanState(gm_state);
+ TupleTableSlot *outerTupleSlot;
+ EState *estate = gm_state->ps.state;
+
+ /* Install our DSA area while executing the plan. */
+ estate->es_query_dsa = gm_state->pei ? gm_state->pei->area : NULL;
+ outerTupleSlot = ExecProcNode(outerPlan);
+ estate->es_query_dsa = NULL;
+
+ if (!TupIsNull(outerTupleSlot))
+ {
+ gm_state->gm_slots[0] = outerTupleSlot;
+ return true;
+ }
+ /* need_to_scan_locally serves as "done" flag for leader */
+ gm_state->need_to_scan_locally = false;
+ }
+ return false;
+ }
+
+ /* Otherwise, check the state of the relevant tuple buffer. */
+ tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1];
+
+ if (tuple_buffer->nTuples > tuple_buffer->readCounter)
+ {
+ /* Return any tuple previously read that is still buffered. */
+ tup = tuple_buffer->tuple[tuple_buffer->readCounter++];
+ }
+ else if (tuple_buffer->done)
+ {
+ /* Reader is known to be exhausted. */
+ return false;
+ }
+ else
+ {
+ /* Read and buffer next tuple. */
+ tup = gm_readnext_tuple(gm_state,
+ reader,
+ nowait,
+ &tuple_buffer->done);
+ if (!tup)
+ return false;
+
+ /*
+ * Attempt to read more tuples in nowait mode and store them in the
+ * pending-tuple array for the reader.
+ */
+ load_tuple_array(gm_state, reader);
+ }
+
+ Assert(tup);
+
+ /* Build the TupleTableSlot for the given tuple */
+ ExecStoreMinimalTuple(tup, /* tuple to store */
+ gm_state->gm_slots[reader], /* slot in which to
+ * store the tuple */
+ true); /* pfree tuple when done with it */
+
+ return true;
+}
+
+/*
+ * Attempt to read a tuple from given worker.
+ */
+static MinimalTuple
+gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait,
+ bool *done)
+{
+ TupleQueueReader *reader;
+ MinimalTuple tup;
+
+ /* Check for async events, particularly messages from workers. */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Attempt to read a tuple.
+ *
+ * Note that TupleQueueReaderNext will just return NULL for a worker which
+ * fails to initialize. We'll treat that worker as having produced no
+ * tuples; WaitForParallelWorkersToFinish will error out when we get
+ * there.
+ */
+ reader = gm_state->reader[nreader - 1];
+ tup = TupleQueueReaderNext(reader, nowait, done);
+
+ /*
+ * Since we'll be buffering these across multiple calls, we need to make a
+ * copy.
+ */
+ return tup ? heap_copy_minimal_tuple(tup) : NULL;
+}
+
+/*
+ * We have one slot for each item in the heap array. We use SlotNumber
+ * to store slot indexes. This doesn't actually provide any formal
+ * type-safety, but it makes the code more self-documenting.
+ */
+typedef int32 SlotNumber;
+
+/*
+ * Compare the tuples in the two given slots.
+ */
+static int32
+heap_compare_slots(Datum a, Datum b, void *arg)
+{
+ GatherMergeState *node = (GatherMergeState *) arg;
+ SlotNumber slot1 = DatumGetInt32(a);
+ SlotNumber slot2 = DatumGetInt32(b);
+
+ TupleTableSlot *s1 = node->gm_slots[slot1];
+ TupleTableSlot *s2 = node->gm_slots[slot2];
+ int nkey;
+
+ Assert(!TupIsNull(s1));
+ Assert(!TupIsNull(s2));
+
+ for (nkey = 0; nkey < node->gm_nkeys; nkey++)
+ {
+ SortSupport sortKey = node->gm_sortkeys + nkey;
+ AttrNumber attno = sortKey->ssup_attno;
+ Datum datum1,
+ datum2;
+ bool isNull1,
+ isNull2;
+ int compare;
+
+ datum1 = slot_getattr(s1, attno, &isNull1);
+ datum2 = slot_getattr(s2, attno, &isNull2);
+
+ compare = ApplySortComparator(datum1, isNull1,
+ datum2, isNull2,
+ sortKey);
+ if (compare != 0)
+ {
+ INVERT_COMPARE_RESULT(compare);
+ return compare;
+ }
+ }
+ return 0;
+}
diff --git a/src/backend/executor/nodeGroup.c b/src/backend/executor/nodeGroup.c
new file mode 100644
index 0000000..1721b2a
--- /dev/null
+++ b/src/backend/executor/nodeGroup.c
@@ -0,0 +1,255 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeGroup.c
+ * Routines to handle group nodes (used for queries with GROUP BY clause).
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * DESCRIPTION
+ * The Group node is designed for handling queries with a GROUP BY clause.
+ * Its outer plan must deliver tuples that are sorted in the order
+ * specified by the grouping columns (ie. tuples from the same group are
+ * consecutive). That way, we just have to compare adjacent tuples to
+ * locate group boundaries.
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeGroup.c
+ *
+ *-------------------------------------------------------------------------
+ */
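+
+/*
+ * Editorial illustration, not part of the upstream header: given input
+ * sorted on the grouping column, e.g. (1,a) (1,b) (2,c) (3,d) (3,e),
+ * ExecGroup emits the first tuple of each group -- (1,a), (2,c), (3,d) --
+ * subject to the HAVING qual, and skips the remaining group members.
+ */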
+
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeGroup.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+/*
+ * ExecGroup -
+ *
+ * Return one tuple for each group of matching input tuples.
+ */
+static TupleTableSlot *
+ExecGroup(PlanState *pstate)
+{
+ GroupState *node = castNode(GroupState, pstate);
+ ExprContext *econtext;
+ TupleTableSlot *firsttupleslot;
+ TupleTableSlot *outerslot;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get state info from node
+ */
+ if (node->grp_done)
+ return NULL;
+ econtext = node->ss.ps.ps_ExprContext;
+
+ /*
+ * The ScanTupleSlot holds the (copied) first tuple of each group.
+ */
+ firsttupleslot = node->ss.ss_ScanTupleSlot;
+
+ /*
+ * We need not call ResetExprContext here because ExecQualAndReset() will
+ * reset the per-tuple memory context once per input tuple.
+ */
+
+ /*
+ * If first time through, acquire first input tuple and determine whether
+ * to return it or not.
+ */
+ if (TupIsNull(firsttupleslot))
+ {
+ outerslot = ExecProcNode(outerPlanState(node));
+ if (TupIsNull(outerslot))
+ {
+ /* empty input, so return nothing */
+ node->grp_done = true;
+ return NULL;
+ }
+ /* Copy tuple into firsttupleslot */
+ ExecCopySlot(firsttupleslot, outerslot);
+
+ /*
+ * Set it up as input for qual test and projection. The expressions
+ * will access the input tuple as varno OUTER.
+ */
+ econtext->ecxt_outertuple = firsttupleslot;
+
+ /*
+ * Check the qual (HAVING clause); if the group does not match, ignore
+ * it and fall into scan loop.
+ */
+ if (ExecQual(node->ss.ps.qual, econtext))
+ {
+ /*
+ * Form and return a projection tuple using the first input tuple.
+ */
+ return ExecProject(node->ss.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered1(node, 1);
+ }
+
+ /*
+ * This loop iterates once per input tuple group. At the head of the
+ * loop, we have finished processing the first tuple of the group and now
+ * need to scan over all the other group members.
+ */
+ for (;;)
+ {
+ /*
+ * Scan over all remaining tuples that belong to this group
+ */
+ for (;;)
+ {
+ outerslot = ExecProcNode(outerPlanState(node));
+ if (TupIsNull(outerslot))
+ {
+ /* no more groups, so we're done */
+ node->grp_done = true;
+ return NULL;
+ }
+
+ /*
+ * Compare with first tuple and see if this tuple is of the same
+ * group. If so, ignore it and keep scanning.
+ */
+ econtext->ecxt_innertuple = firsttupleslot;
+ econtext->ecxt_outertuple = outerslot;
+ if (!ExecQualAndReset(node->eqfunction, econtext))
+ break;
+ }
+
+ /*
+ * We have the first tuple of the next input group. See if we want to
+ * return it.
+ */
+ /* Copy tuple, set up as input for qual test and projection */
+ ExecCopySlot(firsttupleslot, outerslot);
+ econtext->ecxt_outertuple = firsttupleslot;
+
+ /*
+ * Check the qual (HAVING clause); if the group does not match, ignore
+ * it and loop back to scan the rest of the group.
+ */
+ if (ExecQual(node->ss.ps.qual, econtext))
+ {
+ /*
+ * Form and return a projection tuple using the first input tuple.
+ */
+ return ExecProject(node->ss.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered1(node, 1);
+ }
+}
+
+/* -----------------
+ * ExecInitGroup
+ *
+ * Creates the run-time information for the group node produced by the
+ * planner and initializes its outer subtree
+ * -----------------
+ */
+GroupState *
+ExecInitGroup(Group *node, EState *estate, int eflags)
+{
+ GroupState *grpstate;
+ const TupleTableSlotOps *tts_ops;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ grpstate = makeNode(GroupState);
+ grpstate->ss.ps.plan = (Plan *) node;
+ grpstate->ss.ps.state = estate;
+ grpstate->ss.ps.ExecProcNode = ExecGroup;
+ grpstate->grp_done = false;
+
+ /*
+ * create expression context
+ */
+ ExecAssignExprContext(estate, &grpstate->ss.ps);
+
+ /*
+ * initialize child nodes
+ */
+ outerPlanState(grpstate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * Initialize scan slot and type.
+ */
+ tts_ops = ExecGetResultSlotOps(outerPlanState(&grpstate->ss), NULL);
+ ExecCreateScanSlotFromOuterPlan(estate, &grpstate->ss, tts_ops);
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTupleSlotTL(&grpstate->ss.ps, &TTSOpsVirtual);
+ ExecAssignProjectionInfo(&grpstate->ss.ps, NULL);
+
+ /*
+ * initialize child expressions
+ */
+ grpstate->ss.ps.qual =
+ ExecInitQual(node->plan.qual, (PlanState *) grpstate);
+
+ /*
+ * Precompute fmgr lookup data for inner loop
+ */
+ grpstate->eqfunction =
+ execTuplesMatchPrepare(ExecGetResultType(outerPlanState(grpstate)),
+ node->numCols,
+ node->grpColIdx,
+ node->grpOperators,
+ node->grpCollations,
+ &grpstate->ss.ps);
+
+ return grpstate;
+}
+
+/* ------------------------
+ * ExecEndGroup(node)
+ * ------------------------
+ */
+void
+ExecEndGroup(GroupState *node)
+{
+ PlanState *outerPlan;
+
+ ExecFreeExprContext(&node->ss.ps);
+
+ /* clean up tuple table */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ outerPlan = outerPlanState(node);
+ ExecEndNode(outerPlan);
+}
+
+void
+ExecReScanGroup(GroupState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ node->grp_done = false;
+ /* must clear first tuple */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
new file mode 100644
index 0000000..15d8bbe
--- /dev/null
+++ b/src/backend/executor/nodeHash.c
@@ -0,0 +1,3434 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeHash.c
+ * Routines to hash relations for hashjoin
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeHash.c
+ *
+ * See note on parallelism in nodeHashjoin.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * MultiExecHash - generate an in-memory hash table of the relation
+ * ExecInitHash - initialize node and subnodes
+ * ExecEndHash - shutdown node and subnodes
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+#include <limits.h>
+
+#include "access/htup_details.h"
+#include "access/parallel.h"
+#include "catalog/pg_statistic.h"
+#include "commands/tablespace.h"
+#include "executor/execdebug.h"
+#include "executor/hashjoin.h"
+#include "executor/nodeHash.h"
+#include "executor/nodeHashjoin.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "port/pg_bitutils.h"
+#include "utils/dynahash.h"
+#include "utils/guc.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/syscache.h"
+
+static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
+static void ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable);
+static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
+ int mcvsToUse);
+static void ExecHashSkewTableInsert(HashJoinTable hashtable,
+ TupleTableSlot *slot,
+ uint32 hashvalue,
+ int bucketNumber);
+static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
+
+static void *dense_alloc(HashJoinTable hashtable, Size size);
+static HashJoinTuple ExecParallelHashTupleAlloc(HashJoinTable hashtable,
+ size_t size,
+ dsa_pointer *shared);
+static void MultiExecPrivateHash(HashState *node);
+static void MultiExecParallelHash(HashState *node);
+static inline HashJoinTuple ExecParallelHashFirstTuple(HashJoinTable table,
+ int bucketno);
+static inline HashJoinTuple ExecParallelHashNextTuple(HashJoinTable table,
+ HashJoinTuple tuple);
+static inline void ExecParallelHashPushTuple(dsa_pointer_atomic *head,
+ HashJoinTuple tuple,
+ dsa_pointer tuple_shared);
+static void ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch);
+static void ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable);
+static void ExecParallelHashRepartitionFirst(HashJoinTable hashtable);
+static void ExecParallelHashRepartitionRest(HashJoinTable hashtable);
+static HashMemoryChunk ExecParallelHashPopChunkQueue(HashJoinTable table,
+ dsa_pointer *shared);
+static bool ExecParallelHashTuplePrealloc(HashJoinTable hashtable,
+ int batchno,
+ size_t size);
+static void ExecParallelHashMergeCounters(HashJoinTable hashtable);
+static void ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable);
+
+
+/* ----------------------------------------------------------------
+ * ExecHash
+ *
+ * stub for pro forma compliance
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecHash(PlanState *pstate)
+{
+ elog(ERROR, "Hash node does not support ExecProcNode call convention");
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * MultiExecHash
+ *
+ * build hash table for hashjoin, doing partitioning if more
+ * than one batch is required.
+ * ----------------------------------------------------------------
+ */
+Node *
+MultiExecHash(HashState *node)
+{
+ /* must provide our own instrumentation support */
+ if (node->ps.instrument)
+ InstrStartNode(node->ps.instrument);
+
+ if (node->parallel_state != NULL)
+ MultiExecParallelHash(node);
+ else
+ MultiExecPrivateHash(node);
+
+ /* must provide our own instrumentation support */
+ if (node->ps.instrument)
+ InstrStopNode(node->ps.instrument, node->hashtable->partialTuples);
+
+ /*
+ * We do not return the hash table directly because it's not a subtype of
+ * Node, and so would violate the MultiExecProcNode API. Instead, our
+ * parent Hashjoin node is expected to know how to fish it out of our node
+ * state. Ugly but not really worth cleaning up, since Hashjoin knows
+ * quite a bit more about Hash besides that.
+ */
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * MultiExecPrivateHash
+ *
+ * parallel-oblivious version, building a backend-private
+ * hash table and (if necessary) batch files.
+ * ----------------------------------------------------------------
+ */
+static void
+MultiExecPrivateHash(HashState *node)
+{
+ PlanState *outerNode;
+ List *hashkeys;
+ HashJoinTable hashtable;
+ TupleTableSlot *slot;
+ ExprContext *econtext;
+ uint32 hashvalue;
+
+ /*
+ * get state info from node
+ */
+ outerNode = outerPlanState(node);
+ hashtable = node->hashtable;
+
+ /*
+ * set expression context
+ */
+ hashkeys = node->hashkeys;
+ econtext = node->ps.ps_ExprContext;
+
+ /*
+ * Get all tuples from the node below the Hash node and insert into the
+ * hash table (or temp files).
+ */
+ for (;;)
+ {
+ slot = ExecProcNode(outerNode);
+ if (TupIsNull(slot))
+ break;
+ /* We have to compute the hash value */
+ econtext->ecxt_outertuple = slot;
+ if (ExecHashGetHashValue(hashtable, econtext, hashkeys,
+ false, hashtable->keepNulls,
+ &hashvalue))
+ {
+ int bucketNumber;
+
+ bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue);
+ if (bucketNumber != INVALID_SKEW_BUCKET_NO)
+ {
+ /* It's a skew tuple, so put it into that hash table */
+ ExecHashSkewTableInsert(hashtable, slot, hashvalue,
+ bucketNumber);
+ hashtable->skewTuples += 1;
+ }
+ else
+ {
+ /* Not subject to skew optimization, so insert normally */
+ ExecHashTableInsert(hashtable, slot, hashvalue);
+ }
+ hashtable->totalTuples += 1;
+ }
+ }
+
+ /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
+ if (hashtable->nbuckets != hashtable->nbuckets_optimal)
+ ExecHashIncreaseNumBuckets(hashtable);
+
+ /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+ hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+
+ hashtable->partialTuples = hashtable->totalTuples;
+}
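+
+/*
+ * Editorial note, not part of the upstream source: the skew path above
+ * routes inner tuples whose hash values match the outer relation's most
+ * common values into dedicated in-memory skew buckets, so the frequent
+ * outer tuples that join with them can be processed in the first batch
+ * rather than being spilled to batch files.
+ */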
+
+/* ----------------------------------------------------------------
+ * MultiExecParallelHash
+ *
+ * parallel-aware version, building a shared hash table and
+ * (if necessary) batch files using the combined effort of
+ * a set of co-operating backends.
+ * ----------------------------------------------------------------
+ */
+static void
+MultiExecParallelHash(HashState *node)
+{
+ ParallelHashJoinState *pstate;
+ PlanState *outerNode;
+ List *hashkeys;
+ HashJoinTable hashtable;
+ TupleTableSlot *slot;
+ ExprContext *econtext;
+ uint32 hashvalue;
+ Barrier *build_barrier;
+ int i;
+
+ /*
+ * get state info from node
+ */
+ outerNode = outerPlanState(node);
+ hashtable = node->hashtable;
+
+ /*
+ * set expression context
+ */
+ hashkeys = node->hashkeys;
+ econtext = node->ps.ps_ExprContext;
+
+ /*
+ * Synchronize the parallel hash table build. At this stage we know that
+ * the shared hash table has been or is being set up by
+ * ExecHashTableCreate(), but we don't know if our peers have returned
+ * from there or are here in MultiExecParallelHash(), and if so how far
+ * through they are. To find out, we check the build_barrier phase and
+ * then jump to the right step in the build algorithm.
+ */
+ pstate = hashtable->parallel_state;
+ build_barrier = &pstate->build_barrier;
+ Assert(BarrierPhase(build_barrier) >= PHJ_BUILD_ALLOCATING);
+ switch (BarrierPhase(build_barrier))
+ {
+ case PHJ_BUILD_ALLOCATING:
+
+ /*
+ * Either I just allocated the initial hash table in
+ * ExecHashTableCreate(), or someone else is doing that. Either
+ * way, wait for everyone to arrive here so we can proceed.
+ */
+ BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ALLOCATE);
+ /* Fall through. */
+
+ case PHJ_BUILD_HASHING_INNER:
+
+ /*
+ * It's time to begin hashing, or if we just arrived here then
+ * hashing is already underway, so join in that effort. While
+ * hashing we have to be prepared to help increase the number of
+ * batches or buckets at any time, and if we arrived here when
+ * that was already underway we'll have to help complete that work
+ * immediately so that it's safe to access batches and buckets
+ * below.
+ */
+ if (PHJ_GROW_BATCHES_PHASE(BarrierAttach(&pstate->grow_batches_barrier)) !=
+ PHJ_GROW_BATCHES_ELECTING)
+ ExecParallelHashIncreaseNumBatches(hashtable);
+ if (PHJ_GROW_BUCKETS_PHASE(BarrierAttach(&pstate->grow_buckets_barrier)) !=
+ PHJ_GROW_BUCKETS_ELECTING)
+ ExecParallelHashIncreaseNumBuckets(hashtable);
+ ExecParallelHashEnsureBatchAccessors(hashtable);
+ ExecParallelHashTableSetCurrentBatch(hashtable, 0);
+ for (;;)
+ {
+ slot = ExecProcNode(outerNode);
+ if (TupIsNull(slot))
+ break;
+ econtext->ecxt_outertuple = slot;
+ if (ExecHashGetHashValue(hashtable, econtext, hashkeys,
+ false, hashtable->keepNulls,
+ &hashvalue))
+ ExecParallelHashTableInsert(hashtable, slot, hashvalue);
+ hashtable->partialTuples++;
+ }
+
+ /*
+ * Make sure that any tuples we wrote to disk are visible to
+ * others before anyone tries to load them.
+ */
+ for (i = 0; i < hashtable->nbatch; ++i)
+ sts_end_write(hashtable->batches[i].inner_tuples);
+
+ /*
+ * Update shared counters. We need an accurate total tuple count
+ * to control the empty table optimization.
+ */
+ ExecParallelHashMergeCounters(hashtable);
+
+ BarrierDetach(&pstate->grow_buckets_barrier);
+ BarrierDetach(&pstate->grow_batches_barrier);
+
+ /*
+ * Wait for everyone to finish building and flushing files and
+ * counters.
+ */
+ if (BarrierArriveAndWait(build_barrier,
+ WAIT_EVENT_HASH_BUILD_HASH_INNER))
+ {
+ /*
+ * Elect one backend to disable any further growth. Batches
+ * are now fixed. While building them we made sure they'd fit
+ * in our memory budget when we load them back in later (or we
+ * tried to do that and gave up because we detected extreme
+ * skew).
+ */
+ pstate->growth = PHJ_GROWTH_DISABLED;
+ }
+ }
+
+ /*
+ * We're not yet attached to a batch. We all agree on the dimensions and
+ * number of inner tuples (for the empty table optimization).
+ */
+ hashtable->curbatch = -1;
+ hashtable->nbuckets = pstate->nbuckets;
+ hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
+ hashtable->totalTuples = pstate->total_tuples;
+ ExecParallelHashEnsureBatchAccessors(hashtable);
+
+ /*
+ * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
+ * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
+ * there already).
+ */
+ Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
+ BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitHash
+ *
+ * Init routine for Hash node
+ * ----------------------------------------------------------------
+ */
+HashState *
+ExecInitHash(Hash *node, EState *estate, int eflags)
+{
+ HashState *hashstate;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ hashstate = makeNode(HashState);
+ hashstate->ps.plan = (Plan *) node;
+ hashstate->ps.state = estate;
+ hashstate->ps.ExecProcNode = ExecHash;
+ hashstate->hashtable = NULL;
+ hashstate->hashkeys = NIL; /* will be set by parent HashJoin */
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &hashstate->ps);
+
+ /*
+ * initialize child nodes
+ */
+ outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * initialize our result slot and type. No need to build projection
+ * because this node doesn't do projections.
+ */
+ ExecInitResultTupleSlotTL(&hashstate->ps, &TTSOpsMinimalTuple);
+ hashstate->ps.ps_ProjInfo = NULL;
+
+ /*
+ * initialize child expressions
+ */
+ Assert(node->plan.qual == NIL);
+ hashstate->hashkeys =
+ ExecInitExprList(node->hashkeys, (PlanState *) hashstate);
+
+ return hashstate;
+}
+
+/* ---------------------------------------------------------------
+ * ExecEndHash
+ *
+ * clean up routine for Hash node
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndHash(HashState *node)
+{
+ PlanState *outerPlan;
+
+ /*
+ * free exprcontext
+ */
+ ExecFreeExprContext(&node->ps);
+
+ /*
+ * shut down the subplan
+ */
+ outerPlan = outerPlanState(node);
+ ExecEndNode(outerPlan);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecHashTableCreate
+ *
+ * create an empty hashtable data structure for hashjoin.
+ * ----------------------------------------------------------------
+ */
+HashJoinTable
+ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, bool keepNulls)
+{
+ Hash *node;
+ HashJoinTable hashtable;
+ Plan *outerNode;
+ size_t space_allowed;
+ int nbuckets;
+ int nbatch;
+ double rows;
+ int num_skew_mcvs;
+ int log2_nbuckets;
+ int nkeys;
+ int i;
+ ListCell *ho;
+ ListCell *hc;
+ MemoryContext oldcxt;
+
+ /*
+ * Get information about the size of the relation to be hashed (it's the
+ * "outer" subtree of this node, but the inner relation of the hashjoin).
+ * Compute the appropriate size of the hash table.
+ */
+ node = (Hash *) state->ps.plan;
+ outerNode = outerPlan(node);
+
+ /*
+ * If this is a shared hash table with a partial plan, then we can't use
+ * outerNode->plan_rows to estimate its size. We need an estimate of the
+ * total number of rows across all copies of the partial plan.
+ */
+ rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;
+
+ ExecChooseHashTableSize(rows, outerNode->plan_width,
+ OidIsValid(node->skewTable),
+ state->parallel_state != NULL,
+ state->parallel_state != NULL ?
+ state->parallel_state->nparticipants - 1 : 0,
+ &space_allowed,
+ &nbuckets, &nbatch, &num_skew_mcvs);
+
+ /* nbuckets must be a power of 2 */
+ log2_nbuckets = my_log2(nbuckets);
+ Assert(nbuckets == (1 << log2_nbuckets));
+
+ /*
+ * Initialize the hash table control block.
+ *
+ * The hashtable control block is just palloc'd from the executor's
+ * per-query memory context. Everything else should be kept inside the
+ * subsidiary hashCxt or batchCxt.
+ */
+ hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
+ hashtable->nbuckets = nbuckets;
+ hashtable->nbuckets_original = nbuckets;
+ hashtable->nbuckets_optimal = nbuckets;
+ hashtable->log2_nbuckets = log2_nbuckets;
+ hashtable->log2_nbuckets_optimal = log2_nbuckets;
+ hashtable->buckets.unshared = NULL;
+ hashtable->keepNulls = keepNulls;
+ hashtable->skewEnabled = false;
+ hashtable->skewBucket = NULL;
+ hashtable->skewBucketLen = 0;
+ hashtable->nSkewBuckets = 0;
+ hashtable->skewBucketNums = NULL;
+ hashtable->nbatch = nbatch;
+ hashtable->curbatch = 0;
+ hashtable->nbatch_original = nbatch;
+ hashtable->nbatch_outstart = nbatch;
+ hashtable->growEnabled = true;
+ hashtable->totalTuples = 0;
+ hashtable->partialTuples = 0;
+ hashtable->skewTuples = 0;
+ hashtable->innerBatchFile = NULL;
+ hashtable->outerBatchFile = NULL;
+ hashtable->spaceUsed = 0;
+ hashtable->spacePeak = 0;
+ hashtable->spaceAllowed = space_allowed;
+ hashtable->spaceUsedSkew = 0;
+ hashtable->spaceAllowedSkew =
+ hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;
+ hashtable->chunks = NULL;
+ hashtable->current_chunk = NULL;
+ hashtable->parallel_state = state->parallel_state;
+ hashtable->area = state->ps.state->es_query_dsa;
+ hashtable->batches = NULL;
+
+#ifdef HJDEBUG
+ printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n",
+ hashtable, nbatch, nbuckets);
+#endif
+
+ /*
+ * Create temporary memory contexts in which to keep the hashtable working
+ * storage. See notes in executor/hashjoin.h.
+ */
+ hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "HashTableContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
+ "HashBatchContext",
+ ALLOCSET_DEFAULT_SIZES);
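+
+ /*
+ * hashCxt survives until ExecHashTableDestroy() deletes it, while
+ * batchCxt (its child) is reset by ExecHashTableReset() at the start of
+ * each new batch, releasing the prior batch's bucket array and
+ * dense-allocated tuple chunks in one go.
+ */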
+
+ /* Allocate data that will live for the life of the hashjoin */
+
+ oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+
+ /*
+ * Get info about the hash functions to be used for each hash key. Also
+ * remember whether the join operators are strict.
+ */
+ nkeys = list_length(hashOperators);
+ hashtable->outer_hashfunctions =
+ (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
+ hashtable->inner_hashfunctions =
+ (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
+ hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));
+ hashtable->collations = (Oid *) palloc(nkeys * sizeof(Oid));
+ i = 0;
+ forboth(ho, hashOperators, hc, hashCollations)
+ {
+ Oid hashop = lfirst_oid(ho);
+ Oid left_hashfn;
+ Oid right_hashfn;
+
+ if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
+ elog(ERROR, "could not find hash function for hash operator %u",
+ hashop);
+ fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
+ fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
+ hashtable->hashStrict[i] = op_strict(hashop);
+ hashtable->collations[i] = lfirst_oid(hc);
+ i++;
+ }
+
+ if (nbatch > 1 && hashtable->parallel_state == NULL)
+ {
+ /*
+ * allocate and initialize the file arrays in hashCxt (not needed for
+ * parallel case which uses shared tuplestores instead of raw files)
+ */
+ hashtable->innerBatchFile = (BufFile **)
+ palloc0(nbatch * sizeof(BufFile *));
+ hashtable->outerBatchFile = (BufFile **)
+ palloc0(nbatch * sizeof(BufFile *));
+ /* The files will not be opened until needed... */
+ /* ... but make sure we have temp tablespaces established for them */
+ PrepareTempTablespaces();
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+
+ if (hashtable->parallel_state)
+ {
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ Barrier *build_barrier;
+
+ /*
+ * Attach to the build barrier. The corresponding detach operation is
+ * in ExecHashTableDetach. Note that we won't attach to the
+ * batch_barrier for batch 0 yet. We'll attach later and start it out
+ * in PHJ_BATCH_PROBING phase, because batch 0 is allocated up front
+ * and then loaded while hashing (the standard hybrid hash join
+ * algorithm), and we'll coordinate that using build_barrier.
+ */
+ build_barrier = &pstate->build_barrier;
+ BarrierAttach(build_barrier);
+
+ /*
+ * So far we have no idea whether there are any other participants,
+ * and if so, what phase they are working on. The only thing we care
+ * about at this point is whether someone has already created the
+ * SharedHashJoinBatch objects and the hash table for batch 0. One
+ * backend will be elected to do that now if necessary.
+ */
+ if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECTING &&
+ BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECT))
+ {
+ pstate->nbatch = nbatch;
+ pstate->space_allowed = space_allowed;
+ pstate->growth = PHJ_GROWTH_OK;
+
+ /* Set up the shared state for coordinating batches. */
+ ExecParallelHashJoinSetUpBatches(hashtable, nbatch);
+
+ /*
+ * Allocate batch 0's hash table up front so we can load it
+ * directly while hashing.
+ */
+ pstate->nbuckets = nbuckets;
+ ExecParallelHashTableAlloc(hashtable, 0);
+ }
+
+ /*
+ * The next Parallel Hash synchronization point is in
+ * MultiExecParallelHash(), which will progress it all the way to
+ * PHJ_BUILD_DONE. The caller must not return control from this
+ * executor node between now and then.
+ */
+ }
+ else
+ {
+ /*
+ * Prepare context for the first-scan space allocations; allocate the
+ * hashbucket array therein, and set each bucket "empty".
+ */
+ MemoryContextSwitchTo(hashtable->batchCxt);
+
+ hashtable->buckets.unshared = (HashJoinTuple *)
+ palloc0(nbuckets * sizeof(HashJoinTuple));
+
+ /*
+ * Set up for skew optimization, if possible and there's a need for
+ * more than one batch. (In a one-batch join, there's no point in
+ * it.)
+ */
+ if (nbatch > 1)
+ ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ return hashtable;
+}
+
+
+/*
+ * Compute appropriate size for hashtable given the estimated size of the
+ * relation to be hashed (number of rows and average row width).
+ *
+ * This is exported so that the planner's costsize.c can use it.
+ */
+
+/* Target bucket loading (tuples per bucket) */
+#define NTUP_PER_BUCKET 1
+
+void
+ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
+ bool try_combined_hash_mem,
+ int parallel_workers,
+ size_t *space_allowed,
+ int *numbuckets,
+ int *numbatches,
+ int *num_skew_mcvs)
+{
+ int tupsize;
+ double inner_rel_bytes;
+ size_t hash_table_bytes;
+ size_t bucket_bytes;
+ size_t max_pointers;
+ int nbatch = 1;
+ int nbuckets;
+ double dbuckets;
+
+ /* Force a plausible relation size if no info */
+ if (ntuples <= 0.0)
+ ntuples = 1000.0;
+
+ /*
+ * Estimate tupsize based on footprint of tuple in hashtable... note this
+ * does not allow for any palloc overhead. The manipulations of spaceUsed
+ * don't count palloc overhead either.
+ */
+ tupsize = HJTUPLE_OVERHEAD +
+ MAXALIGN(SizeofMinimalTupleHeader) +
+ MAXALIGN(tupwidth);
+ inner_rel_bytes = ntuples * tupsize;
+
+ /*
+ * Compute in-memory hashtable size limit from GUCs.
+ */
+ hash_table_bytes = get_hash_memory_limit();
+
+ /*
+ * Parallel Hash tries to use the combined hash_mem of all workers to
+ * avoid the need to batch. If that won't work, it falls back to hash_mem
+ * per worker and tries to process batches in parallel.
+ */
+ if (try_combined_hash_mem)
+ {
+ /* Careful, this could overflow size_t */
+ double newlimit;
+
+ newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);
+ newlimit = Min(newlimit, (double) SIZE_MAX);
+ hash_table_bytes = (size_t) newlimit;
+ }
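+
+ /*
+ * For example, with a per-worker budget of 4MB and two planned workers
+ * (three participants including the leader), the combined budget tried
+ * first is 12MB; only if that is still too small do we fall back to 4MB
+ * per participant and multiple batches (see the recursive call below).
+ */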
+
+ *space_allowed = hash_table_bytes;
+
+ /*
+ * If skew optimization is possible, estimate the number of skew buckets
+ * that will fit in the memory allowed, and decrement the assumed space
+ * available for the main hash table accordingly.
+ *
+ * We make the optimistic assumption that each skew bucket will contain
+ * one inner-relation tuple. If that turns out to be low, we will recover
+ * at runtime by reducing the number of skew buckets.
+ *
+ * hashtable->skewBucket will have up to 8 times as many HashSkewBucket
+ * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
+ * will round up to the next power of 2 and then multiply by 4 to reduce
+ * collisions.
+ */
+ if (useskew)
+ {
+ size_t bytes_per_mcv;
+ size_t skew_mcvs;
+
+ /*----------
+ * Compute number of MCVs we could hold in hash_table_bytes
+ *
+ * Divisor is:
+ * size of a hash tuple +
+ * worst-case size of skewBucket[] per MCV +
+ * size of skewBucketNums[] entry +
+ * size of skew bucket struct itself
+ *----------
+ */
+ bytes_per_mcv = tupsize +
+ (8 * sizeof(HashSkewBucket *)) +
+ sizeof(int) +
+ SKEW_BUCKET_OVERHEAD;
+ skew_mcvs = hash_table_bytes / bytes_per_mcv;
+
+ /*
+ * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
+ * not to worry about size_t overflow in the multiplication)
+ */
+ skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;
+
+ /* Now clamp to integer range */
+ skew_mcvs = Min(skew_mcvs, INT_MAX);
+
+ *num_skew_mcvs = (int) skew_mcvs;
+
+ /* Reduce hash_table_bytes by the amount needed for the skew table */
+ if (skew_mcvs > 0)
+ hash_table_bytes -= skew_mcvs * bytes_per_mcv;
+ }
+ else
+ *num_skew_mcvs = 0;
+
+ /*
+ * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
+ * memory is filled, assuming a single batch; but limit the value so that
+ * the pointer arrays we'll try to allocate do not exceed hash_table_bytes
+ * nor MaxAllocSize.
+ *
+ * Note that both nbuckets and nbatch must be powers of 2 to make
+ * ExecHashGetBucketAndBatch fast.
+ */
+ max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
+ max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
+ /* If max_pointers isn't a power of 2, must round it down to one */
+ max_pointers = pg_prevpower2_size_t(max_pointers);
+
+ /* Also ensure we avoid integer overflow in nbatch and nbuckets */
+ /* (this step is redundant given the current value of MaxAllocSize) */
+ max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
+
+ dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
+ dbuckets = Min(dbuckets, max_pointers);
+ nbuckets = (int) dbuckets;
+ /* don't let nbuckets be really small, though ... */
+ nbuckets = Max(nbuckets, 1024);
+ /* ... and force it to be a power of 2. */
+ nbuckets = pg_nextpower2_32(nbuckets);
+
+ /*
+ * If there's not enough space to store the projected number of tuples and
+ * the required bucket headers, we will need multiple batches.
+ */
+ bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
+ if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
+ {
+ /* We'll need multiple batches */
+ size_t sbuckets;
+ double dbatch;
+ int minbatch;
+ size_t bucket_size;
+
+ /*
+ * If Parallel Hash with combined hash_mem would still need multiple
+ * batches, we'll have to fall back to regular hash_mem budget.
+ */
+ if (try_combined_hash_mem)
+ {
+ ExecChooseHashTableSize(ntuples, tupwidth, useskew,
+ false, parallel_workers,
+ space_allowed,
+ numbuckets,
+ numbatches,
+ num_skew_mcvs);
+ return;
+ }
+
+ /*
+ * Estimate the number of buckets we'll want to have when hash_mem is
+ * entirely full. Each bucket will contain a bucket pointer plus
+ * NTUP_PER_BUCKET tuples, whose projected size already includes
+ * overhead for the hash code, pointer to the next tuple, etc.
+ */
+ bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
+ sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);
+ sbuckets = Min(sbuckets, max_pointers);
+ nbuckets = (int) sbuckets;
+ nbuckets = pg_nextpower2_32(nbuckets);
+ bucket_bytes = nbuckets * sizeof(HashJoinTuple);
+
+ /*
+ * Buckets are simple pointers to hashjoin tuples, while tupsize
+ * includes the pointer, hash code, and MinimalTupleData. So buckets
+ * should never really exceed 25% of hash_mem (even for
+ * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not
+ * 2^N bytes, where we might get more because of doubling. So let's
+ * look for 50% here.
+ */
+ Assert(bucket_bytes <= hash_table_bytes / 2);
+
+ /* Calculate required number of batches. */
+ dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
+ dbatch = Min(dbatch, max_pointers);
+ minbatch = (int) dbatch;
+ nbatch = pg_nextpower2_32(Max(2, minbatch));
+ }
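+
+ /*
+ * Illustrative numbers, assuming tupsize comes out to about 80 bytes and
+ * ignoring any skew-table reservation: with a 4MB budget and an estimated
+ * 40MB inner relation, the multi-batch path above lands on roughly 64K
+ * buckets (512KB of bucket headers) and dbatch = ceil(40MB / 3.5MB) = 12,
+ * which rounds up to nbatch = 16.
+ */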
+
+ Assert(nbuckets > 0);
+ Assert(nbatch > 0);
+
+ *numbuckets = nbuckets;
+ *numbatches = nbatch;
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecHashTableDestroy
+ *
+ * destroy a hash table
+ * ----------------------------------------------------------------
+ */
+void
+ExecHashTableDestroy(HashJoinTable hashtable)
+{
+ int i;
+
+ /*
+ * Make sure all the temp files are closed. We skip batch 0, since it
+ * can't have any temp files (and the arrays might not even exist if
+ * nbatch is only 1). Parallel hash joins don't use these files.
+ */
+ if (hashtable->innerBatchFile != NULL)
+ {
+ for (i = 1; i < hashtable->nbatch; i++)
+ {
+ if (hashtable->innerBatchFile[i])
+ BufFileClose(hashtable->innerBatchFile[i]);
+ if (hashtable->outerBatchFile[i])
+ BufFileClose(hashtable->outerBatchFile[i]);
+ }
+ }
+
+ /* Release working memory (batchCxt is a child, so it goes away too) */
+ MemoryContextDelete(hashtable->hashCxt);
+
+ /* And drop the control block */
+ pfree(hashtable);
+}
+
+/*
+ * ExecHashIncreaseNumBatches
+ * increase the original number of batches in order to reduce
+ * current memory consumption
+ */
+static void
+ExecHashIncreaseNumBatches(HashJoinTable hashtable)
+{
+ int oldnbatch = hashtable->nbatch;
+ int curbatch = hashtable->curbatch;
+ int nbatch;
+ MemoryContext oldcxt;
+ long ninmemory;
+ long nfreed;
+ HashMemoryChunk oldchunks;
+
+ /* do nothing if we've decided to shut off growth */
+ if (!hashtable->growEnabled)
+ return;
+
+ /* safety check to avoid overflow */
+ if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2)))
+ return;
+
+ nbatch = oldnbatch * 2;
+ Assert(nbatch > 1);
+
+#ifdef HJDEBUG
+ printf("Hashjoin %p: increasing nbatch to %d because space = %zu\n",
+ hashtable, nbatch, hashtable->spaceUsed);
+#endif
+
+ oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+
+ if (hashtable->innerBatchFile == NULL)
+ {
+ /* we had no file arrays before */
+ hashtable->innerBatchFile = (BufFile **)
+ palloc0(nbatch * sizeof(BufFile *));
+ hashtable->outerBatchFile = (BufFile **)
+ palloc0(nbatch * sizeof(BufFile *));
+ /* time to establish the temp tablespaces, too */
+ PrepareTempTablespaces();
+ }
+ else
+ {
+ /* enlarge arrays and zero out added entries */
+ hashtable->innerBatchFile = (BufFile **)
+ repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *));
+ hashtable->outerBatchFile = (BufFile **)
+ repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *));
+ MemSet(hashtable->innerBatchFile + oldnbatch, 0,
+ (nbatch - oldnbatch) * sizeof(BufFile *));
+ MemSet(hashtable->outerBatchFile + oldnbatch, 0,
+ (nbatch - oldnbatch) * sizeof(BufFile *));
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+
+ hashtable->nbatch = nbatch;
+
+ /*
+ * Scan through the existing hash table entries and dump out any that are
+ * no longer of the current batch.
+ */
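+ /*
+ * Because doubling nbatch adds one more hash bit to batchno (see
+ * ExecHashGetBucketAndBatch), each tuple either keeps its old batch
+ * number or moves to a strictly higher one; e.g. when going from 4 to 8
+ * batches, a tuple in batch 2 stays in batch 2 or moves to batch 6.
+ * Tuples that still map to curbatch stay in memory, the rest are dumped
+ * to their batch files below.
+ */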
+ ninmemory = nfreed = 0;
+
+ /* If we know we need to resize nbuckets, we can do it while rebatching. */
+ if (hashtable->nbuckets_optimal != hashtable->nbuckets)
+ {
+ /* we never decrease the number of buckets */
+ Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
+
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
+
+ hashtable->buckets.unshared =
+ repalloc(hashtable->buckets.unshared,
+ sizeof(HashJoinTuple) * hashtable->nbuckets);
+ }
+
+ /*
+ * We will scan through the chunks directly, so that we can reset the
+ * buckets now and not have to keep track of which tuples in the buckets have
+ * already been processed. We will free the old chunks as we go.
+ */
+ memset(hashtable->buckets.unshared, 0,
+ sizeof(HashJoinTuple) * hashtable->nbuckets);
+ oldchunks = hashtable->chunks;
+ hashtable->chunks = NULL;
+
+ /* so, let's scan through the old chunks, and all tuples in each chunk */
+ while (oldchunks != NULL)
+ {
+ HashMemoryChunk nextchunk = oldchunks->next.unshared;
+
+ /* position within the buffer (up to oldchunks->used) */
+ size_t idx = 0;
+
+ /* process all tuples stored in this chunk (and then free it) */
+ while (idx < oldchunks->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(oldchunks) + idx);
+ MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
+ int hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len);
+ int bucketno;
+ int batchno;
+
+ ninmemory++;
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+ &bucketno, &batchno);
+
+ if (batchno == curbatch)
+ {
+ /* keep tuple in memory - copy it into the new chunk */
+ HashJoinTuple copyTuple;
+
+ copyTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
+ memcpy(copyTuple, hashTuple, hashTupleSize);
+
+ /* and add it back to the appropriate bucket */
+ copyTuple->next.unshared = hashtable->buckets.unshared[bucketno];
+ hashtable->buckets.unshared[bucketno] = copyTuple;
+ }
+ else
+ {
+ /* dump it out */
+ Assert(batchno > curbatch);
+ ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
+ hashTuple->hashvalue,
+ &hashtable->innerBatchFile[batchno]);
+
+ hashtable->spaceUsed -= hashTupleSize;
+ nfreed++;
+ }
+
+ /* next tuple in this chunk */
+ idx += MAXALIGN(hashTupleSize);
+
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /* we're done with this chunk - free it and proceed to the next one */
+ pfree(oldchunks);
+ oldchunks = nextchunk;
+ }
+
+#ifdef HJDEBUG
+ printf("Hashjoin %p: freed %ld of %ld tuples, space now %zu\n",
+ hashtable, nfreed, ninmemory, hashtable->spaceUsed);
+#endif
+
+ /*
+ * If we dumped out either all or none of the tuples in the table, disable
+ * further expansion of nbatch. This situation implies that we have
+ * enough tuples of identical hashvalues to overflow spaceAllowed.
+ * Increasing nbatch will not fix it since there's no way to subdivide the
+ * group any more finely. We have to just gut it out and hope the server
+ * has enough RAM.
+ */
+ if (nfreed == 0 || nfreed == ninmemory)
+ {
+ hashtable->growEnabled = false;
+#ifdef HJDEBUG
+ printf("Hashjoin %p: disabling further increase of nbatch\n",
+ hashtable);
+#endif
+ }
+}
+
+/*
+ * ExecParallelHashIncreaseNumBatches
+ * Every participant attached to grow_batches_barrier must run this
+ * function when it observes growth == PHJ_GROWTH_NEED_MORE_BATCHES.
+ */
+static void
+ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ int i;
+
+ Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER);
+
+ /*
+ * It's unlikely, but we need to be prepared for new participants to show
+ * up while we're in the middle of this operation, so we need to switch on
+ * the barrier phase here.
+ */
+ switch (PHJ_GROW_BATCHES_PHASE(BarrierPhase(&pstate->grow_batches_barrier)))
+ {
+ case PHJ_GROW_BATCHES_ELECTING:
+
+ /*
+ * Elect one participant to prepare to grow the number of batches.
+ * This involves reallocating or resetting the buckets of batch 0
+ * in preparation for all participants to begin repartitioning the
+ * tuples.
+ */
+ if (BarrierArriveAndWait(&pstate->grow_batches_barrier,
+ WAIT_EVENT_HASH_GROW_BATCHES_ELECT))
+ {
+ dsa_pointer_atomic *buckets;
+ ParallelHashJoinBatch *old_batch0;
+ int new_nbatch;
+ int i;
+
+ /* Move the old batch out of the way. */
+ old_batch0 = hashtable->batches[0].shared;
+ pstate->old_batches = pstate->batches;
+ pstate->old_nbatch = hashtable->nbatch;
+ pstate->batches = InvalidDsaPointer;
+
+ /* Free this backend's old accessors. */
+ ExecParallelHashCloseBatchAccessors(hashtable);
+
+ /* Figure out how many batches to use. */
+ if (hashtable->nbatch == 1)
+ {
+ /*
+ * We are going from single-batch to multi-batch. We need
+ * to switch from one large combined memory budget to the
+ * regular hash_mem budget.
+ */
+ pstate->space_allowed = get_hash_memory_limit();
+
+ /*
+ * The combined hash_mem of all participants wasn't
+ * enough. Therefore one batch per participant would be
+ * approximately equivalent and would probably also be
+ * insufficient. So try two batches per participant,
+ * rounded up to a power of two.
+ */
+ new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2);
+ }
+ else
+ {
+ /*
+ * We were already multi-batched. Try doubling the number
+ * of batches.
+ */
+ new_nbatch = hashtable->nbatch * 2;
+ }
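+
+ /*
+ * For instance, with 4 participants a single-batch overflow jumps
+ * straight to new_nbatch = 8, while an existing 8-batch table
+ * simply doubles to 16.
+ */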
+
+ /* Allocate new larger generation of batches. */
+ Assert(hashtable->nbatch == pstate->nbatch);
+ ExecParallelHashJoinSetUpBatches(hashtable, new_nbatch);
+ Assert(hashtable->nbatch == pstate->nbatch);
+
+ /* Replace or recycle batch 0's bucket array. */
+ if (pstate->old_nbatch == 1)
+ {
+ double dtuples;
+ double dbuckets;
+ int new_nbuckets;
+
+ /*
+ * We probably also need a smaller bucket array. How many
+ * tuples do we expect per batch, assuming we have only
+ * half of them so far? Normally we don't need to change
+ * the bucket array's size, because the size of each batch
+ * stays the same as we add more batches, but in this
+ * special case we move from a large batch to many smaller
+ * batches and it would be wasteful to keep the large
+ * array.
+ */
+ dtuples = (old_batch0->ntuples * 2.0) / new_nbatch;
+ dbuckets = ceil(dtuples / NTUP_PER_BUCKET);
+ dbuckets = Min(dbuckets,
+ MaxAllocSize / sizeof(dsa_pointer_atomic));
+ new_nbuckets = (int) dbuckets;
+ new_nbuckets = Max(new_nbuckets, 1024);
+ new_nbuckets = pg_nextpower2_32(new_nbuckets);
+ dsa_free(hashtable->area, old_batch0->buckets);
+ hashtable->batches[0].shared->buckets =
+ dsa_allocate(hashtable->area,
+ sizeof(dsa_pointer_atomic) * new_nbuckets);
+ buckets = (dsa_pointer_atomic *)
+ dsa_get_address(hashtable->area,
+ hashtable->batches[0].shared->buckets);
+ for (i = 0; i < new_nbuckets; ++i)
+ dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer);
+ pstate->nbuckets = new_nbuckets;
+ }
+ else
+ {
+ /* Recycle the existing bucket array. */
+ hashtable->batches[0].shared->buckets = old_batch0->buckets;
+ buckets = (dsa_pointer_atomic *)
+ dsa_get_address(hashtable->area, old_batch0->buckets);
+ for (i = 0; i < hashtable->nbuckets; ++i)
+ dsa_pointer_atomic_write(&buckets[i], InvalidDsaPointer);
+ }
+
+ /* Move all chunks to the work queue for parallel processing. */
+ pstate->chunk_work_queue = old_batch0->chunks;
+
+ /* Disable further growth temporarily while we're growing. */
+ pstate->growth = PHJ_GROWTH_DISABLED;
+ }
+ else
+ {
+ /* All other participants just flush their tuples to disk. */
+ ExecParallelHashCloseBatchAccessors(hashtable);
+ }
+ /* Fall through. */
+
+ case PHJ_GROW_BATCHES_ALLOCATING:
+ /* Wait for the above to be finished. */
+ BarrierArriveAndWait(&pstate->grow_batches_barrier,
+ WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE);
+ /* Fall through. */
+
+ case PHJ_GROW_BATCHES_REPARTITIONING:
+ /* Make sure that we have the current dimensions and buckets. */
+ ExecParallelHashEnsureBatchAccessors(hashtable);
+ ExecParallelHashTableSetCurrentBatch(hashtable, 0);
+ /* Then partition, flush counters. */
+ ExecParallelHashRepartitionFirst(hashtable);
+ ExecParallelHashRepartitionRest(hashtable);
+ ExecParallelHashMergeCounters(hashtable);
+ /* Wait for the above to be finished. */
+ BarrierArriveAndWait(&pstate->grow_batches_barrier,
+ WAIT_EVENT_HASH_GROW_BATCHES_REPARTITION);
+ /* Fall through. */
+
+ case PHJ_GROW_BATCHES_DECIDING:
+
+ /*
+ * Elect one participant to clean up and decide whether further
+ * repartitioning is needed, or should be disabled because it's
+ * not helping.
+ */
+ if (BarrierArriveAndWait(&pstate->grow_batches_barrier,
+ WAIT_EVENT_HASH_GROW_BATCHES_DECIDE))
+ {
+ bool space_exhausted = false;
+ bool extreme_skew_detected = false;
+
+ /* Make sure that we have the current dimensions and buckets. */
+ ExecParallelHashEnsureBatchAccessors(hashtable);
+ ExecParallelHashTableSetCurrentBatch(hashtable, 0);
+
+ /* Are any of the new generation of batches exhausted? */
+ for (i = 0; i < hashtable->nbatch; ++i)
+ {
+ ParallelHashJoinBatch *batch = hashtable->batches[i].shared;
+
+ if (batch->space_exhausted ||
+ batch->estimated_size > pstate->space_allowed)
+ {
+ int parent;
+
+ space_exhausted = true;
+
+ /*
+ * Did this batch receive ALL of the tuples from its
+ * parent batch? That would indicate that further
+ * repartitioning isn't going to help (the hash values
+ * are probably all the same).
+ */
+ parent = i % pstate->old_nbatch;
+ if (batch->ntuples == hashtable->batches[parent].shared->old_ntuples)
+ extreme_skew_detected = true;
+ }
+ }
+
+ /* Don't keep growing if it's not helping or we'd overflow. */
+ if (extreme_skew_detected || hashtable->nbatch >= INT_MAX / 2)
+ pstate->growth = PHJ_GROWTH_DISABLED;
+ else if (space_exhausted)
+ pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
+ else
+ pstate->growth = PHJ_GROWTH_OK;
+
+ /* Free the old batches in shared memory. */
+ dsa_free(hashtable->area, pstate->old_batches);
+ pstate->old_batches = InvalidDsaPointer;
+ }
+ /* Fall through. */
+
+ case PHJ_GROW_BATCHES_FINISHING:
+ /* Wait for the above to complete. */
+ BarrierArriveAndWait(&pstate->grow_batches_barrier,
+ WAIT_EVENT_HASH_GROW_BATCHES_FINISH);
+ }
+}
+
+/*
+ * Repartition the tuples currently loaded into memory for inner batch 0
+ * because the number of batches has been increased. Some tuples are retained
+ * in memory and some are written out to a later batch.
+ */
+static void
+ExecParallelHashRepartitionFirst(HashJoinTable hashtable)
+{
+ dsa_pointer chunk_shared;
+ HashMemoryChunk chunk;
+
+ Assert(hashtable->nbatch == hashtable->parallel_state->nbatch);
+
+ while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared)))
+ {
+ size_t idx = 0;
+
+ /* Repartition all tuples in this chunk. */
+ while (idx < chunk->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
+ MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
+ HashJoinTuple copyTuple;
+ dsa_pointer shared;
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+ &bucketno, &batchno);
+
+ Assert(batchno < hashtable->nbatch);
+ if (batchno == 0)
+ {
+ /* It still belongs in batch 0. Copy to a new chunk. */
+ copyTuple =
+ ExecParallelHashTupleAlloc(hashtable,
+ HJTUPLE_OVERHEAD + tuple->t_len,
+ &shared);
+ copyTuple->hashvalue = hashTuple->hashvalue;
+ memcpy(HJTUPLE_MINTUPLE(copyTuple), tuple, tuple->t_len);
+ ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
+ copyTuple, shared);
+ }
+ else
+ {
+ size_t tuple_size =
+ MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
+
+ /* It belongs in a later batch. */
+ hashtable->batches[batchno].estimated_size += tuple_size;
+ sts_puttuple(hashtable->batches[batchno].inner_tuples,
+ &hashTuple->hashvalue, tuple);
+ }
+
+ /* Count this tuple. */
+ ++hashtable->batches[0].old_ntuples;
+ ++hashtable->batches[batchno].ntuples;
+
+ idx += MAXALIGN(HJTUPLE_OVERHEAD +
+ HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+
+ /* Free this chunk. */
+ dsa_free(hashtable->area, chunk_shared);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+}
+
+/*
+ * Help repartition inner batches 1..n.
+ */
+static void
+ExecParallelHashRepartitionRest(HashJoinTable hashtable)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ int old_nbatch = pstate->old_nbatch;
+ SharedTuplestoreAccessor **old_inner_tuples;
+ ParallelHashJoinBatch *old_batches;
+ int i;
+
+ /* Get our hands on the previous generation of batches. */
+ old_batches = (ParallelHashJoinBatch *)
+ dsa_get_address(hashtable->area, pstate->old_batches);
+ old_inner_tuples = palloc0(sizeof(SharedTuplestoreAccessor *) * old_nbatch);
+ for (i = 1; i < old_nbatch; ++i)
+ {
+ ParallelHashJoinBatch *shared =
+ NthParallelHashJoinBatch(old_batches, i);
+
+ old_inner_tuples[i] = sts_attach(ParallelHashJoinBatchInner(shared),
+ ParallelWorkerNumber + 1,
+ &pstate->fileset);
+ }
+
+ /* Join in the effort to repartition them. */
+ for (i = 1; i < old_nbatch; ++i)
+ {
+ MinimalTuple tuple;
+ uint32 hashvalue;
+
+ /* Scan one partition from the previous generation. */
+ sts_begin_parallel_scan(old_inner_tuples[i]);
+ while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], &hashvalue)))
+ {
+ size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
+ int bucketno;
+ int batchno;
+
+ /* Decide which partition it goes to in the new generation. */
+ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno,
+ &batchno);
+
+ hashtable->batches[batchno].estimated_size += tuple_size;
+ ++hashtable->batches[batchno].ntuples;
+ ++hashtable->batches[i].old_ntuples;
+
+ /* Store the tuple in its new batch. */
+ sts_puttuple(hashtable->batches[batchno].inner_tuples,
+ &hashvalue, tuple);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+ sts_end_parallel_scan(old_inner_tuples[i]);
+ }
+
+ pfree(old_inner_tuples);
+}
+
+/*
+ * Transfer the backend-local per-batch counters to the shared totals.
+ */
+static void
+ExecParallelHashMergeCounters(HashJoinTable hashtable)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ int i;
+
+ LWLockAcquire(&pstate->lock, LW_EXCLUSIVE);
+ pstate->total_tuples = 0;
+ for (i = 0; i < hashtable->nbatch; ++i)
+ {
+ ParallelHashJoinBatchAccessor *batch = &hashtable->batches[i];
+
+ batch->shared->size += batch->size;
+ batch->shared->estimated_size += batch->estimated_size;
+ batch->shared->ntuples += batch->ntuples;
+ batch->shared->old_ntuples += batch->old_ntuples;
+ batch->size = 0;
+ batch->estimated_size = 0;
+ batch->ntuples = 0;
+ batch->old_ntuples = 0;
+ pstate->total_tuples += batch->shared->ntuples;
+ }
+ LWLockRelease(&pstate->lock);
+}
+
+/*
+ * ExecHashIncreaseNumBuckets
+ * increase the original number of buckets in order to reduce
+ * the number of tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+ HashMemoryChunk chunk;
+
+ /* do nothing if not an increase (it's called increase for a reason) */
+ if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
+ return;
+
+#ifdef HJDEBUG
+ printf("Hashjoin %p: increasing nbuckets %d => %d\n",
+ hashtable, hashtable->nbuckets, hashtable->nbuckets_optimal);
+#endif
+
+ hashtable->nbuckets = hashtable->nbuckets_optimal;
+ hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
+
+ Assert(hashtable->nbuckets > 1);
+ Assert(hashtable->nbuckets <= (INT_MAX / 2));
+ Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+ /*
+ * Just reallocate the proper number of buckets - we don't need to walk
+ * through them - we can walk the dense-allocated chunks (just like in
+ * ExecHashIncreaseNumBatches, but without all the copying into new
+ * chunks)
+ */
+ hashtable->buckets.unshared =
+ (HashJoinTuple *) repalloc(hashtable->buckets.unshared,
+ hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ memset(hashtable->buckets.unshared, 0,
+ hashtable->nbuckets * sizeof(HashJoinTuple));
+
+ /* scan through all tuples in all chunks to rebuild the hash table */
+ for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next.unshared)
+ {
+ /* process all tuples stored in this chunk */
+ size_t idx = 0;
+
+ while (idx < chunk->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+ &bucketno, &batchno);
+
+ /* add the tuple to the proper bucket */
+ hashTuple->next.unshared = hashtable->buckets.unshared[bucketno];
+ hashtable->buckets.unshared[bucketno] = hashTuple;
+
+ /* advance index past the tuple */
+ idx += MAXALIGN(HJTUPLE_OVERHEAD +
+ HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ }
+}
+
+static void
+ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ int i;
+ HashMemoryChunk chunk;
+ dsa_pointer chunk_s;
+
+ Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER);
+
+ /*
+ * It's unlikely, but we need to be prepared for new participants to show
+ * up while we're in the middle of this operation, so we need to switch on
+ * the barrier phase here.
+ */
+ switch (PHJ_GROW_BUCKETS_PHASE(BarrierPhase(&pstate->grow_buckets_barrier)))
+ {
+ case PHJ_GROW_BUCKETS_ELECTING:
+ /* Elect one participant to prepare to increase nbuckets. */
+ if (BarrierArriveAndWait(&pstate->grow_buckets_barrier,
+ WAIT_EVENT_HASH_GROW_BUCKETS_ELECT))
+ {
+ size_t size;
+ dsa_pointer_atomic *buckets;
+
+ /* Double the size of the bucket array. */
+ pstate->nbuckets *= 2;
+ size = pstate->nbuckets * sizeof(dsa_pointer_atomic);
+ hashtable->batches[0].shared->size += size / 2;
+ dsa_free(hashtable->area, hashtable->batches[0].shared->buckets);
+ hashtable->batches[0].shared->buckets =
+ dsa_allocate(hashtable->area, size);
+ buckets = (dsa_pointer_atomic *)
+ dsa_get_address(hashtable->area,
+ hashtable->batches[0].shared->buckets);
+ for (i = 0; i < pstate->nbuckets; ++i)
+ dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer);
+
+ /* Put the chunk list onto the work queue. */
+ pstate->chunk_work_queue = hashtable->batches[0].shared->chunks;
+
+ /* Clear the flag. */
+ pstate->growth = PHJ_GROWTH_OK;
+ }
+ /* Fall through. */
+
+ case PHJ_GROW_BUCKETS_ALLOCATING:
+ /* Wait for the above to complete. */
+ BarrierArriveAndWait(&pstate->grow_buckets_barrier,
+ WAIT_EVENT_HASH_GROW_BUCKETS_ALLOCATE);
+ /* Fall through. */
+
+ case PHJ_GROW_BUCKETS_REINSERTING:
+ /* Reinsert all tuples into the hash table. */
+ ExecParallelHashEnsureBatchAccessors(hashtable);
+ ExecParallelHashTableSetCurrentBatch(hashtable, 0);
+ while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_s)))
+ {
+ size_t idx = 0;
+
+ while (idx < chunk->used)
+ {
+ HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
+ dsa_pointer shared = chunk_s + HASH_CHUNK_HEADER_SIZE + idx;
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+ &bucketno, &batchno);
+ Assert(batchno == 0);
+
+ /* add the tuple to the proper bucket */
+ ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
+ hashTuple, shared);
+
+ /* advance index past the tuple */
+ idx += MAXALIGN(HJTUPLE_OVERHEAD +
+ HJTUPLE_MINTUPLE(hashTuple)->t_len);
+ }
+
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ }
+ BarrierArriveAndWait(&pstate->grow_buckets_barrier,
+ WAIT_EVENT_HASH_GROW_BUCKETS_REINSERT);
+ }
+}
+
+/*
+ * ExecHashTableInsert
+ * insert a tuple into the hash table; depending on the hash value,
+ * it may just go to a temp file for later batches
+ *
+ * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual
+ * tuple; the minimal case in particular is certain to happen while reloading
+ * tuples from batch files. We could save some cycles in the regular-tuple
+ * case by not forcing the slot contents into minimal form; not clear if it's
+ * worth the messiness required.
+ */
+void
+ExecHashTableInsert(HashJoinTable hashtable,
+ TupleTableSlot *slot,
+ uint32 hashvalue)
+{
+ bool shouldFree;
+ MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+ int bucketno;
+ int batchno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashvalue,
+ &bucketno, &batchno);
+
+ /*
+ * decide whether to put the tuple in the hash table or a temp file
+ */
+ if (batchno == hashtable->curbatch)
+ {
+ /*
+ * put the tuple in hash table
+ */
+ HashJoinTuple hashTuple;
+ int hashTupleSize;
+ double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
+
+ /* Create the HashJoinTuple */
+ hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
+ hashTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
+
+ hashTuple->hashvalue = hashvalue;
+ memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
+
+ /*
+ * We always reset the tuple-matched flag on insertion. This is okay
+ * even when reloading a tuple from a batch file, since the tuple
+ * could not possibly have been matched to an outer tuple before it
+ * went into the batch file.
+ */
+ HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
+
+ /* Push it onto the front of the bucket's list */
+ hashTuple->next.unshared = hashtable->buckets.unshared[bucketno];
+ hashtable->buckets.unshared[bucketno] = hashTuple;
+
+ /*
+ * Increase the (optimal) number of buckets if we just exceeded the
+ * NTUP_PER_BUCKET threshold, but only when there's still a single
+ * batch.
+ */
+ if (hashtable->nbatch == 1 &&
+ ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
+ {
+ /* Guard against integer overflow and alloc size overflow */
+ if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
+ hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
+ {
+ hashtable->nbuckets_optimal *= 2;
+ hashtable->log2_nbuckets_optimal += 1;
+ }
+ }
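+
+ /*
+ * For example, with NTUP_PER_BUCKET = 1 and nbuckets_optimal = 1024,
+ * once the non-skew tuple count passes 1024 in a single-batch join,
+ * nbuckets_optimal doubles to 2048; the bucket array itself is only
+ * resized later, by ExecHashIncreaseNumBuckets() (or while rebatching).
+ */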
+
+ /* Account for space used, and back off if we've used too much */
+ hashtable->spaceUsed += hashTupleSize;
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+ if (hashtable->spaceUsed +
+ hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
+ > hashtable->spaceAllowed)
+ ExecHashIncreaseNumBatches(hashtable);
+ }
+ else
+ {
+ /*
+ * put the tuple into a temp file for later batches
+ */
+ Assert(batchno > hashtable->curbatch);
+ ExecHashJoinSaveTuple(tuple,
+ hashvalue,
+ &hashtable->innerBatchFile[batchno]);
+ }
+
+ if (shouldFree)
+ heap_free_minimal_tuple(tuple);
+}
+
+/*
+ * ExecParallelHashTableInsert
+ * insert a tuple into a shared hash table or shared batch tuplestore
+ */
+void
+ExecParallelHashTableInsert(HashJoinTable hashtable,
+ TupleTableSlot *slot,
+ uint32 hashvalue)
+{
+ bool shouldFree;
+ MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+ dsa_pointer shared;
+ int bucketno;
+ int batchno;
+
+retry:
+ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
+
+ if (batchno == 0)
+ {
+ HashJoinTuple hashTuple;
+
+ /* Try to load it into memory. */
+ Assert(BarrierPhase(&hashtable->parallel_state->build_barrier) ==
+ PHJ_BUILD_HASHING_INNER);
+ hashTuple = ExecParallelHashTupleAlloc(hashtable,
+ HJTUPLE_OVERHEAD + tuple->t_len,
+ &shared);
+ if (hashTuple == NULL)
+ goto retry;
+
+ /* Store the hash value in the HashJoinTuple header. */
+ hashTuple->hashvalue = hashvalue;
+ memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
+
+ /* Push it onto the front of the bucket's list */
+ ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
+ hashTuple, shared);
+ }
+ else
+ {
+ size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
+
+ Assert(batchno > 0);
+
+ /* Try to preallocate space in the batch if necessary. */
+ if (hashtable->batches[batchno].preallocated < tuple_size)
+ {
+ if (!ExecParallelHashTuplePrealloc(hashtable, batchno, tuple_size))
+ goto retry;
+ }
+
+ Assert(hashtable->batches[batchno].preallocated >= tuple_size);
+ hashtable->batches[batchno].preallocated -= tuple_size;
+ sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue,
+ tuple);
+ }
+ ++hashtable->batches[batchno].ntuples;
+
+ if (shouldFree)
+ heap_free_minimal_tuple(tuple);
+}
+
+/*
+ * Insert a tuple into the current hash table. Unlike
+ * ExecParallelHashTableInsert, this version is not prepared to send the tuple
+ * to other batches or to run out of memory, and should only be called with
+ * tuples that belong in the current batch once growth has been disabled.
+ */
+void
+ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
+ TupleTableSlot *slot,
+ uint32 hashvalue)
+{
+ bool shouldFree;
+ MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+ HashJoinTuple hashTuple;
+ dsa_pointer shared;
+ int batchno;
+ int bucketno;
+
+ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
+ Assert(batchno == hashtable->curbatch);
+ hashTuple = ExecParallelHashTupleAlloc(hashtable,
+ HJTUPLE_OVERHEAD + tuple->t_len,
+ &shared);
+ hashTuple->hashvalue = hashvalue;
+ memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
+ HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
+ ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
+ hashTuple, shared);
+
+ if (shouldFree)
+ heap_free_minimal_tuple(tuple);
+}
+
+/*
+ * ExecHashGetHashValue
+ * Compute the hash value for a tuple
+ *
+ * The tuple to be tested must be in econtext->ecxt_outertuple (thus Vars in
+ * the hashkeys expressions need to have OUTER_VAR as varno). If outer_tuple
+ * is false (meaning it's the HashJoin's inner node, Hash), econtext,
+ * hashkeys, and slot need to be from Hash, with hashkeys/slot referencing and
+ * being suitable for tuples from the node below the Hash. Conversely, if
+ * outer_tuple is true, econtext is from HashJoin, and hashkeys/slot need to
+ * be appropriate for tuples from HashJoin's outer node.
+ *
+ * A true result means the tuple's hash value has been successfully computed
+ * and stored at *hashvalue. A false result means the tuple cannot match
+ * because it contains a null attribute, and hence it should be discarded
+ * immediately. (If keep_nulls is true then false is never returned.)
+ */
+bool
+ExecHashGetHashValue(HashJoinTable hashtable,
+ ExprContext *econtext,
+ List *hashkeys,
+ bool outer_tuple,
+ bool keep_nulls,
+ uint32 *hashvalue)
+{
+ uint32 hashkey = 0;
+ FmgrInfo *hashfunctions;
+ ListCell *hk;
+ int i = 0;
+ MemoryContext oldContext;
+
+ /*
+ * We reset the eval context each time to reclaim any memory leaked in the
+ * hashkey expressions.
+ */
+ ResetExprContext(econtext);
+
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ if (outer_tuple)
+ hashfunctions = hashtable->outer_hashfunctions;
+ else
+ hashfunctions = hashtable->inner_hashfunctions;
+
+ foreach(hk, hashkeys)
+ {
+ ExprState *keyexpr = (ExprState *) lfirst(hk);
+ Datum keyval;
+ bool isNull;
+
+ /* rotate hashkey left 1 bit at each step */
+ hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
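+ /*
+ * For a two-key join, the net effect is (h1 rotated left one bit)
+ * XOR h2, so equal hash values in different key positions don't
+ * simply cancel each other out.
+ */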
+
+ /*
+ * Get the join attribute value of the tuple
+ */
+ keyval = ExecEvalExpr(keyexpr, econtext, &isNull);
+
+ /*
+ * If the attribute is NULL, and the join operator is strict, then
+ * this tuple cannot pass the join qual so we can reject it
+ * immediately (unless we're scanning the outside of an outer join, in
+ * which case we must not reject it). Otherwise we act like the
+ * hashcode of NULL is zero (this will support operators that act like
+ * IS NOT DISTINCT, though not any more-random behavior). We treat
+ * the hash support function as strict even if the operator is not.
+ *
+ * Note: currently, all hashjoinable operators must be strict since
+ * the hash index AM assumes that. However, it takes so little extra
+ * code here to allow non-strict that we may as well do it.
+ */
+ if (isNull)
+ {
+ if (hashtable->hashStrict[i] && !keep_nulls)
+ {
+ MemoryContextSwitchTo(oldContext);
+ return false; /* cannot match */
+ }
+ /* else, leave hashkey unmodified, equivalent to hashcode 0 */
+ }
+ else
+ {
+ /* Compute the hash function */
+ uint32 hkey;
+
+ hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], hashtable->collations[i], keyval));
+ hashkey ^= hkey;
+ }
+
+ i++;
+ }
+
+ MemoryContextSwitchTo(oldContext);
+
+ *hashvalue = hashkey;
+ return true;
+}
+
+/*
+ * ExecHashGetBucketAndBatch
+ * Determine the bucket number and batch number for a hash value
+ *
+ * Note: on-the-fly increases of nbatch must not change the bucket number
+ * for a given hash code (since we don't move tuples to different hash
+ * chains), and must only cause the batch number to remain the same or
+ * increase. Our algorithm is
+ * bucketno = hashvalue MOD nbuckets
+ * batchno = ROR(hashvalue, log2_nbuckets) MOD nbatch
+ * where nbuckets and nbatch are both expected to be powers of 2, so we can
+ * do the computations by shifting and masking. (This assumes that all hash
+ * functions are good about randomizing all their output bits, else we are
+ * likely to have very skewed bucket or batch occupancy.)
+ *
+ * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic
+ * bucket count growth. Once we start batching, the value is fixed and does
+ * not change over the course of the join (making it possible to compute batch
+ * number the way we do here).
+ *
+ * nbatch is always a power of 2; we increase it only by doubling it. This
+ * effectively adds one more bit to the top of the batchno. In very large
+ * joins, we might run out of bits to add, so we do this by rotating the hash
+ * value. This causes batchno to steal bits from bucketno when the number of
+ * virtual buckets exceeds 2^32. It's better to have longer bucket chains
+ * than to lose the ability to divide batches.
+ */
+void
+ExecHashGetBucketAndBatch(HashJoinTable hashtable,
+ uint32 hashvalue,
+ int *bucketno,
+ int *batchno)
+{
+ uint32 nbuckets = (uint32) hashtable->nbuckets;
+ uint32 nbatch = (uint32) hashtable->nbatch;
+
+ if (nbatch > 1)
+ {
+ *bucketno = hashvalue & (nbuckets - 1);
+ *batchno = pg_rotate_right32(hashvalue,
+ hashtable->log2_nbuckets) & (nbatch - 1);
+ }
+ else
+ {
+ *bucketno = hashvalue & (nbuckets - 1);
+ *batchno = 0;
+ }
+}
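+
+/*
+ * Worked example of the mapping above: with nbuckets = 1024 (log2_nbuckets
+ * = 10) and nbatch = 4, bucketno is the low 10 bits of the hash value and
+ * batchno is the next 2 bits; hashvalue 0x12345678 gives bucketno = 0x278 =
+ * 632 and batchno = 1.  Doubling nbatch to 8 adds one more hash bit to
+ * batchno, so a batch-1 tuple can only stay in batch 1 or move to batch 5,
+ * never to a lower-numbered batch.
+ */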
+
+/*
+ * ExecScanHashBucket
+ * scan a hash bucket for matches to the current outer tuple
+ *
+ * The current outer tuple must be stored in econtext->ecxt_outertuple.
+ *
+ * On success, the inner tuple is stored into hjstate->hj_CurTuple and
+ * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot
+ * for the latter.
+ */
+bool
+ExecScanHashBucket(HashJoinState *hjstate,
+ ExprContext *econtext)
+{
+ ExprState *hjclauses = hjstate->hashclauses;
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ HashJoinTuple hashTuple = hjstate->hj_CurTuple;
+ uint32 hashvalue = hjstate->hj_CurHashValue;
+
+ /*
+ * hj_CurTuple is the address of the tuple last returned from the current
+ * bucket, or NULL if it's time to start scanning a new bucket.
+ *
+ * If the tuple hashed to a skew bucket then scan the skew bucket;
+ * otherwise scan the standard hashtable bucket.
+ */
+ if (hashTuple != NULL)
+ hashTuple = hashTuple->next.unshared;
+ else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)
+ hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;
+ else
+ hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];
+
+ while (hashTuple != NULL)
+ {
+ if (hashTuple->hashvalue == hashvalue)
+ {
+ TupleTableSlot *inntuple;
+
+ /* insert hashtable's tuple into exec slot so ExecQual sees it */
+ inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+ hjstate->hj_HashTupleSlot,
+ false); /* do not pfree */
+ econtext->ecxt_innertuple = inntuple;
+
+ if (ExecQualAndReset(hjclauses, econtext))
+ {
+ hjstate->hj_CurTuple = hashTuple;
+ return true;
+ }
+ }
+
+ hashTuple = hashTuple->next.unshared;
+ }
+
+ /*
+ * no match
+ */
+ return false;
+}
+
+/*
+ * ExecParallelScanHashBucket
+ * scan a hash bucket for matches to the current outer tuple
+ *
+ * The current outer tuple must be stored in econtext->ecxt_outertuple.
+ *
+ * On success, the inner tuple is stored into hjstate->hj_CurTuple and
+ * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot
+ * for the latter.
+ */
+bool
+ExecParallelScanHashBucket(HashJoinState *hjstate,
+ ExprContext *econtext)
+{
+ ExprState *hjclauses = hjstate->hashclauses;
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ HashJoinTuple hashTuple = hjstate->hj_CurTuple;
+ uint32 hashvalue = hjstate->hj_CurHashValue;
+
+ /*
+ * hj_CurTuple is the address of the tuple last returned from the current
+ * bucket, or NULL if it's time to start scanning a new bucket.
+ */
+ if (hashTuple != NULL)
+ hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);
+ else
+ hashTuple = ExecParallelHashFirstTuple(hashtable,
+ hjstate->hj_CurBucketNo);
+
+ while (hashTuple != NULL)
+ {
+ if (hashTuple->hashvalue == hashvalue)
+ {
+ TupleTableSlot *inntuple;
+
+ /* insert hashtable's tuple into exec slot so ExecQual sees it */
+ inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+ hjstate->hj_HashTupleSlot,
+ false); /* do not pfree */
+ econtext->ecxt_innertuple = inntuple;
+
+ if (ExecQualAndReset(hjclauses, econtext))
+ {
+ hjstate->hj_CurTuple = hashTuple;
+ return true;
+ }
+ }
+
+ hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);
+ }
+
+ /*
+ * no match
+ */
+ return false;
+}
+
+/*
+ * ExecPrepHashTableForUnmatched
+ * set up for a series of ExecScanHashTableForUnmatched calls
+ */
+void
+ExecPrepHashTableForUnmatched(HashJoinState *hjstate)
+{
+ /*----------
+ * During this scan we use the HashJoinState fields as follows:
+ *
+ * hj_CurBucketNo: next regular bucket to scan
+ * hj_CurSkewBucketNo: next skew bucket (an index into skewBucketNums)
+ * hj_CurTuple: last tuple returned, or NULL to start next bucket
+ *----------
+ */
+ hjstate->hj_CurBucketNo = 0;
+ hjstate->hj_CurSkewBucketNo = 0;
+ hjstate->hj_CurTuple = NULL;
+}
+
+/*
+ * ExecScanHashTableForUnmatched
+ * scan the hash table for unmatched inner tuples
+ *
+ * On success, the inner tuple is stored into hjstate->hj_CurTuple and
+ * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot
+ * for the latter.
+ */
+bool
+ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ HashJoinTuple hashTuple = hjstate->hj_CurTuple;
+
+ for (;;)
+ {
+ /*
+ * hj_CurTuple is the address of the tuple last returned from the
+ * current bucket, or NULL if it's time to start scanning a new
+ * bucket.
+ */
+ if (hashTuple != NULL)
+ hashTuple = hashTuple->next.unshared;
+ else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
+ {
+ hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];
+ hjstate->hj_CurBucketNo++;
+ }
+ else if (hjstate->hj_CurSkewBucketNo < hashtable->nSkewBuckets)
+ {
+ int j = hashtable->skewBucketNums[hjstate->hj_CurSkewBucketNo];
+
+ hashTuple = hashtable->skewBucket[j]->tuples;
+ hjstate->hj_CurSkewBucketNo++;
+ }
+ else
+ break; /* finished all buckets */
+
+ while (hashTuple != NULL)
+ {
+ if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple)))
+ {
+ TupleTableSlot *inntuple;
+
+ /* insert hashtable's tuple into exec slot */
+ inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+ hjstate->hj_HashTupleSlot,
+ false); /* do not pfree */
+ econtext->ecxt_innertuple = inntuple;
+
+ /*
+ * Reset temp memory each time; although this function doesn't
+ * do any qual eval, the caller will, so let's keep it
+ * parallel to ExecScanHashBucket.
+ */
+ ResetExprContext(econtext);
+
+ hjstate->hj_CurTuple = hashTuple;
+ return true;
+ }
+
+ hashTuple = hashTuple->next.unshared;
+ }
+
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * no more unmatched tuples
+ */
+ return false;
+}
+
+/*
+ * ExecHashTableReset
+ *
+ * reset hash table header for new batch
+ */
+void
+ExecHashTableReset(HashJoinTable hashtable)
+{
+ MemoryContext oldcxt;
+ int nbuckets = hashtable->nbuckets;
+
+ /*
+ * Release all the hash buckets and tuples acquired in the prior pass, and
+ * reinitialize the context for a new pass.
+ */
+ MemoryContextReset(hashtable->batchCxt);
+ oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
+
+ /* Reallocate and reinitialize the hash bucket headers. */
+ hashtable->buckets.unshared = (HashJoinTuple *)
+ palloc0(nbuckets * sizeof(HashJoinTuple));
+
+ hashtable->spaceUsed = 0;
+
+ MemoryContextSwitchTo(oldcxt);
+
+ /* Forget the chunks (the memory was freed by the context reset above). */
+ hashtable->chunks = NULL;
+}
+
+/*
+ * ExecHashTableResetMatchFlags
+ * Clear all the HeapTupleHeaderHasMatch flags in the table
+ */
+void
+ExecHashTableResetMatchFlags(HashJoinTable hashtable)
+{
+ HashJoinTuple tuple;
+ int i;
+
+ /* Reset all flags in the main table ... */
+ for (i = 0; i < hashtable->nbuckets; i++)
+ {
+ for (tuple = hashtable->buckets.unshared[i]; tuple != NULL;
+ tuple = tuple->next.unshared)
+ HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
+ }
+
+ /* ... and the same for the skew buckets, if any */
+ for (i = 0; i < hashtable->nSkewBuckets; i++)
+ {
+ int j = hashtable->skewBucketNums[i];
+ HashSkewBucket *skewBucket = hashtable->skewBucket[j];
+
+ for (tuple = skewBucket->tuples; tuple != NULL; tuple = tuple->next.unshared)
+ HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
+ }
+}
+
+
+void
+ExecReScanHash(HashState *node)
+{
+ /*
+ * If chgParam of the subnode is not null, then the plan will be re-scanned
+ * by the first ExecProcNode call.
+ */
+ if (node->ps.lefttree->chgParam == NULL)
+ ExecReScan(node->ps.lefttree);
+}
+
+
+/*
+ * ExecHashBuildSkewHash
+ *
+ * Set up for skew optimization if we can identify the most common values
+ * (MCVs) of the outer relation's join key. We make a skew hash bucket
+ * for the hash value of each MCV, up to the number of slots allowed
+ * based on available memory.
+ */
+static void
+ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, int mcvsToUse)
+{
+ HeapTupleData *statsTuple;
+ AttStatsSlot sslot;
+
+ /* Do nothing if planner didn't identify the outer relation's join key */
+ if (!OidIsValid(node->skewTable))
+ return;
+ /* Also, do nothing if we don't have room for at least one skew bucket */
+ if (mcvsToUse <= 0)
+ return;
+
+ /*
+ * Try to find the MCV statistics for the outer relation's join key.
+ */
+ statsTuple = SearchSysCache3(STATRELATTINH,
+ ObjectIdGetDatum(node->skewTable),
+ Int16GetDatum(node->skewColumn),
+ BoolGetDatum(node->skewInherit));
+ if (!HeapTupleIsValid(statsTuple))
+ return;
+
+ if (get_attstatsslot(&sslot, statsTuple,
+ STATISTIC_KIND_MCV, InvalidOid,
+ ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
+ {
+ double frac;
+ int nbuckets;
+ FmgrInfo *hashfunctions;
+ int i;
+
+ if (mcvsToUse > sslot.nvalues)
+ mcvsToUse = sslot.nvalues;
+
+ /*
+ * Calculate the expected fraction of outer relation that will
+ * participate in the skew optimization. If this isn't at least
+ * SKEW_MIN_OUTER_FRACTION, don't use skew optimization.
+ */
+ frac = 0;
+ for (i = 0; i < mcvsToUse; i++)
+ frac += sslot.numbers[i];
+ if (frac < SKEW_MIN_OUTER_FRACTION)
+ {
+ free_attstatsslot(&sslot);
+ ReleaseSysCache(statsTuple);
+ return;
+ }
+
+ /*
+ * Okay, set up the skew hashtable.
+ *
+ * skewBucket[] is an open addressing hashtable with a power of 2 size
+ * that is greater than the number of MCV values. (This ensures there
+ * will be at least one null entry, so searches will always
+ * terminate.)
+ *
+ * Note: this code could fail if mcvsToUse exceeds INT_MAX/8 or
+ * MaxAllocSize/sizeof(void *)/8, but that is not currently possible
+ * since we limit pg_statistic entries to much less than that.
+ */
+ nbuckets = pg_nextpower2_32(mcvsToUse + 1);
+ /* use two more bits just to help avoid collisions */
+ nbuckets <<= 2;
+
+ hashtable->skewEnabled = true;
+ hashtable->skewBucketLen = nbuckets;
+
+ /*
+ * We allocate the bucket memory in the hashtable's batch context. It
+ * is only needed during the first batch, and this ensures it will be
+ * automatically removed once the first batch is done.
+ */
+ hashtable->skewBucket = (HashSkewBucket **)
+ MemoryContextAllocZero(hashtable->batchCxt,
+ nbuckets * sizeof(HashSkewBucket *));
+ hashtable->skewBucketNums = (int *)
+ MemoryContextAllocZero(hashtable->batchCxt,
+ mcvsToUse * sizeof(int));
+
+ hashtable->spaceUsed += nbuckets * sizeof(HashSkewBucket *)
+ + mcvsToUse * sizeof(int);
+ hashtable->spaceUsedSkew += nbuckets * sizeof(HashSkewBucket *)
+ + mcvsToUse * sizeof(int);
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+
+ /*
+ * Create a skew bucket for each MCV hash value.
+ *
+ * Note: it is very important that we create the buckets in order of
+ * decreasing MCV frequency. If we have to remove some buckets, they
+ * must be removed in reverse order of creation (see notes in
+ * ExecHashRemoveNextSkewBucket) and we want the least common MCVs to
+ * be removed first.
+ */
+ hashfunctions = hashtable->outer_hashfunctions;
+
+ for (i = 0; i < mcvsToUse; i++)
+ {
+ uint32 hashvalue;
+ int bucket;
+
+ hashvalue = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[0],
+ hashtable->collations[0],
+ sslot.values[i]));
+
+ /*
+ * While we have not hit a hole in the hashtable and have not hit
+ * the desired bucket, we have collided with some previous hash
+ * value, so try the next bucket location. NB: this code must
+ * match ExecHashGetSkewBucket.
+ */
+ bucket = hashvalue & (nbuckets - 1);
+ while (hashtable->skewBucket[bucket] != NULL &&
+ hashtable->skewBucket[bucket]->hashvalue != hashvalue)
+ bucket = (bucket + 1) & (nbuckets - 1);
+
+ /*
+ * If we found an existing bucket with the same hashvalue, leave
+ * it alone. It's okay for two MCVs to share a hashvalue.
+ */
+ if (hashtable->skewBucket[bucket] != NULL)
+ continue;
+
+ /* Okay, create a new skew bucket for this hashvalue. */
+ hashtable->skewBucket[bucket] = (HashSkewBucket *)
+ MemoryContextAlloc(hashtable->batchCxt,
+ sizeof(HashSkewBucket));
+ hashtable->skewBucket[bucket]->hashvalue = hashvalue;
+ hashtable->skewBucket[bucket]->tuples = NULL;
+ hashtable->skewBucketNums[hashtable->nSkewBuckets] = bucket;
+ hashtable->nSkewBuckets++;
+ hashtable->spaceUsed += SKEW_BUCKET_OVERHEAD;
+ hashtable->spaceUsedSkew += SKEW_BUCKET_OVERHEAD;
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+ }
+
+ free_attstatsslot(&sslot);
+ }
+
+ ReleaseSysCache(statsTuple);
+}
+
+/*
+ * ExecHashGetSkewBucket
+ *
+ * Returns the index of the skew bucket for this hashvalue,
+ * or INVALID_SKEW_BUCKET_NO if the hashvalue is not
+ * associated with any active skew bucket.
+ */
+int
+ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue)
+{
+ int bucket;
+
+ /*
+ * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in
+ * particular, this happens after the initial batch is done).
+ */
+ if (!hashtable->skewEnabled)
+ return INVALID_SKEW_BUCKET_NO;
+
+ /*
+ * Since skewBucketLen is a power of 2, we can do a modulo by ANDing.
+ */
+ bucket = hashvalue & (hashtable->skewBucketLen - 1);
+
+ /*
+ * While we have not hit a hole in the hashtable and have not hit the
+ * desired bucket, we have collided with some other hash value, so try the
+ * next bucket location.
+ */
+ while (hashtable->skewBucket[bucket] != NULL &&
+ hashtable->skewBucket[bucket]->hashvalue != hashvalue)
+ bucket = (bucket + 1) & (hashtable->skewBucketLen - 1);
+
+ /*
+ * Found the desired bucket?
+ */
+ if (hashtable->skewBucket[bucket] != NULL)
+ return bucket;
+
+ /*
+ * There must not be any hashtable entry for this hash value.
+ */
+ return INVALID_SKEW_BUCKET_NO;
+}
+
+/*
+ * ExecHashSkewTableInsert
+ *
+ * Insert a tuple into the skew hashtable.
+ *
+ * This should generally match up with the current-batch case in
+ * ExecHashTableInsert.
+ */
+static void
+ExecHashSkewTableInsert(HashJoinTable hashtable,
+ TupleTableSlot *slot,
+ uint32 hashvalue,
+ int bucketNumber)
+{
+ bool shouldFree;
+ MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+ HashJoinTuple hashTuple;
+ int hashTupleSize;
+
+ /* Create the HashJoinTuple */
+ hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
+ hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
+ hashTupleSize);
+ hashTuple->hashvalue = hashvalue;
+ memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
+ HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
+
+ /* Push it onto the front of the skew bucket's list */
+ hashTuple->next.unshared = hashtable->skewBucket[bucketNumber]->tuples;
+ hashtable->skewBucket[bucketNumber]->tuples = hashTuple;
+ Assert(hashTuple != hashTuple->next.unshared);
+
+ /* Account for space used, and back off if we've used too much */
+ hashtable->spaceUsed += hashTupleSize;
+ hashtable->spaceUsedSkew += hashTupleSize;
+ if (hashtable->spaceUsed > hashtable->spacePeak)
+ hashtable->spacePeak = hashtable->spaceUsed;
+ while (hashtable->spaceUsedSkew > hashtable->spaceAllowedSkew)
+ ExecHashRemoveNextSkewBucket(hashtable);
+
+ /* Check we are not over the total spaceAllowed, either */
+ if (hashtable->spaceUsed > hashtable->spaceAllowed)
+ ExecHashIncreaseNumBatches(hashtable);
+
+ if (shouldFree)
+ heap_free_minimal_tuple(tuple);
+}
+
+/*
+ * ExecHashRemoveNextSkewBucket
+ *
+ * Remove the least valuable skew bucket by pushing its tuples into
+ * the main hash table.
+ */
+static void
+ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
+{
+ int bucketToRemove;
+ HashSkewBucket *bucket;
+ uint32 hashvalue;
+ int bucketno;
+ int batchno;
+ HashJoinTuple hashTuple;
+
+ /* Locate the bucket to remove */
+ bucketToRemove = hashtable->skewBucketNums[hashtable->nSkewBuckets - 1];
+ bucket = hashtable->skewBucket[bucketToRemove];
+
+ /*
+ * Calculate which bucket and batch the tuples belong to in the main
+ * hashtable. They all have the same hash value, so it's the same for all
+ * of them. Also note that it's not possible for nbatch to increase while
+ * we are processing the tuples.
+ */
+ hashvalue = bucket->hashvalue;
+ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
+
+ /* Process all tuples in the bucket */
+ hashTuple = bucket->tuples;
+ while (hashTuple != NULL)
+ {
+ HashJoinTuple nextHashTuple = hashTuple->next.unshared;
+ MinimalTuple tuple;
+ Size tupleSize;
+
+ /*
+ * This code must agree with ExecHashTableInsert. We do not use
+ * ExecHashTableInsert directly as ExecHashTableInsert expects a
+ * TupleTableSlot while we already have HashJoinTuples.
+ */
+ tuple = HJTUPLE_MINTUPLE(hashTuple);
+ tupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
+
+ /* Decide whether to put the tuple in the hash table or a temp file */
+ if (batchno == hashtable->curbatch)
+ {
+ /* Move the tuple to the main hash table */
+ HashJoinTuple copyTuple;
+
+ /*
+ * We must copy the tuple into the dense storage, else it will not
+ * be found by, eg, ExecHashIncreaseNumBatches.
+ */
+ copyTuple = (HashJoinTuple) dense_alloc(hashtable, tupleSize);
+ memcpy(copyTuple, hashTuple, tupleSize);
+ pfree(hashTuple);
+
+ copyTuple->next.unshared = hashtable->buckets.unshared[bucketno];
+ hashtable->buckets.unshared[bucketno] = copyTuple;
+
+ /* We have reduced skew space, but overall space doesn't change */
+ hashtable->spaceUsedSkew -= tupleSize;
+ }
+ else
+ {
+ /* Put the tuple into a temp file for later batches */
+ Assert(batchno > hashtable->curbatch);
+ ExecHashJoinSaveTuple(tuple, hashvalue,
+ &hashtable->innerBatchFile[batchno]);
+ pfree(hashTuple);
+ hashtable->spaceUsed -= tupleSize;
+ hashtable->spaceUsedSkew -= tupleSize;
+ }
+
+ hashTuple = nextHashTuple;
+
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * Free the bucket struct itself and reset the hashtable entry to NULL.
+ *
+ * NOTE: this is not nearly as simple as it looks on the surface, because
+ * of the possibility of collisions in the hashtable. Suppose that hash
+ * values A and B collide at a particular hashtable entry, and that A was
+ * entered first so B gets shifted to a different table entry. If we were
+ * to remove A first then ExecHashGetSkewBucket would mistakenly start
+ * reporting that B is not in the hashtable, because it would hit the NULL
+ * before finding B. However, we always remove entries in the reverse
+ * order of creation, so this failure cannot happen.
+ */
+ hashtable->skewBucket[bucketToRemove] = NULL;
+ hashtable->nSkewBuckets--;
+ pfree(bucket);
+ hashtable->spaceUsed -= SKEW_BUCKET_OVERHEAD;
+ hashtable->spaceUsedSkew -= SKEW_BUCKET_OVERHEAD;
+
+ /*
+ * If we have removed all skew buckets then give up on skew optimization.
+ * Release the arrays since they aren't useful any more.
+ */
+ if (hashtable->nSkewBuckets == 0)
+ {
+ hashtable->skewEnabled = false;
+ pfree(hashtable->skewBucket);
+ pfree(hashtable->skewBucketNums);
+ hashtable->skewBucket = NULL;
+ hashtable->skewBucketNums = NULL;
+ hashtable->spaceUsed -= hashtable->spaceUsedSkew;
+ hashtable->spaceUsedSkew = 0;
+ }
+}
+
+/*
+ * Reserve space in the DSM segment for instrumentation data.
+ */
+void
+ExecHashEstimate(HashState *node, ParallelContext *pcxt)
+{
+ size_t size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation));
+ size = add_size(size, offsetof(SharedHashInfo, hinstrument));
+ shm_toc_estimate_chunk(&pcxt->estimator, size);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/*
+ * Set up a space in the DSM for all workers to record instrumentation data
+ * about their hash table.
+ */
+void
+ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt)
+{
+ size_t size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = offsetof(SharedHashInfo, hinstrument) +
+ pcxt->nworkers * sizeof(HashInstrumentation);
+ node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size);
+
+ /* Each per-worker area must start out as zeroes. */
+ memset(node->shared_info, 0, size);
+
+ node->shared_info->num_workers = pcxt->nworkers;
+ shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id,
+ node->shared_info);
+}
+
+/*
+ * Locate the DSM space for hash table instrumentation data that we'll write
+ * to at shutdown time.
+ */
+void
+ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt)
+{
+ SharedHashInfo *shared_info;
+
+ /* don't need this if not instrumenting */
+ if (!node->ps.instrument)
+ return;
+
+ /*
+ * Find our entry in the shared area, and set up a pointer to it so that
+ * we'll accumulate stats there when shutting down or rebuilding the hash
+ * table.
+ */
+ shared_info = (SharedHashInfo *)
+ shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false);
+ node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber];
+}
+
+/*
+ * Collect EXPLAIN stats if needed, saving them into DSM memory if
+ * ExecHashInitializeWorker was called, or local storage if not. In the
+ * parallel case, this must be done in ExecShutdownHash() rather than
+ * ExecEndHash() because the latter runs after we've detached from the DSM
+ * segment.
+ */
+void
+ExecShutdownHash(HashState *node)
+{
+ /* Allocate save space if EXPLAIN'ing and we didn't do so already */
+ if (node->ps.instrument && !node->hinstrument)
+ node->hinstrument = (HashInstrumentation *)
+ palloc0(sizeof(HashInstrumentation));
+ /* Now accumulate data for the current (final) hash table */
+ if (node->hinstrument && node->hashtable)
+ ExecHashAccumInstrumentation(node->hinstrument, node->hashtable);
+}
+
+/*
+ * Retrieve instrumentation data from workers before the DSM segment is
+ * detached, so that EXPLAIN can access it.
+ */
+void
+ExecHashRetrieveInstrumentation(HashState *node)
+{
+ SharedHashInfo *shared_info = node->shared_info;
+ size_t size;
+
+ if (shared_info == NULL)
+ return;
+
+ /* Replace node->shared_info with a copy in backend-local memory. */
+ size = offsetof(SharedHashInfo, hinstrument) +
+ shared_info->num_workers * sizeof(HashInstrumentation);
+ node->shared_info = palloc(size);
+ memcpy(node->shared_info, shared_info, size);
+}
+
+/*
+ * Accumulate instrumentation data from 'hashtable' into an
+ * initially-zeroed HashInstrumentation struct.
+ *
+ * This is used to merge information across successive hash table instances
+ * within a single plan node. We take the maximum values of each interesting
+ * number. The largest nbuckets and largest nbatch values might have occurred
+ * in different instances, so there's some risk of confusion from reporting
+ * unrelated numbers; but there's a bigger risk of misdiagnosing a performance
+ * issue if we don't report the largest values. Similarly, we want to report
+ * the largest spacePeak regardless of whether it happened in the same
+ * instance as the largest nbuckets or nbatch. All the instances should have
+ * the same nbuckets_original and nbatch_original; but there's little value
+ * in depending on that here, so handle them the same way.
+ */
+void
+ExecHashAccumInstrumentation(HashInstrumentation *instrument,
+ HashJoinTable hashtable)
+{
+ instrument->nbuckets = Max(instrument->nbuckets,
+ hashtable->nbuckets);
+ instrument->nbuckets_original = Max(instrument->nbuckets_original,
+ hashtable->nbuckets_original);
+ instrument->nbatch = Max(instrument->nbatch,
+ hashtable->nbatch);
+ instrument->nbatch_original = Max(instrument->nbatch_original,
+ hashtable->nbatch_original);
+ instrument->space_peak = Max(instrument->space_peak,
+ hashtable->spacePeak);
+}
+
+/*
+ * Allocate 'size' bytes from the currently active HashMemoryChunk
+ */
+static void *
+dense_alloc(HashJoinTable hashtable, Size size)
+{
+ HashMemoryChunk newChunk;
+ char *ptr;
+
+ /* just in case the size is not already aligned properly */
+ size = MAXALIGN(size);
+
+ /*
+ * If tuple size is larger than threshold, allocate a separate chunk.
+ */
+ if (size > HASH_CHUNK_THRESHOLD)
+ {
+ /* allocate new chunk and put it at the beginning of the list */
+ newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
+ HASH_CHUNK_HEADER_SIZE + size);
+ newChunk->maxlen = size;
+ newChunk->used = size;
+ newChunk->ntuples = 1;
+
+ /*
+ * Add this chunk to the list after the first existing chunk, so that
+ * we don't lose the remaining space in the "current" chunk.
+ */
+ if (hashtable->chunks != NULL)
+ {
+ newChunk->next = hashtable->chunks->next;
+ hashtable->chunks->next.unshared = newChunk;
+ }
+ else
+ {
+ newChunk->next.unshared = hashtable->chunks;
+ hashtable->chunks = newChunk;
+ }
+
+ return HASH_CHUNK_DATA(newChunk);
+ }
+
+ /*
+ * See if we have enough space for it in the current chunk (if any). If
+ * not, allocate a fresh chunk.
+ */
+ if ((hashtable->chunks == NULL) ||
+ (hashtable->chunks->maxlen - hashtable->chunks->used) < size)
+ {
+ /* allocate new chunk and put it at the beginning of the list */
+ newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
+ HASH_CHUNK_HEADER_SIZE + HASH_CHUNK_SIZE);
+
+ newChunk->maxlen = HASH_CHUNK_SIZE;
+ newChunk->used = size;
+ newChunk->ntuples = 1;
+
+ newChunk->next.unshared = hashtable->chunks;
+ hashtable->chunks = newChunk;
+
+ return HASH_CHUNK_DATA(newChunk);
+ }
+
+ /* There is enough space in the current chunk, let's add the tuple */
+ ptr = HASH_CHUNK_DATA(hashtable->chunks) + hashtable->chunks->used;
+ hashtable->chunks->used += size;
+ hashtable->chunks->ntuples += 1;
+
+ /* return pointer to the start of the tuple memory */
+ return ptr;
+}
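+
+/*
+ * To illustrate the sizing logic above (assuming the usual hashjoin.h
+ * definitions, where HASH_CHUNK_SIZE is 32 kB and HASH_CHUNK_THRESHOLD is a
+ * quarter of that): a 10 kB tuple exceeds the threshold and is given its own
+ * exactly-sized chunk, while a stream of small tuples is packed into shared
+ * 32 kB chunks until each chunk fills up.
+ */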
+
+/*
+ * Allocate space for a tuple in shared dense storage. This is equivalent to
+ * dense_alloc but for Parallel Hash using shared memory.
+ *
+ * While loading a tuple into shared memory, we might run out of memory and
+ * decide to repartition, or determine that the load factor is too high and
+ * decide to expand the bucket array, or discover that another participant has
+ * commanded us to help do that. Return NULL if the number of buckets or batches
+ * has changed, indicating that the caller must retry (considering the
+ * possibility that the tuple no longer belongs in the same batch).
+ */
+static HashJoinTuple
+ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
+ dsa_pointer *shared)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ dsa_pointer chunk_shared;
+ HashMemoryChunk chunk;
+ Size chunk_size;
+ HashJoinTuple result;
+ int curbatch = hashtable->curbatch;
+
+ size = MAXALIGN(size);
+
+ /*
+ * Fast path: if there is enough space in this backend's current chunk,
+ * then we can allocate without any locking.
+ */
+ chunk = hashtable->current_chunk;
+ if (chunk != NULL &&
+ size <= HASH_CHUNK_THRESHOLD &&
+ chunk->maxlen - chunk->used >= size)
+ {
+ chunk_shared = hashtable->current_chunk_shared;
+ Assert(chunk == dsa_get_address(hashtable->area, chunk_shared));
+ *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE + chunk->used;
+ result = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + chunk->used);
+ chunk->used += size;
+
+ Assert(chunk->used <= chunk->maxlen);
+ Assert(result == dsa_get_address(hashtable->area, *shared));
+
+ return result;
+ }
+
+ /* Slow path: try to allocate a new chunk. */
+ LWLockAcquire(&pstate->lock, LW_EXCLUSIVE);
+
+ /*
+ * Check if we need to help increase the number of buckets or batches.
+ */
+ if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES ||
+ pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
+ {
+ ParallelHashGrowth growth = pstate->growth;
+
+ hashtable->current_chunk = NULL;
+ LWLockRelease(&pstate->lock);
+
+ /* Another participant has commanded us to help grow. */
+ if (growth == PHJ_GROWTH_NEED_MORE_BATCHES)
+ ExecParallelHashIncreaseNumBatches(hashtable);
+ else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
+ ExecParallelHashIncreaseNumBuckets(hashtable);
+
+ /* The caller must retry. */
+ return NULL;
+ }
+
+ /* Oversized tuples get their own chunk. */
+ if (size > HASH_CHUNK_THRESHOLD)
+ chunk_size = size + HASH_CHUNK_HEADER_SIZE;
+ else
+ chunk_size = HASH_CHUNK_SIZE;
+
+ /* Check if it's time to grow batches or buckets. */
+ if (pstate->growth != PHJ_GROWTH_DISABLED)
+ {
+ Assert(curbatch == 0);
+ Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER);
+
+ /*
+ * Check if our space limit would be exceeded. To avoid choking on
+ * very large tuples or very low hash_mem setting, we'll always allow
+ * each backend to allocate at least one chunk.
+ */
+ if (hashtable->batches[0].at_least_one_chunk &&
+ hashtable->batches[0].shared->size +
+ chunk_size > pstate->space_allowed)
+ {
+ pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
+ hashtable->batches[0].shared->space_exhausted = true;
+ LWLockRelease(&pstate->lock);
+
+ return NULL;
+ }
+
+ /* Check if our load factor limit would be exceeded. */
+ if (hashtable->nbatch == 1)
+ {
+ hashtable->batches[0].shared->ntuples += hashtable->batches[0].ntuples;
+ hashtable->batches[0].ntuples = 0;
+ /* Guard against integer overflow and alloc size overflow */
+ if (hashtable->batches[0].shared->ntuples + 1 >
+ hashtable->nbuckets * NTUP_PER_BUCKET &&
+ hashtable->nbuckets < (INT_MAX / 2) &&
+ hashtable->nbuckets * 2 <=
+ MaxAllocSize / sizeof(dsa_pointer_atomic))
+ {
+ pstate->growth = PHJ_GROWTH_NEED_MORE_BUCKETS;
+ LWLockRelease(&pstate->lock);
+
+ return NULL;
+ }
+ }
+ }
+
+ /* We are cleared to allocate a new chunk. */
+ chunk_shared = dsa_allocate(hashtable->area, chunk_size);
+ hashtable->batches[curbatch].shared->size += chunk_size;
+ hashtable->batches[curbatch].at_least_one_chunk = true;
+
+ /* Set up the chunk. */
+ chunk = (HashMemoryChunk) dsa_get_address(hashtable->area, chunk_shared);
+ *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE;
+ chunk->maxlen = chunk_size - HASH_CHUNK_HEADER_SIZE;
+ chunk->used = size;
+
+ /*
+ * Push it onto the list of chunks, so that it can be found if we need to
+ * increase the number of buckets or batches (batch 0 only) and later for
+ * freeing the memory (all batches).
+ */
+ chunk->next.shared = hashtable->batches[curbatch].shared->chunks;
+ hashtable->batches[curbatch].shared->chunks = chunk_shared;
+
+ if (size <= HASH_CHUNK_THRESHOLD)
+ {
+ /*
+ * Make this the current chunk so that we can use the fast path to
+ * fill the rest of it up in future calls.
+ */
+ hashtable->current_chunk = chunk;
+ hashtable->current_chunk_shared = chunk_shared;
+ }
+ LWLockRelease(&pstate->lock);
+
+ Assert(HASH_CHUNK_DATA(chunk) == dsa_get_address(hashtable->area, *shared));
+ result = (HashJoinTuple) HASH_CHUNK_DATA(chunk);
+
+ return result;
+}
+
+/*
+ * One backend needs to set up the shared batch state including tuplestores.
+ * Other backends will ensure they have correctly configured accessors by
+ * calling ExecParallelHashEnsureBatchAccessors().
+ */
+static void
+ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ ParallelHashJoinBatch *batches;
+ MemoryContext oldcxt;
+ int i;
+
+ Assert(hashtable->batches == NULL);
+
+ /* Allocate space. */
+ pstate->batches =
+ dsa_allocate0(hashtable->area,
+ EstimateParallelHashJoinBatch(hashtable) * nbatch);
+ pstate->nbatch = nbatch;
+ batches = dsa_get_address(hashtable->area, pstate->batches);
+
+ /* Use hash join memory context. */
+ oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+
+ /* Allocate this backend's accessor array. */
+ hashtable->nbatch = nbatch;
+ hashtable->batches = (ParallelHashJoinBatchAccessor *)
+ palloc0(sizeof(ParallelHashJoinBatchAccessor) * hashtable->nbatch);
+
+ /* Set up the shared state, tuplestores and backend-local accessors. */
+ for (i = 0; i < hashtable->nbatch; ++i)
+ {
+ ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i];
+ ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i);
+ char name[MAXPGPATH];
+
+ /*
+ * All members of shared were zero-initialized. We just need to set
+ * up the Barrier.
+ */
+ BarrierInit(&shared->batch_barrier, 0);
+ if (i == 0)
+ {
+ /* Batch 0 doesn't need to be loaded. */
+ BarrierAttach(&shared->batch_barrier);
+ while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_PROBING)
+ BarrierArriveAndWait(&shared->batch_barrier, 0);
+ BarrierDetach(&shared->batch_barrier);
+ }
+
+ /* Initialize accessor state. All members were zero-initialized. */
+ accessor->shared = shared;
+
+ /* Initialize the shared tuplestores. */
+ snprintf(name, sizeof(name), "i%dof%d", i, hashtable->nbatch);
+ accessor->inner_tuples =
+ sts_initialize(ParallelHashJoinBatchInner(shared),
+ pstate->nparticipants,
+ ParallelWorkerNumber + 1,
+ sizeof(uint32),
+ SHARED_TUPLESTORE_SINGLE_PASS,
+ &pstate->fileset,
+ name);
+ snprintf(name, sizeof(name), "o%dof%d", i, hashtable->nbatch);
+ accessor->outer_tuples =
+ sts_initialize(ParallelHashJoinBatchOuter(shared,
+ pstate->nparticipants),
+ pstate->nparticipants,
+ ParallelWorkerNumber + 1,
+ sizeof(uint32),
+ SHARED_TUPLESTORE_SINGLE_PASS,
+ &pstate->fileset,
+ name);
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * Free the current set of ParallelHashJoinBatchAccessor objects.
+ */
+static void
+ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable)
+{
+ int i;
+
+ for (i = 0; i < hashtable->nbatch; ++i)
+ {
+ /* Make sure no files are left open. */
+ sts_end_write(hashtable->batches[i].inner_tuples);
+ sts_end_write(hashtable->batches[i].outer_tuples);
+ sts_end_parallel_scan(hashtable->batches[i].inner_tuples);
+ sts_end_parallel_scan(hashtable->batches[i].outer_tuples);
+ }
+ pfree(hashtable->batches);
+ hashtable->batches = NULL;
+}
+
+/*
+ * Make sure this backend has up-to-date accessors for the current set of
+ * batches.
+ */
+static void
+ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ ParallelHashJoinBatch *batches;
+ MemoryContext oldcxt;
+ int i;
+
+ if (hashtable->batches != NULL)
+ {
+ if (hashtable->nbatch == pstate->nbatch)
+ return;
+ ExecParallelHashCloseBatchAccessors(hashtable);
+ }
+
+ /*
+ * It's possible for a backend to start up very late so that the whole
+ * join is finished and the shm state for tracking batches has already
+ * been freed by ExecHashTableDetach(). In that case we'll just leave
+ * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
+ * up early.
+ */
+ if (!DsaPointerIsValid(pstate->batches))
+ return;
+
+ /* Use hash join memory context. */
+ oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+
+ /* Allocate this backend's accessor array. */
+ hashtable->nbatch = pstate->nbatch;
+ hashtable->batches = (ParallelHashJoinBatchAccessor *)
+ palloc0(sizeof(ParallelHashJoinBatchAccessor) * hashtable->nbatch);
+
+ /* Find the base of the pseudo-array of ParallelHashJoinBatch objects. */
+ batches = (ParallelHashJoinBatch *)
+ dsa_get_address(hashtable->area, pstate->batches);
+
+ /* Set up the accessor array and attach to the tuplestores. */
+ for (i = 0; i < hashtable->nbatch; ++i)
+ {
+ ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i];
+ ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i);
+
+ accessor->shared = shared;
+ accessor->preallocated = 0;
+ accessor->done = false;
+ accessor->inner_tuples =
+ sts_attach(ParallelHashJoinBatchInner(shared),
+ ParallelWorkerNumber + 1,
+ &pstate->fileset);
+ accessor->outer_tuples =
+ sts_attach(ParallelHashJoinBatchOuter(shared,
+ pstate->nparticipants),
+ ParallelWorkerNumber + 1,
+ &pstate->fileset);
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * Allocate an empty shared memory hash table for a given batch.
+ */
+void
+ExecParallelHashTableAlloc(HashJoinTable hashtable, int batchno)
+{
+ ParallelHashJoinBatch *batch = hashtable->batches[batchno].shared;
+ dsa_pointer_atomic *buckets;
+ int nbuckets = hashtable->parallel_state->nbuckets;
+ int i;
+
+ batch->buckets =
+ dsa_allocate(hashtable->area, sizeof(dsa_pointer_atomic) * nbuckets);
+ buckets = (dsa_pointer_atomic *)
+ dsa_get_address(hashtable->area, batch->buckets);
+ for (i = 0; i < nbuckets; ++i)
+ dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer);
+}
+
+/*
+ * If we are currently attached to a shared hash join batch, detach. If we
+ * are last to detach, clean up.
+ */
+void
+ExecHashTableDetachBatch(HashJoinTable hashtable)
+{
+ if (hashtable->parallel_state != NULL &&
+ hashtable->curbatch >= 0)
+ {
+ int curbatch = hashtable->curbatch;
+ ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;
+
+ /* Make sure any temporary files are closed. */
+ sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);
+ sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);
+
+ /* Detach from the batch we were last working on. */
+ if (BarrierArriveAndDetach(&batch->batch_barrier))
+ {
+ /*
+ * Technically we shouldn't access the barrier because we're no
+ * longer attached, but since there is no way it's moving after
+ * this point it seems safe to make the following assertion.
+ */
+ Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_DONE);
+
+ /* Free shared chunks and buckets. */
+ while (DsaPointerIsValid(batch->chunks))
+ {
+ HashMemoryChunk chunk =
+ dsa_get_address(hashtable->area, batch->chunks);
+ dsa_pointer next = chunk->next.shared;
+
+ dsa_free(hashtable->area, batch->chunks);
+ batch->chunks = next;
+ }
+ if (DsaPointerIsValid(batch->buckets))
+ {
+ dsa_free(hashtable->area, batch->buckets);
+ batch->buckets = InvalidDsaPointer;
+ }
+ }
+
+ /*
+ * Track the largest batch we've been attached to. Though each
+ * backend might see a different subset of batches, explain.c will
+ * scan the results from all backends to find the largest value.
+ */
+ hashtable->spacePeak =
+ Max(hashtable->spacePeak,
+ batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);
+
+ /* Remember that we are not attached to a batch. */
+ hashtable->curbatch = -1;
+ }
+}
+
+/*
+ * Detach from all shared resources. If we are last to detach, clean up.
+ */
+void
+ExecHashTableDetach(HashJoinTable hashtable)
+{
+ if (hashtable->parallel_state)
+ {
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ int i;
+
+ /* Make sure any temporary files are closed. */
+ if (hashtable->batches)
+ {
+ for (i = 0; i < hashtable->nbatch; ++i)
+ {
+ sts_end_write(hashtable->batches[i].inner_tuples);
+ sts_end_write(hashtable->batches[i].outer_tuples);
+ sts_end_parallel_scan(hashtable->batches[i].inner_tuples);
+ sts_end_parallel_scan(hashtable->batches[i].outer_tuples);
+ }
+ }
+
+ /* If we're last to detach, clean up shared memory. */
+ if (BarrierDetach(&pstate->build_barrier))
+ {
+ if (DsaPointerIsValid(pstate->batches))
+ {
+ dsa_free(hashtable->area, pstate->batches);
+ pstate->batches = InvalidDsaPointer;
+ }
+ }
+
+ hashtable->parallel_state = NULL;
+ }
+}
+
+/*
+ * Get the first tuple in a given bucket identified by number.
+ */
+static inline HashJoinTuple
+ExecParallelHashFirstTuple(HashJoinTable hashtable, int bucketno)
+{
+ HashJoinTuple tuple;
+ dsa_pointer p;
+
+ Assert(hashtable->parallel_state);
+ p = dsa_pointer_atomic_read(&hashtable->buckets.shared[bucketno]);
+ tuple = (HashJoinTuple) dsa_get_address(hashtable->area, p);
+
+ return tuple;
+}
+
+/*
+ * Get the next tuple in the same bucket as 'tuple'.
+ */
+static inline HashJoinTuple
+ExecParallelHashNextTuple(HashJoinTable hashtable, HashJoinTuple tuple)
+{
+ HashJoinTuple next;
+
+ Assert(hashtable->parallel_state);
+ next = (HashJoinTuple) dsa_get_address(hashtable->area, tuple->next.shared);
+
+ return next;
+}
+
+/*
+ * Insert a tuple at the front of a chain of tuples in DSA memory atomically.
+ */
+static inline void
+ExecParallelHashPushTuple(dsa_pointer_atomic *head,
+ HashJoinTuple tuple,
+ dsa_pointer tuple_shared)
+{
+ for (;;)
+ {
+ tuple->next.shared = dsa_pointer_atomic_read(head);
+ if (dsa_pointer_atomic_compare_exchange(head,
+ &tuple->next.shared,
+ tuple_shared))
+ break;
+ }
+}
+
+/*
+ * Prepare to work on a given batch.
+ */
+void
+ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno)
+{
+ Assert(hashtable->batches[batchno].shared->buckets != InvalidDsaPointer);
+
+ hashtable->curbatch = batchno;
+ hashtable->buckets.shared = (dsa_pointer_atomic *)
+ dsa_get_address(hashtable->area,
+ hashtable->batches[batchno].shared->buckets);
+ hashtable->nbuckets = hashtable->parallel_state->nbuckets;
+ hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
+ hashtable->current_chunk = NULL;
+ hashtable->current_chunk_shared = InvalidDsaPointer;
+ hashtable->batches[batchno].at_least_one_chunk = false;
+}
+
+/*
+ * Take the next available chunk from the queue of chunks being worked on in
+ * parallel. Return NULL if there are none left. Otherwise return a pointer
+ * to the chunk, and set *shared to the DSA pointer to the chunk.
+ */
+static HashMemoryChunk
+ExecParallelHashPopChunkQueue(HashJoinTable hashtable, dsa_pointer *shared)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ HashMemoryChunk chunk;
+
+ LWLockAcquire(&pstate->lock, LW_EXCLUSIVE);
+ if (DsaPointerIsValid(pstate->chunk_work_queue))
+ {
+ *shared = pstate->chunk_work_queue;
+ chunk = (HashMemoryChunk)
+ dsa_get_address(hashtable->area, *shared);
+ pstate->chunk_work_queue = chunk->next.shared;
+ }
+ else
+ chunk = NULL;
+ LWLockRelease(&pstate->lock);
+
+ return chunk;
+}
+
+/*
+ * Increase the space preallocated in this backend for a given inner batch by
+ * at least a given amount. This allows us to track whether a given batch
+ * would fit in memory when loaded back in. Also increase the number of
+ * batches or buckets if required.
+ *
+ * This maintains a running estimate of how much space will be taken when we
+ * load the batch back into memory by simulating the way chunks will be handed
+ * out to workers. It's not perfectly accurate because the tuples will be
+ * packed into memory chunks differently by ExecParallelHashTupleAlloc(), but
+ * it should be pretty close. It tends to overestimate by a fraction of a
+ * chunk per worker since all workers gang up to preallocate during hashing,
+ * but workers tend to reload batches alone if there are enough to go around,
+ * leaving fewer partially filled chunks. This effect is bounded by
+ * nparticipants.
+ *
+ * Return false if the number of batches or buckets has changed, and the
+ * caller should reconsider which batch a given tuple now belongs in and call
+ * again.
+ */
+static bool
+ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
+{
+ ParallelHashJoinState *pstate = hashtable->parallel_state;
+ ParallelHashJoinBatchAccessor *batch = &hashtable->batches[batchno];
+ size_t want = Max(size, HASH_CHUNK_SIZE - HASH_CHUNK_HEADER_SIZE);
+
+ Assert(batchno > 0);
+ Assert(batchno < hashtable->nbatch);
+ Assert(size == MAXALIGN(size));
+
+ LWLockAcquire(&pstate->lock, LW_EXCLUSIVE);
+
+ /* Has another participant commanded us to help grow? */
+ if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES ||
+ pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
+ {
+ ParallelHashGrowth growth = pstate->growth;
+
+ LWLockRelease(&pstate->lock);
+ if (growth == PHJ_GROWTH_NEED_MORE_BATCHES)
+ ExecParallelHashIncreaseNumBatches(hashtable);
+ else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
+ ExecParallelHashIncreaseNumBuckets(hashtable);
+
+ return false;
+ }
+
+ if (pstate->growth != PHJ_GROWTH_DISABLED &&
+ batch->at_least_one_chunk &&
+ (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE
+ > pstate->space_allowed))
+ {
+ /*
+ * We have determined that this batch would exceed the space budget if
+ * loaded into memory. Command all participants to help repartition.
+ */
+ batch->shared->space_exhausted = true;
+ pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
+ LWLockRelease(&pstate->lock);
+
+ return false;
+ }
+
+ batch->at_least_one_chunk = true;
+ batch->shared->estimated_size += want + HASH_CHUNK_HEADER_SIZE;
+ batch->preallocated = want;
+ LWLockRelease(&pstate->lock);
+
+ return true;
+}
+
+/*
+ * Calculate the limit on how much memory can be used by Hash and similar
+ * plan types. This is work_mem times hash_mem_multiplier, and is
+ * expressed in bytes.
+ *
+ * Exported for use by the planner, as well as other hash-like executor
+ * nodes. This is a rather random place for this, but there is no better
+ * place.
+ */
+size_t
+get_hash_memory_limit(void)
+{
+ double mem_limit;
+
+ /* Do initial calculation in double arithmetic */
+ mem_limit = (double) work_mem * hash_mem_multiplier * 1024.0;
+
+ /* Clamp in case it doesn't fit in size_t */
+ mem_limit = Min(mem_limit, (double) SIZE_MAX);
+
+ return (size_t) mem_limit;
+}
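+
+/*
+ * For example, with work_mem set to 4096 kB and hash_mem_multiplier set to
+ * 2.0 (values chosen purely for illustration), the limit works out to
+ * 4096.0 * 2.0 * 1024.0 = 8388608 bytes, i.e. 8 MB.
+ */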
+
+/*
+ * Convert the hash memory limit to an integer number of kilobytes,
+ * that is something comparable to work_mem. Like work_mem, we clamp
+ * the result to ensure that multiplying it by 1024 fits in a long int.
+ *
+ * This is deprecated since it may understate the actual memory limit.
+ * It is unused in core and will eventually be removed.
+ */
+int
+get_hash_mem(void)
+{
+ size_t mem_limit = get_hash_memory_limit();
+
+ /* Remove the kilobyte factor */
+ mem_limit /= 1024;
+
+ /* Clamp to MAX_KILOBYTES, like work_mem */
+ mem_limit = Min(mem_limit, (size_t) MAX_KILOBYTES);
+
+ return (int) mem_limit;
+}
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
new file mode 100644
index 0000000..510bdd3
--- /dev/null
+++ b/src/backend/executor/nodeHashjoin.c
@@ -0,0 +1,1551 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeHashjoin.c
+ * Routines to handle hash join nodes
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeHashjoin.c
+ *
+ * PARALLELISM
+ *
+ * Hash joins can participate in parallel query execution in several ways. A
+ * parallel-oblivious hash join is one where the node is unaware that it is
+ * part of a parallel plan. In this case, a copy of the inner plan is used to
+ * build a copy of the hash table in every backend, and the outer plan could
+ * either be built from a partial or complete path, so that the results of the
+ * hash join are correspondingly either partial or complete. A parallel-aware
+ * hash join is one that behaves differently, coordinating work between
+ * backends, and appears as Parallel Hash Join in EXPLAIN output. A Parallel
+ * Hash Join always appears with a Parallel Hash node.
+ *
+ * Parallel-aware hash joins use the same per-backend state machine to track
+ * progress through the hash join algorithm as parallel-oblivious hash joins.
+ * In a parallel-aware hash join, there is also a shared state machine that
+ * co-operating backends use to synchronize their local state machines and
+ * program counters. The shared state machine is managed with a Barrier IPC
+ * primitive. When all attached participants arrive at a barrier, the phase
+ * advances and all waiting participants are released.
+ *
+ * When a participant begins working on a parallel hash join, it must first
+ * figure out how much progress has already been made, because participants
+ * don't wait for each other to begin. For this reason there are switch
+ * statements at key points in the code where we have to synchronize our local
+ * state machine with the phase, and then jump to the correct part of the
+ * algorithm so that we can get started.
+ *
+ * One barrier called build_barrier is used to coordinate the hashing phases.
+ * The phase is represented by an integer which begins at zero and increments
+ * one by one, but in the code it is referred to by symbolic names as follows:
+ *
+ * PHJ_BUILD_ELECTING -- initial state
+ * PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0
+ * PHJ_BUILD_HASHING_INNER -- all hash the inner rel
+ * PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer
+ * PHJ_BUILD_DONE -- building done, probing can begin
+ *
+ * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may
+ * be used repeatedly as required to coordinate expansions in the number of
+ * batches or buckets. Their phases are as follows:
+ *
+ * PHJ_GROW_BATCHES_ELECTING -- initial state
+ * PHJ_GROW_BATCHES_ALLOCATING -- one allocates new batches
+ * PHJ_GROW_BATCHES_REPARTITIONING -- all repartition
+ * PHJ_GROW_BATCHES_FINISHING -- one cleans up, detects skew
+ *
+ * PHJ_GROW_BUCKETS_ELECTING -- initial state
+ * PHJ_GROW_BUCKETS_ALLOCATING -- one allocates new buckets
+ * PHJ_GROW_BUCKETS_REINSERTING -- all insert tuples
+ *
+ * If the planner got the number of batches and buckets right, those won't be
+ * necessary, but on the other hand we might end up needing to expand the
+ * buckets or batches multiple times while hashing the inner relation to stay
+ * within our memory budget and load factor target. For that reason it's a
+ * separate pair of barriers using circular phases.
+ *
+ * The PHJ_BUILD_HASHING_OUTER phase is required only for multi-batch joins,
+ * because we need to divide the outer relation into batches up front in order
+ * to be able to process batches entirely independently. In contrast, the
+ * parallel-oblivious algorithm simply throws tuples 'forward' to 'later'
+ * batches whenever it encounters them while scanning and probing, which it
+ * can do because it processes batches in serial order.
+ *
+ * Once PHJ_BUILD_DONE is reached, backends then split up and process
+ * different batches, or gang up and work together on probing batches if there
+ * aren't enough to go around. For each batch there is a separate barrier
+ * with the following phases:
+ *
+ * PHJ_BATCH_ELECTING -- initial state
+ * PHJ_BATCH_ALLOCATING -- one allocates buckets
+ * PHJ_BATCH_LOADING -- all load the hash table from disk
+ * PHJ_BATCH_PROBING -- all probe
+ * PHJ_BATCH_DONE -- end
+ *
+ * Batch 0 is a special case, because it starts out in phase
+ * PHJ_BATCH_PROBING; populating batch 0's hash table is done during
+ * PHJ_BUILD_HASHING_INNER so we can skip loading.
+ *
+ * Initially we try to plan for a single-batch hash join using the combined
+ * hash_mem of all participants to create a large shared hash table. If that
+ * turns out either at planning or execution time to be impossible then we
+ * fall back to regular hash_mem sized hash tables.
+ *
+ * To avoid deadlocks, we never wait for any barrier unless it is known that
+ * all other backends attached to it are actively executing the node or have
+ * already arrived. Practically, that means that we never return a tuple
+ * while attached to a barrier, unless the barrier has reached its final
+ * state. In the slightly special case of the per-batch barrier, we return
+ * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use
+ * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/parallel.h"
+#include "executor/executor.h"
+#include "executor/hashjoin.h"
+#include "executor/nodeHash.h"
+#include "executor/nodeHashjoin.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/memutils.h"
+#include "utils/sharedtuplestore.h"
+
+
+/*
+ * States of the ExecHashJoin state machine
+ */
+#define HJ_BUILD_HASHTABLE 1
+#define HJ_NEED_NEW_OUTER 2
+#define HJ_SCAN_BUCKET 3
+#define HJ_FILL_OUTER_TUPLE 4
+#define HJ_FILL_INNER_TUPLES 5
+#define HJ_NEED_NEW_BATCH 6
+
+/* Returns true if doing null-fill on outer relation */
+#define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL)
+/* Returns true if doing null-fill on inner relation */
+#define HJ_FILL_INNER(hjstate) ((hjstate)->hj_NullOuterTupleSlot != NULL)
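+
+/*
+ * For example, ExecInitHashJoin sets hj_NullInnerTupleSlot for LEFT and ANTI
+ * joins, making HJ_FILL_OUTER() true for them, whereas hj_NullOuterTupleSlot
+ * (and hence HJ_FILL_INNER()) is set up only for right/full joins.
+ */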
+
+static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
+ HashJoinState *hjstate,
+ uint32 *hashvalue);
+static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
+ HashJoinState *hjstate,
+ uint32 *hashvalue);
+static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
+ BufFile *file,
+ uint32 *hashvalue,
+ TupleTableSlot *tupleSlot);
+static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
+static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate);
+static void ExecParallelHashJoinPartitionOuter(HashJoinState *node);
+
+
+/* ----------------------------------------------------------------
+ * ExecHashJoinImpl
+ *
+ * This function implements the Hybrid Hashjoin algorithm. It is marked
+ * with an always-inline attribute so that ExecHashJoin() and
+ * ExecParallelHashJoin() can inline it. Compilers that respect the
+ * attribute should create versions specialized for parallel == true and
+ * parallel == false with unnecessary branches removed.
+ *
+ * Note: the relation we build the hash table on is the "inner" one;
+ * the other is the "outer" relation.
+ * ----------------------------------------------------------------
+ */
+static pg_attribute_always_inline TupleTableSlot *
+ExecHashJoinImpl(PlanState *pstate, bool parallel)
+{
+ HashJoinState *node = castNode(HashJoinState, pstate);
+ PlanState *outerNode;
+ HashState *hashNode;
+ ExprState *joinqual;
+ ExprState *otherqual;
+ ExprContext *econtext;
+ HashJoinTable hashtable;
+ TupleTableSlot *outerTupleSlot;
+ uint32 hashvalue;
+ int batchno;
+ ParallelHashJoinState *parallel_state;
+
+ /*
+ * get information from HashJoin node
+ */
+ joinqual = node->js.joinqual;
+ otherqual = node->js.ps.qual;
+ hashNode = (HashState *) innerPlanState(node);
+ outerNode = outerPlanState(node);
+ hashtable = node->hj_HashTable;
+ econtext = node->js.ps.ps_ExprContext;
+ parallel_state = hashNode->parallel_state;
+
+ /*
+ * Reset per-tuple memory context to free any expression evaluation
+ * storage allocated in the previous tuple cycle.
+ */
+ ResetExprContext(econtext);
+
+ /*
+ * run the hash join state machine
+ */
+ for (;;)
+ {
+ /*
+ * It's possible to iterate this loop many times before returning a
+ * tuple, in some pathological cases such as needing to move much of
+ * the current batch to a later batch. So let's check for interrupts
+ * each time through.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ switch (node->hj_JoinState)
+ {
+ case HJ_BUILD_HASHTABLE:
+
+ /*
+ * First time through: build hash table for inner relation.
+ */
+ Assert(hashtable == NULL);
+
+ /*
+ * If the outer relation is completely empty, and it's not a
+ * right/full join, we can quit without building the hash
+ * table. However, for an inner join it is only a win to
+ * check this when the outer relation's startup cost is less
+ * than the projected cost of building the hash table.
+ * Otherwise it's best to build the hash table first and see
+ * if the inner relation is empty. (When it's a left join, we
+ * should always make this check, since we aren't going to be
+ * able to skip the join on the strength of an empty inner
+ * relation anyway.)
+ *
+ * If we are rescanning the join, we make use of information
+ * gained on the previous scan: don't bother to try the
+ * prefetch if the previous scan found the outer relation
+ * nonempty. This is not 100% reliable since with new
+ * parameters the outer relation might yield different
+ * results, but it's a good heuristic.
+ *
+ * The only way to make the check is to try to fetch a tuple
+ * from the outer plan node. If we succeed, we have to stash
+ * it away for later consumption by ExecHashJoinOuterGetTuple.
+ */
+ if (HJ_FILL_INNER(node))
+ {
+ /* no chance to not build the hash table */
+ node->hj_FirstOuterTupleSlot = NULL;
+ }
+ else if (parallel)
+ {
+ /*
+ * The empty-outer optimization is not implemented for
+ * shared hash tables, because no one participant can
+ * determine that there are no outer tuples, and it's not
+ * yet clear that it's worth the synchronization overhead
+ * of reaching consensus to figure that out. So we have
+ * to build the hash table.
+ */
+ node->hj_FirstOuterTupleSlot = NULL;
+ }
+ else if (HJ_FILL_OUTER(node) ||
+ (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
+ !node->hj_OuterNotEmpty))
+ {
+ node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
+ if (TupIsNull(node->hj_FirstOuterTupleSlot))
+ {
+ node->hj_OuterNotEmpty = false;
+ return NULL;
+ }
+ else
+ node->hj_OuterNotEmpty = true;
+ }
+ else
+ node->hj_FirstOuterTupleSlot = NULL;
+
+ /*
+ * Create the hash table. If using Parallel Hash, then
+ * whoever gets here first will create the hash table and any
+ * later arrivals will merely attach to it.
+ */
+ hashtable = ExecHashTableCreate(hashNode,
+ node->hj_HashOperators,
+ node->hj_Collations,
+ HJ_FILL_INNER(node));
+ node->hj_HashTable = hashtable;
+
+ /*
+ * Execute the Hash node, to build the hash table. If using
+ * Parallel Hash, then we'll try to help hashing unless we
+ * arrived too late.
+ */
+ hashNode->hashtable = hashtable;
+ (void) MultiExecProcNode((PlanState *) hashNode);
+
+ /*
+ * If the inner relation is completely empty, and we're not
+ * doing a left outer join, we can quit without scanning the
+ * outer relation.
+ */
+ if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
+ return NULL;
+
+ /*
+ * need to remember whether nbatch has increased since we
+ * began scanning the outer relation
+ */
+ hashtable->nbatch_outstart = hashtable->nbatch;
+
+ /*
+ * Reset OuterNotEmpty for scan. (It's OK if we fetched a
+ * tuple above, because ExecHashJoinOuterGetTuple will
+ * immediately set it again.)
+ */
+ node->hj_OuterNotEmpty = false;
+
+ if (parallel)
+ {
+ Barrier *build_barrier;
+
+ build_barrier = &parallel_state->build_barrier;
+ Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
+ BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
+ if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER)
+ {
+ /*
+ * If multi-batch, we need to hash the outer relation
+ * up front.
+ */
+ if (hashtable->nbatch > 1)
+ ExecParallelHashJoinPartitionOuter(node);
+ BarrierArriveAndWait(build_barrier,
+ WAIT_EVENT_HASH_BUILD_HASH_OUTER);
+ }
+ Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
+
+ /* Each backend should now select a batch to work on. */
+ hashtable->curbatch = -1;
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
+
+ continue;
+ }
+ else
+ node->hj_JoinState = HJ_NEED_NEW_OUTER;
+
+ /* FALL THRU */
+
+ case HJ_NEED_NEW_OUTER:
+
+ /*
+ * We don't have an outer tuple, try to get the next one
+ */
+ if (parallel)
+ outerTupleSlot =
+ ExecParallelHashJoinOuterGetTuple(outerNode, node,
+ &hashvalue);
+ else
+ outerTupleSlot =
+ ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue);
+
+ if (TupIsNull(outerTupleSlot))
+ {
+ /* end of batch, or maybe whole join */
+ if (HJ_FILL_INNER(node))
+ {
+ /* set up to scan for unmatched inner tuples */
+ ExecPrepHashTableForUnmatched(node);
+ node->hj_JoinState = HJ_FILL_INNER_TUPLES;
+ }
+ else
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ continue;
+ }
+
+ econtext->ecxt_outertuple = outerTupleSlot;
+ node->hj_MatchedOuter = false;
+
+ /*
+ * Find the corresponding bucket for this tuple in the main
+ * hash table or skew hash table.
+ */
+ node->hj_CurHashValue = hashvalue;
+ ExecHashGetBucketAndBatch(hashtable, hashvalue,
+ &node->hj_CurBucketNo, &batchno);
+ node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
+ hashvalue);
+ node->hj_CurTuple = NULL;
+
+ /*
+ * The tuple might not belong to the current batch (where
+ * "current batch" includes the skew buckets if any).
+ */
+ if (batchno != hashtable->curbatch &&
+ node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
+ {
+ bool shouldFree;
+ MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot,
+ &shouldFree);
+
+ /*
+ * Need to postpone this outer tuple to a later batch.
+ * Save it in the corresponding outer-batch file.
+ */
+ Assert(parallel_state == NULL);
+ Assert(batchno > hashtable->curbatch);
+ ExecHashJoinSaveTuple(mintuple, hashvalue,
+ &hashtable->outerBatchFile[batchno]);
+
+ if (shouldFree)
+ heap_free_minimal_tuple(mintuple);
+
+ /* Loop around, staying in HJ_NEED_NEW_OUTER state */
+ continue;
+ }
+
+ /* OK, let's scan the bucket for matches */
+ node->hj_JoinState = HJ_SCAN_BUCKET;
+
+ /* FALL THRU */
+
+ case HJ_SCAN_BUCKET:
+
+ /*
+ * Scan the selected hash bucket for matches to current outer
+ */
+ if (parallel)
+ {
+ if (!ExecParallelScanHashBucket(node, econtext))
+ {
+ /* out of matches; check for possible outer-join fill */
+ node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
+ continue;
+ }
+ }
+ else
+ {
+ if (!ExecScanHashBucket(node, econtext))
+ {
+ /* out of matches; check for possible outer-join fill */
+ node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
+ continue;
+ }
+ }
+
+ /*
+ * We've got a match, but still need to test non-hashed quals.
+ * ExecScanHashBucket already set up all the state needed to
+ * call ExecQual.
+ *
+ * If we pass the qual, then save state for next call and have
+ * ExecProject form the projection, store it in the tuple
+ * table, and return the slot.
+ *
+ * Only the joinquals determine tuple match status, but all
+ * quals must pass to actually return the tuple.
+ */
+ if (joinqual == NULL || ExecQual(joinqual, econtext))
+ {
+ node->hj_MatchedOuter = true;
+
+ if (parallel)
+ {
+ /*
+ * Full/right outer joins are currently not supported
+ * for parallel joins, so we don't need to set the
+ * match bit. Experiments show that it's worth
+ * avoiding the shared memory traffic on large
+ * systems.
+ */
+ Assert(!HJ_FILL_INNER(node));
+ }
+ else
+ {
+ /*
+ * This is really only needed if HJ_FILL_INNER(node),
+ * but we'll avoid the branch and just set it always.
+ */
+ HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
+ }
+
+ /* In an antijoin, we never return a matched tuple */
+ if (node->js.jointype == JOIN_ANTI)
+ {
+ node->hj_JoinState = HJ_NEED_NEW_OUTER;
+ continue;
+ }
+
+ /*
+ * If we only need to join to the first matching inner
+ * tuple, then consider returning this one, but after that
+ * continue with next outer tuple.
+ */
+ if (node->js.single_match)
+ node->hj_JoinState = HJ_NEED_NEW_OUTER;
+
+ if (otherqual == NULL || ExecQual(otherqual, econtext))
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ else
+ InstrCountFiltered2(node, 1);
+ }
+ else
+ InstrCountFiltered1(node, 1);
+ break;
+
+ case HJ_FILL_OUTER_TUPLE:
+
+ /*
+ * The current outer tuple has run out of matches, so check
+ * whether to emit a dummy outer-join tuple. Whether we emit
+ * one or not, the next state is NEED_NEW_OUTER.
+ */
+ node->hj_JoinState = HJ_NEED_NEW_OUTER;
+
+ if (!node->hj_MatchedOuter &&
+ HJ_FILL_OUTER(node))
+ {
+ /*
+ * Generate a fake join tuple with nulls for the inner
+ * tuple, and return it if it passes the non-join quals.
+ */
+ econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
+
+ if (otherqual == NULL || ExecQual(otherqual, econtext))
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ else
+ InstrCountFiltered2(node, 1);
+ }
+ break;
+
+ case HJ_FILL_INNER_TUPLES:
+
+ /*
+ * We have finished a batch, but we are doing right/full join,
+ * so any unmatched inner tuples in the hashtable have to be
+ * emitted before we continue to the next batch.
+ */
+ if (!ExecScanHashTableForUnmatched(node, econtext))
+ {
+ /* no more unmatched tuples */
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ continue;
+ }
+
+ /*
+ * Generate a fake join tuple with nulls for the outer tuple,
+ * and return it if it passes the non-join quals.
+ */
+ econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
+
+ if (otherqual == NULL || ExecQual(otherqual, econtext))
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ else
+ InstrCountFiltered2(node, 1);
+ break;
+
+ case HJ_NEED_NEW_BATCH:
+
+ /*
+ * Try to advance to next batch. Done if there are no more.
+ */
+ if (parallel)
+ {
+ if (!ExecParallelHashJoinNewBatch(node))
+ return NULL; /* end of parallel-aware join */
+ }
+ else
+ {
+ if (!ExecHashJoinNewBatch(node))
+ return NULL; /* end of parallel-oblivious join */
+ }
+ node->hj_JoinState = HJ_NEED_NEW_OUTER;
+ break;
+
+ default:
+ elog(ERROR, "unrecognized hashjoin state: %d",
+ (int) node->hj_JoinState);
+ }
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecHashJoin
+ *
+ * Parallel-oblivious version.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot * /* return: a tuple or NULL */
+ExecHashJoin(PlanState *pstate)
+{
+ /*
+ * On sufficiently smart compilers this should be inlined with the
+ * parallel-aware branches removed.
+ */
+ return ExecHashJoinImpl(pstate, false);
+}
+
+/* ----------------------------------------------------------------
+ * ExecParallelHashJoin
+ *
+ * Parallel-aware version.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot * /* return: a tuple or NULL */
+ExecParallelHashJoin(PlanState *pstate)
+{
+ /*
+ * On sufficiently smart compilers this should be inlined with the
+ * parallel-oblivious branches removed.
+ */
+ return ExecHashJoinImpl(pstate, true);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitHashJoin
+ *
+ * Init routine for HashJoin node.
+ * ----------------------------------------------------------------
+ */
+HashJoinState *
+ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
+{
+ HashJoinState *hjstate;
+ Plan *outerNode;
+ Hash *hashNode;
+ TupleDesc outerDesc,
+ innerDesc;
+ const TupleTableSlotOps *ops;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ hjstate = makeNode(HashJoinState);
+ hjstate->js.ps.plan = (Plan *) node;
+ hjstate->js.ps.state = estate;
+
+ /*
+ * See ExecHashJoinInitializeDSM() and ExecHashJoinInitializeWorker()
+ * where this function may be replaced with a parallel version, if we
+ * managed to launch a parallel query.
+ */
+ hjstate->js.ps.ExecProcNode = ExecHashJoin;
+ hjstate->js.jointype = node->join.jointype;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &hjstate->js.ps);
+
+ /*
+ * initialize child nodes
+ *
+ * Note: we could suppress the REWIND flag for the inner input, which
+ * would amount to betting that the hash will be a single batch. Not
+ * clear if this would be a win or not.
+ */
+ outerNode = outerPlan(node);
+ hashNode = (Hash *) innerPlan(node);
+
+ outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
+ outerDesc = ExecGetResultType(outerPlanState(hjstate));
+ innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
+ innerDesc = ExecGetResultType(innerPlanState(hjstate));
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTupleSlotTL(&hjstate->js.ps, &TTSOpsVirtual);
+ ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
+
+ /*
+ * tuple table initialization
+ */
+ ops = ExecGetResultSlotOps(outerPlanState(hjstate), NULL);
+ hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate, outerDesc,
+ ops);
+
+ /*
+ * detect whether we need only consider the first matching inner tuple
+ */
+ hjstate->js.single_match = (node->join.inner_unique ||
+ node->join.jointype == JOIN_SEMI);
+
+ /* set up null tuples for outer joins, if needed */
+ switch (node->join.jointype)
+ {
+ case JOIN_INNER:
+ case JOIN_SEMI:
+ break;
+ case JOIN_LEFT:
+ case JOIN_ANTI:
+ hjstate->hj_NullInnerTupleSlot =
+ ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);
+ break;
+ case JOIN_RIGHT:
+ hjstate->hj_NullOuterTupleSlot =
+ ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual);
+ break;
+ case JOIN_FULL:
+ hjstate->hj_NullOuterTupleSlot =
+ ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual);
+ hjstate->hj_NullInnerTupleSlot =
+ ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);
+ break;
+ default:
+ elog(ERROR, "unrecognized join type: %d",
+ (int) node->join.jointype);
+ }
+
+ /*
+ * now for some voodoo. our temporary tuple slot is actually the result
+ * tuple slot of the Hash node (which is our inner plan). we can do this
+ * because Hash nodes don't return tuples via ExecProcNode() -- instead
+ * the hash join node uses ExecScanHashBucket() to get at the contents of
+ * the hash table. -cim 6/9/91
+ */
+ {
+ HashState *hashstate = (HashState *) innerPlanState(hjstate);
+ TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
+
+ hjstate->hj_HashTupleSlot = slot;
+ }
+
+ /*
+ * initialize child expressions
+ */
+ hjstate->js.ps.qual =
+ ExecInitQual(node->join.plan.qual, (PlanState *) hjstate);
+ hjstate->js.joinqual =
+ ExecInitQual(node->join.joinqual, (PlanState *) hjstate);
+ hjstate->hashclauses =
+ ExecInitQual(node->hashclauses, (PlanState *) hjstate);
+
+ /*
+ * initialize hash-specific info
+ */
+ hjstate->hj_HashTable = NULL;
+ hjstate->hj_FirstOuterTupleSlot = NULL;
+
+ hjstate->hj_CurHashValue = 0;
+ hjstate->hj_CurBucketNo = 0;
+ hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
+ hjstate->hj_CurTuple = NULL;
+
+ hjstate->hj_OuterHashKeys = ExecInitExprList(node->hashkeys,
+ (PlanState *) hjstate);
+ hjstate->hj_HashOperators = node->hashoperators;
+ hjstate->hj_Collations = node->hashcollations;
+
+ hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
+ hjstate->hj_MatchedOuter = false;
+ hjstate->hj_OuterNotEmpty = false;
+
+ return hjstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndHashJoin
+ *
+ * clean up routine for HashJoin node
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndHashJoin(HashJoinState *node)
+{
+ /*
+ * Free hash table
+ */
+ if (node->hj_HashTable)
+ {
+ ExecHashTableDestroy(node->hj_HashTable);
+ node->hj_HashTable = NULL;
+ }
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->js.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->hj_OuterTupleSlot);
+ ExecClearTuple(node->hj_HashTupleSlot);
+
+ /*
+ * clean up subtrees
+ */
+ ExecEndNode(outerPlanState(node));
+ ExecEndNode(innerPlanState(node));
+}
+
+/*
+ * ExecHashJoinOuterGetTuple
+ *
+ * get the next outer tuple for a parallel-oblivious hashjoin: either by
+ * executing the outer plan node in the first pass, or from the temp
+ * files for the hashjoin batches.
+ *
+ * Returns a null slot if no more outer tuples (within the current batch).
+ *
+ * On success, the tuple's hash value is stored at *hashvalue --- this is
+ * either originally computed, or re-read from the temp file.
+ */
+static TupleTableSlot *
+ExecHashJoinOuterGetTuple(PlanState *outerNode,
+ HashJoinState *hjstate,
+ uint32 *hashvalue)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ int curbatch = hashtable->curbatch;
+ TupleTableSlot *slot;
+
+ if (curbatch == 0) /* if it is the first pass */
+ {
+ /*
+ * Check to see if first outer tuple was already fetched by
+ * ExecHashJoin() and not used yet.
+ */
+ slot = hjstate->hj_FirstOuterTupleSlot;
+ if (!TupIsNull(slot))
+ hjstate->hj_FirstOuterTupleSlot = NULL;
+ else
+ slot = ExecProcNode(outerNode);
+
+ while (!TupIsNull(slot))
+ {
+ /*
+ * We have to compute the tuple's hash value.
+ */
+ ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
+
+ econtext->ecxt_outertuple = slot;
+ if (ExecHashGetHashValue(hashtable, econtext,
+ hjstate->hj_OuterHashKeys,
+ true, /* outer tuple */
+ HJ_FILL_OUTER(hjstate),
+ hashvalue))
+ {
+ /* remember outer relation is not empty for possible rescan */
+ hjstate->hj_OuterNotEmpty = true;
+
+ return slot;
+ }
+
+ /*
+ * That tuple couldn't match because of a NULL, so discard it and
+ * continue with the next one.
+ */
+ slot = ExecProcNode(outerNode);
+ }
+ }
+ else if (curbatch < hashtable->nbatch)
+ {
+ BufFile *file = hashtable->outerBatchFile[curbatch];
+
+ /*
+ * In outer-join cases, we could get here even though the batch file
+ * is empty.
+ */
+ if (file == NULL)
+ return NULL;
+
+ slot = ExecHashJoinGetSavedTuple(hjstate,
+ file,
+ hashvalue,
+ hjstate->hj_OuterTupleSlot);
+ if (!TupIsNull(slot))
+ return slot;
+ }
+
+ /* End of this batch */
+ return NULL;
+}
+
+/*
+ * ExecHashJoinOuterGetTuple variant for the parallel case.
+ */
+static TupleTableSlot *
+ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
+ HashJoinState *hjstate,
+ uint32 *hashvalue)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ int curbatch = hashtable->curbatch;
+ TupleTableSlot *slot;
+
+ /*
+ * In the Parallel Hash case we only run the outer plan directly for
+ * single-batch hash joins. Otherwise we have to go to batch files, even
+ * for batch 0.
+ */
+ if (curbatch == 0 && hashtable->nbatch == 1)
+ {
+ slot = ExecProcNode(outerNode);
+
+ while (!TupIsNull(slot))
+ {
+ ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
+
+ econtext->ecxt_outertuple = slot;
+ if (ExecHashGetHashValue(hashtable, econtext,
+ hjstate->hj_OuterHashKeys,
+ true, /* outer tuple */
+ HJ_FILL_OUTER(hjstate),
+ hashvalue))
+ return slot;
+
+ /*
+ * That tuple couldn't match because of a NULL, so discard it and
+ * continue with the next one.
+ */
+ slot = ExecProcNode(outerNode);
+ }
+ }
+ else if (curbatch < hashtable->nbatch)
+ {
+ MinimalTuple tuple;
+
+ tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples,
+ hashvalue);
+ if (tuple != NULL)
+ {
+ ExecForceStoreMinimalTuple(tuple,
+ hjstate->hj_OuterTupleSlot,
+ false);
+ slot = hjstate->hj_OuterTupleSlot;
+ return slot;
+ }
+ else
+ ExecClearTuple(hjstate->hj_OuterTupleSlot);
+ }
+
+ /* End of this batch */
+ return NULL;
+}
+
+/*
+ * ExecHashJoinNewBatch
+ * switch to a new hashjoin batch
+ *
+ * Returns true if successful, false if there are no more batches.
+ */
+static bool
+ExecHashJoinNewBatch(HashJoinState *hjstate)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ int nbatch;
+ int curbatch;
+ BufFile *innerFile;
+ TupleTableSlot *slot;
+ uint32 hashvalue;
+
+ nbatch = hashtable->nbatch;
+ curbatch = hashtable->curbatch;
+
+ if (curbatch > 0)
+ {
+ /*
+ * We no longer need the previous outer batch file; close it right
+ * away to free disk space.
+ */
+ if (hashtable->outerBatchFile[curbatch])
+ BufFileClose(hashtable->outerBatchFile[curbatch]);
+ hashtable->outerBatchFile[curbatch] = NULL;
+ }
+ else /* we just finished the first batch */
+ {
+ /*
+ * Reset some of the skew optimization state variables, since we no
+ * longer need to consider skew tuples after the first batch. The
+ * memory context reset we are about to do will release the skew
+ * hashtable itself.
+ */
+ hashtable->skewEnabled = false;
+ hashtable->skewBucket = NULL;
+ hashtable->skewBucketNums = NULL;
+ hashtable->nSkewBuckets = 0;
+ hashtable->spaceUsedSkew = 0;
+ }
+
+ /*
+ * We can always skip over any batches that are completely empty on both
+ * sides. We can sometimes skip over batches that are empty on only one
+ * side, but there are exceptions:
+ *
+ * 1. In a left/full outer join, we have to process outer batches even if
+ * the inner batch is empty. Similarly, in a right/full outer join, we
+ * have to process inner batches even if the outer batch is empty.
+ *
+ * 2. If we have increased nbatch since the initial estimate, we have to
+ * scan inner batches since they might contain tuples that need to be
+ * reassigned to later inner batches.
+ *
+ * 3. Similarly, if we have increased nbatch since starting the outer
+ * scan, we have to rescan outer batches in case they contain tuples that
+ * need to be reassigned.
+ */
+ curbatch++;
+ while (curbatch < nbatch &&
+ (hashtable->outerBatchFile[curbatch] == NULL ||
+ hashtable->innerBatchFile[curbatch] == NULL))
+ {
+ if (hashtable->outerBatchFile[curbatch] &&
+ HJ_FILL_OUTER(hjstate))
+ break; /* must process due to rule 1 */
+ if (hashtable->innerBatchFile[curbatch] &&
+ HJ_FILL_INNER(hjstate))
+ break; /* must process due to rule 1 */
+ if (hashtable->innerBatchFile[curbatch] &&
+ nbatch != hashtable->nbatch_original)
+ break; /* must process due to rule 2 */
+ if (hashtable->outerBatchFile[curbatch] &&
+ nbatch != hashtable->nbatch_outstart)
+ break; /* must process due to rule 3 */
+ /* We can ignore this batch. */
+ /* Release associated temp files right away. */
+ if (hashtable->innerBatchFile[curbatch])
+ BufFileClose(hashtable->innerBatchFile[curbatch]);
+ hashtable->innerBatchFile[curbatch] = NULL;
+ if (hashtable->outerBatchFile[curbatch])
+ BufFileClose(hashtable->outerBatchFile[curbatch]);
+ hashtable->outerBatchFile[curbatch] = NULL;
+ curbatch++;
+ }
+
+ if (curbatch >= nbatch)
+ return false; /* no more batches */
+
+ hashtable->curbatch = curbatch;
+
+ /*
+ * Reload the hash table with the new inner batch (which could be empty)
+ */
+ ExecHashTableReset(hashtable);
+
+ innerFile = hashtable->innerBatchFile[curbatch];
+
+ if (innerFile != NULL)
+ {
+ if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file")));
+
+ while ((slot = ExecHashJoinGetSavedTuple(hjstate,
+ innerFile,
+ &hashvalue,
+ hjstate->hj_HashTupleSlot)))
+ {
+ /*
+ * NOTE: some tuples may be sent to future batches. Also, it is
+ * possible for hashtable->nbatch to be increased here!
+ */
+ ExecHashTableInsert(hashtable, slot, hashvalue);
+ }
+
+ /*
+ * after we build the hash table, the inner batch file is no longer
+ * needed
+ */
+ BufFileClose(innerFile);
+ hashtable->innerBatchFile[curbatch] = NULL;
+ }
+
+ /*
+ * Rewind outer batch file (if present), so that we can start reading it.
+ */
+ if (hashtable->outerBatchFile[curbatch] != NULL)
+ {
+ if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rewind hash-join temporary file")));
+ }
+
+ return true;
+}
+
+/*
+ * Choose a batch to work on, and attach to it. Returns true if successful,
+ * false if there are no more batches.
+ */
+static bool
+ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
+{
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ int start_batchno;
+ int batchno;
+
+ /*
+ * If we started up so late that the batch tracking array has been freed
+ * already by ExecHashTableDetach(), then we are finished. See also
+ * ExecParallelHashEnsureBatchAccessors().
+ */
+ if (hashtable->batches == NULL)
+ return false;
+
+ /*
+ * If we were already attached to a batch, remember not to bother checking
+ * it again, and detach from it (possibly freeing the hash table if we are
+ * last to detach).
+ */
+ if (hashtable->curbatch >= 0)
+ {
+ hashtable->batches[hashtable->curbatch].done = true;
+ ExecHashTableDetachBatch(hashtable);
+ }
+
+ /*
+ * Search for a batch that isn't done. We use an atomic counter to start
+ * our search at a different batch in every participant when there are
+ * more batches than participants.
+ */
+ batchno = start_batchno =
+ pg_atomic_fetch_add_u32(&hashtable->parallel_state->distributor, 1) %
+ hashtable->nbatch;
+ do
+ {
+ uint32 hashvalue;
+ MinimalTuple tuple;
+ TupleTableSlot *slot;
+
+ if (!hashtable->batches[batchno].done)
+ {
+ SharedTuplestoreAccessor *inner_tuples;
+ Barrier *batch_barrier =
+ &hashtable->batches[batchno].shared->batch_barrier;
+
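+			/*
+			 * The batch barrier advances through PHJ_BATCH_ELECTING,
+			 * ALLOCATING, LOADING, PROBING and finally DONE.  Whichever
+			 * phase we attach in, the fall-throughs below let us help with
+			 * (or wait out) the remaining build steps before probing, or
+			 * skip the batch entirely if it is already done.
+			 */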
+ switch (BarrierAttach(batch_barrier))
+ {
+ case PHJ_BATCH_ELECTING:
+
+ /* One backend allocates the hash table. */
+ if (BarrierArriveAndWait(batch_barrier,
+ WAIT_EVENT_HASH_BATCH_ELECT))
+ ExecParallelHashTableAlloc(hashtable, batchno);
+ /* Fall through. */
+
+ case PHJ_BATCH_ALLOCATING:
+ /* Wait for allocation to complete. */
+ BarrierArriveAndWait(batch_barrier,
+ WAIT_EVENT_HASH_BATCH_ALLOCATE);
+ /* Fall through. */
+
+ case PHJ_BATCH_LOADING:
+ /* Start (or join in) loading tuples. */
+ ExecParallelHashTableSetCurrentBatch(hashtable, batchno);
+ inner_tuples = hashtable->batches[batchno].inner_tuples;
+ sts_begin_parallel_scan(inner_tuples);
+ while ((tuple = sts_parallel_scan_next(inner_tuples,
+ &hashvalue)))
+ {
+ ExecForceStoreMinimalTuple(tuple,
+ hjstate->hj_HashTupleSlot,
+ false);
+ slot = hjstate->hj_HashTupleSlot;
+ ExecParallelHashTableInsertCurrentBatch(hashtable, slot,
+ hashvalue);
+ }
+ sts_end_parallel_scan(inner_tuples);
+ BarrierArriveAndWait(batch_barrier,
+ WAIT_EVENT_HASH_BATCH_LOAD);
+ /* Fall through. */
+
+ case PHJ_BATCH_PROBING:
+
+ /*
+ * This batch is ready to probe. Return control to
+ * caller. We stay attached to batch_barrier so that the
+ * hash table stays alive until everyone's finished
+ * probing it, but no participant is allowed to wait at
+ * this barrier again (or else a deadlock could occur).
+ * All attached participants must eventually call
+ * BarrierArriveAndDetach() so that the final phase
+ * PHJ_BATCH_DONE can be reached.
+ */
+ ExecParallelHashTableSetCurrentBatch(hashtable, batchno);
+ sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples);
+ return true;
+
+ case PHJ_BATCH_DONE:
+
+ /*
+ * Already done. Detach and go around again (if any
+ * remain).
+ */
+ BarrierDetach(batch_barrier);
+ hashtable->batches[batchno].done = true;
+ hashtable->curbatch = -1;
+ break;
+
+ default:
+ elog(ERROR, "unexpected batch phase %d",
+ BarrierPhase(batch_barrier));
+ }
+ }
+ batchno = (batchno + 1) % hashtable->nbatch;
+ } while (batchno != start_batchno);
+
+ return false;
+}
+
+/*
+ * ExecHashJoinSaveTuple
+ * save a tuple to a batch file.
+ *
+ * The data recorded in the file for each tuple is its hash value,
+ * then the tuple in MinimalTuple format.
+ *
+ * Note: it is important always to call this in the regular executor
+ * context, not in a shorter-lived context; else the temp file buffers
+ * will get messed up.
+ */
+void
+ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
+ BufFile **fileptr)
+{
+ BufFile *file = *fileptr;
+
+ if (file == NULL)
+ {
+ /* First write to this batch file, so open it. */
+ file = BufFileCreateTemp(false);
+ *fileptr = file;
+ }
+
+ BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
+ BufFileWrite(file, (void *) tuple, tuple->t_len);
+}
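+
+/*
+ * For illustration, each record written above is laid out as
+ *
+ *		uint32 hashvalue
+ *		MinimalTuple (t_len bytes, beginning with its own t_len word)
+ *
+ * ExecHashJoinGetSavedTuple() relies on this: it reads the hash value and
+ * the t_len word with a single BufFileRead() and then fetches the remaining
+ * t_len - sizeof(uint32) bytes of the tuple body.
+ */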
+
+/*
+ * ExecHashJoinGetSavedTuple
+ * read the next tuple from a batch file. Return NULL if no more.
+ *
+ * On success, *hashvalue is set to the tuple's hash value, and the tuple
+ * itself is stored in the given slot.
+ */
+static TupleTableSlot *
+ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
+ BufFile *file,
+ uint32 *hashvalue,
+ TupleTableSlot *tupleSlot)
+{
+ uint32 header[2];
+ size_t nread;
+ MinimalTuple tuple;
+
+ /*
+ * We check for interrupts here because this is typically taken as an
+ * alternative code path to an ExecProcNode() call, which would include
+ * such a check.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Since both the hash value and the MinimalTuple length word are uint32,
+ * we can read them both in one BufFileRead() call without any type
+ * cheating.
+ */
+ nread = BufFileRead(file, (void *) header, sizeof(header));
+ if (nread == 0) /* end of file */
+ {
+ ExecClearTuple(tupleSlot);
+ return NULL;
+ }
+ if (nread != sizeof(header))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
+ nread, sizeof(header))));
+ *hashvalue = header[0];
+ tuple = (MinimalTuple) palloc(header[1]);
+ tuple->t_len = header[1];
+ nread = BufFileRead(file,
+ (void *) ((char *) tuple + sizeof(uint32)),
+ header[1] - sizeof(uint32));
+ if (nread != header[1] - sizeof(uint32))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
+ nread, header[1] - sizeof(uint32))));
+ ExecForceStoreMinimalTuple(tuple, tupleSlot, true);
+ return tupleSlot;
+}
+
+
+void
+ExecReScanHashJoin(HashJoinState *node)
+{
+ /*
+ * In a multi-batch join, we currently have to do rescans the hard way,
+ * primarily because batch temp files may have already been released. But
+ * if it's a single-batch join, and there is no parameter change for the
+ * inner subnode, then we can just re-use the existing hash table without
+ * rebuilding it.
+ */
+ if (node->hj_HashTable != NULL)
+ {
+ if (node->hj_HashTable->nbatch == 1 &&
+ node->js.ps.righttree->chgParam == NULL)
+ {
+ /*
+ * Okay to reuse the hash table; needn't rescan inner, either.
+ *
+ * However, if it's a right/full join, we'd better reset the
+ * inner-tuple match flags contained in the table.
+ */
+ if (HJ_FILL_INNER(node))
+ ExecHashTableResetMatchFlags(node->hj_HashTable);
+
+ /*
+ * Also, we need to reset our state about the emptiness of the
+ * outer relation, so that the new scan of the outer will update
+ * it correctly if it turns out to be empty this time. (There's no
+ * harm in clearing it now because ExecHashJoin won't need the
+ * info. In the other cases, where the hash table doesn't exist
+ * or we are destroying it, we leave this state alone because
+ * ExecHashJoin will need it the first time through.)
+ */
+ node->hj_OuterNotEmpty = false;
+
+ /* ExecHashJoin can skip the BUILD_HASHTABLE step */
+ node->hj_JoinState = HJ_NEED_NEW_OUTER;
+ }
+ else
+ {
+ /* must destroy and rebuild hash table */
+ HashState *hashNode = castNode(HashState, innerPlanState(node));
+
+ Assert(hashNode->hashtable == node->hj_HashTable);
+ /* accumulate stats from old hash table, if wanted */
+ /* (this should match ExecShutdownHash) */
+ if (hashNode->ps.instrument && !hashNode->hinstrument)
+ hashNode->hinstrument = (HashInstrumentation *)
+ palloc0(sizeof(HashInstrumentation));
+ if (hashNode->hinstrument)
+ ExecHashAccumInstrumentation(hashNode->hinstrument,
+ hashNode->hashtable);
+ /* for safety, be sure to clear child plan node's pointer too */
+ hashNode->hashtable = NULL;
+
+ ExecHashTableDestroy(node->hj_HashTable);
+ node->hj_HashTable = NULL;
+ node->hj_JoinState = HJ_BUILD_HASHTABLE;
+
+ /*
+			 * If chgParam of the subnode is not null then the plan will be
+			 * re-scanned automatically by the first ExecProcNode, so we only
+			 * need an explicit ExecReScan when it is null.
+ */
+ if (node->js.ps.righttree->chgParam == NULL)
+ ExecReScan(node->js.ps.righttree);
+ }
+ }
+
+ /* Always reset intra-tuple state */
+ node->hj_CurHashValue = 0;
+ node->hj_CurBucketNo = 0;
+ node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
+ node->hj_CurTuple = NULL;
+
+ node->hj_MatchedOuter = false;
+ node->hj_FirstOuterTupleSlot = NULL;
+
+ /*
+	 * If chgParam of the subnode is not null then the plan will be re-scanned
+	 * automatically by the first ExecProcNode, so we only need an explicit
+	 * ExecReScan when it is null.
+ */
+ if (node->js.ps.lefttree->chgParam == NULL)
+ ExecReScan(node->js.ps.lefttree);
+}
+
+void
+ExecShutdownHashJoin(HashJoinState *node)
+{
+ if (node->hj_HashTable)
+ {
+ /*
+ * Detach from shared state before DSM memory goes away. This makes
+ * sure that we don't have any pointers into DSM memory by the time
+ * ExecEndHashJoin runs.
+ */
+ ExecHashTableDetachBatch(node->hj_HashTable);
+ ExecHashTableDetach(node->hj_HashTable);
+ }
+}
+
+static void
+ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate)
+{
+ PlanState *outerState = outerPlanState(hjstate);
+ ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
+ HashJoinTable hashtable = hjstate->hj_HashTable;
+ TupleTableSlot *slot;
+ uint32 hashvalue;
+ int i;
+
+ Assert(hjstate->hj_FirstOuterTupleSlot == NULL);
+
+ /* Execute outer plan, writing all tuples to shared tuplestores. */
+ for (;;)
+ {
+ slot = ExecProcNode(outerState);
+ if (TupIsNull(slot))
+ break;
+ econtext->ecxt_outertuple = slot;
+ if (ExecHashGetHashValue(hashtable, econtext,
+ hjstate->hj_OuterHashKeys,
+ true, /* outer tuple */
+ HJ_FILL_OUTER(hjstate),
+ &hashvalue))
+ {
+ int batchno;
+ int bucketno;
+ bool shouldFree;
+ MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno,
+ &batchno);
+ sts_puttuple(hashtable->batches[batchno].outer_tuples,
+ &hashvalue, mintup);
+
+ if (shouldFree)
+ heap_free_minimal_tuple(mintup);
+ }
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /* Make sure all outer partitions are readable by any backend. */
+ for (i = 0; i < hashtable->nbatch; ++i)
+ sts_end_write(hashtable->batches[i].outer_tuples);
+}
+
+void
+ExecHashJoinEstimate(HashJoinState *state, ParallelContext *pcxt)
+{
+ shm_toc_estimate_chunk(&pcxt->estimator, sizeof(ParallelHashJoinState));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
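+
+/*
+ * The single chunk estimated above is the ParallelHashJoinState that
+ * ExecHashJoinInitializeDSM() allocates from the toc and registers under the
+ * plan node ID below.
+ */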
+
+void
+ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt)
+{
+ int plan_node_id = state->js.ps.plan->plan_node_id;
+ HashState *hashNode;
+ ParallelHashJoinState *pstate;
+
+ /*
+ * Disable shared hash table mode if we failed to create a real DSM
+ * segment, because that means that we don't have a DSA area to work with.
+ */
+ if (pcxt->seg == NULL)
+ return;
+
+ ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin);
+
+ /*
+ * Set up the state needed to coordinate access to the shared hash
+ * table(s), using the plan node ID as the toc key.
+ */
+ pstate = shm_toc_allocate(pcxt->toc, sizeof(ParallelHashJoinState));
+ shm_toc_insert(pcxt->toc, plan_node_id, pstate);
+
+ /*
+ * Set up the shared hash join state with no batches initially.
+ * ExecHashTableCreate() will prepare at least one later and set nbatch
+ * and space_allowed.
+ */
+ pstate->nbatch = 0;
+ pstate->space_allowed = 0;
+ pstate->batches = InvalidDsaPointer;
+ pstate->old_batches = InvalidDsaPointer;
+ pstate->nbuckets = 0;
+ pstate->growth = PHJ_GROWTH_OK;
+ pstate->chunk_work_queue = InvalidDsaPointer;
+ pg_atomic_init_u32(&pstate->distributor, 0);
+ pstate->nparticipants = pcxt->nworkers + 1;
+ pstate->total_tuples = 0;
+ LWLockInitialize(&pstate->lock,
+ LWTRANCHE_PARALLEL_HASH_JOIN);
+ BarrierInit(&pstate->build_barrier, 0);
+ BarrierInit(&pstate->grow_batches_barrier, 0);
+ BarrierInit(&pstate->grow_buckets_barrier, 0);
+
+ /* Set up the space we'll use for shared temporary files. */
+ SharedFileSetInit(&pstate->fileset, pcxt->seg);
+
+ /* Initialize the shared state in the hash node. */
+ hashNode = (HashState *) innerPlanState(state);
+ hashNode->parallel_state = pstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecHashJoinReInitializeDSM
+ *
+ * Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecHashJoinReInitializeDSM(HashJoinState *state, ParallelContext *cxt)
+{
+ int plan_node_id = state->js.ps.plan->plan_node_id;
+ ParallelHashJoinState *pstate =
+ shm_toc_lookup(cxt->toc, plan_node_id, false);
+
+ /*
+ * It would be possible to reuse the shared hash table in single-batch
+ * cases by resetting and then fast-forwarding build_barrier to
+ * PHJ_BUILD_DONE and batch 0's batch_barrier to PHJ_BATCH_PROBING, but
+ * currently shared hash tables are already freed by now (by the last
+ * participant to detach from the batch). We could consider keeping it
+ * around for single-batch joins. We'd also need to adjust
+ * finalize_plan() so that it doesn't record a dummy dependency for
+ * Parallel Hash nodes, preventing the rescan optimization. For now we
+ * don't try.
+ */
+
+ /* Detach, freeing any remaining shared memory. */
+ if (state->hj_HashTable != NULL)
+ {
+ ExecHashTableDetachBatch(state->hj_HashTable);
+ ExecHashTableDetach(state->hj_HashTable);
+ }
+
+ /* Clear any shared batch files. */
+ SharedFileSetDeleteAll(&pstate->fileset);
+
+ /* Reset build_barrier to PHJ_BUILD_ELECTING so we can go around again. */
+ BarrierInit(&pstate->build_barrier, 0);
+}
+
+void
+ExecHashJoinInitializeWorker(HashJoinState *state,
+ ParallelWorkerContext *pwcxt)
+{
+ HashState *hashNode;
+ int plan_node_id = state->js.ps.plan->plan_node_id;
+ ParallelHashJoinState *pstate =
+ shm_toc_lookup(pwcxt->toc, plan_node_id, false);
+
+ /* Attach to the space for shared temporary files. */
+ SharedFileSetAttach(&pstate->fileset, pwcxt->seg);
+
+ /* Attach to the shared state in the hash node. */
+ hashNode = (HashState *) innerPlanState(state);
+ hashNode->parallel_state = pstate;
+
+ ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin);
+}
diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c
new file mode 100644
index 0000000..934426a
--- /dev/null
+++ b/src/backend/executor/nodeIncrementalSort.c
@@ -0,0 +1,1257 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIncrementalSort.c
+ * Routines to handle incremental sorting of relations.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeIncrementalSort.c
+ *
+ * DESCRIPTION
+ *
+ * Incremental sort is an optimized variant of multikey sort for cases
+ * when the input is already sorted by a prefix of the sort keys. For
+ * example when a sort by (key1, key2 ... keyN) is requested, and the
+ * input is already sorted by (key1, key2 ... keyM), M < N, we can
+ * divide the input into groups where keys (key1, ... keyM) are equal,
+ * and only sort on the remaining columns.
+ *
+ * Consider the following example. We have input tuples consisting of
+ * two integers (X, Y) already presorted by X, while it's required to
+ * sort them by both X and Y. Let input tuples be following.
+ *
+ * (1, 5)
+ * (1, 2)
+ * (2, 9)
+ * (2, 1)
+ * (2, 5)
+ * (3, 3)
+ * (3, 7)
+ *
+ * An incremental sort algorithm would split the input into the following
+ * groups, which have equal X, and then sort them by Y individually:
+ *
+ * (1, 5) (1, 2)
+ * (2, 9) (2, 1) (2, 5)
+ * (3, 3) (3, 7)
+ *
+ * After sorting these groups and putting them altogether, we would get
+ * the following result which is sorted by X and Y, as requested:
+ *
+ * (1, 2)
+ * (1, 5)
+ * (2, 1)
+ * (2, 5)
+ * (2, 9)
+ * (3, 3)
+ * (3, 7)
+ *
+ * Incremental sort may be more efficient than plain sort, particularly
+ * on large datasets, as it reduces the amount of data to sort at once,
+ * making it more likely it fits into work_mem (eliminating the need to
+ * spill to disk). But the main advantage of incremental sort is that
+ * it can start producing rows early, before sorting the whole dataset,
+ * which is a significant benefit especially for queries with LIMIT.
+ *
+ * The algorithm we've implemented here is modified from the theoretical
+ * base described above by operating in two different modes:
+ * - Fetching a minimum number of tuples without checking prefix key
+ * group membership and sorting on all columns when safe.
+ * - Fetching all tuples for a single prefix key group and sorting on
+ * solely the unsorted columns.
+ * We always begin in the first mode, and employ a heuristic to switch
+ * into the second mode if we believe it's beneficial.
+ *
+ * Sorting incrementally can potentially use less memory, avoid fetching
+ * and sorting all tuples in the dataset, and begin returning tuples before
+ * the entire result set is available.
+ *
+ * The hybrid mode approach allows us to optimize for both very small
+ * groups (where the overhead of a new tuplesort is high) and very large
+ * groups (where we can lower cost by not having to sort on already sorted
+ * columns), albeit at some extra cost while switching between modes.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/execdebug.h"
+#include "executor/nodeIncrementalSort.h"
+#include "miscadmin.h"
+#include "utils/lsyscache.h"
+#include "utils/tuplesort.h"
+
+/*
+ * We need to store the instrumentation information in either the local node's
+ * sort info or, for a parallel worker process, in the shared info (this avoids
+ * having to additionally memcpy the info from local memory to shared memory
+ * at each instrumentation call). This macro expands to choose the proper sort
+ * state and group info.
+ *
+ * Arguments:
+ * - node: type IncrementalSortState *
+ * - groupName: the token fullsort or prefixsort
+ */
+#define INSTRUMENT_SORT_GROUP(node, groupName) \
+ do { \
+ if ((node)->ss.ps.instrument != NULL) \
+ { \
+ if ((node)->shared_info && (node)->am_worker) \
+ { \
+ Assert(IsParallelWorker()); \
+ Assert(ParallelWorkerNumber <= (node)->shared_info->num_workers); \
+ instrumentSortedGroup(&(node)->shared_info->sinfo[ParallelWorkerNumber].groupName##GroupInfo, \
+ (node)->groupName##_state); \
+ } \
+ else \
+ { \
+ instrumentSortedGroup(&(node)->incsort_info.groupName##GroupInfo, \
+ (node)->groupName##_state); \
+ } \
+ } \
+ } while (0)
+
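+/*
+ * For example, INSTRUMENT_SORT_GROUP(node, fullsort) records the statistics
+ * of node->fullsort_state into the fullsortGroupInfo member of either the
+ * worker's shared_info slot (when running in a parallel worker with
+ * shared_info set up) or the local incsort_info.
+ */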
+
+/* ----------------------------------------------------------------
+ * instrumentSortedGroup
+ *
+ * Because incremental sort processes (potentially many) sort batches, we need
+ * to capture tuplesort stats each time we finalize a sort state. This summary
+ * data is later used for EXPLAIN ANALYZE output.
+ * ----------------------------------------------------------------
+ */
+static void
+instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo,
+ Tuplesortstate *sortState)
+{
+ TuplesortInstrumentation sort_instr;
+
+ groupInfo->groupCount++;
+
+ tuplesort_get_stats(sortState, &sort_instr);
+
+ /* Calculate total and maximum memory and disk space used. */
+ switch (sort_instr.spaceType)
+ {
+ case SORT_SPACE_TYPE_DISK:
+ groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed;
+ if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed)
+ groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed;
+
+ break;
+ case SORT_SPACE_TYPE_MEMORY:
+ groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed;
+ if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed)
+ groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed;
+
+ break;
+ }
+
+ /* Track each sort method we've used. */
+ groupInfo->sortMethods |= sort_instr.sortMethod;
+}
+
+/* ----------------------------------------------------------------
+ * preparePresortedCols
+ *
+ * Prepare information for presorted_keys comparisons.
+ * ----------------------------------------------------------------
+ */
+static void
+preparePresortedCols(IncrementalSortState *node)
+{
+ IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+ node->presorted_keys =
+ (PresortedKeyData *) palloc(plannode->nPresortedCols *
+ sizeof(PresortedKeyData));
+
+ /* Pre-cache comparison functions for each pre-sorted key. */
+ for (int i = 0; i < plannode->nPresortedCols; i++)
+ {
+ Oid equalityOp,
+ equalityFunc;
+ PresortedKeyData *key;
+
+ key = &node->presorted_keys[i];
+ key->attno = plannode->sort.sortColIdx[i];
+
+ equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i],
+ NULL);
+ if (!OidIsValid(equalityOp))
+ elog(ERROR, "missing equality operator for ordering operator %u",
+ plannode->sort.sortOperators[i]);
+
+ equalityFunc = get_opcode(equalityOp);
+ if (!OidIsValid(equalityFunc))
+ elog(ERROR, "missing function for operator %u", equalityOp);
+
+ /* Lookup the comparison function */
+ fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext);
+
+ /* We can initialize the callinfo just once and re-use it */
+ key->fcinfo = palloc0(SizeForFunctionCallInfo(2));
+ InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2,
+ plannode->sort.collations[i], NULL, NULL);
+ key->fcinfo->args[0].isnull = false;
+ key->fcinfo->args[1].isnull = false;
+ }
+}
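+
+/*
+ * Note: the FunctionCallInfo set up above is reused by isCurrentGroup() for
+ * every comparison; only the two argument datums (and the isnull flag) are
+ * refreshed per call.
+ */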
+
+/* ----------------------------------------------------------------
+ * isCurrentGroup
+ *
+ * Check whether a given tuple belongs to the current sort group by comparing
+ * the presorted column values to the pivot tuple of the current group.
+ * ----------------------------------------------------------------
+ */
+static bool
+isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple)
+{
+ int nPresortedCols;
+
+ nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols;
+
+ /*
+	 * That the input is sorted by keys (0, ... n) implies that the tail
+	 * keys are more likely to change. Therefore we do our comparison starting
+	 * from the last pre-sorted column to optimize for early detection of
+	 * inequality and to minimize the number of function calls.
+ */
+ for (int i = nPresortedCols - 1; i >= 0; i--)
+ {
+ Datum datumA,
+ datumB,
+ result;
+ bool isnullA,
+ isnullB;
+ AttrNumber attno = node->presorted_keys[i].attno;
+ PresortedKeyData *key;
+
+ datumA = slot_getattr(pivot, attno, &isnullA);
+ datumB = slot_getattr(tuple, attno, &isnullB);
+
+ /* Special case for NULL-vs-NULL, else use standard comparison */
+ if (isnullA || isnullB)
+ {
+ if (isnullA == isnullB)
+ continue;
+ else
+ return false;
+ }
+
+ key = &node->presorted_keys[i];
+
+ key->fcinfo->args[0].value = datumA;
+ key->fcinfo->args[1].value = datumB;
+
+ /* just for paranoia's sake, we reset isnull each time */
+ key->fcinfo->isnull = false;
+
+ result = FunctionCallInvoke(key->fcinfo);
+
+ /* Check for null result, since caller is clearly not expecting one */
+ if (key->fcinfo->isnull)
+ elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid);
+
+ if (!DatumGetBool(result))
+ return false;
+ }
+ return true;
+}
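+
+/*
+ * For example, with a single presorted column X (as in the example in the
+ * file header), a pivot of (2, 9) keeps (2, 1) and (2, 5) in the current
+ * group, while (3, 3) starts a new one.
+ */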
+
+/* ----------------------------------------------------------------
+ * switchToPresortedPrefixMode
+ *
+ * When we determine that we've likely encountered a large batch of tuples all
+ * having the same presorted prefix values, we want to optimize tuplesort by
+ * only sorting on unsorted suffix keys.
+ *
+ * The problem is that we've already accumulated several tuples in another
+ * tuplesort configured to sort by all columns (assuming that there may be
+ * more than one prefix key group). So to switch to presorted prefix mode we
+ * have to go back and look at all the tuples we've already accumulated to
+ * verify they're all part of the same prefix key group before sorting them
+ * solely by unsorted suffix keys.
+ *
+ * While it's likely that all tuples already fetched are all part of a single
+ * prefix group, we also have to handle the possibility that there is at least
+ * one different prefix key group before the large prefix key group.
+ * ----------------------------------------------------------------
+ */
+static void
+switchToPresortedPrefixMode(PlanState *pstate)
+{
+ IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+ ScanDirection dir;
+ int64 nTuples;
+ TupleDesc tupDesc;
+ PlanState *outerNode;
+ IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+ dir = node->ss.ps.state->es_direction;
+ outerNode = outerPlanState(node);
+ tupDesc = ExecGetResultType(outerNode);
+
+ /* Configure the prefix sort state the first time around. */
+ if (node->prefixsort_state == NULL)
+ {
+ Tuplesortstate *prefixsort_state;
+ int nPresortedCols = plannode->nPresortedCols;
+
+ /*
+ * Optimize the sort by assuming the prefix columns are all equal and
+ * thus we only need to sort by any remaining columns.
+ */
+ prefixsort_state = tuplesort_begin_heap(tupDesc,
+ plannode->sort.numCols - nPresortedCols,
+ &(plannode->sort.sortColIdx[nPresortedCols]),
+ &(plannode->sort.sortOperators[nPresortedCols]),
+ &(plannode->sort.collations[nPresortedCols]),
+ &(plannode->sort.nullsFirst[nPresortedCols]),
+ work_mem,
+ NULL,
+ false);
+ node->prefixsort_state = prefixsort_state;
+ }
+ else
+ {
+ /* Next group of presorted data */
+ tuplesort_reset(node->prefixsort_state);
+ }
+
+ /*
+ * If the current node has a bound, then it's reasonably likely that a
+ * large prefix key group will benefit from bounded sort, so configure the
+ * tuplesort to allow for that optimization.
+ */
+ if (node->bounded)
+ {
+ SO1_printf("Setting bound on presorted prefix tuplesort to: " INT64_FORMAT "\n",
+ node->bound - node->bound_Done);
+ tuplesort_set_bound(node->prefixsort_state,
+ node->bound - node->bound_Done);
+ }
+
+ /*
+ * Copy as many tuples as we can (i.e., in the same prefix key group) from
+ * the full sort state to the prefix sort state.
+ */
+ for (nTuples = 0; nTuples < node->n_fullsort_remaining; nTuples++)
+ {
+ /*
+ * When we encounter multiple prefix key groups inside the full sort
+ * tuplesort we have to carry over the last read tuple into the next
+ * batch.
+ */
+ if (nTuples == 0 && !TupIsNull(node->transfer_tuple))
+ {
+ tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+ /* The carried over tuple is our new group pivot tuple. */
+ ExecCopySlot(node->group_pivot, node->transfer_tuple);
+ }
+ else
+ {
+ tuplesort_gettupleslot(node->fullsort_state,
+ ScanDirectionIsForward(dir),
+ false, node->transfer_tuple, NULL);
+
+ /*
+ * If this is our first time through the loop, then we need to
+ * save the first tuple we get as our new group pivot.
+ */
+ if (TupIsNull(node->group_pivot))
+ ExecCopySlot(node->group_pivot, node->transfer_tuple);
+
+ if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple))
+ {
+ tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+ }
+ else
+ {
+ /*
+ * The tuple isn't part of the current batch so we need to
+ * carry it over into the next batch of tuples we transfer out
+ * of the full sort tuplesort into the presorted prefix
+ * tuplesort. We don't actually have to do anything special to
+ * save the tuple since we've already loaded it into the
+ * node->transfer_tuple slot, and, even though that slot
+ * points to memory inside the full sort tuplesort, we can't
+ * reset that tuplesort anyway until we've fully transferred
+ * out its tuples, so this reference is safe. We do need to
+ * reset the group pivot tuple though since we've finished the
+ * current prefix key group.
+ */
+ ExecClearTuple(node->group_pivot);
+
+ /* Break out of for-loop early */
+ break;
+ }
+ }
+ }
+
+ /*
+ * Track how many tuples remain in the full sort batch so that we know if
+ * we need to sort multiple prefix key groups before processing tuples
+ * remaining in the large single prefix key group we think we've
+ * encountered.
+ */
+ SO1_printf("Moving " INT64_FORMAT " tuples to presorted prefix tuplesort\n", nTuples);
+ node->n_fullsort_remaining -= nTuples;
+ SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT "\n", node->n_fullsort_remaining);
+
+ if (node->n_fullsort_remaining == 0)
+ {
+ /*
+ * We've found that all tuples remaining in the full sort batch are in
+ * the same prefix key group and moved all of those tuples into the
+ * presorted prefix tuplesort. We don't know that we've yet found the
+ * last tuple in the current prefix key group, so save our pivot
+ * comparison tuple and continue fetching tuples from the outer
+ * execution node to load into the presorted prefix tuplesort.
+ */
+ ExecCopySlot(node->group_pivot, node->transfer_tuple);
+ SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n");
+ node->execution_status = INCSORT_LOADPREFIXSORT;
+
+ /*
+ * Make sure we clear the transfer tuple slot so that next time we
+ * encounter a large prefix key group we don't incorrectly assume we
+ * have a tuple carried over from the previous group.
+ */
+ ExecClearTuple(node->transfer_tuple);
+ }
+ else
+ {
+ /*
+ * We finished a group but didn't consume all of the tuples from the
+ * full sort state, so we'll sort this batch, let the outer node read
+ * out all of those tuples, and then come back around to find another
+ * batch.
+ */
+ SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples);
+ tuplesort_performsort(node->prefixsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, prefixsort);
+
+ if (node->bounded)
+ {
+ /*
+ * If the current node has a bound and we've already sorted n
+ * tuples, then the functional bound remaining is (original bound
+ * - n), so store the current number of processed tuples for use
+ * in configuring sorting bound.
+ */
+			SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n",
+					   node->bound_Done,
+					   Min(node->bound, node->bound_Done + nTuples));
+ node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+ }
+
+ SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n");
+ node->execution_status = INCSORT_READPREFIXSORT;
+ }
+}
+
+/*
+ * Sorting many small groups with tuplesort is inefficient. In order to
+ * cope with this problem we don't start a new group until the current one
+ * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also
+ * means we can't assume small groups of tuples all have the same prefix keys.)
+ * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start looking
+ * for the new group as soon as we've met our bound to avoid fetching more
+ * tuples than we absolutely have to fetch.
+ */
+#define DEFAULT_MIN_GROUP_SIZE 32
+
+/*
+ * While we've optimized for small prefix key groups by not starting our prefix
+ * key comparisons until we've reached a minimum number of tuples, we don't want
+ * that optimization to cause us to lose out on the benefits of being able to
+ * assume a large group of tuples is fully presorted by its prefix keys.
+ * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic
+ * for determining when we believe we've encountered a large group, and, if we
+ * get to that point without finding a new prefix key group we transition to
+ * presorted prefix key mode.
+ */
+#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE)
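+
+/*
+ * With the defaults this means: prefix key comparisons only begin once a
+ * batch (sorted on all keys, states INCSORT_LOADFULLSORT/READFULLSORT) holds
+ * 32 tuples, and if more than 64 tuples accumulate without a prefix key
+ * change we assume a large group and switch to presorted prefix mode (states
+ * INCSORT_LOADPREFIXSORT/READPREFIXSORT), sorting only the remaining suffix
+ * keys.
+ */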
+
+/* ----------------------------------------------------------------
+ * ExecIncrementalSort
+ *
+ * Assuming that outer subtree returns tuple presorted by some prefix
+ * of target sort columns, performs incremental sort.
+ *
+ * Conditions:
+ * -- none.
+ *
+ * Initial States:
+ * -- the outer child is prepared to return the first tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecIncrementalSort(PlanState *pstate)
+{
+ IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+ EState *estate;
+ ScanDirection dir;
+ Tuplesortstate *read_sortstate;
+ Tuplesortstate *fullsort_state;
+ TupleTableSlot *slot;
+ IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan;
+ PlanState *outerNode;
+ TupleDesc tupDesc;
+ int64 nTuples = 0;
+ int64 minGroupSize;
+
+ CHECK_FOR_INTERRUPTS();
+
+ estate = node->ss.ps.state;
+ dir = estate->es_direction;
+ fullsort_state = node->fullsort_state;
+
+ /*
+ * If a previous iteration has sorted a batch, then we need to check to
+ * see if there are any remaining tuples in that batch that we can return
+ * before moving on to other execution states.
+ */
+ if (node->execution_status == INCSORT_READFULLSORT
+ || node->execution_status == INCSORT_READPREFIXSORT)
+ {
+ /*
+ * Return next tuple from the current sorted group set if available.
+ */
+ read_sortstate = node->execution_status == INCSORT_READFULLSORT ?
+ fullsort_state : node->prefixsort_state;
+ slot = node->ss.ps.ps_ResultTupleSlot;
+
+ /*
+ * We have to populate the slot from the tuplesort before checking
+ * outerNodeDone because it will set the slot to NULL if no more
+ * tuples remain. If the tuplesort is empty, but we don't have any
+ * more tuples available for sort from the outer node, then
+ * outerNodeDone will have been set so we'll return that now-empty
+ * slot to the caller.
+ */
+ if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir),
+ false, slot, NULL) || node->outerNodeDone)
+
+ /*
+ * Note: there isn't a good test case for the node->outerNodeDone
+ * check directly, but we need it for any plan where the outer
+ * node will fail when trying to fetch too many tuples.
+ */
+ return slot;
+ else if (node->n_fullsort_remaining > 0)
+ {
+ /*
+ * When we transition to presorted prefix mode, we might have
+ * accumulated at least one additional prefix key group in the
+ * full sort tuplesort. The first call to
+ * switchToPresortedPrefixMode() will have pulled the first one of
+ * those groups out, and we've returned those tuples to the parent
+ * node, but if at this point we still have tuples remaining in
+ * the full sort state (i.e., n_fullsort_remaining > 0), then we
+ * need to re-execute the prefix mode transition function to pull
+ * out the next prefix key group.
+ */
+ SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (" INT64_FORMAT ")\n",
+ node->n_fullsort_remaining);
+ switchToPresortedPrefixMode(pstate);
+ }
+ else
+ {
+ /*
+ * If we don't have any sorted tuples to read and we're not
+ * currently transitioning into presorted prefix sort mode, then
+ * it's time to start the process all over again by building a new
+ * group in the full sort state.
+ */
+			SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining = 0)\n");
+ node->execution_status = INCSORT_LOADFULLSORT;
+ }
+ }
+
+ /*
+ * Scan the subplan in the forward direction while creating the sorted
+ * data.
+ */
+ estate->es_direction = ForwardScanDirection;
+
+ outerNode = outerPlanState(node);
+ tupDesc = ExecGetResultType(outerNode);
+
+ /* Load tuples into the full sort state. */
+ if (node->execution_status == INCSORT_LOADFULLSORT)
+ {
+ /*
+ * Initialize sorting structures.
+ */
+ if (fullsort_state == NULL)
+ {
+ /*
+ * Initialize presorted column support structures for
+ * isCurrentGroup(). It's correct to do this along with the
+ * initial initialization for the full sort state (and not for the
+ * prefix sort state) since we always load the full sort state
+ * first.
+ */
+ preparePresortedCols(node);
+
+ /*
+ * Since we optimize small prefix key groups by accumulating a
+ * minimum number of tuples before sorting, we can't assume that a
+ * group of tuples all have the same prefix key values. Hence we
+ * setup the full sort tuplesort to sort by all requested sort
+ * keys.
+ */
+ fullsort_state = tuplesort_begin_heap(tupDesc,
+ plannode->sort.numCols,
+ plannode->sort.sortColIdx,
+ plannode->sort.sortOperators,
+ plannode->sort.collations,
+ plannode->sort.nullsFirst,
+ work_mem,
+ NULL,
+ false);
+ node->fullsort_state = fullsort_state;
+ }
+ else
+ {
+ /* Reset sort for the next batch. */
+ tuplesort_reset(fullsort_state);
+ }
+
+ /*
+ * Calculate the remaining tuples left if bounded and configure both
+ * bounded sort and the minimum group size accordingly.
+ */
+ if (node->bounded)
+ {
+ int64 currentBound = node->bound - node->bound_Done;
+
+ /*
+ * Bounded sort isn't likely to be a useful optimization for full
+ * sort mode since we limit full sort mode to a relatively small
+ * number of tuples and tuplesort doesn't switch over to top-n
+ * heap sort anyway unless it hits (2 * bound) tuples.
+ */
+ if (currentBound < DEFAULT_MIN_GROUP_SIZE)
+ tuplesort_set_bound(fullsort_state, currentBound);
+
+ minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound);
+ }
+ else
+ minGroupSize = DEFAULT_MIN_GROUP_SIZE;
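+
+		/*
+		 * For example, under a bound of 10 with nothing returned yet,
+		 * currentBound is 10, the full sort switches to bounded mode with
+		 * that limit, and minGroupSize becomes 10 as well.
+		 */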
+
+ /*
+ * Because we have to read the next tuple to find out that we've
+ * encountered a new prefix key group, on subsequent groups we have to
+ * carry over that extra tuple and add it to the new group's sort here
+ * before we read any new tuples from the outer node.
+ */
+ if (!TupIsNull(node->group_pivot))
+ {
+ tuplesort_puttupleslot(fullsort_state, node->group_pivot);
+ nTuples++;
+
+ /*
+ * We're in full sort mode accumulating a minimum number of tuples
+ * and not checking for prefix key equality yet, so we can't
+ * assume the group pivot tuple will remain the same -- unless
+ * we're using a minimum group size of 1, in which case the pivot
+ * is obviously still the pivot.
+ */
+ if (nTuples != minGroupSize)
+ ExecClearTuple(node->group_pivot);
+ }
+
+
+ /*
+ * Pull as many tuples from the outer node as possible given our
+ * current operating mode.
+ */
+ for (;;)
+ {
+ slot = ExecProcNode(outerNode);
+
+ /*
+ * If the outer node can't provide us any more tuples, then we can
+ * sort the current group and return those tuples.
+ */
+ if (TupIsNull(slot))
+ {
+ /*
+ * We need to know later if the outer node has completed to be
+ * able to distinguish between being done with a batch and
+ * being done with the whole node.
+ */
+ node->outerNodeDone = true;
+
+ SO1_printf("Sorting fullsort with " INT64_FORMAT " tuples\n", nTuples);
+ tuplesort_performsort(fullsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, fullsort);
+
+ SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n");
+ node->execution_status = INCSORT_READFULLSORT;
+ break;
+ }
+
+ /* Accumulate the next group of presorted tuples. */
+ if (nTuples < minGroupSize)
+ {
+ /*
+ * If we haven't yet hit our target minimum group size, then
+ * we don't need to bother checking for inclusion in the
+ * current prefix group since at this point we'll assume that
+ * we'll full sort this batch to avoid a large number of very
+ * tiny (and thus inefficient) sorts.
+ */
+ tuplesort_puttupleslot(fullsort_state, slot);
+ nTuples++;
+
+ /*
+ * If we've reached our minimum group size, then we need to
+ * store the most recent tuple as a pivot.
+ */
+ if (nTuples == minGroupSize)
+ ExecCopySlot(node->group_pivot, slot);
+ }
+ else
+ {
+ /*
+ * If we've already accumulated enough tuples to reach our
+ * minimum group size, then we need to compare any additional
+ * tuples to our pivot tuple to see if we reach the end of
+ * that prefix key group. Only after we find changed prefix
+ * keys can we guarantee sort stability of the tuples we've
+ * already accumulated.
+ */
+ if (isCurrentGroup(node, node->group_pivot, slot))
+ {
+ /*
+ * As long as the prefix keys match the pivot tuple then
+ * load the tuple into the tuplesort.
+ */
+ tuplesort_puttupleslot(fullsort_state, slot);
+ nTuples++;
+ }
+ else
+ {
+ /*
+ * Since the tuple we fetched isn't part of the current
+ * prefix key group we don't want to sort it as part of
+ * the current batch. Instead we use the group_pivot slot
+ * to carry it over to the next batch (even though we
+ * won't actually treat it as a group pivot).
+ */
+ ExecCopySlot(node->group_pivot, slot);
+
+ if (node->bounded)
+ {
+ /*
+ * If the current node has a bound, and we've already
+ * sorted n tuples, then the functional bound
+ * remaining is (original bound - n), so store the
+ * current number of processed tuples for later use
+ * configuring the sort state's bound.
+ */
+ SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n",
+ node->bound_Done,
+ Min(node->bound, node->bound_Done + nTuples));
+ node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+ }
+
+ /*
+ * Once we find changed prefix keys we can complete the
+ * sort and transition modes to reading out the sorted
+ * tuples.
+ */
+ SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n",
+ nTuples);
+ tuplesort_performsort(fullsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, fullsort);
+
+ SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n");
+ node->execution_status = INCSORT_READFULLSORT;
+ break;
+ }
+ }
+
+ /*
+ * Unless we've already transitioned modes to reading from the
+ * full sort state, then we assume that having read at least
+ * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're
+ * processing a large group of tuples all having equal prefix keys
+ * (but haven't yet found the final tuple in that prefix key
+ * group), so we need to transition into presorted prefix mode.
+ */
+ if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE &&
+ node->execution_status != INCSORT_READFULLSORT)
+ {
+ /*
+ * The group pivot we have stored has already been put into
+ * the tuplesort; we don't want to carry it over. Since we
+ * haven't yet found the end of the prefix key group, it might
+ * seem like we should keep this, but we don't actually know
+ * how many prefix key groups might be represented in the full
+ * sort state, so we'll let the mode transition function
+ * manage this state for us.
+ */
+ ExecClearTuple(node->group_pivot);
+
+ /*
+ * Unfortunately the tuplesort API doesn't include a way to
+ * retrieve tuples unless a sort has been performed, so we
+ * perform the sort even though we could just as easily rely
+ * on FIFO retrieval semantics when transferring them to the
+ * presorted prefix tuplesort.
+ */
+ SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", nTuples);
+ tuplesort_performsort(fullsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, fullsort);
+
+ /*
+ * If the full sort tuplesort happened to switch into top-n
+ * heapsort mode then we will only be able to retrieve
+ * currentBound tuples (since the tuplesort will have only
+ * retained the top-n tuples). This is safe even though we
+ * haven't yet completed fetching the current prefix key group
+ * because the tuples we've "lost" already sorted "below" the
+ * retained ones, and we're already contractually guaranteed
+ * to not need any more than the currentBound tuples.
+ */
+ if (tuplesort_used_bound(node->fullsort_state))
+ {
+ int64 currentBound = node->bound - node->bound_Done;
+
+ SO2_printf("Read " INT64_FORMAT " tuples, but setting to " INT64_FORMAT " because we used bounded sort\n",
+ nTuples, Min(currentBound, nTuples));
+ nTuples = Min(currentBound, nTuples);
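+
+ /*
+ * Hypothetical example: with bound = 10 and bound_Done = 0,
+ * currentBound is 10; even if 500 tuples were loaded before the
+ * tuplesort switched to top-n heapsort, only those 10 remain
+ * retrievable, so nTuples is clamped to 10 before the transfer.
+ */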
+ }
+
+ SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT " and calling switchToPresortedPrefixMode()\n",
+ nTuples);
+
+ /*
+ * We might have multiple prefix key groups in the full sort
+ * state, so the mode transition function needs to know that
+ * it needs to move from the fullsort to presorted prefix
+ * sort.
+ */
+ node->n_fullsort_remaining = nTuples;
+
+ /* Transition the tuples to the presorted prefix tuplesort. */
+ switchToPresortedPrefixMode(pstate);
+
+ /*
+ * Since we know we had tuples to move to the presorted prefix
+ * tuplesort, we know that unless that transition has verified
+ * that all tuples belonged to the same prefix key group (in
+ * which case we can go straight to continuing to load tuples
+ * into that tuplesort), we should have a tuple to return
+ * here.
+ *
+ * Either way, the appropriate execution status should have
+ * been set by switchToPresortedPrefixMode(), so we can drop
+ * out of the loop here and let the appropriate path kick in.
+ */
+ break;
+ }
+ }
+ }
+
+ if (node->execution_status == INCSORT_LOADPREFIXSORT)
+ {
+ /*
+ * We only enter this state after the mode transition function has
+ * confirmed all remaining tuples from the full sort state have the
+ * same prefix and moved those tuples to the prefix sort state. That
+ * function has also set a group pivot tuple (which doesn't need to be
+ * carried over; it's already been put into the prefix sort state).
+ */
+ Assert(!TupIsNull(node->group_pivot));
+
+ /*
+ * Read tuples from the outer node and load them into the prefix sort
+ * state until we encounter a tuple whose prefix keys don't match the
+ * current group_pivot tuple, since we can't guarantee sort stability
+ * until we have all tuples matching those prefix keys.
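+ *
+ * For example (hypothetical query): for ORDER BY a, b with the
+ * input already sorted by a, every row whose a value equals the
+ * pivot's belongs to the current group and is loaded here; the
+ * prefix sort state then only has to order the group by b, and
+ * the first row with a different a value ends the batch.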
+ */
+ for (;;)
+ {
+ slot = ExecProcNode(outerNode);
+
+ /*
+ * If we've exhausted tuples from the outer node we're done
+ * loading the prefix sort state.
+ */
+ if (TupIsNull(slot))
+ {
+ /*
+ * Remember that the outer node has completed so that later we
+ * can distinguish being done with a batch from being done with
+ * the whole node.
+ */
+ node->outerNodeDone = true;
+ break;
+ }
+
+ /*
+ * If the tuple's prefix keys match our pivot tuple, we're not
+ * done yet and can load it into the prefix sort state. If not, we
+ * don't want to sort it as part of the current batch. Instead we
+ * use the group_pivot slot to carry it over to the next batch
+ * (even though we won't actually treat it as a group pivot).
+ */
+ if (isCurrentGroup(node, node->group_pivot, slot))
+ {
+ tuplesort_puttupleslot(node->prefixsort_state, slot);
+ nTuples++;
+ }
+ else
+ {
+ ExecCopySlot(node->group_pivot, slot);
+ break;
+ }
+ }
+
+ /*
+ * Perform the sort and begin returning the tuples to the parent plan
+ * node.
+ */
+ SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples);
+ tuplesort_performsort(node->prefixsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, prefixsort);
+
+ SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n");
+ node->execution_status = INCSORT_READPREFIXSORT;
+
+ if (node->bounded)
+ {
+ /*
+ * If the current node has a bound, and we've already sorted n
+ * tuples, then the functional bound remaining is (original bound
+ * - n), so store the current number of processed tuples for use
+ * in configuring the sort bound.
+ */
+ SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n",
+ node->bound_Done,
+ Min(node->bound, node->bound_Done + nTuples));
+ node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+ }
+ }
+
+ /* Restore to user specified direction. */
+ estate->es_direction = dir;
+
+ /*
+ * Get the first or next tuple from tuplesort. Returns NULL if no more
+ * tuples.
+ */
+ read_sortstate = node->execution_status == INCSORT_READFULLSORT ?
+ fullsort_state : node->prefixsort_state;
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir),
+ false, slot, NULL);
+ return slot;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitIncrementalSort
+ *
+ * Creates the run-time state information for the sort node
+ * produced by the planner and initializes its outer subtree.
+ * ----------------------------------------------------------------
+ */
+IncrementalSortState *
+ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags)
+{
+ IncrementalSortState *incrsortstate;
+
+ SO_printf("ExecInitIncrementalSort: initializing sort node\n");
+
+ /*
+ * Incremental sort can't be used with EXEC_FLAG_BACKWARD or
+ * EXEC_FLAG_MARK, because the current sort state contains only one sort
+ * batch rather than the full result set.
+ */
+ Assert((eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) == 0);
+
+ /* Initialize state structure. */
+ incrsortstate = makeNode(IncrementalSortState);
+ incrsortstate->ss.ps.plan = (Plan *) node;
+ incrsortstate->ss.ps.state = estate;
+ incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort;
+
+ incrsortstate->execution_status = INCSORT_LOADFULLSORT;
+ incrsortstate->bounded = false;
+ incrsortstate->outerNodeDone = false;
+ incrsortstate->bound_Done = 0;
+ incrsortstate->fullsort_state = NULL;
+ incrsortstate->prefixsort_state = NULL;
+ incrsortstate->group_pivot = NULL;
+ incrsortstate->transfer_tuple = NULL;
+ incrsortstate->n_fullsort_remaining = 0;
+ incrsortstate->presorted_keys = NULL;
+
+ if (incrsortstate->ss.ps.instrument != NULL)
+ {
+ IncrementalSortGroupInfo *fullsortGroupInfo =
+ &incrsortstate->incsort_info.fullsortGroupInfo;
+ IncrementalSortGroupInfo *prefixsortGroupInfo =
+ &incrsortstate->incsort_info.prefixsortGroupInfo;
+
+ fullsortGroupInfo->groupCount = 0;
+ fullsortGroupInfo->maxDiskSpaceUsed = 0;
+ fullsortGroupInfo->totalDiskSpaceUsed = 0;
+ fullsortGroupInfo->maxMemorySpaceUsed = 0;
+ fullsortGroupInfo->totalMemorySpaceUsed = 0;
+ fullsortGroupInfo->sortMethods = 0;
+ prefixsortGroupInfo->groupCount = 0;
+ prefixsortGroupInfo->maxDiskSpaceUsed = 0;
+ prefixsortGroupInfo->totalDiskSpaceUsed = 0;
+ prefixsortGroupInfo->maxMemorySpaceUsed = 0;
+ prefixsortGroupInfo->totalMemorySpaceUsed = 0;
+ prefixsortGroupInfo->sortMethods = 0;
+ }
+
+ /*
+ * Miscellaneous initialization
+ *
+ * Sort nodes don't initialize their ExprContexts because they never call
+ * ExecQual or ExecProject.
+ */
+
+ /*
+ * Initialize child nodes.
+ *
+ * Incremental sort does not support backwards scans and mark/restore, so
+ * we don't bother removing the flags from eflags here. We allow passing a
+ * REWIND flag, because although incremental sort can't use it, the child
+ * nodes may be able to do something more useful.
+ */
+ outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * Initialize scan slot and type.
+ */
+ ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple);
+
+ /*
+ * Initialize return slot and type. No need to initialize projection info
+ * because we don't do any projections.
+ */
+ ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple);
+ incrsortstate->ss.ps.ps_ProjInfo = NULL;
+
+ /*
+ * Initialize standalone slots to store a tuple for pivot prefix keys and
+ * for carrying over a tuple from one batch to the next.
+ */
+ incrsortstate->group_pivot =
+ MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)),
+ &TTSOpsMinimalTuple);
+ incrsortstate->transfer_tuple =
+ MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)),
+ &TTSOpsMinimalTuple);
+
+ SO_printf("ExecInitIncrementalSort: sort node initialized\n");
+
+ return incrsortstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndIncrementalSort(node)
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndIncrementalSort(IncrementalSortState *node)
+{
+ SO_printf("ExecEndIncrementalSort: shutting down sort node\n");
+
+ /* clean out the scan tuple */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+ /* must drop pointer to sort result tuple */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ /* must drop standalone tuple slots from outer node */
+ ExecDropSingleTupleTableSlot(node->group_pivot);
+ ExecDropSingleTupleTableSlot(node->transfer_tuple);
+
+ /*
+ * Release tuplesort resources.
+ */
+ if (node->fullsort_state != NULL)
+ {
+ tuplesort_end(node->fullsort_state);
+ node->fullsort_state = NULL;
+ }
+ if (node->prefixsort_state != NULL)
+ {
+ tuplesort_end(node->prefixsort_state);
+ node->prefixsort_state = NULL;
+ }
+
+ /*
+ * Shut down the subplan.
+ */
+ ExecEndNode(outerPlanState(node));
+
+ SO_printf("ExecEndIncrementalSort: sort node shutdown\n");
+}
+
+void
+ExecReScanIncrementalSort(IncrementalSortState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ /*
+ * Incremental sort doesn't support efficient rescan even when parameters
+ * haven't changed (e.g., rewind) because unlike regular sort we don't
+ * store all tuples at once for the full sort.
+ *
+ * So even if EXEC_FLAG_REWIND is set we just reset all of our state and
+ * re-execute the sort along with the child node. Incremental sort itself
+ * can't do anything smarter, but maybe the child nodes can.
+ *
+ * In theory if we've only filled the full sort with one batch (and
+ * haven't reset it for a new batch yet) then we could efficiently rewind,
+ * but that seems a narrow enough case that it's not worth handling
+ * specially at this time.
+ */
+
+ /* must drop pointer to sort result tuple */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ if (node->group_pivot != NULL)
+ ExecClearTuple(node->group_pivot);
+ if (node->transfer_tuple != NULL)
+ ExecClearTuple(node->transfer_tuple);
+
+ node->outerNodeDone = false;
+ node->n_fullsort_remaining = 0;
+ node->bound_Done = 0;
+ node->presorted_keys = NULL;
+
+ node->execution_status = INCSORT_LOADFULLSORT;
+
+ /*
+ * If either of the sort states has already been set up, we need to reset it.
+ * We could end them and null out the pointers, but there's no reason to
+ * repay the setup cost, and because ExecIncrementalSort guards presorted
+ * column functions by checking to see if the full sort state has been
+ * initialized yet, setting the sort states to null here might actually
+ * cause a leak.
+ */
+ if (node->fullsort_state != NULL)
+ {
+ tuplesort_reset(node->fullsort_state);
+ node->fullsort_state = NULL;
+ }
+ if (node->prefixsort_state != NULL)
+ {
+ tuplesort_reset(node->prefixsort_state);
+ node->prefixsort_state = NULL;
+ }
+
+ /*
+ * If chgParam of subnode is not null, then the plan will be re-scanned by
+ * the first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
+
+/* ----------------------------------------------------------------
+ * Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecIncrementalSortEstimate
+ *
+ * Estimate space required to propagate sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo));
+ size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo));
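+
+ /*
+ * For example, with pcxt->nworkers == 4 this reserves room for the
+ * SharedIncrementalSortInfo header plus four IncrementalSortInfo
+ * slots, one per worker.
+ */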
+ shm_toc_estimate_chunk(&pcxt->estimator, size);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIncrementalSortInitializeDSM
+ *
+ * Initialize DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = offsetof(SharedIncrementalSortInfo, sinfo)
+ + pcxt->nworkers * sizeof(IncrementalSortInfo);
+ node->shared_info = shm_toc_allocate(pcxt->toc, size);
+ /* ensure any unfilled slots will contain zeroes */
+ memset(node->shared_info, 0, size);
+ node->shared_info->num_workers = pcxt->nworkers;
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+ node->shared_info);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIncrementalSortInitializeWorker
+ *
+ * Attach worker to DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt)
+{
+ node->shared_info =
+ shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+ node->am_worker = true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecIncrementalSortRetrieveInstrumentation
+ *
+ * Transfer sort statistics from DSM to private memory.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node)
+{
+ Size size;
+ SharedIncrementalSortInfo *si;
+
+ if (node->shared_info == NULL)
+ return;
+
+ size = offsetof(SharedIncrementalSortInfo, sinfo)
+ + node->shared_info->num_workers * sizeof(IncrementalSortInfo);
+ si = palloc(size);
+ memcpy(si, node->shared_info, size);
+ node->shared_info = si;
+}
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
new file mode 100644
index 0000000..8fee958
--- /dev/null
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -0,0 +1,735 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIndexonlyscan.c
+ * Routines to support index-only scans
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeIndexonlyscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecIndexOnlyScan scans an index
+ * IndexOnlyNext retrieve next tuple
+ * ExecInitIndexOnlyScan creates and initializes state info.
+ * ExecReScanIndexOnlyScan rescans the indexed relation.
+ * ExecEndIndexOnlyScan releases all storage.
+ * ExecIndexOnlyMarkPos marks scan position.
+ * ExecIndexOnlyRestrPos restores scan position.
+ * ExecIndexOnlyScanEstimate estimates DSM space needed for
+ * parallel index-only scan
+ * ExecIndexOnlyScanInitializeDSM initialize DSM for parallel
+ * index-only scan
+ * ExecIndexOnlyScanReInitializeDSM reinitialize DSM for fresh scan
+ * ExecIndexOnlyScanInitializeWorker attach to DSM info in parallel worker
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/tupdesc.h"
+#include "access/visibilitymap.h"
+#include "executor/execdebug.h"
+#include "executor/nodeIndexonlyscan.h"
+#include "executor/nodeIndexscan.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/predicate.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
+static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
+ TupleDesc itupdesc);
+
+
+/* ----------------------------------------------------------------
+ * IndexOnlyNext
+ *
+ * Retrieve a tuple from the IndexOnlyScan node's index.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+IndexOnlyNext(IndexOnlyScanState *node)
+{
+ EState *estate;
+ ExprContext *econtext;
+ ScanDirection direction;
+ IndexScanDesc scandesc;
+ TupleTableSlot *slot;
+ ItemPointer tid;
+
+ /*
+ * extract necessary information from index scan node
+ */
+ estate = node->ss.ps.state;
+ direction = estate->es_direction;
+ /* flip direction if this is an overall backward scan */
+ if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
+ {
+ if (ScanDirectionIsForward(direction))
+ direction = BackwardScanDirection;
+ else if (ScanDirectionIsBackward(direction))
+ direction = ForwardScanDirection;
+ }
+ scandesc = node->ioss_ScanDesc;
+ econtext = node->ss.ps.ps_ExprContext;
+ slot = node->ss.ss_ScanTupleSlot;
+
+ if (scandesc == NULL)
+ {
+ /*
+ * We reach here if the index only scan is not parallel, or if we're
+ * serially executing an index only scan that was planned to be
+ * parallel.
+ */
+ scandesc = index_beginscan(node->ss.ss_currentRelation,
+ node->ioss_RelationDesc,
+ estate->es_snapshot,
+ node->ioss_NumScanKeys,
+ node->ioss_NumOrderByKeys);
+
+ node->ioss_ScanDesc = scandesc;
+
+ /* Set it up for index-only scan */
+ node->ioss_ScanDesc->xs_want_itup = true;
+ node->ioss_VMBuffer = InvalidBuffer;
+
+ /*
+ * If no run-time keys to calculate or they are ready, go ahead and
+ * pass the scankeys to the index AM.
+ */
+ if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
+ index_rescan(scandesc,
+ node->ioss_ScanKeys,
+ node->ioss_NumScanKeys,
+ node->ioss_OrderByKeys,
+ node->ioss_NumOrderByKeys);
+ }
+
+ /*
+ * OK, now that we have what we need, fetch the next tuple.
+ */
+ while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+ {
+ bool tuple_from_heap = false;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * We can skip the heap fetch if the TID references a heap page on
+ * which all tuples are known visible to everybody. In any case,
+ * we'll use the index tuple not the heap tuple as the data source.
+ *
+ * Note on Memory Ordering Effects: visibilitymap_get_status does not
+ * lock the visibility map buffer, and therefore the result we read
+ * here could be slightly stale. However, it can't be stale enough to
+ * matter.
+ *
+ * We need to detect clearing a VM bit due to an insert right away,
+ * because the tuple is present in the index page but not visible. The
+ * reading of the TID by this scan (using a shared lock on the index
+ * buffer) is serialized with the insert of the TID into the index
+ * (using an exclusive lock on the index buffer). Because the VM bit
+ * is cleared before updating the index, and locking/unlocking of the
+ * index page acts as a full memory barrier, we are sure to see the
+ * cleared bit if we see a recently-inserted TID.
+ *
+ * Deletes do not update the index page (only VACUUM will clear out
+ * the TID), so the clearing of the VM bit by a delete is not
+ * serialized with this test below, and we may see a value that is
+ * significantly stale. However, we don't care about the delete right
+ * away, because the tuple is still visible until the deleting
+ * transaction commits or the statement ends (if it's our
+ * transaction). In either case, the lock on the VM buffer will have
+ * been released (acting as a write barrier) after clearing the bit.
+ * And for us to have a snapshot that includes the deleting
+ * transaction (making the tuple invisible), we must have acquired
+ * ProcArrayLock after that time, acting as a read barrier.
+ *
+ * It's worth going through this complexity to avoid needing to lock
+ * the VM buffer, which could cause significant contention.
+ */
+ if (!VM_ALL_VISIBLE(scandesc->heapRelation,
+ ItemPointerGetBlockNumber(tid),
+ &node->ioss_VMBuffer))
+ {
+ /*
+ * Rats, we have to visit the heap to check visibility.
+ */
+ InstrCountTuples2(node, 1);
+ if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+ continue; /* no visible tuple, try next index entry */
+
+ ExecClearTuple(node->ioss_TableSlot);
+
+ /*
+ * Only MVCC snapshots are supported here, so there should be no
+ * need to keep following the HOT chain once a visible entry has
+ * been found. If we did want to allow that, we'd need to keep
+ * more state to remember not to call index_getnext_tid next time.
+ */
+ if (scandesc->xs_heap_continue)
+ elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+ /*
+ * Note: at this point we are holding a pin on the heap page, as
+ * recorded in scandesc->xs_cbuf. We could release that pin now,
+ * but it's not clear whether it's a win to do so. The next index
+ * entry might require a visit to the same heap page.
+ */
+
+ tuple_from_heap = true;
+ }
+
+ /*
+ * Fill the scan tuple slot with data from the index. This might be
+ * provided in either HeapTuple or IndexTuple format. Conceivably an
+ * index AM might fill both fields, in which case we prefer the heap
+ * format, since it's probably a bit cheaper to fill a slot from.
+ */
+ if (scandesc->xs_hitup)
+ {
+ /*
+ * We don't take the trouble to verify that the provided tuple has
+ * exactly the slot's format, but it seems worth doing a quick
+ * check on the number of fields.
+ */
+ Assert(slot->tts_tupleDescriptor->natts ==
+ scandesc->xs_hitupdesc->natts);
+ ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
+ }
+ else if (scandesc->xs_itup)
+ StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
+ else
+ elog(ERROR, "no data returned for index-only scan");
+
+ /*
+ * If the index was lossy, we have to recheck the index quals.
+ */
+ if (scandesc->xs_recheck)
+ {
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->recheckqual, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ continue;
+ }
+ }
+
+ /*
+ * We don't currently support rechecking ORDER BY distances. (In
+ * principle, if the index can support retrieval of the originally
+ * indexed value, it should be able to produce an exact distance
+ * calculation too. So it's not clear that adding code here for
+ * recheck/re-sort would be worth the trouble. But we should at least
+ * throw an error if someone tries it.)
+ */
+ if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("lossy distance functions are not supported in index-only scans")));
+
+ /*
+ * If we didn't access the heap, then we'll need to take a predicate
+ * lock explicitly, as if we had. For now we do that at page level.
+ */
+ if (!tuple_from_heap)
+ PredicateLockPage(scandesc->heapRelation,
+ ItemPointerGetBlockNumber(tid),
+ estate->es_snapshot);
+
+ return slot;
+ }
+
+ /*
+ * if we get here it means the index scan failed so we are at the end of
+ * the scan.
+ */
+ return ExecClearTuple(slot);
+}
+
+/*
+ * StoreIndexTuple
+ * Fill the slot with data from the index tuple.
+ *
+ * At some point this might be generally-useful functionality, but
+ * right now we don't need it elsewhere.
+ */
+static void
+StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
+{
+ /*
+ * Note: we must use the tupdesc supplied by the AM in index_deform_tuple,
+ * not the slot's tupdesc, in case the latter has different datatypes
+ * (this happens for btree name_ops in particular). They'd better have
+ * the same number of columns though, as well as being datatype-compatible
+ * which is something we can't so easily check.
+ */
+ Assert(slot->tts_tupleDescriptor->natts == itupdesc->natts);
+
+ ExecClearTuple(slot);
+ index_deform_tuple(itup, itupdesc, slot->tts_values, slot->tts_isnull);
+ ExecStoreVirtualTuple(slot);
+}
+
+/*
+ * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ *
+ * This can't really happen, since an index can't supply CTID which would
+ * be necessary data for any potential EvalPlanQual target relation. If it
+ * did happen, the EPQ code would pass us the wrong data, namely a heap
+ * tuple not an index tuple. So throw an error.
+ */
+static bool
+IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
+{
+ elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
+ return false; /* keep compiler quiet */
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexOnlyScan(node)
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecIndexOnlyScan(PlanState *pstate)
+{
+ IndexOnlyScanState *node = castNode(IndexOnlyScanState, pstate);
+
+ /*
+ * If we have runtime keys and they've not already been set up, do it now.
+ */
+ if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
+ ExecReScan((PlanState *) node);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) IndexOnlyNext,
+ (ExecScanRecheckMtd) IndexOnlyRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanIndexOnlyScan(node)
+ *
+ * Recalculates the values of any scan keys whose value depends on
+ * information known at runtime, then rescans the indexed relation.
+ *
+ * Updating the scan key was formerly done separately in
+ * ExecUpdateIndexScanKeys. Integrating it into ReScan makes
+ * rescans of indices and relations/general streams more uniform.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
+{
+ /*
+ * If we are doing runtime key calculations (ie, any of the index key
+ * values weren't simple Consts), compute the new key values. But first,
+ * reset the context so we don't leak memory as each outer tuple is
+ * scanned. Note this assumes that we will recalculate *all* runtime keys
+ * on each call.
+ */
+ if (node->ioss_NumRuntimeKeys != 0)
+ {
+ ExprContext *econtext = node->ioss_RuntimeContext;
+
+ ResetExprContext(econtext);
+ ExecIndexEvalRuntimeKeys(econtext,
+ node->ioss_RuntimeKeys,
+ node->ioss_NumRuntimeKeys);
+ }
+ node->ioss_RuntimeKeysReady = true;
+
+ /* reset index scan */
+ if (node->ioss_ScanDesc)
+ index_rescan(node->ioss_ScanDesc,
+ node->ioss_ScanKeys, node->ioss_NumScanKeys,
+ node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
+
+ ExecScanReScan(&node->ss);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecEndIndexOnlyScan
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndIndexOnlyScan(IndexOnlyScanState *node)
+{
+ Relation indexRelationDesc;
+ IndexScanDesc indexScanDesc;
+
+ /*
+ * extract information from the node
+ */
+ indexRelationDesc = node->ioss_RelationDesc;
+ indexScanDesc = node->ioss_ScanDesc;
+
+ /* Release VM buffer pin, if any. */
+ if (node->ioss_VMBuffer != InvalidBuffer)
+ {
+ ReleaseBuffer(node->ioss_VMBuffer);
+ node->ioss_VMBuffer = InvalidBuffer;
+ }
+
+ /*
+ * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
+ */
+#ifdef NOT_USED
+ ExecFreeExprContext(&node->ss.ps);
+ if (node->ioss_RuntimeContext)
+ FreeExprContext(node->ioss_RuntimeContext, true);
+#endif
+
+ /*
+ * clear out tuple table slots
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * close the index relation (no-op if we didn't open it)
+ */
+ if (indexScanDesc)
+ index_endscan(indexScanDesc);
+ if (indexRelationDesc)
+ index_close(indexRelationDesc, NoLock);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexOnlyMarkPos
+ *
+ * Note: we assume that no caller attempts to set a mark before having read
+ * at least one tuple. Otherwise, ioss_ScanDesc might still be NULL.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
+{
+ EState *estate = node->ss.ps.state;
+ EPQState *epqstate = estate->es_epq_active;
+
+ if (epqstate != NULL)
+ {
+ /*
+ * We are inside an EvalPlanQual recheck. If a test tuple exists for
+ * this relation, then we shouldn't access the index at all. We would
+ * instead need to save, and later restore, the state of the
+ * relsubs_done flag, so that re-fetching the test tuple is possible.
+ * However, given the assumption that no caller sets a mark at the
+ * start of the scan, we can only get here with relsubs_done[i]
+ * already set, and so no state need be saved.
+ */
+ Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;
+
+ Assert(scanrelid > 0);
+ if (epqstate->relsubs_slot[scanrelid - 1] != NULL ||
+ epqstate->relsubs_rowmark[scanrelid - 1] != NULL)
+ {
+ /* Verify the claim above */
+ if (!epqstate->relsubs_done[scanrelid - 1])
+ elog(ERROR, "unexpected ExecIndexOnlyMarkPos call in EPQ recheck");
+ return;
+ }
+ }
+
+ index_markpos(node->ioss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexOnlyRestrPos
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
+{
+ EState *estate = node->ss.ps.state;
+ EPQState *epqstate = estate->es_epq_active;
+
+ if (estate->es_epq_active != NULL)
+ {
+ /* See comments in ExecIndexOnlyMarkPos */
+ Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;
+
+ Assert(scanrelid > 0);
+ if (epqstate->relsubs_slot[scanrelid - 1] != NULL ||
+ epqstate->relsubs_rowmark[scanrelid - 1] != NULL)
+ {
+ /* Verify the claim above */
+ if (!epqstate->relsubs_done[scanrelid - 1])
+ elog(ERROR, "unexpected ExecIndexOnlyRestrPos call in EPQ recheck");
+ return;
+ }
+ }
+
+ index_restrpos(node->ioss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitIndexOnlyScan
+ *
+ * Initializes the index scan's state information, creates
+ * scan keys, and opens the base and index relations.
+ *
+ * Note: index scans have 2 sets of state information because
+ * we have to keep track of the base relation and the
+ * index relation.
+ * ----------------------------------------------------------------
+ */
+IndexOnlyScanState *
+ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
+{
+ IndexOnlyScanState *indexstate;
+ Relation currentRelation;
+ LOCKMODE lockmode;
+ TupleDesc tupDesc;
+
+ /*
+ * create state structure
+ */
+ indexstate = makeNode(IndexOnlyScanState);
+ indexstate->ss.ps.plan = (Plan *) node;
+ indexstate->ss.ps.state = estate;
+ indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &indexstate->ss.ps);
+
+ /*
+ * open the scan relation
+ */
+ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
+
+ indexstate->ss.ss_currentRelation = currentRelation;
+ indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */
+
+ /*
+ * Build the scan tuple type using the indextlist generated by the
+ * planner. We use this, rather than the index's physical tuple
+ * descriptor, because the latter contains storage column types not the
+ * types of the original datums. (It's the AM's responsibility to return
+ * suitable data anyway.)
+ */
+ tupDesc = ExecTypeFromTL(node->indextlist);
+ ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc,
+ &TTSOpsVirtual);
+
+ /*
+ * We need another slot, in a format that's suitable for the table AM, for
+ * when we need to fetch a tuple from the table for rechecking visibility.
+ */
+ indexstate->ioss_TableSlot =
+ ExecAllocTableSlot(&estate->es_tupleTable,
+ RelationGetDescr(currentRelation),
+ table_slot_callbacks(currentRelation));
+
+ /*
+ * Initialize result type and projection info. The node's targetlist will
+ * contain Vars with varno = INDEX_VAR, referencing the scan tuple.
+ */
+ ExecInitResultTypeTL(&indexstate->ss.ps);
+ ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR);
+
+ /*
+ * initialize child expressions
+ *
+ * Note: we don't initialize all of the indexorderby expression, only the
+ * sub-parts corresponding to runtime keys (see below).
+ */
+ indexstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate);
+ indexstate->recheckqual =
+ ExecInitQual(node->recheckqual, (PlanState *) indexstate);
+
+ /*
+ * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
+ * here. This allows an index-advisor plugin to EXPLAIN a plan containing
+ * references to nonexistent indexes.
+ */
+ if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
+ return indexstate;
+
+ /* Open the index relation. */
+ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
+ indexstate->ioss_RelationDesc = index_open(node->indexid, lockmode);
+
+ /*
+ * Initialize index-specific scan state
+ */
+ indexstate->ioss_RuntimeKeysReady = false;
+ indexstate->ioss_RuntimeKeys = NULL;
+ indexstate->ioss_NumRuntimeKeys = 0;
+
+ /*
+ * build the index scan keys from the index qualification
+ */
+ ExecIndexBuildScanKeys((PlanState *) indexstate,
+ indexstate->ioss_RelationDesc,
+ node->indexqual,
+ false,
+ &indexstate->ioss_ScanKeys,
+ &indexstate->ioss_NumScanKeys,
+ &indexstate->ioss_RuntimeKeys,
+ &indexstate->ioss_NumRuntimeKeys,
+ NULL, /* no ArrayKeys */
+ NULL);
+
+ /*
+ * any ORDER BY exprs have to be turned into scankeys in the same way
+ */
+ ExecIndexBuildScanKeys((PlanState *) indexstate,
+ indexstate->ioss_RelationDesc,
+ node->indexorderby,
+ true,
+ &indexstate->ioss_OrderByKeys,
+ &indexstate->ioss_NumOrderByKeys,
+ &indexstate->ioss_RuntimeKeys,
+ &indexstate->ioss_NumRuntimeKeys,
+ NULL, /* no ArrayKeys */
+ NULL);
+
+ /*
+ * If we have runtime keys, we need an ExprContext to evaluate them. The
+ * node's standard context won't do because we want to reset that context
+ * for every tuple. So, build another context just like the other one...
+ * -tgl 7/11/00
+ */
+ if (indexstate->ioss_NumRuntimeKeys != 0)
+ {
+ ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
+
+ ExecAssignExprContext(estate, &indexstate->ss.ps);
+ indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
+ indexstate->ss.ps.ps_ExprContext = stdecontext;
+ }
+ else
+ {
+ indexstate->ioss_RuntimeContext = NULL;
+ }
+
+ /*
+ * all done.
+ */
+ return indexstate;
+}
+
+/* ----------------------------------------------------------------
+ * Parallel Index-only Scan Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecIndexOnlyScanEstimate
+ *
+ * Compute the amount of space we'll need in the parallel
+ * query DSM, and inform pcxt->estimator about our needs.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyScanEstimate(IndexOnlyScanState *node,
+ ParallelContext *pcxt)
+{
+ EState *estate = node->ss.ps.state;
+
+ node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc,
+ estate->es_snapshot);
+ shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexOnlyScanInitializeDSM
+ *
+ * Set up a parallel index-only scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
+ ParallelContext *pcxt)
+{
+ EState *estate = node->ss.ps.state;
+ ParallelIndexScanDesc piscan;
+
+ piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen);
+ index_parallelscan_initialize(node->ss.ss_currentRelation,
+ node->ioss_RelationDesc,
+ estate->es_snapshot,
+ piscan);
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
+ node->ioss_ScanDesc =
+ index_beginscan_parallel(node->ss.ss_currentRelation,
+ node->ioss_RelationDesc,
+ node->ioss_NumScanKeys,
+ node->ioss_NumOrderByKeys,
+ piscan);
+ node->ioss_ScanDesc->xs_want_itup = true;
+ node->ioss_VMBuffer = InvalidBuffer;
+
+ /*
+ * If no run-time keys to calculate or they are ready, go ahead and pass
+ * the scankeys to the index AM.
+ */
+ if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
+ index_rescan(node->ioss_ScanDesc,
+ node->ioss_ScanKeys, node->ioss_NumScanKeys,
+ node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexOnlyScanReInitializeDSM
+ *
+ * Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node,
+ ParallelContext *pcxt)
+{
+ index_parallelrescan(node->ioss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexOnlyScanInitializeWorker
+ *
+ * Copy relevant information from TOC into planstate.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
+ ParallelWorkerContext *pwcxt)
+{
+ ParallelIndexScanDesc piscan;
+
+ piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+ node->ioss_ScanDesc =
+ index_beginscan_parallel(node->ss.ss_currentRelation,
+ node->ioss_RelationDesc,
+ node->ioss_NumScanKeys,
+ node->ioss_NumOrderByKeys,
+ piscan);
+ node->ioss_ScanDesc->xs_want_itup = true;
+
+ /*
+ * If no run-time keys to calculate or they are ready, go ahead and pass
+ * the scankeys to the index AM.
+ */
+ if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
+ index_rescan(node->ioss_ScanDesc,
+ node->ioss_ScanKeys, node->ioss_NumScanKeys,
+ node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
+}
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
new file mode 100644
index 0000000..add29b3
--- /dev/null
+++ b/src/backend/executor/nodeIndexscan.c
@@ -0,0 +1,1747 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIndexscan.c
+ * Routines to support indexed scans of relations
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeIndexscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecIndexScan scans a relation using an index
+ * IndexNext retrieve next tuple using index
+ * IndexNextWithReorder same, but recheck ORDER BY expressions
+ * ExecInitIndexScan creates and initializes state info.
+ * ExecReScanIndexScan rescans the indexed relation.
+ * ExecEndIndexScan releases all storage.
+ * ExecIndexMarkPos marks scan position.
+ * ExecIndexRestrPos restores scan position.
+ * ExecIndexScanEstimate estimates DSM space needed for parallel index scan
+ * ExecIndexScanInitializeDSM initialize DSM for parallel indexscan
+ * ExecIndexScanReInitializeDSM reinitialize DSM for fresh scan
+ * ExecIndexScanInitializeWorker attach to DSM info in parallel worker
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "catalog/pg_am.h"
+#include "executor/execdebug.h"
+#include "executor/nodeIndexscan.h"
+#include "lib/pairingheap.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "utils/array.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/*
+ * When an ordering operator is used, tuples fetched from the index that
+ * need to be reordered are queued in a pairing heap, as ReorderTuples.
+ */
+typedef struct
+{
+ pairingheap_node ph_node;
+ HeapTuple htup;
+ Datum *orderbyvals;
+ bool *orderbynulls;
+} ReorderTuple;
+
+static TupleTableSlot *IndexNext(IndexScanState *node);
+static TupleTableSlot *IndexNextWithReorder(IndexScanState *node);
+static void EvalOrderByExpressions(IndexScanState *node, ExprContext *econtext);
+static bool IndexRecheck(IndexScanState *node, TupleTableSlot *slot);
+static int cmp_orderbyvals(const Datum *adist, const bool *anulls,
+ const Datum *bdist, const bool *bnulls,
+ IndexScanState *node);
+static int reorderqueue_cmp(const pairingheap_node *a,
+ const pairingheap_node *b, void *arg);
+static void reorderqueue_push(IndexScanState *node, TupleTableSlot *slot,
+ Datum *orderbyvals, bool *orderbynulls);
+static HeapTuple reorderqueue_pop(IndexScanState *node);
+
+
+/* ----------------------------------------------------------------
+ * IndexNext
+ *
+ * Retrieve a tuple from the IndexScan node's currentRelation
+ * using the index specified in the IndexScanState information.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+IndexNext(IndexScanState *node)
+{
+ EState *estate;
+ ExprContext *econtext;
+ ScanDirection direction;
+ IndexScanDesc scandesc;
+ TupleTableSlot *slot;
+
+ /*
+ * extract necessary information from index scan node
+ */
+ estate = node->ss.ps.state;
+ direction = estate->es_direction;
+ /* flip direction if this is an overall backward scan */
+ if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir))
+ {
+ if (ScanDirectionIsForward(direction))
+ direction = BackwardScanDirection;
+ else if (ScanDirectionIsBackward(direction))
+ direction = ForwardScanDirection;
+ }
+ scandesc = node->iss_ScanDesc;
+ econtext = node->ss.ps.ps_ExprContext;
+ slot = node->ss.ss_ScanTupleSlot;
+
+ if (scandesc == NULL)
+ {
+ /*
+ * We reach here if the index scan is not parallel, or if we're
+ * serially executing an index scan that was planned to be parallel.
+ */
+ scandesc = index_beginscan(node->ss.ss_currentRelation,
+ node->iss_RelationDesc,
+ estate->es_snapshot,
+ node->iss_NumScanKeys,
+ node->iss_NumOrderByKeys);
+
+ node->iss_ScanDesc = scandesc;
+
+ /*
+ * If no run-time keys to calculate or they are ready, go ahead and
+ * pass the scankeys to the index AM.
+ */
+ if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady)
+ index_rescan(scandesc,
+ node->iss_ScanKeys, node->iss_NumScanKeys,
+ node->iss_OrderByKeys, node->iss_NumOrderByKeys);
+ }
+
+ /*
+ * ok, now that we have what we need, fetch the next tuple.
+ */
+ while (index_getnext_slot(scandesc, direction, slot))
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * If the index was lossy, we have to recheck the index quals using
+ * the fetched tuple.
+ */
+ if (scandesc->xs_recheck)
+ {
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->indexqualorig, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ continue;
+ }
+ }
+
+ return slot;
+ }
+
+ /*
+ * if we get here it means the index scan failed so we are at the end of
+ * the scan.
+ */
+ node->iss_ReachedEnd = true;
+ return ExecClearTuple(slot);
+}
+
+/* ----------------------------------------------------------------
+ * IndexNextWithReorder
+ *
+ * Like IndexNext, but this version can also re-check ORDER BY
+ * expressions, and reorder the tuples as necessary.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+IndexNextWithReorder(IndexScanState *node)
+{
+ EState *estate;
+ ExprContext *econtext;
+ IndexScanDesc scandesc;
+ TupleTableSlot *slot;
+ ReorderTuple *topmost = NULL;
+ bool was_exact;
+ Datum *lastfetched_vals;
+ bool *lastfetched_nulls;
+ int cmp;
+
+ estate = node->ss.ps.state;
+
+ /*
+ * Only forward scan is supported with reordering. Note: we can get away
+ * with just Asserting here because the system will not try to run the
+ * plan backwards if ExecSupportsBackwardScan() says it won't work.
+ * Currently, that is guaranteed because no index AMs support both
+ * amcanorderbyop and amcanbackward; if any ever do,
+ * ExecSupportsBackwardScan() will need to consider indexorderbys
+ * explicitly.
+ */
+ Assert(!ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir));
+ Assert(ScanDirectionIsForward(estate->es_direction));
+
+ scandesc = node->iss_ScanDesc;
+ econtext = node->ss.ps.ps_ExprContext;
+ slot = node->ss.ss_ScanTupleSlot;
+
+ if (scandesc == NULL)
+ {
+ /*
+ * We reach here if the index scan is not parallel, or if we're
+ * serially executing an index scan that was planned to be parallel.
+ */
+ scandesc = index_beginscan(node->ss.ss_currentRelation,
+ node->iss_RelationDesc,
+ estate->es_snapshot,
+ node->iss_NumScanKeys,
+ node->iss_NumOrderByKeys);
+
+ node->iss_ScanDesc = scandesc;
+
+ /*
+ * If no run-time keys to calculate or they are ready, go ahead and
+ * pass the scankeys to the index AM.
+ */
+ if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady)
+ index_rescan(scandesc,
+ node->iss_ScanKeys, node->iss_NumScanKeys,
+ node->iss_OrderByKeys, node->iss_NumOrderByKeys);
+ }
+
+ for (;;)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Check the reorder queue first. If the topmost tuple in the queue
+ * has an ORDER BY value smaller than (or equal to) the value last
+ * returned by the index, we can return it now.
+ */
+ if (!pairingheap_is_empty(node->iss_ReorderQueue))
+ {
+ topmost = (ReorderTuple *) pairingheap_first(node->iss_ReorderQueue);
+
+ if (node->iss_ReachedEnd ||
+ cmp_orderbyvals(topmost->orderbyvals,
+ topmost->orderbynulls,
+ scandesc->xs_orderbyvals,
+ scandesc->xs_orderbynulls,
+ node) <= 0)
+ {
+ HeapTuple tuple;
+
+ tuple = reorderqueue_pop(node);
+
+ /* Pass 'true', as the tuple in the queue is a palloc'd copy */
+ ExecForceStoreHeapTuple(tuple, slot, true);
+ return slot;
+ }
+ }
+ else if (node->iss_ReachedEnd)
+ {
+ /* Queue is empty, and no more tuples from index. We're done. */
+ return ExecClearTuple(slot);
+ }
+
+ /*
+ * Fetch next tuple from the index.
+ */
+next_indextuple:
+ if (!index_getnext_slot(scandesc, ForwardScanDirection, slot))
+ {
+ /*
+ * No more tuples from the index. But we still need to drain any
+ * remaining tuples from the queue before we're done.
+ */
+ node->iss_ReachedEnd = true;
+ continue;
+ }
+
+ /*
+ * If the index was lossy, we have to recheck the index quals and
+ * ORDER BY expressions using the fetched tuple.
+ */
+ if (scandesc->xs_recheck)
+ {
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->indexqualorig, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ goto next_indextuple;
+ }
+ }
+
+ if (scandesc->xs_recheckorderby)
+ {
+ econtext->ecxt_scantuple = slot;
+ ResetExprContext(econtext);
+ EvalOrderByExpressions(node, econtext);
+
+ /*
+ * Was the ORDER BY value returned by the index accurate? The
+ * recheck flag means that the index can return inaccurate values,
+ * but then again, the value returned for any particular tuple
+ * could also be exactly correct. Compare the value returned by
+ * the index with the recalculated value. (If the value returned
+ * by the index happened to be exactly right, we can often avoid
+ * pushing the tuple to the queue, just to pop it back out again.)
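+ *
+ * For instance, an index that stores only bounding boxes might
+ * return the distance to the box as the ORDER BY value here, while
+ * the exact distance recomputed from the fetched tuple can come out
+ * larger.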
+ */
+ cmp = cmp_orderbyvals(node->iss_OrderByValues,
+ node->iss_OrderByNulls,
+ scandesc->xs_orderbyvals,
+ scandesc->xs_orderbynulls,
+ node);
+ if (cmp < 0)
+ elog(ERROR, "index returned tuples in wrong order");
+ else if (cmp == 0)
+ was_exact = true;
+ else
+ was_exact = false;
+ lastfetched_vals = node->iss_OrderByValues;
+ lastfetched_nulls = node->iss_OrderByNulls;
+ }
+ else
+ {
+ was_exact = true;
+ lastfetched_vals = scandesc->xs_orderbyvals;
+ lastfetched_nulls = scandesc->xs_orderbynulls;
+ }
+
+ /*
+ * Can we return this tuple immediately, or does it need to be pushed
+ * to the reorder queue? If the ORDER BY expression values returned
+ * by the index were inaccurate, we can't return it yet, because the
+ * next tuple from the index might need to come before this one. Also,
+ * we can't return it yet if there are any smaller tuples in the queue
+ * already.
+ */
+ if (!was_exact || (topmost && cmp_orderbyvals(lastfetched_vals,
+ lastfetched_nulls,
+ topmost->orderbyvals,
+ topmost->orderbynulls,
+ node) > 0))
+ {
+ /* Put this tuple to the queue */
+ reorderqueue_push(node, slot, lastfetched_vals, lastfetched_nulls);
+ continue;
+ }
+ else
+ {
+ /* Can return this tuple immediately. */
+ return slot;
+ }
+ }
+
+ /*
+ * if we get here it means the index scan failed so we are at the end of
+ * the scan..
+ */
+ return ExecClearTuple(slot);
+}
+
+/*
+ * Calculate the expressions in the ORDER BY clause, based on the heap tuple.
+ */
+static void
+EvalOrderByExpressions(IndexScanState *node, ExprContext *econtext)
+{
+ int i;
+ ListCell *l;
+ MemoryContext oldContext;
+
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ i = 0;
+ foreach(l, node->indexorderbyorig)
+ {
+ ExprState *orderby = (ExprState *) lfirst(l);
+
+ node->iss_OrderByValues[i] = ExecEvalExpr(orderby,
+ econtext,
+ &node->iss_OrderByNulls[i]);
+ i++;
+ }
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * IndexRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+IndexRecheck(IndexScanState *node, TupleTableSlot *slot)
+{
+ ExprContext *econtext;
+
+ /*
+ * extract necessary information from index scan node
+ */
+ econtext = node->ss.ps.ps_ExprContext;
+
+ /* Does the tuple meet the indexqual condition? */
+ econtext->ecxt_scantuple = slot;
+ return ExecQualAndReset(node->indexqualorig, econtext);
+}
+
+
+/*
+ * Compare ORDER BY expression values.
+ */
+static int
+cmp_orderbyvals(const Datum *adist, const bool *anulls,
+ const Datum *bdist, const bool *bnulls,
+ IndexScanState *node)
+{
+ int i;
+ int result;
+
+ for (i = 0; i < node->iss_NumOrderByKeys; i++)
+ {
+ SortSupport ssup = &node->iss_SortSupport[i];
+
+ /*
+ * Handle nulls. We only need to support NULLS LAST ordering, because
+ * match_pathkeys_to_index() doesn't consider indexorderby
+ * implementation otherwise.
+ */
+ if (anulls[i] && !bnulls[i])
+ return 1;
+ else if (!anulls[i] && bnulls[i])
+ return -1;
+ else if (anulls[i] && bnulls[i])
+ return 0;
+
+ result = ssup->comparator(adist[i], bdist[i], ssup);
+ if (result != 0)
+ return result;
+ }
+
+ return 0;
+}
+
+/*
+ * The pairing heap hands back its topmost (greatest) element, while a KNN
+ * search needs tuples in ascending distance order. That's why we invert the
+ * sort order.
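+ *
+ * For example, with hypothetical distances: if tuple A is at distance 1.0
+ * and tuple B at distance 2.0, cmp_orderbyvals(A, B) is negative; swapping
+ * the arguments below makes A compare as the "greater" element, so
+ * pairingheap_first() hands back the nearest tuple, which is exactly the
+ * one the scan must emit next.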
+ */
+static int
+reorderqueue_cmp(const pairingheap_node *a, const pairingheap_node *b,
+ void *arg)
+{
+ ReorderTuple *rta = (ReorderTuple *) a;
+ ReorderTuple *rtb = (ReorderTuple *) b;
+ IndexScanState *node = (IndexScanState *) arg;
+
+ /* exchange argument order to invert the sort order */
+ return cmp_orderbyvals(rtb->orderbyvals, rtb->orderbynulls,
+ rta->orderbyvals, rta->orderbynulls,
+ node);
+}
+
+/*
+ * Helper function to push a tuple to the reorder queue.
+ */
+static void
+reorderqueue_push(IndexScanState *node, TupleTableSlot *slot,
+ Datum *orderbyvals, bool *orderbynulls)
+{
+ IndexScanDesc scandesc = node->iss_ScanDesc;
+ EState *estate = node->ss.ps.state;
+ MemoryContext oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
+ ReorderTuple *rt;
+ int i;
+
+ rt = (ReorderTuple *) palloc(sizeof(ReorderTuple));
+ rt->htup = ExecCopySlotHeapTuple(slot);
+ rt->orderbyvals =
+ (Datum *) palloc(sizeof(Datum) * scandesc->numberOfOrderBys);
+ rt->orderbynulls =
+ (bool *) palloc(sizeof(bool) * scandesc->numberOfOrderBys);
+ for (i = 0; i < node->iss_NumOrderByKeys; i++)
+ {
+ if (!orderbynulls[i])
+ rt->orderbyvals[i] = datumCopy(orderbyvals[i],
+ node->iss_OrderByTypByVals[i],
+ node->iss_OrderByTypLens[i]);
+ else
+ rt->orderbyvals[i] = (Datum) 0;
+ rt->orderbynulls[i] = orderbynulls[i];
+ }
+ pairingheap_add(node->iss_ReorderQueue, &rt->ph_node);
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * Helper function to pop the next tuple from the reorder queue.
+ */
+static HeapTuple
+reorderqueue_pop(IndexScanState *node)
+{
+ HeapTuple result;
+ ReorderTuple *topmost;
+ int i;
+
+ topmost = (ReorderTuple *) pairingheap_remove_first(node->iss_ReorderQueue);
+
+ result = topmost->htup;
+ for (i = 0; i < node->iss_NumOrderByKeys; i++)
+ {
+ if (!node->iss_OrderByTypByVals[i] && !topmost->orderbynulls[i])
+ pfree(DatumGetPointer(topmost->orderbyvals[i]));
+ }
+ pfree(topmost->orderbyvals);
+ pfree(topmost->orderbynulls);
+ pfree(topmost);
+
+ return result;
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecIndexScan(node)
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecIndexScan(PlanState *pstate)
+{
+ IndexScanState *node = castNode(IndexScanState, pstate);
+
+ /*
+ * If we have runtime keys and they've not already been set up, do it now.
+ */
+ if (node->iss_NumRuntimeKeys != 0 && !node->iss_RuntimeKeysReady)
+ ExecReScan((PlanState *) node);
+
+ if (node->iss_NumOrderByKeys > 0)
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) IndexNextWithReorder,
+ (ExecScanRecheckMtd) IndexRecheck);
+ else
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) IndexNext,
+ (ExecScanRecheckMtd) IndexRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanIndexScan(node)
+ *
+ * Recalculates the values of any scan keys whose value depends on
+ * information known at runtime, then rescans the indexed relation.
+ *
+ * Updating the scan key was formerly done separately in
+ * ExecUpdateIndexScanKeys. Integrating it into ReScan makes
+ * rescans of indices and relations/general streams more uniform.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanIndexScan(IndexScanState *node)
+{
+ /*
+ * If we are doing runtime key calculations (ie, any of the index key
+ * values weren't simple Consts), compute the new key values. But first,
+ * reset the context so we don't leak memory as each outer tuple is
+ * scanned. Note this assumes that we will recalculate *all* runtime keys
+ * on each call.
+ */
+ if (node->iss_NumRuntimeKeys != 0)
+ {
+ ExprContext *econtext = node->iss_RuntimeContext;
+
+ ResetExprContext(econtext);
+ ExecIndexEvalRuntimeKeys(econtext,
+ node->iss_RuntimeKeys,
+ node->iss_NumRuntimeKeys);
+ }
+ node->iss_RuntimeKeysReady = true;
+
+ /* flush the reorder queue */
+ if (node->iss_ReorderQueue)
+ {
+ HeapTuple tuple;
+ while (!pairingheap_is_empty(node->iss_ReorderQueue))
+ {
+ tuple = reorderqueue_pop(node);
+ heap_freetuple(tuple);
+ }
+ }
+
+ /* reset index scan */
+ if (node->iss_ScanDesc)
+ index_rescan(node->iss_ScanDesc,
+ node->iss_ScanKeys, node->iss_NumScanKeys,
+ node->iss_OrderByKeys, node->iss_NumOrderByKeys);
+ node->iss_ReachedEnd = false;
+
+ ExecScanReScan(&node->ss);
+}
+
+
+/*
+ * ExecIndexEvalRuntimeKeys
+ * Evaluate any runtime key values, and update the scankeys.
+ */
+void
+ExecIndexEvalRuntimeKeys(ExprContext *econtext,
+ IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys)
+{
+ int j;
+ MemoryContext oldContext;
+
+ /* We want to keep the key values in per-tuple memory */
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ for (j = 0; j < numRuntimeKeys; j++)
+ {
+ ScanKey scan_key = runtimeKeys[j].scan_key;
+ ExprState *key_expr = runtimeKeys[j].key_expr;
+ Datum scanvalue;
+ bool isNull;
+
+ /*
+ * For each run-time key, extract the run-time expression and evaluate
+ * it with respect to the current context. We then stick the result
+ * into the proper scan key.
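+ *
+ * For example (hypothetical plan): with an indexqual like
+ * "x = outer.y" on the inner side of a nestloop join, outer.y is
+ * a run-time key whose value is recomputed here for every outer
+ * row before the index is rescanned.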
+ *
+ * Note: the result of the eval could be a pass-by-ref value that's
+ * stored in some outer scan's tuple, not in
+ * econtext->ecxt_per_tuple_memory. We assume that the outer tuple
+ * will stay put throughout our scan. If this is wrong, we could copy
+ * the result into our context explicitly, but I think that's not
+ * necessary.
+ *
+ * It's also entirely possible that the result of the eval is a
+ * toasted value. In this case we should forcibly detoast it, to
+ * avoid repeat detoastings each time the value is examined by an
+ * index support function.
+ */
+ scanvalue = ExecEvalExpr(key_expr,
+ econtext,
+ &isNull);
+ if (isNull)
+ {
+ scan_key->sk_argument = scanvalue;
+ scan_key->sk_flags |= SK_ISNULL;
+ }
+ else
+ {
+ if (runtimeKeys[j].key_toastable)
+ scanvalue = PointerGetDatum(PG_DETOAST_DATUM(scanvalue));
+ scan_key->sk_argument = scanvalue;
+ scan_key->sk_flags &= ~SK_ISNULL;
+ }
+ }
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * ExecIndexEvalArrayKeys
+ * Evaluate any array key values, and set up to iterate through arrays.
+ *
+ * Returns true if there are array elements to consider; false means there
+ * is at least one null or empty array, so no match is possible. On true
+ * result, the scankeys are initialized with the first elements of the arrays.
+ */
+bool
+ExecIndexEvalArrayKeys(ExprContext *econtext,
+ IndexArrayKeyInfo *arrayKeys, int numArrayKeys)
+{
+ bool result = true;
+ int j;
+ MemoryContext oldContext;
+
+ /* We want to keep the arrays in per-tuple memory */
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ for (j = 0; j < numArrayKeys; j++)
+ {
+ ScanKey scan_key = arrayKeys[j].scan_key;
+ ExprState *array_expr = arrayKeys[j].array_expr;
+ Datum arraydatum;
+ bool isNull;
+ ArrayType *arrayval;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ int num_elems;
+ Datum *elem_values;
+ bool *elem_nulls;
+
+ /*
+ * Compute and deconstruct the array expression. (Notes in
+ * ExecIndexEvalRuntimeKeys() apply here too.)
+ */
+ arraydatum = ExecEvalExpr(array_expr,
+ econtext,
+ &isNull);
+ if (isNull)
+ {
+ result = false;
+ break; /* no point in evaluating more */
+ }
+ arrayval = DatumGetArrayTypeP(arraydatum);
+ /* We could cache this data, but not clear it's worth it */
+ get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
+ &elmlen, &elmbyval, &elmalign);
+ deconstruct_array(arrayval,
+ ARR_ELEMTYPE(arrayval),
+ elmlen, elmbyval, elmalign,
+ &elem_values, &elem_nulls, &num_elems);
+ if (num_elems <= 0)
+ {
+ result = false;
+ break; /* no point in evaluating more */
+ }
+
+ /*
+ * Note: we expect the previous array data, if any, to be
+ * automatically freed by resetting the per-tuple context; hence no
+ * pfree's here.
+ */
+ arrayKeys[j].elem_values = elem_values;
+ arrayKeys[j].elem_nulls = elem_nulls;
+ arrayKeys[j].num_elems = num_elems;
+ scan_key->sk_argument = elem_values[0];
+ if (elem_nulls[0])
+ scan_key->sk_flags |= SK_ISNULL;
+ else
+ scan_key->sk_flags &= ~SK_ISNULL;
+ arrayKeys[j].next_elem = 1;
+ }
+
+ MemoryContextSwitchTo(oldContext);
+
+ return result;
+}
+
+/*
+ * ExecIndexAdvanceArrayKeys
+ * Advance to the next set of array key values, if any.
+ *
+ * Returns true if there is another set of values to consider, false if not.
+ * On true result, the scankeys are initialized with the next set of values.
+ */
+bool
+ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys)
+{
+ bool found = false;
+ int j;
+
+ /*
+ * Note we advance the rightmost array key most quickly, since it will
+ * correspond to the lowest-order index column among the available
+ * qualifications. This is hypothesized to result in better locality of
+ * access in the index.
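+	 *
+	 * The iteration behaves like an odometer with the rightmost digit
+	 * spinning fastest: starting from (1,10) as set up by
+	 * ExecIndexEvalArrayKeys for two array keys holding {1,2} and {10,20},
+	 * successive calls advance the scankeys to (1,20), (2,10), (2,20) and
+	 * then report exhaustion.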
+ */
+ for (j = numArrayKeys - 1; j >= 0; j--)
+ {
+ ScanKey scan_key = arrayKeys[j].scan_key;
+ int next_elem = arrayKeys[j].next_elem;
+ int num_elems = arrayKeys[j].num_elems;
+ Datum *elem_values = arrayKeys[j].elem_values;
+ bool *elem_nulls = arrayKeys[j].elem_nulls;
+
+ if (next_elem >= num_elems)
+ {
+ next_elem = 0;
+ found = false; /* need to advance next array key */
+ }
+ else
+ found = true;
+ scan_key->sk_argument = elem_values[next_elem];
+ if (elem_nulls[next_elem])
+ scan_key->sk_flags |= SK_ISNULL;
+ else
+ scan_key->sk_flags &= ~SK_ISNULL;
+ arrayKeys[j].next_elem = next_elem + 1;
+ if (found)
+ break;
+ }
+
+ return found;
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecEndIndexScan
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndIndexScan(IndexScanState *node)
+{
+ Relation indexRelationDesc;
+ IndexScanDesc indexScanDesc;
+
+ /*
+ * extract information from the node
+ */
+ indexRelationDesc = node->iss_RelationDesc;
+ indexScanDesc = node->iss_ScanDesc;
+
+ /*
+ * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
+ */
+#ifdef NOT_USED
+ ExecFreeExprContext(&node->ss.ps);
+ if (node->iss_RuntimeContext)
+ FreeExprContext(node->iss_RuntimeContext, true);
+#endif
+
+ /*
+ * clear out tuple table slots
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * close the index relation (no-op if we didn't open it)
+ */
+ if (indexScanDesc)
+ index_endscan(indexScanDesc);
+ if (indexRelationDesc)
+ index_close(indexRelationDesc, NoLock);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexMarkPos
+ *
+ * Note: we assume that no caller attempts to set a mark before having read
+ * at least one tuple. Otherwise, iss_ScanDesc might still be NULL.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexMarkPos(IndexScanState *node)
+{
+ EState *estate = node->ss.ps.state;
+ EPQState *epqstate = estate->es_epq_active;
+
+ if (epqstate != NULL)
+ {
+ /*
+ * We are inside an EvalPlanQual recheck. If a test tuple exists for
+ * this relation, then we shouldn't access the index at all. We would
+ * instead need to save, and later restore, the state of the
+ * relsubs_done flag, so that re-fetching the test tuple is possible.
+ * However, given the assumption that no caller sets a mark at the
+ * start of the scan, we can only get here with relsubs_done[i]
+ * already set, and so no state need be saved.
+ */
+ Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;
+
+ Assert(scanrelid > 0);
+ if (epqstate->relsubs_slot[scanrelid - 1] != NULL ||
+ epqstate->relsubs_rowmark[scanrelid - 1] != NULL)
+ {
+ /* Verify the claim above */
+ if (!epqstate->relsubs_done[scanrelid - 1])
+ elog(ERROR, "unexpected ExecIndexMarkPos call in EPQ recheck");
+ return;
+ }
+ }
+
+ index_markpos(node->iss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexRestrPos
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexRestrPos(IndexScanState *node)
+{
+ EState *estate = node->ss.ps.state;
+ EPQState *epqstate = estate->es_epq_active;
+
+ if (estate->es_epq_active != NULL)
+ {
+ /* See comments in ExecIndexMarkPos */
+ Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;
+
+ Assert(scanrelid > 0);
+ if (epqstate->relsubs_slot[scanrelid - 1] != NULL ||
+ epqstate->relsubs_rowmark[scanrelid - 1] != NULL)
+ {
+ /* Verify the claim above */
+ if (!epqstate->relsubs_done[scanrelid - 1])
+ elog(ERROR, "unexpected ExecIndexRestrPos call in EPQ recheck");
+ return;
+ }
+ }
+
+ index_restrpos(node->iss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitIndexScan
+ *
+ * Initializes the index scan's state information, creates
+ * scan keys, and opens the base and index relations.
+ *
+ * Note: index scans have 2 sets of state information because
+ * we have to keep track of the base relation and the
+ * index relation.
+ * ----------------------------------------------------------------
+ */
+IndexScanState *
+ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
+{
+ IndexScanState *indexstate;
+ Relation currentRelation;
+ LOCKMODE lockmode;
+
+ /*
+ * create state structure
+ */
+ indexstate = makeNode(IndexScanState);
+ indexstate->ss.ps.plan = (Plan *) node;
+ indexstate->ss.ps.state = estate;
+ indexstate->ss.ps.ExecProcNode = ExecIndexScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &indexstate->ss.ps);
+
+ /*
+ * open the scan relation
+ */
+ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
+
+ indexstate->ss.ss_currentRelation = currentRelation;
+ indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */
+
+ /*
+ * get the scan type from the relation descriptor.
+ */
+ ExecInitScanTupleSlot(estate, &indexstate->ss,
+ RelationGetDescr(currentRelation),
+ table_slot_callbacks(currentRelation));
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&indexstate->ss.ps);
+ ExecAssignScanProjectionInfo(&indexstate->ss);
+
+ /*
+ * initialize child expressions
+ *
+ * Note: we don't initialize all of the indexqual expression, only the
+ * sub-parts corresponding to runtime keys (see below). Likewise for
+ * indexorderby, if any. But the indexqualorig expression is always
+ * initialized even though it will only be used in some uncommon cases ---
+ * would be nice to improve that. (Problem is that any SubPlans present
+ * in the expression must be found now...)
+ */
+ indexstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate);
+ indexstate->indexqualorig =
+ ExecInitQual(node->indexqualorig, (PlanState *) indexstate);
+ indexstate->indexorderbyorig =
+ ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate);
+
+ /*
+ * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
+ * here. This allows an index-advisor plugin to EXPLAIN a plan containing
+ * references to nonexistent indexes.
+ */
+ if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
+ return indexstate;
+
+ /* Open the index relation. */
+ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
+ indexstate->iss_RelationDesc = index_open(node->indexid, lockmode);
+
+ /*
+ * Initialize index-specific scan state
+ */
+ indexstate->iss_RuntimeKeysReady = false;
+ indexstate->iss_RuntimeKeys = NULL;
+ indexstate->iss_NumRuntimeKeys = 0;
+
+ /*
+ * build the index scan keys from the index qualification
+ */
+ ExecIndexBuildScanKeys((PlanState *) indexstate,
+ indexstate->iss_RelationDesc,
+ node->indexqual,
+ false,
+ &indexstate->iss_ScanKeys,
+ &indexstate->iss_NumScanKeys,
+ &indexstate->iss_RuntimeKeys,
+ &indexstate->iss_NumRuntimeKeys,
+ NULL, /* no ArrayKeys */
+ NULL);
+
+ /*
+ * any ORDER BY exprs have to be turned into scankeys in the same way
+ */
+ ExecIndexBuildScanKeys((PlanState *) indexstate,
+ indexstate->iss_RelationDesc,
+ node->indexorderby,
+ true,
+ &indexstate->iss_OrderByKeys,
+ &indexstate->iss_NumOrderByKeys,
+ &indexstate->iss_RuntimeKeys,
+ &indexstate->iss_NumRuntimeKeys,
+ NULL, /* no ArrayKeys */
+ NULL);
+
+ /* Initialize sort support, if we need to re-check ORDER BY exprs */
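+	/*
+	 * (This is the amcanorderbyop case, e.g. a GiST nearest-neighbor scan
+	 * driven by "ORDER BY pt <-> point '(0,0)'".  When the index can return
+	 * only approximate ordering values, the real ORDER BY values are
+	 * recomputed and tuples are kept in order via the reorder queue built
+	 * below.)
+	 */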
+ if (indexstate->iss_NumOrderByKeys > 0)
+ {
+ int numOrderByKeys = indexstate->iss_NumOrderByKeys;
+ int i;
+ ListCell *lco;
+ ListCell *lcx;
+
+ /*
+ * Prepare sort support, and look up the data type for each ORDER BY
+ * expression.
+ */
+ Assert(numOrderByKeys == list_length(node->indexorderbyops));
+ Assert(numOrderByKeys == list_length(node->indexorderbyorig));
+ indexstate->iss_SortSupport = (SortSupportData *)
+ palloc0(numOrderByKeys * sizeof(SortSupportData));
+ indexstate->iss_OrderByTypByVals = (bool *)
+ palloc(numOrderByKeys * sizeof(bool));
+ indexstate->iss_OrderByTypLens = (int16 *)
+ palloc(numOrderByKeys * sizeof(int16));
+ i = 0;
+ forboth(lco, node->indexorderbyops, lcx, node->indexorderbyorig)
+ {
+ Oid orderbyop = lfirst_oid(lco);
+ Node *orderbyexpr = (Node *) lfirst(lcx);
+ Oid orderbyType = exprType(orderbyexpr);
+ Oid orderbyColl = exprCollation(orderbyexpr);
+ SortSupport orderbysort = &indexstate->iss_SortSupport[i];
+
+ /* Initialize sort support */
+ orderbysort->ssup_cxt = CurrentMemoryContext;
+ orderbysort->ssup_collation = orderbyColl;
+ /* See cmp_orderbyvals() comments on NULLS LAST */
+ orderbysort->ssup_nulls_first = false;
+ /* ssup_attno is unused here and elsewhere */
+ orderbysort->ssup_attno = 0;
+ /* No abbreviation */
+ orderbysort->abbreviate = false;
+ PrepareSortSupportFromOrderingOp(orderbyop, orderbysort);
+
+ get_typlenbyval(orderbyType,
+ &indexstate->iss_OrderByTypLens[i],
+ &indexstate->iss_OrderByTypByVals[i]);
+ i++;
+ }
+
+ /* allocate arrays to hold the re-calculated distances */
+ indexstate->iss_OrderByValues = (Datum *)
+ palloc(numOrderByKeys * sizeof(Datum));
+ indexstate->iss_OrderByNulls = (bool *)
+ palloc(numOrderByKeys * sizeof(bool));
+
+ /* and initialize the reorder queue */
+ indexstate->iss_ReorderQueue = pairingheap_allocate(reorderqueue_cmp,
+ indexstate);
+ }
+
+ /*
+ * If we have runtime keys, we need an ExprContext to evaluate them. The
+ * node's standard context won't do because we want to reset that context
+ * for every tuple. So, build another context just like the other one...
+ * -tgl 7/11/00
+ */
+ if (indexstate->iss_NumRuntimeKeys != 0)
+ {
+ ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
+
+ ExecAssignExprContext(estate, &indexstate->ss.ps);
+ indexstate->iss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
+ indexstate->ss.ps.ps_ExprContext = stdecontext;
+ }
+ else
+ {
+ indexstate->iss_RuntimeContext = NULL;
+ }
+
+ /*
+ * all done.
+ */
+ return indexstate;
+}
+
+
+/*
+ * ExecIndexBuildScanKeys
+ * Build the index scan keys from the index qualification expressions
+ *
+ * The index quals are passed to the index AM in the form of a ScanKey array.
+ * This routine sets up the ScanKeys, fills in all constant fields of the
+ * ScanKeys, and prepares information about the keys that have non-constant
+ * comparison values. We divide index qual expressions into five types:
+ *
+ * 1. Simple operator with constant comparison value ("indexkey op constant").
+ * For these, we just fill in a ScanKey containing the constant value.
+ *
+ * 2. Simple operator with non-constant value ("indexkey op expression").
+ * For these, we create a ScanKey with everything filled in except the
+ * expression value, and set up an IndexRuntimeKeyInfo struct to drive
+ * evaluation of the expression at the right times.
+ *
+ * 3. RowCompareExpr ("(indexkey, indexkey, ...) op (expr, expr, ...)").
+ * For these, we create a header ScanKey plus a subsidiary ScanKey array,
+ * as specified in access/skey.h. The elements of the row comparison
+ * can have either constant or non-constant comparison values.
+ *
+ * 4. ScalarArrayOpExpr ("indexkey op ANY (array-expression)"). If the index
+ * supports amsearcharray, we handle these the same as simple operators,
+ * setting the SK_SEARCHARRAY flag to tell the AM to handle them. Otherwise,
+ * we create a ScanKey with everything filled in except the comparison value,
+ * and set up an IndexArrayKeyInfo struct to drive processing of the qual.
+ * (Note that if we use an IndexArrayKeyInfo struct, the array expression is
+ * always treated as requiring runtime evaluation, even if it's a constant.)
+ *
+ * 5. NullTest ("indexkey IS NULL/IS NOT NULL"). We just fill in the
+ * ScanKey properly.
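+ *
+ * For illustration, typical quals of each type look like:
+ *
+ *	1. indexcol = 42
+ *	2. indexcol = outer_rel.y			(runtime key)
+ *	3. (indexcol1, indexcol2) > (1, 'foo')		(row comparison)
+ *	4. indexcol = ANY ('{1,2,3}'::integer[])	(ScalarArrayOpExpr)
+ *	5. indexcol IS NULL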
+ *
+ * This code is also used to prepare ORDER BY expressions for amcanorderbyop
+ * indexes. The behavior is exactly the same, except that we have to look up
+ * the operator differently. Note that only cases 1 and 2 are currently
+ * possible for ORDER BY.
+ *
+ * Input params are:
+ *
+ * planstate: executor state node we are working for
+ * index: the index we are building scan keys for
+ * quals: indexquals (or indexorderbys) expressions
+ * isorderby: true if processing ORDER BY exprs, false if processing quals
+ * *runtimeKeys: ptr to pre-existing IndexRuntimeKeyInfos, or NULL if none
+ * *numRuntimeKeys: number of pre-existing runtime keys
+ *
+ * Output params are:
+ *
+ * *scanKeys: receives ptr to array of ScanKeys
+ * *numScanKeys: receives number of scankeys
+ * *runtimeKeys: receives ptr to array of IndexRuntimeKeyInfos, or NULL if none
+ * *numRuntimeKeys: receives number of runtime keys
+ * *arrayKeys: receives ptr to array of IndexArrayKeyInfos, or NULL if none
+ * *numArrayKeys: receives number of array keys
+ *
+ * Caller may pass NULL for arrayKeys and numArrayKeys to indicate that
+ * IndexArrayKeyInfos are not supported.
+ */
+void
+ExecIndexBuildScanKeys(PlanState *planstate, Relation index,
+ List *quals, bool isorderby,
+ ScanKey *scanKeys, int *numScanKeys,
+ IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys,
+ IndexArrayKeyInfo **arrayKeys, int *numArrayKeys)
+{
+ ListCell *qual_cell;
+ ScanKey scan_keys;
+ IndexRuntimeKeyInfo *runtime_keys;
+ IndexArrayKeyInfo *array_keys;
+ int n_scan_keys;
+ int n_runtime_keys;
+ int max_runtime_keys;
+ int n_array_keys;
+ int j;
+
+ /* Allocate array for ScanKey structs: one per qual */
+ n_scan_keys = list_length(quals);
+ scan_keys = (ScanKey) palloc(n_scan_keys * sizeof(ScanKeyData));
+
+ /*
+ * runtime_keys array is dynamically resized as needed. We handle it this
+ * way so that the same runtime keys array can be shared between
+ * indexquals and indexorderbys, which will be processed in separate calls
+ * of this function. Caller must be sure to pass in NULL/0 for first
+ * call.
+ */
+ runtime_keys = *runtimeKeys;
+ n_runtime_keys = max_runtime_keys = *numRuntimeKeys;
+
+ /* Allocate array_keys as large as it could possibly need to be */
+ array_keys = (IndexArrayKeyInfo *)
+ palloc0(n_scan_keys * sizeof(IndexArrayKeyInfo));
+ n_array_keys = 0;
+
+ /*
+ * for each opclause in the given qual, convert the opclause into a single
+ * scan key
+ */
+ j = 0;
+ foreach(qual_cell, quals)
+ {
+ Expr *clause = (Expr *) lfirst(qual_cell);
+ ScanKey this_scan_key = &scan_keys[j++];
+ Oid opno; /* operator's OID */
+ RegProcedure opfuncid; /* operator proc id used in scan */
+ Oid opfamily; /* opfamily of index column */
+ int op_strategy; /* operator's strategy number */
+ Oid op_lefttype; /* operator's declared input types */
+ Oid op_righttype;
+ Expr *leftop; /* expr on lhs of operator */
+ Expr *rightop; /* expr on rhs ... */
+ AttrNumber varattno; /* att number used in scan */
+ int indnkeyatts;
+
+ indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index);
+ if (IsA(clause, OpExpr))
+ {
+ /* indexkey op const or indexkey op expression */
+ int flags = 0;
+ Datum scanvalue;
+
+ opno = ((OpExpr *) clause)->opno;
+ opfuncid = ((OpExpr *) clause)->opfuncid;
+
+ /*
+ * leftop should be the index key Var, possibly relabeled
+ */
+ leftop = (Expr *) get_leftop(clause);
+
+ if (leftop && IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+
+ Assert(leftop != NULL);
+
+ if (!(IsA(leftop, Var) &&
+ ((Var *) leftop)->varno == INDEX_VAR))
+ elog(ERROR, "indexqual doesn't have key on left side");
+
+ varattno = ((Var *) leftop)->varattno;
+ if (varattno < 1 || varattno > indnkeyatts)
+ elog(ERROR, "bogus index qualification");
+
+ /*
+ * We have to look up the operator's strategy number. This
+ * provides a cross-check that the operator does match the index.
+ */
+ opfamily = index->rd_opfamily[varattno - 1];
+
+ get_op_opfamily_properties(opno, opfamily, isorderby,
+ &op_strategy,
+ &op_lefttype,
+ &op_righttype);
+
+ if (isorderby)
+ flags |= SK_ORDER_BY;
+
+ /*
+ * rightop is the constant or variable comparison value
+ */
+ rightop = (Expr *) get_rightop(clause);
+
+ if (rightop && IsA(rightop, RelabelType))
+ rightop = ((RelabelType *) rightop)->arg;
+
+ Assert(rightop != NULL);
+
+ if (IsA(rightop, Const))
+ {
+ /* OK, simple constant comparison value */
+ scanvalue = ((Const *) rightop)->constvalue;
+ if (((Const *) rightop)->constisnull)
+ flags |= SK_ISNULL;
+ }
+ else
+ {
+ /* Need to treat this one as a runtime key */
+ if (n_runtime_keys >= max_runtime_keys)
+ {
+ if (max_runtime_keys == 0)
+ {
+ max_runtime_keys = 8;
+ runtime_keys = (IndexRuntimeKeyInfo *)
+ palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo));
+ }
+ else
+ {
+ max_runtime_keys *= 2;
+ runtime_keys = (IndexRuntimeKeyInfo *)
+ repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo));
+ }
+ }
+ runtime_keys[n_runtime_keys].scan_key = this_scan_key;
+ runtime_keys[n_runtime_keys].key_expr =
+ ExecInitExpr(rightop, planstate);
+ runtime_keys[n_runtime_keys].key_toastable =
+ TypeIsToastable(op_righttype);
+ n_runtime_keys++;
+ scanvalue = (Datum) 0;
+ }
+
+ /*
+ * initialize the scan key's fields appropriately
+ */
+ ScanKeyEntryInitialize(this_scan_key,
+ flags,
+ varattno, /* attribute number to scan */
+ op_strategy, /* op's strategy */
+ op_righttype, /* strategy subtype */
+ ((OpExpr *) clause)->inputcollid, /* collation */
+ opfuncid, /* reg proc to use */
+ scanvalue); /* constant */
+ }
+ else if (IsA(clause, RowCompareExpr))
+ {
+ /* (indexkey, indexkey, ...) op (expression, expression, ...) */
+ RowCompareExpr *rc = (RowCompareExpr *) clause;
+ ScanKey first_sub_key;
+ int n_sub_key;
+ ListCell *largs_cell;
+ ListCell *rargs_cell;
+ ListCell *opnos_cell;
+ ListCell *collids_cell;
+
+ Assert(!isorderby);
+
+ first_sub_key = (ScanKey)
+ palloc(list_length(rc->opnos) * sizeof(ScanKeyData));
+ n_sub_key = 0;
+
+ /* Scan RowCompare columns and generate subsidiary ScanKey items */
+ forfour(largs_cell, rc->largs, rargs_cell, rc->rargs,
+ opnos_cell, rc->opnos, collids_cell, rc->inputcollids)
+ {
+ ScanKey this_sub_key = &first_sub_key[n_sub_key];
+ int flags = SK_ROW_MEMBER;
+ Datum scanvalue;
+ Oid inputcollation;
+
+ leftop = (Expr *) lfirst(largs_cell);
+ rightop = (Expr *) lfirst(rargs_cell);
+ opno = lfirst_oid(opnos_cell);
+ inputcollation = lfirst_oid(collids_cell);
+
+ /*
+ * leftop should be the index key Var, possibly relabeled
+ */
+ if (leftop && IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+
+ Assert(leftop != NULL);
+
+ if (!(IsA(leftop, Var) &&
+ ((Var *) leftop)->varno == INDEX_VAR))
+ elog(ERROR, "indexqual doesn't have key on left side");
+
+ varattno = ((Var *) leftop)->varattno;
+
+ /*
+ * We have to look up the operator's associated btree support
+ * function
+ */
+ if (index->rd_rel->relam != BTREE_AM_OID ||
+ varattno < 1 || varattno > indnkeyatts)
+ elog(ERROR, "bogus RowCompare index qualification");
+ opfamily = index->rd_opfamily[varattno - 1];
+
+ get_op_opfamily_properties(opno, opfamily, isorderby,
+ &op_strategy,
+ &op_lefttype,
+ &op_righttype);
+
+ if (op_strategy != rc->rctype)
+ elog(ERROR, "RowCompare index qualification contains wrong operator");
+
+ opfuncid = get_opfamily_proc(opfamily,
+ op_lefttype,
+ op_righttype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(opfuncid))
+ elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
+ BTORDER_PROC, op_lefttype, op_righttype, opfamily);
+
+ /*
+ * rightop is the constant or variable comparison value
+ */
+ if (rightop && IsA(rightop, RelabelType))
+ rightop = ((RelabelType *) rightop)->arg;
+
+ Assert(rightop != NULL);
+
+ if (IsA(rightop, Const))
+ {
+ /* OK, simple constant comparison value */
+ scanvalue = ((Const *) rightop)->constvalue;
+ if (((Const *) rightop)->constisnull)
+ flags |= SK_ISNULL;
+ }
+ else
+ {
+ /* Need to treat this one as a runtime key */
+ if (n_runtime_keys >= max_runtime_keys)
+ {
+ if (max_runtime_keys == 0)
+ {
+ max_runtime_keys = 8;
+ runtime_keys = (IndexRuntimeKeyInfo *)
+ palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo));
+ }
+ else
+ {
+ max_runtime_keys *= 2;
+ runtime_keys = (IndexRuntimeKeyInfo *)
+ repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo));
+ }
+ }
+ runtime_keys[n_runtime_keys].scan_key = this_sub_key;
+ runtime_keys[n_runtime_keys].key_expr =
+ ExecInitExpr(rightop, planstate);
+ runtime_keys[n_runtime_keys].key_toastable =
+ TypeIsToastable(op_righttype);
+ n_runtime_keys++;
+ scanvalue = (Datum) 0;
+ }
+
+ /*
+ * initialize the subsidiary scan key's fields appropriately
+ */
+ ScanKeyEntryInitialize(this_sub_key,
+ flags,
+ varattno, /* attribute number */
+ op_strategy, /* op's strategy */
+ op_righttype, /* strategy subtype */
+ inputcollation, /* collation */
+ opfuncid, /* reg proc to use */
+ scanvalue); /* constant */
+ n_sub_key++;
+ }
+
+ /* Mark the last subsidiary scankey correctly */
+ first_sub_key[n_sub_key - 1].sk_flags |= SK_ROW_END;
+
+ /*
+ * We don't use ScanKeyEntryInitialize for the header because it
+ * isn't going to contain a valid sk_func pointer.
+ */
+ MemSet(this_scan_key, 0, sizeof(ScanKeyData));
+ this_scan_key->sk_flags = SK_ROW_HEADER;
+ this_scan_key->sk_attno = first_sub_key->sk_attno;
+ this_scan_key->sk_strategy = rc->rctype;
+ /* sk_subtype, sk_collation, sk_func not used in a header */
+ this_scan_key->sk_argument = PointerGetDatum(first_sub_key);
+ }
+ else if (IsA(clause, ScalarArrayOpExpr))
+ {
+ /* indexkey op ANY (array-expression) */
+ ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+ int flags = 0;
+ Datum scanvalue;
+
+ Assert(!isorderby);
+
+ Assert(saop->useOr);
+ opno = saop->opno;
+ opfuncid = saop->opfuncid;
+
+ /*
+ * leftop should be the index key Var, possibly relabeled
+ */
+ leftop = (Expr *) linitial(saop->args);
+
+ if (leftop && IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+
+ Assert(leftop != NULL);
+
+ if (!(IsA(leftop, Var) &&
+ ((Var *) leftop)->varno == INDEX_VAR))
+ elog(ERROR, "indexqual doesn't have key on left side");
+
+ varattno = ((Var *) leftop)->varattno;
+ if (varattno < 1 || varattno > indnkeyatts)
+ elog(ERROR, "bogus index qualification");
+
+ /*
+ * We have to look up the operator's strategy number. This
+ * provides a cross-check that the operator does match the index.
+ */
+ opfamily = index->rd_opfamily[varattno - 1];
+
+ get_op_opfamily_properties(opno, opfamily, isorderby,
+ &op_strategy,
+ &op_lefttype,
+ &op_righttype);
+
+ /*
+ * rightop is the constant or variable array value
+ */
+ rightop = (Expr *) lsecond(saop->args);
+
+ if (rightop && IsA(rightop, RelabelType))
+ rightop = ((RelabelType *) rightop)->arg;
+
+ Assert(rightop != NULL);
+
+ if (index->rd_indam->amsearcharray)
+ {
+ /* Index AM will handle this like a simple operator */
+ flags |= SK_SEARCHARRAY;
+ if (IsA(rightop, Const))
+ {
+ /* OK, simple constant comparison value */
+ scanvalue = ((Const *) rightop)->constvalue;
+ if (((Const *) rightop)->constisnull)
+ flags |= SK_ISNULL;
+ }
+ else
+ {
+ /* Need to treat this one as a runtime key */
+ if (n_runtime_keys >= max_runtime_keys)
+ {
+ if (max_runtime_keys == 0)
+ {
+ max_runtime_keys = 8;
+ runtime_keys = (IndexRuntimeKeyInfo *)
+ palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo));
+ }
+ else
+ {
+ max_runtime_keys *= 2;
+ runtime_keys = (IndexRuntimeKeyInfo *)
+ repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo));
+ }
+ }
+ runtime_keys[n_runtime_keys].scan_key = this_scan_key;
+ runtime_keys[n_runtime_keys].key_expr =
+ ExecInitExpr(rightop, planstate);
+
+ /*
+ * Careful here: the runtime expression is not of
+ * op_righttype, but rather is an array of same; so
+ * TypeIsToastable() isn't helpful. However, we can
+ * assume that all array types are toastable.
+ */
+ runtime_keys[n_runtime_keys].key_toastable = true;
+ n_runtime_keys++;
+ scanvalue = (Datum) 0;
+ }
+ }
+ else
+ {
+ /* Executor has to expand the array value */
+ array_keys[n_array_keys].scan_key = this_scan_key;
+ array_keys[n_array_keys].array_expr =
+ ExecInitExpr(rightop, planstate);
+ /* the remaining fields were zeroed by palloc0 */
+ n_array_keys++;
+ scanvalue = (Datum) 0;
+ }
+
+ /*
+ * initialize the scan key's fields appropriately
+ */
+ ScanKeyEntryInitialize(this_scan_key,
+ flags,
+ varattno, /* attribute number to scan */
+ op_strategy, /* op's strategy */
+ op_righttype, /* strategy subtype */
+ saop->inputcollid, /* collation */
+ opfuncid, /* reg proc to use */
+ scanvalue); /* constant */
+ }
+ else if (IsA(clause, NullTest))
+ {
+ /* indexkey IS NULL or indexkey IS NOT NULL */
+ NullTest *ntest = (NullTest *) clause;
+ int flags;
+
+ Assert(!isorderby);
+
+ /*
+ * argument should be the index key Var, possibly relabeled
+ */
+ leftop = ntest->arg;
+
+ if (leftop && IsA(leftop, RelabelType))
+ leftop = ((RelabelType *) leftop)->arg;
+
+ Assert(leftop != NULL);
+
+ if (!(IsA(leftop, Var) &&
+ ((Var *) leftop)->varno == INDEX_VAR))
+ elog(ERROR, "NullTest indexqual has wrong key");
+
+ varattno = ((Var *) leftop)->varattno;
+
+ /*
+ * initialize the scan key's fields appropriately
+ */
+ switch (ntest->nulltesttype)
+ {
+ case IS_NULL:
+ flags = SK_ISNULL | SK_SEARCHNULL;
+ break;
+ case IS_NOT_NULL:
+ flags = SK_ISNULL | SK_SEARCHNOTNULL;
+ break;
+ default:
+ elog(ERROR, "unrecognized nulltesttype: %d",
+ (int) ntest->nulltesttype);
+ flags = 0; /* keep compiler quiet */
+ break;
+ }
+
+ ScanKeyEntryInitialize(this_scan_key,
+ flags,
+ varattno, /* attribute number to scan */
+ InvalidStrategy, /* no strategy */
+ InvalidOid, /* no strategy subtype */
+ InvalidOid, /* no collation */
+ InvalidOid, /* no reg proc for this */
+ (Datum) 0); /* constant */
+ }
+ else
+ elog(ERROR, "unsupported indexqual type: %d",
+ (int) nodeTag(clause));
+ }
+
+ Assert(n_runtime_keys <= max_runtime_keys);
+
+ /* Get rid of any unused arrays */
+ if (n_array_keys == 0)
+ {
+ pfree(array_keys);
+ array_keys = NULL;
+ }
+
+ /*
+ * Return info to our caller.
+ */
+ *scanKeys = scan_keys;
+ *numScanKeys = n_scan_keys;
+ *runtimeKeys = runtime_keys;
+ *numRuntimeKeys = n_runtime_keys;
+ if (arrayKeys)
+ {
+ *arrayKeys = array_keys;
+ *numArrayKeys = n_array_keys;
+ }
+ else if (n_array_keys != 0)
+ elog(ERROR, "ScalarArrayOpExpr index qual found where not allowed");
+}
+
+/* ----------------------------------------------------------------
+ * Parallel Scan Support
+ * ----------------------------------------------------------------
+ */
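+
+/*
+ * The leader sizes and fills a ParallelIndexScanDesc in the parallel query's
+ * dynamic shared memory (ExecIndexScanEstimate, ExecIndexScanInitializeDSM);
+ * each worker then attaches to that shared descriptor in
+ * ExecIndexScanInitializeWorker, and ExecIndexScanReInitializeDSM resets the
+ * shared scan state before a rescan.
+ */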
+
+/* ----------------------------------------------------------------
+ * ExecIndexScanEstimate
+ *
+ * Compute the amount of space we'll need in the parallel
+ * query DSM, and inform pcxt->estimator about our needs.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexScanEstimate(IndexScanState *node,
+ ParallelContext *pcxt)
+{
+ EState *estate = node->ss.ps.state;
+
+ node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc,
+ estate->es_snapshot);
+ shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexScanInitializeDSM
+ *
+ * Set up a parallel index scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexScanInitializeDSM(IndexScanState *node,
+ ParallelContext *pcxt)
+{
+ EState *estate = node->ss.ps.state;
+ ParallelIndexScanDesc piscan;
+
+ piscan = shm_toc_allocate(pcxt->toc, node->iss_PscanLen);
+ index_parallelscan_initialize(node->ss.ss_currentRelation,
+ node->iss_RelationDesc,
+ estate->es_snapshot,
+ piscan);
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
+ node->iss_ScanDesc =
+ index_beginscan_parallel(node->ss.ss_currentRelation,
+ node->iss_RelationDesc,
+ node->iss_NumScanKeys,
+ node->iss_NumOrderByKeys,
+ piscan);
+
+ /*
+ * If no run-time keys to calculate or they are ready, go ahead and pass
+ * the scankeys to the index AM.
+ */
+ if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady)
+ index_rescan(node->iss_ScanDesc,
+ node->iss_ScanKeys, node->iss_NumScanKeys,
+ node->iss_OrderByKeys, node->iss_NumOrderByKeys);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexScanReInitializeDSM
+ *
+ * Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexScanReInitializeDSM(IndexScanState *node,
+ ParallelContext *pcxt)
+{
+ index_parallelrescan(node->iss_ScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecIndexScanInitializeWorker
+ *
+ * Copy relevant information from TOC into planstate.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIndexScanInitializeWorker(IndexScanState *node,
+ ParallelWorkerContext *pwcxt)
+{
+ ParallelIndexScanDesc piscan;
+
+ piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+ node->iss_ScanDesc =
+ index_beginscan_parallel(node->ss.ss_currentRelation,
+ node->iss_RelationDesc,
+ node->iss_NumScanKeys,
+ node->iss_NumOrderByKeys,
+ piscan);
+
+ /*
+ * If no run-time keys to calculate or they are ready, go ahead and pass
+ * the scankeys to the index AM.
+ */
+ if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady)
+ index_rescan(node->iss_ScanDesc,
+ node->iss_ScanKeys, node->iss_NumScanKeys,
+ node->iss_OrderByKeys, node->iss_NumOrderByKeys);
+}
diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c
new file mode 100644
index 0000000..128eb3e
--- /dev/null
+++ b/src/backend/executor/nodeLimit.c
@@ -0,0 +1,558 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeLimit.c
+ * Routines to handle limiting of query results where appropriate
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeLimit.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecLimit - extract a limited range of tuples
+ * ExecInitLimit - initialize node and subnodes
+ * ExecEndLimit - shutdown node and subnodes
+ */
+
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeLimit.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+
+static void recompute_limits(LimitState *node);
+static int64 compute_tuples_needed(LimitState *node);
+
+
+/* ----------------------------------------------------------------
+ * ExecLimit
+ *
+ * This is a very simple node which just performs LIMIT/OFFSET
+ * filtering on the stream of tuples returned by a subplan.
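+ *
+ *		For example, "OFFSET 5 LIMIT 10" discards the first five subplan
+ *		tuples and then returns at most the next ten; with WITH TIES, any
+ *		further tuples that compare equal on the ORDER BY columns to the
+ *		last one returned are emitted as well.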
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot * /* return: a tuple or NULL */
+ExecLimit(PlanState *pstate)
+{
+ LimitState *node = castNode(LimitState, pstate);
+ ExprContext *econtext = node->ps.ps_ExprContext;
+ ScanDirection direction;
+ TupleTableSlot *slot;
+ PlanState *outerPlan;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get information from the node
+ */
+ direction = node->ps.state->es_direction;
+ outerPlan = outerPlanState(node);
+
+ /*
+ * The main logic is a simple state machine.
+ */
+ switch (node->lstate)
+ {
+ case LIMIT_INITIAL:
+
+ /*
+ * First call for this node, so compute limit/offset. (We can't do
+ * this any earlier, because parameters from upper nodes will not
+ * be set during ExecInitLimit.) This also sets position = 0 and
+ * changes the state to LIMIT_RESCAN.
+ */
+ recompute_limits(node);
+
+ /* FALL THRU */
+
+ case LIMIT_RESCAN:
+
+ /*
+ * If backwards scan, just return NULL without changing state.
+ */
+ if (!ScanDirectionIsForward(direction))
+ return NULL;
+
+ /*
+ * Check for empty window; if so, treat like empty subplan.
+ */
+ if (node->count <= 0 && !node->noCount)
+ {
+ node->lstate = LIMIT_EMPTY;
+ return NULL;
+ }
+
+ /*
+ * Fetch rows from subplan until we reach position > offset.
+ */
+ for (;;)
+ {
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ {
+ /*
+ * The subplan returns too few tuples for us to produce
+ * any output at all.
+ */
+ node->lstate = LIMIT_EMPTY;
+ return NULL;
+ }
+
+ /*
+				 * The tuple at the limit position is needed for comparison
+				 * in subsequent executions, to detect ties.
+ */
+ if (node->limitOption == LIMIT_OPTION_WITH_TIES &&
+ node->position - node->offset == node->count - 1)
+ {
+ ExecCopySlot(node->last_slot, slot);
+ }
+ node->subSlot = slot;
+ if (++node->position > node->offset)
+ break;
+ }
+
+ /*
+ * Okay, we have the first tuple of the window.
+ */
+ node->lstate = LIMIT_INWINDOW;
+ break;
+
+ case LIMIT_EMPTY:
+
+ /*
+ * The subplan is known to return no tuples (or not more than
+ * OFFSET tuples, in general). So we return no tuples.
+ */
+ return NULL;
+
+ case LIMIT_INWINDOW:
+ if (ScanDirectionIsForward(direction))
+ {
+ /*
+ * Forwards scan, so check for stepping off end of window. At
+ * the end of the window, the behavior depends on whether WITH
+ * TIES was specified: if so, we need to change the state
+ * machine to WINDOWEND_TIES, and fall through to the code for
+ * that case. If not (nothing was specified, or ONLY was)
+ * return NULL without advancing the subplan or the position
+ * variable, but change the state machine to record having
+ * done so.
+ *
+ * Once at the end, ideally, we would shut down parallel
+ * resources; but that would destroy the parallel context
+ * which might be required for rescans. To do that, we'll
+ * need to find a way to pass down more information about
+ * whether rescans are possible.
+ */
+ if (!node->noCount &&
+ node->position - node->offset >= node->count)
+ {
+ if (node->limitOption == LIMIT_OPTION_COUNT)
+ {
+ node->lstate = LIMIT_WINDOWEND;
+ return NULL;
+ }
+ else
+ {
+ node->lstate = LIMIT_WINDOWEND_TIES;
+ /* we'll fall through to the next case */
+ }
+ }
+ else
+ {
+ /*
+ * Get next tuple from subplan, if any.
+ */
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ {
+ node->lstate = LIMIT_SUBPLANEOF;
+ return NULL;
+ }
+
+ /*
+ * If WITH TIES is active, and this is the last in-window
+ * tuple, save it to be used in subsequent WINDOWEND_TIES
+ * processing.
+ */
+ if (node->limitOption == LIMIT_OPTION_WITH_TIES &&
+ node->position - node->offset == node->count - 1)
+ {
+ ExecCopySlot(node->last_slot, slot);
+ }
+ node->subSlot = slot;
+ node->position++;
+ break;
+ }
+ }
+ else
+ {
+ /*
+ * Backwards scan, so check for stepping off start of window.
+ * As above, only change state-machine status if so.
+ */
+ if (node->position <= node->offset + 1)
+ {
+ node->lstate = LIMIT_WINDOWSTART;
+ return NULL;
+ }
+
+ /*
+ * Get previous tuple from subplan; there should be one!
+ */
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ elog(ERROR, "LIMIT subplan failed to run backwards");
+ node->subSlot = slot;
+ node->position--;
+ break;
+ }
+
+ Assert(node->lstate == LIMIT_WINDOWEND_TIES);
+ /* FALL THRU */
+
+ case LIMIT_WINDOWEND_TIES:
+ if (ScanDirectionIsForward(direction))
+ {
+ /*
+				 * Advance the subplan until we find the first row whose
+				 * ORDER BY column values differ from those of the last
+				 * in-window row.
+ */
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ {
+ node->lstate = LIMIT_SUBPLANEOF;
+ return NULL;
+ }
+
+ /*
+ * Test if the new tuple and the last tuple match. If so we
+ * return the tuple.
+ */
+ econtext->ecxt_innertuple = slot;
+ econtext->ecxt_outertuple = node->last_slot;
+ if (ExecQualAndReset(node->eqfunction, econtext))
+ {
+ node->subSlot = slot;
+ node->position++;
+ }
+ else
+ {
+ node->lstate = LIMIT_WINDOWEND;
+ return NULL;
+ }
+ }
+ else
+ {
+ /*
+ * Backwards scan, so check for stepping off start of window.
+ * Change only state-machine status if so.
+ */
+ if (node->position <= node->offset + 1)
+ {
+ node->lstate = LIMIT_WINDOWSTART;
+ return NULL;
+ }
+
+ /*
+ * Get previous tuple from subplan; there should be one! And
+ * change state-machine status.
+ */
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ elog(ERROR, "LIMIT subplan failed to run backwards");
+ node->subSlot = slot;
+ node->position--;
+ node->lstate = LIMIT_INWINDOW;
+ }
+ break;
+
+ case LIMIT_SUBPLANEOF:
+ if (ScanDirectionIsForward(direction))
+ return NULL;
+
+ /*
+ * Backing up from subplan EOF, so re-fetch previous tuple; there
+ * should be one! Note previous tuple must be in window.
+ */
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ elog(ERROR, "LIMIT subplan failed to run backwards");
+ node->subSlot = slot;
+ node->lstate = LIMIT_INWINDOW;
+ /* position does not change 'cause we didn't advance it before */
+ break;
+
+ case LIMIT_WINDOWEND:
+ if (ScanDirectionIsForward(direction))
+ return NULL;
+
+ /*
+			 * We have already advanced one position past the end of the
+			 * window in order to detect ties, so re-fetch the previous
+			 * tuple; there should be one! Note the previous tuple must be
+			 * in the window.
+ */
+ if (node->limitOption == LIMIT_OPTION_WITH_TIES)
+ {
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ elog(ERROR, "LIMIT subplan failed to run backwards");
+ node->subSlot = slot;
+ node->lstate = LIMIT_INWINDOW;
+ }
+ else
+ {
+ /*
+ * Backing up from window end: simply re-return the last tuple
+ * fetched from the subplan.
+ */
+ slot = node->subSlot;
+ node->lstate = LIMIT_INWINDOW;
+ /* position does not change 'cause we didn't advance it before */
+ }
+ break;
+
+ case LIMIT_WINDOWSTART:
+ if (!ScanDirectionIsForward(direction))
+ return NULL;
+
+ /*
+ * Advancing after having backed off window start: simply
+ * re-return the last tuple fetched from the subplan.
+ */
+ slot = node->subSlot;
+ node->lstate = LIMIT_INWINDOW;
+ /* position does not change 'cause we didn't change it before */
+ break;
+
+ default:
+ elog(ERROR, "impossible LIMIT state: %d",
+ (int) node->lstate);
+ slot = NULL; /* keep compiler quiet */
+ break;
+ }
+
+ /* Return the current tuple */
+ Assert(!TupIsNull(slot));
+
+ return slot;
+}
+
+/*
+ * Evaluate the limit/offset expressions --- done at startup or rescan.
+ *
+ * This is also a handy place to reset the current-position state info.
+ */
+static void
+recompute_limits(LimitState *node)
+{
+ ExprContext *econtext = node->ps.ps_ExprContext;
+ Datum val;
+ bool isNull;
+
+ if (node->limitOffset)
+ {
+ val = ExecEvalExprSwitchContext(node->limitOffset,
+ econtext,
+ &isNull);
+ /* Interpret NULL offset as no offset */
+ if (isNull)
+ node->offset = 0;
+ else
+ {
+ node->offset = DatumGetInt64(val);
+ if (node->offset < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE),
+ errmsg("OFFSET must not be negative")));
+ }
+ }
+ else
+ {
+ /* No OFFSET supplied */
+ node->offset = 0;
+ }
+
+ if (node->limitCount)
+ {
+ val = ExecEvalExprSwitchContext(node->limitCount,
+ econtext,
+ &isNull);
+ /* Interpret NULL count as no count (LIMIT ALL) */
+ if (isNull)
+ {
+ node->count = 0;
+ node->noCount = true;
+ }
+ else
+ {
+ node->count = DatumGetInt64(val);
+ if (node->count < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_ROW_COUNT_IN_LIMIT_CLAUSE),
+ errmsg("LIMIT must not be negative")));
+ node->noCount = false;
+ }
+ }
+ else
+ {
+ /* No COUNT supplied */
+ node->count = 0;
+ node->noCount = true;
+ }
+
+ /* Reset position to start-of-scan */
+ node->position = 0;
+ node->subSlot = NULL;
+
+ /* Set state-machine state */
+ node->lstate = LIMIT_RESCAN;
+
+ /*
+ * Notify child node about limit. Note: think not to "optimize" by
+ * skipping ExecSetTupleBound if compute_tuples_needed returns < 0. We
+ * must update the child node anyway, in case this is a rescan and the
+ * previous time we got a different result.
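+	 *
+	 * For example, with "OFFSET 5 LIMIT 10" the bound passed down is 15,
+	 * which allows a Sort node below to switch to a bounded (top-N) sort
+	 * instead of sorting its entire input.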
+ */
+ ExecSetTupleBound(compute_tuples_needed(node), outerPlanState(node));
+}
+
+/*
+ * Compute the maximum number of tuples needed to satisfy this Limit node.
+ * Return a negative value if there is not a determinable limit.
+ */
+static int64
+compute_tuples_needed(LimitState *node)
+{
+ if ((node->noCount) || (node->limitOption == LIMIT_OPTION_WITH_TIES))
+ return -1;
+ /* Note: if this overflows, we'll return a negative value, which is OK */
+ return node->count + node->offset;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitLimit
+ *
+ * This initializes the limit node state structures and
+ * the node's subplan.
+ * ----------------------------------------------------------------
+ */
+LimitState *
+ExecInitLimit(Limit *node, EState *estate, int eflags)
+{
+ LimitState *limitstate;
+ Plan *outerPlan;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & EXEC_FLAG_MARK));
+
+ /*
+ * create state structure
+ */
+ limitstate = makeNode(LimitState);
+ limitstate->ps.plan = (Plan *) node;
+ limitstate->ps.state = estate;
+ limitstate->ps.ExecProcNode = ExecLimit;
+
+ limitstate->lstate = LIMIT_INITIAL;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * Limit nodes never call ExecQual or ExecProject, but they need an
+ * exprcontext anyway to evaluate the limit/offset parameters in.
+ */
+ ExecAssignExprContext(estate, &limitstate->ps);
+
+ /*
+ * initialize outer plan
+ */
+ outerPlan = outerPlan(node);
+ outerPlanState(limitstate) = ExecInitNode(outerPlan, estate, eflags);
+
+ /*
+ * initialize child expressions
+ */
+ limitstate->limitOffset = ExecInitExpr((Expr *) node->limitOffset,
+ (PlanState *) limitstate);
+ limitstate->limitCount = ExecInitExpr((Expr *) node->limitCount,
+ (PlanState *) limitstate);
+ limitstate->limitOption = node->limitOption;
+
+ /*
+ * Initialize result type.
+ */
+ ExecInitResultTypeTL(&limitstate->ps);
+
+ limitstate->ps.resultopsset = true;
+ limitstate->ps.resultops = ExecGetResultSlotOps(outerPlanState(limitstate),
+ &limitstate->ps.resultopsfixed);
+
+ /*
+ * limit nodes do no projections, so initialize projection info for this
+ * node appropriately
+ */
+ limitstate->ps.ps_ProjInfo = NULL;
+
+ /*
+ * Initialize the equality evaluation, to detect ties.
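+	 *
+	 * The equality function compares candidate tuples with the last
+	 * in-window tuple on the ORDER BY columns; e.g. "FETCH FIRST 2 ROWS
+	 * WITH TIES" over ordered values 1, 1, 1, 2 returns all three 1's,
+	 * because the third row ties with the second.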
+ */
+ if (node->limitOption == LIMIT_OPTION_WITH_TIES)
+ {
+ TupleDesc desc;
+ const TupleTableSlotOps *ops;
+
+ desc = ExecGetResultType(outerPlanState(limitstate));
+ ops = ExecGetResultSlotOps(outerPlanState(limitstate), NULL);
+
+ limitstate->last_slot = ExecInitExtraTupleSlot(estate, desc, ops);
+ limitstate->eqfunction = execTuplesMatchPrepare(desc,
+ node->uniqNumCols,
+ node->uniqColIdx,
+ node->uniqOperators,
+ node->uniqCollations,
+ &limitstate->ps);
+ }
+
+ return limitstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndLimit
+ *
+ * This shuts down the subplan and frees resources allocated
+ * to this node.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndLimit(LimitState *node)
+{
+ ExecFreeExprContext(&node->ps);
+ ExecEndNode(outerPlanState(node));
+}
+
+
+void
+ExecReScanLimit(LimitState *node)
+{
+ /*
+ * Recompute limit/offset in case parameters changed, and reset the state
+ * machine. We must do this before rescanning our child node, in case
+ * it's a Sort that we are passing the parameters down to.
+ */
+ recompute_limits(node);
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (node->ps.lefttree->chgParam == NULL)
+ ExecReScan(node->ps.lefttree);
+}
diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c
new file mode 100644
index 0000000..7583973
--- /dev/null
+++ b/src/backend/executor/nodeLockRows.c
@@ -0,0 +1,403 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeLockRows.c
+ * Routines to handle FOR UPDATE/FOR SHARE row locking
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeLockRows.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecLockRows - fetch locked rows
+ * ExecInitLockRows - initialize node and subnodes
+ * ExecEndLockRows - shutdown node and subnodes
+ */
+
+#include "postgres.h"
+
+#include "access/tableam.h"
+#include "access/xact.h"
+#include "executor/executor.h"
+#include "executor/nodeLockRows.h"
+#include "foreign/fdwapi.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecLockRows
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot * /* return: a tuple or NULL */
+ExecLockRows(PlanState *pstate)
+{
+ LockRowsState *node = castNode(LockRowsState, pstate);
+ TupleTableSlot *slot;
+ EState *estate;
+ PlanState *outerPlan;
+ bool epq_needed;
+ ListCell *lc;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get information from the node
+ */
+ estate = node->ps.state;
+ outerPlan = outerPlanState(node);
+
+ /*
+ * Get next tuple from subplan, if any.
+ */
+lnext:
+ slot = ExecProcNode(outerPlan);
+
+ if (TupIsNull(slot))
+ {
+ /* Release any resources held by EPQ mechanism before exiting */
+ EvalPlanQualEnd(&node->lr_epqstate);
+ return NULL;
+ }
+
+ /* We don't need EvalPlanQual unless we get updated tuple version(s) */
+ epq_needed = false;
+
+ /*
+ * Attempt to lock the source tuple(s). (Note we only have locking
+ * rowmarks in lr_arowMarks.)
+ */
+ foreach(lc, node->lr_arowMarks)
+ {
+ ExecAuxRowMark *aerm = (ExecAuxRowMark *) lfirst(lc);
+ ExecRowMark *erm = aerm->rowmark;
+ Datum datum;
+ bool isNull;
+ ItemPointerData tid;
+ TM_FailureData tmfd;
+ LockTupleMode lockmode;
+ int lockflags = 0;
+ TM_Result test;
+ TupleTableSlot *markSlot;
+
+ /* clear any leftover test tuple for this rel */
+ markSlot = EvalPlanQualSlot(&node->lr_epqstate, erm->relation, erm->rti);
+ ExecClearTuple(markSlot);
+
+ /* if child rel, must check whether it produced this row */
+ if (erm->rti != erm->prti)
+ {
+ Oid tableoid;
+
+ datum = ExecGetJunkAttribute(slot,
+ aerm->toidAttNo,
+ &isNull);
+ /* shouldn't ever get a null result... */
+ if (isNull)
+ elog(ERROR, "tableoid is NULL");
+ tableoid = DatumGetObjectId(datum);
+
+ Assert(OidIsValid(erm->relid));
+ if (tableoid != erm->relid)
+ {
+ /* this child is inactive right now */
+ erm->ermActive = false;
+ ItemPointerSetInvalid(&(erm->curCtid));
+ ExecClearTuple(markSlot);
+ continue;
+ }
+ }
+ erm->ermActive = true;
+
+ /* fetch the tuple's ctid */
+ datum = ExecGetJunkAttribute(slot,
+ aerm->ctidAttNo,
+ &isNull);
+ /* shouldn't ever get a null result... */
+ if (isNull)
+ elog(ERROR, "ctid is NULL");
+
+ /* requests for foreign tables must be passed to their FDW */
+ if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+ {
+ FdwRoutine *fdwroutine;
+ bool updated = false;
+
+ fdwroutine = GetFdwRoutineForRelation(erm->relation, false);
+ /* this should have been checked already, but let's be safe */
+ if (fdwroutine->RefetchForeignRow == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot lock rows in foreign table \"%s\"",
+ RelationGetRelationName(erm->relation))));
+
+ fdwroutine->RefetchForeignRow(estate,
+ erm,
+ datum,
+ markSlot,
+ &updated);
+ if (TupIsNull(markSlot))
+ {
+ /* couldn't get the lock, so skip this row */
+ goto lnext;
+ }
+
+ /*
+ * if FDW says tuple was updated before getting locked, we need to
+ * perform EPQ testing to see if quals are still satisfied
+ */
+ if (updated)
+ epq_needed = true;
+
+ continue;
+ }
+
+ /* okay, try to lock (and fetch) the tuple */
+ tid = *((ItemPointer) DatumGetPointer(datum));
+ switch (erm->markType)
+ {
+ case ROW_MARK_EXCLUSIVE:
+ lockmode = LockTupleExclusive;
+ break;
+ case ROW_MARK_NOKEYEXCLUSIVE:
+ lockmode = LockTupleNoKeyExclusive;
+ break;
+ case ROW_MARK_SHARE:
+ lockmode = LockTupleShare;
+ break;
+ case ROW_MARK_KEYSHARE:
+ lockmode = LockTupleKeyShare;
+ break;
+ default:
+ elog(ERROR, "unsupported rowmark type");
+ lockmode = LockTupleNoKeyExclusive; /* keep compiler quiet */
+ break;
+ }
+
+ lockflags = TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS;
+ if (!IsolationUsesXactSnapshot())
+ lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION;
+
+ test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot,
+ markSlot, estate->es_output_cid,
+ lockmode, erm->waitPolicy,
+ lockflags,
+ &tmfd);
+
+ switch (test)
+ {
+ case TM_WouldBlock:
+ /* couldn't lock tuple in SKIP LOCKED mode */
+ goto lnext;
+
+ case TM_SelfModified:
+
+ /*
+ * The target tuple was already updated or deleted by the
+ * current command, or by a later command in the current
+ * transaction. We *must* ignore the tuple in the former
+ * case, so as to avoid the "Halloween problem" of repeated
+ * update attempts. In the latter case it might be sensible
+ * to fetch the updated tuple instead, but doing so would
+ * require changing heap_update and heap_delete to not
+ * complain about updating "invisible" tuples, which seems
+ * pretty scary (table_tuple_lock will not complain, but few
+ * callers expect TM_Invisible, and we're not one of them). So
+ * for now, treat the tuple as deleted and do not process.
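+				 *
+				 * (The "Halloween problem" is the hazard of an UPDATE
+				 * seeing, and updating yet again, row versions that it has
+				 * itself already written earlier in the same command.)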
+ */
+ goto lnext;
+
+ case TM_Ok:
+
+ /*
+				 * Got the lock successfully; the locked tuple is saved in
+				 * markSlot for EvalPlanQual testing below, if that turns
+				 * out to be needed.
+ */
+ if (tmfd.traversed)
+ epq_needed = true;
+ break;
+
+ case TM_Updated:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+ elog(ERROR, "unexpected table_tuple_lock status: %u",
+ test);
+ break;
+
+ case TM_Deleted:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+ /* tuple was deleted so don't return it */
+ goto lnext;
+
+ case TM_Invisible:
+ elog(ERROR, "attempted to lock invisible tuple");
+ break;
+
+ default:
+ elog(ERROR, "unrecognized table_tuple_lock status: %u",
+ test);
+ }
+
+ /* Remember locked tuple's TID for EPQ testing and WHERE CURRENT OF */
+ erm->curCtid = tid;
+ }
+
+ /*
+ * If we need to do EvalPlanQual testing, do so.
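+	 *
+	 * (In READ COMMITTED mode we may have locked a newer version of some
+	 * row than the one our snapshot saw.  EvalPlanQual re-evaluates the
+	 * plan's quals against the locked row versions; if they no longer pass,
+	 * the current result row is skipped.)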
+ */
+ if (epq_needed)
+ {
+ /* Initialize EPQ machinery */
+ EvalPlanQualBegin(&node->lr_epqstate);
+
+ /*
+ * To fetch non-locked source rows the EPQ logic needs to access junk
+ * columns from the tuple being tested.
+ */
+ EvalPlanQualSetSlot(&node->lr_epqstate, slot);
+
+ /*
+ * And finally we can re-evaluate the tuple.
+ */
+ slot = EvalPlanQualNext(&node->lr_epqstate);
+ if (TupIsNull(slot))
+ {
+ /* Updated tuple fails qual, so ignore it and go on */
+ goto lnext;
+ }
+ }
+
+ /* Got all locks, so return the current tuple */
+ return slot;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitLockRows
+ *
+ * This initializes the LockRows node state structures and
+ * the node's subplan.
+ * ----------------------------------------------------------------
+ */
+LockRowsState *
+ExecInitLockRows(LockRows *node, EState *estate, int eflags)
+{
+ LockRowsState *lrstate;
+ Plan *outerPlan = outerPlan(node);
+ List *epq_arowmarks;
+ ListCell *lc;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & EXEC_FLAG_MARK));
+
+ /*
+ * create state structure
+ */
+ lrstate = makeNode(LockRowsState);
+ lrstate->ps.plan = (Plan *) node;
+ lrstate->ps.state = estate;
+ lrstate->ps.ExecProcNode = ExecLockRows;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * LockRows nodes never call ExecQual or ExecProject, therefore no
+ * ExprContext is needed.
+ */
+
+ /*
+ * Initialize result type.
+ */
+ ExecInitResultTypeTL(&lrstate->ps);
+
+ /*
+ * then initialize outer plan
+ */
+ outerPlanState(lrstate) = ExecInitNode(outerPlan, estate, eflags);
+
+ /* node returns unmodified slots from the outer plan */
+ lrstate->ps.resultopsset = true;
+ lrstate->ps.resultops = ExecGetResultSlotOps(outerPlanState(lrstate),
+ &lrstate->ps.resultopsfixed);
+
+ /*
+ * LockRows nodes do no projections, so initialize projection info for
+ * this node appropriately
+ */
+ lrstate->ps.ps_ProjInfo = NULL;
+
+ /*
+ * Locate the ExecRowMark(s) that this node is responsible for, and
+ * construct ExecAuxRowMarks for them. (InitPlan should already have
+ * built the global list of ExecRowMarks.)
+ */
+ lrstate->lr_arowMarks = NIL;
+ epq_arowmarks = NIL;
+ foreach(lc, node->rowMarks)
+ {
+ PlanRowMark *rc = lfirst_node(PlanRowMark, lc);
+ ExecRowMark *erm;
+ ExecAuxRowMark *aerm;
+
+ /* ignore "parent" rowmarks; they are irrelevant at runtime */
+ if (rc->isParent)
+ continue;
+
+ /* find ExecRowMark and build ExecAuxRowMark */
+ erm = ExecFindRowMark(estate, rc->rti, false);
+ aerm = ExecBuildAuxRowMark(erm, outerPlan->targetlist);
+
+ /*
+ * Only locking rowmarks go into our own list. Non-locking marks are
+ * passed off to the EvalPlanQual machinery. This is because we don't
+ * want to bother fetching non-locked rows unless we actually have to
+ * do an EPQ recheck.
+ */
+ if (RowMarkRequiresRowShareLock(erm->markType))
+ lrstate->lr_arowMarks = lappend(lrstate->lr_arowMarks, aerm);
+ else
+ epq_arowmarks = lappend(epq_arowmarks, aerm);
+ }
+
+ /* Now we have the info needed to set up EPQ state */
+ EvalPlanQualInit(&lrstate->lr_epqstate, estate,
+ outerPlan, epq_arowmarks, node->epqParam);
+
+ return lrstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndLockRows
+ *
+ * This shuts down the subplan and frees resources allocated
+ * to this node.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndLockRows(LockRowsState *node)
+{
+ /* We may have shut down EPQ already, but no harm in another call */
+ EvalPlanQualEnd(&node->lr_epqstate);
+ ExecEndNode(outerPlanState(node));
+}
+
+
+void
+ExecReScanLockRows(LockRowsState *node)
+{
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (node->ps.lefttree->chgParam == NULL)
+ ExecReScan(node->ps.lefttree);
+}
diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c
new file mode 100644
index 0000000..7c53f8e
--- /dev/null
+++ b/src/backend/executor/nodeMaterial.c
@@ -0,0 +1,368 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeMaterial.c
+ * Routines to handle materialization nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeMaterial.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecMaterial - materialize the result of a subplan
+ * ExecInitMaterial - initialize node and subnodes
+ * ExecEndMaterial - shutdown node and subnodes
+ *
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeMaterial.h"
+#include "miscadmin.h"
+
+/* ----------------------------------------------------------------
+ * ExecMaterial
+ *
+ * As long as we are at the end of the data collected in the tuplestore,
+ * we collect one new row from the subplan on each call, and stash it
+ * aside in the tuplestore before returning it. The tuplestore is
+ * only read if we are asked to scan backwards, rescan, or mark/restore.
+ *
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot * /* result tuple from subplan */
+ExecMaterial(PlanState *pstate)
+{
+ MaterialState *node = castNode(MaterialState, pstate);
+ EState *estate;
+ ScanDirection dir;
+ bool forward;
+ Tuplestorestate *tuplestorestate;
+ bool eof_tuplestore;
+ TupleTableSlot *slot;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get state info from node
+ */
+ estate = node->ss.ps.state;
+ dir = estate->es_direction;
+ forward = ScanDirectionIsForward(dir);
+ tuplestorestate = node->tuplestorestate;
+
+ /*
+ * If first time through, and we need a tuplestore, initialize it.
+ */
+ if (tuplestorestate == NULL && node->eflags != 0)
+ {
+ tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
+ tuplestore_set_eflags(tuplestorestate, node->eflags);
+ if (node->eflags & EXEC_FLAG_MARK)
+ {
+ /*
+ * Allocate a second read pointer to serve as the mark. We know it
+ * must have index 1, so needn't store that.
+ */
+ int ptrno PG_USED_FOR_ASSERTS_ONLY;
+
+ ptrno = tuplestore_alloc_read_pointer(tuplestorestate,
+ node->eflags);
+ Assert(ptrno == 1);
+ }
+ node->tuplestorestate = tuplestorestate;
+ }
+
+ /*
+ * If we are not at the end of the tuplestore, or are going backwards, try
+ * to fetch a tuple from tuplestore.
+ */
+ eof_tuplestore = (tuplestorestate == NULL) ||
+ tuplestore_ateof(tuplestorestate);
+
+ if (!forward && eof_tuplestore)
+ {
+ if (!node->eof_underlying)
+ {
+ /*
+ * When reversing direction at tuplestore EOF, the first
+ * gettupleslot call will fetch the last-added tuple; but we want
+ * to return the one before that, if possible. So do an extra
+ * fetch.
+ */
+ if (!tuplestore_advance(tuplestorestate, forward))
+ return NULL; /* the tuplestore must be empty */
+ }
+ eof_tuplestore = false;
+ }
+
+ /*
+ * If we can fetch another tuple from the tuplestore, return it.
+ */
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ if (!eof_tuplestore)
+ {
+ if (tuplestore_gettupleslot(tuplestorestate, forward, false, slot))
+ return slot;
+ if (forward)
+ eof_tuplestore = true;
+ }
+
+ /*
+ * If necessary, try to fetch another row from the subplan.
+ *
+ * Note: the eof_underlying state variable exists to short-circuit further
+ * subplan calls. It's not optional, unfortunately, because some plan
+ * node types are not robust about being called again when they've already
+ * returned NULL.
+ */
+ if (eof_tuplestore && !node->eof_underlying)
+ {
+ PlanState *outerNode;
+ TupleTableSlot *outerslot;
+
+ /*
+ * We can only get here with forward==true, so no need to worry about
+ * which direction the subplan will go.
+ */
+ outerNode = outerPlanState(node);
+ outerslot = ExecProcNode(outerNode);
+ if (TupIsNull(outerslot))
+ {
+ node->eof_underlying = true;
+ return NULL;
+ }
+
+ /*
+ * Append a copy of the returned tuple to tuplestore. NOTE: because
+ * the tuplestore is certainly in EOF state, its read position will
+ * move forward over the added tuple. This is what we want.
+ */
+ if (tuplestorestate)
+ tuplestore_puttupleslot(tuplestorestate, outerslot);
+
+ ExecCopySlot(slot, outerslot);
+ return slot;
+ }
+
+ /*
+ * Nothing left ...
+ */
+ return ExecClearTuple(slot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitMaterial
+ * ----------------------------------------------------------------
+ */
+MaterialState *
+ExecInitMaterial(Material *node, EState *estate, int eflags)
+{
+ MaterialState *matstate;
+ Plan *outerPlan;
+
+ /*
+ * create state structure
+ */
+ matstate = makeNode(MaterialState);
+ matstate->ss.ps.plan = (Plan *) node;
+ matstate->ss.ps.state = estate;
+ matstate->ss.ps.ExecProcNode = ExecMaterial;
+
+ /*
+ * We must have a tuplestore buffering the subplan output to do backward
+ * scan or mark/restore. We also prefer to materialize the subplan output
+ * if we might be called on to rewind and replay it many times. However,
+ * if none of these cases apply, we can skip storing the data.
+ */
+ matstate->eflags = (eflags & (EXEC_FLAG_REWIND |
+ EXEC_FLAG_BACKWARD |
+ EXEC_FLAG_MARK));
+
+ /*
+ * Tuplestore's interpretation of the flag bits is subtly different from
+ * the general executor meaning: it doesn't think BACKWARD necessarily
+ * means "backwards all the way to start". If told to support BACKWARD we
+ * must include REWIND in the tuplestore eflags, else tuplestore_trim
+ * might throw away too much.
+ */
+ if (eflags & EXEC_FLAG_BACKWARD)
+ matstate->eflags |= EXEC_FLAG_REWIND;
+
+ matstate->eof_underlying = false;
+ matstate->tuplestorestate = NULL;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * Materialization nodes don't need ExprContexts because they never call
+ * ExecQual or ExecProject.
+ */
+
+ /*
+ * initialize child nodes
+ *
+ * We shield the child node from the need to support REWIND, BACKWARD, or
+ * MARK/RESTORE.
+ */
+ eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);
+
+ outerPlan = outerPlan(node);
+ outerPlanState(matstate) = ExecInitNode(outerPlan, estate, eflags);
+
+ /*
+ * Initialize result type and slot. No need to initialize projection info
+ * because this node doesn't do projections.
+ *
+ * material nodes only return tuples from their materialized relation.
+ */
+ ExecInitResultTupleSlotTL(&matstate->ss.ps, &TTSOpsMinimalTuple);
+ matstate->ss.ps.ps_ProjInfo = NULL;
+
+ /*
+ * initialize tuple type.
+ */
+ ExecCreateScanSlotFromOuterPlan(estate, &matstate->ss, &TTSOpsMinimalTuple);
+
+ return matstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndMaterial
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndMaterial(MaterialState *node)
+{
+ /*
+ * clean out the tuple table
+ */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * Release tuplestore resources
+ */
+ if (node->tuplestorestate != NULL)
+ tuplestore_end(node->tuplestorestate);
+ node->tuplestorestate = NULL;
+
+ /*
+ * shut down the subplan
+ */
+ ExecEndNode(outerPlanState(node));
+}
+
+/* ----------------------------------------------------------------
+ * ExecMaterialMarkPos
+ *
+ * Calls tuplestore to save the current position in the stored file.
+ * ----------------------------------------------------------------
+ */
+void
+ExecMaterialMarkPos(MaterialState *node)
+{
+ Assert(node->eflags & EXEC_FLAG_MARK);
+
+ /*
+ * if we haven't materialized yet, just return.
+ */
+ if (!node->tuplestorestate)
+ return;
+
+ /*
+ * copy the active read pointer to the mark.
+ */
+ tuplestore_copy_read_pointer(node->tuplestorestate, 0, 1);
+
+ /*
+ * since we may have advanced the mark, try to truncate the tuplestore.
+ */
+ tuplestore_trim(node->tuplestorestate);
+}
+
+/* ----------------------------------------------------------------
+ * ExecMaterialRestrPos
+ *
+ * Calls tuplestore to restore the last saved file position.
+ * ----------------------------------------------------------------
+ */
+void
+ExecMaterialRestrPos(MaterialState *node)
+{
+ Assert(node->eflags & EXEC_FLAG_MARK);
+
+ /*
+ * if we haven't materialized yet, just return.
+ */
+ if (!node->tuplestorestate)
+ return;
+
+ /*
+ * copy the mark to the active read pointer.
+ */
+ tuplestore_copy_read_pointer(node->tuplestorestate, 1, 0);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanMaterial
+ *
+ * Rescans the materialized relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanMaterial(MaterialState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ if (node->eflags != 0)
+ {
+ /*
+ * If we haven't materialized yet, just return. If outerplan's
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else no reason to re-scan it at all.
+ */
+ if (!node->tuplestorestate)
+ return;
+
+ /*
+ * If subnode is to be rescanned then we forget previous stored
+ * results; we have to re-read the subplan and re-store. Also, if we
+ * told tuplestore it needn't support rescan, we lose and must
+ * re-read. (This last should not happen in common cases; else our
+ * caller lied by not passing EXEC_FLAG_REWIND to us.)
+ *
+ * Otherwise we can just rewind and rescan the stored output. The
+ * state of the subnode does not change.
+ */
+ if (outerPlan->chgParam != NULL ||
+ (node->eflags & EXEC_FLAG_REWIND) == 0)
+ {
+ tuplestore_end(node->tuplestorestate);
+ node->tuplestorestate = NULL;
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+ node->eof_underlying = false;
+ }
+ else
+ tuplestore_rescan(node->tuplestorestate);
+ }
+ else
+ {
+ /* In this case we are just passing on the subquery's output */
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+ node->eof_underlying = false;
+ }
+}
diff --git a/src/backend/executor/nodeMemoize.c b/src/backend/executor/nodeMemoize.c
new file mode 100644
index 0000000..f82f41f
--- /dev/null
+++ b/src/backend/executor/nodeMemoize.c
@@ -0,0 +1,1225 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeMemoize.c
+ * Routines to handle caching of results from parameterized nodes
+ *
+ * Portions Copyright (c) 2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeMemoize.c
+ *
+ * Memoize nodes are intended to sit above parameterized nodes in the plan
+ * tree in order to cache results from them. The intention here is that a
+ * repeat scan with a parameter value that has already been seen by the node
+ * can fetch tuples from the cache rather than having to re-scan the outer
+ * node all over again. The query planner may choose to make use of one of
+ * these when it thinks rescans for previously seen values are likely enough
+ * to warrant adding the additional node.
+ *
+ * The cache itself is a hash table. When the cache fills, we never spill
+ * tuples to disk; instead, we evict the least recently used cache entry.
+ * We track recency by pushing new entries, and entries we look up, onto
+ * the tail of a doubly linked list, so the least recently used entries
+ * accumulate at the head of this LRU list.
+ *
+ * Sometimes our callers won't run their scans to completion. For example a
+ * semi-join only needs to run until it finds a matching tuple, and once it
+ * does, the join operator skips to the next outer tuple and does not execute
+ * the inner side again on that scan. Because of this, we must keep track of
+ * when a cache entry is complete, and by default, we know it is when we run
+ * out of tuples to read during the scan. However, there are cases where we
+ * can mark the cache entry as complete without exhausting the scan of all
+ * tuples. One case is unique joins, where the join operator knows that there
+ * will only be at most one match for any given outer tuple. In order to
+ * support such cases we allow the "singlerow" option to be set for the cache.
+ * This option marks the cache entry as complete after we read the first tuple
+ * from the subnode.
+ *
+ * It's possible when we're filling the cache for a given set of parameters
+ * that we're unable to free enough memory to store any more tuples. If this
+ * happens then we'll have already evicted all other cache entries. When
+ * caching another tuple would cause us to exceed our memory budget, we must
+ * free the entry that we're currently populating and move the state machine
+ * into MEMO_CACHE_BYPASS_MODE. This means that we'll not attempt to cache
+ * any further tuples for this particular scan. We don't have the memory for
+ * it. The state machine will be reset again on the next rescan. If the
+ * memory requirements to cache the next parameter's tuples are less
+ * demanding, then that may allow us to start putting useful entries back into
+ * the cache again.
+ *
+ *
+ * INTERFACE ROUTINES
+ * ExecMemoize - lookup cache, exec subplan when not found
+ * ExecInitMemoize - initialize node and subnodes
+ * ExecEndMemoize - shutdown node and subnodes
+ * ExecReScanMemoize - rescan the memoize node
+ *
+ * ExecMemoizeEstimate estimates DSM space needed for parallel plan
+ * ExecMemoizeInitializeDSM initialize DSM for parallel plan
+ * ExecMemoizeInitializeWorker attach to DSM info in parallel worker
+ * ExecMemoizeRetrieveInstrumentation get instrumentation from worker
+ *-------------------------------------------------------------------------
+ */
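+
+/*
+ * As a rough illustration (using made-up tables t1 and t2; exact plan
+ * shapes and EXPLAIN output vary), a Memoize node usually appears on the
+ * inner side of a nested loop, above a parameterized scan, e.g.
+ *
+ *     Nested Loop
+ *       ->  Seq Scan on t1
+ *       ->  Memoize
+ *             Cache Key: t1.x
+ *             ->  Index Scan using t2_x_idx on t2
+ *                   Index Cond: (x = t1.x)
+ *
+ * Each distinct value of t1.x causes at most one scan of t2 (assuming the
+ * cache is large enough to hold the results); repeat values are answered
+ * from the cache, subject to the memory limit described above.
+ */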
+
+#include "postgres.h"
+
+#include "common/hashfn.h"
+#include "executor/executor.h"
+#include "executor/nodeMemoize.h"
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+
+/* States of the ExecMemoize state machine */
+#define MEMO_CACHE_LOOKUP 1 /* Attempt to perform a cache lookup */
+#define MEMO_CACHE_FETCH_NEXT_TUPLE 2 /* Get another tuple from the cache */
+#define MEMO_FILLING_CACHE 3 /* Read outer node to fill cache */
+#define MEMO_CACHE_BYPASS_MODE 4 /* Bypass mode. Just read from our
+ * subplan without caching anything */
+#define MEMO_END_OF_SCAN 5 /* Ready for rescan */
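+
+/*
+ * Typical transitions, as driven by ExecMemoize() below: a scan starts in
+ * MEMO_CACHE_LOOKUP; a hit on a complete cache entry moves to
+ * MEMO_CACHE_FETCH_NEXT_TUPLE, a miss (or a hit on an incomplete entry,
+ * which is purged and refilled) moves to MEMO_FILLING_CACHE, and running
+ * out of cache memory moves to MEMO_CACHE_BYPASS_MODE. All paths end in
+ * MEMO_END_OF_SCAN once the tuples are exhausted, and ExecReScanMemoize()
+ * resets the state machine back to MEMO_CACHE_LOOKUP.
+ */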
+
+
+/* Helper macros for memory accounting */
+#define EMPTY_ENTRY_MEMORY_BYTES(e) (sizeof(MemoizeEntry) + \
+ sizeof(MemoizeKey) + \
+ (e)->key->params->t_len)
+#define CACHE_TUPLE_BYTES(t) (sizeof(MemoizeTuple) + \
+ (t)->mintuple->t_len)
+
+/*
+ * MemoizeTuple
+ * Stores an individually cached tuple
+ */
+typedef struct MemoizeTuple
+{
+ MinimalTuple mintuple; /* Cached tuple */
+ struct MemoizeTuple *next; /* The next tuple with the same parameter
+ * values or NULL if it's the last one */
+} MemoizeTuple;
+
+/*
+ * MemoizeKey
+ * The hash table key for cached entries plus the LRU list link
+ */
+typedef struct MemoizeKey
+{
+ MinimalTuple params;
+ dlist_node lru_node; /* Pointer to next/prev key in LRU list */
+} MemoizeKey;
+
+/*
+ * MemoizeEntry
+ * The data struct that the cache hash table stores
+ */
+typedef struct MemoizeEntry
+{
+ MemoizeKey *key; /* Hash key for hash table lookups */
+ MemoizeTuple *tuplehead; /* Pointer to the first tuple or NULL if
+ * no tuples are cached for this entry */
+ uint32 hash; /* Hash value (cached) */
+ char status; /* Hash status */
+ bool complete; /* Did we read the outer plan to completion? */
+} MemoizeEntry;
+
+
+#define SH_PREFIX memoize
+#define SH_ELEMENT_TYPE MemoizeEntry
+#define SH_KEY_TYPE MemoizeKey *
+#define SH_SCOPE static inline
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+static uint32 MemoizeHash_hash(struct memoize_hash *tb,
+ const MemoizeKey *key);
+static bool MemoizeHash_equal(struct memoize_hash *tb,
+ const MemoizeKey *params1,
+ const MemoizeKey *params2);
+
+#define SH_PREFIX memoize
+#define SH_ELEMENT_TYPE MemoizeEntry
+#define SH_KEY_TYPE MemoizeKey *
+#define SH_KEY key
+#define SH_HASH_KEY(tb, key) MemoizeHash_hash(tb, key)
+#define SH_EQUAL(tb, a, b) MemoizeHash_equal(tb, a, b)
+#define SH_SCOPE static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a) a->hash
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+/*
+ * MemoizeHash_hash
+ * Hash function for simplehash hashtable. 'key' is unused here as we
+ * require that all table lookups first populate the MemoizeState's
+ * probeslot with the key values to be looked up.
+ */
+static uint32
+MemoizeHash_hash(struct memoize_hash *tb, const MemoizeKey *key)
+{
+ MemoizeState *mstate = (MemoizeState *) tb->private_data;
+ TupleTableSlot *pslot = mstate->probeslot;
+ uint32 hashkey = 0;
+ int numkeys = mstate->nkeys;
+
+ if (mstate->binary_mode)
+ {
+ for (int i = 0; i < numkeys; i++)
+ {
+ /* rotate hashkey left 1 bit at each step */
+ hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
+
+ if (!pslot->tts_isnull[i]) /* treat nulls as having hash key 0 */
+ {
+ FormData_pg_attribute *attr;
+ uint32 hkey;
+
+ attr = &pslot->tts_tupleDescriptor->attrs[i];
+
+ hkey = datum_image_hash(pslot->tts_values[i], attr->attbyval, attr->attlen);
+
+ hashkey ^= hkey;
+ }
+ }
+ }
+ else
+ {
+ FmgrInfo *hashfunctions = mstate->hashfunctions;
+ Oid *collations = mstate->collations;
+
+ for (int i = 0; i < numkeys; i++)
+ {
+ /* rotate hashkey left 1 bit at each step */
+ hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
+
+ if (!pslot->tts_isnull[i]) /* treat nulls as having hash key 0 */
+ {
+ uint32 hkey;
+
+ hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i],
+ collations[i], pslot->tts_values[i]));
+ hashkey ^= hkey;
+ }
+ }
+ }
+
+ return murmurhash32(hashkey);
+}
+
+/*
+ * MemoizeHash_equal
+ * Equality function for confirming hash value matches during a hash
+ * table lookup. 'key2' is never used. Instead the MemoizeState's
+ * probeslot is always populated with details of what's being looked up.
+ */
+static bool
+MemoizeHash_equal(struct memoize_hash *tb, const MemoizeKey *key1,
+ const MemoizeKey *key2)
+{
+ MemoizeState *mstate = (MemoizeState *) tb->private_data;
+ ExprContext *econtext = mstate->ss.ps.ps_ExprContext;
+ TupleTableSlot *tslot = mstate->tableslot;
+ TupleTableSlot *pslot = mstate->probeslot;
+
+ /* probeslot should have already been prepared by prepare_probe_slot() */
+ ExecStoreMinimalTuple(key1->params, tslot, false);
+
+ if (mstate->binary_mode)
+ {
+ int numkeys = mstate->nkeys;
+
+ slot_getallattrs(tslot);
+ slot_getallattrs(pslot);
+
+ for (int i = 0; i < numkeys; i++)
+ {
+ FormData_pg_attribute *attr;
+
+ if (tslot->tts_isnull[i] != pslot->tts_isnull[i])
+ return false;
+
+ /* both NULL? they're equal */
+ if (tslot->tts_isnull[i])
+ continue;
+
+ /* perform binary comparison on the two datums */
+ attr = &tslot->tts_tupleDescriptor->attrs[i];
+ if (!datum_image_eq(tslot->tts_values[i], pslot->tts_values[i],
+ attr->attbyval, attr->attlen))
+ return false;
+ }
+ return true;
+ }
+ else
+ {
+ econtext->ecxt_innertuple = tslot;
+ econtext->ecxt_outertuple = pslot;
+ return ExecQualAndReset(mstate->cache_eq_expr, econtext);
+ }
+}
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(MemoizeState *mstate, uint32 size)
+{
+ /* Make a guess at a good size when we're not given a valid size. */
+ if (size == 0)
+ size = 1024;
+
+ /* memoize_create will convert the size to a power of 2 */
+ mstate->hashtable = memoize_create(mstate->tableContext, size, mstate);
+}
+
+/*
+ * prepare_probe_slot
+ * Populate mstate's probeslot with the values from the tuple stored
+ * in 'key'. If 'key' is NULL, then perform the population by evaluating
+ * mstate's param_exprs.
+ */
+static inline void
+prepare_probe_slot(MemoizeState *mstate, MemoizeKey *key)
+{
+ TupleTableSlot *pslot = mstate->probeslot;
+ TupleTableSlot *tslot = mstate->tableslot;
+ int numKeys = mstate->nkeys;
+
+ ExecClearTuple(pslot);
+
+ if (key == NULL)
+ {
+ /* Set the probeslot's values based on the current parameter values */
+ for (int i = 0; i < numKeys; i++)
+ pslot->tts_values[i] = ExecEvalExpr(mstate->param_exprs[i],
+ mstate->ss.ps.ps_ExprContext,
+ &pslot->tts_isnull[i]);
+ }
+ else
+ {
+ /* Process the key's MinimalTuple and store the values in probeslot */
+ ExecStoreMinimalTuple(key->params, tslot, false);
+ slot_getallattrs(tslot);
+ memcpy(pslot->tts_values, tslot->tts_values, sizeof(Datum) * numKeys);
+ memcpy(pslot->tts_isnull, tslot->tts_isnull, sizeof(bool) * numKeys);
+ }
+
+ ExecStoreVirtualTuple(pslot);
+}
+
+/*
+ * entry_purge_tuples
+ * Remove all tuples from the cache entry pointed to by 'entry'. This
+ * leaves an empty cache entry. Also, update the memory accounting to
+ * reflect the removal of the tuples.
+ */
+static inline void
+entry_purge_tuples(MemoizeState *mstate, MemoizeEntry *entry)
+{
+ MemoizeTuple *tuple = entry->tuplehead;
+ uint64 freed_mem = 0;
+
+ while (tuple != NULL)
+ {
+ MemoizeTuple *next = tuple->next;
+
+ freed_mem += CACHE_TUPLE_BYTES(tuple);
+
+ /* Free memory used for this tuple */
+ pfree(tuple->mintuple);
+ pfree(tuple);
+
+ tuple = next;
+ }
+
+ entry->complete = false;
+ entry->tuplehead = NULL;
+
+ /* Update the memory accounting */
+ mstate->mem_used -= freed_mem;
+}
+
+/*
+ * remove_cache_entry
+ * Remove 'entry' from the cache and free memory used by it.
+ */
+static void
+remove_cache_entry(MemoizeState *mstate, MemoizeEntry *entry)
+{
+ MemoizeKey *key = entry->key;
+
+ dlist_delete(&entry->key->lru_node);
+
+ /* Remove all of the tuples from this entry */
+ entry_purge_tuples(mstate, entry);
+
+ /*
+ * Update memory accounting. entry_purge_tuples should have already
+ * subtracted the memory used for each cached tuple. Here we just update
+ * the amount used by the entry itself.
+ */
+ mstate->mem_used -= EMPTY_ENTRY_MEMORY_BYTES(entry);
+
+ /* Remove the entry from the cache */
+ memoize_delete_item(mstate->hashtable, entry);
+
+ pfree(key->params);
+ pfree(key);
+}
+
+/*
+ * cache_purge_all
+ * Remove all items from the cache
+ */
+static void
+cache_purge_all(MemoizeState *mstate)
+{
+ uint64 evictions = mstate->hashtable->members;
+ PlanState *pstate = (PlanState *) mstate;
+
+ /*
+ * Likely the most efficient way to remove all items is to just reset the
+ * memory context for the cache and then rebuild a fresh hash table. This
+ * saves having to remove each item one by one and pfree each cached tuple
+ */
+ MemoryContextReset(mstate->tableContext);
+
+ /* Make the hash table the same size as the original size */
+ build_hash_table(mstate, ((Memoize *) pstate->plan)->est_entries);
+
+ /* reset the LRU list */
+ dlist_init(&mstate->lru_list);
+ mstate->last_tuple = NULL;
+ mstate->entry = NULL;
+
+ mstate->mem_used = 0;
+
+ /* XXX should we add something new to track these purges? */
+ mstate->stats.cache_evictions += evictions; /* Update Stats */
+}
+
+/*
+ * cache_reduce_memory
+ * Evict older and less recently used items from the cache in order to
+ * reduce the memory consumption back to something below the
+ * MemoizeState's mem_limit.
+ *
+ * 'specialkey', if not NULL, causes the function to return false if the entry
+ * which the key belongs to is removed from the cache.
+ */
+static bool
+cache_reduce_memory(MemoizeState *mstate, MemoizeKey *specialkey)
+{
+ bool specialkey_intact = true; /* for now */
+ dlist_mutable_iter iter;
+ uint64 evictions = 0;
+
+ /* Update peak memory usage */
+ if (mstate->mem_used > mstate->stats.mem_peak)
+ mstate->stats.mem_peak = mstate->mem_used;
+
+ /* We expect only to be called when we've gone over budget on memory */
+ Assert(mstate->mem_used > mstate->mem_limit);
+
+ /* Start the eviction process starting at the head of the LRU list. */
+ dlist_foreach_modify(iter, &mstate->lru_list)
+ {
+ MemoizeKey *key = dlist_container(MemoizeKey, lru_node, iter.cur);
+ MemoizeEntry *entry;
+
+ /*
+ * Populate the hash probe slot in preparation for looking up this LRU
+ * entry.
+ */
+ prepare_probe_slot(mstate, key);
+
+ /*
+ * Ideally the LRU list pointers would be stored in the entry itself
+ * rather than in the key. Unfortunately, we can't do that as the
+ * simplehash.h code may resize the table and allocate new memory for
+ * entries which would result in those pointers pointing to the old
+ * buckets. However, it's fine to use the key to store this as that's
+ * only referenced by a pointer in the entry, which of course follows
+ * the entry whenever the hash table is resized. Since we only have a
+ * pointer to the key here, we must perform a hash table lookup to
+ * find the entry that the key belongs to.
+ */
+ entry = memoize_lookup(mstate->hashtable, NULL);
+
+ /*
+ * Sanity check that we found the entry belonging to the LRU list
+ * item. A misbehaving hash or equality function could cause the
+ * entry not to be found or the wrong entry to be found.
+ */
+ if (unlikely(entry == NULL || entry->key != key))
+ elog(ERROR, "could not find memoization table entry");
+
+ /*
+ * If we're being called to free memory while the cache is being
+ * populated with new tuples, then we'd better take some care as we
+ * could end up freeing the entry which 'specialkey' belongs to.
+ * Generally callers will pass 'specialkey' as the key for the cache
+ * entry which is currently being populated, so we must set
+ * 'specialkey_intact' to false to inform the caller the specialkey
+ * entry has been removed.
+ */
+ if (key == specialkey)
+ specialkey_intact = false;
+
+ /*
+ * Finally remove the entry. This will remove from the LRU list too.
+ */
+ remove_cache_entry(mstate, entry);
+
+ evictions++;
+
+ /* Exit if we've freed enough memory */
+ if (mstate->mem_used <= mstate->mem_limit)
+ break;
+ }
+
+ mstate->stats.cache_evictions += evictions; /* Update Stats */
+
+ return specialkey_intact;
+}
+
+/*
+ * cache_lookup
+ * Perform a lookup to see if we've already cached tuples based on the
+ * scan's current parameters. If we find an existing entry we move it to
+ * the end of the LRU list, set *found to true then return it. If we
+ * don't find an entry then we create a new one and add it to the end of
+ * the LRU list. We also update cache memory accounting and remove older
+ * entries if we go over the memory budget. If we managed to free enough
+ * memory we return the new entry, else we return NULL.
+ *
+ * Callers can assume we'll never return NULL when *found is true.
+ */
+static MemoizeEntry *
+cache_lookup(MemoizeState *mstate, bool *found)
+{
+ MemoizeKey *key;
+ MemoizeEntry *entry;
+ MemoryContext oldcontext;
+
+ /* prepare the probe slot with the current scan parameters */
+ prepare_probe_slot(mstate, NULL);
+
+ /*
+ * Add the new entry to the cache. No need to pass a valid key since the
+ * hash function uses mstate's probeslot, which we populated above.
+ */
+ entry = memoize_insert(mstate->hashtable, NULL, found);
+
+ if (*found)
+ {
+ /*
+ * Move existing entry to the tail of the LRU list to mark it as the
+ * most recently used item.
+ */
+ dlist_move_tail(&mstate->lru_list, &entry->key->lru_node);
+
+ return entry;
+ }
+
+ oldcontext = MemoryContextSwitchTo(mstate->tableContext);
+
+ /* Allocate a new key */
+ entry->key = key = (MemoizeKey *) palloc(sizeof(MemoizeKey));
+ key->params = ExecCopySlotMinimalTuple(mstate->probeslot);
+
+ /* Update the total cache memory utilization */
+ mstate->mem_used += EMPTY_ENTRY_MEMORY_BYTES(entry);
+
+ /* Initialize this entry */
+ entry->complete = false;
+ entry->tuplehead = NULL;
+
+ /*
+ * Since this is the most recently used entry, push this entry onto the
+ * end of the LRU list.
+ */
+ dlist_push_tail(&mstate->lru_list, &entry->key->lru_node);
+
+ mstate->last_tuple = NULL;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * If we've gone over our memory budget, then we'll free up some space in
+ * the cache.
+ */
+ if (mstate->mem_used > mstate->mem_limit)
+ {
+ /*
+ * Try to free up some memory. It's highly unlikely that we'll fail
+ * to do so here since the entry we've just added is yet to contain
+ * any tuples and we're able to remove any other entry to reduce the
+ * memory consumption.
+ */
+ if (unlikely(!cache_reduce_memory(mstate, key)))
+ return NULL;
+
+ /*
+ * The process of removing entries from the cache may have caused the
+ * code in simplehash.h to shuffle elements to earlier buckets in the
+ * hash table. If it has, we'll need to find the entry again by
+ * performing a lookup. Fortunately, we can detect if this has
+ * happened by seeing if the entry is still in use and that the key
+ * pointer matches our expected key.
+ */
+ if (entry->status != memoize_SH_IN_USE || entry->key != key)
+ {
+ /*
+ * We need to repopulate the probeslot as lookups performed during
+ * the cache evictions above will have stored some other key.
+ */
+ prepare_probe_slot(mstate, key);
+
+ /* Re-find the newly added entry */
+ entry = memoize_lookup(mstate->hashtable, NULL);
+ Assert(entry != NULL);
+ }
+ }
+
+ return entry;
+}
+
+/*
+ * cache_store_tuple
+ * Add the tuple stored in 'slot' to the mstate's current cache entry.
+ * The cache entry must have already been made with cache_lookup().
+ * mstate's last_tuple field must point to the tail of mstate->entry's
+ * list of tuples.
+ */
+static bool
+cache_store_tuple(MemoizeState *mstate, TupleTableSlot *slot)
+{
+ MemoizeTuple *tuple;
+ MemoizeEntry *entry = mstate->entry;
+ MemoryContext oldcontext;
+
+ Assert(slot != NULL);
+ Assert(entry != NULL);
+
+ oldcontext = MemoryContextSwitchTo(mstate->tableContext);
+
+ tuple = (MemoizeTuple *) palloc(sizeof(MemoizeTuple));
+ tuple->mintuple = ExecCopySlotMinimalTuple(slot);
+ tuple->next = NULL;
+
+ /* Account for the memory we just consumed */
+ mstate->mem_used += CACHE_TUPLE_BYTES(tuple);
+
+ if (entry->tuplehead == NULL)
+ {
+ /*
+ * This is the first tuple for this entry, so just point the list head
+ * to it.
+ */
+ entry->tuplehead = tuple;
+ }
+ else
+ {
+ /* push this tuple onto the tail of the list */
+ mstate->last_tuple->next = tuple;
+ }
+
+ mstate->last_tuple = tuple;
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * If we've gone over our memory budget then free up some space in the
+ * cache.
+ */
+ if (mstate->mem_used > mstate->mem_limit)
+ {
+ MemoizeKey *key = entry->key;
+
+ if (!cache_reduce_memory(mstate, key))
+ return false;
+
+ /*
+ * The process of removing entries from the cache may have caused the
+ * code in simplehash.h to shuffle elements to earlier buckets in the
+ * hash table. If it has, we'll need to find the entry again by
+ * performing a lookup. Fortunately, we can detect if this has
+ * happened by seeing if the entry is still in use and that the key
+ * pointer matches our expected key.
+ */
+ if (entry->status != memoize_SH_IN_USE || entry->key != key)
+ {
+ /*
+ * We need to repopulate the probeslot as lookups performed during
+ * the cache evictions above will have stored some other key.
+ */
+ prepare_probe_slot(mstate, key);
+
+ /* Re-find the entry */
+ mstate->entry = entry = memoize_lookup(mstate->hashtable, NULL);
+ Assert(entry != NULL);
+ }
+ }
+
+ return true;
+}
+
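+/*
+ * ExecMemoize
+ * Fetch the next tuple for the current parameter values, serving it
+ * from the cache when those values have been seen before, and otherwise
+ * reading from the subplan and (memory permitting) caching what it
+ * returns, per the state machine described above.
+ */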
+static TupleTableSlot *
+ExecMemoize(PlanState *pstate)
+{
+ MemoizeState *node = castNode(MemoizeState, pstate);
+ PlanState *outerNode;
+ TupleTableSlot *slot;
+
+ switch (node->mstatus)
+ {
+ case MEMO_CACHE_LOOKUP:
+ {
+ MemoizeEntry *entry;
+ TupleTableSlot *outerslot;
+ bool found;
+
+ Assert(node->entry == NULL);
+
+ /*
+ * We're only ever in this state for the first call of the
+ * scan. Here we have a look to see if we've already seen the
+ * current parameters before and if we have already cached a
+ * complete set of records that the outer plan will return for
+ * these parameters.
+ *
+ * When we find a valid cache entry, we'll return the first
+ * tuple from it. If not found, we'll create a cache entry and
+ * then try to fetch a tuple from the outer scan. If we find
+ * one there, we'll try to cache it.
+ */
+
+ /* see if we've got anything cached for the current parameters */
+ entry = cache_lookup(node, &found);
+
+ if (found && entry->complete)
+ {
+ node->stats.cache_hits += 1; /* stats update */
+
+ /*
+ * Set last_tuple and entry so that the state
+ * MEMO_CACHE_FETCH_NEXT_TUPLE can easily find the next
+ * tuple for these parameters.
+ */
+ node->last_tuple = entry->tuplehead;
+ node->entry = entry;
+
+ /* Fetch the first cached tuple, if there is one */
+ if (entry->tuplehead)
+ {
+ node->mstatus = MEMO_CACHE_FETCH_NEXT_TUPLE;
+
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ ExecStoreMinimalTuple(entry->tuplehead->mintuple,
+ slot, false);
+
+ return slot;
+ }
+
+ /* The cache entry is void of any tuples. */
+ node->mstatus = MEMO_END_OF_SCAN;
+ return NULL;
+ }
+
+ /* Handle cache miss */
+ node->stats.cache_misses += 1; /* stats update */
+
+ if (found)
+ {
+ /*
+ * A cache entry was found, but the scan for that entry
+ * did not run to completion. We'll just remove all
+ * tuples and start again. It might be tempting to
+ * continue where we left off, but there's no guarantee
+ * the outer node will produce the tuples in the same
+ * order as it did last time.
+ */
+ entry_purge_tuples(node, entry);
+ }
+
+ /* Scan the outer node for a tuple to cache */
+ outerNode = outerPlanState(node);
+ outerslot = ExecProcNode(outerNode);
+ if (TupIsNull(outerslot))
+ {
+ /*
+ * cache_lookup may have returned NULL due to failure to
+ * free enough cache space, so ensure we don't do anything
+ * here that assumes it worked. There's no need to go into
+ * bypass mode here as we're setting mstatus to end of
+ * scan.
+ */
+ if (likely(entry))
+ entry->complete = true;
+
+ node->mstatus = MEMO_END_OF_SCAN;
+ return NULL;
+ }
+
+ node->entry = entry;
+
+ /*
+ * If we failed to create the entry or failed to store the
+ * tuple in the entry, then go into bypass mode.
+ */
+ if (unlikely(entry == NULL ||
+ !cache_store_tuple(node, outerslot)))
+ {
+ node->stats.cache_overflows += 1; /* stats update */
+
+ node->mstatus = MEMO_CACHE_BYPASS_MODE;
+
+ /*
+ * No need to clear out last_tuple as we'll stay in bypass
+ * mode until the end of the scan.
+ */
+ }
+ else
+ {
+ /*
+ * If we only expect a single row from this scan then we
+ * can mark that we're not expecting more. This allows
+ * cache lookups to work even when the scan has not been
+ * executed to completion.
+ */
+ entry->complete = node->singlerow;
+ node->mstatus = MEMO_FILLING_CACHE;
+ }
+
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ ExecCopySlot(slot, outerslot);
+ return slot;
+ }
+
+ case MEMO_CACHE_FETCH_NEXT_TUPLE:
+ {
+ /* We shouldn't be in this state if these are not set */
+ Assert(node->entry != NULL);
+ Assert(node->last_tuple != NULL);
+
+ /* Skip to the next tuple to output */
+ node->last_tuple = node->last_tuple->next;
+
+ /* No more tuples in the cache */
+ if (node->last_tuple == NULL)
+ {
+ node->mstatus = MEMO_END_OF_SCAN;
+ return NULL;
+ }
+
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ ExecStoreMinimalTuple(node->last_tuple->mintuple, slot,
+ false);
+
+ return slot;
+ }
+
+ case MEMO_FILLING_CACHE:
+ {
+ TupleTableSlot *outerslot;
+ MemoizeEntry *entry = node->entry;
+
+ /* entry should already have been set by MEMO_CACHE_LOOKUP */
+ Assert(entry != NULL);
+
+ /*
+ * When in the MEMO_FILLING_CACHE state, we've just had a
+ * cache miss and are populating the cache with the current
+ * scan tuples.
+ */
+ outerNode = outerPlanState(node);
+ outerslot = ExecProcNode(outerNode);
+ if (TupIsNull(outerslot))
+ {
+ /* No more tuples. Mark it as complete */
+ entry->complete = true;
+ node->mstatus = MEMO_END_OF_SCAN;
+ return NULL;
+ }
+
+ /*
+ * Validate if the planner properly set the singlerow flag. It
+ * should only set that if each cache entry can, at most,
+ * return 1 row.
+ */
+ if (unlikely(entry->complete))
+ elog(ERROR, "cache entry already complete");
+
+ /* Record the tuple in the current cache entry */
+ if (unlikely(!cache_store_tuple(node, outerslot)))
+ {
+ /* Couldn't store it? Handle overflow */
+ node->stats.cache_overflows += 1; /* stats update */
+
+ node->mstatus = MEMO_CACHE_BYPASS_MODE;
+
+ /*
+ * No need to clear out entry or last_tuple as we'll stay
+ * in bypass mode until the end of the scan.
+ */
+ }
+
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ ExecCopySlot(slot, outerslot);
+ return slot;
+ }
+
+ case MEMO_CACHE_BYPASS_MODE:
+ {
+ TupleTableSlot *outerslot;
+
+ /*
+ * When in bypass mode we just continue to read tuples without
+ * caching. We need to wait until the next rescan before we
+ * can come out of this mode.
+ */
+ outerNode = outerPlanState(node);
+ outerslot = ExecProcNode(outerNode);
+ if (TupIsNull(outerslot))
+ {
+ node->mstatus = MEMO_END_OF_SCAN;
+ return NULL;
+ }
+
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ ExecCopySlot(slot, outerslot);
+ return slot;
+ }
+
+ case MEMO_END_OF_SCAN:
+
+ /*
+ * We've already returned NULL for this scan, but just in case
+ * something calls us again by mistake.
+ */
+ return NULL;
+
+ default:
+ elog(ERROR, "unrecognized memoize state: %d",
+ (int) node->mstatus);
+ return NULL;
+ } /* switch */
+}
+
+MemoizeState *
+ExecInitMemoize(Memoize *node, EState *estate, int eflags)
+{
+ MemoizeState *mstate = makeNode(MemoizeState);
+ Plan *outerNode;
+ int i;
+ int nkeys;
+ Oid *eqfuncoids;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ mstate->ss.ps.plan = (Plan *) node;
+ mstate->ss.ps.state = estate;
+ mstate->ss.ps.ExecProcNode = ExecMemoize;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &mstate->ss.ps);
+
+ outerNode = outerPlan(node);
+ outerPlanState(mstate) = ExecInitNode(outerNode, estate, eflags);
+
+ /*
+ * Initialize return slot and type. No need to initialize projection info
+ * because this node doesn't do projections.
+ */
+ ExecInitResultTupleSlotTL(&mstate->ss.ps, &TTSOpsMinimalTuple);
+ mstate->ss.ps.ps_ProjInfo = NULL;
+
+ /*
+ * Initialize scan slot and type.
+ */
+ ExecCreateScanSlotFromOuterPlan(estate, &mstate->ss, &TTSOpsMinimalTuple);
+
+ /*
+ * Set the state machine to lookup the cache. We won't find anything
+ * until we cache something, but this saves a special case to create the
+ * first entry.
+ */
+ mstate->mstatus = MEMO_CACHE_LOOKUP;
+
+ mstate->nkeys = nkeys = node->numKeys;
+ mstate->hashkeydesc = ExecTypeFromExprList(node->param_exprs);
+ mstate->tableslot = MakeSingleTupleTableSlot(mstate->hashkeydesc,
+ &TTSOpsMinimalTuple);
+ mstate->probeslot = MakeSingleTupleTableSlot(mstate->hashkeydesc,
+ &TTSOpsVirtual);
+
+ mstate->param_exprs = (ExprState **) palloc(nkeys * sizeof(ExprState *));
+ mstate->collations = node->collations; /* Just point directly to the plan
+ * data */
+ mstate->hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
+
+ eqfuncoids = palloc(nkeys * sizeof(Oid));
+
+ for (i = 0; i < nkeys; i++)
+ {
+ Oid hashop = node->hashOperators[i];
+ Oid left_hashfn;
+ Oid right_hashfn;
+ Expr *param_expr = (Expr *) list_nth(node->param_exprs, i);
+
+ if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
+ elog(ERROR, "could not find hash function for hash operator %u",
+ hashop);
+
+ fmgr_info(left_hashfn, &mstate->hashfunctions[i]);
+
+ mstate->param_exprs[i] = ExecInitExpr(param_expr, (PlanState *) mstate);
+ eqfuncoids[i] = get_opcode(hashop);
+ }
+
+ mstate->cache_eq_expr = ExecBuildParamSetEqual(mstate->hashkeydesc,
+ &TTSOpsMinimalTuple,
+ &TTSOpsVirtual,
+ eqfuncoids,
+ node->collations,
+ node->param_exprs,
+ (PlanState *) mstate);
+
+ pfree(eqfuncoids);
+ mstate->mem_used = 0;
+
+ /* Limit the total memory consumed by the cache to this */
+ mstate->mem_limit = get_hash_memory_limit();
+
+ /* A memory context dedicated for the cache */
+ mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext,
+ "MemoizeHashTable",
+ ALLOCSET_DEFAULT_SIZES);
+
+ dlist_init(&mstate->lru_list);
+ mstate->last_tuple = NULL;
+ mstate->entry = NULL;
+
+ /*
+ * Mark if we can assume the cache entry is completed after we get the
+ * first record for it. Some callers might not call us again after
+ * getting the first match; e.g. a join operator performing a unique join
+ * can skip to the next outer tuple after getting the first matching
+ * inner tuple. In that case the cache entry is complete once we have
+ * fetched the first tuple, so we can mark it as such.
+ */
+ mstate->singlerow = node->singlerow;
+ mstate->keyparamids = node->keyparamids;
+
+ /*
+ * Record if the cache keys should be compared bit by bit, or logically
+ * using the type's hash equality operator
+ */
+ mstate->binary_mode = node->binary_mode;
+
+ /* Zero the statistics counters */
+ memset(&mstate->stats, 0, sizeof(MemoizeInstrumentation));
+
+ /* Allocate and set up the actual cache */
+ build_hash_table(mstate, node->est_entries);
+
+ return mstate;
+}
+
+void
+ExecEndMemoize(MemoizeState *node)
+{
+#ifdef USE_ASSERT_CHECKING
+ /* Validate the memory accounting code is correct in assert builds. */
+ {
+ int count;
+ uint64 mem = 0;
+ memoize_iterator i;
+ MemoizeEntry *entry;
+
+ memoize_start_iterate(node->hashtable, &i);
+
+ count = 0;
+ while ((entry = memoize_iterate(node->hashtable, &i)) != NULL)
+ {
+ MemoizeTuple *tuple = entry->tuplehead;
+
+ mem += EMPTY_ENTRY_MEMORY_BYTES(entry);
+ while (tuple != NULL)
+ {
+ mem += CACHE_TUPLE_BYTES(tuple);
+ tuple = tuple->next;
+ }
+ count++;
+ }
+
+ Assert(count == node->hashtable->members);
+ Assert(mem == node->mem_used);
+ }
+#endif
+
+ /*
+ * When ending a parallel worker, copy the statistics gathered by the
+ * worker back into shared memory so that it can be picked up by the main
+ * process to report in EXPLAIN ANALYZE.
+ */
+ if (node->shared_info != NULL && IsParallelWorker())
+ {
+ MemoizeInstrumentation *si;
+
+ /* Make mem_peak available for EXPLAIN */
+ if (node->stats.mem_peak == 0)
+ node->stats.mem_peak = node->mem_used;
+
+ Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
+ si = &node->shared_info->sinstrument[ParallelWorkerNumber];
+ memcpy(si, &node->stats, sizeof(MemoizeInstrumentation));
+ }
+
+ /* Remove the cache context */
+ MemoryContextDelete(node->tableContext);
+
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+ /* must drop pointer to cache result tuple */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ /*
+ * free exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * shut down the subplan
+ */
+ ExecEndNode(outerPlanState(node));
+}
+
+void
+ExecReScanMemoize(MemoizeState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ /* Mark that we must lookup the cache for a new set of parameters */
+ node->mstatus = MEMO_CACHE_LOOKUP;
+
+ /* nullify pointers used for the last scan */
+ node->entry = NULL;
+ node->last_tuple = NULL;
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+
+ /*
+ * Purge the entire cache if a parameter changed that is not part of the
+ * cache key.
+ */
+ if (bms_nonempty_difference(outerPlan->chgParam, node->keyparamids))
+ cache_purge_all(node);
+}
+
+/*
+ * ExecEstimateCacheEntryOverheadBytes
+ * For use in the query planner to help it estimate the amount of memory
+ * required to store a single entry in the cache.
+ */
+double
+ExecEstimateCacheEntryOverheadBytes(double ntuples)
+{
+ return sizeof(MemoizeEntry) + sizeof(MemoizeKey) + sizeof(MemoizeTuple) *
+ ntuples;
+}
+
+/* ----------------------------------------------------------------
+ * Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+ /* ----------------------------------------------------------------
+ * ExecMemoizeEstimate
+ *
+ * Estimate space required to propagate memoize statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecMemoizeEstimate(MemoizeState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = mul_size(pcxt->nworkers, sizeof(MemoizeInstrumentation));
+ size = add_size(size, offsetof(SharedMemoizeInfo, sinstrument));
+ shm_toc_estimate_chunk(&pcxt->estimator, size);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecMemoizeInitializeDSM
+ *
+ * Initialize DSM space for memoize statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecMemoizeInitializeDSM(MemoizeState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = offsetof(SharedMemoizeInfo, sinstrument)
+ + pcxt->nworkers * sizeof(MemoizeInstrumentation);
+ node->shared_info = shm_toc_allocate(pcxt->toc, size);
+ /* ensure any unfilled slots will contain zeroes */
+ memset(node->shared_info, 0, size);
+ node->shared_info->num_workers = pcxt->nworkers;
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+ node->shared_info);
+}
+
+/* ----------------------------------------------------------------
+ * ExecMemoizeInitializeWorker
+ *
+ * Attach worker to DSM space for memoize statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecMemoizeInitializeWorker(MemoizeState *node, ParallelWorkerContext *pwcxt)
+{
+ node->shared_info =
+ shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+}
+
+/* ----------------------------------------------------------------
+ * ExecMemoizeRetrieveInstrumentation
+ *
+ * Transfer memoize statistics from DSM to private memory.
+ * ----------------------------------------------------------------
+ */
+void
+ExecMemoizeRetrieveInstrumentation(MemoizeState *node)
+{
+ Size size;
+ SharedMemoizeInfo *si;
+
+ if (node->shared_info == NULL)
+ return;
+
+ size = offsetof(SharedMemoizeInfo, sinstrument)
+ + node->shared_info->num_workers * sizeof(MemoizeInstrumentation);
+ si = palloc(size);
+ memcpy(si, node->shared_info, size);
+ node->shared_info = si;
+}
diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c
new file mode 100644
index 0000000..617bffb
--- /dev/null
+++ b/src/backend/executor/nodeMergeAppend.c
@@ -0,0 +1,389 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeMergeAppend.c
+ * routines to handle MergeAppend nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeMergeAppend.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/* INTERFACE ROUTINES
+ * ExecInitMergeAppend - initialize the MergeAppend node
+ * ExecMergeAppend - retrieve the next tuple from the node
+ * ExecEndMergeAppend - shut down the MergeAppend node
+ * ExecReScanMergeAppend - rescan the MergeAppend node
+ *
+ * NOTES
+ * A MergeAppend node contains a list of one or more subplans.
+ * These are each expected to deliver tuples that are sorted according
+ * to a common sort key. The MergeAppend node merges these streams
+ * to produce output sorted the same way.
+ *
+ * MergeAppend nodes don't make use of their left and right
+ * subtrees, rather they maintain a list of subplans so
+ * a typical MergeAppend node looks like this in the plan tree:
+ *
+ * ...
+ * /
+ * MergeAppend---+------+------+--- nil
+ * / \ | | |
+ * nil nil ... ... ...
+ * subplans
+ */
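+
+/*
+ * As a small worked example (with made-up values): given three sorted
+ * subplan streams
+ *
+ *     A: 1 4 7      B: 2 5      C: 3 6
+ *
+ * the node loads one tuple from each stream into a binary heap, repeatedly
+ * returns the heap's smallest entry (1, 2, 3, 4, ...), and refills the heap
+ * from whichever stream that tuple came from, yielding the fully merged
+ * ordering 1 2 3 4 5 6 7.
+ */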
+
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/execPartition.h"
+#include "executor/nodeMergeAppend.h"
+#include "lib/binaryheap.h"
+#include "miscadmin.h"
+
+/*
+ * We have one slot for each item in the heap array. We use SlotNumber
+ * to store slot indexes. This doesn't actually provide any formal
+ * type-safety, but it makes the code more self-documenting.
+ */
+typedef int32 SlotNumber;
+
+static TupleTableSlot *ExecMergeAppend(PlanState *pstate);
+static int heap_compare_slots(Datum a, Datum b, void *arg);
+
+
+/* ----------------------------------------------------------------
+ * ExecInitMergeAppend
+ *
+ * Begin all of the subscans of the MergeAppend node.
+ * ----------------------------------------------------------------
+ */
+MergeAppendState *
+ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags)
+{
+ MergeAppendState *mergestate = makeNode(MergeAppendState);
+ PlanState **mergeplanstates;
+ Bitmapset *validsubplans;
+ int nplans;
+ int i,
+ j;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create new MergeAppendState for our node
+ */
+ mergestate->ps.plan = (Plan *) node;
+ mergestate->ps.state = estate;
+ mergestate->ps.ExecProcNode = ExecMergeAppend;
+
+ /* If run-time partition pruning is enabled, then set that up now */
+ if (node->part_prune_info != NULL)
+ {
+ PartitionPruneState *prunestate;
+
+ /* We may need an expression context to evaluate partition exprs */
+ ExecAssignExprContext(estate, &mergestate->ps);
+
+ prunestate = ExecCreatePartitionPruneState(&mergestate->ps,
+ node->part_prune_info);
+ mergestate->ms_prune_state = prunestate;
+
+ /* Perform an initial partition prune, if required. */
+ if (prunestate->do_initial_prune)
+ {
+ /* Determine which subplans survive initial pruning */
+ validsubplans = ExecFindInitialMatchingSubPlans(prunestate,
+ list_length(node->mergeplans));
+
+ nplans = bms_num_members(validsubplans);
+ }
+ else
+ {
+ /* We'll need to initialize all subplans */
+ nplans = list_length(node->mergeplans);
+ Assert(nplans > 0);
+ validsubplans = bms_add_range(NULL, 0, nplans - 1);
+ }
+
+ /*
+ * When no run-time pruning is required and there's at least one
+ * subplan, we can fill ms_valid_subplans immediately, preventing
+ * later calls to ExecFindMatchingSubPlans.
+ */
+ if (!prunestate->do_exec_prune && nplans > 0)
+ mergestate->ms_valid_subplans = bms_add_range(NULL, 0, nplans - 1);
+ }
+ else
+ {
+ nplans = list_length(node->mergeplans);
+
+ /*
+ * When run-time partition pruning is not enabled we can just mark all
+ * subplans as valid; they must also all be initialized.
+ */
+ Assert(nplans > 0);
+ mergestate->ms_valid_subplans = validsubplans =
+ bms_add_range(NULL, 0, nplans - 1);
+ mergestate->ms_prune_state = NULL;
+ }
+
+ mergeplanstates = (PlanState **) palloc(nplans * sizeof(PlanState *));
+ mergestate->mergeplans = mergeplanstates;
+ mergestate->ms_nplans = nplans;
+
+ mergestate->ms_slots = (TupleTableSlot **) palloc0(sizeof(TupleTableSlot *) * nplans);
+ mergestate->ms_heap = binaryheap_allocate(nplans, heap_compare_slots,
+ mergestate);
+
+ /*
+ * Miscellaneous initialization
+ *
+ * MergeAppend nodes do have Result slots, which hold pointers to tuples,
+ * so we have to initialize them. FIXME
+ */
+ ExecInitResultTupleSlotTL(&mergestate->ps, &TTSOpsVirtual);
+
+ /* node returns slots from each of its subnodes, therefore not fixed */
+ mergestate->ps.resultopsset = true;
+ mergestate->ps.resultopsfixed = false;
+
+ /*
+ * call ExecInitNode on each of the valid plans to be executed and save
+ * the results into the mergeplanstates array.
+ */
+ j = 0;
+ i = -1;
+ while ((i = bms_next_member(validsubplans, i)) >= 0)
+ {
+ Plan *initNode = (Plan *) list_nth(node->mergeplans, i);
+
+ mergeplanstates[j++] = ExecInitNode(initNode, estate, eflags);
+ }
+
+ mergestate->ps.ps_ProjInfo = NULL;
+
+ /*
+ * initialize sort-key information
+ */
+ mergestate->ms_nkeys = node->numCols;
+ mergestate->ms_sortkeys = palloc0(sizeof(SortSupportData) * node->numCols);
+
+ for (i = 0; i < node->numCols; i++)
+ {
+ SortSupport sortKey = mergestate->ms_sortkeys + i;
+
+ sortKey->ssup_cxt = CurrentMemoryContext;
+ sortKey->ssup_collation = node->collations[i];
+ sortKey->ssup_nulls_first = node->nullsFirst[i];
+ sortKey->ssup_attno = node->sortColIdx[i];
+
+ /*
+ * It isn't feasible to perform abbreviated key conversion, since
+ * tuples are pulled into mergestate's binary heap as needed. It
+ * would likely be counter-productive to convert tuples into an
+ * abbreviated representation as they're pulled up, so opt out of that
+ * additional optimization entirely.
+ */
+ sortKey->abbreviate = false;
+
+ PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
+ }
+
+ /*
+ * initialize to show we have not run the subplans yet
+ */
+ mergestate->ms_initialized = false;
+
+ return mergestate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecMergeAppend
+ *
+ * Handles iteration over multiple subplans.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecMergeAppend(PlanState *pstate)
+{
+ MergeAppendState *node = castNode(MergeAppendState, pstate);
+ TupleTableSlot *result;
+ SlotNumber i;
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!node->ms_initialized)
+ {
+ /* Nothing to do if all subplans were pruned */
+ if (node->ms_nplans == 0)
+ return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ /*
+ * If we've yet to determine the valid subplans then do so now. If
+ * run-time pruning is disabled then the valid subplans will always be
+ * set to all subplans.
+ */
+ if (node->ms_valid_subplans == NULL)
+ node->ms_valid_subplans =
+ ExecFindMatchingSubPlans(node->ms_prune_state);
+
+ /*
+ * First time through: pull the first tuple from each valid subplan,
+ * and set up the heap.
+ */
+ i = -1;
+ while ((i = bms_next_member(node->ms_valid_subplans, i)) >= 0)
+ {
+ node->ms_slots[i] = ExecProcNode(node->mergeplans[i]);
+ if (!TupIsNull(node->ms_slots[i]))
+ binaryheap_add_unordered(node->ms_heap, Int32GetDatum(i));
+ }
+ binaryheap_build(node->ms_heap);
+ node->ms_initialized = true;
+ }
+ else
+ {
+ /*
+ * Otherwise, pull the next tuple from whichever subplan we returned
+ * from last time, and reinsert the subplan index into the heap,
+ * because it might now compare differently against the existing
+ * elements of the heap. (We could perhaps simplify the logic a bit
+ * by doing this before returning from the prior call, but it's better
+ * to not pull tuples until necessary.)
+ */
+ i = DatumGetInt32(binaryheap_first(node->ms_heap));
+ node->ms_slots[i] = ExecProcNode(node->mergeplans[i]);
+ if (!TupIsNull(node->ms_slots[i]))
+ binaryheap_replace_first(node->ms_heap, Int32GetDatum(i));
+ else
+ (void) binaryheap_remove_first(node->ms_heap);
+ }
+
+ if (binaryheap_empty(node->ms_heap))
+ {
+ /* All the subplans are exhausted, and so is the heap */
+ result = ExecClearTuple(node->ps.ps_ResultTupleSlot);
+ }
+ else
+ {
+ i = DatumGetInt32(binaryheap_first(node->ms_heap));
+ result = node->ms_slots[i];
+ }
+
+ return result;
+}
+
+/*
+ * Compare the tuples in the two given slots.
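+ *
+ * binaryheap keeps the element that compares largest at the root, so the
+ * result of ApplySortComparator is inverted here to make the slot that
+ * should be output first rise to the top of the heap.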
+ */
+static int32
+heap_compare_slots(Datum a, Datum b, void *arg)
+{
+ MergeAppendState *node = (MergeAppendState *) arg;
+ SlotNumber slot1 = DatumGetInt32(a);
+ SlotNumber slot2 = DatumGetInt32(b);
+
+ TupleTableSlot *s1 = node->ms_slots[slot1];
+ TupleTableSlot *s2 = node->ms_slots[slot2];
+ int nkey;
+
+ Assert(!TupIsNull(s1));
+ Assert(!TupIsNull(s2));
+
+ for (nkey = 0; nkey < node->ms_nkeys; nkey++)
+ {
+ SortSupport sortKey = node->ms_sortkeys + nkey;
+ AttrNumber attno = sortKey->ssup_attno;
+ Datum datum1,
+ datum2;
+ bool isNull1,
+ isNull2;
+ int compare;
+
+ datum1 = slot_getattr(s1, attno, &isNull1);
+ datum2 = slot_getattr(s2, attno, &isNull2);
+
+ compare = ApplySortComparator(datum1, isNull1,
+ datum2, isNull2,
+ sortKey);
+ if (compare != 0)
+ {
+ INVERT_COMPARE_RESULT(compare);
+ return compare;
+ }
+ }
+ return 0;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndMergeAppend
+ *
+ * Shuts down the subscans of the MergeAppend node.
+ *
+ * Returns nothing of interest.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndMergeAppend(MergeAppendState *node)
+{
+ PlanState **mergeplans;
+ int nplans;
+ int i;
+
+ /*
+ * get information from the node
+ */
+ mergeplans = node->mergeplans;
+ nplans = node->ms_nplans;
+
+ /*
+ * shut down each of the subscans
+ */
+ for (i = 0; i < nplans; i++)
+ ExecEndNode(mergeplans[i]);
+}
+
+void
+ExecReScanMergeAppend(MergeAppendState *node)
+{
+ int i;
+
+ /*
+ * If any PARAM_EXEC Params used in pruning expressions have changed, then
+ * we'd better unset the valid subplans so that they are reselected for
+ * the new parameter values.
+ */
+ if (node->ms_prune_state &&
+ bms_overlap(node->ps.chgParam,
+ node->ms_prune_state->execparamids))
+ {
+ bms_free(node->ms_valid_subplans);
+ node->ms_valid_subplans = NULL;
+ }
+
+ for (i = 0; i < node->ms_nplans; i++)
+ {
+ PlanState *subnode = node->mergeplans[i];
+
+ /*
+ * ExecReScan doesn't know about my subplans, so I have to do
+ * changed-parameter signaling myself.
+ */
+ if (node->ps.chgParam != NULL)
+ UpdateChangedParamSet(subnode, node->ps.chgParam);
+
+ /*
+ * If chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (subnode->chgParam == NULL)
+ ExecReScan(subnode);
+ }
+ binaryheap_reset(node->ms_heap);
+ node->ms_initialized = false;
+}
diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c
new file mode 100644
index 0000000..5ff3f4c
--- /dev/null
+++ b/src/backend/executor/nodeMergejoin.c
@@ -0,0 +1,1678 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeMergejoin.c
+ * routines supporting merge joins
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeMergejoin.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecMergeJoin mergejoin outer and inner relations.
+ * ExecInitMergeJoin creates and initializes run time states
+ * ExecEndMergeJoin cleans up the node.
+ *
+ * NOTES
+ *
+ * Merge-join is done by joining the inner and outer tuples satisfying
+ * join clauses of the form ((= outerKey innerKey) ...).
+ * The join clause list is provided by the query planner and may contain
+ * more than one (= outerKey innerKey) clause (for composite sort key).
+ *
+ * However, the query executor needs to know whether an outer
+ * tuple is "greater/smaller" than an inner tuple so that it can
+ * "synchronize" the two relations. For example, consider the following
+ * relations:
+ *
+ * outer: (0 ^1 1 2 5 5 5 6 6 7) current tuple: 1
+ * inner: (1 ^3 5 5 5 5 6) current tuple: 3
+ *
+ * To continue the merge-join, the executor needs to scan both inner
+ * and outer relations till the matching tuples 5. It needs to know
+ * that currently inner tuple 3 is "greater" than outer tuple 1 and
+ * therefore it should scan the outer relation first to find a
+ * matching tuple and so on.
+ *
+ * Therefore, rather than directly executing the merge join clauses,
+ * we evaluate the left and right key expressions separately and then
+ * compare the columns one at a time (see MJCompare). The planner
+ * passes us enough information about the sort ordering of the inputs
+ * to allow us to determine how to make the comparison. We may use the
+ * appropriate btree comparison function, since Postgres' only notion
+ * of ordering is specified by btree opfamilies.
+ *
+ *
+ * Consider the above relations and suppose that the executor has
+ * just joined the first outer "5" with the last inner "5". The
+ * next step is of course to join the second outer "5" with all
+ * the inner "5's". This requires repositioning the inner "cursor"
+ * to point at the first inner "5". This is done by "marking" the
+ * first inner 5 so we can restore the "cursor" to it before joining
+ * with the second outer 5. The access method interface provides
+ * routines to mark and restore to a tuple.
+ *
+ *
+ * Essential operation of the merge join algorithm is as follows:
+ *
+ * Join {
+ * get initial outer and inner tuples INITIALIZE
+ * do forever {
+ * while (outer != inner) { SKIP_TEST
+ * if (outer < inner)
+ * advance outer SKIPOUTER_ADVANCE
+ * else
+ * advance inner SKIPINNER_ADVANCE
+ * }
+ * mark inner position SKIP_TEST
+ * do forever {
+ * while (outer == inner) {
+ * join tuples JOINTUPLES
+ * advance inner position NEXTINNER
+ * }
+ * advance outer position NEXTOUTER
+ * if (outer == mark) TESTOUTER
+ * restore inner position to mark TESTOUTER
+ * else
+ * break // return to top of outer loop
+ * }
+ * }
+ * }
+ *
+ * The merge join operation is coded in the fashion
+ * of a state machine. At each state, we do something and then
+ * proceed to another state. This state is stored in the node's
+ * execution state information and is preserved across calls to
+ * ExecMergeJoin. -cim 10/31/89
+ */
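
The pseudocode above can be made concrete with a toy merge join over two sorted integer arrays; the comments map each step to the corresponding executor state, and the data matches the example relations in the header comment. This is only an illustration: the real node works on tuples, opfamily comparators, and mark/restore calls into the inner plan.

#include <stdio.h>

static void
toy_merge_join(const int *outer, int nouter, const int *inner, int ninner)
{
    int o = 0, i = 0;

    while (o < nouter && i < ninner)
    {
        if (outer[o] < inner[i])
            o++;                        /* SKIPOUTER_ADVANCE */
        else if (outer[o] > inner[i])
            i++;                        /* SKIPINNER_ADVANCE */
        else
        {
            int mark = i;               /* mark first matching inner (SKIP_TEST) */

            while (o < nouter && outer[o] == inner[mark])
            {
                for (i = mark; i < ninner && inner[i] == outer[o]; i++)
                    printf("join (%d, %d)\n", outer[o], inner[i]);   /* JOINTUPLES */
                o++;                    /* NEXTOUTER; TESTOUTER restores to mark */
            }
        }
    }
}

int
main(void)
{
    int outer[] = {0, 1, 1, 2, 5, 5, 5, 6, 6, 7};
    int inner[] = {1, 3, 5, 5, 5, 5, 6};

    toy_merge_join(outer, 10, inner, 7);
    return 0;
}
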
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "executor/execdebug.h"
+#include "executor/nodeMergejoin.h"
+#include "miscadmin.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+
+
+/*
+ * States of the ExecMergeJoin state machine
+ */
+#define EXEC_MJ_INITIALIZE_OUTER 1
+#define EXEC_MJ_INITIALIZE_INNER 2
+#define EXEC_MJ_JOINTUPLES 3
+#define EXEC_MJ_NEXTOUTER 4
+#define EXEC_MJ_TESTOUTER 5
+#define EXEC_MJ_NEXTINNER 6
+#define EXEC_MJ_SKIP_TEST 7
+#define EXEC_MJ_SKIPOUTER_ADVANCE 8
+#define EXEC_MJ_SKIPINNER_ADVANCE 9
+#define EXEC_MJ_ENDOUTER 10
+#define EXEC_MJ_ENDINNER 11
+
+/*
+ * Runtime data for each mergejoin clause
+ */
+typedef struct MergeJoinClauseData
+{
+ /* Executable expression trees */
+ ExprState *lexpr; /* left-hand (outer) input expression */
+ ExprState *rexpr; /* right-hand (inner) input expression */
+
+ /*
+ * If we have a current left or right input tuple, the values of the
+ * expressions are loaded into these fields:
+ */
+ Datum ldatum; /* current left-hand value */
+ Datum rdatum; /* current right-hand value */
+ bool lisnull; /* and their isnull flags */
+ bool risnull;
+
+ /*
+ * Everything we need to know to compare the left and right values is
+ * stored here.
+ */
+ SortSupportData ssup;
+} MergeJoinClauseData;
+
+/* Result type for MJEvalOuterValues and MJEvalInnerValues */
+typedef enum
+{
+ MJEVAL_MATCHABLE, /* normal, potentially matchable tuple */
+ MJEVAL_NONMATCHABLE, /* tuple cannot join because it has a null */
+ MJEVAL_ENDOFJOIN /* end of input (physical or effective) */
+} MJEvalResult;
+
+
+#define MarkInnerTuple(innerTupleSlot, mergestate) \
+ ExecCopySlot((mergestate)->mj_MarkedTupleSlot, (innerTupleSlot))
+
+
+/*
+ * MJExamineQuals
+ *
+ * This deconstructs the list of mergejoinable expressions, which is given
+ * to us by the planner in the form of a list of "leftexpr = rightexpr"
+ * expression trees in the order matching the sort columns of the inputs.
+ * We build an array of MergeJoinClause structs containing the information
+ * we will need at runtime. Each struct essentially tells us how to compare
+ * the two expressions from the original clause.
+ *
+ * In addition to the expressions themselves, the planner passes the btree
+ * opfamily OID, collation OID, btree strategy number (BTLessStrategyNumber or
+ * BTGreaterStrategyNumber), and nulls-first flag that identify the intended
+ * sort ordering for each merge key. The mergejoinable operator is an
+ * equality operator in the opfamily, and the two inputs are guaranteed to be
+ * ordered in either increasing or decreasing (respectively) order according
+ * to the opfamily and collation, with nulls at the indicated end of the range.
+ * This allows us to obtain the needed comparison function from the opfamily.
+ */
+static MergeJoinClause
+MJExamineQuals(List *mergeclauses,
+ Oid *mergefamilies,
+ Oid *mergecollations,
+ int *mergestrategies,
+ bool *mergenullsfirst,
+ PlanState *parent)
+{
+ MergeJoinClause clauses;
+ int nClauses = list_length(mergeclauses);
+ int iClause;
+ ListCell *cl;
+
+ clauses = (MergeJoinClause) palloc0(nClauses * sizeof(MergeJoinClauseData));
+
+ iClause = 0;
+ foreach(cl, mergeclauses)
+ {
+ OpExpr *qual = (OpExpr *) lfirst(cl);
+ MergeJoinClause clause = &clauses[iClause];
+ Oid opfamily = mergefamilies[iClause];
+ Oid collation = mergecollations[iClause];
+ StrategyNumber opstrategy = mergestrategies[iClause];
+ bool nulls_first = mergenullsfirst[iClause];
+ int op_strategy;
+ Oid op_lefttype;
+ Oid op_righttype;
+ Oid sortfunc;
+
+ if (!IsA(qual, OpExpr))
+ elog(ERROR, "mergejoin clause is not an OpExpr");
+
+ /*
+ * Prepare the input expressions for execution.
+ */
+ clause->lexpr = ExecInitExpr((Expr *) linitial(qual->args), parent);
+ clause->rexpr = ExecInitExpr((Expr *) lsecond(qual->args), parent);
+
+ /* Set up sort support data */
+ clause->ssup.ssup_cxt = CurrentMemoryContext;
+ clause->ssup.ssup_collation = collation;
+ if (opstrategy == BTLessStrategyNumber)
+ clause->ssup.ssup_reverse = false;
+ else if (opstrategy == BTGreaterStrategyNumber)
+ clause->ssup.ssup_reverse = true;
+ else /* planner screwed up */
+ elog(ERROR, "unsupported mergejoin strategy %d", opstrategy);
+ clause->ssup.ssup_nulls_first = nulls_first;
+
+ /* Extract the operator's declared left/right datatypes */
+ get_op_opfamily_properties(qual->opno, opfamily, false,
+ &op_strategy,
+ &op_lefttype,
+ &op_righttype);
+ if (op_strategy != BTEqualStrategyNumber) /* should not happen */
+ elog(ERROR, "cannot merge using non-equality operator %u",
+ qual->opno);
+
+ /*
+ * sortsupport routine must know if abbreviation optimization is
+ * applicable in principle. It is never applicable for merge joins
+ * because there is no convenient opportunity to convert to
+ * alternative representation.
+ */
+ clause->ssup.abbreviate = false;
+
+ /* And get the matching support or comparison function */
+ Assert(clause->ssup.comparator == NULL);
+ sortfunc = get_opfamily_proc(opfamily,
+ op_lefttype,
+ op_righttype,
+ BTSORTSUPPORT_PROC);
+ if (OidIsValid(sortfunc))
+ {
+ /* The sort support function can provide a comparator */
+ OidFunctionCall1(sortfunc, PointerGetDatum(&clause->ssup));
+ }
+ if (clause->ssup.comparator == NULL)
+ {
+ /* support not available, get comparison func */
+ sortfunc = get_opfamily_proc(opfamily,
+ op_lefttype,
+ op_righttype,
+ BTORDER_PROC);
+ if (!OidIsValid(sortfunc)) /* should not happen */
+ elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
+ BTORDER_PROC, op_lefttype, op_righttype, opfamily);
+ /* We'll use a shim to call the old-style btree comparator */
+ PrepareSortSupportComparisonShim(sortfunc, &clause->ssup);
+ }
+
+ iClause++;
+ }
+
+ return clauses;
+}
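
Once MJExamineQuals has filled in each clause's SortSupport, the per-column comparison at runtime reduces to ApplySortComparator over the cached datums. Below is a simplified standalone analog of that comparison, using plain ints in place of Datums and explicit flags in place of the SortSupport fields; the real routine calls the opfamily's comparator.

#include <stdbool.h>
#include <stdio.h>

static int
toy_apply_sort_comparator(int a, bool a_null, int b, bool b_null,
                          bool reverse, bool nulls_first)
{
    int cmp;

    if (a_null)
        cmp = b_null ? 0 : (nulls_first ? -1 : 1);
    else if (b_null)
        cmp = nulls_first ? 1 : -1;
    else
    {
        cmp = (a > b) - (a < b);
        if (reverse)
            cmp = -cmp;                 /* descending order flips non-null results */
    }
    return cmp;
}

int
main(void)
{
    /* ascending, nulls last: a NULL key sorts after every non-null key */
    printf("%d\n", toy_apply_sort_comparator(0, true, 42, false, false, false));
    return 0;
}
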
+
+/*
+ * MJEvalOuterValues
+ *
+ * Compute the values of the mergejoined expressions for the current
+ * outer tuple. We also detect whether it's impossible for the current
+ * outer tuple to match anything --- this is true if it yields a NULL
+ * input, since we assume mergejoin operators are strict. If the NULL
+ * is in the first join column, and that column sorts nulls last, then
+ * we can further conclude that no following tuple can match anything
+ * either, since they must all have nulls in the first column. However,
+ * that case is only interesting if we're not in FillOuter mode, else
+ * we have to visit all the tuples anyway.
+ *
+ * For the convenience of callers, we also make this routine responsible
+ * for testing for end-of-input (null outer tuple), and returning
+ * MJEVAL_ENDOFJOIN when that's seen. This allows the same code to be used
+ * for both real end-of-input and the effective end-of-input represented by
+ * a first-column NULL.
+ *
+ * We evaluate the values in OuterEContext, which can be reset each
+ * time we move to a new tuple.
+ */
+static MJEvalResult
+MJEvalOuterValues(MergeJoinState *mergestate)
+{
+ ExprContext *econtext = mergestate->mj_OuterEContext;
+ MJEvalResult result = MJEVAL_MATCHABLE;
+ int i;
+ MemoryContext oldContext;
+
+ /* Check for end of outer subplan */
+ if (TupIsNull(mergestate->mj_OuterTupleSlot))
+ return MJEVAL_ENDOFJOIN;
+
+ ResetExprContext(econtext);
+
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ econtext->ecxt_outertuple = mergestate->mj_OuterTupleSlot;
+
+ for (i = 0; i < mergestate->mj_NumClauses; i++)
+ {
+ MergeJoinClause clause = &mergestate->mj_Clauses[i];
+
+ clause->ldatum = ExecEvalExpr(clause->lexpr, econtext,
+ &clause->lisnull);
+ if (clause->lisnull)
+ {
+ /* match is impossible; can we end the join early? */
+ if (i == 0 && !clause->ssup.ssup_nulls_first &&
+ !mergestate->mj_FillOuter)
+ result = MJEVAL_ENDOFJOIN;
+ else if (result == MJEVAL_MATCHABLE)
+ result = MJEVAL_NONMATCHABLE;
+ }
+ }
+
+ MemoryContextSwitchTo(oldContext);
+
+ return result;
+}
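
The early-termination rule above distills to this: a NULL in the first merge key of a nulls-last input implies every later tuple also has a NULL there, so the whole join can stop unless this side still has to produce outer-join fill tuples. A small standalone helper with invented names, just to isolate that decision:

#include <stdbool.h>
#include <stdio.h>

typedef enum
{
    TOY_MATCHABLE,
    TOY_NONMATCHABLE,
    TOY_ENDOFJOIN
} ToyEvalResult;

static ToyEvalResult
toy_classify_null_key(int keyno, bool nulls_first, bool filling_this_side)
{
    if (keyno == 0 && !nulls_first && !filling_this_side)
        return TOY_ENDOFJOIN;           /* effective end of input */
    return TOY_NONMATCHABLE;            /* this tuple just can't match */
}

int
main(void)
{
    printf("%d\n", toy_classify_null_key(0, false, false));    /* prints TOY_ENDOFJOIN */
    return 0;
}
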
+
+/*
+ * MJEvalInnerValues
+ *
+ * Same as above, but for the inner tuple. Here, we have to be prepared
+ * to load data from either the true current inner, or the marked inner,
+ * so caller must tell us which slot to load from.
+ */
+static MJEvalResult
+MJEvalInnerValues(MergeJoinState *mergestate, TupleTableSlot *innerslot)
+{
+ ExprContext *econtext = mergestate->mj_InnerEContext;
+ MJEvalResult result = MJEVAL_MATCHABLE;
+ int i;
+ MemoryContext oldContext;
+
+ /* Check for end of inner subplan */
+ if (TupIsNull(innerslot))
+ return MJEVAL_ENDOFJOIN;
+
+ ResetExprContext(econtext);
+
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ econtext->ecxt_innertuple = innerslot;
+
+ for (i = 0; i < mergestate->mj_NumClauses; i++)
+ {
+ MergeJoinClause clause = &mergestate->mj_Clauses[i];
+
+ clause->rdatum = ExecEvalExpr(clause->rexpr, econtext,
+ &clause->risnull);
+ if (clause->risnull)
+ {
+ /* match is impossible; can we end the join early? */
+ if (i == 0 && !clause->ssup.ssup_nulls_first &&
+ !mergestate->mj_FillInner)
+ result = MJEVAL_ENDOFJOIN;
+ else if (result == MJEVAL_MATCHABLE)
+ result = MJEVAL_NONMATCHABLE;
+ }
+ }
+
+ MemoryContextSwitchTo(oldContext);
+
+ return result;
+}
+
+/*
+ * MJCompare
+ *
+ * Compare the mergejoinable values of the current two input tuples
+ * and return 0 if they are equal (ie, the mergejoin equalities all
+ * succeed), >0 if outer > inner, <0 if outer < inner.
+ *
+ * MJEvalOuterValues and MJEvalInnerValues must already have been called
+ * for the current outer and inner tuples, respectively.
+ */
+static int
+MJCompare(MergeJoinState *mergestate)
+{
+ int result = 0;
+ bool nulleqnull = false;
+ ExprContext *econtext = mergestate->js.ps.ps_ExprContext;
+ int i;
+ MemoryContext oldContext;
+
+ /*
+ * Call the comparison functions in short-lived context, in case they leak
+ * memory.
+ */
+ ResetExprContext(econtext);
+
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ for (i = 0; i < mergestate->mj_NumClauses; i++)
+ {
+ MergeJoinClause clause = &mergestate->mj_Clauses[i];
+
+ /*
+ * Special case for NULL-vs-NULL, else use standard comparison.
+ */
+ if (clause->lisnull && clause->risnull)
+ {
+ nulleqnull = true; /* NULL "=" NULL */
+ continue;
+ }
+
+ result = ApplySortComparator(clause->ldatum, clause->lisnull,
+ clause->rdatum, clause->risnull,
+ &clause->ssup);
+
+ if (result != 0)
+ break;
+ }
+
+ /*
+ * If we had any NULL-vs-NULL inputs, we do not want to report that the
+ * tuples are equal. Instead, if result is still 0, change it to +1. This
+ * will result in advancing the inner side of the join.
+ *
+ * Likewise, if there was a constant-false joinqual, do not report
+ * equality. We have to check this as part of the mergequals, else the
+ * rescan logic will do the wrong thing.
+ */
+ if (result == 0 &&
+ (nulleqnull || mergestate->mj_ConstFalseJoin))
+ result = 1;
+
+ MemoryContextSwitchTo(oldContext);
+
+ return result;
+}
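
The final adjustment in MJCompare is easy to miss: when both sides of a clause are NULL the column-by-column comparison says "equal", yet the tuples must not be treated as a join match, so a zero result is bumped to +1 and the state machine advances the inner side; the same bump handles a constant-false joinqual. The adjustment in isolation, with invented names:

#include <stdbool.h>
#include <stdio.h>

static int
toy_mjcompare_fixup(int result, bool saw_null_eq_null, bool const_false_join)
{
    if (result == 0 && (saw_null_eq_null || const_false_join))
        result = 1;             /* pretend outer > inner: advance the inner side */
    return result;
}

int
main(void)
{
    /* both keys NULL: column-wise "equal", but not a join match */
    printf("%d\n", toy_mjcompare_fixup(0, true, false));   /* prints 1 */
    return 0;
}
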
+
+
+/*
+ * Generate a fake join tuple with nulls for the inner tuple,
+ * and return it if it passes the non-join quals.
+ */
+static TupleTableSlot *
+MJFillOuter(MergeJoinState *node)
+{
+ ExprContext *econtext = node->js.ps.ps_ExprContext;
+ ExprState *otherqual = node->js.ps.qual;
+
+ ResetExprContext(econtext);
+
+ econtext->ecxt_outertuple = node->mj_OuterTupleSlot;
+ econtext->ecxt_innertuple = node->mj_NullInnerTupleSlot;
+
+ if (ExecQual(otherqual, econtext))
+ {
+ /*
+ * qualification succeeded. now form the desired projection tuple and
+ * return the slot containing it.
+ */
+ MJ_printf("ExecMergeJoin: returning outer fill tuple\n");
+
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered2(node, 1);
+
+ return NULL;
+}
+
+/*
+ * Generate a fake join tuple with nulls for the outer tuple,
+ * and return it if it passes the non-join quals.
+ */
+static TupleTableSlot *
+MJFillInner(MergeJoinState *node)
+{
+ ExprContext *econtext = node->js.ps.ps_ExprContext;
+ ExprState *otherqual = node->js.ps.qual;
+
+ ResetExprContext(econtext);
+
+ econtext->ecxt_outertuple = node->mj_NullOuterTupleSlot;
+ econtext->ecxt_innertuple = node->mj_InnerTupleSlot;
+
+ if (ExecQual(otherqual, econtext))
+ {
+ /*
+ * qualification succeeded. now form the desired projection tuple and
+ * return the slot containing it.
+ */
+ MJ_printf("ExecMergeJoin: returning inner fill tuple\n");
+
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered2(node, 1);
+
+ return NULL;
+}
+
+
+/*
+ * Check that a qual condition is constant true or constant false.
+ * If it is constant false (or null), set *is_const_false to true.
+ *
+ * Constant true would normally be represented by a NIL list, but we allow an
+ * actual bool Const as well. We do expect that the planner will have thrown
+ * away any non-constant terms that have been ANDed with a constant false.
+ */
+static bool
+check_constant_qual(List *qual, bool *is_const_false)
+{
+ ListCell *lc;
+
+ foreach(lc, qual)
+ {
+ Const *con = (Const *) lfirst(lc);
+
+ if (!con || !IsA(con, Const))
+ return false;
+ if (con->constisnull || !DatumGetBool(con->constvalue))
+ *is_const_false = true;
+ }
+ return true;
+}
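
So check_constant_qual accepts only all-constant quals: an empty list means constant true, and any false or NULL constant flips *is_const_false. A toy version over an array of tri-state constants, just to show which shapes are accepted (the real code walks a List of Const nodes):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef enum
{
    TOY_CONST_TRUE,
    TOY_CONST_FALSE,
    TOY_CONST_NULL,
    TOY_NOT_A_CONST
} ToyQual;

static bool
toy_check_constant_qual(const ToyQual *qual, size_t n, bool *is_const_false)
{
    for (size_t i = 0; i < n; i++)
    {
        if (qual[i] == TOY_NOT_A_CONST)
            return false;               /* not a constant qual at all */
        if (qual[i] == TOY_CONST_FALSE || qual[i] == TOY_CONST_NULL)
            *is_const_false = true;
    }
    return true;
}

int
main(void)
{
    ToyQual qual[] = {TOY_CONST_FALSE};
    bool    const_false = false;

    printf("constant: %d, const-false: %d\n",
           toy_check_constant_qual(qual, 1, &const_false), const_false);
    return 0;
}
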
+
+
+/* ----------------------------------------------------------------
+ * ExecMergeTupleDump
+ *
+ * This function is called through the MJ_dump() macro
+ * when EXEC_MERGEJOINDEBUG is defined
+ * ----------------------------------------------------------------
+ */
+#ifdef EXEC_MERGEJOINDEBUG
+
+static void
+ExecMergeTupleDumpOuter(MergeJoinState *mergestate)
+{
+ TupleTableSlot *outerSlot = mergestate->mj_OuterTupleSlot;
+
+ printf("==== outer tuple ====\n");
+ if (TupIsNull(outerSlot))
+ printf("(nil)\n");
+ else
+ MJ_debugtup(outerSlot);
+}
+
+static void
+ExecMergeTupleDumpInner(MergeJoinState *mergestate)
+{
+ TupleTableSlot *innerSlot = mergestate->mj_InnerTupleSlot;
+
+ printf("==== inner tuple ====\n");
+ if (TupIsNull(innerSlot))
+ printf("(nil)\n");
+ else
+ MJ_debugtup(innerSlot);
+}
+
+static void
+ExecMergeTupleDumpMarked(MergeJoinState *mergestate)
+{
+ TupleTableSlot *markedSlot = mergestate->mj_MarkedTupleSlot;
+
+ printf("==== marked tuple ====\n");
+ if (TupIsNull(markedSlot))
+ printf("(nil)\n");
+ else
+ MJ_debugtup(markedSlot);
+}
+
+static void
+ExecMergeTupleDump(MergeJoinState *mergestate)
+{
+ printf("******** ExecMergeTupleDump ********\n");
+
+ ExecMergeTupleDumpOuter(mergestate);
+ ExecMergeTupleDumpInner(mergestate);
+ ExecMergeTupleDumpMarked(mergestate);
+
+ printf("********\n");
+}
+#endif
+
+/* ----------------------------------------------------------------
+ * ExecMergeJoin
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecMergeJoin(PlanState *pstate)
+{
+ MergeJoinState *node = castNode(MergeJoinState, pstate);
+ ExprState *joinqual;
+ ExprState *otherqual;
+ bool qualResult;
+ int compareResult;
+ PlanState *innerPlan;
+ TupleTableSlot *innerTupleSlot;
+ PlanState *outerPlan;
+ TupleTableSlot *outerTupleSlot;
+ ExprContext *econtext;
+ bool doFillOuter;
+ bool doFillInner;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get information from node
+ */
+ innerPlan = innerPlanState(node);
+ outerPlan = outerPlanState(node);
+ econtext = node->js.ps.ps_ExprContext;
+ joinqual = node->js.joinqual;
+ otherqual = node->js.ps.qual;
+ doFillOuter = node->mj_FillOuter;
+ doFillInner = node->mj_FillInner;
+
+ /*
+ * Reset per-tuple memory context to free any expression evaluation
+ * storage allocated in the previous tuple cycle.
+ */
+ ResetExprContext(econtext);
+
+ /*
+	 * ok, everything is set up ... let's go to work
+ */
+ for (;;)
+ {
+ MJ_dump(node);
+
+ /*
+ * get the current state of the join and do things accordingly.
+ */
+ switch (node->mj_JoinState)
+ {
+ /*
+ * EXEC_MJ_INITIALIZE_OUTER means that this is the first time
+ * ExecMergeJoin() has been called and so we have to fetch the
+ * first matchable tuple for both outer and inner subplans. We
+ * do the outer side in INITIALIZE_OUTER state, then advance
+ * to INITIALIZE_INNER state for the inner subplan.
+ */
+ case EXEC_MJ_INITIALIZE_OUTER:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_INITIALIZE_OUTER\n");
+
+ outerTupleSlot = ExecProcNode(outerPlan);
+ node->mj_OuterTupleSlot = outerTupleSlot;
+
+ /* Compute join values and check for unmatchability */
+ switch (MJEvalOuterValues(node))
+ {
+ case MJEVAL_MATCHABLE:
+ /* OK to go get the first inner tuple */
+ node->mj_JoinState = EXEC_MJ_INITIALIZE_INNER;
+ break;
+ case MJEVAL_NONMATCHABLE:
+ /* Stay in same state to fetch next outer tuple */
+ if (doFillOuter)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the
+ * inner tuple, and return it if it passes the
+ * non-join quals.
+ */
+ TupleTableSlot *result;
+
+ result = MJFillOuter(node);
+ if (result)
+ return result;
+ }
+ break;
+ case MJEVAL_ENDOFJOIN:
+ /* No more outer tuples */
+ MJ_printf("ExecMergeJoin: nothing in outer subplan\n");
+ if (doFillInner)
+ {
+ /*
+ * Need to emit right-join tuples for remaining
+ * inner tuples. We set MatchedInner = true to
+ * force the ENDOUTER state to advance inner.
+ */
+ node->mj_JoinState = EXEC_MJ_ENDOUTER;
+ node->mj_MatchedInner = true;
+ break;
+ }
+ /* Otherwise we're done. */
+ return NULL;
+ }
+ break;
+
+ case EXEC_MJ_INITIALIZE_INNER:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_INITIALIZE_INNER\n");
+
+ innerTupleSlot = ExecProcNode(innerPlan);
+ node->mj_InnerTupleSlot = innerTupleSlot;
+
+ /* Compute join values and check for unmatchability */
+ switch (MJEvalInnerValues(node, innerTupleSlot))
+ {
+ case MJEVAL_MATCHABLE:
+
+ /*
+ * OK, we have the initial tuples. Begin by skipping
+ * non-matching tuples.
+ */
+ node->mj_JoinState = EXEC_MJ_SKIP_TEST;
+ break;
+ case MJEVAL_NONMATCHABLE:
+ /* Mark before advancing, if wanted */
+ if (node->mj_ExtraMarks)
+ ExecMarkPos(innerPlan);
+ /* Stay in same state to fetch next inner tuple */
+ if (doFillInner)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the
+ * outer tuple, and return it if it passes the
+ * non-join quals.
+ */
+ TupleTableSlot *result;
+
+ result = MJFillInner(node);
+ if (result)
+ return result;
+ }
+ break;
+ case MJEVAL_ENDOFJOIN:
+ /* No more inner tuples */
+ MJ_printf("ExecMergeJoin: nothing in inner subplan\n");
+ if (doFillOuter)
+ {
+ /*
+ * Need to emit left-join tuples for all outer
+ * tuples, including the one we just fetched. We
+ * set MatchedOuter = false to force the ENDINNER
+ * state to emit first tuple before advancing
+ * outer.
+ */
+ node->mj_JoinState = EXEC_MJ_ENDINNER;
+ node->mj_MatchedOuter = false;
+ break;
+ }
+ /* Otherwise we're done. */
+ return NULL;
+ }
+ break;
+
+ /*
+ * EXEC_MJ_JOINTUPLES means we have two tuples which satisfied
+ * the merge clause so we join them and then proceed to get
+ * the next inner tuple (EXEC_MJ_NEXTINNER).
+ */
+ case EXEC_MJ_JOINTUPLES:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_JOINTUPLES\n");
+
+ /*
+ * Set the next state machine state. The right things will
+ * happen whether we return this join tuple or just fall
+ * through to continue the state machine execution.
+ */
+ node->mj_JoinState = EXEC_MJ_NEXTINNER;
+
+ /*
+ * Check the extra qual conditions to see if we actually want
+ * to return this join tuple. If not, can proceed with merge.
+ * We must distinguish the additional joinquals (which must
+ * pass to consider the tuples "matched" for outer-join logic)
+ * from the otherquals (which must pass before we actually
+ * return the tuple).
+ *
+ * We don't bother with a ResetExprContext here, on the
+ * assumption that we just did one while checking the merge
+ * qual. One per tuple should be sufficient. We do have to
+ * set up the econtext links to the tuples for ExecQual to
+ * use.
+ */
+ outerTupleSlot = node->mj_OuterTupleSlot;
+ econtext->ecxt_outertuple = outerTupleSlot;
+ innerTupleSlot = node->mj_InnerTupleSlot;
+ econtext->ecxt_innertuple = innerTupleSlot;
+
+ qualResult = (joinqual == NULL ||
+ ExecQual(joinqual, econtext));
+ MJ_DEBUG_QUAL(joinqual, qualResult);
+
+ if (qualResult)
+ {
+ node->mj_MatchedOuter = true;
+ node->mj_MatchedInner = true;
+
+ /* In an antijoin, we never return a matched tuple */
+ if (node->js.jointype == JOIN_ANTI)
+ {
+ node->mj_JoinState = EXEC_MJ_NEXTOUTER;
+ break;
+ }
+
+ /*
+ * If we only need to join to the first matching inner
+ * tuple, then consider returning this one, but after that
+ * continue with next outer tuple.
+ */
+ if (node->js.single_match)
+ node->mj_JoinState = EXEC_MJ_NEXTOUTER;
+
+ qualResult = (otherqual == NULL ||
+ ExecQual(otherqual, econtext));
+ MJ_DEBUG_QUAL(otherqual, qualResult);
+
+ if (qualResult)
+ {
+ /*
+ * qualification succeeded. now form the desired
+ * projection tuple and return the slot containing it.
+ */
+ MJ_printf("ExecMergeJoin: returning tuple\n");
+
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered2(node, 1);
+ }
+ else
+ InstrCountFiltered1(node, 1);
+ break;
+
+ /*
+ * EXEC_MJ_NEXTINNER means advance the inner scan to the next
+ * tuple. If the tuple is not nil, we then proceed to test it
+ * against the join qualification.
+ *
+ * Before advancing, we check to see if we must emit an
+ * outer-join fill tuple for this inner tuple.
+ */
+ case EXEC_MJ_NEXTINNER:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTINNER\n");
+
+ if (doFillInner && !node->mj_MatchedInner)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the outer
+ * tuple, and return it if it passes the non-join quals.
+ */
+ TupleTableSlot *result;
+
+ node->mj_MatchedInner = true; /* do it only once */
+
+ result = MJFillInner(node);
+ if (result)
+ return result;
+ }
+
+ /*
+ * now we get the next inner tuple, if any. If there's none,
+ * advance to next outer tuple (which may be able to join to
+ * previously marked tuples).
+ *
+ * NB: must NOT do "extraMarks" here, since we may need to
+ * return to previously marked tuples.
+ */
+ innerTupleSlot = ExecProcNode(innerPlan);
+ node->mj_InnerTupleSlot = innerTupleSlot;
+ MJ_DEBUG_PROC_NODE(innerTupleSlot);
+ node->mj_MatchedInner = false;
+
+ /* Compute join values and check for unmatchability */
+ switch (MJEvalInnerValues(node, innerTupleSlot))
+ {
+ case MJEVAL_MATCHABLE:
+
+ /*
+ * Test the new inner tuple to see if it matches
+ * outer.
+ *
+ * If they do match, then we join them and move on to
+ * the next inner tuple (EXEC_MJ_JOINTUPLES).
+ *
+ * If they do not match then advance to next outer
+ * tuple.
+ */
+ compareResult = MJCompare(node);
+ MJ_DEBUG_COMPARE(compareResult);
+
+ if (compareResult == 0)
+ node->mj_JoinState = EXEC_MJ_JOINTUPLES;
+ else if (compareResult < 0)
+ node->mj_JoinState = EXEC_MJ_NEXTOUTER;
+ else /* compareResult > 0 should not happen */
+ elog(ERROR, "mergejoin input data is out of order");
+ break;
+ case MJEVAL_NONMATCHABLE:
+
+ /*
+ * It contains a NULL and hence can't match any outer
+ * tuple, so we can skip the comparison and assume the
+ * new tuple is greater than current outer.
+ */
+ node->mj_JoinState = EXEC_MJ_NEXTOUTER;
+ break;
+ case MJEVAL_ENDOFJOIN:
+
+ /*
+ * No more inner tuples. However, this might be only
+ * effective and not physical end of inner plan, so
+ * force mj_InnerTupleSlot to null to make sure we
+ * don't fetch more inner tuples. (We need this hack
+ * because we are not transiting to a state where the
+ * inner plan is assumed to be exhausted.)
+ */
+ node->mj_InnerTupleSlot = NULL;
+ node->mj_JoinState = EXEC_MJ_NEXTOUTER;
+ break;
+ }
+ break;
+
+ /*-------------------------------------------
+ * EXEC_MJ_NEXTOUTER means
+ *
+ * outer inner
+ * outer tuple - 5 5 - marked tuple
+ * 5 5
+ * 6 6 - inner tuple
+ * 7 7
+ *
+ * we know we just bumped into the
+ * first inner tuple > current outer tuple (or possibly
+ * the end of the inner stream)
+ * so get a new outer tuple and then
+ * proceed to test it against the marked tuple
+ * (EXEC_MJ_TESTOUTER)
+ *
+ * Before advancing, we check to see if we must emit an
+ * outer-join fill tuple for this outer tuple.
+ *------------------------------------------------
+ */
+ case EXEC_MJ_NEXTOUTER:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTOUTER\n");
+
+ if (doFillOuter && !node->mj_MatchedOuter)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the inner
+ * tuple, and return it if it passes the non-join quals.
+ */
+ TupleTableSlot *result;
+
+ node->mj_MatchedOuter = true; /* do it only once */
+
+ result = MJFillOuter(node);
+ if (result)
+ return result;
+ }
+
+ /*
+ * now we get the next outer tuple, if any
+ */
+ outerTupleSlot = ExecProcNode(outerPlan);
+ node->mj_OuterTupleSlot = outerTupleSlot;
+ MJ_DEBUG_PROC_NODE(outerTupleSlot);
+ node->mj_MatchedOuter = false;
+
+ /* Compute join values and check for unmatchability */
+ switch (MJEvalOuterValues(node))
+ {
+ case MJEVAL_MATCHABLE:
+ /* Go test the new tuple against the marked tuple */
+ node->mj_JoinState = EXEC_MJ_TESTOUTER;
+ break;
+ case MJEVAL_NONMATCHABLE:
+ /* Can't match, so fetch next outer tuple */
+ node->mj_JoinState = EXEC_MJ_NEXTOUTER;
+ break;
+ case MJEVAL_ENDOFJOIN:
+ /* No more outer tuples */
+ MJ_printf("ExecMergeJoin: end of outer subplan\n");
+ innerTupleSlot = node->mj_InnerTupleSlot;
+ if (doFillInner && !TupIsNull(innerTupleSlot))
+ {
+ /*
+ * Need to emit right-join tuples for remaining
+ * inner tuples.
+ */
+ node->mj_JoinState = EXEC_MJ_ENDOUTER;
+ break;
+ }
+ /* Otherwise we're done. */
+ return NULL;
+ }
+ break;
+
+ /*--------------------------------------------------------
+ * EXEC_MJ_TESTOUTER If the new outer tuple and the marked
+ * tuple satisfy the merge clause then we know we have
+ * duplicates in the outer scan so we have to restore the
+ * inner scan to the marked tuple and proceed to join the
+ * new outer tuple with the inner tuples.
+ *
+ * This is the case when
+ * outer inner
+ * 4 5 - marked tuple
+ * outer tuple - 5 5
+ * new outer tuple - 5 5
+ * 6 8 - inner tuple
+ * 7 12
+ *
+ * new outer tuple == marked tuple
+ *
+ * If the outer tuple fails the test, then we are done
+ * with the marked tuples, and we have to look for a
+ * match to the current inner tuple. So we will
+ * proceed to skip outer tuples until outer >= inner
+ * (EXEC_MJ_SKIP_TEST).
+ *
+ * This is the case when
+ *
+ * outer inner
+ * 5 5 - marked tuple
+ * outer tuple - 5 5
+ * new outer tuple - 6 8 - inner tuple
+ * 7 12
+ *
+ * new outer tuple > marked tuple
+ *
+ *---------------------------------------------------------
+ */
+ case EXEC_MJ_TESTOUTER:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_TESTOUTER\n");
+
+ /*
+ * Here we must compare the outer tuple with the marked inner
+ * tuple. (We can ignore the result of MJEvalInnerValues,
+ * since the marked inner tuple is certainly matchable.)
+ */
+ innerTupleSlot = node->mj_MarkedTupleSlot;
+ (void) MJEvalInnerValues(node, innerTupleSlot);
+
+ compareResult = MJCompare(node);
+ MJ_DEBUG_COMPARE(compareResult);
+
+ if (compareResult == 0)
+ {
+ /*
+ * the merge clause matched so now we restore the inner
+ * scan position to the first mark, and go join that tuple
+ * (and any following ones) to the new outer.
+ *
+ * If we were able to determine mark and restore are not
+ * needed, then we don't have to back up; the current
+ * inner is already the first possible match.
+ *
+ * NOTE: we do not need to worry about the MatchedInner
+ * state for the rescanned inner tuples. We know all of
+ * them will match this new outer tuple and therefore
+ * won't be emitted as fill tuples. This works *only*
+ * because we require the extra joinquals to be constant
+ * when doing a right or full join --- otherwise some of
+ * the rescanned tuples might fail the extra joinquals.
+ * This obviously won't happen for a constant-true extra
+ * joinqual, while the constant-false case is handled by
+ * forcing the merge clause to never match, so we never
+ * get here.
+ */
+ if (!node->mj_SkipMarkRestore)
+ {
+ ExecRestrPos(innerPlan);
+
+ /*
+ * ExecRestrPos probably should give us back a new
+ * Slot, but since it doesn't, use the marked slot.
+ * (The previously returned mj_InnerTupleSlot cannot
+ * be assumed to hold the required tuple.)
+ */
+ node->mj_InnerTupleSlot = innerTupleSlot;
+ /* we need not do MJEvalInnerValues again */
+ }
+
+ node->mj_JoinState = EXEC_MJ_JOINTUPLES;
+ }
+ else if (compareResult > 0)
+ {
+ /* ----------------
+ * if the new outer tuple didn't match the marked inner
+ * tuple then we have a case like:
+ *
+ * outer inner
+ * 4 4 - marked tuple
+ * new outer - 5 4
+ * 6 5 - inner tuple
+ * 7
+ *
+ * which means that all subsequent outer tuples will be
+ * larger than our marked inner tuples. So we need not
+ * revisit any of the marked tuples but can proceed to
+ * look for a match to the current inner. If there's
+ * no more inners, no more matches are possible.
+ * ----------------
+ */
+ innerTupleSlot = node->mj_InnerTupleSlot;
+
+ /* reload comparison data for current inner */
+ switch (MJEvalInnerValues(node, innerTupleSlot))
+ {
+ case MJEVAL_MATCHABLE:
+ /* proceed to compare it to the current outer */
+ node->mj_JoinState = EXEC_MJ_SKIP_TEST;
+ break;
+ case MJEVAL_NONMATCHABLE:
+
+ /*
+ * current inner can't possibly match any outer;
+ * better to advance the inner scan than the
+ * outer.
+ */
+ node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE;
+ break;
+ case MJEVAL_ENDOFJOIN:
+ /* No more inner tuples */
+ if (doFillOuter)
+ {
+ /*
+ * Need to emit left-join tuples for remaining
+ * outer tuples.
+ */
+ node->mj_JoinState = EXEC_MJ_ENDINNER;
+ break;
+ }
+ /* Otherwise we're done. */
+ return NULL;
+ }
+ }
+ else /* compareResult < 0 should not happen */
+ elog(ERROR, "mergejoin input data is out of order");
+ break;
+
+ /*----------------------------------------------------------
+			 * EXEC_MJ_SKIP_TEST means compare tuples and if they do not
+ * match, skip whichever is lesser.
+ *
+ * For example:
+ *
+ * outer inner
+ * 5 5
+ * 5 5
+ * outer tuple - 6 8 - inner tuple
+ * 7 12
+ * 8 14
+ *
+ * we have to advance the outer scan
+ * until we find the outer 8.
+ *
+ * On the other hand:
+ *
+ * outer inner
+ * 5 5
+ * 5 5
+ * outer tuple - 12 8 - inner tuple
+ * 14 10
+ * 17 12
+ *
+ * we have to advance the inner scan
+ * until we find the inner 12.
+ *----------------------------------------------------------
+ */
+ case EXEC_MJ_SKIP_TEST:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_SKIP_TEST\n");
+
+ /*
+ * before we advance, make sure the current tuples do not
+ * satisfy the mergeclauses. If they do, then we update the
+ * marked tuple position and go join them.
+ */
+ compareResult = MJCompare(node);
+ MJ_DEBUG_COMPARE(compareResult);
+
+ if (compareResult == 0)
+ {
+ if (!node->mj_SkipMarkRestore)
+ ExecMarkPos(innerPlan);
+
+ MarkInnerTuple(node->mj_InnerTupleSlot, node);
+
+ node->mj_JoinState = EXEC_MJ_JOINTUPLES;
+ }
+ else if (compareResult < 0)
+ node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE;
+ else
+ /* compareResult > 0 */
+ node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE;
+ break;
+
+ /*
+ * SKIPOUTER_ADVANCE: advance over an outer tuple that is
+ * known not to join to any inner tuple.
+ *
+ * Before advancing, we check to see if we must emit an
+ * outer-join fill tuple for this outer tuple.
+ */
+ case EXEC_MJ_SKIPOUTER_ADVANCE:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPOUTER_ADVANCE\n");
+
+ if (doFillOuter && !node->mj_MatchedOuter)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the inner
+ * tuple, and return it if it passes the non-join quals.
+ */
+ TupleTableSlot *result;
+
+ node->mj_MatchedOuter = true; /* do it only once */
+
+ result = MJFillOuter(node);
+ if (result)
+ return result;
+ }
+
+ /*
+ * now we get the next outer tuple, if any
+ */
+ outerTupleSlot = ExecProcNode(outerPlan);
+ node->mj_OuterTupleSlot = outerTupleSlot;
+ MJ_DEBUG_PROC_NODE(outerTupleSlot);
+ node->mj_MatchedOuter = false;
+
+ /* Compute join values and check for unmatchability */
+ switch (MJEvalOuterValues(node))
+ {
+ case MJEVAL_MATCHABLE:
+ /* Go test the new tuple against the current inner */
+ node->mj_JoinState = EXEC_MJ_SKIP_TEST;
+ break;
+ case MJEVAL_NONMATCHABLE:
+ /* Can't match, so fetch next outer tuple */
+ node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE;
+ break;
+ case MJEVAL_ENDOFJOIN:
+ /* No more outer tuples */
+ MJ_printf("ExecMergeJoin: end of outer subplan\n");
+ innerTupleSlot = node->mj_InnerTupleSlot;
+ if (doFillInner && !TupIsNull(innerTupleSlot))
+ {
+ /*
+ * Need to emit right-join tuples for remaining
+ * inner tuples.
+ */
+ node->mj_JoinState = EXEC_MJ_ENDOUTER;
+ break;
+ }
+ /* Otherwise we're done. */
+ return NULL;
+ }
+ break;
+
+ /*
+ * SKIPINNER_ADVANCE: advance over an inner tuple that is
+ * known not to join to any outer tuple.
+ *
+ * Before advancing, we check to see if we must emit an
+ * outer-join fill tuple for this inner tuple.
+ */
+ case EXEC_MJ_SKIPINNER_ADVANCE:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPINNER_ADVANCE\n");
+
+ if (doFillInner && !node->mj_MatchedInner)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the outer
+ * tuple, and return it if it passes the non-join quals.
+ */
+ TupleTableSlot *result;
+
+ node->mj_MatchedInner = true; /* do it only once */
+
+ result = MJFillInner(node);
+ if (result)
+ return result;
+ }
+
+ /* Mark before advancing, if wanted */
+ if (node->mj_ExtraMarks)
+ ExecMarkPos(innerPlan);
+
+ /*
+ * now we get the next inner tuple, if any
+ */
+ innerTupleSlot = ExecProcNode(innerPlan);
+ node->mj_InnerTupleSlot = innerTupleSlot;
+ MJ_DEBUG_PROC_NODE(innerTupleSlot);
+ node->mj_MatchedInner = false;
+
+ /* Compute join values and check for unmatchability */
+ switch (MJEvalInnerValues(node, innerTupleSlot))
+ {
+ case MJEVAL_MATCHABLE:
+ /* proceed to compare it to the current outer */
+ node->mj_JoinState = EXEC_MJ_SKIP_TEST;
+ break;
+ case MJEVAL_NONMATCHABLE:
+
+ /*
+ * current inner can't possibly match any outer;
+ * better to advance the inner scan than the outer.
+ */
+ node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE;
+ break;
+ case MJEVAL_ENDOFJOIN:
+ /* No more inner tuples */
+ MJ_printf("ExecMergeJoin: end of inner subplan\n");
+ outerTupleSlot = node->mj_OuterTupleSlot;
+ if (doFillOuter && !TupIsNull(outerTupleSlot))
+ {
+ /*
+ * Need to emit left-join tuples for remaining
+ * outer tuples.
+ */
+ node->mj_JoinState = EXEC_MJ_ENDINNER;
+ break;
+ }
+ /* Otherwise we're done. */
+ return NULL;
+ }
+ break;
+
+ /*
+ * EXEC_MJ_ENDOUTER means we have run out of outer tuples, but
+ * are doing a right/full join and therefore must null-fill
+ * any remaining unmatched inner tuples.
+ */
+ case EXEC_MJ_ENDOUTER:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_ENDOUTER\n");
+
+ Assert(doFillInner);
+
+ if (!node->mj_MatchedInner)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the outer
+ * tuple, and return it if it passes the non-join quals.
+ */
+ TupleTableSlot *result;
+
+ node->mj_MatchedInner = true; /* do it only once */
+
+ result = MJFillInner(node);
+ if (result)
+ return result;
+ }
+
+ /* Mark before advancing, if wanted */
+ if (node->mj_ExtraMarks)
+ ExecMarkPos(innerPlan);
+
+ /*
+ * now we get the next inner tuple, if any
+ */
+ innerTupleSlot = ExecProcNode(innerPlan);
+ node->mj_InnerTupleSlot = innerTupleSlot;
+ MJ_DEBUG_PROC_NODE(innerTupleSlot);
+ node->mj_MatchedInner = false;
+
+ if (TupIsNull(innerTupleSlot))
+ {
+ MJ_printf("ExecMergeJoin: end of inner subplan\n");
+ return NULL;
+ }
+
+ /* Else remain in ENDOUTER state and process next tuple. */
+ break;
+
+ /*
+ * EXEC_MJ_ENDINNER means we have run out of inner tuples, but
+			 * are doing a left/full join and therefore must null-fill
+ * any remaining unmatched outer tuples.
+ */
+ case EXEC_MJ_ENDINNER:
+ MJ_printf("ExecMergeJoin: EXEC_MJ_ENDINNER\n");
+
+ Assert(doFillOuter);
+
+ if (!node->mj_MatchedOuter)
+ {
+ /*
+ * Generate a fake join tuple with nulls for the inner
+ * tuple, and return it if it passes the non-join quals.
+ */
+ TupleTableSlot *result;
+
+ node->mj_MatchedOuter = true; /* do it only once */
+
+ result = MJFillOuter(node);
+ if (result)
+ return result;
+ }
+
+ /*
+ * now we get the next outer tuple, if any
+ */
+ outerTupleSlot = ExecProcNode(outerPlan);
+ node->mj_OuterTupleSlot = outerTupleSlot;
+ MJ_DEBUG_PROC_NODE(outerTupleSlot);
+ node->mj_MatchedOuter = false;
+
+ if (TupIsNull(outerTupleSlot))
+ {
+ MJ_printf("ExecMergeJoin: end of outer subplan\n");
+ return NULL;
+ }
+
+ /* Else remain in ENDINNER state and process next tuple. */
+ break;
+
+ /*
+ * broken state value?
+ */
+ default:
+ elog(ERROR, "unrecognized mergejoin state: %d",
+ (int) node->mj_JoinState);
+ }
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitMergeJoin
+ * ----------------------------------------------------------------
+ */
+MergeJoinState *
+ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags)
+{
+ MergeJoinState *mergestate;
+ TupleDesc outerDesc,
+ innerDesc;
+ const TupleTableSlotOps *innerOps;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ MJ1_printf("ExecInitMergeJoin: %s\n",
+ "initializing node");
+
+ /*
+ * create state structure
+ */
+ mergestate = makeNode(MergeJoinState);
+ mergestate->js.ps.plan = (Plan *) node;
+ mergestate->js.ps.state = estate;
+ mergestate->js.ps.ExecProcNode = ExecMergeJoin;
+ mergestate->js.jointype = node->join.jointype;
+ mergestate->mj_ConstFalseJoin = false;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &mergestate->js.ps);
+
+ /*
+ * we need two additional econtexts in which we can compute the join
+ * expressions from the left and right input tuples. The node's regular
+ * econtext won't do because it gets reset too often.
+ */
+ mergestate->mj_OuterEContext = CreateExprContext(estate);
+ mergestate->mj_InnerEContext = CreateExprContext(estate);
+
+ /*
+ * initialize child nodes
+ *
+ * inner child must support MARK/RESTORE, unless we have detected that we
+ * don't need that. Note that skip_mark_restore must never be set if
+ * there are non-mergeclause joinquals, since the logic wouldn't work.
+ */
+ Assert(node->join.joinqual == NIL || !node->skip_mark_restore);
+ mergestate->mj_SkipMarkRestore = node->skip_mark_restore;
+
+ outerPlanState(mergestate) = ExecInitNode(outerPlan(node), estate, eflags);
+ outerDesc = ExecGetResultType(outerPlanState(mergestate));
+ innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate,
+ mergestate->mj_SkipMarkRestore ?
+ eflags :
+ (eflags | EXEC_FLAG_MARK));
+ innerDesc = ExecGetResultType(innerPlanState(mergestate));
+
+ /*
+ * For certain types of inner child nodes, it is advantageous to issue
+ * MARK every time we advance past an inner tuple we will never return to.
+ * For other types, MARK on a tuple we cannot return to is a waste of
+ * cycles. Detect which case applies and set mj_ExtraMarks if we want to
+ * issue "unnecessary" MARK calls.
+ *
+ * Currently, only Material wants the extra MARKs, and it will be helpful
+ * only if eflags doesn't specify REWIND.
+ *
+ * Note that for IndexScan and IndexOnlyScan, it is *necessary* that we
+ * not set mj_ExtraMarks; otherwise we might attempt to set a mark before
+ * the first inner tuple, which they do not support.
+ */
+ if (IsA(innerPlan(node), Material) &&
+ (eflags & EXEC_FLAG_REWIND) == 0 &&
+ !mergestate->mj_SkipMarkRestore)
+ mergestate->mj_ExtraMarks = true;
+ else
+ mergestate->mj_ExtraMarks = false;
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTupleSlotTL(&mergestate->js.ps, &TTSOpsVirtual);
+ ExecAssignProjectionInfo(&mergestate->js.ps, NULL);
+
+ /*
+ * tuple table initialization
+ */
+ innerOps = ExecGetResultSlotOps(innerPlanState(mergestate), NULL);
+ mergestate->mj_MarkedTupleSlot = ExecInitExtraTupleSlot(estate, innerDesc,
+ innerOps);
+
+ /*
+ * initialize child expressions
+ */
+ mergestate->js.ps.qual =
+ ExecInitQual(node->join.plan.qual, (PlanState *) mergestate);
+ mergestate->js.joinqual =
+ ExecInitQual(node->join.joinqual, (PlanState *) mergestate);
+ /* mergeclauses are handled below */
+
+ /*
+ * detect whether we need only consider the first matching inner tuple
+ */
+ mergestate->js.single_match = (node->join.inner_unique ||
+ node->join.jointype == JOIN_SEMI);
+
+ /* set up null tuples for outer joins, if needed */
+ switch (node->join.jointype)
+ {
+ case JOIN_INNER:
+ case JOIN_SEMI:
+ mergestate->mj_FillOuter = false;
+ mergestate->mj_FillInner = false;
+ break;
+ case JOIN_LEFT:
+ case JOIN_ANTI:
+ mergestate->mj_FillOuter = true;
+ mergestate->mj_FillInner = false;
+ mergestate->mj_NullInnerTupleSlot =
+ ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);
+ break;
+ case JOIN_RIGHT:
+ mergestate->mj_FillOuter = false;
+ mergestate->mj_FillInner = true;
+ mergestate->mj_NullOuterTupleSlot =
+ ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual);
+
+ /*
+ * Can't handle right or full join with non-constant extra
+ * joinclauses. This should have been caught by planner.
+ */
+ if (!check_constant_qual(node->join.joinqual,
+ &mergestate->mj_ConstFalseJoin))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("RIGHT JOIN is only supported with merge-joinable join conditions")));
+ break;
+ case JOIN_FULL:
+ mergestate->mj_FillOuter = true;
+ mergestate->mj_FillInner = true;
+ mergestate->mj_NullOuterTupleSlot =
+ ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual);
+ mergestate->mj_NullInnerTupleSlot =
+ ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);
+
+ /*
+ * Can't handle right or full join with non-constant extra
+ * joinclauses. This should have been caught by planner.
+ */
+ if (!check_constant_qual(node->join.joinqual,
+ &mergestate->mj_ConstFalseJoin))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("FULL JOIN is only supported with merge-joinable join conditions")));
+ break;
+ default:
+ elog(ERROR, "unrecognized join type: %d",
+ (int) node->join.jointype);
+ }
+
+ /*
+ * preprocess the merge clauses
+ */
+ mergestate->mj_NumClauses = list_length(node->mergeclauses);
+ mergestate->mj_Clauses = MJExamineQuals(node->mergeclauses,
+ node->mergeFamilies,
+ node->mergeCollations,
+ node->mergeStrategies,
+ node->mergeNullsFirst,
+ (PlanState *) mergestate);
+
+ /*
+ * initialize join state
+ */
+ mergestate->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER;
+ mergestate->mj_MatchedOuter = false;
+ mergestate->mj_MatchedInner = false;
+ mergestate->mj_OuterTupleSlot = NULL;
+ mergestate->mj_InnerTupleSlot = NULL;
+
+ /*
+ * initialization successful
+ */
+ MJ1_printf("ExecInitMergeJoin: %s\n",
+ "node initialized");
+
+ return mergestate;
+}
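
ExecInitMergeJoin wires ExecMergeJoin into the state node through the ExecProcNode function pointer, which is how the executor dispatches to node-specific code without switching on the node type. A minimal sketch of that dispatch convention, with invented names:

#include <stdio.h>

struct ToyTuple;                        /* opaque stand-in for TupleTableSlot */

typedef struct ToyPlanState
{
    struct ToyTuple *(*exec_proc) (struct ToyPlanState *node);
} ToyPlanState;

static struct ToyTuple *
toy_exec_merge_join(ToyPlanState *node)
{
    (void) node;
    printf("toy merge join invoked\n");
    return NULL;                        /* "no more tuples" */
}

static struct ToyTuple *
toy_exec_proc_node(ToyPlanState *node)
{
    return node->exec_proc(node);       /* the real dispatcher may also wrap the
                                         * call with instrumentation */
}

int
main(void)
{
    ToyPlanState state = {toy_exec_merge_join};     /* assigned during "init" */

    (void) toy_exec_proc_node(&state);
    return 0;
}
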
+
+/* ----------------------------------------------------------------
+ * ExecEndMergeJoin
+ *
+ * old comments
+ * frees storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndMergeJoin(MergeJoinState *node)
+{
+ MJ1_printf("ExecEndMergeJoin: %s\n",
+ "ending node processing");
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->js.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->mj_MarkedTupleSlot);
+
+ /*
+ * shut down the subplans
+ */
+ ExecEndNode(innerPlanState(node));
+ ExecEndNode(outerPlanState(node));
+
+ MJ1_printf("ExecEndMergeJoin: %s\n",
+ "node processing ended");
+}
+
+void
+ExecReScanMergeJoin(MergeJoinState *node)
+{
+ ExecClearTuple(node->mj_MarkedTupleSlot);
+
+ node->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER;
+ node->mj_MatchedOuter = false;
+ node->mj_MatchedInner = false;
+ node->mj_OuterTupleSlot = NULL;
+ node->mj_InnerTupleSlot = NULL;
+
+ /*
+	 * if chgParam of subnodes is not null then the plans will be re-scanned
+	 * by the first ExecProcNode call.
+ */
+ if (node->js.ps.lefttree->chgParam == NULL)
+ ExecReScan(node->js.ps.lefttree);
+ if (node->js.ps.righttree->chgParam == NULL)
+ ExecReScan(node->js.ps.righttree);
+}
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
new file mode 100644
index 0000000..1e79d18
--- /dev/null
+++ b/src/backend/executor/nodeModifyTable.c
@@ -0,0 +1,3243 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeModifyTable.c
+ * routines to handle ModifyTable nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeModifyTable.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/* INTERFACE ROUTINES
+ * ExecInitModifyTable - initialize the ModifyTable node
+ * ExecModifyTable - retrieve the next tuple from the node
+ * ExecEndModifyTable - shut down the ModifyTable node
+ * ExecReScanModifyTable - rescan the ModifyTable node
+ *
+ * NOTES
+ * The ModifyTable node receives input from its outerPlan, which is
+ * the data to insert for INSERT cases, or the changed columns' new
+ * values plus row-locating info for UPDATE cases, or just the
+ * row-locating info for DELETE cases.
+ *
+ * If the query specifies RETURNING, then the ModifyTable returns a
+ * RETURNING tuple after completing each row insert, update, or delete.
+ * It must be called again to continue the operation. Without RETURNING,
+ * we just loop within the node until all the work is done, then
+ * return NULL. This avoids useless call/return overhead.
+ */
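
A sketch of the control pattern described above, with invented names and a static array standing in for the subplan: with RETURNING the node hands back one result per modified row and is called again for the next one; without it, it would drain its input in a single call and return the "done" marker.

#include <stdbool.h>
#include <stdio.h>

static int toy_input[] = {10, 20, 30};
static int toy_pos = 0;

static int
toy_modify_table(bool has_returning)
{
    while (toy_pos < 3)
    {
        int modified = toy_input[toy_pos++] + 1;    /* "perform the update" */

        if (has_returning)
            return modified;    /* hand one RETURNING row back per call */
    }
    return -1;                  /* no RETURNING requested, or input exhausted */
}

int
main(void)
{
    int row;

    while ((row = toy_modify_table(true)) != -1)
        printf("RETURNING %d\n", row);
    return 0;
}
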
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/tableam.h"
+#include "access/xact.h"
+#include "catalog/catalog.h"
+#include "commands/trigger.h"
+#include "executor/execPartition.h"
+#include "executor/executor.h"
+#include "executor/nodeModifyTable.h"
+#include "foreign/fdwapi.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "rewrite/rewriteHandler.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+typedef struct MTTargetRelLookup
+{
+ Oid relationOid; /* hash key, must be first */
+ int relationIndex; /* rel's index in resultRelInfo[] array */
+} MTTargetRelLookup;
+
+static void ExecBatchInsert(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ TupleTableSlot **slots,
+ TupleTableSlot **planSlots,
+ int numSlots,
+ EState *estate,
+ bool canSetTag);
+static bool ExecOnConflictUpdate(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ ItemPointer conflictTid,
+ TupleTableSlot *planSlot,
+ TupleTableSlot *excludedSlot,
+ EState *estate,
+ bool canSetTag,
+ TupleTableSlot **returning);
+static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
+ EState *estate,
+ PartitionTupleRouting *proute,
+ ResultRelInfo *targetRelInfo,
+ TupleTableSlot *slot,
+ ResultRelInfo **partRelInfo);
+
+/*
+ * Verify that the tuples to be produced by INSERT match the
+ * target relation's rowtype
+ *
+ * We do this to guard against stale plans. If plan invalidation is
+ * functioning properly then we should never get a failure here, but better
+ * safe than sorry. Note that this is called after we have obtained lock
+ * on the target rel, so the rowtype can't change underneath us.
+ *
+ * The plan output is represented by its targetlist, because that makes
+ * handling the dropped-column case easier.
+ *
+ * We used to use this for UPDATE as well, but now the equivalent checks
+ * are done in ExecBuildUpdateProjection.
+ */
+static void
+ExecCheckPlanOutput(Relation resultRel, List *targetList)
+{
+ TupleDesc resultDesc = RelationGetDescr(resultRel);
+ int attno = 0;
+ ListCell *lc;
+
+ foreach(lc, targetList)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+ Form_pg_attribute attr;
+
+ Assert(!tle->resjunk); /* caller removed junk items already */
+
+ if (attno >= resultDesc->natts)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Query has too many columns.")));
+ attr = TupleDescAttr(resultDesc, attno);
+ attno++;
+
+ if (!attr->attisdropped)
+ {
+ /* Normal case: demand type match */
+ if (exprType((Node *) tle->expr) != attr->atttypid)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Table has type %s at ordinal position %d, but query expects %s.",
+ format_type_be(attr->atttypid),
+ attno,
+ format_type_be(exprType((Node *) tle->expr)))));
+ }
+ else
+ {
+ /*
+ * For a dropped column, we can't check atttypid (it's likely 0).
+ * In any case the planner has most likely inserted an INT4 null.
+ * What we insist on is just *some* NULL constant.
+ */
+ if (!IsA(tle->expr, Const) ||
+ !((Const *) tle->expr)->constisnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Query provides a value for a dropped column at ordinal position %d.",
+ attno)));
+ }
+ }
+ if (attno != resultDesc->natts)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table row type and query-specified row type do not match"),
+ errdetail("Query has too few columns.")));
+}
+
+/*
+ * ExecProcessReturning --- evaluate a RETURNING list
+ *
+ * resultRelInfo: current result rel
+ * tupleSlot: slot holding tuple actually inserted/updated/deleted
+ * planSlot: slot holding tuple returned by top subplan node
+ *
+ * Note: If tupleSlot is NULL, the FDW should have already provided econtext's
+ * scan tuple.
+ *
+ * Returns a slot holding the result tuple
+ */
+static TupleTableSlot *
+ExecProcessReturning(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *tupleSlot,
+ TupleTableSlot *planSlot)
+{
+ ProjectionInfo *projectReturning = resultRelInfo->ri_projectReturning;
+ ExprContext *econtext = projectReturning->pi_exprContext;
+
+ /* Make tuple and any needed join variables available to ExecProject */
+ if (tupleSlot)
+ econtext->ecxt_scantuple = tupleSlot;
+ econtext->ecxt_outertuple = planSlot;
+
+ /*
+ * RETURNING expressions might reference the tableoid column, so
+ * reinitialize tts_tableOid before evaluating them.
+ */
+ econtext->ecxt_scantuple->tts_tableOid =
+ RelationGetRelid(resultRelInfo->ri_RelationDesc);
+
+ /* Compute the RETURNING expressions */
+ return ExecProject(projectReturning);
+}
+
+/*
+ * ExecCheckTupleVisible -- verify tuple is visible
+ *
+ * It would not be consistent with guarantees of the higher isolation levels to
+ * proceed with avoiding insertion (taking speculative insertion's alternative
+ * path) on the basis of another tuple that is not visible to MVCC snapshot.
+ * Check for the need to raise a serialization failure, and do so as necessary.
+ */
+static void
+ExecCheckTupleVisible(EState *estate,
+ Relation rel,
+ TupleTableSlot *slot)
+{
+ if (!IsolationUsesXactSnapshot())
+ return;
+
+ if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot))
+ {
+ Datum xminDatum;
+ TransactionId xmin;
+ bool isnull;
+
+ xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull);
+ Assert(!isnull);
+ xmin = DatumGetTransactionId(xminDatum);
+
+ /*
+ * We should not raise a serialization failure if the conflict is
+ * against a tuple inserted by our own transaction, even if it's not
+ * visible to our snapshot. (This would happen, for example, if
+ * conflicting keys are proposed for insertion in a single command.)
+ */
+ if (!TransactionIdIsCurrentTransactionId(xmin))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+ }
+}
+
+/*
+ * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible()
+ */
+static void
+ExecCheckTIDVisible(EState *estate,
+ ResultRelInfo *relinfo,
+ ItemPointer tid,
+ TupleTableSlot *tempSlot)
+{
+ Relation rel = relinfo->ri_RelationDesc;
+
+ /* Redundantly check isolation level */
+ if (!IsolationUsesXactSnapshot())
+ return;
+
+ if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot))
+ elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT");
+ ExecCheckTupleVisible(estate, rel, tempSlot);
+ ExecClearTuple(tempSlot);
+}
+
+/*
+ * Compute stored generated columns for a tuple
+ */
+void
+ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo,
+ EState *estate, TupleTableSlot *slot,
+ CmdType cmdtype)
+{
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ TupleDesc tupdesc = RelationGetDescr(rel);
+ int natts = tupdesc->natts;
+ MemoryContext oldContext;
+ Datum *values;
+ bool *nulls;
+
+ Assert(tupdesc->constr && tupdesc->constr->has_generated_stored);
+
+ /*
+ * If first time through for this result relation, build expression
+ * nodetrees for rel's stored generation expressions. Keep them in the
+ * per-query memory context so they'll survive throughout the query.
+ */
+ if (resultRelInfo->ri_GeneratedExprs == NULL)
+ {
+ oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ resultRelInfo->ri_GeneratedExprs =
+ (ExprState **) palloc(natts * sizeof(ExprState *));
+ resultRelInfo->ri_NumGeneratedNeeded = 0;
+
+ for (int i = 0; i < natts; i++)
+ {
+ if (TupleDescAttr(tupdesc, i)->attgenerated == ATTRIBUTE_GENERATED_STORED)
+ {
+ Expr *expr;
+
+ /*
+ * If it's an update and the current column was not marked as
+ * being updated, then we can skip the computation. But if
+ * there is a BEFORE ROW UPDATE trigger, we cannot skip
+ * because the trigger might affect additional columns.
+ */
+ if (cmdtype == CMD_UPDATE &&
+ !(rel->trigdesc && rel->trigdesc->trig_update_before_row) &&
+ !bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
+ ExecGetExtraUpdatedCols(resultRelInfo, estate)))
+ {
+ resultRelInfo->ri_GeneratedExprs[i] = NULL;
+ continue;
+ }
+
+ expr = (Expr *) build_column_default(rel, i + 1);
+ if (expr == NULL)
+ elog(ERROR, "no generation expression found for column number %d of table \"%s\"",
+ i + 1, RelationGetRelationName(rel));
+
+ resultRelInfo->ri_GeneratedExprs[i] = ExecPrepareExpr(expr, estate);
+ resultRelInfo->ri_NumGeneratedNeeded++;
+ }
+ }
+
+ MemoryContextSwitchTo(oldContext);
+ }
+
+ /*
+ * If no generated columns have been affected by this change, then skip
+ * the rest.
+ */
+ if (resultRelInfo->ri_NumGeneratedNeeded == 0)
+ return;
+
+ oldContext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+
+ values = palloc(sizeof(*values) * natts);
+ nulls = palloc(sizeof(*nulls) * natts);
+
+ slot_getallattrs(slot);
+ memcpy(nulls, slot->tts_isnull, sizeof(*nulls) * natts);
+
+ for (int i = 0; i < natts; i++)
+ {
+ Form_pg_attribute attr = TupleDescAttr(tupdesc, i);
+
+ if (attr->attgenerated == ATTRIBUTE_GENERATED_STORED &&
+ resultRelInfo->ri_GeneratedExprs[i])
+ {
+ ExprContext *econtext;
+ Datum val;
+ bool isnull;
+
+ econtext = GetPerTupleExprContext(estate);
+ econtext->ecxt_scantuple = slot;
+
+ val = ExecEvalExpr(resultRelInfo->ri_GeneratedExprs[i], econtext, &isnull);
+
+ /*
+ * We must make a copy of val as we have no guarantees about where
+ * memory for a pass-by-reference Datum is located.
+ */
+ if (!isnull)
+ val = datumCopy(val, attr->attbyval, attr->attlen);
+
+ values[i] = val;
+ nulls[i] = isnull;
+ }
+ else
+ {
+ if (!nulls[i])
+ values[i] = datumCopy(slot->tts_values[i], attr->attbyval, attr->attlen);
+ }
+ }
+
+ ExecClearTuple(slot);
+ memcpy(slot->tts_values, values, sizeof(*values) * natts);
+ memcpy(slot->tts_isnull, nulls, sizeof(*nulls) * natts);
+ ExecStoreVirtualTuple(slot);
+ ExecMaterializeSlot(slot);
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * ExecInitInsertProjection
+ * Do one-time initialization of projection data for INSERT tuples.
+ *
+ * INSERT queries may need a projection to filter out junk attrs in the tlist.
+ *
+ * This is also a convenient place to verify that the
+ * output of an INSERT matches the target table.
+ */
+static void
+ExecInitInsertProjection(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo)
+{
+ ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+ Plan *subplan = outerPlan(node);
+ EState *estate = mtstate->ps.state;
+ List *insertTargetList = NIL;
+ bool need_projection = false;
+ ListCell *l;
+
+ /* Extract non-junk columns of the subplan's result tlist. */
+ foreach(l, subplan->targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+ if (!tle->resjunk)
+ insertTargetList = lappend(insertTargetList, tle);
+ else
+ need_projection = true;
+ }
+
+ /*
+ * The junk-free list must produce a tuple suitable for the result
+ * relation.
+ */
+ ExecCheckPlanOutput(resultRelInfo->ri_RelationDesc, insertTargetList);
+
+ /* We'll need a slot matching the table's format. */
+ resultRelInfo->ri_newTupleSlot =
+ table_slot_create(resultRelInfo->ri_RelationDesc,
+ &estate->es_tupleTable);
+
+ /* Build ProjectionInfo if needed (it probably isn't). */
+ if (need_projection)
+ {
+ TupleDesc relDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+
+ /* need an expression context to do the projection */
+ if (mtstate->ps.ps_ExprContext == NULL)
+ ExecAssignExprContext(estate, &mtstate->ps);
+
+ resultRelInfo->ri_projectNew =
+ ExecBuildProjectionInfo(insertTargetList,
+ mtstate->ps.ps_ExprContext,
+ resultRelInfo->ri_newTupleSlot,
+ &mtstate->ps,
+ relDesc);
+ }
+
+ resultRelInfo->ri_projectNewInfoValid = true;
+}
+
+/*
+ * ExecInitUpdateProjection
+ * Do one-time initialization of projection data for UPDATE tuples.
+ *
+ * UPDATE always needs a projection, because (1) there's always some junk
+ * attrs, and (2) we may need to merge values of not-updated columns from
+ * the old tuple into the final tuple. In UPDATE, the tuple arriving from
+ * the subplan contains only new values for the changed columns, plus row
+ * identity info in the junk attrs.
+ *
+ * This is "one-time" for any given result rel, but we might touch more than
+ * one result rel in the course of an inherited UPDATE, and each one needs
+ * its own projection due to possible column order variation.
+ *
+ * This is also a convenient place to verify that the output of an UPDATE
+ * matches the target table (ExecBuildUpdateProjection does that).
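+ *
+ * As a sketch: for a hypothetical "UPDATE t SET b = b + 1", the subplan
+ * emits only the new value of b plus row-identity junk columns; the
+ * projection built here later merges that with the unmodified columns
+ * fetched from the old tuple to form the complete new tuple.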
+ */
+static void
+ExecInitUpdateProjection(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo)
+{
+ ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+ Plan *subplan = outerPlan(node);
+ EState *estate = mtstate->ps.state;
+ TupleDesc relDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+ int whichrel;
+ List *updateColnos;
+
+ /*
+ * Usually, mt_lastResultIndex matches the target rel. If it happens not
+ * to, we can get the index the hard way with an integer division.
+ */
+ whichrel = mtstate->mt_lastResultIndex;
+ if (resultRelInfo != mtstate->resultRelInfo + whichrel)
+ {
+ whichrel = resultRelInfo - mtstate->resultRelInfo;
+ Assert(whichrel >= 0 && whichrel < mtstate->mt_nrels);
+ }
+
+ updateColnos = (List *) list_nth(node->updateColnosLists, whichrel);
+
+ /*
+ * For UPDATE, we use the old tuple to fill up missing values in the tuple
+ * produced by the subplan to get the new tuple. We need two slots, both
+ * matching the table's desired format.
+ */
+ resultRelInfo->ri_oldTupleSlot =
+ table_slot_create(resultRelInfo->ri_RelationDesc,
+ &estate->es_tupleTable);
+ resultRelInfo->ri_newTupleSlot =
+ table_slot_create(resultRelInfo->ri_RelationDesc,
+ &estate->es_tupleTable);
+
+ /* need an expression context to do the projection */
+ if (mtstate->ps.ps_ExprContext == NULL)
+ ExecAssignExprContext(estate, &mtstate->ps);
+
+ resultRelInfo->ri_projectNew =
+ ExecBuildUpdateProjection(subplan->targetlist,
+ false, /* subplan did the evaluation */
+ updateColnos,
+ relDesc,
+ mtstate->ps.ps_ExprContext,
+ resultRelInfo->ri_newTupleSlot,
+ &mtstate->ps);
+
+ resultRelInfo->ri_projectNewInfoValid = true;
+}
+
+/*
+ * ExecGetInsertNewTuple
+ * This prepares a "new" tuple ready to be inserted into given result
+ * relation, by removing any junk columns of the plan's output tuple
+ * and (if necessary) coercing the tuple to the right tuple format.
+ */
+static TupleTableSlot *
+ExecGetInsertNewTuple(ResultRelInfo *relinfo,
+ TupleTableSlot *planSlot)
+{
+ ProjectionInfo *newProj = relinfo->ri_projectNew;
+ ExprContext *econtext;
+
+ /*
+ * If there's no projection to be done, just make sure the slot is of the
+ * right type for the target rel. If the planSlot is the right type we
+ * can use it as-is, else copy the data into ri_newTupleSlot.
+ */
+ if (newProj == NULL)
+ {
+ if (relinfo->ri_newTupleSlot->tts_ops != planSlot->tts_ops)
+ {
+ ExecCopySlot(relinfo->ri_newTupleSlot, planSlot);
+ return relinfo->ri_newTupleSlot;
+ }
+ else
+ return planSlot;
+ }
+
+ /*
+ * Else project; since the projection output slot is ri_newTupleSlot, this
+ * will also fix any slot-type problem.
+ *
+ * Note: currently, this is dead code, because INSERT cases don't receive
+ * any junk columns so there's never a projection to be done.
+ */
+ econtext = newProj->pi_exprContext;
+ econtext->ecxt_outertuple = planSlot;
+ return ExecProject(newProj);
+}
+
+/*
+ * ExecGetUpdateNewTuple
+ * This prepares a "new" tuple by combining an UPDATE subplan's output
+ * tuple (which contains values of changed columns) with unchanged
+ * columns taken from the old tuple.
+ *
+ * The subplan tuple might also contain junk columns, which are ignored.
+ * Note that the projection also ensures we have a slot of the right type.
+ */
+TupleTableSlot *
+ExecGetUpdateNewTuple(ResultRelInfo *relinfo,
+ TupleTableSlot *planSlot,
+ TupleTableSlot *oldSlot)
+{
+ ProjectionInfo *newProj = relinfo->ri_projectNew;
+ ExprContext *econtext;
+
+ /* Use a few extra Asserts to protect against outside callers */
+ Assert(relinfo->ri_projectNewInfoValid);
+ Assert(planSlot != NULL && !TTS_EMPTY(planSlot));
+ Assert(oldSlot != NULL && !TTS_EMPTY(oldSlot));
+
+ econtext = newProj->pi_exprContext;
+ econtext->ecxt_outertuple = planSlot;
+ econtext->ecxt_scantuple = oldSlot;
+ return ExecProject(newProj);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecInsert
+ *
+ * For INSERT, we have to insert the tuple into the target relation
+ * (or partition thereof) and insert appropriate tuples into the index
+ * relations.
+ *
+ * slot contains the new tuple value to be stored.
+ * planSlot is the output of the ModifyTable's subplan; we use it
+ * to access "junk" columns that are not going to be stored.
+ *
+ * Returns RETURNING result if any, otherwise NULL.
+ *
+ * This may change the currently active tuple conversion map in
+ * mtstate->mt_transition_capture, so the callers must take care to
+ * save the previous value to avoid losing track of it.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecInsert(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot,
+ TupleTableSlot *planSlot,
+ EState *estate,
+ bool canSetTag)
+{
+ Relation resultRelationDesc;
+ List *recheckIndexes = NIL;
+ TupleTableSlot *result = NULL;
+ TransitionCaptureState *ar_insert_trig_tcs;
+ ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+ OnConflictAction onconflict = node->onConflictAction;
+ PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+ MemoryContext oldContext;
+
+ /*
+ * If the input result relation is a partitioned table, find the leaf
+ * partition to insert the tuple into.
+ */
+ if (proute)
+ {
+ ResultRelInfo *partRelInfo;
+
+ slot = ExecPrepareTupleRouting(mtstate, estate, proute,
+ resultRelInfo, slot,
+ &partRelInfo);
+ resultRelInfo = partRelInfo;
+ }
+
+ ExecMaterializeSlot(slot);
+
+ resultRelationDesc = resultRelInfo->ri_RelationDesc;
+
+ /*
+ * Open the table's indexes, if we have not done so already, so that we
+ * can add new index entries for the inserted tuple.
+ */
+ if (resultRelationDesc->rd_rel->relhasindex &&
+ resultRelInfo->ri_IndexRelationDescs == NULL)
+ ExecOpenIndices(resultRelInfo, onconflict != ONCONFLICT_NONE);
+
+ /*
+ * BEFORE ROW INSERT Triggers.
+ *
+ * Note: We fire BEFORE ROW TRIGGERS for every attempted insertion in an
+ * INSERT ... ON CONFLICT statement. We cannot check for constraint
+ * violations before firing these triggers, because they can change the
+ * values to insert. Also, they can run arbitrary user-defined code with
+ * side-effects that we can't cancel by just not inserting the tuple.
+ */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_insert_before_row)
+ {
+ if (!ExecBRInsertTriggers(estate, resultRelInfo, slot))
+ return NULL; /* "do nothing" */
+ }
+
+ /* INSTEAD OF ROW INSERT Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_insert_instead_row)
+ {
+ if (!ExecIRInsertTriggers(estate, resultRelInfo, slot))
+ return NULL; /* "do nothing" */
+ }
+ else if (resultRelInfo->ri_FdwRoutine)
+ {
+ /*
+ * GENERATED expressions might reference the tableoid column, so
+ * (re-)initialize tts_tableOid before evaluating them.
+ */
+ slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+
+ /*
+ * Compute stored generated columns
+ */
+ if (resultRelationDesc->rd_att->constr &&
+ resultRelationDesc->rd_att->constr->has_generated_stored)
+ ExecComputeStoredGenerated(resultRelInfo, estate, slot,
+ CMD_INSERT);
+
+ /*
+ * If the FDW supports batching, and batching is requested, accumulate
+ * rows and insert them in batches. Otherwise use the per-row inserts.
+ */
+ if (resultRelInfo->ri_BatchSize > 1)
+ {
+ /*
+ * If a certain number of tuples have already been accumulated, or
+ * a tuple has come for a different relation than that for the
+ * accumulated tuples, perform the batch insert
+ */
+ if (resultRelInfo->ri_NumSlots == resultRelInfo->ri_BatchSize)
+ {
+ ExecBatchInsert(mtstate, resultRelInfo,
+ resultRelInfo->ri_Slots,
+ resultRelInfo->ri_PlanSlots,
+ resultRelInfo->ri_NumSlots,
+ estate, canSetTag);
+ resultRelInfo->ri_NumSlots = 0;
+ }
+
+ oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ if (resultRelInfo->ri_Slots == NULL)
+ {
+ resultRelInfo->ri_Slots = palloc(sizeof(TupleTableSlot *) *
+ resultRelInfo->ri_BatchSize);
+ resultRelInfo->ri_PlanSlots = palloc(sizeof(TupleTableSlot *) *
+ resultRelInfo->ri_BatchSize);
+ }
+
+ /*
+ * Initialize the batch slots. We don't know how many slots will
+ * be needed, so we initialize them as the batch grows, and we
+ * keep them across batches. To mitigate an inefficiency in how the
+ * resource owner handles objects with many references (as with many
+ * slots all referencing the same tuple descriptor), we copy
+ * the appropriate tuple descriptor for each slot.
+ */
+ if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized)
+ {
+ TupleDesc tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
+ TupleDesc plan_tdesc =
+ CreateTupleDescCopy(planSlot->tts_tupleDescriptor);
+
+ resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
+ MakeSingleTupleTableSlot(tdesc, slot->tts_ops);
+
+ resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] =
+ MakeSingleTupleTableSlot(plan_tdesc, planSlot->tts_ops);
+
+ /* remember how many batch slots we initialized */
+ resultRelInfo->ri_NumSlotsInitialized++;
+ }
+
+ ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots],
+ slot);
+
+ ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots],
+ planSlot);
+
+ resultRelInfo->ri_NumSlots++;
+
+ MemoryContextSwitchTo(oldContext);
+
+ return NULL;
+ }
+
+ /*
+ * insert into foreign table: let the FDW do it
+ */
+ slot = resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate,
+ resultRelInfo,
+ slot,
+ planSlot);
+
+ if (slot == NULL) /* "do nothing" */
+ return NULL;
+
+ /*
+ * AFTER ROW Triggers or RETURNING expressions might reference the
+ * tableoid column, so (re-)initialize tts_tableOid before evaluating
+ * them. (This covers the case where the FDW replaced the slot.)
+ */
+ slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+ }
+ else
+ {
+ WCOKind wco_kind;
+
+ /*
+ * Constraints and GENERATED expressions might reference the tableoid
+ * column, so (re-)initialize tts_tableOid before evaluating them.
+ */
+ slot->tts_tableOid = RelationGetRelid(resultRelationDesc);
+
+ /*
+ * Compute stored generated columns
+ */
+ if (resultRelationDesc->rd_att->constr &&
+ resultRelationDesc->rd_att->constr->has_generated_stored)
+ ExecComputeStoredGenerated(resultRelInfo, estate, slot,
+ CMD_INSERT);
+
+ /*
+ * Check any RLS WITH CHECK policies.
+ *
+ * Normally we should check INSERT policies. But if the insert is the
+ * result of a partition key update that moved the tuple to a new
+ * partition, we should instead check UPDATE policies, because we are
+ * executing policies defined on the target table, and not those
+ * defined on the child partitions.
+ */
+ wco_kind = (mtstate->operation == CMD_UPDATE) ?
+ WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK;
+
+ /*
+ * ExecWithCheckOptions() will skip any WCOs which are not of the kind
+ * we are looking for at this point.
+ */
+ if (resultRelInfo->ri_WithCheckOptions != NIL)
+ ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate);
+
+ /*
+ * Check the constraints of the tuple.
+ */
+ if (resultRelationDesc->rd_att->constr)
+ ExecConstraints(resultRelInfo, slot, estate);
+
+ /*
+ * Also check the tuple against the partition constraint, if there is
+ * one; except that if we got here via tuple routing, we only need to
+ * do so if a BR trigger is defined on the partition, since the trigger
+ * could have modified the routed tuple.
+ */
+ if (resultRelationDesc->rd_rel->relispartition &&
+ (resultRelInfo->ri_RootResultRelInfo == NULL ||
+ (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_insert_before_row)))
+ ExecPartitionCheck(resultRelInfo, slot, estate, true);
+
+ if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0)
+ {
+ /* Perform a speculative insertion. */
+ uint32 specToken;
+ ItemPointerData conflictTid;
+ bool specConflict;
+ List *arbiterIndexes;
+
+ arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes;
+
+ /*
+ * Do a non-conclusive check for conflicts first.
+ *
+ * We're not holding any locks yet, so this doesn't guarantee that
+ * the later insert won't conflict. But it avoids leaving behind
+ * a lot of canceled speculative insertions, if you run a lot of
+ * INSERT ON CONFLICT statements that do conflict.
+ *
+ * We loop back here if we find a conflict below, either during
+ * the pre-check, or when we re-check after inserting the tuple
+ * speculatively. Better allow interrupts in case some bug makes
+ * this an infinite loop.
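+ *
+ * In outline, the speculative-insertion protocol below is: (1) pre-check
+ * the arbiter indexes for an existing conflicting tuple; (2) acquire a
+ * speculative-insertion token and insert the tuple marked speculative;
+ * (3) insert index entries, rechecking the arbiters; (4) either finish
+ * the insertion or kill the speculative tuple and retry from (1) if a
+ * conflict appeared in the meantime.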
+ */
+ vlock:
+ CHECK_FOR_INTERRUPTS();
+ specConflict = false;
+ if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate,
+ &conflictTid, arbiterIndexes))
+ {
+ /* committed conflict tuple found */
+ if (onconflict == ONCONFLICT_UPDATE)
+ {
+ /*
+ * In case of ON CONFLICT DO UPDATE, execute the UPDATE
+ * part. Be prepared to retry if the UPDATE fails because
+ * of another concurrent UPDATE/DELETE to the conflict
+ * tuple.
+ */
+ TupleTableSlot *returning = NULL;
+
+ if (ExecOnConflictUpdate(mtstate, resultRelInfo,
+ &conflictTid, planSlot, slot,
+ estate, canSetTag, &returning))
+ {
+ InstrCountTuples2(&mtstate->ps, 1);
+ return returning;
+ }
+ else
+ goto vlock;
+ }
+ else
+ {
+ /*
+ * In case of ON CONFLICT DO NOTHING, do nothing. However,
+ * verify that the tuple is visible to the executor's MVCC
+ * snapshot at higher isolation levels.
+ *
+ * Using ExecGetReturningSlot() to store the tuple for the
+ * recheck isn't that pretty, but we can't trivially use
+ * the input slot, because it might not be of a compatible
+ * type. And as there's no conflicting use of
+ * ExecGetReturningSlot() in the DO NOTHING case, it will do.
+ */
+ Assert(onconflict == ONCONFLICT_NOTHING);
+ ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid,
+ ExecGetReturningSlot(estate, resultRelInfo));
+ InstrCountTuples2(&mtstate->ps, 1);
+ return NULL;
+ }
+ }
+
+ /*
+ * Before we start insertion proper, acquire our "speculative
+ * insertion lock". Others can use that to wait for us to decide
+ * if we're going to go ahead with the insertion, instead of
+ * waiting for the whole transaction to complete.
+ */
+ specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
+
+ /* insert the tuple, with the speculative token */
+ table_tuple_insert_speculative(resultRelationDesc, slot,
+ estate->es_output_cid,
+ 0,
+ NULL,
+ specToken);
+
+ /* insert index entries for tuple */
+ recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
+ slot, estate, false, true,
+ &specConflict,
+ arbiterIndexes);
+
+ /* adjust the tuple's state accordingly */
+ table_tuple_complete_speculative(resultRelationDesc, slot,
+ specToken, !specConflict);
+
+ /*
+ * Wake up anyone waiting for our decision. They will re-check
+ * the tuple, see that it's no longer speculative, and wait on our
+ * XID as if this was a regularly inserted tuple all along. Or if
+ * we killed the tuple, they will see it's dead, and proceed as if
+ * the tuple never existed.
+ */
+ SpeculativeInsertionLockRelease(GetCurrentTransactionId());
+
+ /*
+ * If there was a conflict, start from the beginning. We'll do
+ * the pre-check again, which will now find the conflicting tuple
+ * (unless it aborts before we get there).
+ */
+ if (specConflict)
+ {
+ list_free(recheckIndexes);
+ goto vlock;
+ }
+
+ /* Since there was no insertion conflict, we're done */
+ }
+ else
+ {
+ /* insert the tuple normally */
+ table_tuple_insert(resultRelationDesc, slot,
+ estate->es_output_cid,
+ 0, NULL);
+
+ /* insert index entries for tuple */
+ if (resultRelInfo->ri_NumIndices > 0)
+ recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
+ slot, estate, false,
+ false, NULL, NIL);
+ }
+ }
+
+ if (canSetTag)
+ (estate->es_processed)++;
+
+ /*
+ * If this insert is the result of a partition key update that moved the
+ * tuple to a new partition, put this row into the transition NEW TABLE,
+ * if there is one. We need to do this separately for DELETE and INSERT
+ * because they happen on different tables.
+ */
+ ar_insert_trig_tcs = mtstate->mt_transition_capture;
+ if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+ && mtstate->mt_transition_capture->tcs_update_new_table)
+ {
+ ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+ NULL,
+ slot,
+ NULL,
+ mtstate->mt_transition_capture);
+
+ /*
+ * We've already captured the NEW TABLE row, so make sure any AR
+ * INSERT trigger fired below doesn't capture it again.
+ */
+ ar_insert_trig_tcs = NULL;
+ }
+
+ /* AFTER ROW INSERT Triggers */
+ ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+ ar_insert_trig_tcs);
+
+ list_free(recheckIndexes);
+
+ /*
+ * Check any WITH CHECK OPTION constraints from parent views. We are
+ * required to do this after testing all constraints and uniqueness
+ * violations per the SQL spec, so we do it after actually inserting the
+ * record into the heap and all indexes.
+ *
+ * ExecWithCheckOptions will elog(ERROR) if a violation is found, so the
+ * tuple will never be seen, if it violates the WITH CHECK OPTION.
+ *
+ * ExecWithCheckOptions() will skip any WCOs which are not of the kind we
+ * are looking for at this point.
+ */
+ if (resultRelInfo->ri_WithCheckOptions != NIL)
+ ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate);
+
+ /* Process RETURNING if present */
+ if (resultRelInfo->ri_projectReturning)
+ result = ExecProcessReturning(resultRelInfo, slot, planSlot);
+
+ return result;
+}
+
+/* ----------------------------------------------------------------
+ * ExecBatchInsert
+ *
+ * Insert multiple tuples in an efficient way.
+ * Currently, this handles inserting into a foreign table without a
+ * RETURNING clause.
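+ *
+ * This path is reached only when the FDW advertises a batch size greater
+ * than one (for instance, postgres_fdw does so when its batch_size option
+ * is set); rows are buffered in ExecInsert and flushed here through the
+ * FDW's ExecForeignBatchInsert callback.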
+ * ----------------------------------------------------------------
+ */
+static void
+ExecBatchInsert(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ TupleTableSlot **slots,
+ TupleTableSlot **planSlots,
+ int numSlots,
+ EState *estate,
+ bool canSetTag)
+{
+ int i;
+ int numInserted = numSlots;
+ TupleTableSlot *slot = NULL;
+ TupleTableSlot **rslots;
+
+ /*
+ * insert into foreign table: let the FDW do it
+ */
+ rslots = resultRelInfo->ri_FdwRoutine->ExecForeignBatchInsert(estate,
+ resultRelInfo,
+ slots,
+ planSlots,
+ &numInserted);
+
+ for (i = 0; i < numInserted; i++)
+ {
+ slot = rslots[i];
+
+ /*
+ * AFTER ROW Triggers or RETURNING expressions might reference the
+ * tableoid column, so (re-)initialize tts_tableOid before evaluating
+ * them.
+ */
+ slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+
+ /* AFTER ROW INSERT Triggers */
+ ExecARInsertTriggers(estate, resultRelInfo, slot, NIL,
+ mtstate->mt_transition_capture);
+
+ /*
+ * Check any WITH CHECK OPTION constraints from parent views. See the
+ * comment in ExecInsert.
+ */
+ if (resultRelInfo->ri_WithCheckOptions != NIL)
+ ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate);
+ }
+
+ if (canSetTag && numInserted > 0)
+ estate->es_processed += numInserted;
+}
+
+/* ----------------------------------------------------------------
+ * ExecDelete
+ *
+ * DELETE is like UPDATE, except that we delete the tuple and no
+ * index modifications are needed.
+ *
+ * When deleting from a table, tupleid identifies the tuple to
+ * delete and oldtuple is NULL. When deleting from a view,
+ * oldtuple is passed to the INSTEAD OF triggers and identifies
+ * what to delete, and tupleid is invalid. When deleting from a
+ * foreign table, tupleid is invalid; the FDW has to figure out
+ * which row to delete using data from the planSlot. oldtuple is
+ * passed to foreign table triggers; it is NULL when the foreign
+ * table has no relevant triggers. We use tupleDeleted to indicate
+ * whether the tuple is actually deleted; callers can use it to
+ * decide whether to continue the operation. When this DELETE is
+ * part of an UPDATE of the partition key, the slot returned by
+ * EvalPlanQual() is passed back using the output parameter epqslot.
+ *
+ * Returns RETURNING result if any, otherwise NULL.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecDelete(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ ItemPointer tupleid,
+ HeapTuple oldtuple,
+ TupleTableSlot *planSlot,
+ EPQState *epqstate,
+ EState *estate,
+ bool processReturning,
+ bool canSetTag,
+ bool changingPart,
+ bool *tupleDeleted,
+ TupleTableSlot **epqreturnslot)
+{
+ Relation resultRelationDesc = resultRelInfo->ri_RelationDesc;
+ TM_Result result;
+ TM_FailureData tmfd;
+ TupleTableSlot *slot = NULL;
+ TransitionCaptureState *ar_delete_trig_tcs;
+
+ if (tupleDeleted)
+ *tupleDeleted = false;
+
+ /* BEFORE ROW DELETE Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_delete_before_row)
+ {
+ bool dodelete;
+
+ dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
+ tupleid, oldtuple, epqreturnslot);
+
+ if (!dodelete) /* "do nothing" */
+ return NULL;
+ }
+
+ /* INSTEAD OF ROW DELETE Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_delete_instead_row)
+ {
+ bool dodelete;
+
+ Assert(oldtuple != NULL);
+ dodelete = ExecIRDeleteTriggers(estate, resultRelInfo, oldtuple);
+
+ if (!dodelete) /* "do nothing" */
+ return NULL;
+ }
+ else if (resultRelInfo->ri_FdwRoutine)
+ {
+ /*
+ * delete from foreign table: let the FDW do it
+ *
+ * We offer the returning slot as a place to store RETURNING data,
+ * although the FDW can return some other slot if it wants.
+ */
+ slot = ExecGetReturningSlot(estate, resultRelInfo);
+ slot = resultRelInfo->ri_FdwRoutine->ExecForeignDelete(estate,
+ resultRelInfo,
+ slot,
+ planSlot);
+
+ if (slot == NULL) /* "do nothing" */
+ return NULL;
+
+ /*
+ * RETURNING expressions might reference the tableoid column, so
+ * (re)initialize tts_tableOid before evaluating them.
+ */
+ if (TTS_EMPTY(slot))
+ ExecStoreAllNullTuple(slot);
+
+ slot->tts_tableOid = RelationGetRelid(resultRelationDesc);
+ }
+ else
+ {
+ /*
+ * delete the tuple
+ *
+ * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check
+ * that the row to be deleted is visible to that snapshot, and throw a
+ * can't-serialize error if not. This is a special-case behavior
+ * needed for referential integrity updates in transaction-snapshot
+ * mode transactions.
+ */
+ldelete:;
+ result = table_tuple_delete(resultRelationDesc, tupleid,
+ estate->es_output_cid,
+ estate->es_snapshot,
+ estate->es_crosscheck_snapshot,
+ true /* wait for commit */ ,
+ &tmfd,
+ changingPart);
+
+ switch (result)
+ {
+ case TM_SelfModified:
+
+ /*
+ * The target tuple was already updated or deleted by the
+ * current command, or by a later command in the current
+ * transaction. The former case is possible in a join DELETE
+ * where multiple tuples join to the same target tuple. This
+ * is somewhat questionable, but Postgres has always allowed
+ * it: we just ignore additional deletion attempts.
+ *
+ * The latter case arises if the tuple is modified by a
+ * command in a BEFORE trigger, or perhaps by a command in a
+ * volatile function used in the query. In such situations we
+ * should not ignore the deletion, but it is equally unsafe to
+ * proceed. We don't want to discard the original DELETE
+ * while keeping the triggered actions based on its deletion;
+ * and it would be no better to allow the original DELETE
+ * while discarding updates that it triggered. The row update
+ * carries some information that might be important according
+ * to business rules; so throwing an error is the only safe
+ * course.
+ *
+ * If a trigger actually intends this type of interaction, it
+ * can re-execute the DELETE and then return NULL to cancel
+ * the outer delete.
+ */
+ if (tmfd.cmax != estate->es_output_cid)
+ ereport(ERROR,
+ (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION),
+ errmsg("tuple to be deleted was already modified by an operation triggered by the current command"),
+ errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
+
+ /* Else, already deleted by self; nothing to do */
+ return NULL;
+
+ case TM_Ok:
+ break;
+
+ case TM_Updated:
+ {
+ TupleTableSlot *inputslot;
+ TupleTableSlot *epqslot;
+
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+
+ /*
+ * Already know that we're going to need to do EPQ, so
+ * fetch tuple directly into the right slot.
+ */
+ EvalPlanQualBegin(epqstate);
+ inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc,
+ resultRelInfo->ri_RangeTableIndex);
+
+ result = table_tuple_lock(resultRelationDesc, tupleid,
+ estate->es_snapshot,
+ inputslot, estate->es_output_cid,
+ LockTupleExclusive, LockWaitBlock,
+ TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
+ &tmfd);
+
+ switch (result)
+ {
+ case TM_Ok:
+ Assert(tmfd.traversed);
+ epqslot = EvalPlanQual(epqstate,
+ resultRelationDesc,
+ resultRelInfo->ri_RangeTableIndex,
+ inputslot);
+ if (TupIsNull(epqslot))
+ /* Tuple not passing quals anymore, exiting... */
+ return NULL;
+
+ /*
+ * If requested, skip delete and pass back the
+ * updated row.
+ */
+ if (epqreturnslot)
+ {
+ *epqreturnslot = epqslot;
+ return NULL;
+ }
+ else
+ goto ldelete;
+
+ case TM_SelfModified:
+
+ /*
+ * This can be reached when following an update
+ * chain from a tuple updated by another session,
+ * reaching a tuple that was already updated in
+ * this transaction. If previously updated by this
+ * command, ignore the delete, otherwise error
+ * out.
+ *
+ * See also TM_SelfModified response to
+ * table_tuple_delete() above.
+ */
+ if (tmfd.cmax != estate->es_output_cid)
+ ereport(ERROR,
+ (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION),
+ errmsg("tuple to be deleted was already modified by an operation triggered by the current command"),
+ errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
+ return NULL;
+
+ case TM_Deleted:
+ /* tuple already deleted; nothing to do */
+ return NULL;
+
+ default:
+
+ /*
+ * TM_Invisible should be impossible because we're
+ * waiting for updated row versions, and would
+ * already have errored out if the first version
+ * is invisible.
+ *
+ * TM_Updated should be impossible, because we're
+ * locking the latest version via
+ * TUPLE_LOCK_FLAG_FIND_LAST_VERSION.
+ */
+ elog(ERROR, "unexpected table_tuple_lock status: %u",
+ result);
+ return NULL;
+ }
+
+ Assert(false);
+ break;
+ }
+
+ case TM_Deleted:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent delete")));
+ /* tuple already deleted; nothing to do */
+ return NULL;
+
+ default:
+ elog(ERROR, "unrecognized table_tuple_delete status: %u",
+ result);
+ return NULL;
+ }
+
+ /*
+ * Note: Normally one would think that we have to delete index tuples
+ * associated with the heap tuple now...
+ *
+ * ... but in POSTGRES, we have no need to do this because VACUUM will
+ * take care of it later. We can't delete index tuples immediately
+ * anyway, since the tuple is still visible to other transactions.
+ */
+ }
+
+ if (canSetTag)
+ (estate->es_processed)++;
+
+ /* Tell caller that the delete actually happened. */
+ if (tupleDeleted)
+ *tupleDeleted = true;
+
+ /*
+ * If this delete is the result of a partition key update that moved the
+ * tuple to a new partition, put this row into the transition OLD TABLE,
+ * if there is one. We need to do this separately for DELETE and INSERT
+ * because they happen on different tables.
+ */
+ ar_delete_trig_tcs = mtstate->mt_transition_capture;
+ if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+ && mtstate->mt_transition_capture->tcs_update_old_table)
+ {
+ ExecARUpdateTriggers(estate, resultRelInfo,
+ tupleid,
+ oldtuple,
+ NULL,
+ NULL,
+ mtstate->mt_transition_capture);
+
+ /*
+ * We've already captured the OLD TABLE row, so make sure any AR
+ * DELETE trigger fired below doesn't capture it again.
+ */
+ ar_delete_trig_tcs = NULL;
+ }
+
+ /* AFTER ROW DELETE Triggers */
+ ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,
+ ar_delete_trig_tcs);
+
+ /* Process RETURNING if present and if requested */
+ if (processReturning && resultRelInfo->ri_projectReturning)
+ {
+ /*
+ * We have to put the target tuple into a slot, which means first we
+ * have to fetch it. We can use the returning slot for that.
+ */
+ TupleTableSlot *rslot;
+
+ if (resultRelInfo->ri_FdwRoutine)
+ {
+ /* FDW must have provided a slot containing the deleted row */
+ Assert(!TupIsNull(slot));
+ }
+ else
+ {
+ slot = ExecGetReturningSlot(estate, resultRelInfo);
+ if (oldtuple != NULL)
+ {
+ ExecForceStoreHeapTuple(oldtuple, slot, false);
+ }
+ else
+ {
+ if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid,
+ SnapshotAny, slot))
+ elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING");
+ }
+ }
+
+ rslot = ExecProcessReturning(resultRelInfo, slot, planSlot);
+
+ /*
+ * Before releasing the target tuple again, make sure rslot has a
+ * local copy of any pass-by-reference values.
+ */
+ ExecMaterializeSlot(rslot);
+
+ ExecClearTuple(slot);
+
+ return rslot;
+ }
+
+ return NULL;
+}
+
+/*
+ * ExecCrossPartitionUpdate --- Move an updated tuple to another partition.
+ *
+ * This works by first deleting the old tuple from the current partition,
+ * followed by inserting the new tuple into the root parent table, that is,
+ * mtstate->rootResultRelInfo. It will be re-routed from there to the
+ * correct partition.
+ *
+ * Returns true if the tuple has been successfully moved, or if it's found
+ * that the tuple was concurrently deleted so there's nothing more to do
+ * for the caller.
+ *
+ * False is returned if the tuple we're trying to move is found to have been
+ * concurrently updated. In that case, the caller must check if the
+ * updated tuple that's returned in *retry_slot still needs to be re-routed,
+ * and call this function again or perform a regular update accordingly.
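+ *
+ * For example, with a hypothetical table range-partitioned on a column
+ * "logdate", an UPDATE that changes logdate so the row no longer fits its
+ * current partition ends up here: the row is deleted from that partition
+ * and re-inserted through the root, which routes it to the new partition.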
+ */
+static bool
+ExecCrossPartitionUpdate(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ ItemPointer tupleid, HeapTuple oldtuple,
+ TupleTableSlot *slot, TupleTableSlot *planSlot,
+ EPQState *epqstate, bool canSetTag,
+ TupleTableSlot **retry_slot,
+ TupleTableSlot **inserted_tuple)
+{
+ EState *estate = mtstate->ps.state;
+ TupleConversionMap *tupconv_map;
+ bool tuple_deleted;
+ TupleTableSlot *epqslot = NULL;
+
+ *inserted_tuple = NULL;
+ *retry_slot = NULL;
+
+ /*
+ * Disallow an INSERT ON CONFLICT DO UPDATE that causes the original row
+ * to migrate to a different partition. Maybe this can be implemented
+ * some day, but it seems a fringe feature with little redeeming value.
+ */
+ if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("invalid ON UPDATE specification"),
+ errdetail("The result tuple would appear in a different partition than the original tuple.")));
+
+ /*
+ * When an UPDATE is run directly on a leaf partition, simply fail with a
+ * partition constraint violation error.
+ */
+ if (resultRelInfo == mtstate->rootResultRelInfo)
+ ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
+
+ /* Initialize tuple routing info if not already done. */
+ if (mtstate->mt_partition_tuple_routing == NULL)
+ {
+ Relation rootRel = mtstate->rootResultRelInfo->ri_RelationDesc;
+ MemoryContext oldcxt;
+
+ /* Things built here have to last for the query duration. */
+ oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ mtstate->mt_partition_tuple_routing =
+ ExecSetupPartitionTupleRouting(estate, rootRel);
+
+ /*
+ * Before a partition's tuple can be re-routed, it must first be
+ * converted to the root's format, so we'll need a slot for storing
+ * such tuples.
+ */
+ Assert(mtstate->mt_root_tuple_slot == NULL);
+ mtstate->mt_root_tuple_slot = table_slot_create(rootRel, NULL);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ /*
+ * Row movement, part 1. Delete the tuple, but skip RETURNING processing.
+ * We want to return rows from INSERT.
+ */
+ ExecDelete(mtstate, resultRelInfo, tupleid, oldtuple, planSlot,
+ epqstate, estate,
+ false, /* processReturning */
+ false, /* canSetTag */
+ true, /* changingPart */
+ &tuple_deleted, &epqslot);
+
+ /*
+ * If for some reason the DELETE didn't happen (e.g. a trigger prevented
+ * it, or it was already deleted by self, or it was concurrently deleted
+ * by another transaction), then we should skip the insert as well;
+ * otherwise, an UPDATE could cause an increase in the total number of
+ * rows across all partitions, which is clearly wrong.
+ *
+ * For a normal UPDATE, the case where the tuple has been the subject of a
+ * concurrent UPDATE or DELETE would be handled by the EvalPlanQual
+ * machinery, but for an UPDATE that we've translated into a DELETE from
+ * this partition and an INSERT into some other partition, that's not
+ * available, because CTID chains can't span relation boundaries. We
+ * mimic the semantics to a limited extent by skipping the INSERT if the
+ * DELETE fails to find a tuple. This ensures that two concurrent
+ * attempts to UPDATE the same tuple at the same time can't turn one tuple
+ * into two, and that an UPDATE of a just-deleted tuple can't resurrect
+ * it.
+ */
+ if (!tuple_deleted)
+ {
+ /*
+ * epqslot will be typically NULL. But when ExecDelete() finds that
+ * another transaction has concurrently updated the same row, it
+ * re-fetches the row, skips the delete, and epqslot is set to the
+ * re-fetched tuple slot. In that case, we need to do all the checks
+ * again.
+ */
+ if (TupIsNull(epqslot))
+ return true;
+ else
+ {
+ /* Fetch the most recent version of old tuple. */
+ TupleTableSlot *oldSlot;
+
+ /* ... but first, make sure ri_oldTupleSlot is initialized. */
+ if (unlikely(!resultRelInfo->ri_projectNewInfoValid))
+ ExecInitUpdateProjection(mtstate, resultRelInfo);
+ oldSlot = resultRelInfo->ri_oldTupleSlot;
+ if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc,
+ tupleid,
+ SnapshotAny,
+ oldSlot))
+ elog(ERROR, "failed to fetch tuple being updated");
+ *retry_slot = ExecGetUpdateNewTuple(resultRelInfo, epqslot,
+ oldSlot);
+ return false;
+ }
+ }
+
+ /*
+ * resultRelInfo is one of the per-relation resultRelInfos. So we should
+ * convert the tuple into root's tuple descriptor if needed, since
+ * ExecInsert() starts the search from root.
+ */
+ tupconv_map = ExecGetChildToRootMap(resultRelInfo);
+ if (tupconv_map != NULL)
+ slot = execute_attr_map_slot(tupconv_map->attrMap,
+ slot,
+ mtstate->mt_root_tuple_slot);
+
+ /* Tuple routing starts from the root table. */
+ *inserted_tuple = ExecInsert(mtstate, mtstate->rootResultRelInfo, slot,
+ planSlot, estate, canSetTag);
+
+ /*
+ * Reset the transition state that may possibly have been written by
+ * INSERT.
+ */
+ if (mtstate->mt_transition_capture)
+ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+
+ /* We're done moving. */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecUpdate
+ *
+ * note: we can't run UPDATE queries with transactions
+ * off because UPDATEs are actually INSERTs and our
+ * scan will mistakenly loop forever, updating the tuple
+ * it just inserted. This should be fixed, but until it
+ * is we don't want to get stuck in an infinite loop
+ * that corrupts your database.
+ *
+ * When updating a table, tupleid identifies the tuple to
+ * update and oldtuple is NULL. When updating a view, oldtuple
+ * is passed to the INSTEAD OF triggers and identifies what to
+ * update, and tupleid is invalid. When updating a foreign table,
+ * tupleid is invalid; the FDW has to figure out which row to
+ * update using data from the planSlot. oldtuple is passed to
+ * foreign table triggers; it is NULL when the foreign table has
+ * no relevant triggers.
+ *
+ * slot contains the new tuple value to be stored.
+ * planSlot is the output of the ModifyTable's subplan; we use it
+ * to access values from other input tables (for RETURNING),
+ * row-ID junk columns, etc.
+ *
+ * Returns RETURNING result if any, otherwise NULL.
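+ *
+ * Concurrency note, as handled in the code below: at READ COMMITTED a
+ * concurrently updated target row is re-fetched and re-checked via
+ * EvalPlanQual before the update is retried, while at REPEATABLE READ or
+ * SERIALIZABLE such a conflict raises a serialization failure instead.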
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecUpdate(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ ItemPointer tupleid,
+ HeapTuple oldtuple,
+ TupleTableSlot *slot,
+ TupleTableSlot *planSlot,
+ EPQState *epqstate,
+ EState *estate,
+ bool canSetTag)
+{
+ Relation resultRelationDesc = resultRelInfo->ri_RelationDesc;
+ TM_Result result;
+ TM_FailureData tmfd;
+ List *recheckIndexes = NIL;
+
+ /*
+ * abort the operation if not running transactions
+ */
+ if (IsBootstrapProcessingMode())
+ elog(ERROR, "cannot UPDATE during bootstrap");
+
+ ExecMaterializeSlot(slot);
+
+ /*
+ * Open the table's indexes, if we have not done so already, so that we
+ * can add new index entries for the updated tuple.
+ */
+ if (resultRelationDesc->rd_rel->relhasindex &&
+ resultRelInfo->ri_IndexRelationDescs == NULL)
+ ExecOpenIndices(resultRelInfo, false);
+
+ /* BEFORE ROW UPDATE Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_update_before_row)
+ {
+ if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo,
+ tupleid, oldtuple, slot))
+ return NULL; /* "do nothing" */
+ }
+
+ /* INSTEAD OF ROW UPDATE Triggers */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_update_instead_row)
+ {
+ if (!ExecIRUpdateTriggers(estate, resultRelInfo,
+ oldtuple, slot))
+ return NULL; /* "do nothing" */
+ }
+ else if (resultRelInfo->ri_FdwRoutine)
+ {
+ /*
+ * GENERATED expressions might reference the tableoid column, so
+ * (re-)initialize tts_tableOid before evaluating them.
+ */
+ slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+
+ /*
+ * Compute stored generated columns
+ */
+ if (resultRelationDesc->rd_att->constr &&
+ resultRelationDesc->rd_att->constr->has_generated_stored)
+ ExecComputeStoredGenerated(resultRelInfo, estate, slot,
+ CMD_UPDATE);
+
+ /*
+ * update in foreign table: let the FDW do it
+ */
+ slot = resultRelInfo->ri_FdwRoutine->ExecForeignUpdate(estate,
+ resultRelInfo,
+ slot,
+ planSlot);
+
+ if (slot == NULL) /* "do nothing" */
+ return NULL;
+
+ /*
+ * AFTER ROW Triggers or RETURNING expressions might reference the
+ * tableoid column, so (re-)initialize tts_tableOid before evaluating
+ * them. (This covers the case where the FDW replaced the slot.)
+ */
+ slot->tts_tableOid = RelationGetRelid(resultRelationDesc);
+ }
+ else
+ {
+ LockTupleMode lockmode;
+ bool partition_constraint_failed;
+ bool update_indexes;
+
+ /*
+ * Constraints and GENERATED expressions might reference the tableoid
+ * column, so (re-)initialize tts_tableOid before evaluating them.
+ */
+ slot->tts_tableOid = RelationGetRelid(resultRelationDesc);
+
+ /*
+ * Compute stored generated columns
+ */
+ if (resultRelationDesc->rd_att->constr &&
+ resultRelationDesc->rd_att->constr->has_generated_stored)
+ ExecComputeStoredGenerated(resultRelInfo, estate, slot,
+ CMD_UPDATE);
+
+ /*
+ * Check any RLS UPDATE WITH CHECK policies
+ *
+ * If we generate a new candidate tuple after EvalPlanQual testing, we
+ * must loop back here and recheck any RLS policies and constraints.
+ * (We don't need to redo triggers, however. If there are any BEFORE
+ * triggers then trigger.c will have done table_tuple_lock to lock the
+ * correct tuple, so there's no need to do them again.)
+ */
+lreplace:;
+
+ /* ensure slot is independent, consider e.g. EPQ */
+ ExecMaterializeSlot(slot);
+
+ /*
+ * If partition constraint fails, this row might get moved to another
+ * partition, in which case we should check the RLS CHECK policy just
+ * before inserting into the new partition, rather than doing it here.
+ * This is because a trigger on that partition might again change the
+ * row. So skip the WCO checks if the partition constraint fails.
+ */
+ partition_constraint_failed =
+ resultRelationDesc->rd_rel->relispartition &&
+ !ExecPartitionCheck(resultRelInfo, slot, estate, false);
+
+ if (!partition_constraint_failed &&
+ resultRelInfo->ri_WithCheckOptions != NIL)
+ {
+ /*
+ * ExecWithCheckOptions() will skip any WCOs which are not of the
+ * kind we are looking for at this point.
+ */
+ ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK,
+ resultRelInfo, slot, estate);
+ }
+
+ /*
+ * If a partition check failed, try to move the row into the right
+ * partition.
+ */
+ if (partition_constraint_failed)
+ {
+ TupleTableSlot *inserted_tuple,
+ *retry_slot;
+ bool retry;
+
+ /*
+ * ExecCrossPartitionUpdate will first DELETE the row from the
+ * partition it's currently in and then insert it back into the
+ * root table, which will re-route it to the correct partition.
+ * The first part may have to be repeated if it is detected that
+ * the tuple we're trying to move has been concurrently updated.
+ */
+ retry = !ExecCrossPartitionUpdate(mtstate, resultRelInfo, tupleid,
+ oldtuple, slot, planSlot,
+ epqstate, canSetTag,
+ &retry_slot, &inserted_tuple);
+ if (retry)
+ {
+ slot = retry_slot;
+ goto lreplace;
+ }
+
+ return inserted_tuple;
+ }
+
+ /*
+ * Check the constraints of the tuple. We've already checked the
+ * partition constraint above; however, we must still ensure the tuple
+ * passes all other constraints, so we will call ExecConstraints() and
+ * have it validate all remaining checks.
+ */
+ if (resultRelationDesc->rd_att->constr)
+ ExecConstraints(resultRelInfo, slot, estate);
+
+ /*
+ * replace the heap tuple
+ *
+ * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check
+ * that the row to be updated is visible to that snapshot, and throw a
+ * can't-serialize error if not. This is a special-case behavior
+ * needed for referential integrity updates in transaction-snapshot
+ * mode transactions.
+ */
+ result = table_tuple_update(resultRelationDesc, tupleid, slot,
+ estate->es_output_cid,
+ estate->es_snapshot,
+ estate->es_crosscheck_snapshot,
+ true /* wait for commit */ ,
+ &tmfd, &lockmode, &update_indexes);
+
+ switch (result)
+ {
+ case TM_SelfModified:
+
+ /*
+ * The target tuple was already updated or deleted by the
+ * current command, or by a later command in the current
+ * transaction. The former case is possible in a join UPDATE
+ * where multiple tuples join to the same target tuple. This
+ * is pretty questionable, but Postgres has always allowed it:
+ * we just execute the first update action and ignore
+ * additional update attempts.
+ *
+ * The latter case arises if the tuple is modified by a
+ * command in a BEFORE trigger, or perhaps by a command in a
+ * volatile function used in the query. In such situations we
+ * should not ignore the update, but it is equally unsafe to
+ * proceed. We don't want to discard the original UPDATE
+ * while keeping the triggered actions based on it; and we
+ * have no principled way to merge this update with the
+ * previous ones. So throwing an error is the only safe
+ * course.
+ *
+ * If a trigger actually intends this type of interaction, it
+ * can re-execute the UPDATE (assuming it can figure out how)
+ * and then return NULL to cancel the outer update.
+ */
+ if (tmfd.cmax != estate->es_output_cid)
+ ereport(ERROR,
+ (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION),
+ errmsg("tuple to be updated was already modified by an operation triggered by the current command"),
+ errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
+
+ /* Else, already updated by self; nothing to do */
+ return NULL;
+
+ case TM_Ok:
+ break;
+
+ case TM_Updated:
+ {
+ TupleTableSlot *inputslot;
+ TupleTableSlot *epqslot;
+ TupleTableSlot *oldSlot;
+
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+
+ /*
+ * Already know that we're going to need to do EPQ, so
+ * fetch tuple directly into the right slot.
+ */
+ inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc,
+ resultRelInfo->ri_RangeTableIndex);
+
+ result = table_tuple_lock(resultRelationDesc, tupleid,
+ estate->es_snapshot,
+ inputslot, estate->es_output_cid,
+ lockmode, LockWaitBlock,
+ TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
+ &tmfd);
+
+ switch (result)
+ {
+ case TM_Ok:
+ Assert(tmfd.traversed);
+
+ epqslot = EvalPlanQual(epqstate,
+ resultRelationDesc,
+ resultRelInfo->ri_RangeTableIndex,
+ inputslot);
+ if (TupIsNull(epqslot))
+ /* Tuple not passing quals anymore, exiting... */
+ return NULL;
+
+ /* Make sure ri_oldTupleSlot is initialized. */
+ if (unlikely(!resultRelInfo->ri_projectNewInfoValid))
+ ExecInitUpdateProjection(mtstate, resultRelInfo);
+
+ /* Fetch the most recent version of old tuple. */
+ oldSlot = resultRelInfo->ri_oldTupleSlot;
+ if (!table_tuple_fetch_row_version(resultRelationDesc,
+ tupleid,
+ SnapshotAny,
+ oldSlot))
+ elog(ERROR, "failed to fetch tuple being updated");
+ slot = ExecGetUpdateNewTuple(resultRelInfo,
+ epqslot, oldSlot);
+ goto lreplace;
+
+ case TM_Deleted:
+ /* tuple already deleted; nothing to do */
+ return NULL;
+
+ case TM_SelfModified:
+
+ /*
+ * This can be reached when following an update
+ * chain from a tuple updated by another session,
+ * reaching a tuple that was already updated in
+ * this transaction. If previously modified by
+ * this command, ignore the redundant update,
+ * otherwise error out.
+ *
+ * See also TM_SelfModified response to
+ * table_tuple_update() above.
+ */
+ if (tmfd.cmax != estate->es_output_cid)
+ ereport(ERROR,
+ (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION),
+ errmsg("tuple to be updated was already modified by an operation triggered by the current command"),
+ errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
+ return NULL;
+
+ default:
+ /* see table_tuple_lock call in ExecDelete() */
+ elog(ERROR, "unexpected table_tuple_lock status: %u",
+ result);
+ return NULL;
+ }
+ }
+
+ break;
+
+ case TM_Deleted:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent delete")));
+ /* tuple already deleted; nothing to do */
+ return NULL;
+
+ default:
+ elog(ERROR, "unrecognized table_tuple_update status: %u",
+ result);
+ return NULL;
+ }
+
+ /* insert index entries for tuple if necessary */
+ if (resultRelInfo->ri_NumIndices > 0 && update_indexes)
+ recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
+ slot, estate, true, false,
+ NULL, NIL);
+ }
+
+ if (canSetTag)
+ (estate->es_processed)++;
+
+ /* AFTER ROW UPDATE Triggers */
+ ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, slot,
+ recheckIndexes,
+ mtstate->operation == CMD_INSERT ?
+ mtstate->mt_oc_transition_capture :
+ mtstate->mt_transition_capture);
+
+ list_free(recheckIndexes);
+
+ /*
+ * Check any WITH CHECK OPTION constraints from parent views. We are
+ * required to do this after testing all constraints and uniqueness
+ * violations per the SQL spec, so we do it after actually updating the
+ * record in the heap and all indexes.
+ *
+ * ExecWithCheckOptions() will skip any WCOs which are not of the kind we
+ * are looking for at this point.
+ */
+ if (resultRelInfo->ri_WithCheckOptions != NIL)
+ ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate);
+
+ /* Process RETURNING if present */
+ if (resultRelInfo->ri_projectReturning)
+ return ExecProcessReturning(resultRelInfo, slot, planSlot);
+
+ return NULL;
+}
+
+/*
+ * ExecOnConflictUpdate --- execute UPDATE of INSERT ON CONFLICT DO UPDATE
+ *
+ * Try to lock tuple for update as part of speculative insertion. If
+ * a qual originating from ON CONFLICT DO UPDATE is satisfied, update
+ * (but still lock row, even though it may not satisfy estate's
+ * snapshot).
+ *
+ * Returns true if we're done (with or without an update), or false if
+ * the caller must retry the INSERT from scratch.
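+ *
+ * A hypothetical statement reaching this path:
+ *     INSERT INTO t (k, v) VALUES (1, 'x')
+ *       ON CONFLICT (k) DO UPDATE SET v = EXCLUDED.v
+ *       WHERE t.v IS DISTINCT FROM EXCLUDED.v;
+ * The WHERE clause becomes oc_WhereClause below, and the EXCLUDED
+ * pseudo-relation is supplied to it (and to the SET projection) via
+ * ecxt_innertuple.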
+ */
+static bool
+ExecOnConflictUpdate(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ ItemPointer conflictTid,
+ TupleTableSlot *planSlot,
+ TupleTableSlot *excludedSlot,
+ EState *estate,
+ bool canSetTag,
+ TupleTableSlot **returning)
+{
+ ExprContext *econtext = mtstate->ps.ps_ExprContext;
+ Relation relation = resultRelInfo->ri_RelationDesc;
+ ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause;
+ TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing;
+ TM_FailureData tmfd;
+ LockTupleMode lockmode;
+ TM_Result test;
+ Datum xminDatum;
+ TransactionId xmin;
+ bool isnull;
+
+ /* Determine lock mode to use */
+ lockmode = ExecUpdateLockMode(estate, resultRelInfo);
+
+ /*
+ * Lock tuple for update. Don't follow updates when tuple cannot be
+ * locked without doing so. A row locking conflict here means our
+ * previous conclusion that the tuple is conclusively committed is not
+ * true anymore.
+ */
+ test = table_tuple_lock(relation, conflictTid,
+ estate->es_snapshot,
+ existing, estate->es_output_cid,
+ lockmode, LockWaitBlock, 0,
+ &tmfd);
+ switch (test)
+ {
+ case TM_Ok:
+ /* success! */
+ break;
+
+ case TM_Invisible:
+
+ /*
+ * This can occur when a just-inserted tuple is updated again in
+ * the same command, e.g. because multiple rows with the same
+ * conflicting key values are inserted.
+ *
+ * This is somewhat similar to the ExecUpdate() TM_SelfModified
+ * case. We do not want to proceed because it would lead to the
+ * same row being updated a second time in some unspecified order,
+ * and in contrast to plain UPDATEs there's no historical behavior
+ * to break.
+ *
+ * It is the user's responsibility to prevent this situation from
+ * occurring. These problems are why SQL-2003 similarly specifies
+ * that for SQL MERGE, an exception must be raised in the event of
+ * an attempt to update the same row twice.
+ */
+ xminDatum = slot_getsysattr(existing,
+ MinTransactionIdAttributeNumber,
+ &isnull);
+ Assert(!isnull);
+ xmin = DatumGetTransactionId(xminDatum);
+
+ if (TransactionIdIsCurrentTransactionId(xmin))
+ ereport(ERROR,
+ (errcode(ERRCODE_CARDINALITY_VIOLATION),
+ errmsg("ON CONFLICT DO UPDATE command cannot affect row a second time"),
+ errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values.")));
+
+ /* This shouldn't happen */
+ elog(ERROR, "attempted to lock invisible tuple");
+ break;
+
+ case TM_SelfModified:
+
+ /*
+ * This state should never be reached. As a dirty snapshot is used
+ * to find conflicting tuples, speculative insertion wouldn't have
+ * seen this row to conflict with.
+ */
+ elog(ERROR, "unexpected self-updated tuple");
+ break;
+
+ case TM_Updated:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+
+ /*
+ * As long as we don't support an UPDATE of INSERT ON CONFLICT for
+ * a partitioned table, we shouldn't reach a case where the tuple to
+ * be locked has been moved to another partition due to a concurrent
+ * update of the partition key.
+ */
+ Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid));
+
+ /*
+ * Tell caller to try again from the very start.
+ *
+ * It does not make sense to use the usual EvalPlanQual() style
+ * loop here, as the new version of the row might not conflict
+ * anymore, or the conflicting tuple has actually been deleted.
+ */
+ ExecClearTuple(existing);
+ return false;
+
+ case TM_Deleted:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent delete")));
+
+ /* see TM_Updated case */
+ Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid));
+ ExecClearTuple(existing);
+ return false;
+
+ default:
+ elog(ERROR, "unrecognized table_tuple_lock status: %u", test);
+ }
+
+ /* Success, the tuple is locked. */
+
+ /*
+ * Verify that the tuple is visible to our MVCC snapshot if the current
+ * isolation level mandates that.
+ *
+ * It's not sufficient to rely on the check within ExecUpdate(), as e.g.
+ * the ON CONFLICT ... WHERE clause may prevent us from reaching that.
+ *
+ * This means we only ever continue when a new command in the current
+ * transaction could see the row, even though in READ COMMITTED mode the
+ * tuple will not be visible according to the current statement's
+ * snapshot. This is in line with the way UPDATE deals with newer tuple
+ * versions.
+ */
+ ExecCheckTupleVisible(estate, relation, existing);
+
+ /*
+ * Make tuple and any needed join variables available to ExecQual and
+ * ExecProject. The EXCLUDED tuple is installed in ecxt_innertuple, while
+ * the target's existing tuple is installed in the scantuple. EXCLUDED
+ * has been made to reference INNER_VAR in setrefs.c, but there is no
+ * other redirection.
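+	 *
+	 * For example, in a clause like SET col = EXCLUDED.col, the EXCLUDED.col
+	 * Var is evaluated from the excludedSlot installed just below, while
+	 * plain references to the target row's columns are evaluated from the
+	 * existing tuple in the scantuple.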
+ */
+ econtext->ecxt_scantuple = existing;
+ econtext->ecxt_innertuple = excludedSlot;
+ econtext->ecxt_outertuple = NULL;
+
+ if (!ExecQual(onConflictSetWhere, econtext))
+ {
+ ExecClearTuple(existing); /* see return below */
+ InstrCountFiltered1(&mtstate->ps, 1);
+ return true; /* done with the tuple */
+ }
+
+ if (resultRelInfo->ri_WithCheckOptions != NIL)
+ {
+ /*
+ * Check target's existing tuple against UPDATE-applicable USING
+ * security barrier quals (if any), enforced here as RLS checks/WCOs.
+ *
+ * The rewriter creates UPDATE RLS checks/WCOs for UPDATE security
+ * quals, and stores them as WCOs of "kind" WCO_RLS_CONFLICT_CHECK,
+ * but that's almost the extent of its special handling for ON
+ * CONFLICT DO UPDATE.
+ *
+ * The rewriter will also have associated UPDATE applicable straight
+ * RLS checks/WCOs for the benefit of the ExecUpdate() call that
+ * follows. INSERTs and UPDATEs naturally have mutually exclusive WCO
+ * kinds, so there is no danger of spurious over-enforcement in the
+ * INSERT or UPDATE path.
+ */
+ ExecWithCheckOptions(WCO_RLS_CONFLICT_CHECK, resultRelInfo,
+ existing,
+ mtstate->ps.state);
+ }
+
+ /* Project the new tuple version */
+ ExecProject(resultRelInfo->ri_onConflict->oc_ProjInfo);
+
+ /*
+ * Note that it is possible that the target tuple has been modified in
+ * this session, after the above table_tuple_lock. We choose to not error
+ * out in that case, in line with ExecUpdate's treatment of similar cases.
+ * This can happen if an UPDATE is triggered from within ExecQual(),
+ * ExecWithCheckOptions() or ExecProject() above, e.g. by selecting from a
+ * wCTE in the ON CONFLICT's SET.
+ */
+
+ /* Execute UPDATE with projection */
+ *returning = ExecUpdate(mtstate, resultRelInfo, conflictTid, NULL,
+ resultRelInfo->ri_onConflict->oc_ProjSlot,
+ planSlot,
+ &mtstate->mt_epqstate, mtstate->ps.state,
+ canSetTag);
+
+ /*
+ * Clear out existing tuple, as there might not be another conflict among
+ * the next input rows. Don't want to hold resources till the end of the
+ * query.
+ */
+ ExecClearTuple(existing);
+ return true;
+}
+
+
+/*
+ * Process BEFORE EACH STATEMENT triggers
+ */
+static void
+fireBSTriggers(ModifyTableState *node)
+{
+ ModifyTable *plan = (ModifyTable *) node->ps.plan;
+ ResultRelInfo *resultRelInfo = node->rootResultRelInfo;
+
+ switch (node->operation)
+ {
+ case CMD_INSERT:
+ ExecBSInsertTriggers(node->ps.state, resultRelInfo);
+ if (plan->onConflictAction == ONCONFLICT_UPDATE)
+ ExecBSUpdateTriggers(node->ps.state,
+ resultRelInfo);
+ break;
+ case CMD_UPDATE:
+ ExecBSUpdateTriggers(node->ps.state, resultRelInfo);
+ break;
+ case CMD_DELETE:
+ ExecBSDeleteTriggers(node->ps.state, resultRelInfo);
+ break;
+ default:
+ elog(ERROR, "unknown operation");
+ break;
+ }
+}
+
+/*
+ * Process AFTER EACH STATEMENT triggers
+ */
+static void
+fireASTriggers(ModifyTableState *node)
+{
+ ModifyTable *plan = (ModifyTable *) node->ps.plan;
+ ResultRelInfo *resultRelInfo = node->rootResultRelInfo;
+
+ switch (node->operation)
+ {
+ case CMD_INSERT:
+ if (plan->onConflictAction == ONCONFLICT_UPDATE)
+ ExecASUpdateTriggers(node->ps.state,
+ resultRelInfo,
+ node->mt_oc_transition_capture);
+ ExecASInsertTriggers(node->ps.state, resultRelInfo,
+ node->mt_transition_capture);
+ break;
+ case CMD_UPDATE:
+ ExecASUpdateTriggers(node->ps.state, resultRelInfo,
+ node->mt_transition_capture);
+ break;
+ case CMD_DELETE:
+ ExecASDeleteTriggers(node->ps.state, resultRelInfo,
+ node->mt_transition_capture);
+ break;
+ default:
+ elog(ERROR, "unknown operation");
+ break;
+ }
+}
+
+/*
+ * Set up the state needed for collecting transition tuples for AFTER
+ * triggers.
+ */
+static void
+ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
+{
+ ModifyTable *plan = (ModifyTable *) mtstate->ps.plan;
+ ResultRelInfo *targetRelInfo = mtstate->rootResultRelInfo;
+
+ /* Check for transition tables on the directly targeted relation. */
+ mtstate->mt_transition_capture =
+ MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc,
+ RelationGetRelid(targetRelInfo->ri_RelationDesc),
+ mtstate->operation);
+ if (plan->operation == CMD_INSERT &&
+ plan->onConflictAction == ONCONFLICT_UPDATE)
+ mtstate->mt_oc_transition_capture =
+ MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc,
+ RelationGetRelid(targetRelInfo->ri_RelationDesc),
+ CMD_UPDATE);
+}
+
+/*
+ * ExecPrepareTupleRouting --- prepare for routing one tuple
+ *
+ * Determine the partition in which the tuple in slot is to be inserted,
+ * and return its ResultRelInfo in *partRelInfo. The return value is
+ * a slot holding the tuple of the partition rowtype.
+ *
+ * This also sets the transition table information in mtstate based on the
+ * selected partition.
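+ *
+ * For example (hypothetical), a row inserted through a partitioned parent
+ * "orders" is routed to whichever leaf partition its partition key maps to,
+ * and the returned slot holds the row converted to that leaf's rowtype when
+ * the leaf's column layout differs from the parent's.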
+ */
+static TupleTableSlot *
+ExecPrepareTupleRouting(ModifyTableState *mtstate,
+ EState *estate,
+ PartitionTupleRouting *proute,
+ ResultRelInfo *targetRelInfo,
+ TupleTableSlot *slot,
+ ResultRelInfo **partRelInfo)
+{
+ ResultRelInfo *partrel;
+ TupleConversionMap *map;
+
+ /*
+ * Lookup the target partition's ResultRelInfo. If ExecFindPartition does
+ * not find a valid partition for the tuple in 'slot' then an error is
+ * raised. An error may also be raised if the found partition is not a
+ * valid target for INSERTs. This is required since a partitioned table
+ * UPDATE to another partition becomes a DELETE+INSERT.
+ */
+ partrel = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate);
+
+ /*
+	 * If we're capturing transition tuples, we might need to convert from the
+	 * partition rowtype to the root partitioned table's rowtype. But if there
+ * are no BEFORE triggers on the partition that could change the tuple, we
+ * can just remember the original unconverted tuple to avoid a needless
+ * round trip conversion.
+ */
+ if (mtstate->mt_transition_capture != NULL)
+ {
+ bool has_before_insert_row_trig;
+
+ has_before_insert_row_trig = (partrel->ri_TrigDesc &&
+ partrel->ri_TrigDesc->trig_insert_before_row);
+
+ mtstate->mt_transition_capture->tcs_original_insert_tuple =
+ !has_before_insert_row_trig ? slot : NULL;
+ }
+
+ /*
+ * Convert the tuple, if necessary.
+ */
+ map = partrel->ri_RootToPartitionMap;
+ if (map != NULL)
+ {
+ TupleTableSlot *new_slot = partrel->ri_PartitionTupleSlot;
+
+ slot = execute_attr_map_slot(map->attrMap, slot, new_slot);
+ }
+
+ *partRelInfo = partrel;
+ return slot;
+}
+
+/* ----------------------------------------------------------------
+ * ExecModifyTable
+ *
+ * Perform table modifications as required, and return RETURNING results
+ * if needed.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecModifyTable(PlanState *pstate)
+{
+ ModifyTableState *node = castNode(ModifyTableState, pstate);
+ EState *estate = node->ps.state;
+ CmdType operation = node->operation;
+ ResultRelInfo *resultRelInfo;
+ PlanState *subplanstate;
+ TupleTableSlot *slot;
+ TupleTableSlot *planSlot;
+ TupleTableSlot *oldSlot;
+ ItemPointer tupleid;
+ ItemPointerData tuple_ctid;
+ HeapTupleData oldtupdata;
+ HeapTuple oldtuple;
+ PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
+ List *relinfos = NIL;
+ ListCell *lc;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * This should NOT get called during EvalPlanQual; we should have passed a
+ * subplan tree to EvalPlanQual, instead. Use a runtime test not just
+ * Assert because this condition is easy to miss in testing. (Note:
+ * although ModifyTable should not get executed within an EvalPlanQual
+ * operation, we do have to allow it to be initialized and shut down in
+ * case it is within a CTE subplan. Hence this test must be here, not in
+ * ExecInitModifyTable.)
+ */
+ if (estate->es_epq_active != NULL)
+ elog(ERROR, "ModifyTable should not be called during EvalPlanQual");
+
+ /*
+ * If we've already completed processing, don't try to do more. We need
+ * this test because ExecPostprocessPlan might call us an extra time, and
+ * our subplan's nodes aren't necessarily robust against being called
+ * extra times.
+ */
+ if (node->mt_done)
+ return NULL;
+
+ /*
+ * On first call, fire BEFORE STATEMENT triggers before proceeding.
+ */
+ if (node->fireBSTriggers)
+ {
+ fireBSTriggers(node);
+ node->fireBSTriggers = false;
+ }
+
+ /* Preload local variables */
+ resultRelInfo = node->resultRelInfo + node->mt_lastResultIndex;
+ subplanstate = outerPlanState(node);
+
+ /*
+ * Fetch rows from subplan, and execute the required table modification
+ * for each row.
+ */
+ for (;;)
+ {
+ /*
+ * Reset the per-output-tuple exprcontext. This is needed because
+ * triggers expect to use that context as workspace. It's a bit ugly
+ * to do this below the top level of the plan, however. We might need
+ * to rethink this later.
+ */
+ ResetPerTupleExprContext(estate);
+
+ /*
+ * Reset per-tuple memory context used for processing on conflict and
+ * returning clauses, to free any expression evaluation storage
+ * allocated in the previous cycle.
+ */
+ if (pstate->ps_ExprContext)
+ ResetExprContext(pstate->ps_ExprContext);
+
+ planSlot = ExecProcNode(subplanstate);
+
+ /* No more tuples to process? */
+ if (TupIsNull(planSlot))
+ break;
+
+ /*
+ * When there are multiple result relations, each tuple contains a
+ * junk column that gives the OID of the rel from which it came.
+ * Extract it and select the correct result relation.
+ */
+ if (AttributeNumberIsValid(node->mt_resultOidAttno))
+ {
+ Datum datum;
+ bool isNull;
+ Oid resultoid;
+
+ datum = ExecGetJunkAttribute(planSlot, node->mt_resultOidAttno,
+ &isNull);
+ if (isNull)
+ elog(ERROR, "tableoid is NULL");
+ resultoid = DatumGetObjectId(datum);
+
+ /* If it's not the same as last time, we need to locate the rel */
+ if (resultoid != node->mt_lastResultOid)
+ resultRelInfo = ExecLookupResultRelByOid(node, resultoid,
+ false, true);
+ }
+
+ /*
+ * If resultRelInfo->ri_usesFdwDirectModify is true, all we need to do
+ * here is compute the RETURNING expressions.
+ */
+ if (resultRelInfo->ri_usesFdwDirectModify)
+ {
+ Assert(resultRelInfo->ri_projectReturning);
+
+ /*
+ * A scan slot containing the data that was actually inserted,
+ * updated or deleted has already been made available to
+ * ExecProcessReturning by IterateDirectModify, so no need to
+ * provide it here.
+ */
+ slot = ExecProcessReturning(resultRelInfo, NULL, planSlot);
+
+ return slot;
+ }
+
+ EvalPlanQualSetSlot(&node->mt_epqstate, planSlot);
+ slot = planSlot;
+
+ tupleid = NULL;
+ oldtuple = NULL;
+
+ /*
+ * For UPDATE/DELETE, fetch the row identity info for the tuple to be
+ * updated/deleted. For a heap relation, that's a TID; otherwise we
+ * may have a wholerow junk attr that carries the old tuple in toto.
+ * Keep this in step with the part of ExecInitModifyTable that sets up
+ * ri_RowIdAttNo.
+ */
+ if (operation == CMD_UPDATE || operation == CMD_DELETE)
+ {
+ char relkind;
+ Datum datum;
+ bool isNull;
+
+ relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind;
+ if (relkind == RELKIND_RELATION ||
+ relkind == RELKIND_MATVIEW ||
+ relkind == RELKIND_PARTITIONED_TABLE)
+ {
+ /* ri_RowIdAttNo refers to a ctid attribute */
+ Assert(AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo));
+ datum = ExecGetJunkAttribute(slot,
+ resultRelInfo->ri_RowIdAttNo,
+ &isNull);
+ /* shouldn't ever get a null result... */
+ if (isNull)
+ elog(ERROR, "ctid is NULL");
+
+ tupleid = (ItemPointer) DatumGetPointer(datum);
+ tuple_ctid = *tupleid; /* be sure we don't free ctid!! */
+ tupleid = &tuple_ctid;
+ }
+
+ /*
+ * Use the wholerow attribute, when available, to reconstruct the
+ * old relation tuple. The old tuple serves one or both of two
+ * purposes: 1) it serves as the OLD tuple for row triggers, 2) it
+ * provides values for any unchanged columns for the NEW tuple of
+ * an UPDATE, because the subplan does not produce all the columns
+ * of the target table.
+ *
+ * Note that the wholerow attribute does not carry system columns,
+ * so foreign table triggers miss seeing those, except that we
+ * know enough here to set t_tableOid. Quite separately from
+ * this, the FDW may fetch its own junk attrs to identify the row.
+ *
+ * Other relevant relkinds, currently limited to views, always
+ * have a wholerow attribute.
+ */
+ else if (AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo))
+ {
+ datum = ExecGetJunkAttribute(slot,
+ resultRelInfo->ri_RowIdAttNo,
+ &isNull);
+ /* shouldn't ever get a null result... */
+ if (isNull)
+ elog(ERROR, "wholerow is NULL");
+
+ oldtupdata.t_data = DatumGetHeapTupleHeader(datum);
+ oldtupdata.t_len =
+ HeapTupleHeaderGetDatumLength(oldtupdata.t_data);
+ ItemPointerSetInvalid(&(oldtupdata.t_self));
+ /* Historically, view triggers see invalid t_tableOid. */
+ oldtupdata.t_tableOid =
+ (relkind == RELKIND_VIEW) ? InvalidOid :
+ RelationGetRelid(resultRelInfo->ri_RelationDesc);
+
+ oldtuple = &oldtupdata;
+ }
+ else
+ {
+ /* Only foreign tables are allowed to omit a row-ID attr */
+ Assert(relkind == RELKIND_FOREIGN_TABLE);
+ }
+ }
+
+ switch (operation)
+ {
+ case CMD_INSERT:
+ /* Initialize projection info if first time for this table */
+ if (unlikely(!resultRelInfo->ri_projectNewInfoValid))
+ ExecInitInsertProjection(node, resultRelInfo);
+ slot = ExecGetInsertNewTuple(resultRelInfo, planSlot);
+ slot = ExecInsert(node, resultRelInfo, slot, planSlot,
+ estate, node->canSetTag);
+ break;
+ case CMD_UPDATE:
+ /* Initialize projection info if first time for this table */
+ if (unlikely(!resultRelInfo->ri_projectNewInfoValid))
+ ExecInitUpdateProjection(node, resultRelInfo);
+
+ /*
+ * Make the new tuple by combining plan's output tuple with
+ * the old tuple being updated.
+ */
+ oldSlot = resultRelInfo->ri_oldTupleSlot;
+ if (oldtuple != NULL)
+ {
+ /* Use the wholerow junk attr as the old tuple. */
+ ExecForceStoreHeapTuple(oldtuple, oldSlot, false);
+ }
+ else
+ {
+ /* Fetch the most recent version of old tuple. */
+ Relation relation = resultRelInfo->ri_RelationDesc;
+
+ Assert(tupleid != NULL);
+ if (!table_tuple_fetch_row_version(relation, tupleid,
+ SnapshotAny,
+ oldSlot))
+ elog(ERROR, "failed to fetch tuple being updated");
+ }
+ slot = ExecGetUpdateNewTuple(resultRelInfo, planSlot,
+ oldSlot);
+
+ /* Now apply the update. */
+ slot = ExecUpdate(node, resultRelInfo, tupleid, oldtuple, slot,
+ planSlot, &node->mt_epqstate, estate,
+ node->canSetTag);
+ break;
+ case CMD_DELETE:
+ slot = ExecDelete(node, resultRelInfo, tupleid, oldtuple,
+ planSlot, &node->mt_epqstate, estate,
+ true, /* processReturning */
+ node->canSetTag,
+ false, /* changingPart */
+ NULL, NULL);
+ break;
+ default:
+ elog(ERROR, "unknown operation");
+ break;
+ }
+
+ /*
+ * If we got a RETURNING result, return it to caller. We'll continue
+ * the work on next call.
+ */
+ if (slot)
+ return slot;
+ }
+
+ /*
+ * Insert remaining tuples for batch insert.
+ */
+ if (proute)
+ relinfos = estate->es_tuple_routing_result_relations;
+ else
+ relinfos = estate->es_opened_result_relations;
+
+ foreach(lc, relinfos)
+ {
+ resultRelInfo = lfirst(lc);
+ if (resultRelInfo->ri_NumSlots > 0)
+ ExecBatchInsert(node, resultRelInfo,
+ resultRelInfo->ri_Slots,
+ resultRelInfo->ri_PlanSlots,
+ resultRelInfo->ri_NumSlots,
+ estate, node->canSetTag);
+ }
+
+ /*
+ * We're done, but fire AFTER STATEMENT triggers before exiting.
+ */
+ fireASTriggers(node);
+
+ node->mt_done = true;
+
+ return NULL;
+}
+
+/*
+ * ExecLookupResultRelByOid
+ * If the table with given OID is among the result relations to be
+ * updated by the given ModifyTable node, return its ResultRelInfo.
+ *
+ * If not found, return NULL if missing_ok, else raise error.
+ *
+ * If update_cache is true, then upon successful lookup, update the node's
+ * one-element cache. ONLY ExecModifyTable may pass true for this.
+ */
+ResultRelInfo *
+ExecLookupResultRelByOid(ModifyTableState *node, Oid resultoid,
+ bool missing_ok, bool update_cache)
+{
+ if (node->mt_resultOidHash)
+ {
+ /* Use the pre-built hash table to locate the rel */
+ MTTargetRelLookup *mtlookup;
+
+ mtlookup = (MTTargetRelLookup *)
+ hash_search(node->mt_resultOidHash, &resultoid, HASH_FIND, NULL);
+ if (mtlookup)
+ {
+ if (update_cache)
+ {
+ node->mt_lastResultOid = resultoid;
+ node->mt_lastResultIndex = mtlookup->relationIndex;
+ }
+ return node->resultRelInfo + mtlookup->relationIndex;
+ }
+ }
+ else
+ {
+ /* With few target rels, just search the ResultRelInfo array */
+ for (int ndx = 0; ndx < node->mt_nrels; ndx++)
+ {
+ ResultRelInfo *rInfo = node->resultRelInfo + ndx;
+
+ if (RelationGetRelid(rInfo->ri_RelationDesc) == resultoid)
+ {
+ if (update_cache)
+ {
+ node->mt_lastResultOid = resultoid;
+ node->mt_lastResultIndex = ndx;
+ }
+ return rInfo;
+ }
+ }
+ }
+
+ if (!missing_ok)
+ elog(ERROR, "incorrect result relation OID %u", resultoid);
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitModifyTable
+ * ----------------------------------------------------------------
+ */
+ModifyTableState *
+ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
+{
+ ModifyTableState *mtstate;
+ Plan *subplan = outerPlan(node);
+ CmdType operation = node->operation;
+ int nrels = list_length(node->resultRelations);
+ ResultRelInfo *resultRelInfo;
+ List *arowmarks;
+ ListCell *l;
+ int i;
+ Relation rel;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ mtstate = makeNode(ModifyTableState);
+ mtstate->ps.plan = (Plan *) node;
+ mtstate->ps.state = estate;
+ mtstate->ps.ExecProcNode = ExecModifyTable;
+
+ mtstate->operation = operation;
+ mtstate->canSetTag = node->canSetTag;
+ mtstate->mt_done = false;
+
+ mtstate->mt_nrels = nrels;
+ mtstate->resultRelInfo = (ResultRelInfo *)
+ palloc(nrels * sizeof(ResultRelInfo));
+
+ /*----------
+ * Resolve the target relation. This is the same as:
+ *
+ * - the relation for which we will fire FOR STATEMENT triggers,
+ * - the relation into whose tuple format all captured transition tuples
+ * must be converted, and
+ * - the root partitioned table used for tuple routing.
+ *
+ * If it's a partitioned table, the root partition doesn't appear
+ * elsewhere in the plan and its RT index is given explicitly in
+ * node->rootRelation. Otherwise (i.e. table inheritance) the target
+ * relation is the first relation in the node->resultRelations list.
+ *----------
+ */
+ if (node->rootRelation > 0)
+ {
+ mtstate->rootResultRelInfo = makeNode(ResultRelInfo);
+ ExecInitResultRelation(estate, mtstate->rootResultRelInfo,
+ node->rootRelation);
+ }
+ else
+ {
+ mtstate->rootResultRelInfo = mtstate->resultRelInfo;
+ ExecInitResultRelation(estate, mtstate->resultRelInfo,
+ linitial_int(node->resultRelations));
+ }
+
+ /* set up epqstate with dummy subplan data for the moment */
+ EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);
+ mtstate->fireBSTriggers = true;
+
+ /*
+ * Build state for collecting transition tuples. This requires having a
+ * valid trigger query context, so skip it in explain-only mode.
+ */
+ if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ ExecSetupTransitionCaptureState(mtstate, estate);
+
+ /*
+ * Open all the result relations and initialize the ResultRelInfo structs.
+ * (But root relation was initialized above, if it's part of the array.)
+ * We must do this before initializing the subplan, because direct-modify
+ * FDWs expect their ResultRelInfos to be available.
+ */
+ resultRelInfo = mtstate->resultRelInfo;
+ i = 0;
+ foreach(l, node->resultRelations)
+ {
+ Index resultRelation = lfirst_int(l);
+
+ if (resultRelInfo != mtstate->rootResultRelInfo)
+ {
+ ExecInitResultRelation(estate, resultRelInfo, resultRelation);
+
+ /*
+ * For child result relations, store the root result relation
+ * pointer. We do so for the convenience of places that want to
+ * look at the query's original target relation but don't have the
+ * mtstate handy.
+ */
+ resultRelInfo->ri_RootResultRelInfo = mtstate->rootResultRelInfo;
+ }
+
+ /* Initialize the usesFdwDirectModify flag */
+ resultRelInfo->ri_usesFdwDirectModify = bms_is_member(i,
+ node->fdwDirectModifyPlans);
+
+ /*
+ * Verify result relation is a valid target for the current operation
+ */
+ CheckValidResultRel(resultRelInfo, operation);
+
+ resultRelInfo++;
+ i++;
+ }
+
+ /*
+ * Now we may initialize the subplan.
+ */
+ outerPlanState(mtstate) = ExecInitNode(subplan, estate, eflags);
+
+ /*
+ * Do additional per-result-relation initialization.
+ */
+ for (i = 0; i < nrels; i++)
+ {
+ resultRelInfo = &mtstate->resultRelInfo[i];
+
+ /* Let FDWs init themselves for foreign-table result rels */
+ if (!resultRelInfo->ri_usesFdwDirectModify &&
+ resultRelInfo->ri_FdwRoutine != NULL &&
+ resultRelInfo->ri_FdwRoutine->BeginForeignModify != NULL)
+ {
+ List *fdw_private = (List *) list_nth(node->fdwPrivLists, i);
+
+ resultRelInfo->ri_FdwRoutine->BeginForeignModify(mtstate,
+ resultRelInfo,
+ fdw_private,
+ i,
+ eflags);
+ }
+
+ /*
+ * For UPDATE/DELETE, find the appropriate junk attr now, either a
+ * 'ctid' or 'wholerow' attribute depending on relkind. For foreign
+ * tables, the FDW might have created additional junk attr(s), but
+ * those are no concern of ours.
+ */
+ if (operation == CMD_UPDATE || operation == CMD_DELETE)
+ {
+ char relkind;
+
+ relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind;
+ if (relkind == RELKIND_RELATION ||
+ relkind == RELKIND_MATVIEW ||
+ relkind == RELKIND_PARTITIONED_TABLE)
+ {
+ resultRelInfo->ri_RowIdAttNo =
+ ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid");
+ if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo))
+ elog(ERROR, "could not find junk ctid column");
+ }
+ else if (relkind == RELKIND_FOREIGN_TABLE)
+ {
+ /*
+ * When there is a row-level trigger, there should be a
+ * wholerow attribute. We also require it to be present in
+ * UPDATE, so we can get the values of unchanged columns.
+ */
+ resultRelInfo->ri_RowIdAttNo =
+ ExecFindJunkAttributeInTlist(subplan->targetlist,
+ "wholerow");
+ if (mtstate->operation == CMD_UPDATE &&
+ !AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo))
+ elog(ERROR, "could not find junk wholerow column");
+ }
+ else
+ {
+ /* Other valid target relkinds must provide wholerow */
+ resultRelInfo->ri_RowIdAttNo =
+ ExecFindJunkAttributeInTlist(subplan->targetlist,
+ "wholerow");
+ if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo))
+ elog(ERROR, "could not find junk wholerow column");
+ }
+ }
+ }
+
+ /*
+ * If this is an inherited update/delete, there will be a junk attribute
+ * named "tableoid" present in the subplan's targetlist. It will be used
+ * to identify the result relation for a given tuple to be
+ * updated/deleted.
+ */
+ mtstate->mt_resultOidAttno =
+ ExecFindJunkAttributeInTlist(subplan->targetlist, "tableoid");
+ Assert(AttributeNumberIsValid(mtstate->mt_resultOidAttno) || nrels == 1);
+ mtstate->mt_lastResultOid = InvalidOid; /* force lookup at first tuple */
+ mtstate->mt_lastResultIndex = 0; /* must be zero if no such attr */
+
+ /* Get the root target relation */
+ rel = mtstate->rootResultRelInfo->ri_RelationDesc;
+
+ /*
+ * Build state for tuple routing if it's a partitioned INSERT. An UPDATE
+ * might need this too, but only if it actually moves tuples between
+ * partitions; in that case setup is done by ExecCrossPartitionUpdate.
+ */
+ if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
+ operation == CMD_INSERT)
+ mtstate->mt_partition_tuple_routing =
+ ExecSetupPartitionTupleRouting(estate, rel);
+
+ /*
+ * Initialize any WITH CHECK OPTION constraints if needed.
+ */
+ resultRelInfo = mtstate->resultRelInfo;
+ foreach(l, node->withCheckOptionLists)
+ {
+ List *wcoList = (List *) lfirst(l);
+ List *wcoExprs = NIL;
+ ListCell *ll;
+
+ foreach(ll, wcoList)
+ {
+ WithCheckOption *wco = (WithCheckOption *) lfirst(ll);
+ ExprState *wcoExpr = ExecInitQual((List *) wco->qual,
+ &mtstate->ps);
+
+ wcoExprs = lappend(wcoExprs, wcoExpr);
+ }
+
+ resultRelInfo->ri_WithCheckOptions = wcoList;
+ resultRelInfo->ri_WithCheckOptionExprs = wcoExprs;
+ resultRelInfo++;
+ }
+
+ /*
+ * Initialize RETURNING projections if needed.
+ */
+ if (node->returningLists)
+ {
+ TupleTableSlot *slot;
+ ExprContext *econtext;
+
+ /*
+ * Initialize result tuple slot and assign its rowtype using the first
+ * RETURNING list. We assume the rest will look the same.
+ */
+ mtstate->ps.plan->targetlist = (List *) linitial(node->returningLists);
+
+ /* Set up a slot for the output of the RETURNING projection(s) */
+ ExecInitResultTupleSlotTL(&mtstate->ps, &TTSOpsVirtual);
+ slot = mtstate->ps.ps_ResultTupleSlot;
+
+ /* Need an econtext too */
+ if (mtstate->ps.ps_ExprContext == NULL)
+ ExecAssignExprContext(estate, &mtstate->ps);
+ econtext = mtstate->ps.ps_ExprContext;
+
+ /*
+ * Build a projection for each result rel.
+ */
+ resultRelInfo = mtstate->resultRelInfo;
+ foreach(l, node->returningLists)
+ {
+ List *rlist = (List *) lfirst(l);
+
+ resultRelInfo->ri_returningList = rlist;
+ resultRelInfo->ri_projectReturning =
+ ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps,
+ resultRelInfo->ri_RelationDesc->rd_att);
+ resultRelInfo++;
+ }
+ }
+ else
+ {
+ /*
+ * We still must construct a dummy result tuple type, because InitPlan
+ * expects one (maybe should change that?).
+ */
+ mtstate->ps.plan->targetlist = NIL;
+ ExecInitResultTypeTL(&mtstate->ps);
+
+ mtstate->ps.ps_ExprContext = NULL;
+ }
+
+ /* Set the list of arbiter indexes if needed for ON CONFLICT */
+ resultRelInfo = mtstate->resultRelInfo;
+ if (node->onConflictAction != ONCONFLICT_NONE)
+ {
+ /* insert may only have one relation, inheritance is not expanded */
+ Assert(nrels == 1);
+ resultRelInfo->ri_onConflictArbiterIndexes = node->arbiterIndexes;
+ }
+
+ /*
+	 * If needed, initialize the target list, projection and qual for ON
+	 * CONFLICT DO UPDATE.
+ */
+ if (node->onConflictAction == ONCONFLICT_UPDATE)
+ {
+ OnConflictSetState *onconfl = makeNode(OnConflictSetState);
+ ExprContext *econtext;
+ TupleDesc relationDesc;
+
+ /* already exists if created by RETURNING processing above */
+ if (mtstate->ps.ps_ExprContext == NULL)
+ ExecAssignExprContext(estate, &mtstate->ps);
+
+ econtext = mtstate->ps.ps_ExprContext;
+ relationDesc = resultRelInfo->ri_RelationDesc->rd_att;
+
+ /* create state for DO UPDATE SET operation */
+ resultRelInfo->ri_onConflict = onconfl;
+
+ /* initialize slot for the existing tuple */
+ onconfl->oc_Existing =
+ table_slot_create(resultRelInfo->ri_RelationDesc,
+ &mtstate->ps.state->es_tupleTable);
+
+ /*
+ * Create the tuple slot for the UPDATE SET projection. We want a slot
+ * of the table's type here, because the slot will be used to insert
+ * into the table, and for RETURNING processing - which may access
+ * system attributes.
+ */
+ onconfl->oc_ProjSlot =
+ table_slot_create(resultRelInfo->ri_RelationDesc,
+ &mtstate->ps.state->es_tupleTable);
+
+ /* build UPDATE SET projection state */
+ onconfl->oc_ProjInfo =
+ ExecBuildUpdateProjection(node->onConflictSet,
+ true,
+ node->onConflictCols,
+ relationDesc,
+ econtext,
+ onconfl->oc_ProjSlot,
+ &mtstate->ps);
+
+ /* initialize state to evaluate the WHERE clause, if any */
+ if (node->onConflictWhere)
+ {
+ ExprState *qualexpr;
+
+ qualexpr = ExecInitQual((List *) node->onConflictWhere,
+ &mtstate->ps);
+ onconfl->oc_WhereClause = qualexpr;
+ }
+ }
+
+ /*
+ * If we have any secondary relations in an UPDATE or DELETE, they need to
+ * be treated like non-locked relations in SELECT FOR UPDATE, ie, the
+ * EvalPlanQual mechanism needs to be told about them. Locate the
+ * relevant ExecRowMarks.
+ */
+ arowmarks = NIL;
+ foreach(l, node->rowMarks)
+ {
+ PlanRowMark *rc = lfirst_node(PlanRowMark, l);
+ ExecRowMark *erm;
+ ExecAuxRowMark *aerm;
+
+ /* ignore "parent" rowmarks; they are irrelevant at runtime */
+ if (rc->isParent)
+ continue;
+
+ /* Find ExecRowMark and build ExecAuxRowMark */
+ erm = ExecFindRowMark(estate, rc->rti, false);
+ aerm = ExecBuildAuxRowMark(erm, subplan->targetlist);
+ arowmarks = lappend(arowmarks, aerm);
+ }
+
+ EvalPlanQualSetPlan(&mtstate->mt_epqstate, subplan, arowmarks);
+
+ /*
+ * If there are a lot of result relations, use a hash table to speed the
+ * lookups. If there are not a lot, a simple linear search is faster.
+ *
+ * It's not clear where the threshold is, but try 64 for starters. In a
+ * debugging build, use a small threshold so that we get some test
+ * coverage of both code paths.
+ */
+#ifdef USE_ASSERT_CHECKING
+#define MT_NRELS_HASH 4
+#else
+#define MT_NRELS_HASH 64
+#endif
+ if (nrels >= MT_NRELS_HASH)
+ {
+ HASHCTL hash_ctl;
+
+ hash_ctl.keysize = sizeof(Oid);
+ hash_ctl.entrysize = sizeof(MTTargetRelLookup);
+ hash_ctl.hcxt = CurrentMemoryContext;
+ mtstate->mt_resultOidHash =
+ hash_create("ModifyTable target hash",
+ nrels, &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ for (i = 0; i < nrels; i++)
+ {
+ Oid hashkey;
+ MTTargetRelLookup *mtlookup;
+ bool found;
+
+ resultRelInfo = &mtstate->resultRelInfo[i];
+ hashkey = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+ mtlookup = (MTTargetRelLookup *)
+ hash_search(mtstate->mt_resultOidHash, &hashkey,
+ HASH_ENTER, &found);
+ Assert(!found);
+ mtlookup->relationIndex = i;
+ }
+ }
+ else
+ mtstate->mt_resultOidHash = NULL;
+
+ /*
+ * Determine if the FDW supports batch insert and determine the batch size
+ * (a FDW may support batching, but it may be disabled for the
+ * server/table).
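+	 * postgres_fdw, for example, exposes this through its batch_size
+	 * server/table option.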
+ *
+ * We only do this for INSERT, so that for UPDATE/DELETE the batch size
+ * remains set to 0.
+ */
+ if (operation == CMD_INSERT)
+ {
+ /* insert may only have one relation, inheritance is not expanded */
+ Assert(nrels == 1);
+ resultRelInfo = mtstate->resultRelInfo;
+ if (!resultRelInfo->ri_usesFdwDirectModify &&
+ resultRelInfo->ri_FdwRoutine != NULL &&
+ resultRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
+ resultRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
+ {
+ resultRelInfo->ri_BatchSize =
+ resultRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(resultRelInfo);
+ Assert(resultRelInfo->ri_BatchSize >= 1);
+ }
+ else
+ resultRelInfo->ri_BatchSize = 1;
+ }
+
+ /*
+ * Lastly, if this is not the primary (canSetTag) ModifyTable node, add it
+ * to estate->es_auxmodifytables so that it will be run to completion by
+ * ExecPostprocessPlan. (It'd actually work fine to add the primary
+ * ModifyTable node too, but there's no need.) Note the use of lcons not
+ * lappend: we need later-initialized ModifyTable nodes to be shut down
+ * before earlier ones. This ensures that we don't throw away RETURNING
+ * rows that need to be seen by a later CTE subplan.
+ */
+ if (!mtstate->canSetTag)
+ estate->es_auxmodifytables = lcons(mtstate,
+ estate->es_auxmodifytables);
+
+ return mtstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndModifyTable
+ *
+ * Shuts down the plan.
+ *
+ * Returns nothing of interest.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndModifyTable(ModifyTableState *node)
+{
+ int i;
+
+ /*
+ * Allow any FDWs to shut down
+ */
+ for (i = 0; i < node->mt_nrels; i++)
+ {
+ int j;
+ ResultRelInfo *resultRelInfo = node->resultRelInfo + i;
+
+ if (!resultRelInfo->ri_usesFdwDirectModify &&
+ resultRelInfo->ri_FdwRoutine != NULL &&
+ resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL)
+ resultRelInfo->ri_FdwRoutine->EndForeignModify(node->ps.state,
+ resultRelInfo);
+
+ /*
+ * Cleanup the initialized batch slots. This only matters for FDWs
+ * with batching, but the other cases will have ri_NumSlotsInitialized
+ * == 0.
+ */
+ for (j = 0; j < resultRelInfo->ri_NumSlotsInitialized; j++)
+ {
+ ExecDropSingleTupleTableSlot(resultRelInfo->ri_Slots[j]);
+ ExecDropSingleTupleTableSlot(resultRelInfo->ri_PlanSlots[j]);
+ }
+ }
+
+ /*
+ * Close all the partitioned tables, leaf partitions, and their indices
+ * and release the slot used for tuple routing, if set.
+ */
+ if (node->mt_partition_tuple_routing)
+ {
+ ExecCleanupTupleRouting(node, node->mt_partition_tuple_routing);
+
+ if (node->mt_root_tuple_slot)
+ ExecDropSingleTupleTableSlot(node->mt_root_tuple_slot);
+ }
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ /*
+ * Terminate EPQ execution if active
+ */
+ EvalPlanQualEnd(&node->mt_epqstate);
+
+ /*
+ * shut down subplan
+ */
+ ExecEndNode(outerPlanState(node));
+}
+
+void
+ExecReScanModifyTable(ModifyTableState *node)
+{
+ /*
+ * Currently, we don't need to support rescan on ModifyTable nodes. The
+ * semantics of that would be a bit debatable anyway.
+ */
+ elog(ERROR, "ExecReScanModifyTable is not implemented");
+}
diff --git a/src/backend/executor/nodeNamedtuplestorescan.c b/src/backend/executor/nodeNamedtuplestorescan.c
new file mode 100644
index 0000000..c0d1069
--- /dev/null
+++ b/src/backend/executor/nodeNamedtuplestorescan.c
@@ -0,0 +1,201 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeNamedtuplestorescan.c
+ * routines to handle NamedTuplestoreScan nodes.
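+ *
+ * Named tuplestores are ephemeral named relations registered in the query
+ * environment; most commonly these are the transition tables that AFTER
+ * triggers declare with REFERENCING OLD/NEW TABLE AS.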
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeNamedtuplestorescan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeNamedtuplestorescan.h"
+#include "miscadmin.h"
+#include "utils/queryenvironment.h"
+
+static TupleTableSlot *NamedTuplestoreScanNext(NamedTuplestoreScanState *node);
+
+/* ----------------------------------------------------------------
+ * NamedTuplestoreScanNext
+ *
+ * This is a workhorse for ExecNamedTuplestoreScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+NamedTuplestoreScanNext(NamedTuplestoreScanState *node)
+{
+ TupleTableSlot *slot;
+
+ /* We intentionally do not support backward scan. */
+ Assert(ScanDirectionIsForward(node->ss.ps.state->es_direction));
+
+ /*
+ * Get the next tuple from tuplestore. Return NULL if no more tuples.
+ */
+ slot = node->ss.ss_ScanTupleSlot;
+ tuplestore_select_read_pointer(node->relation, node->readptr);
+ (void) tuplestore_gettupleslot(node->relation, true, false, slot);
+ return slot;
+}
+
+/*
+ * NamedTuplestoreScanRecheck -- access method routine to recheck a tuple in
+ * EvalPlanQual
+ */
+static bool
+NamedTuplestoreScanRecheck(NamedTuplestoreScanState *node, TupleTableSlot *slot)
+{
+ /* nothing to check */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecNamedTuplestoreScan(node)
+ *
+ *		Scans the named tuplestore sequentially and returns the next
+ *		qualifying tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecNamedTuplestoreScan(PlanState *pstate)
+{
+ NamedTuplestoreScanState *node = castNode(NamedTuplestoreScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) NamedTuplestoreScanNext,
+ (ExecScanRecheckMtd) NamedTuplestoreScanRecheck);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecInitNamedTuplestoreScan
+ * ----------------------------------------------------------------
+ */
+NamedTuplestoreScanState *
+ExecInitNamedTuplestoreScan(NamedTuplestoreScan *node, EState *estate, int eflags)
+{
+ NamedTuplestoreScanState *scanstate;
+ EphemeralNamedRelation enr;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * NamedTuplestoreScan should not have any children.
+ */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create new NamedTuplestoreScanState for node
+ */
+ scanstate = makeNode(NamedTuplestoreScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecNamedTuplestoreScan;
+
+ enr = get_ENR(estate->es_queryEnv, node->enrname);
+ if (!enr)
+ elog(ERROR, "executor could not find named tuplestore \"%s\"",
+ node->enrname);
+
+ Assert(enr->reldata);
+ scanstate->relation = (Tuplestorestate *) enr->reldata;
+ scanstate->tupdesc = ENRMetadataGetTupDesc(&(enr->md));
+ scanstate->readptr =
+ tuplestore_alloc_read_pointer(scanstate->relation, EXEC_FLAG_REWIND);
+
+ /*
+ * The new read pointer copies its position from read pointer 0, which
+ * could be anywhere, so explicitly rewind it.
+ */
+ tuplestore_select_read_pointer(scanstate->relation, scanstate->readptr);
+ tuplestore_rescan(scanstate->relation);
+
+ /*
+ * XXX: Should we add a function to free that read pointer when done?
+ *
+ * This was attempted, but it did not improve performance or memory usage
+ * in any tested cases.
+ */
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * The scan tuple type is specified for the tuplestore.
+ */
+ ExecInitScanTupleSlot(estate, &scanstate->ss, scanstate->tupdesc,
+ &TTSOpsMinimalTuple);
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+
+ return scanstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndNamedTuplestoreScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndNamedTuplestoreScan(NamedTuplestoreScanState *node)
+{
+ /*
+ * Free exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanNamedTuplestoreScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanNamedTuplestoreScan(NamedTuplestoreScanState *node)
+{
+ Tuplestorestate *tuplestorestate = node->relation;
+
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ ExecScanReScan(&node->ss);
+
+ /*
+ * Rewind my own pointer.
+ */
+ tuplestore_select_read_pointer(tuplestorestate, node->readptr);
+ tuplestore_rescan(tuplestorestate);
+}
diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c
new file mode 100644
index 0000000..41e5eca
--- /dev/null
+++ b/src/backend/executor/nodeNestloop.c
@@ -0,0 +1,411 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeNestloop.c
+ * routines to support nest-loop joins
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeNestloop.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecNestLoop - process a nestloop join of two plans
+ * ExecInitNestLoop - initialize the join
+ * ExecEndNestLoop - shut down the join
+ */
+
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeNestloop.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecNestLoop(node)
+ *
+ * old comments
+ * Returns the tuple joined from inner and outer tuples which
+ * satisfies the qualification clause.
+ *
+ * It scans the inner relation to join with current outer tuple.
+ *
+ * If none is found, next tuple from the outer relation is retrieved
+ * and the inner relation is scanned from the beginning again to join
+ * with the outer tuple.
+ *
+ * NULL is returned if all the remaining outer tuples are tried and
+ * all fail to join with the inner tuples.
+ *
+ * NULL is also returned if there is no tuple from inner relation.
+ *
+ * Conditions:
+ * -- outerTuple contains current tuple from outer relation and
+ * the right son(inner relation) maintains "cursor" at the tuple
+ * returned previously.
+ * This is achieved by maintaining a scan position on the outer
+ * relation.
+ *
+ * Initial States:
+ * -- the outer child and the inner child
+ * are prepared to return the first tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecNestLoop(PlanState *pstate)
+{
+ NestLoopState *node = castNode(NestLoopState, pstate);
+ NestLoop *nl;
+ PlanState *innerPlan;
+ PlanState *outerPlan;
+ TupleTableSlot *outerTupleSlot;
+ TupleTableSlot *innerTupleSlot;
+ ExprState *joinqual;
+ ExprState *otherqual;
+ ExprContext *econtext;
+ ListCell *lc;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get information from the node
+ */
+ ENL1_printf("getting info from node");
+
+ nl = (NestLoop *) node->js.ps.plan;
+ joinqual = node->js.joinqual;
+ otherqual = node->js.ps.qual;
+ outerPlan = outerPlanState(node);
+ innerPlan = innerPlanState(node);
+ econtext = node->js.ps.ps_ExprContext;
+
+ /*
+ * Reset per-tuple memory context to free any expression evaluation
+ * storage allocated in the previous tuple cycle.
+ */
+ ResetExprContext(econtext);
+
+ /*
+ * Ok, everything is setup for the join so now loop until we return a
+ * qualifying join tuple.
+ */
+ ENL1_printf("entering main loop");
+
+ for (;;)
+ {
+ /*
+ * If we don't have an outer tuple, get the next one and reset the
+ * inner scan.
+ */
+ if (node->nl_NeedNewOuter)
+ {
+ ENL1_printf("getting new outer tuple");
+ outerTupleSlot = ExecProcNode(outerPlan);
+
+ /*
+			 * if there are no more outer tuples, then the join is complete.
+ */
+ if (TupIsNull(outerTupleSlot))
+ {
+ ENL1_printf("no outer tuple, ending join");
+ return NULL;
+ }
+
+ ENL1_printf("saving new outer tuple information");
+ econtext->ecxt_outertuple = outerTupleSlot;
+ node->nl_NeedNewOuter = false;
+ node->nl_MatchedOuter = false;
+
+ /*
+ * fetch the values of any outer Vars that must be passed to the
+ * inner scan, and store them in the appropriate PARAM_EXEC slots.
+ */
+ foreach(lc, nl->nestParams)
+ {
+ NestLoopParam *nlp = (NestLoopParam *) lfirst(lc);
+ int paramno = nlp->paramno;
+ ParamExecData *prm;
+
+ prm = &(econtext->ecxt_param_exec_vals[paramno]);
+ /* Param value should be an OUTER_VAR var */
+ Assert(IsA(nlp->paramval, Var));
+ Assert(nlp->paramval->varno == OUTER_VAR);
+ Assert(nlp->paramval->varattno > 0);
+ prm->value = slot_getattr(outerTupleSlot,
+ nlp->paramval->varattno,
+ &(prm->isnull));
+ /* Flag parameter value as changed */
+ innerPlan->chgParam = bms_add_member(innerPlan->chgParam,
+ paramno);
+ }
+
+ /*
+ * now rescan the inner plan
+ */
+ ENL1_printf("rescanning inner plan");
+ ExecReScan(innerPlan);
+ }
+
+ /*
+ * we have an outerTuple, try to get the next inner tuple.
+ */
+ ENL1_printf("getting new inner tuple");
+
+ innerTupleSlot = ExecProcNode(innerPlan);
+ econtext->ecxt_innertuple = innerTupleSlot;
+
+ if (TupIsNull(innerTupleSlot))
+ {
+ ENL1_printf("no inner tuple, need new outer tuple");
+
+ node->nl_NeedNewOuter = true;
+
+ if (!node->nl_MatchedOuter &&
+ (node->js.jointype == JOIN_LEFT ||
+ node->js.jointype == JOIN_ANTI))
+ {
+ /*
+ * We are doing an outer join and there were no join matches
+ * for this outer tuple. Generate a fake join tuple with
+ * nulls for the inner tuple, and return it if it passes the
+ * non-join quals.
+ */
+ econtext->ecxt_innertuple = node->nl_NullInnerTupleSlot;
+
+ ENL1_printf("testing qualification for outer-join tuple");
+
+ if (otherqual == NULL || ExecQual(otherqual, econtext))
+ {
+ /*
+ * qualification was satisfied so we project and return
+ * the slot containing the result tuple using
+ * ExecProject().
+ */
+ ENL1_printf("qualification succeeded, projecting tuple");
+
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered2(node, 1);
+ }
+
+ /*
+ * Otherwise just return to top of loop for a new outer tuple.
+ */
+ continue;
+ }
+
+ /*
+ * at this point we have a new pair of inner and outer tuples so we
+ * test the inner and outer tuples to see if they satisfy the node's
+ * qualification.
+ *
+ * Only the joinquals determine MatchedOuter status, but all quals
+ * must pass to actually return the tuple.
+ */
+ ENL1_printf("testing qualification");
+
+ if (ExecQual(joinqual, econtext))
+ {
+ node->nl_MatchedOuter = true;
+
+ /* In an antijoin, we never return a matched tuple */
+ if (node->js.jointype == JOIN_ANTI)
+ {
+ node->nl_NeedNewOuter = true;
+ continue; /* return to top of loop */
+ }
+
+ /*
+ * If we only need to join to the first matching inner tuple, then
+ * consider returning this one, but after that continue with next
+ * outer tuple.
+ */
+ if (node->js.single_match)
+ node->nl_NeedNewOuter = true;
+
+ if (otherqual == NULL || ExecQual(otherqual, econtext))
+ {
+ /*
+ * qualification was satisfied so we project and return the
+ * slot containing the result tuple using ExecProject().
+ */
+ ENL1_printf("qualification succeeded, projecting tuple");
+
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ }
+ else
+ InstrCountFiltered2(node, 1);
+ }
+ else
+ InstrCountFiltered1(node, 1);
+
+ /*
+ * Tuple fails qual, so free per-tuple memory and try again.
+ */
+ ResetExprContext(econtext);
+
+ ENL1_printf("qualification failed, looping");
+ }
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitNestLoop
+ * ----------------------------------------------------------------
+ */
+NestLoopState *
+ExecInitNestLoop(NestLoop *node, EState *estate, int eflags)
+{
+ NestLoopState *nlstate;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ NL1_printf("ExecInitNestLoop: %s\n",
+ "initializing node");
+
+ /*
+ * create state structure
+ */
+ nlstate = makeNode(NestLoopState);
+ nlstate->js.ps.plan = (Plan *) node;
+ nlstate->js.ps.state = estate;
+ nlstate->js.ps.ExecProcNode = ExecNestLoop;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &nlstate->js.ps);
+
+ /*
+ * initialize child nodes
+ *
+ * If we have no parameters to pass into the inner rel from the outer,
+ * tell the inner child that cheap rescans would be good. If we do have
+ * such parameters, then there is no point in REWIND support at all in the
+ * inner child, because it will always be rescanned with fresh parameter
+ * values.
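+	 *
+	 * For instance, a parameterized inner index scan whose scan keys come
+	 * from outer Vars is given fresh key values for every outer tuple, so
+	 * there is nothing useful for it to rewind to.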
+ */
+ outerPlanState(nlstate) = ExecInitNode(outerPlan(node), estate, eflags);
+ if (node->nestParams == NIL)
+ eflags |= EXEC_FLAG_REWIND;
+ else
+ eflags &= ~EXEC_FLAG_REWIND;
+ innerPlanState(nlstate) = ExecInitNode(innerPlan(node), estate, eflags);
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTupleSlotTL(&nlstate->js.ps, &TTSOpsVirtual);
+ ExecAssignProjectionInfo(&nlstate->js.ps, NULL);
+
+ /*
+ * initialize child expressions
+ */
+ nlstate->js.ps.qual =
+ ExecInitQual(node->join.plan.qual, (PlanState *) nlstate);
+ nlstate->js.jointype = node->join.jointype;
+ nlstate->js.joinqual =
+ ExecInitQual(node->join.joinqual, (PlanState *) nlstate);
+
+ /*
+ * detect whether we need only consider the first matching inner tuple
+ */
+ nlstate->js.single_match = (node->join.inner_unique ||
+ node->join.jointype == JOIN_SEMI);
+
+ /* set up null tuples for outer joins, if needed */
+ switch (node->join.jointype)
+ {
+ case JOIN_INNER:
+ case JOIN_SEMI:
+ break;
+ case JOIN_LEFT:
+ case JOIN_ANTI:
+ nlstate->nl_NullInnerTupleSlot =
+ ExecInitNullTupleSlot(estate,
+ ExecGetResultType(innerPlanState(nlstate)),
+ &TTSOpsVirtual);
+ break;
+ default:
+ elog(ERROR, "unrecognized join type: %d",
+ (int) node->join.jointype);
+ }
+
+ /*
+ * finally, wipe the current outer tuple clean.
+ */
+ nlstate->nl_NeedNewOuter = true;
+ nlstate->nl_MatchedOuter = false;
+
+ NL1_printf("ExecInitNestLoop: %s\n",
+ "node initialized");
+
+ return nlstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndNestLoop
+ *
+ * closes down scans and frees allocated storage
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndNestLoop(NestLoopState *node)
+{
+ NL1_printf("ExecEndNestLoop: %s\n",
+ "ending node processing");
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->js.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
+
+ /*
+ * close down subplans
+ */
+ ExecEndNode(outerPlanState(node));
+ ExecEndNode(innerPlanState(node));
+
+ NL1_printf("ExecEndNestLoop: %s\n",
+ "node processing ended");
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanNestLoop
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanNestLoop(NestLoopState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ /*
+ * If outerPlan->chgParam is not null then plan will be automatically
+ * re-scanned by first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+
+ /*
+ * innerPlan is re-scanned for each new outer tuple and MUST NOT be
+	 * re-scanned from here or you'll get trouble from inner index scans when
+ * outer Vars are used as run-time keys...
+ */
+
+ node->nl_NeedNewOuter = true;
+ node->nl_MatchedOuter = false;
+}
diff --git a/src/backend/executor/nodeProjectSet.c b/src/backend/executor/nodeProjectSet.c
new file mode 100644
index 0000000..07be814
--- /dev/null
+++ b/src/backend/executor/nodeProjectSet.c
@@ -0,0 +1,351 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeProjectSet.c
+ * support for evaluating targetlists containing set-returning functions
+ *
+ * DESCRIPTION
+ *
+ * ProjectSet nodes are inserted by the planner to evaluate set-returning
+ * functions in the targetlist. It's guaranteed that all set-returning
+ * functions are directly at the top level of the targetlist, i.e. they
+ * can't be inside more-complex expressions. If that'd otherwise be
+ * the case, the planner adds additional ProjectSet nodes.
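+ *
+ *		For example (hypothetical table tab with a column x), a query such as
+ *			SELECT generate_series(1, 3), x FROM tab;
+ *		is executed with a ProjectSet node above the scan of tab, expanding
+ *		each input row into three output rows.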
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeProjectSet.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeProjectSet.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "utils/memutils.h"
+
+
+static TupleTableSlot *ExecProjectSRF(ProjectSetState *node, bool continuing);
+
+
+/* ----------------------------------------------------------------
+ * ExecProjectSet(node)
+ *
+ * Return tuples after evaluating the targetlist (which contains set
+ * returning functions).
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecProjectSet(PlanState *pstate)
+{
+ ProjectSetState *node = castNode(ProjectSetState, pstate);
+ TupleTableSlot *outerTupleSlot;
+ TupleTableSlot *resultSlot;
+ PlanState *outerPlan;
+ ExprContext *econtext;
+
+ CHECK_FOR_INTERRUPTS();
+
+ econtext = node->ps.ps_ExprContext;
+
+ /*
+ * Reset per-tuple context to free expression-evaluation storage allocated
+ * for a potentially previously returned tuple. Note that the SRF argument
+ * context has a different lifetime and is reset below.
+ */
+ ResetExprContext(econtext);
+
+ /*
+ * Check to see if we're still projecting out tuples from a previous scan
+ * tuple (because there is a function-returning-set in the projection
+ * expressions). If so, try to project another one.
+ */
+ if (node->pending_srf_tuples)
+ {
+ resultSlot = ExecProjectSRF(node, true);
+
+ if (resultSlot != NULL)
+ return resultSlot;
+ }
+
+ /*
+ * Reset argument context to free any expression evaluation storage
+ * allocated in the previous tuple cycle. Note this can't happen until
+ * we're done projecting out tuples from a scan tuple, as ValuePerCall
+ * functions are allowed to reference the arguments for each returned
+ * tuple.
+ */
+ MemoryContextReset(node->argcontext);
+
+ /*
+ * Get another input tuple and project SRFs from it.
+ */
+ for (;;)
+ {
+ /*
+ * Retrieve tuples from the outer plan until there are no more.
+ */
+ outerPlan = outerPlanState(node);
+ outerTupleSlot = ExecProcNode(outerPlan);
+
+ if (TupIsNull(outerTupleSlot))
+ return NULL;
+
+ /*
+ * Prepare to compute projection expressions, which will expect to
+ * access the input tuples as varno OUTER.
+ */
+ econtext->ecxt_outertuple = outerTupleSlot;
+
+ /* Evaluate the expressions */
+ resultSlot = ExecProjectSRF(node, false);
+
+ /*
+ * Return the tuple unless the projection produced no rows (due to an
+ * empty set), in which case we must loop back to see if there are
+ * more outerPlan tuples.
+ */
+ if (resultSlot)
+ return resultSlot;
+ }
+
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecProjectSRF
+ *
+ * Project a targetlist containing one or more set-returning functions.
+ *
+ * 'continuing' indicates whether to continue projecting rows for the
+ * same input tuple; or whether a new input tuple is being projected.
+ *
+ * Returns NULL if no output tuple has been produced.
+ *
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecProjectSRF(ProjectSetState *node, bool continuing)
+{
+ TupleTableSlot *resultSlot = node->ps.ps_ResultTupleSlot;
+ ExprContext *econtext = node->ps.ps_ExprContext;
+ MemoryContext oldcontext;
+ bool hassrf PG_USED_FOR_ASSERTS_ONLY;
+ bool hasresult;
+ int argno;
+
+ ExecClearTuple(resultSlot);
+
+ /* Call SRFs, as well as plain expressions, in per-tuple context */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ /*
+ * Assume no further tuples are produced unless an ExprMultipleResult is
+ * encountered from a set returning function.
+ */
+ node->pending_srf_tuples = false;
+
+ hassrf = hasresult = false;
+ for (argno = 0; argno < node->nelems; argno++)
+ {
+ Node *elem = node->elems[argno];
+ ExprDoneCond *isdone = &node->elemdone[argno];
+ Datum *result = &resultSlot->tts_values[argno];
+ bool *isnull = &resultSlot->tts_isnull[argno];
+
+ if (continuing && *isdone == ExprEndResult)
+ {
+ /*
+ * If we're continuing to project output rows from a source tuple,
+ * return NULLs once the SRF has been exhausted.
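+			 *
+			 * E.g. in SELECT generate_series(1, 2), generate_series(1, 3)
+			 * the two-row SRF produces NULL for the third output row while
+			 * the three-row SRF is still returning values.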
+ */
+ *result = (Datum) 0;
+ *isnull = true;
+ hassrf = true;
+ }
+ else if (IsA(elem, SetExprState))
+ {
+ /*
+ * Evaluate SRF - possibly continuing previously started output.
+ */
+ *result = ExecMakeFunctionResultSet((SetExprState *) elem,
+ econtext, node->argcontext,
+ isnull, isdone);
+
+ if (*isdone != ExprEndResult)
+ hasresult = true;
+ if (*isdone == ExprMultipleResult)
+ node->pending_srf_tuples = true;
+ hassrf = true;
+ }
+ else
+ {
+ /* Non-SRF tlist expression, just evaluate normally. */
+ *result = ExecEvalExpr((ExprState *) elem, econtext, isnull);
+ *isdone = ExprSingleResult;
+ }
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+	/* ProjectSet should not be used if there are no SRFs */
+ Assert(hassrf);
+
+ /*
+ * If all the SRFs returned ExprEndResult, we consider that as no row
+ * being produced.
+ */
+ if (hasresult)
+ {
+ ExecStoreVirtualTuple(resultSlot);
+ return resultSlot;
+ }
+
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitProjectSet
+ *
+ * Creates the run-time state information for the ProjectSet node
+ * produced by the planner and initializes outer relations
+ * (child nodes).
+ * ----------------------------------------------------------------
+ */
+ProjectSetState *
+ExecInitProjectSet(ProjectSet *node, EState *estate, int eflags)
+{
+ ProjectSetState *state;
+ ListCell *lc;
+ int off;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)));
+
+ /*
+ * create state structure
+ */
+ state = makeNode(ProjectSetState);
+ state->ps.plan = (Plan *) node;
+ state->ps.state = estate;
+ state->ps.ExecProcNode = ExecProjectSet;
+
+ state->pending_srf_tuples = false;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &state->ps);
+
+ /*
+ * initialize child nodes
+ */
+ outerPlanState(state) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * we don't use inner plan
+ */
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * tuple table and result type initialization
+ */
+ ExecInitResultTupleSlotTL(&state->ps, &TTSOpsVirtual);
+
+ /* Create workspace for per-tlist-entry expr state & SRF-is-done state */
+ state->nelems = list_length(node->plan.targetlist);
+ state->elems = (Node **)
+ palloc(sizeof(Node *) * state->nelems);
+ state->elemdone = (ExprDoneCond *)
+ palloc(sizeof(ExprDoneCond) * state->nelems);
+
+ /*
+ * Build expressions to evaluate targetlist. We can't use
+ * ExecBuildProjectionInfo here, since that doesn't deal with SRFs.
+ * Instead compile each expression separately, using
+ * ExecInitFunctionResultSet where applicable.
+ */
+ off = 0;
+ foreach(lc, node->plan.targetlist)
+ {
+ TargetEntry *te = (TargetEntry *) lfirst(lc);
+ Expr *expr = te->expr;
+
+ if ((IsA(expr, FuncExpr) && ((FuncExpr *) expr)->funcretset) ||
+ (IsA(expr, OpExpr) && ((OpExpr *) expr)->opretset))
+ {
+ state->elems[off] = (Node *)
+ ExecInitFunctionResultSet(expr, state->ps.ps_ExprContext,
+ &state->ps);
+ }
+ else
+ {
+ Assert(!expression_returns_set((Node *) expr));
+ state->elems[off] = (Node *) ExecInitExpr(expr, &state->ps);
+ }
+
+ off++;
+ }
+
+ /* We don't support any qual on ProjectSet nodes */
+ Assert(node->plan.qual == NIL);
+
+ /*
+ * Create a memory context that ExecMakeFunctionResultSet can use to
+ * evaluate function arguments in. We can't use the per-tuple context for
+ * this because it gets reset too often; but we don't want to leak
+ * evaluation results into the query-lifespan context either. We use one
+ * context for the arguments of all tSRFs, as they have roughly equivalent
+ * lifetimes.
+ */
+ state->argcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "tSRF function arguments",
+ ALLOCSET_DEFAULT_SIZES);
+
+ return state;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndProjectSet
+ *
+ * frees up storage allocated through C routines
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndProjectSet(ProjectSetState *node)
+{
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ps);
+
+ /*
+ * clean out the tuple table
+ */
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ /*
+ * shut down subplans
+ */
+ ExecEndNode(outerPlanState(node));
+}
+
+void
+ExecReScanProjectSet(ProjectSetState *node)
+{
+ /* Forget any incompletely-evaluated SRFs */
+ node->pending_srf_tuples = false;
+
+ /*
+ * If chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (node->ps.lefttree->chgParam == NULL)
+ ExecReScan(node->ps.lefttree);
+}
diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c
new file mode 100644
index 0000000..f9e91fd
--- /dev/null
+++ b/src/backend/executor/nodeRecursiveunion.c
@@ -0,0 +1,331 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeRecursiveunion.c
+ * routines to handle RecursiveUnion nodes.
+ *
+ * To implement UNION (without ALL), we need a hashtable that stores tuples
+ * already seen. The hash key is computed from the grouping columns.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeRecursiveunion.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeRecursiveunion.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(RecursiveUnionState *rustate)
+{
+ RecursiveUnion *node = (RecursiveUnion *) rustate->ps.plan;
+ TupleDesc desc = ExecGetResultType(outerPlanState(rustate));
+
+ Assert(node->numCols > 0);
+ Assert(node->numGroups > 0);
+
+ rustate->hashtable = BuildTupleHashTableExt(&rustate->ps,
+ desc,
+ node->numCols,
+ node->dupColIdx,
+ rustate->eqfuncoids,
+ rustate->hashfunctions,
+ node->dupCollations,
+ node->numGroups,
+ 0,
+ rustate->ps.state->es_query_cxt,
+ rustate->tableContext,
+ rustate->tempContext,
+ false);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecRecursiveUnion(node)
+ *
+ * Scans the recursive query sequentially and returns the next
+ * qualifying tuple.
+ *
+ * 1. evaluate the non-recursive term and assign the result to RT
+ *
+ * 2. execute the recursive term
+ *
+ * 2.1 WT := RT
+ * 2.2 while WT is not empty, repeat 2.3 to 2.6; if WT is empty, return RT
+ * 2.3 replace references to the recursive term with WT
+ * 2.4 evaluate the recursive term and store the result into WT
+ * 2.5 append WT to RT
+ * 2.6 go back to 2.2
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecRecursiveUnion(PlanState *pstate)
+{
+ RecursiveUnionState *node = castNode(RecursiveUnionState, pstate);
+ PlanState *outerPlan = outerPlanState(node);
+ PlanState *innerPlan = innerPlanState(node);
+ RecursiveUnion *plan = (RecursiveUnion *) node->ps.plan;
+ TupleTableSlot *slot;
+ bool isnew;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* 1. Evaluate non-recursive term */
+ if (!node->recursing)
+ {
+ for (;;)
+ {
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ break;
+ if (plan->numCols > 0)
+ {
+ /* Find or build hashtable entry for this tuple's group */
+ LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
+ /* Must reset temp context after each hashtable lookup */
+ MemoryContextReset(node->tempContext);
+ /* Ignore tuple if already seen */
+ if (!isnew)
+ continue;
+ }
+ /* Each non-duplicate tuple goes to the working table ... */
+ tuplestore_puttupleslot(node->working_table, slot);
+ /* ... and to the caller */
+ return slot;
+ }
+ node->recursing = true;
+ }
+
+ /* 2. Execute recursive term */
+ for (;;)
+ {
+ slot = ExecProcNode(innerPlan);
+ if (TupIsNull(slot))
+ {
+ /* Done if there's nothing in the intermediate table */
+ if (node->intermediate_empty)
+ break;
+
+ /* done with old working table ... */
+ tuplestore_end(node->working_table);
+
+ /* intermediate table becomes working table */
+ node->working_table = node->intermediate_table;
+
+ /* create new empty intermediate table */
+ node->intermediate_table = tuplestore_begin_heap(false, false,
+ work_mem);
+ node->intermediate_empty = true;
+
+ /* reset the recursive term */
+ innerPlan->chgParam = bms_add_member(innerPlan->chgParam,
+ plan->wtParam);
+
+ /* and continue fetching from recursive term */
+ continue;
+ }
+
+ if (plan->numCols > 0)
+ {
+ /* Find or build hashtable entry for this tuple's group */
+ LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
+ /* Must reset temp context after each hashtable lookup */
+ MemoryContextReset(node->tempContext);
+ /* Ignore tuple if already seen */
+ if (!isnew)
+ continue;
+ }
+
+ /* Else, tuple is good; stash it in intermediate table ... */
+ node->intermediate_empty = false;
+ tuplestore_puttupleslot(node->intermediate_table, slot);
+ /* ... and return it */
+ return slot;
+ }
+
+ return NULL;
+}
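+
+/*
+ * Illustrative example (not from the upstream source): the RT/WT scheme
+ * above corresponds to a query such as
+ *
+ * WITH RECURSIVE t(n) AS (
+ *     VALUES (1)                        -- non-recursive term (outer plan)
+ *     UNION                             -- deduplicated via the hash table
+ *     SELECT n + 1 FROM t WHERE n < 5   -- recursive term (inner plan)
+ * )
+ * SELECT n FROM t;
+ *
+ * Rows produced by each pass over the recursive term go into the
+ * intermediate table, which then becomes the working table read by the
+ * WorkTableScan inside the recursive term on the next pass.  numCols > 0
+ * (and hence the hash table) is used only for UNION, not UNION ALL.
+ */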
+
+/* ----------------------------------------------------------------
+ * ExecInitRecursiveUnion
+ * ----------------------------------------------------------------
+ */
+RecursiveUnionState *
+ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags)
+{
+ RecursiveUnionState *rustate;
+ ParamExecData *prmdata;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ rustate = makeNode(RecursiveUnionState);
+ rustate->ps.plan = (Plan *) node;
+ rustate->ps.state = estate;
+ rustate->ps.ExecProcNode = ExecRecursiveUnion;
+
+ rustate->eqfuncoids = NULL;
+ rustate->hashfunctions = NULL;
+ rustate->hashtable = NULL;
+ rustate->tempContext = NULL;
+ rustate->tableContext = NULL;
+
+ /* initialize processing state */
+ rustate->recursing = false;
+ rustate->intermediate_empty = true;
+ rustate->working_table = tuplestore_begin_heap(false, false, work_mem);
+ rustate->intermediate_table = tuplestore_begin_heap(false, false, work_mem);
+
+ /*
+ * If hashing, we need a per-tuple memory context for comparisons, and a
+ * longer-lived context to store the hash table. The table can't just be
+ * kept in the per-query context because we want to be able to throw it
+ * away when rescanning.
+ */
+ if (node->numCols > 0)
+ {
+ rustate->tempContext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "RecursiveUnion",
+ ALLOCSET_DEFAULT_SIZES);
+ rustate->tableContext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "RecursiveUnion hash table",
+ ALLOCSET_DEFAULT_SIZES);
+ }
+
+ /*
+ * Make the state structure available to descendant WorkTableScan nodes
+ * via the Param slot reserved for it.
+ */
+ prmdata = &(estate->es_param_exec_vals[node->wtParam]);
+ Assert(prmdata->execPlan == NULL);
+ prmdata->value = PointerGetDatum(rustate);
+ prmdata->isnull = false;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * RecursiveUnion plans don't have expression contexts because they never
+ * call ExecQual or ExecProject.
+ */
+ Assert(node->plan.qual == NIL);
+
+ /*
+ * RecursiveUnion nodes still have Result slots, which hold pointers to
+ * tuples, so we have to initialize them.  (Note: we have to set up the
+ * result type before initializing child nodes, because
+ * nodeWorktablescan.c expects it to be valid.)
+ */
+ ExecInitResultTypeTL(&rustate->ps);
+
+ /*
+ * RecursiveUnion nodes do no projections, so no projection info is
+ * needed.
+ */
+ rustate->ps.ps_ProjInfo = NULL;
+
+ /*
+ * initialize child nodes
+ */
+ outerPlanState(rustate) = ExecInitNode(outerPlan(node), estate, eflags);
+ innerPlanState(rustate) = ExecInitNode(innerPlan(node), estate, eflags);
+
+ /*
+ * If hashing, precompute fmgr lookup data for inner loop, and create the
+ * hash table.
+ */
+ if (node->numCols > 0)
+ {
+ execTuplesHashPrepare(node->numCols,
+ node->dupOperators,
+ &rustate->eqfuncoids,
+ &rustate->hashfunctions);
+ build_hash_table(rustate);
+ }
+
+ return rustate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndRecursiveUnion
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndRecursiveUnion(RecursiveUnionState *node)
+{
+ /* Release tuplestores */
+ tuplestore_end(node->working_table);
+ tuplestore_end(node->intermediate_table);
+
+ /* free subsidiary stuff including hashtable */
+ if (node->tempContext)
+ MemoryContextDelete(node->tempContext);
+ if (node->tableContext)
+ MemoryContextDelete(node->tableContext);
+
+ /*
+ * close down subplans
+ */
+ ExecEndNode(outerPlanState(node));
+ ExecEndNode(innerPlanState(node));
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanRecursiveUnion
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanRecursiveUnion(RecursiveUnionState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+ PlanState *innerPlan = innerPlanState(node);
+ RecursiveUnion *plan = (RecursiveUnion *) node->ps.plan;
+
+ /*
+ * Set recursive term's chgParam to tell it that we'll modify the working
+ * table and therefore it has to rescan.
+ */
+ innerPlan->chgParam = bms_add_member(innerPlan->chgParam, plan->wtParam);
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode. Because of above, we only have to do this to the
+ * non-recursive term.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+
+ /* Release any hashtable storage */
+ if (node->tableContext)
+ MemoryContextResetAndDeleteChildren(node->tableContext);
+
+ /* Empty hashtable if needed */
+ if (plan->numCols > 0)
+ ResetTupleHashTable(node->hashtable);
+
+ /* reset processing state */
+ node->recursing = false;
+ node->intermediate_empty = true;
+ tuplestore_clear(node->working_table);
+ tuplestore_clear(node->intermediate_table);
+}
diff --git a/src/backend/executor/nodeResult.c b/src/backend/executor/nodeResult.c
new file mode 100644
index 0000000..0946af0
--- /dev/null
+++ b/src/backend/executor/nodeResult.c
@@ -0,0 +1,272 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeResult.c
+ * support for constant nodes needing special code.
+ *
+ * DESCRIPTION
+ *
+ * Result nodes are used in queries where no relations are scanned.
+ * Examples of such queries are:
+ *
+ * select 1 * 2
+ *
+ * insert into emp values ('mike', 15000)
+ *
+ * (Remember that in an INSERT or UPDATE, we need a plan tree that
+ * generates the new rows.)
+ *
+ * Result nodes are also used to optimise queries with constant
+ * qualifications (ie, quals that do not depend on the scanned data),
+ * such as:
+ *
+ * select * from emp where 2 > 1
+ *
+ * In this case, the plan generated is
+ *
+ * Result (with 2 > 1 qual)
+ * /
+ * SeqScan (emp.*)
+ *
+ * At runtime, the Result node evaluates the constant qual once,
+ * which is shown by EXPLAIN as a One-Time Filter. If it's
+ * false, we can return an empty result set without running the
+ * controlled plan at all. If it's true, we run the controlled
+ * plan normally and pass back the results.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeResult.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeResult.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecResult(node)
+ *
+ * returns the tuples from the outer plan which satisfy the
+ * qualification clause. Since result nodes with right
+ * subtrees are never planned, we ignore the right subtree
+ * entirely (for now). -cim 10/7/89
+ *
+ * The qualification containing only constant clauses is
+ * checked before any other processing is done. ExecResult
+ * always returns NULL if the constant qualification is not satisfied.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecResult(PlanState *pstate)
+{
+ ResultState *node = castNode(ResultState, pstate);
+ TupleTableSlot *outerTupleSlot;
+ PlanState *outerPlan;
+ ExprContext *econtext;
+
+ CHECK_FOR_INTERRUPTS();
+
+ econtext = node->ps.ps_ExprContext;
+
+ /*
+ * check constant qualifications like (2 > 1), if not already done
+ */
+ if (node->rs_checkqual)
+ {
+ bool qualResult = ExecQual(node->resconstantqual, econtext);
+
+ node->rs_checkqual = false;
+ if (!qualResult)
+ {
+ node->rs_done = true;
+ return NULL;
+ }
+ }
+
+ /*
+ * Reset per-tuple memory context to free any expression evaluation
+ * storage allocated in the previous tuple cycle.
+ */
+ ResetExprContext(econtext);
+
+ /*
+ * if rs_done is true then it means that we were asked to return a
+ * constant tuple and we already did the last time ExecResult() was
+ * called, OR that we failed the constant qual check. Either way, now we
+ * are through.
+ */
+ if (!node->rs_done)
+ {
+ outerPlan = outerPlanState(node);
+
+ if (outerPlan != NULL)
+ {
+ /*
+ * retrieve tuples from the outer plan until there are no more.
+ */
+ outerTupleSlot = ExecProcNode(outerPlan);
+
+ if (TupIsNull(outerTupleSlot))
+ return NULL;
+
+ /*
+ * prepare to compute projection expressions, which will expect to
+ * access the input tuples as varno OUTER.
+ */
+ econtext->ecxt_outertuple = outerTupleSlot;
+ }
+ else
+ {
+ /*
+ * if we don't have an outer plan, then we are just generating the
+ * results from a constant target list. Do it only once.
+ */
+ node->rs_done = true;
+ }
+
+ /* form the result tuple using ExecProject(), and return it */
+ return ExecProject(node->ps.ps_ProjInfo);
+ }
+
+ return NULL;
+}
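+
+/*
+ * Illustrative sketch (not from the upstream source): in EXPLAIN terms, the
+ * constant-qual plan sketched in the file header comment would be rendered
+ * roughly as
+ *
+ * Result
+ *   One-Time Filter: (2 > 1)
+ *   ->  Seq Scan on emp
+ *
+ * rs_checkqual makes ExecResult evaluate that filter only once; if it comes
+ * out false, rs_done is set and the Seq Scan is never run at all.
+ */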
+
+/* ----------------------------------------------------------------
+ * ExecResultMarkPos
+ * ----------------------------------------------------------------
+ */
+void
+ExecResultMarkPos(ResultState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ if (outerPlan != NULL)
+ ExecMarkPos(outerPlan);
+ else
+ elog(DEBUG2, "Result nodes do not support mark/restore");
+}
+
+/* ----------------------------------------------------------------
+ * ExecResultRestrPos
+ * ----------------------------------------------------------------
+ */
+void
+ExecResultRestrPos(ResultState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ if (outerPlan != NULL)
+ ExecRestrPos(outerPlan);
+ else
+ elog(ERROR, "Result nodes do not support mark/restore");
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitResult
+ *
+ * Creates the run-time state information for the result node
+ * produced by the planner and initializes outer relations
+ * (child nodes).
+ * ----------------------------------------------------------------
+ */
+ResultState *
+ExecInitResult(Result *node, EState *estate, int eflags)
+{
+ ResultState *resstate;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)) ||
+ outerPlan(node) != NULL);
+
+ /*
+ * create state structure
+ */
+ resstate = makeNode(ResultState);
+ resstate->ps.plan = (Plan *) node;
+ resstate->ps.state = estate;
+ resstate->ps.ExecProcNode = ExecResult;
+
+ resstate->rs_done = false;
+ resstate->rs_checkqual = (node->resconstantqual == NULL) ? false : true;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &resstate->ps);
+
+ /*
+ * initialize child nodes
+ */
+ outerPlanState(resstate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * we don't use inner plan
+ */
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTupleSlotTL(&resstate->ps, &TTSOpsVirtual);
+ ExecAssignProjectionInfo(&resstate->ps, NULL);
+
+ /*
+ * initialize child expressions
+ */
+ resstate->ps.qual =
+ ExecInitQual(node->plan.qual, (PlanState *) resstate);
+ resstate->resconstantqual =
+ ExecInitQual((List *) node->resconstantqual, (PlanState *) resstate);
+
+ return resstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndResult
+ *
+ * frees up storage allocated through C routines
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndResult(ResultState *node)
+{
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ps);
+
+ /*
+ * clean out the tuple table
+ */
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ /*
+ * shut down subplans
+ */
+ ExecEndNode(outerPlanState(node));
+}
+
+void
+ExecReScanResult(ResultState *node)
+{
+ node->rs_done = false;
+ node->rs_checkqual = (node->resconstantqual == NULL) ? false : true;
+
+ /*
+ * If chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (node->ps.lefttree &&
+ node->ps.lefttree->chgParam == NULL)
+ ExecReScan(node->ps.lefttree);
+}
diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c
new file mode 100644
index 0000000..44232d5
--- /dev/null
+++ b/src/backend/executor/nodeSamplescan.c
@@ -0,0 +1,378 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeSamplescan.c
+ * Support routines for sample scans of relations (table sampling).
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeSamplescan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/tsmapi.h"
+#include "executor/executor.h"
+#include "executor/nodeSamplescan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/predicate.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+
+static TupleTableSlot *SampleNext(SampleScanState *node);
+static void tablesample_init(SampleScanState *scanstate);
+static TupleTableSlot *tablesample_getnext(SampleScanState *scanstate);
+
+/* ----------------------------------------------------------------
+ * Scan Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * SampleNext
+ *
+ * This is a workhorse for ExecSampleScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+SampleNext(SampleScanState *node)
+{
+ /*
+ * if this is the first call within a scan, initialize
+ */
+ if (!node->begun)
+ tablesample_init(node);
+
+ /*
+ * get the next tuple, and store it in our result slot
+ */
+ return tablesample_getnext(node);
+}
+
+/*
+ * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
+{
+ /*
+ * No need to recheck for SampleScan, since like SeqScan we don't pass any
+ * checkable keys to heap_beginscan.
+ */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecSampleScan(node)
+ *
+ * Scans the relation using the sampling method and returns
+ * the next qualifying tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecSampleScan(PlanState *pstate)
+{
+ SampleScanState *node = castNode(SampleScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) SampleNext,
+ (ExecScanRecheckMtd) SampleRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitSampleScan
+ * ----------------------------------------------------------------
+ */
+SampleScanState *
+ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
+{
+ SampleScanState *scanstate;
+ TableSampleClause *tsc = node->tablesample;
+ TsmRoutine *tsm;
+
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create state structure
+ */
+ scanstate = makeNode(SampleScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecSampleScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * open the scan relation
+ */
+ scanstate->ss.ss_currentRelation =
+ ExecOpenScanRelation(estate,
+ node->scan.scanrelid,
+ eflags);
+
+ /* we won't set up the HeapScanDesc till later */
+ scanstate->ss.ss_currentScanDesc = NULL;
+
+ /* and create slot with appropriate rowtype */
+ ExecInitScanTupleSlot(estate, &scanstate->ss,
+ RelationGetDescr(scanstate->ss.ss_currentRelation),
+ table_slot_callbacks(scanstate->ss.ss_currentRelation));
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+
+ scanstate->args = ExecInitExprList(tsc->args, (PlanState *) scanstate);
+ scanstate->repeatable =
+ ExecInitExpr(tsc->repeatable, (PlanState *) scanstate);
+
+ /*
+ * If we don't have a REPEATABLE clause, select a random seed. We want to
+ * do this just once, since the seed shouldn't change over rescans.
+ */
+ if (tsc->repeatable == NULL)
+ scanstate->seed = random();
+
+ /*
+ * Finally, initialize the TABLESAMPLE method handler.
+ */
+ tsm = GetTsmRoutine(tsc->tsmhandler);
+ scanstate->tsmroutine = tsm;
+ scanstate->tsm_state = NULL;
+
+ if (tsm->InitSampleScan)
+ tsm->InitSampleScan(scanstate, eflags);
+
+ /* We'll do BeginSampleScan later; we can't evaluate params yet */
+ scanstate->begun = false;
+
+ return scanstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndSampleScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndSampleScan(SampleScanState *node)
+{
+ /*
+ * Tell sampling function that we finished the scan.
+ */
+ if (node->tsmroutine->EndSampleScan)
+ node->tsmroutine->EndSampleScan(node);
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * close heap scan
+ */
+ if (node->ss.ss_currentScanDesc)
+ table_endscan(node->ss.ss_currentScanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanSampleScan
+ *
+ * Rescans the relation.
+ *
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanSampleScan(SampleScanState *node)
+{
+ /* Remember we need to do BeginSampleScan again (if we did it at all) */
+ node->begun = false;
+ node->done = false;
+ node->haveblock = false;
+ node->donetuples = 0;
+
+ ExecScanReScan(&node->ss);
+}
+
+
+/*
+ * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
+ */
+static void
+tablesample_init(SampleScanState *scanstate)
+{
+ TsmRoutine *tsm = scanstate->tsmroutine;
+ ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
+ Datum *params;
+ Datum datum;
+ bool isnull;
+ uint32 seed;
+ bool allow_sync;
+ int i;
+ ListCell *arg;
+
+ scanstate->donetuples = 0;
+ params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
+
+ i = 0;
+ foreach(arg, scanstate->args)
+ {
+ ExprState *argstate = (ExprState *) lfirst(arg);
+
+ params[i] = ExecEvalExprSwitchContext(argstate,
+ econtext,
+ &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+ errmsg("TABLESAMPLE parameter cannot be null")));
+ i++;
+ }
+
+ if (scanstate->repeatable)
+ {
+ datum = ExecEvalExprSwitchContext(scanstate->repeatable,
+ econtext,
+ &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
+ errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
+
+ /*
+ * The REPEATABLE parameter has been coerced to float8 by the parser.
+ * The reason for using float8 at the SQL level is that it will
+ * produce unsurprising results both for users used to databases that
+ * accept only integers in the REPEATABLE clause and for those who
+ * might expect that REPEATABLE works like setseed() (a float in the
+ * range from -1 to 1).
+ *
+ * We use hashfloat8() to convert the supplied value into a suitable
+ * seed. For regression-testing purposes, that has the convenient
+ * property that REPEATABLE(0) gives a machine-independent result.
+ */
+ seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
+ }
+ else
+ {
+ /* Use the seed selected by ExecInitSampleScan */
+ seed = scanstate->seed;
+ }
+
+ /* Set default values for params that BeginSampleScan can adjust */
+ scanstate->use_bulkread = true;
+ scanstate->use_pagemode = true;
+
+ /* Let tablesample method do its thing */
+ tsm->BeginSampleScan(scanstate,
+ params,
+ list_length(scanstate->args),
+ seed);
+
+ /* We'll use syncscan if there's no NextSampleBlock function */
+ allow_sync = (tsm->NextSampleBlock == NULL);
+
+ /* Now we can create or reset the HeapScanDesc */
+ if (scanstate->ss.ss_currentScanDesc == NULL)
+ {
+ scanstate->ss.ss_currentScanDesc =
+ table_beginscan_sampling(scanstate->ss.ss_currentRelation,
+ scanstate->ss.ps.state->es_snapshot,
+ 0, NULL,
+ scanstate->use_bulkread,
+ allow_sync,
+ scanstate->use_pagemode);
+ }
+ else
+ {
+ table_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
+ scanstate->use_bulkread,
+ allow_sync,
+ scanstate->use_pagemode);
+ }
+
+ pfree(params);
+
+ /* And we're initialized. */
+ scanstate->begun = true;
+}
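+
+/*
+ * Illustrative example (not from the upstream source): the params and seed
+ * assembled above come from a query such as
+ *
+ * SELECT * FROM emp TABLESAMPLE BERNOULLI (10) REPEATABLE (0);
+ *
+ * where "10" is the single argument handed to BeginSampleScan via params[],
+ * and the REPEATABLE value is hashed with hashfloat8() into the seed.  If
+ * REPEATABLE is omitted, the random() seed chosen in ExecInitSampleScan is
+ * used instead.
+ */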
+
+/*
+ * Get next tuple from TABLESAMPLE method.
+ */
+static TupleTableSlot *
+tablesample_getnext(SampleScanState *scanstate)
+{
+ TableScanDesc scan = scanstate->ss.ss_currentScanDesc;
+ TupleTableSlot *slot = scanstate->ss.ss_ScanTupleSlot;
+
+ ExecClearTuple(slot);
+
+ if (scanstate->done)
+ return NULL;
+
+ for (;;)
+ {
+ if (!scanstate->haveblock)
+ {
+ if (!table_scan_sample_next_block(scan, scanstate))
+ {
+ scanstate->haveblock = false;
+ scanstate->done = true;
+
+ /* exhausted relation */
+ return NULL;
+ }
+
+ scanstate->haveblock = true;
+ }
+
+ if (!table_scan_sample_next_tuple(scan, scanstate, slot))
+ {
+ /*
+ * If we get here, it means we've exhausted the items on this page
+ * and it's time to move to the next.
+ */
+ scanstate->haveblock = false;
+ continue;
+ }
+
+ /* Found visible tuple, return it. */
+ break;
+ }
+
+ scanstate->donetuples++;
+
+ return slot;
+}
diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c
new file mode 100644
index 0000000..066f9ae
--- /dev/null
+++ b/src/backend/executor/nodeSeqscan.c
@@ -0,0 +1,314 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeSeqscan.c
+ * Support routines for sequential scans of relations.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeSeqscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecSeqScan sequentially scans a relation.
+ * ExecSeqNext retrieves the next tuple in sequential order.
+ * ExecInitSeqScan creates and initializes a seqscan node.
+ * ExecEndSeqScan releases any storage allocated.
+ * ExecReScanSeqScan rescans the relation
+ *
+ * ExecSeqScanEstimate estimates DSM space needed for parallel scan
+ * ExecSeqScanInitializeDSM initialize DSM for parallel scan
+ * ExecSeqScanReInitializeDSM reinitialize DSM for fresh parallel scan
+ * ExecSeqScanInitializeWorker attach to DSM info in parallel worker
+ */
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "executor/execdebug.h"
+#include "executor/nodeSeqscan.h"
+#include "utils/rel.h"
+
+static TupleTableSlot *SeqNext(SeqScanState *node);
+
+/* ----------------------------------------------------------------
+ * Scan Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * SeqNext
+ *
+ * This is a workhorse for ExecSeqScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+SeqNext(SeqScanState *node)
+{
+ TableScanDesc scandesc;
+ EState *estate;
+ ScanDirection direction;
+ TupleTableSlot *slot;
+
+ /*
+ * get information from the estate and scan state
+ */
+ scandesc = node->ss.ss_currentScanDesc;
+ estate = node->ss.ps.state;
+ direction = estate->es_direction;
+ slot = node->ss.ss_ScanTupleSlot;
+
+ if (scandesc == NULL)
+ {
+ /*
+ * We reach here if the scan is not parallel, or if we're serially
+ * executing a scan that was planned to be parallel.
+ */
+ scandesc = table_beginscan(node->ss.ss_currentRelation,
+ estate->es_snapshot,
+ 0, NULL);
+ node->ss.ss_currentScanDesc = scandesc;
+ }
+
+ /*
+ * get the next tuple from the table
+ */
+ if (table_scan_getnextslot(scandesc, direction, slot))
+ return slot;
+ return NULL;
+}
+
+/*
+ * SeqRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+SeqRecheck(SeqScanState *node, TupleTableSlot *slot)
+{
+ /*
+ * Note that unlike IndexScan, SeqScan never uses keys in heap_beginscan
+ * (and this is very bad) - so here we do not check whether the keys are
+ * ok or not.
+ */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecSeqScan(node)
+ *
+ * Scans the relation sequentially and returns the next qualifying
+ * tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecSeqScan(PlanState *pstate)
+{
+ SeqScanState *node = castNode(SeqScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) SeqNext,
+ (ExecScanRecheckMtd) SeqRecheck);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecInitSeqScan
+ * ----------------------------------------------------------------
+ */
+SeqScanState *
+ExecInitSeqScan(SeqScan *node, EState *estate, int eflags)
+{
+ SeqScanState *scanstate;
+
+ /*
+ * Once upon a time it was possible to have an outerPlan of a SeqScan, but
+ * not any more.
+ */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create state structure
+ */
+ scanstate = makeNode(SeqScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecSeqScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * open the scan relation
+ */
+ scanstate->ss.ss_currentRelation =
+ ExecOpenScanRelation(estate,
+ node->scanrelid,
+ eflags);
+
+ /* and create slot with the appropriate rowtype */
+ ExecInitScanTupleSlot(estate, &scanstate->ss,
+ RelationGetDescr(scanstate->ss.ss_currentRelation),
+ table_slot_callbacks(scanstate->ss.ss_currentRelation));
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->plan.qual, (PlanState *) scanstate);
+
+ return scanstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndSeqScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndSeqScan(SeqScanState *node)
+{
+ TableScanDesc scanDesc;
+
+ /*
+ * get information from node
+ */
+ scanDesc = node->ss.ss_currentScanDesc;
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * close heap scan
+ */
+ if (scanDesc != NULL)
+ table_endscan(scanDesc);
+}
+
+/* ----------------------------------------------------------------
+ * Join Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecReScanSeqScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanSeqScan(SeqScanState *node)
+{
+ TableScanDesc scan;
+
+ scan = node->ss.ss_currentScanDesc;
+
+ if (scan != NULL)
+ table_rescan(scan, /* scan desc */
+ NULL); /* new scan keys */
+
+ ExecScanReScan((ScanState *) node);
+}
+
+/* ----------------------------------------------------------------
+ * Parallel Scan Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecSeqScanEstimate
+ *
+ * Compute the amount of space we'll need in the parallel
+ * query DSM, and inform pcxt->estimator about our needs.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSeqScanEstimate(SeqScanState *node,
+ ParallelContext *pcxt)
+{
+ EState *estate = node->ss.ps.state;
+
+ node->pscan_len = table_parallelscan_estimate(node->ss.ss_currentRelation,
+ estate->es_snapshot);
+ shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecSeqScanInitializeDSM
+ *
+ * Set up a parallel heap scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSeqScanInitializeDSM(SeqScanState *node,
+ ParallelContext *pcxt)
+{
+ EState *estate = node->ss.ps.state;
+ ParallelTableScanDesc pscan;
+
+ pscan = shm_toc_allocate(pcxt->toc, node->pscan_len);
+ table_parallelscan_initialize(node->ss.ss_currentRelation,
+ pscan,
+ estate->es_snapshot);
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan);
+ node->ss.ss_currentScanDesc =
+ table_beginscan_parallel(node->ss.ss_currentRelation, pscan);
+}
+
+/* ----------------------------------------------------------------
+ * ExecSeqScanReInitializeDSM
+ *
+ * Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSeqScanReInitializeDSM(SeqScanState *node,
+ ParallelContext *pcxt)
+{
+ ParallelTableScanDesc pscan;
+
+ pscan = node->ss.ss_currentScanDesc->rs_parallel;
+ table_parallelscan_reinitialize(node->ss.ss_currentRelation, pscan);
+}
+
+/* ----------------------------------------------------------------
+ * ExecSeqScanInitializeWorker
+ *
+ * Copy relevant information from TOC into planstate.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSeqScanInitializeWorker(SeqScanState *node,
+ ParallelWorkerContext *pwcxt)
+{
+ ParallelTableScanDesc pscan;
+
+ pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+ node->ss.ss_currentScanDesc =
+ table_beginscan_parallel(node->ss.ss_currentRelation, pscan);
+}
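+
+/*
+ * Overview (not from the upstream source): the three DSM hooks above run in
+ * sequence for a parallel seq scan.  The leader calls ExecSeqScanEstimate
+ * and ExecSeqScanInitializeDSM while building the parallel context; each
+ * worker then calls ExecSeqScanInitializeWorker to look up the shared
+ * ParallelTableScanDesc by plan_node_id and attach to the same scan, with
+ * block assignment coordinated by the table AM's parallel scan machinery.
+ */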
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
new file mode 100644
index 0000000..aad7ac0
--- /dev/null
+++ b/src/backend/executor/nodeSetOp.c
@@ -0,0 +1,651 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeSetOp.c
+ * Routines to handle INTERSECT and EXCEPT selection
+ *
+ * The input of a SetOp node consists of tuples from two relations,
+ * which have been combined into one dataset, with a junk attribute added
+ * that shows which relation each tuple came from. In SETOP_SORTED mode,
+ * the input has furthermore been sorted according to all the grouping
+ * columns (ie, all the non-junk attributes). The SetOp node scans each
+ * group of identical tuples to determine how many came from each input
+ * relation. Then it is a simple matter to emit the output demanded by the
+ * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ *
+ * In SETOP_HASHED mode, the input is delivered in no particular order,
+ * except that we know all the tuples from one input relation will come before
+ * all the tuples of the other. The planner guarantees that the first input
+ * relation is the left-hand one for EXCEPT, and tries to make the smaller
+ * input relation come first for INTERSECT. We build a hash table in memory
+ * with one entry for each group of identical tuples, and count the number of
+ * tuples in the group from each relation. After seeing all the input, we
+ * scan the hashtable and generate the correct output using those counts.
+ * We can avoid making hashtable entries for any tuples appearing only in the
+ * second input relation, since they cannot result in any output.
+ *
+ * This node type is not used for UNION or UNION ALL, since those can be
+ * implemented more cheaply (there's no need for the junk attribute to
+ * identify the source relation).
+ *
+ * Note that SetOp does no qual checking nor projection. The delivered
+ * output tuples are just copies of the first-to-arrive tuple in each
+ * input group.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeSetOp.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/executor.h"
+#include "executor/nodeSetOp.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+/*
+ * SetOpStatePerGroupData - per-group working state
+ *
+ * These values are working state that is initialized at the start of
+ * an input tuple group and updated for each input tuple.
+ *
+ * In SETOP_SORTED mode, we need only one of these structs, and it's kept in
+ * the plan state node. In SETOP_HASHED mode, the hash table contains one
+ * of these for each tuple group.
+ */
+typedef struct SetOpStatePerGroupData
+{
+ long numLeft; /* number of left-input dups in group */
+ long numRight; /* number of right-input dups in group */
+} SetOpStatePerGroupData;
+
+
+static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate);
+static void setop_fill_hash_table(SetOpState *setopstate);
+static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);
+
+
+/*
+ * Initialize state for a new group of input values.
+ */
+static inline void
+initialize_counts(SetOpStatePerGroup pergroup)
+{
+ pergroup->numLeft = pergroup->numRight = 0;
+}
+
+/*
+ * Advance the appropriate counter for one input tuple.
+ */
+static inline void
+advance_counts(SetOpStatePerGroup pergroup, int flag)
+{
+ if (flag)
+ pergroup->numRight++;
+ else
+ pergroup->numLeft++;
+}
+
+/*
+ * Fetch the "flag" column from an input tuple.
+ * This is an integer column with value 0 for left side, 1 for right side.
+ */
+static int
+fetch_tuple_flag(SetOpState *setopstate, TupleTableSlot *inputslot)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+ int flag;
+ bool isNull;
+
+ flag = DatumGetInt32(slot_getattr(inputslot,
+ node->flagColIdx,
+ &isNull));
+ Assert(!isNull);
+ Assert(flag == 0 || flag == 1);
+ return flag;
+}
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(SetOpState *setopstate)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+ ExprContext *econtext = setopstate->ps.ps_ExprContext;
+ TupleDesc desc = ExecGetResultType(outerPlanState(setopstate));
+
+ Assert(node->strategy == SETOP_HASHED);
+ Assert(node->numGroups > 0);
+
+ setopstate->hashtable = BuildTupleHashTableExt(&setopstate->ps,
+ desc,
+ node->numCols,
+ node->dupColIdx,
+ setopstate->eqfuncoids,
+ setopstate->hashfunctions,
+ node->dupCollations,
+ node->numGroups,
+ 0,
+ setopstate->ps.state->es_query_cxt,
+ setopstate->tableContext,
+ econtext->ecxt_per_tuple_memory,
+ false);
+}
+
+/*
+ * We've completed processing a tuple group. Decide how many copies (if any)
+ * of its representative row to emit, and store the count into numOutput.
+ * This logic is straight from the SQL92 specification.
+ */
+static void
+set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+ SetOp *plannode = (SetOp *) setopstate->ps.plan;
+
+ switch (plannode->cmd)
+ {
+ case SETOPCMD_INTERSECT:
+ if (pergroup->numLeft > 0 && pergroup->numRight > 0)
+ setopstate->numOutput = 1;
+ else
+ setopstate->numOutput = 0;
+ break;
+ case SETOPCMD_INTERSECT_ALL:
+ setopstate->numOutput =
+ (pergroup->numLeft < pergroup->numRight) ?
+ pergroup->numLeft : pergroup->numRight;
+ break;
+ case SETOPCMD_EXCEPT:
+ if (pergroup->numLeft > 0 && pergroup->numRight == 0)
+ setopstate->numOutput = 1;
+ else
+ setopstate->numOutput = 0;
+ break;
+ case SETOPCMD_EXCEPT_ALL:
+ setopstate->numOutput =
+ (pergroup->numLeft < pergroup->numRight) ?
+ 0 : (pergroup->numLeft - pergroup->numRight);
+ break;
+ default:
+ elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
+ break;
+ }
+}
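+
+/*
+ * Worked example (not from the upstream source): suppose a group has
+ * numLeft = 3 and numRight = 2.  Then INTERSECT emits 1 copy, INTERSECT ALL
+ * emits min(3, 2) = 2, EXCEPT emits 0 (numRight is nonzero), and EXCEPT ALL
+ * emits 3 - 2 = 1.
+ */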
+
+
+/* ----------------------------------------------------------------
+ * ExecSetOp
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot * /* return: a tuple or NULL */
+ExecSetOp(PlanState *pstate)
+{
+ SetOpState *node = castNode(SetOpState, pstate);
+ SetOp *plannode = (SetOp *) node->ps.plan;
+ TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * If the previously-returned tuple needs to be returned more than once,
+ * keep returning it.
+ */
+ if (node->numOutput > 0)
+ {
+ node->numOutput--;
+ return resultTupleSlot;
+ }
+
+ /* Otherwise, we're done if we are out of groups */
+ if (node->setop_done)
+ return NULL;
+
+ /* Fetch the next tuple group according to the correct strategy */
+ if (plannode->strategy == SETOP_HASHED)
+ {
+ if (!node->table_filled)
+ setop_fill_hash_table(node);
+ return setop_retrieve_hash_table(node);
+ }
+ else
+ return setop_retrieve_direct(node);
+}
+
+/*
+ * ExecSetOp for non-hashed case
+ */
+static TupleTableSlot *
+setop_retrieve_direct(SetOpState *setopstate)
+{
+ PlanState *outerPlan;
+ SetOpStatePerGroup pergroup;
+ TupleTableSlot *outerslot;
+ TupleTableSlot *resultTupleSlot;
+ ExprContext *econtext = setopstate->ps.ps_ExprContext;
+
+ /*
+ * get state info from node
+ */
+ outerPlan = outerPlanState(setopstate);
+ pergroup = (SetOpStatePerGroup) setopstate->pergroup;
+ resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+
+ /*
+ * We loop retrieving groups until we find one we should return
+ */
+ while (!setopstate->setop_done)
+ {
+ /*
+ * If we don't already have the first tuple of the new group, fetch it
+ * from the outer plan.
+ */
+ if (setopstate->grp_firstTuple == NULL)
+ {
+ outerslot = ExecProcNode(outerPlan);
+ if (!TupIsNull(outerslot))
+ {
+ /* Make a copy of the first input tuple */
+ setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
+ }
+ else
+ {
+ /* outer plan produced no tuples at all */
+ setopstate->setop_done = true;
+ return NULL;
+ }
+ }
+
+ /*
+ * Store the copied first input tuple in the tuple table slot reserved
+ * for it. The tuple will be deleted when it is cleared from the
+ * slot.
+ */
+ ExecStoreHeapTuple(setopstate->grp_firstTuple,
+ resultTupleSlot,
+ true);
+ setopstate->grp_firstTuple = NULL; /* don't keep two pointers */
+
+ /* Initialize working state for a new input tuple group */
+ initialize_counts(pergroup);
+
+ /* Count the first input tuple */
+ advance_counts(pergroup,
+ fetch_tuple_flag(setopstate, resultTupleSlot));
+
+ /*
+ * Scan the outer plan until we exhaust it or cross a group boundary.
+ */
+ for (;;)
+ {
+ outerslot = ExecProcNode(outerPlan);
+ if (TupIsNull(outerslot))
+ {
+ /* no more outer-plan tuples available */
+ setopstate->setop_done = true;
+ break;
+ }
+
+ /*
+ * Check whether we've crossed a group boundary.
+ */
+ econtext->ecxt_outertuple = resultTupleSlot;
+ econtext->ecxt_innertuple = outerslot;
+
+ if (!ExecQualAndReset(setopstate->eqfunction, econtext))
+ {
+ /*
+ * Save the first input tuple of the next group.
+ */
+ setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
+ break;
+ }
+
+ /* Still in same group, so count this tuple */
+ advance_counts(pergroup,
+ fetch_tuple_flag(setopstate, outerslot));
+ }
+
+ /*
+ * Done scanning input tuple group. See if we should emit any copies
+ * of result tuple, and if so return the first copy.
+ */
+ set_output_count(setopstate, pergroup);
+
+ if (setopstate->numOutput > 0)
+ {
+ setopstate->numOutput--;
+ return resultTupleSlot;
+ }
+ }
+
+ /* No more groups */
+ ExecClearTuple(resultTupleSlot);
+ return NULL;
+}
+
+/*
+ * ExecSetOp for hashed case: phase 1, read input and build hash table
+ */
+static void
+setop_fill_hash_table(SetOpState *setopstate)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+ PlanState *outerPlan;
+ int firstFlag;
+ bool in_first_rel PG_USED_FOR_ASSERTS_ONLY;
+ ExprContext *econtext = setopstate->ps.ps_ExprContext;
+
+ /*
+ * get state info from node
+ */
+ outerPlan = outerPlanState(setopstate);
+ firstFlag = node->firstFlag;
+ /* verify planner didn't mess up */
+ Assert(firstFlag == 0 ||
+ (firstFlag == 1 &&
+ (node->cmd == SETOPCMD_INTERSECT ||
+ node->cmd == SETOPCMD_INTERSECT_ALL)));
+
+ /*
+ * Process each outer-plan tuple, and then fetch the next one, until we
+ * exhaust the outer plan.
+ */
+ in_first_rel = true;
+ for (;;)
+ {
+ TupleTableSlot *outerslot;
+ int flag;
+ TupleHashEntryData *entry;
+ bool isnew;
+
+ outerslot = ExecProcNode(outerPlan);
+ if (TupIsNull(outerslot))
+ break;
+
+ /* Identify whether it's left or right input */
+ flag = fetch_tuple_flag(setopstate, outerslot);
+
+ if (flag == firstFlag)
+ {
+ /* (still) in first input relation */
+ Assert(in_first_rel);
+
+ /* Find or build hashtable entry for this tuple's group */
+ entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
+ &isnew, NULL);
+
+ /* If new tuple group, initialize counts */
+ if (isnew)
+ {
+ entry->additional = (SetOpStatePerGroup)
+ MemoryContextAlloc(setopstate->hashtable->tablecxt,
+ sizeof(SetOpStatePerGroupData));
+ initialize_counts((SetOpStatePerGroup) entry->additional);
+ }
+
+ /* Advance the counts */
+ advance_counts((SetOpStatePerGroup) entry->additional, flag);
+ }
+ else
+ {
+ /* reached second relation */
+ in_first_rel = false;
+
+ /* For tuples not seen previously, do not make hashtable entry */
+ entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
+ NULL, NULL);
+
+ /* Advance the counts if entry is already present */
+ if (entry)
+ advance_counts((SetOpStatePerGroup) entry->additional, flag);
+ }
+
+ /* Must reset expression context after each hashtable lookup */
+ ResetExprContext(econtext);
+ }
+
+ setopstate->table_filled = true;
+ /* Initialize to walk the hash table */
+ ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
+}
+
+/*
+ * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
+ */
+static TupleTableSlot *
+setop_retrieve_hash_table(SetOpState *setopstate)
+{
+ TupleHashEntryData *entry;
+ TupleTableSlot *resultTupleSlot;
+
+ /*
+ * get state info from node
+ */
+ resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+
+ /*
+ * We loop retrieving groups until we find one we should return
+ */
+ while (!setopstate->setop_done)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Find the next entry in the hash table
+ */
+ entry = ScanTupleHashTable(setopstate->hashtable, &setopstate->hashiter);
+ if (entry == NULL)
+ {
+ /* No more entries in hashtable, so done */
+ setopstate->setop_done = true;
+ return NULL;
+ }
+
+ /*
+ * See if we should emit any copies of this tuple, and if so return
+ * the first copy.
+ */
+ set_output_count(setopstate, (SetOpStatePerGroup) entry->additional);
+
+ if (setopstate->numOutput > 0)
+ {
+ setopstate->numOutput--;
+ return ExecStoreMinimalTuple(entry->firstTuple,
+ resultTupleSlot,
+ false);
+ }
+ }
+
+ /* No more groups */
+ ExecClearTuple(resultTupleSlot);
+ return NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitSetOp
+ *
+ * This initializes the setop node state structures and
+ * the node's subplan.
+ * ----------------------------------------------------------------
+ */
+SetOpState *
+ExecInitSetOp(SetOp *node, EState *estate, int eflags)
+{
+ SetOpState *setopstate;
+ TupleDesc outerDesc;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ setopstate = makeNode(SetOpState);
+ setopstate->ps.plan = (Plan *) node;
+ setopstate->ps.state = estate;
+ setopstate->ps.ExecProcNode = ExecSetOp;
+
+ setopstate->eqfuncoids = NULL;
+ setopstate->hashfunctions = NULL;
+ setopstate->setop_done = false;
+ setopstate->numOutput = 0;
+ setopstate->pergroup = NULL;
+ setopstate->grp_firstTuple = NULL;
+ setopstate->hashtable = NULL;
+ setopstate->tableContext = NULL;
+
+ /*
+ * create expression context
+ */
+ ExecAssignExprContext(estate, &setopstate->ps);
+
+ /*
+ * If hashing, we also need a longer-lived context to store the hash
+ * table. The table can't just be kept in the per-query context because
+ * we want to be able to throw it away in ExecReScanSetOp.
+ */
+ if (node->strategy == SETOP_HASHED)
+ setopstate->tableContext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "SetOp hash table",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * initialize child nodes
+ *
+ * If we are hashing then the child plan does not need to handle REWIND
+ * efficiently; see ExecReScanSetOp.
+ */
+ if (node->strategy == SETOP_HASHED)
+ eflags &= ~EXEC_FLAG_REWIND;
+ outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);
+ outerDesc = ExecGetResultType(outerPlanState(setopstate));
+
+ /*
+ * Initialize result slot and type. Setop nodes do no projections, so
+ * initialize projection info for this node appropriately.
+ */
+ ExecInitResultTupleSlotTL(&setopstate->ps,
+ node->strategy == SETOP_HASHED ?
+ &TTSOpsMinimalTuple : &TTSOpsHeapTuple);
+ setopstate->ps.ps_ProjInfo = NULL;
+
+ /*
+ * Precompute fmgr lookup data for inner loop. We need both equality and
+ * hashing functions to do it by hashing, but only equality if not
+ * hashing.
+ */
+ if (node->strategy == SETOP_HASHED)
+ execTuplesHashPrepare(node->numCols,
+ node->dupOperators,
+ &setopstate->eqfuncoids,
+ &setopstate->hashfunctions);
+ else
+ setopstate->eqfunction =
+ execTuplesMatchPrepare(outerDesc,
+ node->numCols,
+ node->dupColIdx,
+ node->dupOperators,
+ node->dupCollations,
+ &setopstate->ps);
+
+ if (node->strategy == SETOP_HASHED)
+ {
+ build_hash_table(setopstate);
+ setopstate->table_filled = false;
+ }
+ else
+ {
+ setopstate->pergroup =
+ (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData));
+ }
+
+ return setopstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndSetOp
+ *
+ * This shuts down the subplan and frees resources allocated
+ * to this node.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndSetOp(SetOpState *node)
+{
+ /* clean up tuple table */
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ /* free subsidiary stuff including hashtable */
+ if (node->tableContext)
+ MemoryContextDelete(node->tableContext);
+ ExecFreeExprContext(&node->ps);
+
+ ExecEndNode(outerPlanState(node));
+}
+
+
+void
+ExecReScanSetOp(SetOpState *node)
+{
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+ node->setop_done = false;
+ node->numOutput = 0;
+
+ if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+ {
+ /*
+ * In the hashed case, if we haven't yet built the hash table then we
+ * can just return; nothing done yet, so nothing to undo. If subnode's
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else no reason to re-scan it at all.
+ */
+ if (!node->table_filled)
+ return;
+
+ /*
+ * If we do have the hash table and the subplan does not have any
+ * parameter changes, then we can just rescan the existing hash table;
+ * no need to build it again.
+ */
+ if (node->ps.lefttree->chgParam == NULL)
+ {
+ ResetTupleHashIterator(node->hashtable, &node->hashiter);
+ return;
+ }
+ }
+
+ /* Release first tuple of group, if we have made a copy */
+ if (node->grp_firstTuple != NULL)
+ {
+ heap_freetuple(node->grp_firstTuple);
+ node->grp_firstTuple = NULL;
+ }
+
+ /* Release any hashtable storage */
+ if (node->tableContext)
+ MemoryContextResetAndDeleteChildren(node->tableContext);
+
+ /* And rebuild empty hashtable if needed */
+ if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+ {
+ ResetTupleHashTable(node->hashtable);
+ node->table_filled = false;
+ }
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (node->ps.lefttree->chgParam == NULL)
+ ExecReScan(node->ps.lefttree);
+}
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
new file mode 100644
index 0000000..b99027e
--- /dev/null
+++ b/src/backend/executor/nodeSort.c
@@ -0,0 +1,430 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeSort.c
+ * Routines to handle sorting of relations.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeSort.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "executor/execdebug.h"
+#include "executor/nodeSort.h"
+#include "miscadmin.h"
+#include "utils/tuplesort.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecSort
+ *
+ * Sorts tuples from the outer subtree of the node using tuplesort,
+ * which saves the results in a temporary file or memory. After the
+ * initial call, returns a tuple from the file with each call.
+ *
+ * Conditions:
+ * -- none.
+ *
+ * Initial States:
+ * -- the outer child is prepared to return the first tuple.
+ * ----------------------------------------------------------------
+ */
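+/*
+ * Illustrative sketch (editorial note, not part of the upstream file): the
+ * tuplesort call sequence driven by this node is, in outline,
+ *
+ *    ts = tuplesort_begin_heap(...);
+ *    while ((slot = ExecProcNode(outerNode)) is not empty)
+ *        tuplesort_puttupleslot(ts, slot);
+ *    tuplesort_performsort(ts);
+ *    ... later calls fetch rows with tuplesort_gettupleslot(ts, ...) ...
+ *
+ * For a bounded sort (typically an ORDER BY under a small LIMIT, where the
+ * Limit node pushes a bound down via ExecSetTupleBound), tuplesort_set_bound
+ * lets tuplesort discard tuples that cannot be among the first N results.
+ */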
+static TupleTableSlot *
+ExecSort(PlanState *pstate)
+{
+ SortState *node = castNode(SortState, pstate);
+ EState *estate;
+ ScanDirection dir;
+ Tuplesortstate *tuplesortstate;
+ TupleTableSlot *slot;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get state info from node
+ */
+ SO1_printf("ExecSort: %s\n",
+ "entering routine");
+
+ estate = node->ss.ps.state;
+ dir = estate->es_direction;
+ tuplesortstate = (Tuplesortstate *) node->tuplesortstate;
+
+ /*
+ * If first time through, read all tuples from outer plan and pass them to
+ * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
+ */
+
+ if (!node->sort_Done)
+ {
+ Sort *plannode = (Sort *) node->ss.ps.plan;
+ PlanState *outerNode;
+ TupleDesc tupDesc;
+
+ SO1_printf("ExecSort: %s\n",
+ "sorting subplan");
+
+ /*
+ * Want to scan subplan in the forward direction while creating the
+ * sorted data.
+ */
+ estate->es_direction = ForwardScanDirection;
+
+ /*
+ * Initialize tuplesort module.
+ */
+ SO1_printf("ExecSort: %s\n",
+ "calling tuplesort_begin");
+
+ outerNode = outerPlanState(node);
+ tupDesc = ExecGetResultType(outerNode);
+
+ tuplesortstate = tuplesort_begin_heap(tupDesc,
+ plannode->numCols,
+ plannode->sortColIdx,
+ plannode->sortOperators,
+ plannode->collations,
+ plannode->nullsFirst,
+ work_mem,
+ NULL,
+ node->randomAccess);
+ if (node->bounded)
+ tuplesort_set_bound(tuplesortstate, node->bound);
+ node->tuplesortstate = (void *) tuplesortstate;
+
+ /*
+ * Scan the subplan and feed all the tuples to tuplesort.
+ */
+
+ for (;;)
+ {
+ slot = ExecProcNode(outerNode);
+
+ if (TupIsNull(slot))
+ break;
+
+ tuplesort_puttupleslot(tuplesortstate, slot);
+ }
+
+ /*
+ * Complete the sort.
+ */
+ tuplesort_performsort(tuplesortstate);
+
+ /*
+ * restore to user specified direction
+ */
+ estate->es_direction = dir;
+
+ /*
+ * finally set the sorted flag to true
+ */
+ node->sort_Done = true;
+ node->bounded_Done = node->bounded;
+ node->bound_Done = node->bound;
+ if (node->shared_info && node->am_worker)
+ {
+ TuplesortInstrumentation *si;
+
+ Assert(IsParallelWorker());
+ Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
+ si = &node->shared_info->sinstrument[ParallelWorkerNumber];
+ tuplesort_get_stats(tuplesortstate, si);
+ }
+ SO1_printf("ExecSort: %s\n", "sorting done");
+ }
+
+ SO1_printf("ExecSort: %s\n",
+ "retrieving tuple from tuplesort");
+
+ /*
+ * Get the first or next tuple from tuplesort. Returns NULL if no more
+ * tuples. Note that we only rely on slot tuple remaining valid until the
+ * next fetch from the tuplesort.
+ */
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ (void) tuplesort_gettupleslot(tuplesortstate,
+ ScanDirectionIsForward(dir),
+ false, slot, NULL);
+ return slot;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitSort
+ *
+ * Creates the run-time state information for the sort node
+ * produced by the planner and initializes its outer subtree.
+ * ----------------------------------------------------------------
+ */
+SortState *
+ExecInitSort(Sort *node, EState *estate, int eflags)
+{
+ SortState *sortstate;
+
+ SO1_printf("ExecInitSort: %s\n",
+ "initializing sort node");
+
+ /*
+ * create state structure
+ */
+ sortstate = makeNode(SortState);
+ sortstate->ss.ps.plan = (Plan *) node;
+ sortstate->ss.ps.state = estate;
+ sortstate->ss.ps.ExecProcNode = ExecSort;
+
+ /*
+ * We must have random access to the sort output to do backward scan or
+ * mark/restore. We also prefer to materialize the sort output if we
+ * might be called on to rewind and replay it many times.
+ */
+ sortstate->randomAccess = (eflags & (EXEC_FLAG_REWIND |
+ EXEC_FLAG_BACKWARD |
+ EXEC_FLAG_MARK)) != 0;
+
+ sortstate->bounded = false;
+ sortstate->sort_Done = false;
+ sortstate->tuplesortstate = NULL;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * Sort nodes don't initialize their ExprContexts because they never call
+ * ExecQual or ExecProject.
+ */
+
+ /*
+ * initialize child nodes
+ *
+ * We shield the child node from the need to support REWIND, BACKWARD, or
+ * MARK/RESTORE.
+ */
+ eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);
+
+ outerPlanState(sortstate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * Initialize scan slot and type.
+ */
+ ExecCreateScanSlotFromOuterPlan(estate, &sortstate->ss, &TTSOpsVirtual);
+
+ /*
+ * Initialize return slot and type. No need to initialize projection info
+ * because this node doesn't do projections.
+ */
+ ExecInitResultTupleSlotTL(&sortstate->ss.ps, &TTSOpsMinimalTuple);
+ sortstate->ss.ps.ps_ProjInfo = NULL;
+
+ SO1_printf("ExecInitSort: %s\n",
+ "sort node initialized");
+
+ return sortstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndSort(node)
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndSort(SortState *node)
+{
+ SO1_printf("ExecEndSort: %s\n",
+ "shutting down sort node");
+
+ /*
+ * clean out the tuple table
+ */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+ /* must drop pointer to sort result tuple */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ /*
+ * Release tuplesort resources
+ */
+ if (node->tuplesortstate != NULL)
+ tuplesort_end((Tuplesortstate *) node->tuplesortstate);
+ node->tuplesortstate = NULL;
+
+ /*
+ * shut down the subplan
+ */
+ ExecEndNode(outerPlanState(node));
+
+ SO1_printf("ExecEndSort: %s\n",
+ "sort node shutdown");
+}
+
+/* ----------------------------------------------------------------
+ * ExecSortMarkPos
+ *
+ * Calls tuplesort to save the current position in the sorted file.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSortMarkPos(SortState *node)
+{
+ /*
+ * if we haven't sorted yet, just return
+ */
+ if (!node->sort_Done)
+ return;
+
+ tuplesort_markpos((Tuplesortstate *) node->tuplesortstate);
+}
+
+/* ----------------------------------------------------------------
+ * ExecSortRestrPos
+ *
+ * Calls tuplesort to restore the last saved sort file position.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSortRestrPos(SortState *node)
+{
+ /*
+ * if we haven't sorted yet, just return.
+ */
+ if (!node->sort_Done)
+ return;
+
+ /*
+ * restore the scan to the previously marked position
+ */
+ tuplesort_restorepos((Tuplesortstate *) node->tuplesortstate);
+}
+
+void
+ExecReScanSort(SortState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ /*
+ * If we haven't sorted yet, just return. If outerplan's chgParam is not
+ * NULL then it will be re-scanned by ExecProcNode, else no reason to
+ * re-scan it at all.
+ */
+ if (!node->sort_Done)
+ return;
+
+ /* must drop pointer to sort result tuple */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ /*
+ * If subnode is to be rescanned then we forget previous sort results; we
+ * have to re-read the subplan and re-sort. Also must re-sort if the
+ * bounded-sort parameters changed or we didn't select randomAccess.
+ *
+ * Otherwise we can just rewind and rescan the sorted output.
+ */
+ if (outerPlan->chgParam != NULL ||
+ node->bounded != node->bounded_Done ||
+ node->bound != node->bound_Done ||
+ !node->randomAccess)
+ {
+ node->sort_Done = false;
+ tuplesort_end((Tuplesortstate *) node->tuplesortstate);
+ node->tuplesortstate = NULL;
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+ }
+ else
+ tuplesort_rescan((Tuplesortstate *) node->tuplesortstate);
+}
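+
+/*
+ * Illustrative note (editorial, not from the upstream source): a common
+ * rescan case is a Sort on the inner side of a nestloop join. If the inner
+ * subplan is uncorrelated, chgParam stays NULL across outer rows and each
+ * rescan merely rewinds the already-sorted output via tuplesort_rescan; if
+ * the inner side depends on a parameter from the outer side, chgParam is set
+ * and the input must be re-read and re-sorted from scratch.
+ */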
+
+/* ----------------------------------------------------------------
+ * Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ExecSortEstimate
+ *
+ * Estimate space required to propagate sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSortEstimate(SortState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = mul_size(pcxt->nworkers, sizeof(TuplesortInstrumentation));
+ size = add_size(size, offsetof(SharedSortInfo, sinstrument));
+ shm_toc_estimate_chunk(&pcxt->estimator, size);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ * ExecSortInitializeDSM
+ *
+ * Initialize DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = offsetof(SharedSortInfo, sinstrument)
+ + pcxt->nworkers * sizeof(TuplesortInstrumentation);
+ node->shared_info = shm_toc_allocate(pcxt->toc, size);
+ /* ensure any unfilled slots will contain zeroes */
+ memset(node->shared_info, 0, size);
+ node->shared_info->num_workers = pcxt->nworkers;
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+ node->shared_info);
+}
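+
+/*
+ * Illustrative note (editorial; struct quoted from memory, see execnodes.h):
+ * SharedSortInfo is a variable-length struct, roughly
+ *
+ *    typedef struct SharedSortInfo
+ *    {
+ *        int num_workers;
+ *        TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
+ *    } SharedSortInfo;
+ *
+ * which is why the shared-memory size is computed as
+ * offsetof(SharedSortInfo, sinstrument) plus one TuplesortInstrumentation
+ * per worker, here as well as in ExecSortEstimate and
+ * ExecSortRetrieveInstrumentation.
+ */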
+
+/* ----------------------------------------------------------------
+ * ExecSortInitializeWorker
+ *
+ * Attach worker to DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSortInitializeWorker(SortState *node, ParallelWorkerContext *pwcxt)
+{
+ node->shared_info =
+ shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+ node->am_worker = true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecSortRetrieveInstrumentation
+ *
+ * Transfer sort statistics from DSM to private memory.
+ * ----------------------------------------------------------------
+ */
+void
+ExecSortRetrieveInstrumentation(SortState *node)
+{
+ Size size;
+ SharedSortInfo *si;
+
+ if (node->shared_info == NULL)
+ return;
+
+ size = offsetof(SharedSortInfo, sinstrument)
+ + node->shared_info->num_workers * sizeof(TuplesortInstrumentation);
+ si = palloc(size);
+ memcpy(si, node->shared_info, size);
+ node->shared_info = si;
+}
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
new file mode 100644
index 0000000..d46227e
--- /dev/null
+++ b/src/backend/executor/nodeSubplan.c
@@ -0,0 +1,1313 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeSubplan.c
+ * routines to support sub-selects appearing in expressions
+ *
+ * This module is concerned with executing SubPlan expression nodes, which
+ * should not be confused with sub-SELECTs appearing in FROM. SubPlans are
+ * divided into "initplans", which are those that need only one evaluation per
+ * query (among other restrictions, this requires that they don't use any
+ * direct correlation variables from the parent plan level), and "regular"
+ * subplans, which are re-evaluated every time their result is required.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeSubplan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecSubPlan - process a subselect
+ * ExecInitSubPlan - initialize a subselect
+ */
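+/*
+ * Illustrative examples (editorial, not part of the upstream source):
+ *
+ *    SELECT * FROM t WHERE t.x > (SELECT max(y) FROM u);
+ *
+ * The uncorrelated scalar sub-select is typically planned as an initplan: it
+ * runs at most once and its result reaches the parent through a PARAM_EXEC
+ * parameter (see ExecSetParamPlan below).
+ *
+ *    SELECT * FROM t WHERE t.x IN (SELECT y FROM u WHERE u.z = t.z);
+ *
+ * When the planner does not convert it to a join, the correlated sub-select
+ * becomes a regular SubPlan that ExecSubPlan re-evaluates for each outer
+ * row, passing t.z down as a parameter.
+ */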
+#include "postgres.h"
+
+#include <limits.h>
+#include <math.h>
+
+#include "access/htup_details.h"
+#include "executor/executor.h"
+#include "executor/nodeSubplan.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "utils/array.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+
+static Datum ExecHashSubPlan(SubPlanState *node,
+ ExprContext *econtext,
+ bool *isNull);
+static Datum ExecScanSubPlan(SubPlanState *node,
+ ExprContext *econtext,
+ bool *isNull);
+static void buildSubPlanHash(SubPlanState *node, ExprContext *econtext);
+static bool findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot,
+ FmgrInfo *eqfunctions);
+static bool slotAllNulls(TupleTableSlot *slot);
+static bool slotNoNulls(TupleTableSlot *slot);
+
+
+/* ----------------------------------------------------------------
+ * ExecSubPlan
+ *
+ * This is the main entry point for execution of a regular SubPlan.
+ * ----------------------------------------------------------------
+ */
+Datum
+ExecSubPlan(SubPlanState *node,
+ ExprContext *econtext,
+ bool *isNull)
+{
+ SubPlan *subplan = node->subplan;
+ EState *estate = node->planstate->state;
+ ScanDirection dir = estate->es_direction;
+ Datum retval;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Set non-null as default */
+ *isNull = false;
+
+ /* Sanity checks */
+ if (subplan->subLinkType == CTE_SUBLINK)
+ elog(ERROR, "CTE subplans should not be executed via ExecSubPlan");
+ if (subplan->setParam != NIL && subplan->subLinkType != MULTIEXPR_SUBLINK)
+ elog(ERROR, "cannot set parent params from subquery");
+
+ /* Force forward-scan mode for evaluation */
+ estate->es_direction = ForwardScanDirection;
+
+ /* Select appropriate evaluation strategy */
+ if (subplan->useHashTable)
+ retval = ExecHashSubPlan(node, econtext, isNull);
+ else
+ retval = ExecScanSubPlan(node, econtext, isNull);
+
+ /* restore scan direction */
+ estate->es_direction = dir;
+
+ return retval;
+}
+
+/*
+ * ExecHashSubPlan: store subselect result in an in-memory hash table
+ */
+static Datum
+ExecHashSubPlan(SubPlanState *node,
+ ExprContext *econtext,
+ bool *isNull)
+{
+ SubPlan *subplan = node->subplan;
+ PlanState *planstate = node->planstate;
+ TupleTableSlot *slot;
+
+ /* Shouldn't have any direct correlation Vars */
+ if (subplan->parParam != NIL || node->args != NIL)
+ elog(ERROR, "hashed subplan with direct correlation not supported");
+
+ /*
+ * If first time through or we need to rescan the subplan, build the hash
+ * table.
+ */
+ if (node->hashtable == NULL || planstate->chgParam != NULL)
+ buildSubPlanHash(node, econtext);
+
+ /*
+ * The result for an empty subplan is always FALSE; no need to evaluate
+ * lefthand side.
+ */
+ *isNull = false;
+ if (!node->havehashrows && !node->havenullrows)
+ return BoolGetDatum(false);
+
+ /*
+ * Evaluate lefthand expressions and form a projection tuple. First we
+ * have to set the econtext to use (hack alert!).
+ */
+ node->projLeft->pi_exprContext = econtext;
+ slot = ExecProject(node->projLeft);
+
+ /*
+ * Note: because we are typically called in a per-tuple context, we have
+ * to explicitly clear the projected tuple before returning. Otherwise,
+ * we'll have a double-free situation: the per-tuple context will probably
+ * be reset before we're called again, and then the tuple slot will think
+ * it still needs to free the tuple.
+ */
+
+ /*
+ * If the LHS is all non-null, probe for an exact match in the main hash
+ * table. If we find one, the result is TRUE. Otherwise, scan the
+ * partly-null table to see if there are any rows that aren't provably
+ * unequal to the LHS; if so, the result is UNKNOWN. (We skip that part
+ * if we don't care about UNKNOWN.) Otherwise, the result is FALSE.
+ *
+ * Note: the reason we can avoid a full scan of the main hash table is
+ * that the combining operators are assumed never to yield NULL when both
+ * inputs are non-null. If they were to do so, we might need to produce
+ * UNKNOWN instead of FALSE because of an UNKNOWN result in comparing the
+ * LHS to some main-table entry --- which is a comparison we will not even
+ * make, unless there's a chance match of hash keys.
+ */
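+
+ /*
+ * Illustrative example (editorial): for "1 IN (SELECT y FROM u)" where u
+ * contains no y = 1 but does contain a NULL y, SQL requires the result to
+ * be UNKNOWN rather than FALSE, which is why the partly-null table must be
+ * consulted before concluding FALSE.
+ */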
+ if (slotNoNulls(slot))
+ {
+ if (node->havehashrows &&
+ FindTupleHashEntry(node->hashtable,
+ slot,
+ node->cur_eq_comp,
+ node->lhs_hash_funcs) != NULL)
+ {
+ ExecClearTuple(slot);
+ return BoolGetDatum(true);
+ }
+ if (node->havenullrows &&
+ findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs))
+ {
+ ExecClearTuple(slot);
+ *isNull = true;
+ return BoolGetDatum(false);
+ }
+ ExecClearTuple(slot);
+ return BoolGetDatum(false);
+ }
+
+ /*
+ * When the LHS is partly or wholly NULL, we can never return TRUE. If we
+ * don't care about UNKNOWN, just return FALSE. Otherwise, if the LHS is
+ * wholly NULL, immediately return UNKNOWN. (Since the combining
+ * operators are strict, the result could only be FALSE if the sub-select
+ * were empty, but we already handled that case.) Otherwise, we must scan
+ * both the main and partly-null tables to see if there are any rows that
+ * aren't provably unequal to the LHS; if so, the result is UNKNOWN.
+ * Otherwise, the result is FALSE.
+ */
+ if (node->hashnulls == NULL)
+ {
+ ExecClearTuple(slot);
+ return BoolGetDatum(false);
+ }
+ if (slotAllNulls(slot))
+ {
+ ExecClearTuple(slot);
+ *isNull = true;
+ return BoolGetDatum(false);
+ }
+ /* Scan partly-null table first, since more likely to get a match */
+ if (node->havenullrows &&
+ findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs))
+ {
+ ExecClearTuple(slot);
+ *isNull = true;
+ return BoolGetDatum(false);
+ }
+ if (node->havehashrows &&
+ findPartialMatch(node->hashtable, slot, node->cur_eq_funcs))
+ {
+ ExecClearTuple(slot);
+ *isNull = true;
+ return BoolGetDatum(false);
+ }
+ ExecClearTuple(slot);
+ return BoolGetDatum(false);
+}
+
+/*
+ * ExecScanSubPlan: default case where we have to rescan subplan each time
+ */
+static Datum
+ExecScanSubPlan(SubPlanState *node,
+ ExprContext *econtext,
+ bool *isNull)
+{
+ SubPlan *subplan = node->subplan;
+ PlanState *planstate = node->planstate;
+ SubLinkType subLinkType = subplan->subLinkType;
+ MemoryContext oldcontext;
+ TupleTableSlot *slot;
+ Datum result;
+ bool found = false; /* true if got at least one subplan tuple */
+ ListCell *pvar;
+ ListCell *l;
+ ArrayBuildStateAny *astate = NULL;
+
+ /*
+ * MULTIEXPR subplans, when "executed", just return NULL; but first we
+ * mark the subplan's output parameters as needing recalculation. (This
+ * is a bit of a hack: it relies on the subplan appearing later in its
+ * targetlist than any of the referencing Params, so that all the Params
+ * have been evaluated before we re-mark them for the next evaluation
+ * cycle. But in general resjunk tlist items appear after non-resjunk
+ * ones, so this should be safe.) Unlike ExecReScanSetParamPlan, we do
+ * *not* set bits in the parent plan node's chgParam, because we don't
+ * want to cause a rescan of the parent.
+ */
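+
+ /*
+ * Illustrative example (editorial): MULTIEXPR subplans arise from
+ * multi-column assignment in UPDATE, e.g.
+ *    UPDATE t SET (a, b) = (SELECT x, y FROM u WHERE u.id = t.id);
+ * where the referencing Params carry the individual column values.
+ */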
+ if (subLinkType == MULTIEXPR_SUBLINK)
+ {
+ EState *estate = node->parent->state;
+
+ foreach(l, subplan->setParam)
+ {
+ int paramid = lfirst_int(l);
+ ParamExecData *prm = &(estate->es_param_exec_vals[paramid]);
+
+ prm->execPlan = node;
+ }
+ *isNull = true;
+ return (Datum) 0;
+ }
+
+ /* Initialize ArrayBuildStateAny in caller's context, if needed */
+ if (subLinkType == ARRAY_SUBLINK)
+ astate = initArrayResultAny(subplan->firstColType,
+ CurrentMemoryContext, true);
+
+ /*
+ * We are probably in a short-lived expression-evaluation context. Switch
+ * to the per-query context for manipulating the child plan's chgParam,
+ * calling ExecProcNode on it, etc.
+ */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ /*
+ * Set Params of this plan from parent plan correlation values. (Any
+ * calculation we have to do is done in the parent econtext, since the
+ * Param values don't need to have per-query lifetime.)
+ */
+ Assert(list_length(subplan->parParam) == list_length(node->args));
+
+ forboth(l, subplan->parParam, pvar, node->args)
+ {
+ int paramid = lfirst_int(l);
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
+ econtext,
+ &(prm->isnull));
+ planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
+ }
+
+ /*
+ * Now that we've set up its parameters, we can reset the subplan.
+ */
+ ExecReScan(planstate);
+
+ /*
+ * For all sublink types except EXPR_SUBLINK and ARRAY_SUBLINK, the result
+ * is boolean as are the results of the combining operators. We combine
+ * results across tuples (if the subplan produces more than one) using OR
+ * semantics for ANY_SUBLINK or AND semantics for ALL_SUBLINK.
+ * (ROWCOMPARE_SUBLINK doesn't allow multiple tuples from the subplan.)
+ * NULL results from the combining operators are handled according to the
+ * usual SQL semantics for OR and AND. The result for no input tuples is
+ * FALSE for ANY_SUBLINK, TRUE for ALL_SUBLINK, NULL for
+ * ROWCOMPARE_SUBLINK.
+ *
+ * For EXPR_SUBLINK we require the subplan to produce no more than one
+ * tuple, else an error is raised. If zero tuples are produced, we return
+ * NULL. Assuming we get a tuple, we just use its first column (there can
+ * be only one non-junk column in this case).
+ *
+ * For ARRAY_SUBLINK we allow the subplan to produce any number of tuples,
+ * and form an array of the first column's values. Note in particular
+ * that we produce a zero-element array if no tuples are produced (this is
+ * a change from pre-8.3 behavior of returning NULL).
+ */
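+
+ /*
+ * Illustrative mapping (editorial): ANY_SUBLINK corresponds to
+ * "x = ANY (SELECT ...)" / "x IN (SELECT ...)", ALL_SUBLINK to
+ * "x <> ALL (SELECT ...)" / "x NOT IN (SELECT ...)", EXPR_SUBLINK to a
+ * scalar sub-select such as "(SELECT count(*) FROM u)", and ARRAY_SUBLINK
+ * to "ARRAY(SELECT ...)", which yields an empty array when no rows are
+ * returned.
+ */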
+ result = BoolGetDatum(subLinkType == ALL_SUBLINK);
+ *isNull = false;
+
+ for (slot = ExecProcNode(planstate);
+ !TupIsNull(slot);
+ slot = ExecProcNode(planstate))
+ {
+ TupleDesc tdesc = slot->tts_tupleDescriptor;
+ Datum rowresult;
+ bool rownull;
+ int col;
+ ListCell *plst;
+
+ if (subLinkType == EXISTS_SUBLINK)
+ {
+ found = true;
+ result = BoolGetDatum(true);
+ break;
+ }
+
+ if (subLinkType == EXPR_SUBLINK)
+ {
+ /* cannot allow multiple input tuples for EXPR sublink */
+ if (found)
+ ereport(ERROR,
+ (errcode(ERRCODE_CARDINALITY_VIOLATION),
+ errmsg("more than one row returned by a subquery used as an expression")));
+ found = true;
+
+ /*
+ * We need to copy the subplan's tuple in case the result is of
+ * pass-by-ref type --- our return value will point into this
+ * copied tuple! Can't use the subplan's instance of the tuple
+ * since it won't still be valid after next ExecProcNode() call.
+ * node->curTuple keeps track of the copied tuple for eventual
+ * freeing.
+ */
+ if (node->curTuple)
+ heap_freetuple(node->curTuple);
+ node->curTuple = ExecCopySlotHeapTuple(slot);
+
+ result = heap_getattr(node->curTuple, 1, tdesc, isNull);
+ /* keep scanning subplan to make sure there's only one tuple */
+ continue;
+ }
+
+ if (subLinkType == ARRAY_SUBLINK)
+ {
+ Datum dvalue;
+ bool disnull;
+
+ found = true;
+ /* stash away current value */
+ Assert(subplan->firstColType == TupleDescAttr(tdesc, 0)->atttypid);
+ dvalue = slot_getattr(slot, 1, &disnull);
+ astate = accumArrayResultAny(astate, dvalue, disnull,
+ subplan->firstColType, oldcontext);
+ /* keep scanning subplan to collect all values */
+ continue;
+ }
+
+ /* cannot allow multiple input tuples for ROWCOMPARE sublink either */
+ if (subLinkType == ROWCOMPARE_SUBLINK && found)
+ ereport(ERROR,
+ (errcode(ERRCODE_CARDINALITY_VIOLATION),
+ errmsg("more than one row returned by a subquery used as an expression")));
+
+ found = true;
+
+ /*
+ * For ALL, ANY, and ROWCOMPARE sublinks, load up the Params
+ * representing the columns of the sub-select, and then evaluate the
+ * combining expression.
+ */
+ col = 1;
+ foreach(plst, subplan->paramIds)
+ {
+ int paramid = lfirst_int(plst);
+ ParamExecData *prmdata;
+
+ prmdata = &(econtext->ecxt_param_exec_vals[paramid]);
+ Assert(prmdata->execPlan == NULL);
+ prmdata->value = slot_getattr(slot, col, &(prmdata->isnull));
+ col++;
+ }
+
+ rowresult = ExecEvalExprSwitchContext(node->testexpr, econtext,
+ &rownull);
+
+ if (subLinkType == ANY_SUBLINK)
+ {
+ /* combine across rows per OR semantics */
+ if (rownull)
+ *isNull = true;
+ else if (DatumGetBool(rowresult))
+ {
+ result = BoolGetDatum(true);
+ *isNull = false;
+ break; /* needn't look at any more rows */
+ }
+ }
+ else if (subLinkType == ALL_SUBLINK)
+ {
+ /* combine across rows per AND semantics */
+ if (rownull)
+ *isNull = true;
+ else if (!DatumGetBool(rowresult))
+ {
+ result = BoolGetDatum(false);
+ *isNull = false;
+ break; /* needn't look at any more rows */
+ }
+ }
+ else
+ {
+ /* must be ROWCOMPARE_SUBLINK */
+ result = rowresult;
+ *isNull = rownull;
+ }
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ if (subLinkType == ARRAY_SUBLINK)
+ {
+ /* We return the result in the caller's context */
+ result = makeArrayResultAny(astate, oldcontext, true);
+ }
+ else if (!found)
+ {
+ /*
+ * deal with empty subplan result. result/isNull were previously
+ * initialized correctly for all sublink types except EXPR and
+ * ROWCOMPARE; for those, return NULL.
+ */
+ if (subLinkType == EXPR_SUBLINK ||
+ subLinkType == ROWCOMPARE_SUBLINK)
+ {
+ result = (Datum) 0;
+ *isNull = true;
+ }
+ }
+
+ return result;
+}
+
+/*
+ * buildSubPlanHash: load hash table by scanning subplan output.
+ */
+static void
+buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
+{
+ SubPlan *subplan = node->subplan;
+ PlanState *planstate = node->planstate;
+ int ncols = node->numCols;
+ ExprContext *innerecontext = node->innerecontext;
+ MemoryContext oldcontext;
+ long nbuckets;
+ TupleTableSlot *slot;
+
+ Assert(subplan->subLinkType == ANY_SUBLINK);
+
+ /*
+ * If we already had any hash tables, reset 'em; otherwise create empty
+ * hash table(s).
+ *
+ * If we need to distinguish accurately between FALSE and UNKNOWN (i.e.,
+ * NULL) results of the IN operation, then we have to store subplan output
+ * rows that are partly or wholly NULL. We store such rows in a separate
+ * hash table that we expect will be much smaller than the main table. (We
+ * can use hashing to eliminate partly-null rows that are not distinct. We
+ * keep them separate to minimize the cost of the inevitable full-table
+ * searches; see findPartialMatch.)
+ *
+ * If it's not necessary to distinguish FALSE and UNKNOWN, then we don't
+ * need to store subplan output rows that contain NULL.
+ */
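+
+ /*
+ * Illustrative example (editorial): for "x IN (SELECT y FROM u)" with
+ * unknownEqFalse not set, a subplan row with y = 42 is stored in the main
+ * hashtable, while a row with y = NULL goes into hashnulls; for a non-NULL
+ * x that matches nothing, only the hashnulls entries can turn the FALSE
+ * result into UNKNOWN.
+ */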
+ MemoryContextReset(node->hashtablecxt);
+ node->havehashrows = false;
+ node->havenullrows = false;
+
+ nbuckets = (long) Min(planstate->plan->plan_rows, (double) LONG_MAX);
+ if (nbuckets < 1)
+ nbuckets = 1;
+
+ if (node->hashtable)
+ ResetTupleHashTable(node->hashtable);
+ else
+ node->hashtable = BuildTupleHashTableExt(node->parent,
+ node->descRight,
+ ncols,
+ node->keyColIdx,
+ node->tab_eq_funcoids,
+ node->tab_hash_funcs,
+ node->tab_collations,
+ nbuckets,
+ 0,
+ node->planstate->state->es_query_cxt,
+ node->hashtablecxt,
+ node->hashtempcxt,
+ false);
+
+ if (!subplan->unknownEqFalse)
+ {
+ if (ncols == 1)
+ nbuckets = 1; /* there can only be one entry */
+ else
+ {
+ nbuckets /= 16;
+ if (nbuckets < 1)
+ nbuckets = 1;
+ }
+
+ if (node->hashnulls)
+ ResetTupleHashTable(node->hashnulls);
+ else
+ node->hashnulls = BuildTupleHashTableExt(node->parent,
+ node->descRight,
+ ncols,
+ node->keyColIdx,
+ node->tab_eq_funcoids,
+ node->tab_hash_funcs,
+ node->tab_collations,
+ nbuckets,
+ 0,
+ node->planstate->state->es_query_cxt,
+ node->hashtablecxt,
+ node->hashtempcxt,
+ false);
+ }
+ else
+ node->hashnulls = NULL;
+
+ /*
+ * We are probably in a short-lived expression-evaluation context. Switch
+ * to the per-query context for manipulating the child plan.
+ */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ /*
+ * Reset subplan to start.
+ */
+ ExecReScan(planstate);
+
+ /*
+ * Scan the subplan and load the hash table(s). Note that when there are
+ * duplicate rows coming out of the sub-select, only one copy is stored.
+ */
+ for (slot = ExecProcNode(planstate);
+ !TupIsNull(slot);
+ slot = ExecProcNode(planstate))
+ {
+ int col = 1;
+ ListCell *plst;
+ bool isnew;
+
+ /*
+ * Load up the Params representing the raw sub-select outputs, then
+ * form the projection tuple to store in the hashtable.
+ */
+ foreach(plst, subplan->paramIds)
+ {
+ int paramid = lfirst_int(plst);
+ ParamExecData *prmdata;
+
+ prmdata = &(innerecontext->ecxt_param_exec_vals[paramid]);
+ Assert(prmdata->execPlan == NULL);
+ prmdata->value = slot_getattr(slot, col,
+ &(prmdata->isnull));
+ col++;
+ }
+ slot = ExecProject(node->projRight);
+
+ /*
+ * If result contains any nulls, store separately or not at all.
+ */
+ if (slotNoNulls(slot))
+ {
+ (void) LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL);
+ node->havehashrows = true;
+ }
+ else if (node->hashnulls)
+ {
+ (void) LookupTupleHashEntry(node->hashnulls, slot, &isnew, NULL);
+ node->havenullrows = true;
+ }
+
+ /*
+ * Reset innerecontext after each inner tuple to free any memory used
+ * during ExecProject.
+ */
+ ResetExprContext(innerecontext);
+ }
+
+ /*
+ * Since the projected tuples are in the sub-query's context and not the
+ * main context, we'd better clear the tuple slot before there's any
+ * chance of a reset of the sub-query's context. Else we will have the
+ * potential for a double free attempt. (XXX possibly no longer needed,
+ * but can't hurt.)
+ */
+ ExecClearTuple(node->projRight->pi_state.resultslot);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * execTuplesUnequal
+ * Return true if two tuples are definitely unequal in the indicated
+ * fields.
+ *
+ * Nulls are neither equal nor unequal to anything else. A true result
+ * is obtained only if there are non-null fields that compare not-equal.
+ *
+ * slot1, slot2: the tuples to compare (must have same columns!)
+ * numCols: the number of attributes to be examined
+ * matchColIdx: array of attribute column numbers
+ * eqFunctions: array of fmgr lookup info for the equality functions to use
+ * evalContext: short-term memory context for executing the functions
+ */
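+/*
+ * Illustrative example (editorial): with one match column, (1) vs (2) is
+ * provably unequal, but (1) vs (NULL) is not, since a NULL compares as
+ * "unknown" against anything; findPartialMatch therefore treats such rows
+ * as possible matches.
+ */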
+static bool
+execTuplesUnequal(TupleTableSlot *slot1,
+ TupleTableSlot *slot2,
+ int numCols,
+ AttrNumber *matchColIdx,
+ FmgrInfo *eqfunctions,
+ const Oid *collations,
+ MemoryContext evalContext)
+{
+ MemoryContext oldContext;
+ bool result;
+ int i;
+
+ /* Reset and switch into the temp context. */
+ MemoryContextReset(evalContext);
+ oldContext = MemoryContextSwitchTo(evalContext);
+
+ /*
+ * We cannot report a match without checking all the fields, but we can
+ * report a non-match as soon as we find unequal fields. So, start
+ * comparing at the last field (least significant sort key). That's the
+ * most likely to be different if we are dealing with sorted input.
+ */
+ result = false;
+
+ for (i = numCols; --i >= 0;)
+ {
+ AttrNumber att = matchColIdx[i];
+ Datum attr1,
+ attr2;
+ bool isNull1,
+ isNull2;
+
+ attr1 = slot_getattr(slot1, att, &isNull1);
+
+ if (isNull1)
+ continue; /* can't prove anything here */
+
+ attr2 = slot_getattr(slot2, att, &isNull2);
+
+ if (isNull2)
+ continue; /* can't prove anything here */
+
+ /* Apply the type-specific equality function */
+ if (!DatumGetBool(FunctionCall2Coll(&eqfunctions[i],
+ collations[i],
+ attr1, attr2)))
+ {
+ result = true; /* they are unequal */
+ break;
+ }
+ }
+
+ MemoryContextSwitchTo(oldContext);
+
+ return result;
+}
+
+/*
+ * findPartialMatch: does the hashtable contain an entry that is not
+ * provably distinct from the tuple?
+ *
+ * We have to scan the whole hashtable; we can't usefully use hashkeys
+ * to guide probing, since we might get partial matches on tuples with
+ * hashkeys quite unrelated to what we'd get from the given tuple.
+ *
+ * Caller must provide the equality functions to use, since in cross-type
+ * cases these are different from the hashtable's internal functions.
+ */
+static bool
+findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot,
+ FmgrInfo *eqfunctions)
+{
+ int numCols = hashtable->numCols;
+ AttrNumber *keyColIdx = hashtable->keyColIdx;
+ TupleHashIterator hashiter;
+ TupleHashEntry entry;
+
+ InitTupleHashIterator(hashtable, &hashiter);
+ while ((entry = ScanTupleHashTable(hashtable, &hashiter)) != NULL)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
+ if (!execTuplesUnequal(slot, hashtable->tableslot,
+ numCols, keyColIdx,
+ eqfunctions,
+ hashtable->tab_collations,
+ hashtable->tempcxt))
+ {
+ TermTupleHashIterator(&hashiter);
+ return true;
+ }
+ }
+ /* No TermTupleHashIterator call needed here */
+ return false;
+}
+
+/*
+ * slotAllNulls: is the slot completely NULL?
+ *
+ * This does not test for dropped columns, which is OK because we only
+ * use it on projected tuples.
+ */
+static bool
+slotAllNulls(TupleTableSlot *slot)
+{
+ int ncols = slot->tts_tupleDescriptor->natts;
+ int i;
+
+ for (i = 1; i <= ncols; i++)
+ {
+ if (!slot_attisnull(slot, i))
+ return false;
+ }
+ return true;
+}
+
+/*
+ * slotNoNulls: is the slot entirely not NULL?
+ *
+ * This does not test for dropped columns, which is OK because we only
+ * use it on projected tuples.
+ */
+static bool
+slotNoNulls(TupleTableSlot *slot)
+{
+ int ncols = slot->tts_tupleDescriptor->natts;
+ int i;
+
+ for (i = 1; i <= ncols; i++)
+ {
+ if (slot_attisnull(slot, i))
+ return false;
+ }
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitSubPlan
+ *
+ * Create a SubPlanState for a SubPlan; this is the SubPlan-specific part
+ * of ExecInitExpr(). We split it out so that it can be used for InitPlans
+ * as well as regular SubPlans. Note that we don't link the SubPlan into
+ * the parent's subPlan list, because that shouldn't happen for InitPlans.
+ * Instead, ExecInitExpr() does that one part.
+ * ----------------------------------------------------------------
+ */
+SubPlanState *
+ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
+{
+ SubPlanState *sstate = makeNode(SubPlanState);
+ EState *estate = parent->state;
+
+ sstate->subplan = subplan;
+
+ /* Link the SubPlanState to already-initialized subplan */
+ sstate->planstate = (PlanState *) list_nth(estate->es_subplanstates,
+ subplan->plan_id - 1);
+
+ /*
+ * This check can fail if the planner mistakenly puts a parallel-unsafe
+ * subplan into a parallelized subquery; see ExecSerializePlan.
+ */
+ if (sstate->planstate == NULL)
+ elog(ERROR, "subplan \"%s\" was not initialized",
+ subplan->plan_name);
+
+ /* Link to parent's state, too */
+ sstate->parent = parent;
+
+ /* Initialize subexpressions */
+ sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent);
+ sstate->args = ExecInitExprList(subplan->args, parent);
+
+ /*
+ * initialize my state
+ */
+ sstate->curTuple = NULL;
+ sstate->curArray = PointerGetDatum(NULL);
+ sstate->projLeft = NULL;
+ sstate->projRight = NULL;
+ sstate->hashtable = NULL;
+ sstate->hashnulls = NULL;
+ sstate->hashtablecxt = NULL;
+ sstate->hashtempcxt = NULL;
+ sstate->innerecontext = NULL;
+ sstate->keyColIdx = NULL;
+ sstate->tab_eq_funcoids = NULL;
+ sstate->tab_hash_funcs = NULL;
+ sstate->tab_eq_funcs = NULL;
+ sstate->tab_collations = NULL;
+ sstate->lhs_hash_funcs = NULL;
+ sstate->cur_eq_funcs = NULL;
+
+ /*
+ * If this is an initplan or MULTIEXPR subplan, it has output parameters
+ * that the parent plan will use, so mark those parameters as needing
+ * evaluation. We don't actually run the subplan until we first need one
+ * of its outputs.
+ *
+ * A CTE subplan's output parameter is never to be evaluated in the normal
+ * way, so skip this in that case.
+ *
+ * Note that we don't set parent->chgParam here: the parent plan hasn't
+ * been run yet, so no need to force it to re-run.
+ */
+ if (subplan->setParam != NIL && subplan->subLinkType != CTE_SUBLINK)
+ {
+ ListCell *lst;
+
+ foreach(lst, subplan->setParam)
+ {
+ int paramid = lfirst_int(lst);
+ ParamExecData *prm = &(estate->es_param_exec_vals[paramid]);
+
+ prm->execPlan = sstate;
+ }
+ }
+
+ /*
+ * If we are going to hash the subquery output, initialize relevant stuff.
+ * (We don't create the hashtable until needed, though.)
+ */
+ if (subplan->useHashTable)
+ {
+ int ncols,
+ i;
+ TupleDesc tupDescLeft;
+ TupleDesc tupDescRight;
+ Oid *cross_eq_funcoids;
+ TupleTableSlot *slot;
+ List *oplist,
+ *lefttlist,
+ *righttlist;
+ ListCell *l;
+
+ /* We need a memory context to hold the hash table(s) */
+ sstate->hashtablecxt =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "Subplan HashTable Context",
+ ALLOCSET_DEFAULT_SIZES);
+ /* and a small one for the hash tables to use as temp storage */
+ sstate->hashtempcxt =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "Subplan HashTable Temp Context",
+ ALLOCSET_SMALL_SIZES);
+ /* and a short-lived exprcontext for function evaluation */
+ sstate->innerecontext = CreateExprContext(estate);
+
+ /*
+ * We use ExecProject to evaluate the lefthand and righthand
+ * expression lists and form tuples. (You might think that we could
+ * use the sub-select's output tuples directly, but that is not the
+ * case if we had to insert any run-time coercions of the sub-select's
+ * output datatypes; anyway this avoids storing any resjunk columns
+ * that might be in the sub-select's output.) Run through the
+ * combining expressions to build tlists for the lefthand and
+ * righthand sides.
+ *
+ * We also extract the combining operators themselves to initialize
+ * the equality and hashing functions for the hash tables.
+ */
+ if (IsA(subplan->testexpr, OpExpr))
+ {
+ /* single combining operator */
+ oplist = list_make1(subplan->testexpr);
+ }
+ else if (is_andclause(subplan->testexpr))
+ {
+ /* multiple combining operators */
+ oplist = castNode(BoolExpr, subplan->testexpr)->args;
+ }
+ else
+ {
+ /* shouldn't see anything else in a hashable subplan */
+ elog(ERROR, "unrecognized testexpr type: %d",
+ (int) nodeTag(subplan->testexpr));
+ oplist = NIL; /* keep compiler quiet */
+ }
+ ncols = list_length(oplist);
+
+ lefttlist = righttlist = NIL;
+ sstate->numCols = ncols;
+ sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber));
+ sstate->tab_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid));
+ sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid));
+ sstate->tab_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
+ sstate->tab_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
+ sstate->lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
+ sstate->cur_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
+ /* we'll need the cross-type equality fns below, but not in sstate */
+ cross_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid));
+
+ i = 1;
+ foreach(l, oplist)
+ {
+ OpExpr *opexpr = lfirst_node(OpExpr, l);
+ Expr *expr;
+ TargetEntry *tle;
+ Oid rhs_eq_oper;
+ Oid left_hashfn;
+ Oid right_hashfn;
+
+ Assert(list_length(opexpr->args) == 2);
+
+ /* Process lefthand argument */
+ expr = (Expr *) linitial(opexpr->args);
+ tle = makeTargetEntry(expr,
+ i,
+ NULL,
+ false);
+ lefttlist = lappend(lefttlist, tle);
+
+ /* Process righthand argument */
+ expr = (Expr *) lsecond(opexpr->args);
+ tle = makeTargetEntry(expr,
+ i,
+ NULL,
+ false);
+ righttlist = lappend(righttlist, tle);
+
+ /* Lookup the equality function (potentially cross-type) */
+ cross_eq_funcoids[i - 1] = opexpr->opfuncid;
+ fmgr_info(opexpr->opfuncid, &sstate->cur_eq_funcs[i - 1]);
+ fmgr_info_set_expr((Node *) opexpr, &sstate->cur_eq_funcs[i - 1]);
+
+ /* Look up the equality function for the RHS type */
+ if (!get_compatible_hash_operators(opexpr->opno,
+ NULL, &rhs_eq_oper))
+ elog(ERROR, "could not find compatible hash operator for operator %u",
+ opexpr->opno);
+ sstate->tab_eq_funcoids[i - 1] = get_opcode(rhs_eq_oper);
+ fmgr_info(sstate->tab_eq_funcoids[i - 1],
+ &sstate->tab_eq_funcs[i - 1]);
+
+ /* Lookup the associated hash functions */
+ if (!get_op_hash_functions(opexpr->opno,
+ &left_hashfn, &right_hashfn))
+ elog(ERROR, "could not find hash function for hash operator %u",
+ opexpr->opno);
+ fmgr_info(left_hashfn, &sstate->lhs_hash_funcs[i - 1]);
+ fmgr_info(right_hashfn, &sstate->tab_hash_funcs[i - 1]);
+
+ /* Set collation */
+ sstate->tab_collations[i - 1] = opexpr->inputcollid;
+
+ /* keyColIdx is just column numbers 1..n */
+ sstate->keyColIdx[i - 1] = i;
+
+ i++;
+ }
+
+ /*
+ * Construct tupdescs, slots and projection nodes for left and right
+ * sides. The lefthand expressions will be evaluated in the parent
+ * plan node's exprcontext, which we don't have access to here.
+ * Fortunately we can just pass NULL for now and fill it in later
+ * (hack alert!). The righthand expressions will be evaluated in our
+ * own innerecontext.
+ */
+ tupDescLeft = ExecTypeFromTL(lefttlist);
+ slot = ExecInitExtraTupleSlot(estate, tupDescLeft, &TTSOpsVirtual);
+ sstate->projLeft = ExecBuildProjectionInfo(lefttlist,
+ NULL,
+ slot,
+ parent,
+ NULL);
+
+ sstate->descRight = tupDescRight = ExecTypeFromTL(righttlist);
+ slot = ExecInitExtraTupleSlot(estate, tupDescRight, &TTSOpsVirtual);
+ sstate->projRight = ExecBuildProjectionInfo(righttlist,
+ sstate->innerecontext,
+ slot,
+ sstate->planstate,
+ NULL);
+
+ /*
+ * Create comparator for lookups of rows in the table (potentially
+ * cross-type comparisons).
+ */
+ sstate->cur_eq_comp = ExecBuildGroupingEqual(tupDescLeft, tupDescRight,
+ &TTSOpsVirtual, &TTSOpsMinimalTuple,
+ ncols,
+ sstate->keyColIdx,
+ cross_eq_funcoids,
+ sstate->tab_collations,
+ parent);
+ }
+
+ return sstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecSetParamPlan
+ *
+ * Executes a subplan and sets its output parameters.
+ *
+ * This is called from ExecEvalParamExec() when the value of a PARAM_EXEC
+ * parameter is requested and the param's execPlan field is set (indicating
+ * that the param has not yet been evaluated). This allows lazy evaluation
+ * of initplans: we don't run the subplan until/unless we need its output.
+ * Note that this routine MUST clear the execPlan fields of the plan's
+ * output parameters after evaluating them!
+ *
+ * The results of this function are stored in the EState associated with the
+ * ExprContext (particularly, its ecxt_param_exec_vals); any pass-by-ref
+ * result Datums are allocated in the EState's per-query memory. The passed
+ * econtext can be any ExprContext belonging to that EState; which one is
+ * important only to the extent that the ExprContext's per-tuple memory
+ * context is used to evaluate any parameters passed down to the subplan.
+ * (Thus in principle, the shorter-lived the ExprContext the better, since
+ * that data isn't needed after we return. In practice, because initplan
+ * parameters are never more complex than Vars, Aggrefs, etc, evaluating them
+ * currently never leaks any memory anyway.)
+ * ----------------------------------------------------------------
+ */
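+/*
+ * Illustrative call path (editorial, not part of the upstream source): for
+ * "SELECT * FROM t WHERE t.x > (SELECT max(y) FROM u)", the initplan's
+ * output Param gets execPlan set at ExecInitSubPlan time; the first time
+ * the qual evaluates that Param, ExecEvalParamExec sees execPlan and calls
+ * this function, which runs the sub-select once and stores max(y) into the
+ * ParamExecData slot for all later references.
+ */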
+void
+ExecSetParamPlan(SubPlanState *node, ExprContext *econtext)
+{
+ SubPlan *subplan = node->subplan;
+ PlanState *planstate = node->planstate;
+ SubLinkType subLinkType = subplan->subLinkType;
+ EState *estate = planstate->state;
+ ScanDirection dir = estate->es_direction;
+ MemoryContext oldcontext;
+ TupleTableSlot *slot;
+ ListCell *pvar;
+ ListCell *l;
+ bool found = false;
+ ArrayBuildStateAny *astate = NULL;
+
+ if (subLinkType == ANY_SUBLINK ||
+ subLinkType == ALL_SUBLINK)
+ elog(ERROR, "ANY/ALL subselect unsupported as initplan");
+ if (subLinkType == CTE_SUBLINK)
+ elog(ERROR, "CTE subplans should not be executed via ExecSetParamPlan");
+
+ /*
+ * Enforce forward scan direction regardless of caller. It's hard but not
+ * impossible to get here in backward scan, so make it work anyway.
+ */
+ estate->es_direction = ForwardScanDirection;
+
+ /* Initialize ArrayBuildStateAny in caller's context, if needed */
+ if (subLinkType == ARRAY_SUBLINK)
+ astate = initArrayResultAny(subplan->firstColType,
+ CurrentMemoryContext, true);
+
+ /*
+ * Must switch to per-query memory context.
+ */
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+
+ /*
+ * Set Params of this plan from parent plan correlation values. (Any
+ * calculation we have to do is done in the parent econtext, since the
+ * Param values don't need to have per-query lifetime.) Currently, we
+ * expect only MULTIEXPR_SUBLINK plans to have any correlation values.
+ */
+ Assert(subplan->parParam == NIL || subLinkType == MULTIEXPR_SUBLINK);
+ Assert(list_length(subplan->parParam) == list_length(node->args));
+
+ forboth(l, subplan->parParam, pvar, node->args)
+ {
+ int paramid = lfirst_int(l);
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
+ econtext,
+ &(prm->isnull));
+ planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
+ }
+
+ /*
+ * Run the plan. (If it needs to be rescanned, the first ExecProcNode
+ * call will take care of that.)
+ */
+ for (slot = ExecProcNode(planstate);
+ !TupIsNull(slot);
+ slot = ExecProcNode(planstate))
+ {
+ TupleDesc tdesc = slot->tts_tupleDescriptor;
+ int i = 1;
+
+ if (subLinkType == EXISTS_SUBLINK)
+ {
+ /* There can be only one setParam... */
+ int paramid = linitial_int(subplan->setParam);
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ prm->execPlan = NULL;
+ prm->value = BoolGetDatum(true);
+ prm->isnull = false;
+ found = true;
+ break;
+ }
+
+ if (subLinkType == ARRAY_SUBLINK)
+ {
+ Datum dvalue;
+ bool disnull;
+
+ found = true;
+ /* stash away current value */
+ Assert(subplan->firstColType == TupleDescAttr(tdesc, 0)->atttypid);
+ dvalue = slot_getattr(slot, 1, &disnull);
+ astate = accumArrayResultAny(astate, dvalue, disnull,
+ subplan->firstColType, oldcontext);
+ /* keep scanning subplan to collect all values */
+ continue;
+ }
+
+ if (found &&
+ (subLinkType == EXPR_SUBLINK ||
+ subLinkType == MULTIEXPR_SUBLINK ||
+ subLinkType == ROWCOMPARE_SUBLINK))
+ ereport(ERROR,
+ (errcode(ERRCODE_CARDINALITY_VIOLATION),
+ errmsg("more than one row returned by a subquery used as an expression")));
+
+ found = true;
+
+ /*
+ * We need to copy the subplan's tuple into our own context, in case
+ * any of the params are pass-by-ref type --- the pointers stored in
+ * the param structs will point at this copied tuple! node->curTuple
+ * keeps track of the copied tuple for eventual freeing.
+ */
+ if (node->curTuple)
+ heap_freetuple(node->curTuple);
+ node->curTuple = ExecCopySlotHeapTuple(slot);
+
+ /*
+ * Now set all the setParam params from the columns of the tuple
+ */
+ foreach(l, subplan->setParam)
+ {
+ int paramid = lfirst_int(l);
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ prm->execPlan = NULL;
+ prm->value = heap_getattr(node->curTuple, i, tdesc,
+ &(prm->isnull));
+ i++;
+ }
+ }
+
+ if (subLinkType == ARRAY_SUBLINK)
+ {
+ /* There can be only one setParam... */
+ int paramid = linitial_int(subplan->setParam);
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ /*
+ * We build the result array in query context so it won't disappear;
+ * to avoid leaking memory across repeated calls, we have to remember
+ * the latest value, much as for curTuple above.
+ */
+ if (node->curArray != PointerGetDatum(NULL))
+ pfree(DatumGetPointer(node->curArray));
+ node->curArray = makeArrayResultAny(astate,
+ econtext->ecxt_per_query_memory,
+ true);
+ prm->execPlan = NULL;
+ prm->value = node->curArray;
+ prm->isnull = false;
+ }
+ else if (!found)
+ {
+ if (subLinkType == EXISTS_SUBLINK)
+ {
+ /* There can be only one setParam... */
+ int paramid = linitial_int(subplan->setParam);
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ prm->execPlan = NULL;
+ prm->value = BoolGetDatum(false);
+ prm->isnull = false;
+ }
+ else
+ {
+ /* For other sublink types, set all the output params to NULL */
+ foreach(l, subplan->setParam)
+ {
+ int paramid = lfirst_int(l);
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ prm->execPlan = NULL;
+ prm->value = (Datum) 0;
+ prm->isnull = true;
+ }
+ }
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /* restore scan direction */
+ estate->es_direction = dir;
+}
+
+/*
+ * ExecSetParamPlanMulti
+ *
+ * Apply ExecSetParamPlan to evaluate any not-yet-evaluated initplan output
+ * parameters whose ParamIDs are listed in "params". Any listed params that
+ * are not initplan outputs are ignored.
+ *
+ * As with ExecSetParamPlan, any ExprContext belonging to the current EState
+ * can be used, but in principle a shorter-lived ExprContext is better than a
+ * longer-lived one.
+ */
+void
+ExecSetParamPlanMulti(const Bitmapset *params, ExprContext *econtext)
+{
+ int paramid;
+
+ paramid = -1;
+ while ((paramid = bms_next_member(params, paramid)) >= 0)
+ {
+ ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]);
+
+ if (prm->execPlan != NULL)
+ {
+ /* Parameter not evaluated yet, so go do it */
+ ExecSetParamPlan(prm->execPlan, econtext);
+ /* ExecSetParamPlan should have processed this param... */
+ Assert(prm->execPlan == NULL);
+ }
+ }
+}
+
+/*
+ * Mark an initplan as needing recalculation
+ */
+void
+ExecReScanSetParamPlan(SubPlanState *node, PlanState *parent)
+{
+ PlanState *planstate = node->planstate;
+ SubPlan *subplan = node->subplan;
+ EState *estate = parent->state;
+ ListCell *l;
+
+ /* sanity checks */
+ if (subplan->parParam != NIL)
+ elog(ERROR, "direct correlated subquery unsupported as initplan");
+ if (subplan->setParam == NIL)
+ elog(ERROR, "setParam list of initplan is empty");
+ if (bms_is_empty(planstate->plan->extParam))
+ elog(ERROR, "extParam set of initplan is empty");
+
+ /*
+ * Don't actually re-scan: it'll happen inside ExecSetParamPlan if needed.
+ */
+
+ /*
+ * Mark this subplan's output parameters as needing recalculation.
+ *
+ * CTE subplans are never executed via parameter recalculation; instead
+ * they get run when called by nodeCtescan.c. So don't mark the output
+ * parameter of a CTE subplan as dirty, but do set the chgParam bit for it
+ * so that dependent plan nodes will get told to rescan.
+ */
+ foreach(l, subplan->setParam)
+ {
+ int paramid = lfirst_int(l);
+ ParamExecData *prm = &(estate->es_param_exec_vals[paramid]);
+
+ if (subplan->subLinkType != CTE_SUBLINK)
+ prm->execPlan = node;
+
+ parent->chgParam = bms_add_member(parent->chgParam, paramid);
+ }
+}
diff --git a/src/backend/executor/nodeSubqueryscan.c b/src/backend/executor/nodeSubqueryscan.c
new file mode 100644
index 0000000..c09f628
--- /dev/null
+++ b/src/backend/executor/nodeSubqueryscan.c
@@ -0,0 +1,213 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeSubqueryscan.c
+ * Support routines for scanning subqueries (subselects in rangetable).
+ *
+ * This is just enough different from sublinks (nodeSubplan.c) to mean that
+ * we need two sets of code. Ought to look at trying to unify the cases.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeSubqueryscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecSubqueryScan scans a subquery.
+ * ExecSubqueryNext retrieve next tuple in sequential order.
+ * ExecInitSubqueryScan creates and initializes a subqueryscan node.
+ * ExecEndSubqueryScan releases any storage allocated.
+ * ExecReScanSubqueryScan rescans the relation
+ *
+ */
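+/*
+ * Illustrative example (editorial, not part of the upstream source): a
+ * SubqueryScan appears for a sub-select in FROM, e.g.
+ *
+ *    SELECT s.a FROM (SELECT a, b FROM t WHERE b > 0) AS s WHERE s.a < 10;
+ *
+ * when the planner cannot simply pull the subquery up into the outer query.
+ * The node mostly relays its child's tuples, applying whatever quals and
+ * projection are attached at this level.
+ */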
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeSubqueryscan.h"
+
+static TupleTableSlot *SubqueryNext(SubqueryScanState *node);
+
+/* ----------------------------------------------------------------
+ * Scan Support
+ * ----------------------------------------------------------------
+ */
+/* ----------------------------------------------------------------
+ * SubqueryNext
+ *
+ * This is a workhorse for ExecSubqueryScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+SubqueryNext(SubqueryScanState *node)
+{
+ TupleTableSlot *slot;
+
+ /*
+ * Get the next tuple from the sub-query.
+ */
+ slot = ExecProcNode(node->subplan);
+
+ /*
+ * We just return the subplan's result slot, rather than expending extra
+ * cycles for ExecCopySlot(). (Our own ScanTupleSlot is used only for
+ * EvalPlanQual rechecks.)
+ */
+ return slot;
+}
+
+/*
+ * SubqueryRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+SubqueryRecheck(SubqueryScanState *node, TupleTableSlot *slot)
+{
+ /* nothing to check */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecSubqueryScan(node)
+ *
+ * Scans the subquery sequentially and returns the next qualifying
+ * tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecSubqueryScan(PlanState *pstate)
+{
+ SubqueryScanState *node = castNode(SubqueryScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) SubqueryNext,
+ (ExecScanRecheckMtd) SubqueryRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitSubqueryScan
+ * ----------------------------------------------------------------
+ */
+SubqueryScanState *
+ExecInitSubqueryScan(SubqueryScan *node, EState *estate, int eflags)
+{
+ SubqueryScanState *subquerystate;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & EXEC_FLAG_MARK));
+
+ /* SubqueryScan should not have any "normal" children */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create state structure
+ */
+ subquerystate = makeNode(SubqueryScanState);
+ subquerystate->ss.ps.plan = (Plan *) node;
+ subquerystate->ss.ps.state = estate;
+ subquerystate->ss.ps.ExecProcNode = ExecSubqueryScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &subquerystate->ss.ps);
+
+ /*
+ * initialize subquery
+ */
+ subquerystate->subplan = ExecInitNode(node->subplan, estate, eflags);
+
+ /*
+ * Initialize scan slot and type (needed by ExecAssignScanProjectionInfo)
+ */
+ ExecInitScanTupleSlot(estate, &subquerystate->ss,
+ ExecGetResultType(subquerystate->subplan),
+ ExecGetResultSlotOps(subquerystate->subplan, NULL));
+
+ /*
+ * The slot used as the scantuple isn't the slot above (outside of EPQ),
+ * but the one from the node below.
+ */
+ subquerystate->ss.ps.scanopsset = true;
+ subquerystate->ss.ps.scanops = ExecGetResultSlotOps(subquerystate->subplan,
+ &subquerystate->ss.ps.scanopsfixed);
+ subquerystate->ss.ps.resultopsset = true;
+ subquerystate->ss.ps.resultops = subquerystate->ss.ps.scanops;
+ subquerystate->ss.ps.resultopsfixed = subquerystate->ss.ps.scanopsfixed;
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&subquerystate->ss.ps);
+ ExecAssignScanProjectionInfo(&subquerystate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ subquerystate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) subquerystate);
+
+ return subquerystate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndSubqueryScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndSubqueryScan(SubqueryScanState *node)
+{
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the upper tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * close down subquery
+ */
+ ExecEndNode(node->subplan);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanSubqueryScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanSubqueryScan(SubqueryScanState *node)
+{
+ ExecScanReScan(&node->ss);
+
+ /*
+ * ExecReScan doesn't know about my subplan, so I have to do
+ * changed-parameter signaling myself. This is just as well, because the
+ * subplan has its own memory context in which its chgParam state lives.
+ */
+ if (node->ss.ps.chgParam != NULL)
+ UpdateChangedParamSet(node->subplan, node->ss.ps.chgParam);
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (node->subplan->chgParam == NULL)
+ ExecReScan(node->subplan);
+}
diff --git a/src/backend/executor/nodeTableFuncscan.c b/src/backend/executor/nodeTableFuncscan.c
new file mode 100644
index 0000000..4d7eca4
--- /dev/null
+++ b/src/backend/executor/nodeTableFuncscan.c
@@ -0,0 +1,523 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeTableFuncscan.c
+ * Support routines for scanning RangeTableFunc (XMLTABLE like functions).
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeTableFuncscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecTableFuncscan scans a table function.
+ * TableFuncNext retrieves the next tuple in sequential order.
+ * ExecInitTableFuncscan creates and initializes a TableFuncscan node.
+ * ExecEndTableFuncscan releases any storage allocated.
+ * ExecReScanTableFuncscan rescans the table function.
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeTableFuncscan.h"
+#include "executor/tablefunc.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/xml.h"
+
+static TupleTableSlot *TableFuncNext(TableFuncScanState *node);
+static bool TableFuncRecheck(TableFuncScanState *node, TupleTableSlot *slot);
+
+static void tfuncFetchRows(TableFuncScanState *tstate, ExprContext *econtext);
+static void tfuncInitialize(TableFuncScanState *tstate, ExprContext *econtext, Datum doc);
+static void tfuncLoadRows(TableFuncScanState *tstate, ExprContext *econtext);
+
+/* ----------------------------------------------------------------
+ * Scan Support
+ * ----------------------------------------------------------------
+ */
+/* ----------------------------------------------------------------
+ * TableFuncNext
+ *
+ * This is a workhorse for ExecTableFuncscan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+TableFuncNext(TableFuncScanState *node)
+{
+ TupleTableSlot *scanslot;
+
+ scanslot = node->ss.ss_ScanTupleSlot;
+
+ /*
+ * If first time through, read all tuples from function and put them in a
+ * tuplestore. Subsequent calls just fetch tuples from tuplestore.
+ */
+ if (node->tupstore == NULL)
+ tfuncFetchRows(node, node->ss.ps.ps_ExprContext);
+
+ /*
+ * Get the next tuple from tuplestore.
+ */
+ (void) tuplestore_gettupleslot(node->tupstore,
+ true,
+ false,
+ scanslot);
+ return scanslot;
+}
+
+/*
+ * TableFuncRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+TableFuncRecheck(TableFuncScanState *node, TupleTableSlot *slot)
+{
+ /* nothing to check */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecTableFuncscan(node)
+ *
+ * Scans the function sequentially and returns the next qualifying
+ * tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecTableFuncScan(PlanState *pstate)
+{
+ TableFuncScanState *node = castNode(TableFuncScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) TableFuncNext,
+ (ExecScanRecheckMtd) TableFuncRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitTableFuncscan
+ * ----------------------------------------------------------------
+ */
+TableFuncScanState *
+ExecInitTableFuncScan(TableFuncScan *node, EState *estate, int eflags)
+{
+ TableFuncScanState *scanstate;
+ TableFunc *tf = node->tablefunc;
+ TupleDesc tupdesc;
+ int i;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & EXEC_FLAG_MARK));
+
+ /*
+ * TableFuncscan should not have any children.
+ */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create new ScanState for node
+ */
+ scanstate = makeNode(TableFuncScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecTableFuncScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * initialize source tuple type
+ */
+ tupdesc = BuildDescFromLists(tf->colnames,
+ tf->coltypes,
+ tf->coltypmods,
+ tf->colcollations);
+ /* and the corresponding scan slot */
+ ExecInitScanTupleSlot(estate, &scanstate->ss, tupdesc,
+ &TTSOpsMinimalTuple);
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, &scanstate->ss.ps);
+
+ /* Only XMLTABLE is supported currently */
+ scanstate->routine = &XmlTableRoutine;
+
+ scanstate->perTableCxt =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "TableFunc per value context",
+ ALLOCSET_DEFAULT_SIZES);
+ scanstate->opaque = NULL; /* initialized at runtime */
+
+ scanstate->ns_names = tf->ns_names;
+
+ scanstate->ns_uris =
+ ExecInitExprList(tf->ns_uris, (PlanState *) scanstate);
+ scanstate->docexpr =
+ ExecInitExpr((Expr *) tf->docexpr, (PlanState *) scanstate);
+ scanstate->rowexpr =
+ ExecInitExpr((Expr *) tf->rowexpr, (PlanState *) scanstate);
+ scanstate->colexprs =
+ ExecInitExprList(tf->colexprs, (PlanState *) scanstate);
+ scanstate->coldefexprs =
+ ExecInitExprList(tf->coldefexprs, (PlanState *) scanstate);
+
+ scanstate->notnulls = tf->notnulls;
+
+ /* these are allocated now and initialized later */
+ scanstate->in_functions = palloc(sizeof(FmgrInfo) * tupdesc->natts);
+ scanstate->typioparams = palloc(sizeof(Oid) * tupdesc->natts);
+
+ /*
+ * Fill in the necessary fmgr infos.
+ */
+ for (i = 0; i < tupdesc->natts; i++)
+ {
+ Oid in_funcid;
+
+ getTypeInputInfo(TupleDescAttr(tupdesc, i)->atttypid,
+ &in_funcid, &scanstate->typioparams[i]);
+ fmgr_info(in_funcid, &scanstate->in_functions[i]);
+ }
+
+ return scanstate;
+}
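+
+/*
+ * Illustrative note (editorial, not in the upstream source): as I read it,
+ * the in_functions/typioparams arrays filled above are consumed by the
+ * table-builder routine when it extracts column values; for XMLTABLE the
+ * string pulled out of the document is converted to the column's datatype
+ * via that type's input function before being stored into the scan slot.
+ */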
+
+/* ----------------------------------------------------------------
+ * ExecEndTableFuncscan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndTableFuncScan(TableFuncScanState *node)
+{
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+
+ /*
+ * Release tuplestore resources
+ */
+ if (node->tupstore != NULL)
+ tuplestore_end(node->tupstore);
+ node->tupstore = NULL;
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanTableFuncscan
+ *
+ * Rescans the table function.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanTableFuncScan(TableFuncScanState *node)
+{
+ Bitmapset *chgparam = node->ss.ps.chgParam;
+
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecScanReScan(&node->ss);
+
+ /*
+ * Recompute when parameters are changed.
+ */
+ if (chgparam)
+ {
+ if (node->tupstore != NULL)
+ {
+ tuplestore_end(node->tupstore);
+ node->tupstore = NULL;
+ }
+ }
+
+ if (node->tupstore != NULL)
+ tuplestore_rescan(node->tupstore);
+}
+
+/* ----------------------------------------------------------------
+ * tfuncFetchRows
+ *
+ * Read rows from a TableFunc producer
+ * ----------------------------------------------------------------
+ */
+static void
+tfuncFetchRows(TableFuncScanState *tstate, ExprContext *econtext)
+{
+ const TableFuncRoutine *routine = tstate->routine;
+ MemoryContext oldcxt;
+ Datum value;
+ bool isnull;
+
+ Assert(tstate->opaque == NULL);
+
+ /* build tuplestore for the result */
+ oldcxt = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
+ tstate->tupstore = tuplestore_begin_heap(false, false, work_mem);
+
+ /*
+ * Each call to fetch a new set of rows - of which there may be very many
+ * if XMLTABLE is being used in a lateral join - will allocate a possibly
+ * substantial amount of memory, so we cannot use the per-query context
+ * here. perTableCxt now serves the same function as "argcontext" does in
+ * FunctionScan - a place to store per-one-call (i.e. one result table)
+ * lifetime data (as opposed to per-query or per-result-tuple).
+ */
+ MemoryContextSwitchTo(tstate->perTableCxt);
+
+ PG_TRY();
+ {
+ routine->InitOpaque(tstate,
+ tstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor->natts);
+
+ /*
+ * If evaluating the document expression returns NULL, the table
+ * expression is empty and we return immediately.
+ */
+ value = ExecEvalExpr(tstate->docexpr, econtext, &isnull);
+
+ if (!isnull)
+ {
+ /* otherwise, pass the document value to the table builder */
+ tfuncInitialize(tstate, econtext, value);
+
+ /* initialize ordinality counter */
+ tstate->ordinal = 1;
+
+ /* Load all rows into the tuplestore, and we're done */
+ tfuncLoadRows(tstate, econtext);
+ }
+ }
+ PG_CATCH();
+ {
+ if (tstate->opaque != NULL)
+ routine->DestroyOpaque(tstate);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /* clean up and return to original memory context */
+
+ if (tstate->opaque != NULL)
+ {
+ routine->DestroyOpaque(tstate);
+ tstate->opaque = NULL;
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextReset(tstate->perTableCxt);
+}
+
+/*
+ * Fill in namespace declarations, the row filter, and column filters in a
+ * table expression builder context.
+ */
+static void
+tfuncInitialize(TableFuncScanState *tstate, ExprContext *econtext, Datum doc)
+{
+ const TableFuncRoutine *routine = tstate->routine;
+ TupleDesc tupdesc;
+ ListCell *lc1,
+ *lc2;
+ bool isnull;
+ int colno;
+ Datum value;
+ int ordinalitycol =
+ ((TableFuncScan *) (tstate->ss.ps.plan))->tablefunc->ordinalitycol;
+
+ /*
+ * Install the document as a possibly-toasted Datum into the tablefunc
+ * context.
+ */
+ routine->SetDocument(tstate, doc);
+
+ /* Evaluate namespace specifications */
+ forboth(lc1, tstate->ns_uris, lc2, tstate->ns_names)
+ {
+ ExprState *expr = (ExprState *) lfirst(lc1);
+ Value *ns_node = (Value *) lfirst(lc2);
+ char *ns_uri;
+ char *ns_name;
+
+ value = ExecEvalExpr((ExprState *) expr, econtext, &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("namespace URI must not be null")));
+ ns_uri = TextDatumGetCString(value);
+
+ /* DEFAULT is passed down to SetNamespace as NULL */
+ ns_name = ns_node ? strVal(ns_node) : NULL;
+
+ routine->SetNamespace(tstate, ns_name, ns_uri);
+ }
+
+ /* Install the row filter expression into the table builder context */
+ value = ExecEvalExpr(tstate->rowexpr, econtext, &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("row filter expression must not be null")));
+
+ routine->SetRowFilter(tstate, TextDatumGetCString(value));
+
+ /*
+ * Install the column filter expressions into the table builder context.
+ * If an expression is given, use that; otherwise the column name itself
+ * is the column filter.
+ */
+ colno = 0;
+ tupdesc = tstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
+ foreach(lc1, tstate->colexprs)
+ {
+ char *colfilter;
+ Form_pg_attribute att = TupleDescAttr(tupdesc, colno);
+
+ if (colno != ordinalitycol)
+ {
+ ExprState *colexpr = lfirst(lc1);
+
+ if (colexpr != NULL)
+ {
+ value = ExecEvalExpr(colexpr, econtext, &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("column filter expression must not be null"),
+ errdetail("Filter for column \"%s\" is null.",
+ NameStr(att->attname))));
+ colfilter = TextDatumGetCString(value);
+ }
+ else
+ colfilter = NameStr(att->attname);
+
+ routine->SetColumnFilter(tstate, colfilter, colno);
+ }
+
+ colno++;
+ }
+}
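+
+/*
+ * Illustrative example (editorial, not in the upstream source): in
+ *
+ *     SELECT t.* FROM xmldata,
+ *       XMLTABLE('/rows/row' PASSING data
+ *                COLUMNS id int PATH '@id',
+ *                        name text) AS t;
+ *
+ * the "id" column supplies an explicit PATH expression, which is compiled
+ * into colexprs and evaluated above, while "name" has no PATH, so the
+ * column name itself is used as the column filter.
+ */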
+
+/*
+ * Load all the rows from the TableFunc table builder into a tuplestore.
+ */
+static void
+tfuncLoadRows(TableFuncScanState *tstate, ExprContext *econtext)
+{
+ const TableFuncRoutine *routine = tstate->routine;
+ TupleTableSlot *slot = tstate->ss.ss_ScanTupleSlot;
+ TupleDesc tupdesc = slot->tts_tupleDescriptor;
+ Datum *values = slot->tts_values;
+ bool *nulls = slot->tts_isnull;
+ int natts = tupdesc->natts;
+ MemoryContext oldcxt;
+ int ordinalitycol;
+
+ ordinalitycol =
+ ((TableFuncScan *) (tstate->ss.ps.plan))->tablefunc->ordinalitycol;
+
+ /*
+ * We need a short-lived memory context that we can clean up each time
+ * around the loop, to avoid wasting space. Our default per-tuple context
+ * is fine for the job, since we won't have used it for anything yet in
+ * this tuple cycle.
+ */
+ oldcxt = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ /*
+ * Keep requesting rows from the table builder until there aren't any.
+ */
+ while (routine->FetchRow(tstate))
+ {
+ ListCell *cell = list_head(tstate->coldefexprs);
+ int colno;
+
+ CHECK_FOR_INTERRUPTS();
+
+ ExecClearTuple(tstate->ss.ss_ScanTupleSlot);
+
+ /*
+ * Obtain the value of each column for this row, installing them into
+ * the slot; then add the tuple to the tuplestore.
+ */
+ for (colno = 0; colno < natts; colno++)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupdesc, colno);
+
+ if (colno == ordinalitycol)
+ {
+ /* Fast path for ordinality column */
+ values[colno] = Int32GetDatum(tstate->ordinal++);
+ nulls[colno] = false;
+ }
+ else
+ {
+ bool isnull;
+
+ values[colno] = routine->GetValue(tstate,
+ colno,
+ att->atttypid,
+ att->atttypmod,
+ &isnull);
+
+ /* No value? Evaluate and apply the default, if any */
+ if (isnull && cell != NULL)
+ {
+ ExprState *coldefexpr = (ExprState *) lfirst(cell);
+
+ if (coldefexpr != NULL)
+ values[colno] = ExecEvalExpr(coldefexpr, econtext,
+ &isnull);
+ }
+
+ /* Verify a possible NOT NULL constraint */
+ if (isnull && bms_is_member(colno, tstate->notnulls))
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("null is not allowed in column \"%s\"",
+ NameStr(att->attname))));
+
+ nulls[colno] = isnull;
+ }
+
+ /* advance list of default expressions */
+ if (cell != NULL)
+ cell = lnext(tstate->coldefexprs, cell);
+ }
+
+ tuplestore_putvalues(tstate->tupstore, tupdesc, values, nulls);
+
+ MemoryContextReset(econtext->ecxt_per_tuple_memory);
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+}
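+
+/*
+ * Illustrative note (editorial, not in the upstream source): a column
+ * definition such as
+ *
+ *     COLUMNS premium boolean DEFAULT false NOT NULL
+ *
+ * exercises both branches above: when the row yields no value for the
+ * column, the compiled DEFAULT expression from coldefexprs is evaluated,
+ * and if the result is still null the NOT NULL check raises
+ * ERRCODE_NULL_VALUE_NOT_ALLOWED.
+ */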
diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c
new file mode 100644
index 0000000..2b0d205
--- /dev/null
+++ b/src/backend/executor/nodeTidrangescan.c
@@ -0,0 +1,413 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeTidrangescan.c
+ * Routines to support TID range scans of relations
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeTidrangescan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "access/sysattr.h"
+#include "access/tableam.h"
+#include "catalog/pg_operator.h"
+#include "executor/execdebug.h"
+#include "executor/nodeTidrangescan.h"
+#include "nodes/nodeFuncs.h"
+#include "storage/bufmgr.h"
+#include "utils/rel.h"
+
+
+#define IsCTIDVar(node) \
+ ((node) != NULL && \
+ IsA((node), Var) && \
+ ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \
+ ((Var *) (node))->varlevelsup == 0)
+
+typedef enum
+{
+ TIDEXPR_UPPER_BOUND,
+ TIDEXPR_LOWER_BOUND
+} TidExprType;
+
+/* Upper or lower range bound for scan */
+typedef struct TidOpExpr
+{
+ TidExprType exprtype; /* type of op; lower or upper */
+ ExprState *exprstate; /* ExprState for a TID-yielding subexpr */
+ bool inclusive; /* whether op is inclusive */
+} TidOpExpr;
+
+/*
+ * For the given 'expr', build and return an appropriate TidOpExpr taking into
+ * account the expr's operator and operand order.
+ */
+static TidOpExpr *
+MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate)
+{
+ Node *arg1 = get_leftop((Expr *) expr);
+ Node *arg2 = get_rightop((Expr *) expr);
+ ExprState *exprstate = NULL;
+ bool invert = false;
+ TidOpExpr *tidopexpr;
+
+ if (IsCTIDVar(arg1))
+ exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps);
+ else if (IsCTIDVar(arg2))
+ {
+ exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps);
+ invert = true;
+ }
+ else
+ elog(ERROR, "could not identify CTID variable");
+
+ tidopexpr = (TidOpExpr *) palloc(sizeof(TidOpExpr));
+ tidopexpr->inclusive = false; /* for now */
+
+ switch (expr->opno)
+ {
+ case TIDLessEqOperator:
+ tidopexpr->inclusive = true;
+ /* fall through */
+ case TIDLessOperator:
+ tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND;
+ break;
+ case TIDGreaterEqOperator:
+ tidopexpr->inclusive = true;
+ /* fall through */
+ case TIDGreaterOperator:
+ tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND;
+ break;
+ default:
+ elog(ERROR, "could not identify CTID operator");
+ }
+
+ tidopexpr->exprstate = exprstate;
+
+ return tidopexpr;
+}
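+
+/*
+ * Illustrative note (editorial, not in the upstream source): for a qual
+ * written as "ctid <= '(10,20)'" the CTID Var is the left operand, so the
+ * expression yields an inclusive upper bound; for the flipped form
+ * "'(10,20)' <= ctid" the operands are inverted and the same operator is
+ * treated as an inclusive lower bound instead.
+ */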
+
+/*
+ * Extract the qual subexpressions that yield TIDs to search for,
+ * and compile them into ExprStates if they're ordinary expressions.
+ */
+static void
+TidExprListCreate(TidRangeScanState *tidrangestate)
+{
+ TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan;
+ List *tidexprs = NIL;
+ ListCell *l;
+
+ foreach(l, node->tidrangequals)
+ {
+ OpExpr *opexpr = lfirst(l);
+ TidOpExpr *tidopexpr;
+
+ if (!IsA(opexpr, OpExpr))
+ elog(ERROR, "could not identify CTID expression");
+
+ tidopexpr = MakeTidOpExpr(opexpr, tidrangestate);
+ tidexprs = lappend(tidexprs, tidopexpr);
+ }
+
+ tidrangestate->trss_tidexprs = tidexprs;
+}
+
+/* ----------------------------------------------------------------
+ * TidRangeEval
+ *
+ * Compute and set node's block and offset range to scan by evaluating
+ * the trss_tidexprs. Returns false if we detect the range cannot
+ * contain any tuples. Returns true if it's possible for the range to
+ * contain tuples.
+ * ----------------------------------------------------------------
+ */
+static bool
+TidRangeEval(TidRangeScanState *node)
+{
+ ExprContext *econtext = node->ss.ps.ps_ExprContext;
+ ItemPointerData lowerBound;
+ ItemPointerData upperBound;
+ ListCell *l;
+
+ /*
+ * Set the upper and lower bounds to the absolute limits of the range of
+ * the ItemPointer type. Below we'll try to narrow this range on either
+ * side by looking at the TidOpExprs.
+ */
+ ItemPointerSet(&lowerBound, 0, 0);
+ ItemPointerSet(&upperBound, InvalidBlockNumber, PG_UINT16_MAX);
+
+ foreach(l, node->trss_tidexprs)
+ {
+ TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l);
+ ItemPointer itemptr;
+ bool isNull;
+
+ /* Evaluate this bound. */
+ itemptr = (ItemPointer)
+ DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate,
+ econtext,
+ &isNull));
+
+ /* If the bound is NULL, *nothing* matches the qual. */
+ if (isNull)
+ return false;
+
+ if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND)
+ {
+ ItemPointerData lb;
+
+ ItemPointerCopy(itemptr, &lb);
+
+ /*
+ * Normalize non-inclusive ranges to become inclusive. The
+ * resulting ItemPointer here may not be a valid item pointer.
+ */
+ if (!tidopexpr->inclusive)
+ ItemPointerInc(&lb);
+
+ /* Check if we can narrow the range using this qual */
+ if (ItemPointerCompare(&lb, &lowerBound) > 0)
+ ItemPointerCopy(&lb, &lowerBound);
+ }
+
+ else if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND)
+ {
+ ItemPointerData ub;
+
+ ItemPointerCopy(itemptr, &ub);
+
+ /*
+ * Normalize non-inclusive ranges to become inclusive. The
+ * resulting ItemPointer here may not be a valid item pointer.
+ */
+ if (!tidopexpr->inclusive)
+ ItemPointerDec(&ub);
+
+ /* Check if we can narrow the range using this qual */
+ if (ItemPointerCompare(&ub, &upperBound) < 0)
+ ItemPointerCopy(&ub, &upperBound);
+ }
+ }
+
+ ItemPointerCopy(&lowerBound, &node->trss_mintid);
+ ItemPointerCopy(&upperBound, &node->trss_maxtid);
+
+ return true;
+}
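+
+/*
+ * Illustrative note (editorial, not in the upstream source): the
+ * normalization above turns exclusive bounds into inclusive ones by
+ * stepping the ItemPointer. For "ctid > '(5,3)'" the lower bound becomes
+ * (5,4); for "ctid < '(5,1)'" the upper bound becomes (5,0), which need
+ * not be a valid item pointer but still compares correctly against real
+ * TIDs when narrowing the range.
+ */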
+
+/* ----------------------------------------------------------------
+ * TidRangeNext
+ *
+ * Retrieve a tuple from the TidRangeScan node's currentRelation
+ * using the TIDs in the TidRangeScanState information.
+ *
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+TidRangeNext(TidRangeScanState *node)
+{
+ TableScanDesc scandesc;
+ EState *estate;
+ ScanDirection direction;
+ TupleTableSlot *slot;
+
+ /*
+ * extract necessary information from TID scan node
+ */
+ scandesc = node->ss.ss_currentScanDesc;
+ estate = node->ss.ps.state;
+ slot = node->ss.ss_ScanTupleSlot;
+ direction = estate->es_direction;
+
+ if (!node->trss_inScan)
+ {
+ /* First time through, compute TID range to scan */
+ if (!TidRangeEval(node))
+ return NULL;
+
+ if (scandesc == NULL)
+ {
+ scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation,
+ estate->es_snapshot,
+ &node->trss_mintid,
+ &node->trss_maxtid);
+ node->ss.ss_currentScanDesc = scandesc;
+ }
+ else
+ {
+ /* rescan with the updated TID range */
+ table_rescan_tidrange(scandesc, &node->trss_mintid,
+ &node->trss_maxtid);
+ }
+
+ node->trss_inScan = true;
+ }
+
+ /* Fetch the next tuple. */
+ if (!table_scan_getnextslot_tidrange(scandesc, direction, slot))
+ {
+ node->trss_inScan = false;
+ ExecClearTuple(slot);
+ }
+
+ return slot;
+}
+
+/*
+ * TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot)
+{
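+ /* nothing to check */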
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecTidRangeScan(node)
+ *
+ * Scans the relation using tids and returns the next qualifying tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ *
+ * Conditions:
+ * -- the "cursor" maintained by the AMI is positioned at the tuple
+ * returned previously.
+ *
+ * Initial States:
+ * -- the relation indicated is opened for TID range scanning.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecTidRangeScan(PlanState *pstate)
+{
+ TidRangeScanState *node = castNode(TidRangeScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) TidRangeNext,
+ (ExecScanRecheckMtd) TidRangeRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanTidRangeScan(node)
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanTidRangeScan(TidRangeScanState *node)
+{
+ /* mark scan as not in progress, and TID range as not computed yet */
+ node->trss_inScan = false;
+
+ /*
+ * We must wait until TidRangeNext before calling table_rescan_tidrange.
+ */
+ ExecScanReScan(&node->ss);
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndTidRangeScan
+ *
+ * Releases any storage allocated through C routines.
+ * Returns nothing.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndTidRangeScan(TidRangeScanState *node)
+{
+ TableScanDesc scan = node->ss.ss_currentScanDesc;
+
+ if (scan != NULL)
+ table_endscan(scan);
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clear out tuple table slots
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitTidRangeScan
+ *
+ * Initializes the tid range scan's state information, creates
+ * scan keys, and opens the scan relation.
+ *
+ * Parameters:
+ * node: TidRangeScan node produced by the planner.
+ * estate: the execution state initialized in InitPlan.
+ * ----------------------------------------------------------------
+ */
+TidRangeScanState *
+ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags)
+{
+ TidRangeScanState *tidrangestate;
+ Relation currentRelation;
+
+ /*
+ * create state structure
+ */
+ tidrangestate = makeNode(TidRangeScanState);
+ tidrangestate->ss.ps.plan = (Plan *) node;
+ tidrangestate->ss.ps.state = estate;
+ tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &tidrangestate->ss.ps);
+
+ /*
+ * mark scan as not in progress, and TID range as not computed yet
+ */
+ tidrangestate->trss_inScan = false;
+
+ /*
+ * open the scan relation
+ */
+ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
+
+ tidrangestate->ss.ss_currentRelation = currentRelation;
+ tidrangestate->ss.ss_currentScanDesc = NULL; /* no table scan here */
+
+ /*
+ * get the scan type from the relation descriptor.
+ */
+ ExecInitScanTupleSlot(estate, &tidrangestate->ss,
+ RelationGetDescr(currentRelation),
+ table_slot_callbacks(currentRelation));
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&tidrangestate->ss.ps);
+ ExecAssignScanProjectionInfo(&tidrangestate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ tidrangestate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate);
+
+ TidExprListCreate(tidrangestate);
+
+ /*
+ * all done.
+ */
+ return tidrangestate;
+}
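+
+/*
+ * Illustrative example (editorial, not in the upstream source): a query
+ * such as
+ *
+ *     SELECT ctid, * FROM t WHERE ctid > '(0,5)' AND ctid <= '(10,1)';
+ *
+ * can be planned as a TID Range Scan; both quals end up in tidrangequals,
+ * are compiled by TidExprListCreate above, and narrow the block/offset
+ * window handed to table_beginscan_tidrange.
+ */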
diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c
new file mode 100644
index 0000000..48c3737
--- /dev/null
+++ b/src/backend/executor/nodeTidscan.c
@@ -0,0 +1,558 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeTidscan.c
+ * Routines to support direct tid scans of relations
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeTidscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ *
+ * ExecTidScan scans a relation using tids
+ * ExecInitTidScan creates and initializes state info.
+ * ExecReScanTidScan rescans the tid relation.
+ * ExecEndTidScan releases all storage.
+ */
+#include "postgres.h"
+
+#include "access/sysattr.h"
+#include "access/tableam.h"
+#include "catalog/pg_type.h"
+#include "executor/execdebug.h"
+#include "executor/nodeTidscan.h"
+#include "lib/qunique.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "storage/bufmgr.h"
+#include "utils/array.h"
+#include "utils/rel.h"
+
+
+#define IsCTIDVar(node) \
+ ((node) != NULL && \
+ IsA((node), Var) && \
+ ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \
+ ((Var *) (node))->varlevelsup == 0)
+
+/* one element in tss_tidexprs */
+typedef struct TidExpr
+{
+ ExprState *exprstate; /* ExprState for a TID-yielding subexpr */
+ bool isarray; /* if true, it yields tid[] not just tid */
+ CurrentOfExpr *cexpr; /* alternatively, we can have CURRENT OF */
+} TidExpr;
+
+static void TidExprListCreate(TidScanState *tidstate);
+static void TidListEval(TidScanState *tidstate);
+static int itemptr_comparator(const void *a, const void *b);
+static TupleTableSlot *TidNext(TidScanState *node);
+
+
+/*
+ * Extract the qual subexpressions that yield TIDs to search for,
+ * and compile them into ExprStates if they're ordinary expressions.
+ *
+ * CURRENT OF is a special case that we can't compile usefully;
+ * just drop it into the TidExpr list as-is.
+ */
+static void
+TidExprListCreate(TidScanState *tidstate)
+{
+ TidScan *node = (TidScan *) tidstate->ss.ps.plan;
+ ListCell *l;
+
+ tidstate->tss_tidexprs = NIL;
+ tidstate->tss_isCurrentOf = false;
+
+ foreach(l, node->tidquals)
+ {
+ Expr *expr = (Expr *) lfirst(l);
+ TidExpr *tidexpr = (TidExpr *) palloc0(sizeof(TidExpr));
+
+ if (is_opclause(expr))
+ {
+ Node *arg1;
+ Node *arg2;
+
+ arg1 = get_leftop(expr);
+ arg2 = get_rightop(expr);
+ if (IsCTIDVar(arg1))
+ tidexpr->exprstate = ExecInitExpr((Expr *) arg2,
+ &tidstate->ss.ps);
+ else if (IsCTIDVar(arg2))
+ tidexpr->exprstate = ExecInitExpr((Expr *) arg1,
+ &tidstate->ss.ps);
+ else
+ elog(ERROR, "could not identify CTID variable");
+ tidexpr->isarray = false;
+ }
+ else if (expr && IsA(expr, ScalarArrayOpExpr))
+ {
+ ScalarArrayOpExpr *saex = (ScalarArrayOpExpr *) expr;
+
+ Assert(IsCTIDVar(linitial(saex->args)));
+ tidexpr->exprstate = ExecInitExpr(lsecond(saex->args),
+ &tidstate->ss.ps);
+ tidexpr->isarray = true;
+ }
+ else if (expr && IsA(expr, CurrentOfExpr))
+ {
+ CurrentOfExpr *cexpr = (CurrentOfExpr *) expr;
+
+ tidexpr->cexpr = cexpr;
+ tidstate->tss_isCurrentOf = true;
+ }
+ else
+ elog(ERROR, "could not identify CTID expression");
+
+ tidstate->tss_tidexprs = lappend(tidstate->tss_tidexprs, tidexpr);
+ }
+
+ /* CurrentOfExpr could never appear OR'd with something else */
+ Assert(list_length(tidstate->tss_tidexprs) == 1 ||
+ !tidstate->tss_isCurrentOf);
+}
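+
+/*
+ * Illustrative note (editorial, not in the upstream source): the three
+ * branches above correspond to quals such as
+ *
+ *     ctid = '(0,1)'                           -- simple OpExpr
+ *     ctid = ANY ('{"(0,1)","(0,2)"}'::tid[])  -- ScalarArrayOpExpr
+ *     WHERE CURRENT OF my_cursor               -- CurrentOfExpr
+ *
+ * The first two are compiled into ExprStates here, while CURRENT OF is
+ * carried through as-is and resolved later by execCurrentOf().
+ */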
+
+/*
+ * Compute the list of TIDs to be visited, by evaluating the expressions
+ * for them.
+ *
+ * (The result is actually an array, not a list.)
+ */
+static void
+TidListEval(TidScanState *tidstate)
+{
+ ExprContext *econtext = tidstate->ss.ps.ps_ExprContext;
+ TableScanDesc scan;
+ ItemPointerData *tidList;
+ int numAllocTids;
+ int numTids;
+ ListCell *l;
+
+ /*
+ * Start the scan on demand - initializing a scan isn't free (e.g. the heap
+ * AM determines the current size of the table), so it makes sense to delay
+ * that until needed - the node might never get executed.
+ */
+ if (tidstate->ss.ss_currentScanDesc == NULL)
+ tidstate->ss.ss_currentScanDesc =
+ table_beginscan_tid(tidstate->ss.ss_currentRelation,
+ tidstate->ss.ps.state->es_snapshot);
+ scan = tidstate->ss.ss_currentScanDesc;
+
+ /*
+ * We initialize the array with enough slots for the case that all quals
+ * are simple OpExprs or CurrentOfExprs. If there are any
+ * ScalarArrayOpExprs, we may have to enlarge the array.
+ */
+ numAllocTids = list_length(tidstate->tss_tidexprs);
+ tidList = (ItemPointerData *)
+ palloc(numAllocTids * sizeof(ItemPointerData));
+ numTids = 0;
+
+ foreach(l, tidstate->tss_tidexprs)
+ {
+ TidExpr *tidexpr = (TidExpr *) lfirst(l);
+ ItemPointer itemptr;
+ bool isNull;
+
+ if (tidexpr->exprstate && !tidexpr->isarray)
+ {
+ itemptr = (ItemPointer)
+ DatumGetPointer(ExecEvalExprSwitchContext(tidexpr->exprstate,
+ econtext,
+ &isNull));
+ if (isNull)
+ continue;
+
+ /*
+ * We silently discard any TIDs that the AM considers invalid
+ * (e.g. for heap, they could be out of range at the time of scan
+ * start; since we hold at least AccessShareLock on the table, it
+ * won't be possible for someone to truncate away the blocks we
+ * intend to visit).
+ */
+ if (!table_tuple_tid_valid(scan, itemptr))
+ continue;
+
+ if (numTids >= numAllocTids)
+ {
+ numAllocTids *= 2;
+ tidList = (ItemPointerData *)
+ repalloc(tidList,
+ numAllocTids * sizeof(ItemPointerData));
+ }
+ tidList[numTids++] = *itemptr;
+ }
+ else if (tidexpr->exprstate && tidexpr->isarray)
+ {
+ Datum arraydatum;
+ ArrayType *itemarray;
+ Datum *ipdatums;
+ bool *ipnulls;
+ int ndatums;
+ int i;
+
+ arraydatum = ExecEvalExprSwitchContext(tidexpr->exprstate,
+ econtext,
+ &isNull);
+ if (isNull)
+ continue;
+ itemarray = DatumGetArrayTypeP(arraydatum);
+ deconstruct_array(itemarray,
+ TIDOID, sizeof(ItemPointerData), false, TYPALIGN_SHORT,
+ &ipdatums, &ipnulls, &ndatums);
+ if (numTids + ndatums > numAllocTids)
+ {
+ numAllocTids = numTids + ndatums;
+ tidList = (ItemPointerData *)
+ repalloc(tidList,
+ numAllocTids * sizeof(ItemPointerData));
+ }
+ for (i = 0; i < ndatums; i++)
+ {
+ if (ipnulls[i])
+ continue;
+
+ itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]);
+
+ if (!table_tuple_tid_valid(scan, itemptr))
+ continue;
+
+ tidList[numTids++] = *itemptr;
+ }
+ pfree(ipdatums);
+ pfree(ipnulls);
+ }
+ else
+ {
+ ItemPointerData cursor_tid;
+
+ Assert(tidexpr->cexpr);
+ if (execCurrentOf(tidexpr->cexpr, econtext,
+ RelationGetRelid(tidstate->ss.ss_currentRelation),
+ &cursor_tid))
+ {
+ if (numTids >= numAllocTids)
+ {
+ numAllocTids *= 2;
+ tidList = (ItemPointerData *)
+ repalloc(tidList,
+ numAllocTids * sizeof(ItemPointerData));
+ }
+ tidList[numTids++] = cursor_tid;
+ }
+ }
+ }
+
+ /*
+ * Sort the array of TIDs into order, and eliminate duplicates.
+ * Eliminating duplicates is necessary since we want OR semantics across
+ * the list. Sorting makes it easier to detect duplicates, and as a bonus
+ * ensures that we will visit the heap in the most efficient way.
+ */
+ if (numTids > 1)
+ {
+ /* CurrentOfExpr could never appear OR'd with something else */
+ Assert(!tidstate->tss_isCurrentOf);
+
+ qsort((void *) tidList, numTids, sizeof(ItemPointerData),
+ itemptr_comparator);
+ numTids = qunique(tidList, numTids, sizeof(ItemPointerData),
+ itemptr_comparator);
+ }
+
+ tidstate->tss_TidList = tidList;
+ tidstate->tss_NumTids = numTids;
+ tidstate->tss_TidPtr = -1;
+}
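+
+/*
+ * Illustrative note (editorial, not in the upstream source): given quals
+ * producing the TIDs (3,1), (1,2) and (3,1) again, the qsort/qunique pass
+ * above leaves tidList as {(1,2), (3,1)}, so each tuple is fetched once
+ * and the fetches proceed in physical (block, offset) order.
+ */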
+
+/*
+ * qsort comparator for ItemPointerData items
+ */
+static int
+itemptr_comparator(const void *a, const void *b)
+{
+ const ItemPointerData *ipa = (const ItemPointerData *) a;
+ const ItemPointerData *ipb = (const ItemPointerData *) b;
+ BlockNumber ba = ItemPointerGetBlockNumber(ipa);
+ BlockNumber bb = ItemPointerGetBlockNumber(ipb);
+ OffsetNumber oa = ItemPointerGetOffsetNumber(ipa);
+ OffsetNumber ob = ItemPointerGetOffsetNumber(ipb);
+
+ if (ba < bb)
+ return -1;
+ if (ba > bb)
+ return 1;
+ if (oa < ob)
+ return -1;
+ if (oa > ob)
+ return 1;
+ return 0;
+}
+
+/* ----------------------------------------------------------------
+ * TidNext
+ *
+ * Retrieve a tuple from the TidScan node's currentRelation
+ * using the tids in the TidScanState information.
+ *
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+TidNext(TidScanState *node)
+{
+ EState *estate;
+ ScanDirection direction;
+ Snapshot snapshot;
+ TableScanDesc scan;
+ Relation heapRelation;
+ TupleTableSlot *slot;
+ ItemPointerData *tidList;
+ int numTids;
+ bool bBackward;
+
+ /*
+ * extract necessary information from tid scan node
+ */
+ estate = node->ss.ps.state;
+ direction = estate->es_direction;
+ snapshot = estate->es_snapshot;
+ heapRelation = node->ss.ss_currentRelation;
+ slot = node->ss.ss_ScanTupleSlot;
+
+ /*
+ * First time through, compute the list of TIDs to be visited
+ */
+ if (node->tss_TidList == NULL)
+ TidListEval(node);
+
+ scan = node->ss.ss_currentScanDesc;
+ tidList = node->tss_TidList;
+ numTids = node->tss_NumTids;
+
+ /*
+ * Initialize or advance scan position, depending on direction.
+ */
+ bBackward = ScanDirectionIsBackward(direction);
+ if (bBackward)
+ {
+ if (node->tss_TidPtr < 0)
+ {
+ /* initialize for backward scan */
+ node->tss_TidPtr = numTids - 1;
+ }
+ else
+ node->tss_TidPtr--;
+ }
+ else
+ {
+ if (node->tss_TidPtr < 0)
+ {
+ /* initialize for forward scan */
+ node->tss_TidPtr = 0;
+ }
+ else
+ node->tss_TidPtr++;
+ }
+
+ while (node->tss_TidPtr >= 0 && node->tss_TidPtr < numTids)
+ {
+ ItemPointerData tid = tidList[node->tss_TidPtr];
+
+ /*
+ * For WHERE CURRENT OF, the tuple retrieved from the cursor might
+ * since have been updated; if so, we should fetch the version that is
+ * current according to our snapshot.
+ */
+ if (node->tss_isCurrentOf)
+ table_tuple_get_latest_tid(scan, &tid);
+
+ if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot))
+ return slot;
+
+ /* Bad TID or failed snapshot qual; try next */
+ if (bBackward)
+ node->tss_TidPtr--;
+ else
+ node->tss_TidPtr++;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If we get here it means the TID scan failed, so we are at the end of
+ * the scan.
+ */
+ return ExecClearTuple(slot);
+}
+
+/*
+ * TidRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+TidRecheck(TidScanState *node, TupleTableSlot *slot)
+{
+ /*
+ * XXX shouldn't we check here to make sure tuple matches TID list? In
+ * runtime-key case this is not certain, is it? However, in the WHERE
+ * CURRENT OF case it might not match anyway ...
+ */
+ return true;
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecTidScan(node)
+ *
+ * Scans the relation using tids and returns
+ * the next qualifying tuple in the direction specified.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ *
+ * Conditions:
+ * -- the "cursor" maintained by the AMI is positioned at the tuple
+ * returned previously.
+ *
+ * Initial States:
+ * -- the relation indicated is opened for scanning so that the
+ * "cursor" is positioned before the first qualifying tuple.
+ * -- tss_TidPtr is -1.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecTidScan(PlanState *pstate)
+{
+ TidScanState *node = castNode(TidScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) TidNext,
+ (ExecScanRecheckMtd) TidRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanTidScan(node)
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanTidScan(TidScanState *node)
+{
+ if (node->tss_TidList)
+ pfree(node->tss_TidList);
+ node->tss_TidList = NULL;
+ node->tss_NumTids = 0;
+ node->tss_TidPtr = -1;
+
+ /* not really necessary, but seems good form */
+ if (node->ss.ss_currentScanDesc)
+ table_rescan(node->ss.ss_currentScanDesc, NULL);
+
+ ExecScanReScan(&node->ss);
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndTidScan
+ *
+ * Releases any storage allocated through C routines.
+ * Returns nothing.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndTidScan(TidScanState *node)
+{
+ if (node->ss.ss_currentScanDesc)
+ table_endscan(node->ss.ss_currentScanDesc);
+
+ /*
+ * Free the exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clear out tuple table slots
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitTidScan
+ *
+ * Initializes the tid scan's state information, creates
+ * scan keys, and opens the base and tid relations.
+ *
+ * Parameters:
+ * node: TidScan node produced by the planner.
+ * estate: the execution state initialized in InitPlan.
+ * ----------------------------------------------------------------
+ */
+TidScanState *
+ExecInitTidScan(TidScan *node, EState *estate, int eflags)
+{
+ TidScanState *tidstate;
+ Relation currentRelation;
+
+ /*
+ * create state structure
+ */
+ tidstate = makeNode(TidScanState);
+ tidstate->ss.ps.plan = (Plan *) node;
+ tidstate->ss.ps.state = estate;
+ tidstate->ss.ps.ExecProcNode = ExecTidScan;
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &tidstate->ss.ps);
+
+ /*
+ * mark tid list as not computed yet
+ */
+ tidstate->tss_TidList = NULL;
+ tidstate->tss_NumTids = 0;
+ tidstate->tss_TidPtr = -1;
+
+ /*
+ * open the scan relation
+ */
+ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
+
+ tidstate->ss.ss_currentRelation = currentRelation;
+ tidstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */
+
+ /*
+ * get the scan type from the relation descriptor.
+ */
+ ExecInitScanTupleSlot(estate, &tidstate->ss,
+ RelationGetDescr(currentRelation),
+ table_slot_callbacks(currentRelation));
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&tidstate->ss.ps);
+ ExecAssignScanProjectionInfo(&tidstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ tidstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) tidstate);
+
+ TidExprListCreate(tidstate);
+
+ /*
+ * all done.
+ */
+ return tidstate;
+}
diff --git a/src/backend/executor/nodeUnique.c b/src/backend/executor/nodeUnique.c
new file mode 100644
index 0000000..9214d6f
--- /dev/null
+++ b/src/backend/executor/nodeUnique.c
@@ -0,0 +1,192 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeUnique.c
+ * Routines to handle unique'ing of queries where appropriate
+ *
+ * Unique is a very simple node type that just filters out duplicate
+ * tuples from a stream of sorted tuples from its subplan. It's essentially
+ * a dumbed-down form of Group: the duplicate-removal functionality is
+ * identical. However, Unique doesn't do projection nor qual checking,
+ * so it's marginally more efficient for cases where neither is needed.
+ * (It's debatable whether the savings justifies carrying two plan node
+ * types, though.)
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeUnique.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecUnique - filter duplicate tuples out of the sorted input stream
+ * ExecInitUnique - initialize node and subnodes
+ * ExecEndUnique - shutdown node and subnodes
+ *
+ * NOTES
+ * Assumes tuples returned from subplan arrive in
+ * sorted order.
+ */
+
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeUnique.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+/* ----------------------------------------------------------------
+ * ExecUnique
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot * /* return: a tuple or NULL */
+ExecUnique(PlanState *pstate)
+{
+ UniqueState *node = castNode(UniqueState, pstate);
+ ExprContext *econtext = node->ps.ps_ExprContext;
+ TupleTableSlot *resultTupleSlot;
+ TupleTableSlot *slot;
+ PlanState *outerPlan;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * get information from the node
+ */
+ outerPlan = outerPlanState(node);
+ resultTupleSlot = node->ps.ps_ResultTupleSlot;
+
+ /*
+ * now loop, returning only non-duplicate tuples. We assume that the
+ * tuples arrive in sorted order so we can detect duplicates easily. The
+ * first tuple of each group is returned.
+ */
+ for (;;)
+ {
+ /*
+ * fetch a tuple from the outer subplan
+ */
+ slot = ExecProcNode(outerPlan);
+ if (TupIsNull(slot))
+ {
+ /* end of subplan, so we're done */
+ ExecClearTuple(resultTupleSlot);
+ return NULL;
+ }
+
+ /*
+ * Always return the first tuple from the subplan.
+ */
+ if (TupIsNull(resultTupleSlot))
+ break;
+
+ /*
+ * Else test if the new tuple and the previously returned tuple match.
+ * If so then we loop back and fetch another new tuple from the
+ * subplan.
+ */
+ econtext->ecxt_innertuple = slot;
+ econtext->ecxt_outertuple = resultTupleSlot;
+ if (!ExecQualAndReset(node->eqfunction, econtext))
+ break;
+ }
+
+ /*
+ * We have a new tuple different from the previous saved tuple (if any).
+ * Save it and return it. We must copy it because the source subplan
+ * won't guarantee that this source tuple is still accessible after
+ * fetching the next source tuple.
+ */
+ return ExecCopySlot(resultTupleSlot, slot);
+}
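+
+/*
+ * Illustrative note (editorial, not in the upstream source): over a sorted
+ * input stream 1, 1, 2, 3, 3 the loop above emits 1, 2, 3: each group's
+ * first tuple is copied into ps_ResultTupleSlot, and subsequent duplicates
+ * are skipped because eqfunction reports them equal to the saved tuple.
+ * This is the executor half of plans like Sort -> Unique that can
+ * implement SELECT DISTINCT.
+ */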
+
+/* ----------------------------------------------------------------
+ * ExecInitUnique
+ *
+ * This initializes the unique node state structures and
+ * the node's subplan.
+ * ----------------------------------------------------------------
+ */
+UniqueState *
+ExecInitUnique(Unique *node, EState *estate, int eflags)
+{
+ UniqueState *uniquestate;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ uniquestate = makeNode(UniqueState);
+ uniquestate->ps.plan = (Plan *) node;
+ uniquestate->ps.state = estate;
+ uniquestate->ps.ExecProcNode = ExecUnique;
+
+ /*
+ * create expression context
+ */
+ ExecAssignExprContext(estate, &uniquestate->ps);
+
+ /*
+ * then initialize outer plan
+ */
+ outerPlanState(uniquestate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * Initialize result slot and type. Unique nodes do no projections, so
+ * initialize projection info for this node appropriately.
+ */
+ ExecInitResultTupleSlotTL(&uniquestate->ps, &TTSOpsMinimalTuple);
+ uniquestate->ps.ps_ProjInfo = NULL;
+
+ /*
+ * Precompute fmgr lookup data for inner loop
+ */
+ uniquestate->eqfunction =
+ execTuplesMatchPrepare(ExecGetResultType(outerPlanState(uniquestate)),
+ node->numCols,
+ node->uniqColIdx,
+ node->uniqOperators,
+ node->uniqCollations,
+ &uniquestate->ps);
+
+ return uniquestate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndUnique
+ *
+ * This shuts down the subplan and frees resources allocated
+ * to this node.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndUnique(UniqueState *node)
+{
+ /* clean up tuple table */
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ ExecFreeExprContext(&node->ps);
+
+ ExecEndNode(outerPlanState(node));
+}
+
+
+void
+ExecReScanUnique(UniqueState *node)
+{
+ /* must clear result tuple so first input tuple is returned */
+ ExecClearTuple(node->ps.ps_ResultTupleSlot);
+
+ /*
+ * if chgParam of subnode is not null then plan will be re-scanned by
+ * first ExecProcNode.
+ */
+ if (node->ps.lefttree->chgParam == NULL)
+ ExecReScan(node->ps.lefttree);
+}
diff --git a/src/backend/executor/nodeValuesscan.c b/src/backend/executor/nodeValuesscan.c
new file mode 100644
index 0000000..5de1429
--- /dev/null
+++ b/src/backend/executor/nodeValuesscan.c
@@ -0,0 +1,361 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeValuesscan.c
+ * Support routines for scanning Values lists
+ * ("VALUES (...), (...), ..." in rangetable).
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeValuesscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ * ExecValuesScan scans a values list.
+ * ValuesNext retrieves the next tuple in sequential order.
+ * ExecInitValuesScan creates and initializes a valuesscan node.
+ * ExecEndValuesScan releases any storage allocated.
+ * ExecReScanValuesScan rescans the values list
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "executor/nodeValuesscan.h"
+#include "jit/jit.h"
+#include "optimizer/clauses.h"
+#include "utils/expandeddatum.h"
+
+
+static TupleTableSlot *ValuesNext(ValuesScanState *node);
+
+
+/* ----------------------------------------------------------------
+ * Scan Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ * ValuesNext
+ *
+ * This is a workhorse for ExecValuesScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ValuesNext(ValuesScanState *node)
+{
+ TupleTableSlot *slot;
+ EState *estate;
+ ExprContext *econtext;
+ ScanDirection direction;
+ int curr_idx;
+
+ /*
+ * get information from the estate and scan state
+ */
+ estate = node->ss.ps.state;
+ direction = estate->es_direction;
+ slot = node->ss.ss_ScanTupleSlot;
+ econtext = node->rowcontext;
+
+ /*
+ * Get the next tuple. Return NULL if no more tuples.
+ */
+ if (ScanDirectionIsForward(direction))
+ {
+ if (node->curr_idx < node->array_len)
+ node->curr_idx++;
+ }
+ else
+ {
+ if (node->curr_idx >= 0)
+ node->curr_idx--;
+ }
+
+ /*
+ * Always clear the result slot; this is appropriate if we are at the end
+ * of the data, and if we're not, we still need it as the first step of
+ * the store-virtual-tuple protocol. It seems wise to clear the slot
+ * before we reset the context it might have pointers into.
+ */
+ ExecClearTuple(slot);
+
+ curr_idx = node->curr_idx;
+ if (curr_idx >= 0 && curr_idx < node->array_len)
+ {
+ List *exprlist = node->exprlists[curr_idx];
+ List *exprstatelist = node->exprstatelists[curr_idx];
+ MemoryContext oldContext;
+ Datum *values;
+ bool *isnull;
+ ListCell *lc;
+ int resind;
+
+ /*
+ * Get rid of any prior cycle's leftovers. We use ReScanExprContext
+ * not just ResetExprContext because we want any registered shutdown
+ * callbacks to be called.
+ */
+ ReScanExprContext(econtext);
+
+ /*
+ * Do per-VALUES-row work in the per-tuple context.
+ */
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ /*
+ * Unless we already made the expression eval state for this row,
+ * build it in the econtext's per-tuple memory. This is a tad
+ * unusual, but we want to delete the eval state again when we move to
+ * the next row, to avoid growth of memory requirements over a long
+ * values list. For rows in which that won't work, we already built
+ * the eval state at plan startup.
+ */
+ if (exprstatelist == NIL)
+ {
+ /*
+ * Pass parent as NULL, not my plan node, because we don't want
+ * anything in this transient state linking into permanent state.
+ * The only expression type that might wish to do so is a SubPlan,
+ * and we already checked that there aren't any.
+ *
+ * Note that passing parent = NULL also disables JIT compilation
+ * of the expressions, which is a win, because they're only going
+ * to be used once under normal circumstances.
+ */
+ exprstatelist = ExecInitExprList(exprlist, NULL);
+ }
+
+ /* parser should have checked all sublists are the same length */
+ Assert(list_length(exprstatelist) == slot->tts_tupleDescriptor->natts);
+
+ /*
+ * Compute the expressions and build a virtual result tuple. We
+ * already did ExecClearTuple(slot).
+ */
+ values = slot->tts_values;
+ isnull = slot->tts_isnull;
+
+ resind = 0;
+ foreach(lc, exprstatelist)
+ {
+ ExprState *estate = (ExprState *) lfirst(lc);
+ Form_pg_attribute attr = TupleDescAttr(slot->tts_tupleDescriptor,
+ resind);
+
+ values[resind] = ExecEvalExpr(estate,
+ econtext,
+ &isnull[resind]);
+
+ /*
+ * We must force any R/W expanded datums to read-only state, in
+ * case they are multiply referenced in the plan node's output
+ * expressions, or in case we skip the output projection and the
+ * output column is multiply referenced in higher plan nodes.
+ */
+ values[resind] = MakeExpandedObjectReadOnly(values[resind],
+ isnull[resind],
+ attr->attlen);
+
+ resind++;
+ }
+
+ MemoryContextSwitchTo(oldContext);
+
+ /*
+ * And return the virtual tuple.
+ */
+ ExecStoreVirtualTuple(slot);
+ }
+
+ return slot;
+}
+
+/*
+ * ValuesRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+ValuesRecheck(ValuesScanState *node, TupleTableSlot *slot)
+{
+ /* nothing to check */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecValuesScan(node)
+ *
+ * Scans the values lists sequentially and returns the next qualifying
+ * tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecValuesScan(PlanState *pstate)
+{
+ ValuesScanState *node = castNode(ValuesScanState, pstate);
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) ValuesNext,
+ (ExecScanRecheckMtd) ValuesRecheck);
+}
+
+/* ----------------------------------------------------------------
+ * ExecInitValuesScan
+ * ----------------------------------------------------------------
+ */
+ValuesScanState *
+ExecInitValuesScan(ValuesScan *node, EState *estate, int eflags)
+{
+ ValuesScanState *scanstate;
+ TupleDesc tupdesc;
+ ListCell *vtl;
+ int i;
+ PlanState *planstate;
+
+ /*
+ * ValuesScan should not have any children.
+ */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create new ScanState for node
+ */
+ scanstate = makeNode(ValuesScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecValuesScan;
+
+ /*
+ * Miscellaneous initialization
+ */
+ planstate = &scanstate->ss.ps;
+
+ /*
+ * Create expression contexts. We need two, one for per-sublist
+ * processing and one for execScan.c to use for quals and projections. We
+ * cheat a little by using ExecAssignExprContext() to build both.
+ */
+ ExecAssignExprContext(estate, planstate);
+ scanstate->rowcontext = planstate->ps_ExprContext;
+ ExecAssignExprContext(estate, planstate);
+
+ /*
+ * Get info about values list, initialize scan slot with it.
+ */
+ tupdesc = ExecTypeFromExprList((List *) linitial(node->values_lists));
+ ExecInitScanTupleSlot(estate, &scanstate->ss, tupdesc, &TTSOpsVirtual);
+
+ /*
+ * Initialize result type and projection.
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+ ExecAssignScanProjectionInfo(&scanstate->ss);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+
+ /*
+ * Other node-specific setup
+ */
+ scanstate->curr_idx = -1;
+ scanstate->array_len = list_length(node->values_lists);
+
+ /*
+ * Convert the list of expression sublists into an array for easier
+ * addressing at runtime. Also, detect whether any sublists contain
+ * SubPlans; for just those sublists, go ahead and do expression
+ * initialization. (This avoids problems with SubPlans wanting to connect
+ * themselves up to the outer plan tree. Notably, EXPLAIN won't see the
+ * subplans otherwise; also we will have troubles with dangling pointers
+ * and/or leaked resources if we try to handle SubPlans the same as
+ * simpler expressions.)
+ */
+ scanstate->exprlists = (List **)
+ palloc(scanstate->array_len * sizeof(List *));
+ scanstate->exprstatelists = (List **)
+ palloc0(scanstate->array_len * sizeof(List *));
+ i = 0;
+ foreach(vtl, node->values_lists)
+ {
+ List *exprs = castNode(List, lfirst(vtl));
+
+ scanstate->exprlists[i] = exprs;
+
+ /*
+ * We can avoid the cost of a contain_subplans() scan in the simple
+ * case where there are no SubPlans anywhere.
+ */
+ if (estate->es_subplanstates &&
+ contain_subplans((Node *) exprs))
+ {
+ int saved_jit_flags;
+
+ /*
+ * As these expressions are only used once, disable JIT for them.
+ * This is worthwhile because it's common to insert significant
+ * amounts of data via VALUES(). Note that this doesn't prevent
+ * use of JIT *within* a subplan, since that's initialized
+ * separately; this just affects the upper-level subexpressions.
+ */
+ saved_jit_flags = estate->es_jit_flags;
+ estate->es_jit_flags = PGJIT_NONE;
+
+ scanstate->exprstatelists[i] = ExecInitExprList(exprs,
+ &scanstate->ss.ps);
+
+ estate->es_jit_flags = saved_jit_flags;
+ }
+ i++;
+ }
+
+ return scanstate;
+}
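+
+/*
+ * Illustrative note (editorial, not in the upstream source): in a statement
+ * such as
+ *
+ *     INSERT INTO t VALUES (1), ((SELECT max(x) FROM s));
+ *
+ * the second sublist contains a scalar subquery, which the planner turns
+ * into a SubPlan; such sublists are initialized here at startup with this
+ * node as parent, while the plain first sublist is left for ValuesNext to
+ * compile per-row in the short-lived rowcontext.
+ */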
+
+/* ----------------------------------------------------------------
+ * ExecEndValuesScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndValuesScan(ValuesScanState *node)
+{
+ /*
+ * Free both exprcontexts
+ */
+ ExecFreeExprContext(&node->ss.ps);
+ node->ss.ps.ps_ExprContext = node->rowcontext;
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanValuesScan
+ *
+ * Rescans the values list.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanValuesScan(ValuesScanState *node)
+{
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ ExecScanReScan(&node->ss);
+
+ node->curr_idx = -1;
+}
diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c
new file mode 100644
index 0000000..f8ea9e9
--- /dev/null
+++ b/src/backend/executor/nodeWindowAgg.c
@@ -0,0 +1,3463 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeWindowAgg.c
+ * routines to handle WindowAgg nodes.
+ *
+ * A WindowAgg node evaluates "window functions" across suitable partitions
+ * of the input tuple set. Any one WindowAgg works for just a single window
+ * specification, though it can evaluate multiple window functions sharing
+ * identical window specifications. The input tuples are required to be
+ * delivered in sorted order, with the PARTITION BY columns (if any) as
+ * major sort keys and the ORDER BY columns (if any) as minor sort keys.
+ * (The planner generates a stack of WindowAggs with intervening Sort nodes
+ * as needed, if a query involves more than one window specification.)
+ *
+ * Since window functions can require access to any or all of the rows in
+ * the current partition, we accumulate rows of the partition into a
+ * tuplestore. The window functions are called using the WindowObject API
+ * so that they can access those rows as needed.
+ *
+ * We also support using plain aggregate functions as window functions.
+ * For these, the regular Agg-node environment is emulated for each partition.
+ * As required by the SQL spec, the output represents the value of the
+ * aggregate function over all rows in the current row's window frame.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeWindowAgg.c
+ *
+ *-------------------------------------------------------------------------
+ */
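+
+/*
+ * For illustration (hypothetical table and column names), a query such as
+ *
+ *		SELECT depname,
+ *			   rank() OVER w,
+ *			   avg(salary) OVER w
+ *		FROM empsalary
+ *		WINDOW w AS (PARTITION BY depname ORDER BY salary);
+ *
+ * uses one window specification for both window functions, so the planner
+ * emits a single WindowAgg node above a Sort on (depname, salary).  rank()
+ * is a true window function, driven through the WindowObject API described
+ * below, while avg() is a plain aggregate whose result is evaluated over
+ * each row's window frame as described above.
+ */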
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "catalog/objectaccess.h"
+#include "catalog/pg_aggregate.h"
+#include "catalog/pg_proc.h"
+#include "executor/executor.h"
+#include "executor/nodeWindowAgg.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
+#include "parser/parse_agg.h"
+#include "parser/parse_coerce.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/expandeddatum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+#include "windowapi.h"
+
+/*
+ * All the window function APIs are called with this object, which is passed
+ * to window functions as fcinfo->context.
+ */
+typedef struct WindowObjectData
+{
+ NodeTag type;
+ WindowAggState *winstate; /* parent WindowAggState */
+ List *argstates; /* ExprState trees for fn's arguments */
+ void *localmem; /* WinGetPartitionLocalMemory's chunk */
+ int markptr; /* tuplestore mark pointer for this fn */
+ int readptr; /* tuplestore read pointer for this fn */
+ int64 markpos; /* row that markptr is positioned on */
+ int64 seekpos; /* row that readptr is positioned on */
+} WindowObjectData;
+
+/*
+ * We have one WindowStatePerFunc struct for each window function and
+ * window aggregate handled by this node.
+ */
+typedef struct WindowStatePerFuncData
+{
+ /* Links to WindowFunc expr and state nodes this working state is for */
+ WindowFuncExprState *wfuncstate;
+ WindowFunc *wfunc;
+
+ int numArguments; /* number of arguments */
+
+ FmgrInfo flinfo; /* fmgr lookup data for window function */
+
+ Oid winCollation; /* collation derived for window function */
+
+ /*
+ * We need the len and byval info for the result of each function in order
+ * to know how to copy/delete values.
+ */
+ int16 resulttypeLen;
+ bool resulttypeByVal;
+
+ bool plain_agg; /* is it just a plain aggregate function? */
+ int aggno; /* if so, index of its WindowStatePerAggData */
+
+ WindowObject winobj; /* object used in window function API */
+} WindowStatePerFuncData;
+
+/*
+ * For plain aggregate window functions, we also have one of these.
+ */
+typedef struct WindowStatePerAggData
+{
+ /* Oids of transition functions */
+ Oid transfn_oid;
+ Oid invtransfn_oid; /* may be InvalidOid */
+ Oid finalfn_oid; /* may be InvalidOid */
+
+ /*
+ * fmgr lookup data for transition functions --- only valid when
+ * corresponding oid is not InvalidOid. Note in particular that fn_strict
+ * flags are kept here.
+ */
+ FmgrInfo transfn;
+ FmgrInfo invtransfn;
+ FmgrInfo finalfn;
+
+ int numFinalArgs; /* number of arguments to pass to finalfn */
+
+ /*
+ * initial value from pg_aggregate entry
+ */
+ Datum initValue;
+ bool initValueIsNull;
+
+ /*
+ * cached value for current frame boundaries
+ */
+ Datum resultValue;
+ bool resultValueIsNull;
+
+ /*
+ * We need the len and byval info for the agg's input, result, and
+ * transition data types in order to know how to copy/delete values.
+ */
+ int16 inputtypeLen,
+ resulttypeLen,
+ transtypeLen;
+ bool inputtypeByVal,
+ resulttypeByVal,
+ transtypeByVal;
+
+ int wfuncno; /* index of associated WindowStatePerFuncData */
+
+ /* Context holding transition value and possibly other subsidiary data */
+ MemoryContext aggcontext; /* may be private, or winstate->aggcontext */
+
+ /* Current transition value */
+ Datum transValue; /* current transition value */
+ bool transValueIsNull;
+
+ int64 transValueCount; /* number of currently-aggregated rows */
+
+ /* Data local to eval_windowaggregates() */
+ bool restart; /* need to restart this agg in this cycle? */
+} WindowStatePerAggData;
+
+static void initialize_windowaggregate(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate);
+static void advance_windowaggregate(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate);
+static bool advance_windowaggregate_base(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate);
+static void finalize_windowaggregate(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate,
+ Datum *result, bool *isnull);
+
+static void eval_windowaggregates(WindowAggState *winstate);
+static void eval_windowfunction(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ Datum *result, bool *isnull);
+
+static void begin_partition(WindowAggState *winstate);
+static void spool_tuples(WindowAggState *winstate, int64 pos);
+static void release_partition(WindowAggState *winstate);
+
+static int row_is_in_frame(WindowAggState *winstate, int64 pos,
+ TupleTableSlot *slot);
+static void update_frameheadpos(WindowAggState *winstate);
+static void update_frametailpos(WindowAggState *winstate);
+static void update_grouptailpos(WindowAggState *winstate);
+
+static WindowStatePerAggData *initialize_peragg(WindowAggState *winstate,
+ WindowFunc *wfunc,
+ WindowStatePerAgg peraggstate);
+static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
+
+static bool are_peers(WindowAggState *winstate, TupleTableSlot *slot1,
+ TupleTableSlot *slot2);
+static bool window_gettupleslot(WindowObject winobj, int64 pos,
+ TupleTableSlot *slot);
+
+
+/*
+ * initialize_windowaggregate
+ * parallel to initialize_aggregates in nodeAgg.c
+ */
+static void
+initialize_windowaggregate(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate)
+{
+ MemoryContext oldContext;
+
+ /*
+ * If we're using a private aggcontext, we may reset it here. But if the
+ * context is shared, we don't know which other aggregates may still need
+ * it, so we must leave it to the caller to reset at an appropriate time.
+ */
+ if (peraggstate->aggcontext != winstate->aggcontext)
+ MemoryContextResetAndDeleteChildren(peraggstate->aggcontext);
+
+ if (peraggstate->initValueIsNull)
+ peraggstate->transValue = peraggstate->initValue;
+ else
+ {
+ oldContext = MemoryContextSwitchTo(peraggstate->aggcontext);
+ peraggstate->transValue = datumCopy(peraggstate->initValue,
+ peraggstate->transtypeByVal,
+ peraggstate->transtypeLen);
+ MemoryContextSwitchTo(oldContext);
+ }
+ peraggstate->transValueIsNull = peraggstate->initValueIsNull;
+ peraggstate->transValueCount = 0;
+ peraggstate->resultValue = (Datum) 0;
+ peraggstate->resultValueIsNull = true;
+}
+
+/*
+ * advance_windowaggregate
+ * parallel to advance_aggregates in nodeAgg.c
+ */
+static void
+advance_windowaggregate(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate)
+{
+ LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
+ WindowFuncExprState *wfuncstate = perfuncstate->wfuncstate;
+ int numArguments = perfuncstate->numArguments;
+ Datum newVal;
+ ListCell *arg;
+ int i;
+ MemoryContext oldContext;
+ ExprContext *econtext = winstate->tmpcontext;
+ ExprState *filter = wfuncstate->aggfilter;
+
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ /* Skip anything FILTERed out */
+ if (filter)
+ {
+ bool isnull;
+ Datum res = ExecEvalExpr(filter, econtext, &isnull);
+
+ if (isnull || !DatumGetBool(res))
+ {
+ MemoryContextSwitchTo(oldContext);
+ return;
+ }
+ }
+
+ /* We start from 1, since the 0th arg will be the transition value */
+ i = 1;
+ foreach(arg, wfuncstate->args)
+ {
+ ExprState *argstate = (ExprState *) lfirst(arg);
+
+ fcinfo->args[i].value = ExecEvalExpr(argstate, econtext,
+ &fcinfo->args[i].isnull);
+ i++;
+ }
+
+ if (peraggstate->transfn.fn_strict)
+ {
+ /*
+ * For a strict transfn, nothing happens when there's a NULL input; we
+ * just keep the prior transValue. Note transValueCount doesn't
+ * change either.
+ */
+ for (i = 1; i <= numArguments; i++)
+ {
+ if (fcinfo->args[i].isnull)
+ {
+ MemoryContextSwitchTo(oldContext);
+ return;
+ }
+ }
+
+ /*
+ * For strict transition functions with initial value NULL we use the
+ * first non-NULL input as the initial state. (We already checked
+ * that the agg's input type is binary-compatible with its transtype,
+ * so straight copy here is OK.)
+ *
+ * We must copy the datum into aggcontext if it is pass-by-ref. We do
+ * not need to pfree the old transValue, since it's NULL.
+ */
+ if (peraggstate->transValueCount == 0 && peraggstate->transValueIsNull)
+ {
+ MemoryContextSwitchTo(peraggstate->aggcontext);
+ peraggstate->transValue = datumCopy(fcinfo->args[1].value,
+ peraggstate->transtypeByVal,
+ peraggstate->transtypeLen);
+ peraggstate->transValueIsNull = false;
+ peraggstate->transValueCount = 1;
+ MemoryContextSwitchTo(oldContext);
+ return;
+ }
+
+ if (peraggstate->transValueIsNull)
+ {
+ /*
+ * Don't call a strict function with NULL inputs. Note it is
+ * possible to get here despite the above tests, if the transfn is
+ * strict *and* returned a NULL on a prior cycle. If that happens
+ * we will propagate the NULL all the way to the end. That can
+ * only happen if there's no inverse transition function, though,
+ * since we disallow transitions back to NULL when there is one.
+ */
+ MemoryContextSwitchTo(oldContext);
+ Assert(!OidIsValid(peraggstate->invtransfn_oid));
+ return;
+ }
+ }
+
+ /*
+ * OK to call the transition function. Set winstate->curaggcontext while
+ * calling it, for possible use by AggCheckCallContext.
+ */
+ InitFunctionCallInfoData(*fcinfo, &(peraggstate->transfn),
+ numArguments + 1,
+ perfuncstate->winCollation,
+ (void *) winstate, NULL);
+ fcinfo->args[0].value = peraggstate->transValue;
+ fcinfo->args[0].isnull = peraggstate->transValueIsNull;
+ winstate->curaggcontext = peraggstate->aggcontext;
+ newVal = FunctionCallInvoke(fcinfo);
+ winstate->curaggcontext = NULL;
+
+ /*
+ * Moving-aggregate transition functions must not return null, see
+ * advance_windowaggregate_base().
+ */
+ if (fcinfo->isnull && OidIsValid(peraggstate->invtransfn_oid))
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("moving-aggregate transition function must not return null")));
+
+ /*
+ * We must track the number of rows included in transValue, since to
+ * remove the last input, advance_windowaggregate_base() mustn't call the
+ * inverse transition function, but simply reset transValue back to its
+ * initial value.
+ */
+ peraggstate->transValueCount++;
+
+ /*
+ * If pass-by-ref datatype, must copy the new value into aggcontext and
+ * free the prior transValue. But if transfn returned a pointer to its
+ * first input, we don't need to do anything. Also, if transfn returned a
+ * pointer to a R/W expanded object that is already a child of the
+ * aggcontext, assume we can adopt that value without copying it.
+ */
+ if (!peraggstate->transtypeByVal &&
+ DatumGetPointer(newVal) != DatumGetPointer(peraggstate->transValue))
+ {
+ if (!fcinfo->isnull)
+ {
+ MemoryContextSwitchTo(peraggstate->aggcontext);
+ if (DatumIsReadWriteExpandedObject(newVal,
+ false,
+ peraggstate->transtypeLen) &&
+ MemoryContextGetParent(DatumGetEOHP(newVal)->eoh_context) == CurrentMemoryContext)
+ /* do nothing */ ;
+ else
+ newVal = datumCopy(newVal,
+ peraggstate->transtypeByVal,
+ peraggstate->transtypeLen);
+ }
+ if (!peraggstate->transValueIsNull)
+ {
+ if (DatumIsReadWriteExpandedObject(peraggstate->transValue,
+ false,
+ peraggstate->transtypeLen))
+ DeleteExpandedObject(peraggstate->transValue);
+ else
+ pfree(DatumGetPointer(peraggstate->transValue));
+ }
+ }
+
+ MemoryContextSwitchTo(oldContext);
+ peraggstate->transValue = newVal;
+ peraggstate->transValueIsNull = fcinfo->isnull;
+}
+
+/*
+ * advance_windowaggregate_base
+ * Remove the oldest tuple from an aggregation.
+ *
+ * This is very much like advance_windowaggregate, except that we will call
+ * the inverse transition function (which caller must have checked is
+ * available).
+ *
+ * Returns true if we successfully removed the current row from this
+ * aggregate, false if not (in the latter case, caller is responsible
+ * for cleaning up by restarting the aggregation).
+ */
+static bool
+advance_windowaggregate_base(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate)
+{
+ LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
+ WindowFuncExprState *wfuncstate = perfuncstate->wfuncstate;
+ int numArguments = perfuncstate->numArguments;
+ Datum newVal;
+ ListCell *arg;
+ int i;
+ MemoryContext oldContext;
+ ExprContext *econtext = winstate->tmpcontext;
+ ExprState *filter = wfuncstate->aggfilter;
+
+ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+ /* Skip anything FILTERed out */
+ if (filter)
+ {
+ bool isnull;
+ Datum res = ExecEvalExpr(filter, econtext, &isnull);
+
+ if (isnull || !DatumGetBool(res))
+ {
+ MemoryContextSwitchTo(oldContext);
+ return true;
+ }
+ }
+
+ /* We start from 1, since the 0th arg will be the transition value */
+ i = 1;
+ foreach(arg, wfuncstate->args)
+ {
+ ExprState *argstate = (ExprState *) lfirst(arg);
+
+ fcinfo->args[i].value = ExecEvalExpr(argstate, econtext,
+ &fcinfo->args[i].isnull);
+ i++;
+ }
+
+ if (peraggstate->invtransfn.fn_strict)
+ {
+ /*
+ * For a strict (inv)transfn, nothing happens when there's a NULL
+ * input; we just keep the prior transValue. Note transValueCount
+ * doesn't change either.
+ */
+ for (i = 1; i <= numArguments; i++)
+ {
+ if (fcinfo->args[i].isnull)
+ {
+ MemoryContextSwitchTo(oldContext);
+ return true;
+ }
+ }
+ }
+
+ /* There should still be an added but not yet removed value */
+ Assert(peraggstate->transValueCount > 0);
+
+ /*
+ * In moving-aggregate mode, the state must never be NULL, except possibly
+ * before any rows have been aggregated (which is surely not the case at
+ * this point). This restriction allows us to interpret a NULL result
+ * from the inverse function as meaning "sorry, can't do an inverse
+ * transition in this case". We already checked this in
+ * advance_windowaggregate, but just for safety, check again.
+ */
+ if (peraggstate->transValueIsNull)
+ elog(ERROR, "aggregate transition value is NULL before inverse transition");
+
+ /*
+ * We mustn't use the inverse transition function to remove the last
+ * input. Doing so would yield a non-NULL state, whereas we should be in
+ * the initial state afterwards which may very well be NULL. So instead,
+ * we simply re-initialize the aggregate in this case.
+ */
+ if (peraggstate->transValueCount == 1)
+ {
+ MemoryContextSwitchTo(oldContext);
+ initialize_windowaggregate(winstate,
+ &winstate->perfunc[peraggstate->wfuncno],
+ peraggstate);
+ return true;
+ }
+
+ /*
+ * OK to call the inverse transition function. Set
+ * winstate->curaggcontext while calling it, for possible use by
+ * AggCheckCallContext.
+ */
+ InitFunctionCallInfoData(*fcinfo, &(peraggstate->invtransfn),
+ numArguments + 1,
+ perfuncstate->winCollation,
+ (void *) winstate, NULL);
+ fcinfo->args[0].value = peraggstate->transValue;
+ fcinfo->args[0].isnull = peraggstate->transValueIsNull;
+ winstate->curaggcontext = peraggstate->aggcontext;
+ newVal = FunctionCallInvoke(fcinfo);
+ winstate->curaggcontext = NULL;
+
+ /*
+ * If the function returns NULL, report failure, forcing a restart.
+ */
+ if (fcinfo->isnull)
+ {
+ MemoryContextSwitchTo(oldContext);
+ return false;
+ }
+
+ /* Update number of rows included in transValue */
+ peraggstate->transValueCount--;
+
+ /*
+ * If pass-by-ref datatype, must copy the new value into aggcontext and
+ * free the prior transValue. But if invtransfn returned a pointer to its
+ * first input, we don't need to do anything. Also, if invtransfn
+ * returned a pointer to a R/W expanded object that is already a child of
+ * the aggcontext, assume we can adopt that value without copying it.
+ *
+ * Note: the checks for null values here will never fire, but it seems
+ * best to have this stanza look just like advance_windowaggregate.
+ */
+ if (!peraggstate->transtypeByVal &&
+ DatumGetPointer(newVal) != DatumGetPointer(peraggstate->transValue))
+ {
+ if (!fcinfo->isnull)
+ {
+ MemoryContextSwitchTo(peraggstate->aggcontext);
+ if (DatumIsReadWriteExpandedObject(newVal,
+ false,
+ peraggstate->transtypeLen) &&
+ MemoryContextGetParent(DatumGetEOHP(newVal)->eoh_context) == CurrentMemoryContext)
+ /* do nothing */ ;
+ else
+ newVal = datumCopy(newVal,
+ peraggstate->transtypeByVal,
+ peraggstate->transtypeLen);
+ }
+ if (!peraggstate->transValueIsNull)
+ {
+ if (DatumIsReadWriteExpandedObject(peraggstate->transValue,
+ false,
+ peraggstate->transtypeLen))
+ DeleteExpandedObject(peraggstate->transValue);
+ else
+ pfree(DatumGetPointer(peraggstate->transValue));
+ }
+ }
+
+ MemoryContextSwitchTo(oldContext);
+ peraggstate->transValue = newVal;
+ peraggstate->transValueIsNull = fcinfo->isnull;
+
+ return true;
+}
+
+/*
+ * finalize_windowaggregate
+ * parallel to finalize_aggregate in nodeAgg.c
+ */
+static void
+finalize_windowaggregate(WindowAggState *winstate,
+ WindowStatePerFunc perfuncstate,
+ WindowStatePerAgg peraggstate,
+ Datum *result, bool *isnull)
+{
+ MemoryContext oldContext;
+
+ oldContext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
+
+ /*
+ * Apply the agg's finalfn if one is provided, else return transValue.
+ */
+ if (OidIsValid(peraggstate->finalfn_oid))
+ {
+ LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
+ int numFinalArgs = peraggstate->numFinalArgs;
+ bool anynull;
+ int i;
+
+ InitFunctionCallInfoData(fcinfodata.fcinfo, &(peraggstate->finalfn),
+ numFinalArgs,
+ perfuncstate->winCollation,
+ (void *) winstate, NULL);
+ fcinfo->args[0].value =
+ MakeExpandedObjectReadOnly(peraggstate->transValue,
+ peraggstate->transValueIsNull,
+ peraggstate->transtypeLen);
+ fcinfo->args[0].isnull = peraggstate->transValueIsNull;
+ anynull = peraggstate->transValueIsNull;
+
+ /* Fill any remaining argument positions with nulls */
+ for (i = 1; i < numFinalArgs; i++)
+ {
+ fcinfo->args[i].value = (Datum) 0;
+ fcinfo->args[i].isnull = true;
+ anynull = true;
+ }
+
+ if (fcinfo->flinfo->fn_strict && anynull)
+ {
+ /* don't call a strict function with NULL inputs */
+ *result = (Datum) 0;
+ *isnull = true;
+ }
+ else
+ {
+ winstate->curaggcontext = peraggstate->aggcontext;
+ *result = FunctionCallInvoke(fcinfo);
+ winstate->curaggcontext = NULL;
+ *isnull = fcinfo->isnull;
+ }
+ }
+ else
+ {
+ /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
+ *result = peraggstate->transValue;
+ *isnull = peraggstate->transValueIsNull;
+ }
+
+ /*
+ * If result is pass-by-ref, make sure it is in the right context.
+ */
+ if (!peraggstate->resulttypeByVal && !*isnull &&
+ !MemoryContextContains(CurrentMemoryContext,
+ DatumGetPointer(*result)))
+ *result = datumCopy(*result,
+ peraggstate->resulttypeByVal,
+ peraggstate->resulttypeLen);
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * eval_windowaggregates
+ * evaluate plain aggregates being used as window functions
+ *
+ * This differs from nodeAgg.c in two ways. First, if the window's frame
+ * start position moves, we use the inverse transition function (if it exists)
+ * to remove rows from the transition value. And second, we expect to be
+ * able to call aggregate final functions repeatedly after aggregating more
+ * data onto the same transition value. This is not a behavior required by
+ * nodeAgg.c.
+ */
+static void
+eval_windowaggregates(WindowAggState *winstate)
+{
+ WindowStatePerAgg peraggstate;
+ int wfuncno,
+ numaggs,
+ numaggs_restart,
+ i;
+ int64 aggregatedupto_nonrestarted;
+ MemoryContext oldContext;
+ ExprContext *econtext;
+ WindowObject agg_winobj;
+ TupleTableSlot *agg_row_slot;
+ TupleTableSlot *temp_slot;
+
+ numaggs = winstate->numaggs;
+ if (numaggs == 0)
+ return; /* nothing to do */
+
+ /* final output execution is in ps_ExprContext */
+ econtext = winstate->ss.ps.ps_ExprContext;
+ agg_winobj = winstate->agg_winobj;
+ agg_row_slot = winstate->agg_row_slot;
+ temp_slot = winstate->temp_slot_1;
+
+ /*
+ * If the window's frame start clause is UNBOUNDED_PRECEDING and no
+ * exclusion clause is specified, then the window frame consists of a
+ * contiguous group of rows extending forward from the start of the
+ * partition, and rows only enter the frame, never exit it, as the current
+ * row advances forward. This makes it possible to use an incremental
+ * strategy for evaluating aggregates: we run the transition function for
+ * each row added to the frame, and run the final function whenever we
+ * need the current aggregate value. This is considerably more efficient
+ * than the naive approach of re-running the entire aggregate calculation
+ * for each current row. It does assume that the final function doesn't
+ * damage the running transition value, but we have the same assumption in
+ * nodeAgg.c too (when it rescans an existing hash table).
+ *
+ * If the frame start does sometimes move, we can still optimize as above
+ * whenever successive rows share the same frame head, but if the frame
+ * head moves beyond the previous head we try to remove those rows using
+ * the aggregate's inverse transition function. This function restores
+ * the aggregate's current state to what it would be if the removed row
+ * had never been aggregated in the first place. Inverse transition
+ * functions may optionally return NULL, indicating that the function was
+ * unable to remove the tuple from aggregation. If this happens, or if
+ * the aggregate doesn't have an inverse transition function at all, we
+ * must perform the aggregation all over again for all tuples within the
+ * new frame boundaries.
+ *
+ * If there's any exclusion clause, then we may have to aggregate over a
+ * non-contiguous set of rows, so we punt and recalculate for every row.
+ * (For some frame end choices, it might be that the frame is always
+ * contiguous anyway, but that's an optimization to investigate later.)
+ *
+ * In many common cases, multiple rows share the same frame and hence the
+ * same aggregate value. (In particular, if there's no ORDER BY in a RANGE
+ * window, then all rows are peers and so they all have window frame equal
+ * to the whole partition.) We optimize such cases by calculating the
+ * aggregate value once when we reach the first row of a peer group, and
+ * then returning the saved value for all subsequent rows.
+ *
+ * 'aggregatedupto' keeps track of the first row that has not yet been
+ * accumulated into the aggregate transition values. Whenever we start a
+ * new peer group, we accumulate forward to the end of the peer group.
+ */
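+
+ /*
+ * Worked example of the moving-frame case (illustrative data): with
+ * sum(x) OVER (ORDER BY i ROWS BETWEEN 2 PRECEDING AND CURRENT ROW),
+ * stepping from the row at position 4 to position 5 moves the frame head
+ * from position 2 to position 3.  Instead of re-summing rows 3..5 from
+ * scratch, we make one call to sum's inverse transition function to remove
+ * row 2 from the running state and one ordinary transition call to add
+ * row 5.
+ */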
+
+ /*
+ * First, update the frame head position.
+ *
+ * The frame head should never move backwards, and the code below wouldn't
+ * cope if it did, so for safety we complain if it does.
+ */
+ update_frameheadpos(winstate);
+ if (winstate->frameheadpos < winstate->aggregatedbase)
+ elog(ERROR, "window frame head moved backward");
+
+ /*
+ * If the frame didn't change compared to the previous row, we can re-use
+ * the result values that were previously saved at the bottom of this
+ * function. Since we don't know the current frame's end yet, this is not
+ * possible to check for fully. But if the frame end mode is UNBOUNDED
+ * FOLLOWING or CURRENT ROW, no exclusion clause is specified, and the
+ * current row lies within the previous row's frame, then the two frames'
+ * ends must coincide. Note that on the first row aggregatedbase ==
+ * aggregatedupto, meaning this test must fail, so we don't need to check
+ * the "there was no previous row" case explicitly here.
+ */
+ if (winstate->aggregatedbase == winstate->frameheadpos &&
+ (winstate->frameOptions & (FRAMEOPTION_END_UNBOUNDED_FOLLOWING |
+ FRAMEOPTION_END_CURRENT_ROW)) &&
+ !(winstate->frameOptions & FRAMEOPTION_EXCLUSION) &&
+ winstate->aggregatedbase <= winstate->currentpos &&
+ winstate->aggregatedupto > winstate->currentpos)
+ {
+ for (i = 0; i < numaggs; i++)
+ {
+ peraggstate = &winstate->peragg[i];
+ wfuncno = peraggstate->wfuncno;
+ econtext->ecxt_aggvalues[wfuncno] = peraggstate->resultValue;
+ econtext->ecxt_aggnulls[wfuncno] = peraggstate->resultValueIsNull;
+ }
+ return;
+ }
+
+ /*----------
+ * Initialize restart flags.
+ *
+ * We restart the aggregation:
+ * - if we're processing the first row in the partition, or
+ * - if the frame's head moved and we cannot use an inverse
+ * transition function, or
+ * - we have an EXCLUSION clause, or
+ * - if the new frame doesn't overlap the old one
+ *
+ * Note that we don't strictly need to restart in the last case, but if
+ * we're going to remove all rows from the aggregation anyway, a restart
+ * surely is faster.
+ *----------
+ */
+ numaggs_restart = 0;
+ for (i = 0; i < numaggs; i++)
+ {
+ peraggstate = &winstate->peragg[i];
+ if (winstate->currentpos == 0 ||
+ (winstate->aggregatedbase != winstate->frameheadpos &&
+ !OidIsValid(peraggstate->invtransfn_oid)) ||
+ (winstate->frameOptions & FRAMEOPTION_EXCLUSION) ||
+ winstate->aggregatedupto <= winstate->frameheadpos)
+ {
+ peraggstate->restart = true;
+ numaggs_restart++;
+ }
+ else
+ peraggstate->restart = false;
+ }
+
+ /*
+ * If we have any possibly-moving aggregates, attempt to advance
+ * aggregatedbase to match the frame's head by removing input rows that
+ * fell off the top of the frame from the aggregations. This can fail,
+ * i.e. advance_windowaggregate_base() can return false, in which case
+ * we'll restart that aggregate below.
+ */
+ while (numaggs_restart < numaggs &&
+ winstate->aggregatedbase < winstate->frameheadpos)
+ {
+ /*
+ * Fetch the next tuple of those being removed. This should never fail
+ * as we should have been here before.
+ */
+ if (!window_gettupleslot(agg_winobj, winstate->aggregatedbase,
+ temp_slot))
+ elog(ERROR, "could not re-fetch previously fetched frame row");
+
+ /* Set tuple context for evaluation of aggregate arguments */
+ winstate->tmpcontext->ecxt_outertuple = temp_slot;
+
+ /*
+ * Perform the inverse transition for each aggregate function in the
+ * window, unless it has already been marked as needing a restart.
+ */
+ for (i = 0; i < numaggs; i++)
+ {
+ bool ok;
+
+ peraggstate = &winstate->peragg[i];
+ if (peraggstate->restart)
+ continue;
+
+ wfuncno = peraggstate->wfuncno;
+ ok = advance_windowaggregate_base(winstate,
+ &winstate->perfunc[wfuncno],
+ peraggstate);
+ if (!ok)
+ {
+ /* Inverse transition function has failed, must restart */
+ peraggstate->restart = true;
+ numaggs_restart++;
+ }
+ }
+
+ /* Reset per-input-tuple context after each tuple */
+ ResetExprContext(winstate->tmpcontext);
+
+ /* And advance the aggregated-row state */
+ winstate->aggregatedbase++;
+ ExecClearTuple(temp_slot);
+ }
+
+ /*
+ * If we successfully advanced the base rows of all the aggregates,
+ * aggregatedbase now equals frameheadpos; but if we failed for any, we
+ * must forcibly update aggregatedbase.
+ */
+ winstate->aggregatedbase = winstate->frameheadpos;
+
+ /*
+ * If we created a mark pointer for aggregates, keep it pushed up to frame
+ * head, so that tuplestore can discard unnecessary rows.
+ */
+ if (agg_winobj->markptr >= 0)
+ WinSetMarkPosition(agg_winobj, winstate->frameheadpos);
+
+ /*
+ * Now restart the aggregates that require it.
+ *
+ * We assume that aggregates using the shared context always restart if
+ * *any* aggregate restarts, and we may thus clean up the shared
+ * aggcontext if that is the case. Private aggcontexts are reset by
+ * initialize_windowaggregate() if their owning aggregate restarts. If we
+ * aren't restarting an aggregate, we need to free any previously saved
+ * result for it, else we'll leak memory.
+ */
+ if (numaggs_restart > 0)
+ MemoryContextResetAndDeleteChildren(winstate->aggcontext);
+ for (i = 0; i < numaggs; i++)
+ {
+ peraggstate = &winstate->peragg[i];
+
+ /* Aggregates using the shared ctx must restart if *any* agg does */
+ Assert(peraggstate->aggcontext != winstate->aggcontext ||
+ numaggs_restart == 0 ||
+ peraggstate->restart);
+
+ if (peraggstate->restart)
+ {
+ wfuncno = peraggstate->wfuncno;
+ initialize_windowaggregate(winstate,
+ &winstate->perfunc[wfuncno],
+ peraggstate);
+ }
+ else if (!peraggstate->resultValueIsNull)
+ {
+ if (!peraggstate->resulttypeByVal)
+ pfree(DatumGetPointer(peraggstate->resultValue));
+ peraggstate->resultValue = (Datum) 0;
+ peraggstate->resultValueIsNull = true;
+ }
+ }
+
+ /*
+ * Non-restarted aggregates now contain the rows between aggregatedbase
+ * (i.e., frameheadpos) and aggregatedupto, while restarted aggregates
+ * contain no rows. If there are any restarted aggregates, we must thus
+ * begin aggregating anew at frameheadpos, otherwise we may simply
+ * continue at aggregatedupto. We must remember the old value of
+ * aggregatedupto to know how long to skip advancing non-restarted
+ * aggregates. If we modify aggregatedupto, we must also clear
+ * agg_row_slot, per the loop invariant below.
+ */
+ aggregatedupto_nonrestarted = winstate->aggregatedupto;
+ if (numaggs_restart > 0 &&
+ winstate->aggregatedupto != winstate->frameheadpos)
+ {
+ winstate->aggregatedupto = winstate->frameheadpos;
+ ExecClearTuple(agg_row_slot);
+ }
+
+ /*
+ * Advance until we reach a row not in frame (or end of partition).
+ *
+ * Note the loop invariant: agg_row_slot is either empty or holds the row
+ * at position aggregatedupto. We advance aggregatedupto after processing
+ * a row.
+ */
+ for (;;)
+ {
+ int ret;
+
+ /* Fetch next row if we didn't already */
+ if (TupIsNull(agg_row_slot))
+ {
+ if (!window_gettupleslot(agg_winobj, winstate->aggregatedupto,
+ agg_row_slot))
+ break; /* must be end of partition */
+ }
+
+ /*
+ * Exit loop if no more rows can be in frame. Skip aggregation if
+ * current row is not in frame but there might be more in the frame.
+ */
+ ret = row_is_in_frame(winstate, winstate->aggregatedupto, agg_row_slot);
+ if (ret < 0)
+ break;
+ if (ret == 0)
+ goto next_tuple;
+
+ /* Set tuple context for evaluation of aggregate arguments */
+ winstate->tmpcontext->ecxt_outertuple = agg_row_slot;
+
+ /* Accumulate row into the aggregates */
+ for (i = 0; i < numaggs; i++)
+ {
+ peraggstate = &winstate->peragg[i];
+
+ /* Non-restarted aggs skip until aggregatedupto_nonrestarted */
+ if (!peraggstate->restart &&
+ winstate->aggregatedupto < aggregatedupto_nonrestarted)
+ continue;
+
+ wfuncno = peraggstate->wfuncno;
+ advance_windowaggregate(winstate,
+ &winstate->perfunc[wfuncno],
+ peraggstate);
+ }
+
+next_tuple:
+ /* Reset per-input-tuple context after each tuple */
+ ResetExprContext(winstate->tmpcontext);
+
+ /* And advance the aggregated-row state */
+ winstate->aggregatedupto++;
+ ExecClearTuple(agg_row_slot);
+ }
+
+ /* The frame's end is not supposed to move backwards, ever */
+ Assert(aggregatedupto_nonrestarted <= winstate->aggregatedupto);
+
+ /*
+ * finalize aggregates and fill result/isnull fields.
+ */
+ for (i = 0; i < numaggs; i++)
+ {
+ Datum *result;
+ bool *isnull;
+
+ peraggstate = &winstate->peragg[i];
+ wfuncno = peraggstate->wfuncno;
+ result = &econtext->ecxt_aggvalues[wfuncno];
+ isnull = &econtext->ecxt_aggnulls[wfuncno];
+ finalize_windowaggregate(winstate,
+ &winstate->perfunc[wfuncno],
+ peraggstate,
+ result, isnull);
+
+ /*
+ * save the result in case next row shares the same frame.
+ *
+ * XXX in some framing modes, eg ROWS/END_CURRENT_ROW, we can know in
+ * advance that the next row can't possibly share the same frame. Is
+ * it worth detecting that and skipping this code?
+ */
+ if (!peraggstate->resulttypeByVal && !*isnull)
+ {
+ oldContext = MemoryContextSwitchTo(peraggstate->aggcontext);
+ peraggstate->resultValue =
+ datumCopy(*result,
+ peraggstate->resulttypeByVal,
+ peraggstate->resulttypeLen);
+ MemoryContextSwitchTo(oldContext);
+ }
+ else
+ {
+ peraggstate->resultValue = *result;
+ }
+ peraggstate->resultValueIsNull = *isnull;
+ }
+}
+
+/*
+ * eval_windowfunction
+ *
+ * Arguments of window functions are not evaluated here, because a window
+ * function can need random access to arbitrary rows in the partition.
+ * The window function uses the special WinGetFuncArgInPartition and
+ * WinGetFuncArgInFrame functions to evaluate the arguments for the rows
+ * it wants.
+ */
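+
+/*
+ * For example, lag() and lead() fetch their argument from another row of
+ * the current partition with WinGetFuncArgInPartition(), while
+ * first_value(), last_value() and nth_value() use WinGetFuncArgInFrame();
+ * see the implementations in utils/adt/windowfuncs.c.
+ */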
+static void
+eval_windowfunction(WindowAggState *winstate, WindowStatePerFunc perfuncstate,
+ Datum *result, bool *isnull)
+{
+ LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
+ MemoryContext oldContext;
+
+ oldContext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
+
+ /*
+ * We don't pass any normal arguments to a window function, but we do pass
+ * it the number of arguments, in order to permit window function
+ * implementations to support varying numbers of arguments. The real info
+ * goes through the WindowObject, which is passed via fcinfo->context.
+ */
+ InitFunctionCallInfoData(*fcinfo, &(perfuncstate->flinfo),
+ perfuncstate->numArguments,
+ perfuncstate->winCollation,
+ (void *) perfuncstate->winobj, NULL);
+ /* Just in case, make all the regular argument slots be null */
+ for (int argno = 0; argno < perfuncstate->numArguments; argno++)
+ fcinfo->args[argno].isnull = true;
+ /* Window functions don't have a current aggregate context, either */
+ winstate->curaggcontext = NULL;
+
+ *result = FunctionCallInvoke(fcinfo);
+ *isnull = fcinfo->isnull;
+
+ /*
+ * Make sure pass-by-ref data is allocated in the appropriate context. (We
+ * need this in case the function returns a pointer into some short-lived
+ * tuple, as is entirely possible.)
+ */
+ if (!perfuncstate->resulttypeByVal && !fcinfo->isnull &&
+ !MemoryContextContains(CurrentMemoryContext,
+ DatumGetPointer(*result)))
+ *result = datumCopy(*result,
+ perfuncstate->resulttypeByVal,
+ perfuncstate->resulttypeLen);
+
+ MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * begin_partition
+ * Start buffering rows of the next partition.
+ */
+static void
+begin_partition(WindowAggState *winstate)
+{
+ WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan;
+ PlanState *outerPlan = outerPlanState(winstate);
+ int frameOptions = winstate->frameOptions;
+ int numfuncs = winstate->numfuncs;
+ int i;
+
+ winstate->partition_spooled = false;
+ winstate->framehead_valid = false;
+ winstate->frametail_valid = false;
+ winstate->grouptail_valid = false;
+ winstate->spooled_rows = 0;
+ winstate->currentpos = 0;
+ winstate->frameheadpos = 0;
+ winstate->frametailpos = 0;
+ winstate->currentgroup = 0;
+ winstate->frameheadgroup = 0;
+ winstate->frametailgroup = 0;
+ winstate->groupheadpos = 0;
+ winstate->grouptailpos = -1; /* see update_grouptailpos */
+ ExecClearTuple(winstate->agg_row_slot);
+ if (winstate->framehead_slot)
+ ExecClearTuple(winstate->framehead_slot);
+ if (winstate->frametail_slot)
+ ExecClearTuple(winstate->frametail_slot);
+
+ /*
+ * If this is the very first partition, we need to fetch the first input
+ * row to store in first_part_slot.
+ */
+ if (TupIsNull(winstate->first_part_slot))
+ {
+ TupleTableSlot *outerslot = ExecProcNode(outerPlan);
+
+ if (!TupIsNull(outerslot))
+ ExecCopySlot(winstate->first_part_slot, outerslot);
+ else
+ {
+ /* outer plan is empty, so we have nothing to do */
+ winstate->partition_spooled = true;
+ winstate->more_partitions = false;
+ return;
+ }
+ }
+
+ /* Create new tuplestore for this partition */
+ winstate->buffer = tuplestore_begin_heap(false, false, work_mem);
+
+ /*
+ * Set up read pointers for the tuplestore. The current pointer doesn't
+ * need BACKWARD capability, but the per-window-function read pointers do,
+ * and the aggregate pointer does if we might need to restart aggregation.
+ */
+ winstate->current_ptr = 0; /* read pointer 0 is pre-allocated */
+
+ /* reset default REWIND capability bit for current ptr */
+ tuplestore_set_eflags(winstate->buffer, 0);
+
+ /* create read pointers for aggregates, if needed */
+ if (winstate->numaggs > 0)
+ {
+ WindowObject agg_winobj = winstate->agg_winobj;
+ int readptr_flags = 0;
+
+ /*
+ * If the frame head is potentially movable, or we have an EXCLUSION
+ * clause, we might need to restart aggregation ...
+ */
+ if (!(frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) ||
+ (frameOptions & FRAMEOPTION_EXCLUSION))
+ {
+ /* ... so create a mark pointer to track the frame head */
+ agg_winobj->markptr = tuplestore_alloc_read_pointer(winstate->buffer, 0);
+ /* and the read pointer will need BACKWARD capability */
+ readptr_flags |= EXEC_FLAG_BACKWARD;
+ }
+
+ agg_winobj->readptr = tuplestore_alloc_read_pointer(winstate->buffer,
+ readptr_flags);
+ agg_winobj->markpos = -1;
+ agg_winobj->seekpos = -1;
+
+ /* Also reset the row counters for aggregates */
+ winstate->aggregatedbase = 0;
+ winstate->aggregatedupto = 0;
+ }
+
+ /* create mark and read pointers for each real window function */
+ for (i = 0; i < numfuncs; i++)
+ {
+ WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]);
+
+ if (!perfuncstate->plain_agg)
+ {
+ WindowObject winobj = perfuncstate->winobj;
+
+ winobj->markptr = tuplestore_alloc_read_pointer(winstate->buffer,
+ 0);
+ winobj->readptr = tuplestore_alloc_read_pointer(winstate->buffer,
+ EXEC_FLAG_BACKWARD);
+ winobj->markpos = -1;
+ winobj->seekpos = -1;
+ }
+ }
+
+ /*
+ * If we are in RANGE or GROUPS mode, then determining frame boundaries
+ * requires physical access to the frame endpoint rows, except in certain
+ * degenerate cases. We create read pointers to point to those rows, to
+ * simplify access and ensure that the tuplestore doesn't discard the
+ * endpoint rows prematurely. (Must create pointers in exactly the same
+ * cases that update_frameheadpos and update_frametailpos need them.)
+ */
+ winstate->framehead_ptr = winstate->frametail_ptr = -1; /* if not used */
+
+ if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
+ {
+ if (((frameOptions & FRAMEOPTION_START_CURRENT_ROW) &&
+ node->ordNumCols != 0) ||
+ (frameOptions & FRAMEOPTION_START_OFFSET))
+ winstate->framehead_ptr =
+ tuplestore_alloc_read_pointer(winstate->buffer, 0);
+ if (((frameOptions & FRAMEOPTION_END_CURRENT_ROW) &&
+ node->ordNumCols != 0) ||
+ (frameOptions & FRAMEOPTION_END_OFFSET))
+ winstate->frametail_ptr =
+ tuplestore_alloc_read_pointer(winstate->buffer, 0);
+ }
+
+ /*
+ * If we have an exclusion clause that requires knowing the boundaries of
+ * the current row's peer group, we create a read pointer to track the
+ * tail position of the peer group (i.e., first row of the next peer
+ * group). The head position does not require its own pointer because we
+ * maintain that as a side effect of advancing the current row.
+ */
+ winstate->grouptail_ptr = -1;
+
+ if ((frameOptions & (FRAMEOPTION_EXCLUDE_GROUP |
+ FRAMEOPTION_EXCLUDE_TIES)) &&
+ node->ordNumCols != 0)
+ {
+ winstate->grouptail_ptr =
+ tuplestore_alloc_read_pointer(winstate->buffer, 0);
+ }
+
+ /*
+ * Store the first tuple into the tuplestore (it's always available now;
+ * we either read it above, or saved it at the end of previous partition)
+ */
+ tuplestore_puttupleslot(winstate->buffer, winstate->first_part_slot);
+ winstate->spooled_rows++;
+}
+
+/*
+ * Read tuples from the outer node, up to and including position 'pos', and
+ * store them into the tuplestore. If pos is -1, reads the whole partition.
+ */
+static void
+spool_tuples(WindowAggState *winstate, int64 pos)
+{
+ WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan;
+ PlanState *outerPlan;
+ TupleTableSlot *outerslot;
+ MemoryContext oldcontext;
+
+ if (!winstate->buffer)
+ return; /* just a safety check */
+ if (winstate->partition_spooled)
+ return; /* whole partition done already */
+
+ /*
+ * If the tuplestore has spilled to disk, alternate reading and writing
+ * becomes quite expensive due to frequent buffer flushes. It's cheaper
+ * to force the entire partition to get spooled in one go.
+ *
+ * XXX this is a horrid kluge --- it'd be better to fix the performance
+ * problem inside tuplestore. FIXME
+ */
+ if (!tuplestore_in_memory(winstate->buffer))
+ pos = -1;
+
+ outerPlan = outerPlanState(winstate);
+
+ /* Must be in query context to call outerplan */
+ oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory);
+
+ while (winstate->spooled_rows <= pos || pos == -1)
+ {
+ outerslot = ExecProcNode(outerPlan);
+ if (TupIsNull(outerslot))
+ {
+ /* reached the end of the last partition */
+ winstate->partition_spooled = true;
+ winstate->more_partitions = false;
+ break;
+ }
+
+ if (node->partNumCols > 0)
+ {
+ ExprContext *econtext = winstate->tmpcontext;
+
+ econtext->ecxt_innertuple = winstate->first_part_slot;
+ econtext->ecxt_outertuple = outerslot;
+
+ /* Check if this tuple still belongs to the current partition */
+ if (!ExecQualAndReset(winstate->partEqfunction, econtext))
+ {
+ /*
+ * end of partition; copy the tuple for the next cycle.
+ */
+ ExecCopySlot(winstate->first_part_slot, outerslot);
+ winstate->partition_spooled = true;
+ winstate->more_partitions = true;
+ break;
+ }
+ }
+
+ /* Still in partition, so save it into the tuplestore */
+ tuplestore_puttupleslot(winstate->buffer, outerslot);
+ winstate->spooled_rows++;
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * release_partition
+ * clear information kept within a partition, including
+ * tuplestore and aggregate results.
+ */
+static void
+release_partition(WindowAggState *winstate)
+{
+ int i;
+
+ for (i = 0; i < winstate->numfuncs; i++)
+ {
+ WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]);
+
+ /* Release any partition-local state of this window function */
+ if (perfuncstate->winobj)
+ perfuncstate->winobj->localmem = NULL;
+ }
+
+ /*
+ * Release all partition-local memory (in particular, any partition-local
+ * state that we might have trashed our pointers to in the above loop, and
+ * any aggregate temp data). We don't rely on retail pfree because some
+ * aggregates might have allocated data we don't have direct pointers to.
+ */
+ MemoryContextResetAndDeleteChildren(winstate->partcontext);
+ MemoryContextResetAndDeleteChildren(winstate->aggcontext);
+ for (i = 0; i < winstate->numaggs; i++)
+ {
+ if (winstate->peragg[i].aggcontext != winstate->aggcontext)
+ MemoryContextResetAndDeleteChildren(winstate->peragg[i].aggcontext);
+ }
+
+ if (winstate->buffer)
+ tuplestore_end(winstate->buffer);
+ winstate->buffer = NULL;
+ winstate->partition_spooled = false;
+}
+
+/*
+ * row_is_in_frame
+ * Determine whether a row is in the current row's window frame according
+ * to our window framing rule
+ *
+ * The caller must have already determined that the row is in the partition
+ * and fetched it into a slot. This function just encapsulates the framing
+ * rules.
+ *
+ * Returns:
+ * -1, if the row is out of frame and no succeeding rows can be in frame
+ * 0, if the row is out of frame but succeeding rows might be in frame
+ * 1, if the row is in frame
+ *
+ * May clobber winstate->temp_slot_2.
+ */
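+
+/*
+ * Concrete example (illustrative, assuming no exclusion clause): with
+ * ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING and the current row at position
+ * 10, positions 9..11 return 1, any earlier position returns 0 (it lies
+ * before the frame head, but later rows may still be in frame), and
+ * position 12 or later returns -1.
+ */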
+static int
+row_is_in_frame(WindowAggState *winstate, int64 pos, TupleTableSlot *slot)
+{
+ int frameOptions = winstate->frameOptions;
+
+ Assert(pos >= 0); /* else caller error */
+
+ /*
+ * First, check frame starting conditions. We might as well delegate this
+ * to update_frameheadpos always; it doesn't add any notable cost.
+ */
+ update_frameheadpos(winstate);
+ if (pos < winstate->frameheadpos)
+ return 0;
+
+ /*
+ * Okay so far, now check frame ending conditions. Here, we avoid calling
+ * update_frametailpos in simple cases, so as not to spool tuples further
+ * ahead than necessary.
+ */
+ if (frameOptions & FRAMEOPTION_END_CURRENT_ROW)
+ {
+ if (frameOptions & FRAMEOPTION_ROWS)
+ {
+ /* rows after current row are out of frame */
+ if (pos > winstate->currentpos)
+ return -1;
+ }
+ else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
+ {
+ /* following row that is not peer is out of frame */
+ if (pos > winstate->currentpos &&
+ !are_peers(winstate, slot, winstate->ss.ss_ScanTupleSlot))
+ return -1;
+ }
+ else
+ Assert(false);
+ }
+ else if (frameOptions & FRAMEOPTION_END_OFFSET)
+ {
+ if (frameOptions & FRAMEOPTION_ROWS)
+ {
+ int64 offset = DatumGetInt64(winstate->endOffsetValue);
+
+ /* rows after current row + offset are out of frame */
+ if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING)
+ offset = -offset;
+
+ if (pos > winstate->currentpos + offset)
+ return -1;
+ }
+ else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
+ {
+ /* hard cases, so delegate to update_frametailpos */
+ update_frametailpos(winstate);
+ if (pos >= winstate->frametailpos)
+ return -1;
+ }
+ else
+ Assert(false);
+ }
+
+ /* Check exclusion clause */
+ if (frameOptions & FRAMEOPTION_EXCLUDE_CURRENT_ROW)
+ {
+ if (pos == winstate->currentpos)
+ return 0;
+ }
+ else if ((frameOptions & FRAMEOPTION_EXCLUDE_GROUP) ||
+ ((frameOptions & FRAMEOPTION_EXCLUDE_TIES) &&
+ pos != winstate->currentpos))
+ {
+ WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan;
+
+ /* If no ORDER BY, all rows are peers with each other */
+ if (node->ordNumCols == 0)
+ return 0;
+ /* Otherwise, check the group boundaries */
+ if (pos >= winstate->groupheadpos)
+ {
+ update_grouptailpos(winstate);
+ if (pos < winstate->grouptailpos)
+ return 0;
+ }
+ }
+
+ /* If we get here, it's in frame */
+ return 1;
+}
+
+/*
+ * update_frameheadpos
+ * make frameheadpos valid for the current row
+ *
+ * Note that frameheadpos is computed without regard for any window exclusion
+ * clause; the current row and/or its peers are considered part of the frame
+ * for this purpose even if they must be excluded later.
+ *
+ * May clobber winstate->temp_slot_2.
+ */
+static void
+update_frameheadpos(WindowAggState *winstate)
+{
+ WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan;
+ int frameOptions = winstate->frameOptions;
+ MemoryContext oldcontext;
+
+ if (winstate->framehead_valid)
+ return; /* already known for current row */
+
+ /* We may be called in a short-lived context */
+ oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory);
+
+ if (frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING)
+ {
+ /* In UNBOUNDED PRECEDING mode, frame head is always row 0 */
+ winstate->frameheadpos = 0;
+ winstate->framehead_valid = true;
+ }
+ else if (frameOptions & FRAMEOPTION_START_CURRENT_ROW)
+ {
+ if (frameOptions & FRAMEOPTION_ROWS)
+ {
+ /* In ROWS mode, frame head is the same as current */
+ winstate->frameheadpos = winstate->currentpos;
+ winstate->framehead_valid = true;
+ }
+ else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
+ {
+ /* If no ORDER BY, all rows are peers with each other */
+ if (node->ordNumCols == 0)
+ {
+ winstate->frameheadpos = 0;
+ winstate->framehead_valid = true;
+ MemoryContextSwitchTo(oldcontext);
+ return;
+ }
+
+ /*
+ * In RANGE or GROUPS START_CURRENT_ROW mode, frame head is the
+ * first row that is a peer of current row. We keep a copy of the
+ * last-known frame head row in framehead_slot, and advance as
+ * necessary. Note that if we reach end of partition, we will
+ * leave frameheadpos = end+1 and framehead_slot empty.
+ */
+ tuplestore_select_read_pointer(winstate->buffer,
+ winstate->framehead_ptr);
+ if (winstate->frameheadpos == 0 &&
+ TupIsNull(winstate->framehead_slot))
+ {
+ /* fetch first row into framehead_slot, if we didn't already */
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->framehead_slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ }
+
+ while (!TupIsNull(winstate->framehead_slot))
+ {
+ if (are_peers(winstate, winstate->framehead_slot,
+ winstate->ss.ss_ScanTupleSlot))
+ break; /* this row is the correct frame head */
+ /* Note we advance frameheadpos even if the fetch fails */
+ winstate->frameheadpos++;
+ spool_tuples(winstate, winstate->frameheadpos);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->framehead_slot))
+ break; /* end of partition */
+ }
+ winstate->framehead_valid = true;
+ }
+ else
+ Assert(false);
+ }
+ else if (frameOptions & FRAMEOPTION_START_OFFSET)
+ {
+ if (frameOptions & FRAMEOPTION_ROWS)
+ {
+ /* In ROWS mode, bound is physically n before/after current */
+ int64 offset = DatumGetInt64(winstate->startOffsetValue);
+
+ if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING)
+ offset = -offset;
+
+ winstate->frameheadpos = winstate->currentpos + offset;
+ /* frame head can't go before first row */
+ if (winstate->frameheadpos < 0)
+ winstate->frameheadpos = 0;
+ else if (winstate->frameheadpos > winstate->currentpos + 1)
+ {
+ /* make sure frameheadpos is not past end of partition */
+ spool_tuples(winstate, winstate->frameheadpos - 1);
+ if (winstate->frameheadpos > winstate->spooled_rows)
+ winstate->frameheadpos = winstate->spooled_rows;
+ }
+ winstate->framehead_valid = true;
+ }
+ else if (frameOptions & FRAMEOPTION_RANGE)
+ {
+ /*
+ * In RANGE START_OFFSET mode, frame head is the first row that
+ * satisfies the in_range constraint relative to the current row.
+ * We keep a copy of the last-known frame head row in
+ * framehead_slot, and advance as necessary. Note that if we
+ * reach end of partition, we will leave frameheadpos = end+1 and
+ * framehead_slot empty.
+ */
+ int sortCol = node->ordColIdx[0];
+ bool sub,
+ less;
+
+ /* We must have an ordering column */
+ Assert(node->ordNumCols == 1);
+
+ /* Precompute flags for in_range checks */
+ if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING)
+ sub = true; /* subtract startOffset from current row */
+ else
+ sub = false; /* add it */
+ less = false; /* normally, we want frame head >= sum */
+ /* If sort order is descending, flip both flags */
+ if (!winstate->inRangeAsc)
+ {
+ sub = !sub;
+ less = true;
+ }
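+
+ /*
+ * Illustrative example, assuming an ascending integer ORDER BY
+ * column: for RANGE BETWEEN 5 PRECEDING AND CURRENT ROW we get
+ * sub = true and less = false, so the loop below stops at the first
+ * row whose sort-column value is >= (current row's value - 5), as
+ * reported by the datatype's in_range support function.
+ */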
+
+ tuplestore_select_read_pointer(winstate->buffer,
+ winstate->framehead_ptr);
+ if (winstate->frameheadpos == 0 &&
+ TupIsNull(winstate->framehead_slot))
+ {
+ /* fetch first row into framehead_slot, if we didn't already */
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->framehead_slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ }
+
+ while (!TupIsNull(winstate->framehead_slot))
+ {
+ Datum headval,
+ currval;
+ bool headisnull,
+ currisnull;
+
+ headval = slot_getattr(winstate->framehead_slot, sortCol,
+ &headisnull);
+ currval = slot_getattr(winstate->ss.ss_ScanTupleSlot, sortCol,
+ &currisnull);
+ if (headisnull || currisnull)
+ {
+ /* order of the rows depends only on nulls_first */
+ if (winstate->inRangeNullsFirst)
+ {
+ /* advance head if head is null and curr is not */
+ if (!headisnull || currisnull)
+ break;
+ }
+ else
+ {
+ /* advance head if head is not null and curr is null */
+ if (headisnull || !currisnull)
+ break;
+ }
+ }
+ else
+ {
+ if (DatumGetBool(FunctionCall5Coll(&winstate->startInRangeFunc,
+ winstate->inRangeColl,
+ headval,
+ currval,
+ winstate->startOffsetValue,
+ BoolGetDatum(sub),
+ BoolGetDatum(less))))
+ break; /* this row is the correct frame head */
+ }
+ /* Note we advance frameheadpos even if the fetch fails */
+ winstate->frameheadpos++;
+ spool_tuples(winstate, winstate->frameheadpos);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->framehead_slot))
+ break; /* end of partition */
+ }
+ winstate->framehead_valid = true;
+ }
+ else if (frameOptions & FRAMEOPTION_GROUPS)
+ {
+ /*
+ * In GROUPS START_OFFSET mode, frame head is the first row of the
+ * first peer group whose number satisfies the offset constraint.
+ * We keep a copy of the last-known frame head row in
+ * framehead_slot, and advance as necessary. Note that if we
+ * reach end of partition, we will leave frameheadpos = end+1 and
+ * framehead_slot empty.
+ */
+ int64 offset = DatumGetInt64(winstate->startOffsetValue);
+ int64 minheadgroup;
+
+ if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING)
+ minheadgroup = winstate->currentgroup - offset;
+ else
+ minheadgroup = winstate->currentgroup + offset;
+
+ tuplestore_select_read_pointer(winstate->buffer,
+ winstate->framehead_ptr);
+ if (winstate->frameheadpos == 0 &&
+ TupIsNull(winstate->framehead_slot))
+ {
+ /* fetch first row into framehead_slot, if we didn't already */
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->framehead_slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ }
+
+ while (!TupIsNull(winstate->framehead_slot))
+ {
+ if (winstate->frameheadgroup >= minheadgroup)
+ break; /* this row is the correct frame head */
+ ExecCopySlot(winstate->temp_slot_2, winstate->framehead_slot);
+ /* Note we advance frameheadpos even if the fetch fails */
+ winstate->frameheadpos++;
+ spool_tuples(winstate, winstate->frameheadpos);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->framehead_slot))
+ break; /* end of partition */
+ if (!are_peers(winstate, winstate->temp_slot_2,
+ winstate->framehead_slot))
+ winstate->frameheadgroup++;
+ }
+ ExecClearTuple(winstate->temp_slot_2);
+ winstate->framehead_valid = true;
+ }
+ else
+ Assert(false);
+ }
+ else
+ Assert(false);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * update_frametailpos
+ * make frametailpos valid for the current row
+ *
+ * Note that frametailpos is computed without regard for any window exclusion
+ * clause; the current row and/or its peers are considered part of the frame
+ * for this purpose even if they must be excluded later.
+ *
+ * May clobber winstate->temp_slot_2.
+ */
+static void
+update_frametailpos(WindowAggState *winstate)
+{
+ WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan;
+ int frameOptions = winstate->frameOptions;
+ MemoryContext oldcontext;
+
+ if (winstate->frametail_valid)
+ return; /* already known for current row */
+
+ /* We may be called in a short-lived context */
+ oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory);
+
+ if (frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING)
+ {
+ /* In UNBOUNDED FOLLOWING mode, all partition rows are in frame */
+ spool_tuples(winstate, -1);
+ winstate->frametailpos = winstate->spooled_rows;
+ winstate->frametail_valid = true;
+ }
+ else if (frameOptions & FRAMEOPTION_END_CURRENT_ROW)
+ {
+ if (frameOptions & FRAMEOPTION_ROWS)
+ {
+ /* In ROWS mode, exactly the rows up to current are in frame */
+ winstate->frametailpos = winstate->currentpos + 1;
+ winstate->frametail_valid = true;
+ }
+ else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
+ {
+ /* If no ORDER BY, all rows are peers with each other */
+ if (node->ordNumCols == 0)
+ {
+ spool_tuples(winstate, -1);
+ winstate->frametailpos = winstate->spooled_rows;
+ winstate->frametail_valid = true;
+ MemoryContextSwitchTo(oldcontext);
+ return;
+ }
+
+ /*
+ * In RANGE or GROUPS END_CURRENT_ROW mode, frame end is the last
+ * row that is a peer of current row, frame tail is the row after
+ * that (if any). We keep a copy of the last-known frame tail row
+ * in frametail_slot, and advance as necessary. Note that if we
+ * reach end of partition, we will leave frametailpos = end+1 and
+ * frametail_slot empty.
+ */
+ tuplestore_select_read_pointer(winstate->buffer,
+ winstate->frametail_ptr);
+ if (winstate->frametailpos == 0 &&
+ TupIsNull(winstate->frametail_slot))
+ {
+ /* fetch first row into frametail_slot, if we didn't already */
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->frametail_slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ }
+
+ while (!TupIsNull(winstate->frametail_slot))
+ {
+ if (winstate->frametailpos > winstate->currentpos &&
+ !are_peers(winstate, winstate->frametail_slot,
+ winstate->ss.ss_ScanTupleSlot))
+ break; /* this row is the frame tail */
+ /* Note we advance frametailpos even if the fetch fails */
+ winstate->frametailpos++;
+ spool_tuples(winstate, winstate->frametailpos);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->frametail_slot))
+ break; /* end of partition */
+ }
+ winstate->frametail_valid = true;
+ }
+ else
+ Assert(false);
+ }
+ else if (frameOptions & FRAMEOPTION_END_OFFSET)
+ {
+ if (frameOptions & FRAMEOPTION_ROWS)
+ {
+ /* In ROWS mode, bound is physically n before/after current */
+ int64 offset = DatumGetInt64(winstate->endOffsetValue);
+
+ if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING)
+ offset = -offset;
+
+ winstate->frametailpos = winstate->currentpos + offset + 1;
+ /* smallest allowable value of frametailpos is 0 */
+ if (winstate->frametailpos < 0)
+ winstate->frametailpos = 0;
+ else if (winstate->frametailpos > winstate->currentpos + 1)
+ {
+ /* make sure frametailpos is not past end of partition */
+ spool_tuples(winstate, winstate->frametailpos - 1);
+ if (winstate->frametailpos > winstate->spooled_rows)
+ winstate->frametailpos = winstate->spooled_rows;
+ }
+ winstate->frametail_valid = true;
+ }
+ else if (frameOptions & FRAMEOPTION_RANGE)
+ {
+ /*
+ * In RANGE END_OFFSET mode, frame end is the last row that
+ * satisfies the in_range constraint relative to the current row,
+ * frame tail is the row after that (if any). We keep a copy of
+ * the last-known frame tail row in frametail_slot, and advance as
+ * necessary. Note that if we reach end of partition, we will
+ * leave frametailpos = end+1 and frametail_slot empty.
+ */
+ int sortCol = node->ordColIdx[0];
+ bool sub,
+ less;
+
+ /* We must have an ordering column */
+ Assert(node->ordNumCols == 1);
+
+ /* Precompute flags for in_range checks */
+ if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING)
+ sub = true; /* subtract endOffset from current row */
+ else
+ sub = false; /* add it */
+ less = true; /* normally, we want frame tail <= sum */
+ /* If sort order is descending, flip both flags */
+ if (!winstate->inRangeAsc)
+ {
+ sub = !sub;
+ less = false;
+ }
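+			/*
+			 * For example, with ORDER BY x ASC and RANGE BETWEEN CURRENT ROW
+			 * AND 10 FOLLOWING we get sub = false and less = true, so a row
+			 * remains in frame as long as in_range reports
+			 * tailval <= currval + 10; with DESC ordering the flags flip and
+			 * the test becomes tailval >= currval - 10.
+			 */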
+
+ tuplestore_select_read_pointer(winstate->buffer,
+ winstate->frametail_ptr);
+ if (winstate->frametailpos == 0 &&
+ TupIsNull(winstate->frametail_slot))
+ {
+ /* fetch first row into frametail_slot, if we didn't already */
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->frametail_slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ }
+
+ while (!TupIsNull(winstate->frametail_slot))
+ {
+ Datum tailval,
+ currval;
+ bool tailisnull,
+ currisnull;
+
+ tailval = slot_getattr(winstate->frametail_slot, sortCol,
+ &tailisnull);
+ currval = slot_getattr(winstate->ss.ss_ScanTupleSlot, sortCol,
+ &currisnull);
+ if (tailisnull || currisnull)
+ {
+ /* order of the rows depends only on nulls_first */
+ if (winstate->inRangeNullsFirst)
+ {
+ /* advance tail if tail is null or curr is not */
+ if (!tailisnull)
+ break;
+ }
+ else
+ {
+ /* advance tail if tail is not null or curr is null */
+ if (!currisnull)
+ break;
+ }
+ }
+ else
+ {
+ if (!DatumGetBool(FunctionCall5Coll(&winstate->endInRangeFunc,
+ winstate->inRangeColl,
+ tailval,
+ currval,
+ winstate->endOffsetValue,
+ BoolGetDatum(sub),
+ BoolGetDatum(less))))
+ break; /* this row is the correct frame tail */
+ }
+ /* Note we advance frametailpos even if the fetch fails */
+ winstate->frametailpos++;
+ spool_tuples(winstate, winstate->frametailpos);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->frametail_slot))
+ break; /* end of partition */
+ }
+ winstate->frametail_valid = true;
+ }
+ else if (frameOptions & FRAMEOPTION_GROUPS)
+ {
+ /*
+ * In GROUPS END_OFFSET mode, frame end is the last row of the
+ * last peer group whose number satisfies the offset constraint,
+ * and frame tail is the row after that (if any). We keep a copy
+ * of the last-known frame tail row in frametail_slot, and advance
+ * as necessary. Note that if we reach end of partition, we will
+ * leave frametailpos = end+1 and frametail_slot empty.
+ */
+ int64 offset = DatumGetInt64(winstate->endOffsetValue);
+ int64 maxtailgroup;
+
+ if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING)
+ maxtailgroup = winstate->currentgroup - offset;
+ else
+ maxtailgroup = winstate->currentgroup + offset;
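+			/*
+			 * For example, GROUPS BETWEEN CURRENT ROW AND 1 FOLLOWING gives
+			 * maxtailgroup = currentgroup + 1, so the loop below stops at the
+			 * first row of the second peer group after the current one; the
+			 * frame end thus covers the current peer group and the one
+			 * following it.
+			 */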
+
+ tuplestore_select_read_pointer(winstate->buffer,
+ winstate->frametail_ptr);
+ if (winstate->frametailpos == 0 &&
+ TupIsNull(winstate->frametail_slot))
+ {
+ /* fetch first row into frametail_slot, if we didn't already */
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->frametail_slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ }
+
+ while (!TupIsNull(winstate->frametail_slot))
+ {
+ if (winstate->frametailgroup > maxtailgroup)
+ break; /* this row is the correct frame tail */
+ ExecCopySlot(winstate->temp_slot_2, winstate->frametail_slot);
+ /* Note we advance frametailpos even if the fetch fails */
+ winstate->frametailpos++;
+ spool_tuples(winstate, winstate->frametailpos);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->frametail_slot))
+ break; /* end of partition */
+ if (!are_peers(winstate, winstate->temp_slot_2,
+ winstate->frametail_slot))
+ winstate->frametailgroup++;
+ }
+ ExecClearTuple(winstate->temp_slot_2);
+ winstate->frametail_valid = true;
+ }
+ else
+ Assert(false);
+ }
+ else
+ Assert(false);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * update_grouptailpos
+ * make grouptailpos valid for the current row
+ *
+ * May clobber winstate->temp_slot_2.
+ */
+static void
+update_grouptailpos(WindowAggState *winstate)
+{
+ WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan;
+ MemoryContext oldcontext;
+
+ if (winstate->grouptail_valid)
+ return; /* already known for current row */
+
+ /* We may be called in a short-lived context */
+ oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory);
+
+ /* If no ORDER BY, all rows are peers with each other */
+ if (node->ordNumCols == 0)
+ {
+ spool_tuples(winstate, -1);
+ winstate->grouptailpos = winstate->spooled_rows;
+ winstate->grouptail_valid = true;
+ MemoryContextSwitchTo(oldcontext);
+ return;
+ }
+
+ /*
+ * Because grouptail_valid is reset only when current row advances into a
+ * new peer group, we always reach here knowing that grouptailpos needs to
+ * be advanced by at least one row. Hence, unlike the otherwise similar
+ * case for frame tail tracking, we do not need persistent storage of the
+ * group tail row.
+ */
+ Assert(winstate->grouptailpos <= winstate->currentpos);
+ tuplestore_select_read_pointer(winstate->buffer,
+ winstate->grouptail_ptr);
+ for (;;)
+ {
+ /* Note we advance grouptailpos even if the fetch fails */
+ winstate->grouptailpos++;
+ spool_tuples(winstate, winstate->grouptailpos);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->temp_slot_2))
+ break; /* end of partition */
+ if (winstate->grouptailpos > winstate->currentpos &&
+ !are_peers(winstate, winstate->temp_slot_2,
+ winstate->ss.ss_ScanTupleSlot))
+ break; /* this row is the group tail */
+ }
+ ExecClearTuple(winstate->temp_slot_2);
+ winstate->grouptail_valid = true;
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+
+/* -----------------
+ * ExecWindowAgg
+ *
+ * ExecWindowAgg receives tuples from its outer subplan and
+ * stores them into a tuplestore, then processes window functions.
+ *		This node doesn't reduce or filter any rows, so the number of
+ *		rows returned is exactly the same as its outer subplan's result.
+ * -----------------
+ */
+static TupleTableSlot *
+ExecWindowAgg(PlanState *pstate)
+{
+ WindowAggState *winstate = castNode(WindowAggState, pstate);
+ ExprContext *econtext;
+ int i;
+ int numfuncs;
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (winstate->all_done)
+ return NULL;
+
+ /*
+	 * Compute frame offset values, if any, during the first call (or after a
+	 * rescan).  These are assumed to hold constant throughout the scan; if the
+	 * user gives us a volatile expression, we'll only use its initial value.
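+	 *
+	 * For example, with ROWS BETWEEN 2 PRECEDING AND CURRENT ROW the starting
+	 * offset expression evaluates to the int8 constant 2; that value is copied
+	 * into startOffsetValue below and reused unchanged for every row.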
+ */
+ if (winstate->all_first)
+ {
+ int frameOptions = winstate->frameOptions;
+ ExprContext *econtext = winstate->ss.ps.ps_ExprContext;
+ Datum value;
+ bool isnull;
+ int16 len;
+ bool byval;
+
+ if (frameOptions & FRAMEOPTION_START_OFFSET)
+ {
+ Assert(winstate->startOffset != NULL);
+ value = ExecEvalExprSwitchContext(winstate->startOffset,
+ econtext,
+ &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("frame starting offset must not be null")));
+ /* copy value into query-lifespan context */
+ get_typlenbyval(exprType((Node *) winstate->startOffset->expr),
+ &len, &byval);
+ winstate->startOffsetValue = datumCopy(value, byval, len);
+ if (frameOptions & (FRAMEOPTION_ROWS | FRAMEOPTION_GROUPS))
+ {
+ /* value is known to be int8 */
+ int64 offset = DatumGetInt64(value);
+
+ if (offset < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PRECEDING_OR_FOLLOWING_SIZE),
+ errmsg("frame starting offset must not be negative")));
+ }
+ }
+ if (frameOptions & FRAMEOPTION_END_OFFSET)
+ {
+ Assert(winstate->endOffset != NULL);
+ value = ExecEvalExprSwitchContext(winstate->endOffset,
+ econtext,
+ &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("frame ending offset must not be null")));
+ /* copy value into query-lifespan context */
+ get_typlenbyval(exprType((Node *) winstate->endOffset->expr),
+ &len, &byval);
+ winstate->endOffsetValue = datumCopy(value, byval, len);
+ if (frameOptions & (FRAMEOPTION_ROWS | FRAMEOPTION_GROUPS))
+ {
+ /* value is known to be int8 */
+ int64 offset = DatumGetInt64(value);
+
+ if (offset < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PRECEDING_OR_FOLLOWING_SIZE),
+ errmsg("frame ending offset must not be negative")));
+ }
+ }
+ winstate->all_first = false;
+ }
+
+ if (winstate->buffer == NULL)
+ {
+ /* Initialize for first partition and set current row = 0 */
+ begin_partition(winstate);
+ /* If there are no input rows, we'll detect that and exit below */
+ }
+ else
+ {
+ /* Advance current row within partition */
+ winstate->currentpos++;
+ /* This might mean that the frame moves, too */
+ winstate->framehead_valid = false;
+ winstate->frametail_valid = false;
+ /* we don't need to invalidate grouptail here; see below */
+ }
+
+ /*
+ * Spool all tuples up to and including the current row, if we haven't
+ * already
+ */
+ spool_tuples(winstate, winstate->currentpos);
+
+ /* Move to the next partition if we reached the end of this partition */
+ if (winstate->partition_spooled &&
+ winstate->currentpos >= winstate->spooled_rows)
+ {
+ release_partition(winstate);
+
+ if (winstate->more_partitions)
+ {
+ begin_partition(winstate);
+ Assert(winstate->spooled_rows > 0);
+ }
+ else
+ {
+ winstate->all_done = true;
+ return NULL;
+ }
+ }
+
+ /* final output execution is in ps_ExprContext */
+ econtext = winstate->ss.ps.ps_ExprContext;
+
+ /* Clear the per-output-tuple context for current row */
+ ResetExprContext(econtext);
+
+ /*
+ * Read the current row from the tuplestore, and save in ScanTupleSlot.
+ * (We can't rely on the outerplan's output slot because we may have to
+ * read beyond the current row. Also, we have to actually copy the row
+ * out of the tuplestore, since window function evaluation might cause the
+ * tuplestore to dump its state to disk.)
+ *
+ * In GROUPS mode, or when tracking a group-oriented exclusion clause, we
+ * must also detect entering a new peer group and update associated state
+ * when that happens. We use temp_slot_2 to temporarily hold the previous
+ * row for this purpose.
+ *
+ * Current row must be in the tuplestore, since we spooled it above.
+ */
+ tuplestore_select_read_pointer(winstate->buffer, winstate->current_ptr);
+ if ((winstate->frameOptions & (FRAMEOPTION_GROUPS |
+ FRAMEOPTION_EXCLUDE_GROUP |
+ FRAMEOPTION_EXCLUDE_TIES)) &&
+ winstate->currentpos > 0)
+ {
+ ExecCopySlot(winstate->temp_slot_2, winstate->ss.ss_ScanTupleSlot);
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->ss.ss_ScanTupleSlot))
+ elog(ERROR, "unexpected end of tuplestore");
+ if (!are_peers(winstate, winstate->temp_slot_2,
+ winstate->ss.ss_ScanTupleSlot))
+ {
+ winstate->currentgroup++;
+ winstate->groupheadpos = winstate->currentpos;
+ winstate->grouptail_valid = false;
+ }
+ ExecClearTuple(winstate->temp_slot_2);
+ }
+ else
+ {
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true,
+ winstate->ss.ss_ScanTupleSlot))
+ elog(ERROR, "unexpected end of tuplestore");
+ }
+
+ /*
+ * Evaluate true window functions
+ */
+ numfuncs = winstate->numfuncs;
+ for (i = 0; i < numfuncs; i++)
+ {
+ WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]);
+
+ if (perfuncstate->plain_agg)
+ continue;
+ eval_windowfunction(winstate, perfuncstate,
+ &(econtext->ecxt_aggvalues[perfuncstate->wfuncstate->wfuncno]),
+ &(econtext->ecxt_aggnulls[perfuncstate->wfuncstate->wfuncno]));
+ }
+
+ /*
+ * Evaluate aggregates
+ */
+ if (winstate->numaggs > 0)
+ eval_windowaggregates(winstate);
+
+ /*
+ * If we have created auxiliary read pointers for the frame or group
+ * boundaries, force them to be kept up-to-date, because we don't know
+ * whether the window function(s) will do anything that requires that.
+ * Failing to advance the pointers would result in being unable to trim
+ * data from the tuplestore, which is bad. (If we could know in advance
+ * whether the window functions will use frame boundary info, we could
+ * skip creating these pointers in the first place ... but unfortunately
+ * the window function API doesn't require that.)
+ */
+ if (winstate->framehead_ptr >= 0)
+ update_frameheadpos(winstate);
+ if (winstate->frametail_ptr >= 0)
+ update_frametailpos(winstate);
+ if (winstate->grouptail_ptr >= 0)
+ update_grouptailpos(winstate);
+
+ /*
+ * Truncate any no-longer-needed rows from the tuplestore.
+ */
+ tuplestore_trim(winstate->buffer);
+
+ /*
+ * Form and return a projection tuple using the windowfunc results and the
+ * current row. Setting ecxt_outertuple arranges that any Vars will be
+ * evaluated with respect to that row.
+ */
+ econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot;
+
+ return ExecProject(winstate->ss.ps.ps_ProjInfo);
+}
+
+/* -----------------
+ * ExecInitWindowAgg
+ *
+ * Creates the run-time information for the WindowAgg node produced by the
+ * planner and initializes its outer subtree
+ * -----------------
+ */
+WindowAggState *
+ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags)
+{
+ WindowAggState *winstate;
+ Plan *outerPlan;
+ ExprContext *econtext;
+ ExprContext *tmpcontext;
+ WindowStatePerFunc perfunc;
+ WindowStatePerAgg peragg;
+ int frameOptions = node->frameOptions;
+ int numfuncs,
+ wfuncno,
+ numaggs,
+ aggno;
+ TupleDesc scanDesc;
+ ListCell *l;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * create state structure
+ */
+ winstate = makeNode(WindowAggState);
+ winstate->ss.ps.plan = (Plan *) node;
+ winstate->ss.ps.state = estate;
+ winstate->ss.ps.ExecProcNode = ExecWindowAgg;
+
+ /*
+ * Create expression contexts. We need two, one for per-input-tuple
+ * processing and one for per-output-tuple processing. We cheat a little
+ * by using ExecAssignExprContext() to build both.
+ */
+ ExecAssignExprContext(estate, &winstate->ss.ps);
+ tmpcontext = winstate->ss.ps.ps_ExprContext;
+ winstate->tmpcontext = tmpcontext;
+ ExecAssignExprContext(estate, &winstate->ss.ps);
+
+ /* Create long-lived context for storage of partition-local memory etc */
+ winstate->partcontext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "WindowAgg Partition",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Create mid-lived context for aggregate trans values etc.
+ *
+ * Note that moving aggregates each use their own private context, not
+ * this one.
+ */
+ winstate->aggcontext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "WindowAgg Aggregates",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * WindowAgg nodes never have quals, since they can only occur at the
+ * logical top level of a query (ie, after any WHERE or HAVING filters)
+ */
+ Assert(node->plan.qual == NIL);
+ winstate->ss.ps.qual = NULL;
+
+ /*
+ * initialize child nodes
+ */
+ outerPlan = outerPlan(node);
+ outerPlanState(winstate) = ExecInitNode(outerPlan, estate, eflags);
+
+ /*
+ * initialize source tuple type (which is also the tuple type that we'll
+ * store in the tuplestore and use in all our working slots).
+ */
+ ExecCreateScanSlotFromOuterPlan(estate, &winstate->ss, &TTSOpsMinimalTuple);
+ scanDesc = winstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
+
+ /* the outer tuple isn't the child's tuple, but always a minimal tuple */
+ winstate->ss.ps.outeropsset = true;
+ winstate->ss.ps.outerops = &TTSOpsMinimalTuple;
+ winstate->ss.ps.outeropsfixed = true;
+
+ /*
+ * tuple table initialization
+ */
+ winstate->first_part_slot = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+ winstate->agg_row_slot = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+ winstate->temp_slot_1 = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+ winstate->temp_slot_2 = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+
+ /*
+ * create frame head and tail slots only if needed (must create slots in
+ * exactly the same cases that update_frameheadpos and update_frametailpos
+ * need them)
+ */
+ winstate->framehead_slot = winstate->frametail_slot = NULL;
+
+ if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
+ {
+ if (((frameOptions & FRAMEOPTION_START_CURRENT_ROW) &&
+ node->ordNumCols != 0) ||
+ (frameOptions & FRAMEOPTION_START_OFFSET))
+ winstate->framehead_slot = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+ if (((frameOptions & FRAMEOPTION_END_CURRENT_ROW) &&
+ node->ordNumCols != 0) ||
+ (frameOptions & FRAMEOPTION_END_OFFSET))
+ winstate->frametail_slot = ExecInitExtraTupleSlot(estate, scanDesc,
+ &TTSOpsMinimalTuple);
+ }
+
+ /*
+ * Initialize result slot, type and projection.
+ */
+ ExecInitResultTupleSlotTL(&winstate->ss.ps, &TTSOpsVirtual);
+ ExecAssignProjectionInfo(&winstate->ss.ps, NULL);
+
+ /* Set up data for comparing tuples */
+ if (node->partNumCols > 0)
+ winstate->partEqfunction =
+ execTuplesMatchPrepare(scanDesc,
+ node->partNumCols,
+ node->partColIdx,
+ node->partOperators,
+ node->partCollations,
+ &winstate->ss.ps);
+
+ if (node->ordNumCols > 0)
+ winstate->ordEqfunction =
+ execTuplesMatchPrepare(scanDesc,
+ node->ordNumCols,
+ node->ordColIdx,
+ node->ordOperators,
+ node->ordCollations,
+ &winstate->ss.ps);
+
+ /*
+ * WindowAgg nodes use aggvalues and aggnulls as well as Agg nodes.
+ */
+ numfuncs = winstate->numfuncs;
+ numaggs = winstate->numaggs;
+ econtext = winstate->ss.ps.ps_ExprContext;
+ econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numfuncs);
+ econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numfuncs);
+
+ /*
+ * allocate per-wfunc/per-agg state information.
+ */
+ perfunc = (WindowStatePerFunc) palloc0(sizeof(WindowStatePerFuncData) * numfuncs);
+ peragg = (WindowStatePerAgg) palloc0(sizeof(WindowStatePerAggData) * numaggs);
+ winstate->perfunc = perfunc;
+ winstate->peragg = peragg;
+
+ wfuncno = -1;
+ aggno = -1;
+ foreach(l, winstate->funcs)
+ {
+ WindowFuncExprState *wfuncstate = (WindowFuncExprState *) lfirst(l);
+ WindowFunc *wfunc = wfuncstate->wfunc;
+ WindowStatePerFunc perfuncstate;
+ AclResult aclresult;
+ int i;
+
+ if (wfunc->winref != node->winref) /* planner screwed up? */
+ elog(ERROR, "WindowFunc with winref %u assigned to WindowAgg with winref %u",
+ wfunc->winref, node->winref);
+
+ /* Look for a previous duplicate window function */
+ for (i = 0; i <= wfuncno; i++)
+ {
+ if (equal(wfunc, perfunc[i].wfunc) &&
+ !contain_volatile_functions((Node *) wfunc))
+ break;
+ }
+ if (i <= wfuncno)
+ {
+ /* Found a match to an existing entry, so just mark it */
+ wfuncstate->wfuncno = i;
+ continue;
+ }
+
+		/* Nope, so assign a new perfunc record */
+ perfuncstate = &perfunc[++wfuncno];
+
+ /* Mark WindowFunc state node with assigned index in the result array */
+ wfuncstate->wfuncno = wfuncno;
+
+ /* Check permission to call window function */
+ aclresult = pg_proc_aclcheck(wfunc->winfnoid, GetUserId(),
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(wfunc->winfnoid));
+ InvokeFunctionExecuteHook(wfunc->winfnoid);
+
+ /* Fill in the perfuncstate data */
+ perfuncstate->wfuncstate = wfuncstate;
+ perfuncstate->wfunc = wfunc;
+ perfuncstate->numArguments = list_length(wfuncstate->args);
+ perfuncstate->winCollation = wfunc->inputcollid;
+
+ get_typlenbyval(wfunc->wintype,
+ &perfuncstate->resulttypeLen,
+ &perfuncstate->resulttypeByVal);
+
+ /*
+ * If it's really just a plain aggregate function, we'll emulate the
+ * Agg environment for it.
+ */
+ perfuncstate->plain_agg = wfunc->winagg;
+ if (wfunc->winagg)
+ {
+ WindowStatePerAgg peraggstate;
+
+ perfuncstate->aggno = ++aggno;
+ peraggstate = &winstate->peragg[aggno];
+ initialize_peragg(winstate, wfunc, peraggstate);
+ peraggstate->wfuncno = wfuncno;
+ }
+ else
+ {
+ WindowObject winobj = makeNode(WindowObjectData);
+
+ winobj->winstate = winstate;
+ winobj->argstates = wfuncstate->args;
+ winobj->localmem = NULL;
+ perfuncstate->winobj = winobj;
+
+ /* It's a real window function, so set up to call it. */
+ fmgr_info_cxt(wfunc->winfnoid, &perfuncstate->flinfo,
+ econtext->ecxt_per_query_memory);
+ fmgr_info_set_expr((Node *) wfunc, &perfuncstate->flinfo);
+ }
+ }
+
+ /* Update numfuncs, numaggs to match number of unique functions found */
+ winstate->numfuncs = wfuncno + 1;
+ winstate->numaggs = aggno + 1;
+
+ /* Set up WindowObject for aggregates, if needed */
+ if (winstate->numaggs > 0)
+ {
+ WindowObject agg_winobj = makeNode(WindowObjectData);
+
+ agg_winobj->winstate = winstate;
+ agg_winobj->argstates = NIL;
+ agg_winobj->localmem = NULL;
+		/* make markptr and readptr -1 to mark them invalid; they may not get used */
+ agg_winobj->markptr = -1;
+ agg_winobj->readptr = -1;
+ winstate->agg_winobj = agg_winobj;
+ }
+
+ /* copy frame options to state node for easy access */
+ winstate->frameOptions = frameOptions;
+
+ /* initialize frame bound offset expressions */
+ winstate->startOffset = ExecInitExpr((Expr *) node->startOffset,
+ (PlanState *) winstate);
+ winstate->endOffset = ExecInitExpr((Expr *) node->endOffset,
+ (PlanState *) winstate);
+
+ /* Lookup in_range support functions if needed */
+ if (OidIsValid(node->startInRangeFunc))
+ fmgr_info(node->startInRangeFunc, &winstate->startInRangeFunc);
+ if (OidIsValid(node->endInRangeFunc))
+ fmgr_info(node->endInRangeFunc, &winstate->endInRangeFunc);
+ winstate->inRangeColl = node->inRangeColl;
+ winstate->inRangeAsc = node->inRangeAsc;
+ winstate->inRangeNullsFirst = node->inRangeNullsFirst;
+
+ winstate->all_first = true;
+ winstate->partition_spooled = false;
+ winstate->more_partitions = false;
+
+ return winstate;
+}
+
+/* -----------------
+ * ExecEndWindowAgg
+ * -----------------
+ */
+void
+ExecEndWindowAgg(WindowAggState *node)
+{
+ PlanState *outerPlan;
+ int i;
+
+ release_partition(node);
+
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+ ExecClearTuple(node->first_part_slot);
+ ExecClearTuple(node->agg_row_slot);
+ ExecClearTuple(node->temp_slot_1);
+ ExecClearTuple(node->temp_slot_2);
+ if (node->framehead_slot)
+ ExecClearTuple(node->framehead_slot);
+ if (node->frametail_slot)
+ ExecClearTuple(node->frametail_slot);
+
+ /*
+ * Free both the expr contexts.
+ */
+ ExecFreeExprContext(&node->ss.ps);
+ node->ss.ps.ps_ExprContext = node->tmpcontext;
+ ExecFreeExprContext(&node->ss.ps);
+
+ for (i = 0; i < node->numaggs; i++)
+ {
+ if (node->peragg[i].aggcontext != node->aggcontext)
+ MemoryContextDelete(node->peragg[i].aggcontext);
+ }
+ MemoryContextDelete(node->partcontext);
+ MemoryContextDelete(node->aggcontext);
+
+ pfree(node->perfunc);
+ pfree(node->peragg);
+
+ outerPlan = outerPlanState(node);
+ ExecEndNode(outerPlan);
+}
+
+/* -----------------
+ * ExecReScanWindowAgg
+ * -----------------
+ */
+void
+ExecReScanWindowAgg(WindowAggState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+ ExprContext *econtext = node->ss.ps.ps_ExprContext;
+
+ node->all_done = false;
+ node->all_first = true;
+
+ /* release tuplestore et al */
+ release_partition(node);
+
+ /* release all temp tuples, but especially first_part_slot */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+ ExecClearTuple(node->first_part_slot);
+ ExecClearTuple(node->agg_row_slot);
+ ExecClearTuple(node->temp_slot_1);
+ ExecClearTuple(node->temp_slot_2);
+ if (node->framehead_slot)
+ ExecClearTuple(node->framehead_slot);
+ if (node->frametail_slot)
+ ExecClearTuple(node->frametail_slot);
+
+ /* Forget current wfunc values */
+ MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numfuncs);
+ MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numfuncs);
+
+ /*
+	 * If chgParam of the subnode is not null, then the plan will be re-scanned
+	 * by the first ExecProcNode call.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
+
+/*
+ * initialize_peragg
+ *
+ * Almost the same as in nodeAgg.c, except we don't currently support DISTINCT.
+ */
+static WindowStatePerAggData *
+initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc,
+ WindowStatePerAgg peraggstate)
+{
+ Oid inputTypes[FUNC_MAX_ARGS];
+ int numArguments;
+ HeapTuple aggTuple;
+ Form_pg_aggregate aggform;
+ Oid aggtranstype;
+ AttrNumber initvalAttNo;
+ AclResult aclresult;
+ bool use_ma_code;
+ Oid transfn_oid,
+ invtransfn_oid,
+ finalfn_oid;
+ bool finalextra;
+ char finalmodify;
+ Expr *transfnexpr,
+ *invtransfnexpr,
+ *finalfnexpr;
+ Datum textInitVal;
+ int i;
+ ListCell *lc;
+
+ numArguments = list_length(wfunc->args);
+
+ i = 0;
+ foreach(lc, wfunc->args)
+ {
+ inputTypes[i++] = exprType((Node *) lfirst(lc));
+ }
+
+ aggTuple = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(wfunc->winfnoid));
+ if (!HeapTupleIsValid(aggTuple))
+ elog(ERROR, "cache lookup failed for aggregate %u",
+ wfunc->winfnoid);
+ aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
+
+ /*
+ * Figure out whether we want to use the moving-aggregate implementation,
+	 * and collect the right set of fields from the pg_aggregate entry.
+ *
+ * It's possible that an aggregate would supply a safe moving-aggregate
+ * implementation and an unsafe normal one, in which case our hand is
+ * forced. Otherwise, if the frame head can't move, we don't need
+ * moving-aggregate code. Even if we'd like to use it, don't do so if the
+ * aggregate's arguments (and FILTER clause if any) contain any calls to
+ * volatile functions. Otherwise, the difference between restarting and
+ * not restarting the aggregation would be user-visible.
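+	 *
+	 * For example, under ROWS BETWEEN 2 PRECEDING AND CURRENT ROW the frame
+	 * head advances with every row, so an aggregate that provides an inverse
+	 * transition function can "subtract" departing rows instead of restarting
+	 * from scratch; under ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW the
+	 * head never moves and the plain implementation normally suffices.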
+ */
+ if (!OidIsValid(aggform->aggminvtransfn))
+ use_ma_code = false; /* sine qua non */
+ else if (aggform->aggmfinalmodify == AGGMODIFY_READ_ONLY &&
+ aggform->aggfinalmodify != AGGMODIFY_READ_ONLY)
+ use_ma_code = true; /* decision forced by safety */
+ else if (winstate->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING)
+ use_ma_code = false; /* non-moving frame head */
+ else if (contain_volatile_functions((Node *) wfunc))
+ use_ma_code = false; /* avoid possible behavioral change */
+ else
+ use_ma_code = true; /* yes, let's use it */
+ if (use_ma_code)
+ {
+ peraggstate->transfn_oid = transfn_oid = aggform->aggmtransfn;
+ peraggstate->invtransfn_oid = invtransfn_oid = aggform->aggminvtransfn;
+ peraggstate->finalfn_oid = finalfn_oid = aggform->aggmfinalfn;
+ finalextra = aggform->aggmfinalextra;
+ finalmodify = aggform->aggmfinalmodify;
+ aggtranstype = aggform->aggmtranstype;
+ initvalAttNo = Anum_pg_aggregate_aggminitval;
+ }
+ else
+ {
+ peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn;
+ peraggstate->invtransfn_oid = invtransfn_oid = InvalidOid;
+ peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
+ finalextra = aggform->aggfinalextra;
+ finalmodify = aggform->aggfinalmodify;
+ aggtranstype = aggform->aggtranstype;
+ initvalAttNo = Anum_pg_aggregate_agginitval;
+ }
+
+ /*
+ * ExecInitWindowAgg already checked permission to call aggregate function
+ * ... but we still need to check the component functions
+ */
+
+ /* Check that aggregate owner has permission to call component fns */
+ {
+ HeapTuple procTuple;
+ Oid aggOwner;
+
+ procTuple = SearchSysCache1(PROCOID,
+ ObjectIdGetDatum(wfunc->winfnoid));
+ if (!HeapTupleIsValid(procTuple))
+ elog(ERROR, "cache lookup failed for function %u",
+ wfunc->winfnoid);
+ aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
+ ReleaseSysCache(procTuple);
+
+ aclresult = pg_proc_aclcheck(transfn_oid, aggOwner,
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(transfn_oid));
+ InvokeFunctionExecuteHook(transfn_oid);
+
+ if (OidIsValid(invtransfn_oid))
+ {
+ aclresult = pg_proc_aclcheck(invtransfn_oid, aggOwner,
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(invtransfn_oid));
+ InvokeFunctionExecuteHook(invtransfn_oid);
+ }
+
+ if (OidIsValid(finalfn_oid))
+ {
+ aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner,
+ ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ get_func_name(finalfn_oid));
+ InvokeFunctionExecuteHook(finalfn_oid);
+ }
+ }
+
+ /*
+ * If the selected finalfn isn't read-only, we can't run this aggregate as
+ * a window function. This is a user-facing error, so we take a bit more
+ * care with the error message than elsewhere in this function.
+ */
+ if (finalmodify != AGGMODIFY_READ_ONLY)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("aggregate function %s does not support use as a window function",
+ format_procedure(wfunc->winfnoid))));
+
+ /* Detect how many arguments to pass to the finalfn */
+ if (finalextra)
+ peraggstate->numFinalArgs = numArguments + 1;
+ else
+ peraggstate->numFinalArgs = 1;
+
+ /* resolve actual type of transition state, if polymorphic */
+ aggtranstype = resolve_aggregate_transtype(wfunc->winfnoid,
+ aggtranstype,
+ inputTypes,
+ numArguments);
+
+ /* build expression trees using actual argument & result types */
+ build_aggregate_transfn_expr(inputTypes,
+ numArguments,
+ 0, /* no ordered-set window functions yet */
+ false, /* no variadic window functions yet */
+ aggtranstype,
+ wfunc->inputcollid,
+ transfn_oid,
+ invtransfn_oid,
+ &transfnexpr,
+ &invtransfnexpr);
+
+ /* set up infrastructure for calling the transfn(s) and finalfn */
+ fmgr_info(transfn_oid, &peraggstate->transfn);
+ fmgr_info_set_expr((Node *) transfnexpr, &peraggstate->transfn);
+
+ if (OidIsValid(invtransfn_oid))
+ {
+ fmgr_info(invtransfn_oid, &peraggstate->invtransfn);
+ fmgr_info_set_expr((Node *) invtransfnexpr, &peraggstate->invtransfn);
+ }
+
+ if (OidIsValid(finalfn_oid))
+ {
+ build_aggregate_finalfn_expr(inputTypes,
+ peraggstate->numFinalArgs,
+ aggtranstype,
+ wfunc->wintype,
+ wfunc->inputcollid,
+ finalfn_oid,
+ &finalfnexpr);
+ fmgr_info(finalfn_oid, &peraggstate->finalfn);
+ fmgr_info_set_expr((Node *) finalfnexpr, &peraggstate->finalfn);
+ }
+
+ /* get info about relevant datatypes */
+ get_typlenbyval(wfunc->wintype,
+ &peraggstate->resulttypeLen,
+ &peraggstate->resulttypeByVal);
+ get_typlenbyval(aggtranstype,
+ &peraggstate->transtypeLen,
+ &peraggstate->transtypeByVal);
+
+ /*
+ * initval is potentially null, so don't try to access it as a struct
+ * field. Must do it the hard way with SysCacheGetAttr.
+ */
+ textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, initvalAttNo,
+ &peraggstate->initValueIsNull);
+
+ if (peraggstate->initValueIsNull)
+ peraggstate->initValue = (Datum) 0;
+ else
+ peraggstate->initValue = GetAggInitVal(textInitVal,
+ aggtranstype);
+
+ /*
+ * If the transfn is strict and the initval is NULL, make sure input type
+ * and transtype are the same (or at least binary-compatible), so that
+ * it's OK to use the first input value as the initial transValue. This
+ * should have been checked at agg definition time, but we must check
+ * again in case the transfn's strictness property has been changed.
+ */
+ if (peraggstate->transfn.fn_strict && peraggstate->initValueIsNull)
+ {
+ if (numArguments < 1 ||
+ !IsBinaryCoercible(inputTypes[0], aggtranstype))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("aggregate %u needs to have compatible input type and transition type",
+ wfunc->winfnoid)));
+ }
+
+ /*
+ * Insist that forward and inverse transition functions have the same
+ * strictness setting. Allowing them to differ would require handling
+ * more special cases in advance_windowaggregate and
+ * advance_windowaggregate_base, for no discernible benefit. This should
+ * have been checked at agg definition time, but we must check again in
+ * case either function's strictness property has been changed.
+ */
+ if (OidIsValid(invtransfn_oid) &&
+ peraggstate->transfn.fn_strict != peraggstate->invtransfn.fn_strict)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("strictness of aggregate's forward and inverse transition functions must match")));
+
+ /*
+ * Moving aggregates use their own aggcontext.
+ *
+ * This is necessary because they might restart at different times, so we
+ * might never be able to reset the shared context otherwise. We can't
+ * make it the aggregates' responsibility to clean up after themselves,
+ * because strict aggregates must be restarted whenever we remove their
+ * last non-NULL input, which the aggregate won't be aware is happening.
+ * Also, just pfree()ing the transValue upon restarting wouldn't help,
+ * since we'd miss any indirectly referenced data. We could, in theory,
+ * make the memory allocation rules for moving aggregates different than
+ * they have historically been for plain aggregates, but that seems grotty
+ * and likely to lead to memory leaks.
+ */
+ if (OidIsValid(invtransfn_oid))
+ peraggstate->aggcontext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "WindowAgg Per Aggregate",
+ ALLOCSET_DEFAULT_SIZES);
+ else
+ peraggstate->aggcontext = winstate->aggcontext;
+
+ ReleaseSysCache(aggTuple);
+
+ return peraggstate;
+}
+
+static Datum
+GetAggInitVal(Datum textInitVal, Oid transtype)
+{
+ Oid typinput,
+ typioparam;
+ char *strInitVal;
+ Datum initVal;
+
+ getTypeInputInfo(transtype, &typinput, &typioparam);
+ strInitVal = TextDatumGetCString(textInitVal);
+ initVal = OidInputFunctionCall(typinput, strInitVal,
+ typioparam, -1);
+ pfree(strInitVal);
+ return initVal;
+}
+
+/*
+ * are_peers
+ * compare two rows to see if they are equal according to the ORDER BY clause
+ *
+ * NB: this does not consider the window frame mode.
+ */
+static bool
+are_peers(WindowAggState *winstate, TupleTableSlot *slot1,
+ TupleTableSlot *slot2)
+{
+ WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan;
+ ExprContext *econtext = winstate->tmpcontext;
+
+ /* If no ORDER BY, all rows are peers with each other */
+ if (node->ordNumCols == 0)
+ return true;
+
+ econtext->ecxt_outertuple = slot1;
+ econtext->ecxt_innertuple = slot2;
+ return ExecQualAndReset(winstate->ordEqfunction, econtext);
+}
+
+/*
+ * window_gettupleslot
+ * Fetch the pos'th tuple of the current partition into the slot,
+ * using the winobj's read pointer
+ *
+ * Returns true if successful, false if no such row
+ */
+static bool
+window_gettupleslot(WindowObject winobj, int64 pos, TupleTableSlot *slot)
+{
+ WindowAggState *winstate = winobj->winstate;
+ MemoryContext oldcontext;
+
+ /* often called repeatedly in a row */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Don't allow passing -1 to spool_tuples here */
+ if (pos < 0)
+ return false;
+
+ /* If necessary, fetch the tuple into the spool */
+ spool_tuples(winstate, pos);
+
+ if (pos >= winstate->spooled_rows)
+ return false;
+
+ if (pos < winobj->markpos)
+ elog(ERROR, "cannot fetch row before WindowObject's mark position");
+
+ oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory);
+
+ tuplestore_select_read_pointer(winstate->buffer, winobj->readptr);
+
+ /*
+ * Advance or rewind until we are within one tuple of the one we want.
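+	 * (For example, if seekpos is 5 and the requested pos is 9, we skip three
+	 * tuples forward to position 8 here and fetch one more tuple below.)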
+ */
+ if (winobj->seekpos < pos - 1)
+ {
+ if (!tuplestore_skiptuples(winstate->buffer,
+ pos - 1 - winobj->seekpos,
+ true))
+ elog(ERROR, "unexpected end of tuplestore");
+ winobj->seekpos = pos - 1;
+ }
+ else if (winobj->seekpos > pos + 1)
+ {
+ if (!tuplestore_skiptuples(winstate->buffer,
+ winobj->seekpos - (pos + 1),
+ false))
+ elog(ERROR, "unexpected end of tuplestore");
+ winobj->seekpos = pos + 1;
+ }
+ else if (winobj->seekpos == pos)
+ {
+ /*
+ * There's no API to refetch the tuple at the current position. We
+ * have to move one tuple forward, and then one backward. (We don't
+ * do it the other way because we might try to fetch the row before
+ * our mark, which isn't allowed.) XXX this case could stand to be
+ * optimized.
+ */
+ tuplestore_advance(winstate->buffer, true);
+ winobj->seekpos++;
+ }
+
+ /*
+ * Now we should be on the tuple immediately before or after the one we
+ * want, so just fetch forwards or backwards as appropriate.
+ */
+ if (winobj->seekpos > pos)
+ {
+ if (!tuplestore_gettupleslot(winstate->buffer, false, true, slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ winobj->seekpos--;
+ }
+ else
+ {
+ if (!tuplestore_gettupleslot(winstate->buffer, true, true, slot))
+ elog(ERROR, "unexpected end of tuplestore");
+ winobj->seekpos++;
+ }
+
+ Assert(winobj->seekpos == pos);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return true;
+}
+
+
+/***********************************************************************
+ * API exposed to window functions
+ ***********************************************************************/
+
+
+/*
+ * WinGetPartitionLocalMemory
+ * Get working memory that lives till end of partition processing
+ *
+ * On first call within a given partition, this allocates and zeroes the
+ * requested amount of space. Subsequent calls just return the same chunk.
+ *
+ * Memory obtained this way is normally used to hold state that should be
+ * automatically reset for each new partition. If a window function wants
+ * to hold state across the whole query, fcinfo->fn_extra can be used in the
+ * usual way for that.
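+ *
+ * As a rough sketch (hypothetical function name, error handling omitted,
+ * and the SQL-level CREATE FUNCTION ... WINDOW declaration not shown), a
+ * window function could keep a per-partition running count this way; a
+ * fresh zeroed chunk is handed out for each new partition:
+ *
+ *    Datum
+ *    my_running_count(PG_FUNCTION_ARGS)
+ *    {
+ *        WindowObject winobj = PG_WINDOW_OBJECT();
+ *        int64      *count;
+ *
+ *        count = (int64 *) WinGetPartitionLocalMemory(winobj, sizeof(int64));
+ *        PG_RETURN_INT64(++(*count));
+ *    }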
+ */
+void *
+WinGetPartitionLocalMemory(WindowObject winobj, Size sz)
+{
+ Assert(WindowObjectIsValid(winobj));
+ if (winobj->localmem == NULL)
+ winobj->localmem =
+ MemoryContextAllocZero(winobj->winstate->partcontext, sz);
+ return winobj->localmem;
+}
+
+/*
+ * WinGetCurrentPosition
+ * Return the current row's position (counting from 0) within the current
+ * partition.
+ */
+int64
+WinGetCurrentPosition(WindowObject winobj)
+{
+ Assert(WindowObjectIsValid(winobj));
+ return winobj->winstate->currentpos;
+}
+
+/*
+ * WinGetPartitionRowCount
+ * Return total number of rows contained in the current partition.
+ *
+ * Note: this is a relatively expensive operation because it forces the
+ * whole partition to be "spooled" into the tuplestore at once. Once
+ * executed, however, additional calls within the same partition are cheap.
+ */
+int64
+WinGetPartitionRowCount(WindowObject winobj)
+{
+ Assert(WindowObjectIsValid(winobj));
+ spool_tuples(winobj->winstate, -1);
+ return winobj->winstate->spooled_rows;
+}
+
+/*
+ * WinSetMarkPosition
+ * Set the "mark" position for the window object, which is the oldest row
+ * number (counting from 0) it is allowed to fetch during all subsequent
+ * operations within the current partition.
+ *
+ * Window functions do not have to call this, but are encouraged to move the
+ * mark forward when possible to keep the tuplestore size down and prevent
+ * having to spill rows to disk.
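+ *
+ * For example, a window function that never needs to revisit rows before
+ * the current one might simply call
+ *
+ *    WinSetMarkPosition(winobj, WinGetCurrentPosition(winobj));
+ *
+ * on each call, allowing rows before that point to be trimmed.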
+ */
+void
+WinSetMarkPosition(WindowObject winobj, int64 markpos)
+{
+ WindowAggState *winstate;
+
+ Assert(WindowObjectIsValid(winobj));
+ winstate = winobj->winstate;
+
+ if (markpos < winobj->markpos)
+ elog(ERROR, "cannot move WindowObject's mark position backward");
+ tuplestore_select_read_pointer(winstate->buffer, winobj->markptr);
+ if (markpos > winobj->markpos)
+ {
+ tuplestore_skiptuples(winstate->buffer,
+ markpos - winobj->markpos,
+ true);
+ winobj->markpos = markpos;
+ }
+ tuplestore_select_read_pointer(winstate->buffer, winobj->readptr);
+ if (markpos > winobj->seekpos)
+ {
+ tuplestore_skiptuples(winstate->buffer,
+ markpos - winobj->seekpos,
+ true);
+ winobj->seekpos = markpos;
+ }
+}
+
+/*
+ * WinRowsArePeers
+ * Compare two rows (specified by absolute position in partition) to see
+ * if they are equal according to the ORDER BY clause.
+ *
+ * NB: this does not consider the window frame mode.
+ */
+bool
+WinRowsArePeers(WindowObject winobj, int64 pos1, int64 pos2)
+{
+ WindowAggState *winstate;
+ WindowAgg *node;
+ TupleTableSlot *slot1;
+ TupleTableSlot *slot2;
+ bool res;
+
+ Assert(WindowObjectIsValid(winobj));
+ winstate = winobj->winstate;
+ node = (WindowAgg *) winstate->ss.ps.plan;
+
+ /* If no ORDER BY, all rows are peers; don't bother to fetch them */
+ if (node->ordNumCols == 0)
+ return true;
+
+ /*
+ * Note: OK to use temp_slot_2 here because we aren't calling any
+ * frame-related functions (those tend to clobber temp_slot_2).
+ */
+ slot1 = winstate->temp_slot_1;
+ slot2 = winstate->temp_slot_2;
+
+ if (!window_gettupleslot(winobj, pos1, slot1))
+ elog(ERROR, "specified position is out of window: " INT64_FORMAT,
+ pos1);
+ if (!window_gettupleslot(winobj, pos2, slot2))
+ elog(ERROR, "specified position is out of window: " INT64_FORMAT,
+ pos2);
+
+ res = are_peers(winstate, slot1, slot2);
+
+ ExecClearTuple(slot1);
+ ExecClearTuple(slot2);
+
+ return res;
+}
+
+/*
+ * WinGetFuncArgInPartition
+ * Evaluate a window function's argument expression on a specified
+ * row of the partition. The row is identified in lseek(2) style,
+ * i.e. relative to the current, first, or last row.
+ *
+ * argno: argument number to evaluate (counted from 0)
+ * relpos: signed rowcount offset from the seek position
+ * seektype: WINDOW_SEEK_CURRENT, WINDOW_SEEK_HEAD, or WINDOW_SEEK_TAIL
+ * set_mark: If the row is found and set_mark is true, the mark is moved to
+ * the row as a side-effect.
+ * isnull: output argument, receives isnull status of result
+ * isout: output argument, set to indicate whether target row position
+ * is out of partition (can pass NULL if caller doesn't care about this)
+ *
+ * Specifying a nonexistent row is not an error, it just causes a null result
+ * (plus setting *isout true, if isout isn't NULL).
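+ *
+ * For illustration, a lag-like function (a hypothetical example, not any
+ * particular built-in) could fetch its argument from the row just before
+ * the current one with
+ *
+ *    value = WinGetFuncArgInPartition(winobj, 0, -1, WINDOW_SEEK_CURRENT,
+ *                                     true, &isnull, &isout);
+ *
+ * which yields a null result (and *isout = true) when the current row is
+ * the first row of the partition.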
+ */
+Datum
+WinGetFuncArgInPartition(WindowObject winobj, int argno,
+ int relpos, int seektype, bool set_mark,
+ bool *isnull, bool *isout)
+{
+ WindowAggState *winstate;
+ ExprContext *econtext;
+ TupleTableSlot *slot;
+ bool gottuple;
+ int64 abs_pos;
+
+ Assert(WindowObjectIsValid(winobj));
+ winstate = winobj->winstate;
+ econtext = winstate->ss.ps.ps_ExprContext;
+ slot = winstate->temp_slot_1;
+
+ switch (seektype)
+ {
+ case WINDOW_SEEK_CURRENT:
+ abs_pos = winstate->currentpos + relpos;
+ break;
+ case WINDOW_SEEK_HEAD:
+ abs_pos = relpos;
+ break;
+ case WINDOW_SEEK_TAIL:
+ spool_tuples(winstate, -1);
+ abs_pos = winstate->spooled_rows - 1 + relpos;
+ break;
+ default:
+ elog(ERROR, "unrecognized window seek type: %d", seektype);
+ abs_pos = 0; /* keep compiler quiet */
+ break;
+ }
+
+ gottuple = window_gettupleslot(winobj, abs_pos, slot);
+
+ if (!gottuple)
+ {
+ if (isout)
+ *isout = true;
+ *isnull = true;
+ return (Datum) 0;
+ }
+ else
+ {
+ if (isout)
+ *isout = false;
+ if (set_mark)
+ WinSetMarkPosition(winobj, abs_pos);
+ econtext->ecxt_outertuple = slot;
+ return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno),
+ econtext, isnull);
+ }
+}
+
+/*
+ * WinGetFuncArgInFrame
+ * Evaluate a window function's argument expression on a specified
+ * row of the window frame. The row is identified in lseek(2) style,
+ * i.e. relative to the first or last row of the frame. (We do not
+ * support WINDOW_SEEK_CURRENT here, because it's not very clear what
+ * that should mean if the current row isn't part of the frame.)
+ *
+ * argno: argument number to evaluate (counted from 0)
+ * relpos: signed rowcount offset from the seek position
+ * seektype: WINDOW_SEEK_HEAD or WINDOW_SEEK_TAIL
+ * set_mark: If the row is found/in frame and set_mark is true, the mark is
+ * moved to the row as a side-effect.
+ * isnull: output argument, receives isnull status of result
+ * isout: output argument, set to indicate whether target row position
+ * is out of frame (can pass NULL if caller doesn't care about this)
+ *
+ * Specifying a nonexistent or not-in-frame row is not an error, it just
+ * causes a null result (plus setting *isout true, if isout isn't NULL).
+ *
+ * Note that some exclusion-clause options lead to situations where the
+ * rows that are in-frame are not consecutive in the partition. But we
+ * count only in-frame rows when measuring relpos.
+ *
+ * The set_mark flag is interpreted as meaning that the caller will specify
+ * a constant (or, perhaps, monotonically increasing) relpos in successive
+ * calls, so that *if there is no exclusion clause* there will be no need
+ * to fetch a row before the previously fetched row. But we do not expect
+ * the caller to know how to account for exclusion clauses. Therefore,
+ * if there is an exclusion clause we take responsibility for adjusting the
+ * mark request to something that will be safe given the above assumption
+ * about relpos.
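+ *
+ * For illustration, a first_value-style function (hypothetical example)
+ * would evaluate its argument at the first row of the frame with
+ *
+ *    value = WinGetFuncArgInFrame(winobj, 0, 0, WINDOW_SEEK_HEAD,
+ *                                 true, &isnull, NULL);
+ *
+ * while a last_value-style function would pass WINDOW_SEEK_TAIL instead.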
+ */
+Datum
+WinGetFuncArgInFrame(WindowObject winobj, int argno,
+ int relpos, int seektype, bool set_mark,
+ bool *isnull, bool *isout)
+{
+ WindowAggState *winstate;
+ ExprContext *econtext;
+ TupleTableSlot *slot;
+ int64 abs_pos;
+ int64 mark_pos;
+
+ Assert(WindowObjectIsValid(winobj));
+ winstate = winobj->winstate;
+ econtext = winstate->ss.ps.ps_ExprContext;
+ slot = winstate->temp_slot_1;
+
+ switch (seektype)
+ {
+ case WINDOW_SEEK_CURRENT:
+ elog(ERROR, "WINDOW_SEEK_CURRENT is not supported for WinGetFuncArgInFrame");
+ abs_pos = mark_pos = 0; /* keep compiler quiet */
+ break;
+ case WINDOW_SEEK_HEAD:
+ /* rejecting relpos < 0 is easy and simplifies code below */
+ if (relpos < 0)
+ goto out_of_frame;
+ update_frameheadpos(winstate);
+ abs_pos = winstate->frameheadpos + relpos;
+ mark_pos = abs_pos;
+
+ /*
+ * Account for exclusion option if one is active, but advance only
+ * abs_pos not mark_pos. This prevents changes of the current
+ * row's peer group from resulting in trying to fetch a row before
+ * some previous mark position.
+ *
+ * Note that in some corner cases such as current row being
+ * outside frame, these calculations are theoretically too simple,
+ * but it doesn't matter because we'll end up deciding the row is
+ * out of frame. We do not attempt to avoid fetching rows past
+ * end of frame; that would happen in some cases anyway.
+ */
+ switch (winstate->frameOptions & FRAMEOPTION_EXCLUSION)
+ {
+ case 0:
+ /* no adjustment needed */
+ break;
+ case FRAMEOPTION_EXCLUDE_CURRENT_ROW:
+ if (abs_pos >= winstate->currentpos &&
+ winstate->currentpos >= winstate->frameheadpos)
+ abs_pos++;
+ break;
+ case FRAMEOPTION_EXCLUDE_GROUP:
+ update_grouptailpos(winstate);
+ if (abs_pos >= winstate->groupheadpos &&
+ winstate->grouptailpos > winstate->frameheadpos)
+ {
+ int64 overlapstart = Max(winstate->groupheadpos,
+ winstate->frameheadpos);
+
+ abs_pos += winstate->grouptailpos - overlapstart;
+ }
+ break;
+ case FRAMEOPTION_EXCLUDE_TIES:
+ update_grouptailpos(winstate);
+ if (abs_pos >= winstate->groupheadpos &&
+ winstate->grouptailpos > winstate->frameheadpos)
+ {
+ int64 overlapstart = Max(winstate->groupheadpos,
+ winstate->frameheadpos);
+
+ if (abs_pos == overlapstart)
+ abs_pos = winstate->currentpos;
+ else
+ abs_pos += winstate->grouptailpos - overlapstart - 1;
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized frame option state: 0x%x",
+ winstate->frameOptions);
+ break;
+ }
+ break;
+ case WINDOW_SEEK_TAIL:
+ /* rejecting relpos > 0 is easy and simplifies code below */
+ if (relpos > 0)
+ goto out_of_frame;
+ update_frametailpos(winstate);
+ abs_pos = winstate->frametailpos - 1 + relpos;
+
+ /*
+ * Account for exclusion option if one is active. If there is no
+ * exclusion, we can safely set the mark at the accessed row. But
+ * if there is, we can only mark the frame start, because we can't
+ * be sure how far back in the frame the exclusion might cause us
+ * to fetch in future. Furthermore, we have to actually check
+ * against frameheadpos here, since it's unsafe to try to fetch a
+ * row before frame start if the mark might be there already.
+ */
+ switch (winstate->frameOptions & FRAMEOPTION_EXCLUSION)
+ {
+ case 0:
+ /* no adjustment needed */
+ mark_pos = abs_pos;
+ break;
+ case FRAMEOPTION_EXCLUDE_CURRENT_ROW:
+ if (abs_pos <= winstate->currentpos &&
+ winstate->currentpos < winstate->frametailpos)
+ abs_pos--;
+ update_frameheadpos(winstate);
+ if (abs_pos < winstate->frameheadpos)
+ goto out_of_frame;
+ mark_pos = winstate->frameheadpos;
+ break;
+ case FRAMEOPTION_EXCLUDE_GROUP:
+ update_grouptailpos(winstate);
+ if (abs_pos < winstate->grouptailpos &&
+ winstate->groupheadpos < winstate->frametailpos)
+ {
+ int64 overlapend = Min(winstate->grouptailpos,
+ winstate->frametailpos);
+
+ abs_pos -= overlapend - winstate->groupheadpos;
+ }
+ update_frameheadpos(winstate);
+ if (abs_pos < winstate->frameheadpos)
+ goto out_of_frame;
+ mark_pos = winstate->frameheadpos;
+ break;
+ case FRAMEOPTION_EXCLUDE_TIES:
+ update_grouptailpos(winstate);
+ if (abs_pos < winstate->grouptailpos &&
+ winstate->groupheadpos < winstate->frametailpos)
+ {
+ int64 overlapend = Min(winstate->grouptailpos,
+ winstate->frametailpos);
+
+ if (abs_pos == overlapend - 1)
+ abs_pos = winstate->currentpos;
+ else
+ abs_pos -= overlapend - 1 - winstate->groupheadpos;
+ }
+ update_frameheadpos(winstate);
+ if (abs_pos < winstate->frameheadpos)
+ goto out_of_frame;
+ mark_pos = winstate->frameheadpos;
+ break;
+ default:
+ elog(ERROR, "unrecognized frame option state: 0x%x",
+ winstate->frameOptions);
+ mark_pos = 0; /* keep compiler quiet */
+ break;
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized window seek type: %d", seektype);
+ abs_pos = mark_pos = 0; /* keep compiler quiet */
+ break;
+ }
+
+ if (!window_gettupleslot(winobj, abs_pos, slot))
+ goto out_of_frame;
+
+ /* The code above does not detect all out-of-frame cases, so check */
+ if (row_is_in_frame(winstate, abs_pos, slot) <= 0)
+ goto out_of_frame;
+
+ if (isout)
+ *isout = false;
+ if (set_mark)
+ WinSetMarkPosition(winobj, mark_pos);
+ econtext->ecxt_outertuple = slot;
+ return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno),
+ econtext, isnull);
+
+out_of_frame:
+ if (isout)
+ *isout = true;
+ *isnull = true;
+ return (Datum) 0;
+}
+
+/*
+ * WinGetFuncArgCurrent
+ * Evaluate a window function's argument expression on the current row.
+ *
+ * argno: argument number to evaluate (counted from 0)
+ * isnull: output argument, receives isnull status of result
+ *
+ * Note: this isn't quite equivalent to WinGetFuncArgInPartition or
+ * WinGetFuncArgInFrame targeting the current row, because it will succeed
+ * even if the WindowObject's mark has been set beyond the current row.
+ * This should generally be used for "ordinary" arguments of a window
+ * function, such as the offset argument of lead() or lag().
+ */
+Datum
+WinGetFuncArgCurrent(WindowObject winobj, int argno, bool *isnull)
+{
+ WindowAggState *winstate;
+ ExprContext *econtext;
+
+ Assert(WindowObjectIsValid(winobj));
+ winstate = winobj->winstate;
+
+ econtext = winstate->ss.ps.ps_ExprContext;
+
+ econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot;
+ return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno),
+ econtext, isnull);
+}
diff --git a/src/backend/executor/nodeWorktablescan.c b/src/backend/executor/nodeWorktablescan.c
new file mode 100644
index 0000000..91d3bf3
--- /dev/null
+++ b/src/backend/executor/nodeWorktablescan.c
@@ -0,0 +1,223 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeWorktablescan.c
+ * routines to handle WorkTableScan nodes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeWorktablescan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "executor/nodeWorktablescan.h"
+
+static TupleTableSlot *WorkTableScanNext(WorkTableScanState *node);
+
+/* ----------------------------------------------------------------
+ * WorkTableScanNext
+ *
+ * This is a workhorse for ExecWorkTableScan
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+WorkTableScanNext(WorkTableScanState *node)
+{
+ TupleTableSlot *slot;
+ Tuplestorestate *tuplestorestate;
+
+ /*
+ * get information from the estate and scan state
+ *
+ * Note: we intentionally do not support backward scan. Although it would
+ * take only a couple more lines here, it would force nodeRecursiveunion.c
+ * to create the tuplestore with backward scan enabled, which has a
+ * performance cost. In practice backward scan is never useful for a
+ * worktable plan node, since it cannot appear high enough in the plan
+ * tree of a scrollable cursor to be exposed to a backward-scan
+ * requirement. So it's not worth expending effort to support it.
+ *
+ * Note: we are also assuming that this node is the only reader of the
+ * worktable. Therefore, we don't need a private read pointer for the
+ * tuplestore, nor do we need to tell tuplestore_gettupleslot to copy.
+ */
+ Assert(ScanDirectionIsForward(node->ss.ps.state->es_direction));
+
+ tuplestorestate = node->rustate->working_table;
+
+ /*
+ * Get the next tuple from tuplestore. Return NULL if no more tuples.
+ */
+ slot = node->ss.ss_ScanTupleSlot;
+ (void) tuplestore_gettupleslot(tuplestorestate, true, false, slot);
+ return slot;
+}
+
+/*
+ * WorkTableScanRecheck -- access method routine to recheck a tuple in EvalPlanQual
+ */
+static bool
+WorkTableScanRecheck(WorkTableScanState *node, TupleTableSlot *slot)
+{
+ /* nothing to check */
+ return true;
+}
+
+/* ----------------------------------------------------------------
+ * ExecWorkTableScan(node)
+ *
+ * Scans the worktable sequentially and returns the next qualifying tuple.
+ * We call the ExecScan() routine and pass it the appropriate
+ * access method functions.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecWorkTableScan(PlanState *pstate)
+{
+ WorkTableScanState *node = castNode(WorkTableScanState, pstate);
+
+ /*
+ * On the first call, find the ancestor RecursiveUnion's state via the
+ * Param slot reserved for it. (We can't do this during node init because
+ * there are corner cases where we'll get the init call before the
+ * RecursiveUnion does.)
+ */
+ if (node->rustate == NULL)
+ {
+ WorkTableScan *plan = (WorkTableScan *) node->ss.ps.plan;
+ EState *estate = node->ss.ps.state;
+ ParamExecData *param;
+
+ param = &(estate->es_param_exec_vals[plan->wtParam]);
+ Assert(param->execPlan == NULL);
+ Assert(!param->isnull);
+ node->rustate = castNode(RecursiveUnionState, DatumGetPointer(param->value));
+ Assert(node->rustate);
+
+ /*
+ * The scan tuple type (ie, the rowtype we expect to find in the work
+ * table) is the same as the result rowtype of the ancestor
+ * RecursiveUnion node. Note this depends on the assumption that
+ * RecursiveUnion doesn't allow projection.
+ */
+ ExecAssignScanType(&node->ss,
+ ExecGetResultType(&node->rustate->ps));
+
+ /*
+ * Now we can initialize the projection info. This must be completed
+ * before we can call ExecScan().
+ */
+ ExecAssignScanProjectionInfo(&node->ss);
+ }
+
+ return ExecScan(&node->ss,
+ (ExecScanAccessMtd) WorkTableScanNext,
+ (ExecScanRecheckMtd) WorkTableScanRecheck);
+}
+
+
+/* ----------------------------------------------------------------
+ * ExecInitWorkTableScan
+ * ----------------------------------------------------------------
+ */
+WorkTableScanState *
+ExecInitWorkTableScan(WorkTableScan *node, EState *estate, int eflags)
+{
+ WorkTableScanState *scanstate;
+
+ /* check for unsupported flags */
+ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+ /*
+ * WorkTableScan should not have any children.
+ */
+ Assert(outerPlan(node) == NULL);
+ Assert(innerPlan(node) == NULL);
+
+ /*
+ * create new WorkTableScanState for node
+ */
+ scanstate = makeNode(WorkTableScanState);
+ scanstate->ss.ps.plan = (Plan *) node;
+ scanstate->ss.ps.state = estate;
+ scanstate->ss.ps.ExecProcNode = ExecWorkTableScan;
+ scanstate->rustate = NULL; /* we'll set this later */
+
+ /*
+ * Miscellaneous initialization
+ *
+ * create expression context for node
+ */
+ ExecAssignExprContext(estate, &scanstate->ss.ps);
+
+ /*
+ * tuple table initialization
+ */
+ ExecInitResultTypeTL(&scanstate->ss.ps);
+
+ /* signal that return type is not yet known */
+ scanstate->ss.ps.resultopsset = true;
+ scanstate->ss.ps.resultopsfixed = false;
+
+ ExecInitScanTupleSlot(estate, &scanstate->ss, NULL, &TTSOpsMinimalTuple);
+
+ /*
+ * initialize child expressions
+ */
+ scanstate->ss.ps.qual =
+ ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
+
+ /*
+ * Do not yet initialize projection info, see ExecWorkTableScan() for
+ * details.
+ */
+
+ return scanstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndWorkTableScan
+ *
+ * frees any storage allocated through C routines.
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndWorkTableScan(WorkTableScanState *node)
+{
+ /*
+ * Free exprcontext
+ */
+ ExecFreeExprContext(&node->ss.ps);
+
+ /*
+ * clean out the tuple table
+ */
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecReScanWorkTableScan
+ *
+ * Rescans the relation.
+ * ----------------------------------------------------------------
+ */
+void
+ExecReScanWorkTableScan(WorkTableScanState *node)
+{
+ if (node->ss.ps.ps_ResultTupleSlot)
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ ExecScanReScan(&node->ss);
+
+ /* No need (or way) to rescan if ExecWorkTableScan not called yet */
+ if (node->rustate)
+ tuplestore_rescan(node->rustate->working_table);
+}
diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c
new file mode 100644
index 0000000..f73c1e7
--- /dev/null
+++ b/src/backend/executor/spi.c
@@ -0,0 +1,3383 @@
+/*-------------------------------------------------------------------------
+ *
+ * spi.c
+ * Server Programming Interface
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/executor/spi.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/printtup.h"
+#include "access/sysattr.h"
+#include "access/xact.h"
+#include "catalog/heap.h"
+#include "catalog/pg_type.h"
+#include "commands/trigger.h"
+#include "executor/executor.h"
+#include "executor/spi_priv.h"
+#include "miscadmin.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/typcache.h"
+
+
+/*
+ * These global variables are part of the API for various SPI functions
+ * (a horrible API choice, but it's too late now). To reduce the risk of
+ * interference between different SPI callers, we save and restore them
+ * when entering/exiting a SPI nesting level.
+ */
+uint64 SPI_processed = 0;
+SPITupleTable *SPI_tuptable = NULL;
+int SPI_result = 0;
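+
+/*
+ * A minimal usage sketch (the query text is only illustrative): callers
+ * bracket their work with SPI_connect()/SPI_finish() and examine the
+ * globals above in between:
+ *
+ *		if (SPI_connect() != SPI_OK_CONNECT)
+ *			elog(ERROR, "SPI_connect failed");
+ *		if (SPI_execute("SELECT count(*) FROM pg_class", true, 0) != SPI_OK_SELECT)
+ *			elog(ERROR, "SPI_execute failed");
+ *		(SPI_processed and SPI_tuptable now describe the result)
+ *		SPI_finish();
+ */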
+
+static _SPI_connection *_SPI_stack = NULL;
+static _SPI_connection *_SPI_current = NULL;
+static int _SPI_stack_depth = 0; /* allocated size of _SPI_stack */
+static int _SPI_connected = -1; /* current stack index */
+
+typedef struct SPICallbackArg
+{
+ const char *query;
+ RawParseMode mode;
+} SPICallbackArg;
+
+static Portal SPI_cursor_open_internal(const char *name, SPIPlanPtr plan,
+ ParamListInfo paramLI, bool read_only);
+
+static void _SPI_prepare_plan(const char *src, SPIPlanPtr plan);
+
+static void _SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan);
+
+static int _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options,
+ Snapshot snapshot, Snapshot crosscheck_snapshot,
+ bool fire_triggers);
+
+static ParamListInfo _SPI_convert_params(int nargs, Oid *argtypes,
+ Datum *Values, const char *Nulls);
+
+static int _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount);
+
+static void _SPI_error_callback(void *arg);
+
+static void _SPI_cursor_operation(Portal portal,
+ FetchDirection direction, long count,
+ DestReceiver *dest);
+
+static SPIPlanPtr _SPI_make_plan_non_temp(SPIPlanPtr plan);
+static SPIPlanPtr _SPI_save_plan(SPIPlanPtr plan);
+
+static int _SPI_begin_call(bool use_exec);
+static int _SPI_end_call(bool use_exec);
+static MemoryContext _SPI_execmem(void);
+static MemoryContext _SPI_procmem(void);
+static bool _SPI_checktuples(void);
+
+
+/* =================== interface functions =================== */
+
+int
+SPI_connect(void)
+{
+ return SPI_connect_ext(0);
+}
+
+int
+SPI_connect_ext(int options)
+{
+ int newdepth;
+
+ /* Enlarge stack if necessary */
+ if (_SPI_stack == NULL)
+ {
+ if (_SPI_connected != -1 || _SPI_stack_depth != 0)
+ elog(ERROR, "SPI stack corrupted");
+ newdepth = 16;
+ _SPI_stack = (_SPI_connection *)
+ MemoryContextAlloc(TopMemoryContext,
+ newdepth * sizeof(_SPI_connection));
+ _SPI_stack_depth = newdepth;
+ }
+ else
+ {
+ if (_SPI_stack_depth <= 0 || _SPI_stack_depth <= _SPI_connected)
+ elog(ERROR, "SPI stack corrupted");
+ if (_SPI_stack_depth == _SPI_connected + 1)
+ {
+ newdepth = _SPI_stack_depth * 2;
+ _SPI_stack = (_SPI_connection *)
+ repalloc(_SPI_stack,
+ newdepth * sizeof(_SPI_connection));
+ _SPI_stack_depth = newdepth;
+ }
+ }
+
+ /* Enter new stack level */
+ _SPI_connected++;
+ Assert(_SPI_connected >= 0 && _SPI_connected < _SPI_stack_depth);
+
+ _SPI_current = &(_SPI_stack[_SPI_connected]);
+ _SPI_current->processed = 0;
+ _SPI_current->tuptable = NULL;
+ _SPI_current->execSubid = InvalidSubTransactionId;
+ slist_init(&_SPI_current->tuptables);
+ _SPI_current->procCxt = NULL; /* in case we fail to create 'em */
+ _SPI_current->execCxt = NULL;
+ _SPI_current->connectSubid = GetCurrentSubTransactionId();
+ _SPI_current->queryEnv = NULL;
+ _SPI_current->atomic = (options & SPI_OPT_NONATOMIC ? false : true);
+ _SPI_current->internal_xact = false;
+ _SPI_current->outer_processed = SPI_processed;
+ _SPI_current->outer_tuptable = SPI_tuptable;
+ _SPI_current->outer_result = SPI_result;
+
+ /*
+ * Create memory contexts for this procedure
+ *
+ * In atomic contexts (the normal case), we use TopTransactionContext,
+ * otherwise PortalContext, so that it lives across transaction
+ * boundaries.
+ *
+ * XXX It could be better to use PortalContext as the parent context in
+ * all cases, but we may not be inside a portal (consider deferred-trigger
+ * execution). Perhaps CurTransactionContext could be an option? For now
+ * it doesn't matter because we clean up explicitly in AtEOSubXact_SPI();
+ * but see also AtEOXact_SPI().
+ */
+ _SPI_current->procCxt = AllocSetContextCreate(_SPI_current->atomic ? TopTransactionContext : PortalContext,
+ "SPI Proc",
+ ALLOCSET_DEFAULT_SIZES);
+ _SPI_current->execCxt = AllocSetContextCreate(_SPI_current->atomic ? TopTransactionContext : _SPI_current->procCxt,
+ "SPI Exec",
+ ALLOCSET_DEFAULT_SIZES);
+ /* ... and switch to procedure's context */
+ _SPI_current->savedcxt = MemoryContextSwitchTo(_SPI_current->procCxt);
+
+ /*
+ * Reset API global variables so that current caller cannot accidentally
+ * depend on state of an outer caller.
+ */
+ SPI_processed = 0;
+ SPI_tuptable = NULL;
+ SPI_result = 0;
+
+ return SPI_OK_CONNECT;
+}
+
+int
+SPI_finish(void)
+{
+ int res;
+
+ res = _SPI_begin_call(false); /* just check we're connected */
+ if (res < 0)
+ return res;
+
+ /* Restore memory context as it was before procedure call */
+ MemoryContextSwitchTo(_SPI_current->savedcxt);
+
+ /* Release memory used in procedure call (including tuptables) */
+ MemoryContextDelete(_SPI_current->execCxt);
+ _SPI_current->execCxt = NULL;
+ MemoryContextDelete(_SPI_current->procCxt);
+ _SPI_current->procCxt = NULL;
+
+ /*
+ * Restore outer API variables, especially SPI_tuptable which is probably
+ * pointing at a just-deleted tuptable
+ */
+ SPI_processed = _SPI_current->outer_processed;
+ SPI_tuptable = _SPI_current->outer_tuptable;
+ SPI_result = _SPI_current->outer_result;
+
+ /* Exit stack level */
+ _SPI_connected--;
+ if (_SPI_connected < 0)
+ _SPI_current = NULL;
+ else
+ _SPI_current = &(_SPI_stack[_SPI_connected]);
+
+ return SPI_OK_FINISH;
+}
+
+/*
+ * SPI_start_transaction is a no-op, kept for backwards compatibility.
+ * SPI callers are *always* inside a transaction.
+ */
+void
+SPI_start_transaction(void)
+{
+}
+
+static void
+_SPI_commit(bool chain)
+{
+ MemoryContext oldcontext = CurrentMemoryContext;
+
+ /*
+ * Complain if we are in a context that doesn't permit transaction
+ * termination. (Note: here and _SPI_rollback should be the only places
+ * that throw ERRCODE_INVALID_TRANSACTION_TERMINATION, so that callers can
+ * test for that with confidence that they know what happened.)
+ */
+ if (_SPI_current->atomic)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION),
+ errmsg("invalid transaction termination")));
+
+ /*
+ * This restriction is required by PLs implemented on top of SPI. They
+ * use subtransactions to establish exception blocks that are supposed to
+ * be rolled back together if there is an error. Terminating the
+ * top-level transaction in such a block violates that idea. A future PL
+ * implementation might have different ideas about this, in which case
+ * this restriction would have to be refined or the check possibly be
+ * moved out of SPI into the PLs. Note however that the code below relies
+ * on not being within a subtransaction.
+ */
+ if (IsSubTransaction())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION),
+ errmsg("cannot commit while a subtransaction is active")));
+
+ /* XXX this ain't re-entrant enough for my taste */
+ if (chain)
+ SaveTransactionCharacteristics();
+
+ /* Catch any error occurring during the COMMIT */
+ PG_TRY();
+ {
+ /* Protect current SPI stack entry against deletion */
+ _SPI_current->internal_xact = true;
+
+ /*
+ * Hold any pinned portals that any PLs might be using. We have to do
+ * this before changing transaction state, since this will run
+ * user-defined code that might throw an error.
+ */
+ HoldPinnedPortals();
+
+ /* Release snapshots associated with portals */
+ ForgetPortalSnapshots();
+
+ /* Do the deed */
+ CommitTransactionCommand();
+
+ /* Immediately start a new transaction */
+ StartTransactionCommand();
+ if (chain)
+ RestoreTransactionCharacteristics();
+
+ MemoryContextSwitchTo(oldcontext);
+
+ _SPI_current->internal_xact = false;
+ }
+ PG_CATCH();
+ {
+ ErrorData *edata;
+
+ /* Save error info in caller's context */
+ MemoryContextSwitchTo(oldcontext);
+ edata = CopyErrorData();
+ FlushErrorState();
+
+ /*
+ * Abort the failed transaction. If this fails too, we'll just
+ * propagate the error out ... there's not that much we can do.
+ */
+ AbortCurrentTransaction();
+
+ /* ... and start a new one */
+ StartTransactionCommand();
+ if (chain)
+ RestoreTransactionCharacteristics();
+
+ MemoryContextSwitchTo(oldcontext);
+
+ _SPI_current->internal_xact = false;
+
+ /* Now that we've cleaned up the transaction, re-throw the error */
+ ReThrowError(edata);
+ }
+ PG_END_TRY();
+}
+
+void
+SPI_commit(void)
+{
+ _SPI_commit(false);
+}
+
+void
+SPI_commit_and_chain(void)
+{
+ _SPI_commit(true);
+}
+
+static void
+_SPI_rollback(bool chain)
+{
+ MemoryContext oldcontext = CurrentMemoryContext;
+
+ /* see under SPI_commit() */
+ if (_SPI_current->atomic)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION),
+ errmsg("invalid transaction termination")));
+
+ /* see under SPI_commit() */
+ if (IsSubTransaction())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION),
+ errmsg("cannot roll back while a subtransaction is active")));
+
+ /* XXX this ain't re-entrant enough for my taste */
+ if (chain)
+ SaveTransactionCharacteristics();
+
+ /* Catch any error occurring during the ROLLBACK */
+ PG_TRY();
+ {
+ /* Protect current SPI stack entry against deletion */
+ _SPI_current->internal_xact = true;
+
+ /*
+ * Hold any pinned portals that any PLs might be using. We have to do
+ * this before changing transaction state, since this will run
+ * user-defined code that might throw an error, and in any case
+ * couldn't be run in an already-aborted transaction.
+ */
+ HoldPinnedPortals();
+
+ /* Release snapshots associated with portals */
+ ForgetPortalSnapshots();
+
+ /* Do the deed */
+ AbortCurrentTransaction();
+
+ /* Immediately start a new transaction */
+ StartTransactionCommand();
+ if (chain)
+ RestoreTransactionCharacteristics();
+
+ MemoryContextSwitchTo(oldcontext);
+
+ _SPI_current->internal_xact = false;
+ }
+ PG_CATCH();
+ {
+ ErrorData *edata;
+
+ /* Save error info in caller's context */
+ MemoryContextSwitchTo(oldcontext);
+ edata = CopyErrorData();
+ FlushErrorState();
+
+ /*
+ * Try again to abort the failed transaction. If this fails too,
+ * we'll just propagate the error out ... there's not that much we can
+ * do.
+ */
+ AbortCurrentTransaction();
+
+ /* ... and start a new one */
+ StartTransactionCommand();
+ if (chain)
+ RestoreTransactionCharacteristics();
+
+ MemoryContextSwitchTo(oldcontext);
+
+ _SPI_current->internal_xact = false;
+
+ /* Now that we've cleaned up the transaction, re-throw the error */
+ ReThrowError(edata);
+ }
+ PG_END_TRY();
+}
+
+void
+SPI_rollback(void)
+{
+ _SPI_rollback(false);
+}
+
+void
+SPI_rollback_and_chain(void)
+{
+ _SPI_rollback(true);
+}
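+
+/*
+ * A sketch of how a nonatomic caller (for example, a PL executing a CALL
+ * statement) might use these entry points; the table and values are only
+ * illustrative:
+ *
+ *		SPI_connect_ext(SPI_OPT_NONATOMIC);
+ *		SPI_execute("INSERT INTO t VALUES (1)", false, 0);
+ *		SPI_commit();
+ *		(the first INSERT is committed; a new transaction has begun)
+ *		SPI_execute("INSERT INTO t VALUES (2)", false, 0);
+ *		SPI_rollback();
+ *		(only the second INSERT is discarded)
+ *		SPI_finish();
+ *
+ * In an atomic context (an ordinary function call), the SPI_commit and
+ * SPI_rollback calls would instead fail with
+ * ERRCODE_INVALID_TRANSACTION_TERMINATION, as enforced above.
+ */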
+
+/*
+ * SPICleanup is a no-op, kept for backwards compatibility. We rely on
+ * AtEOXact_SPI to cleanup. Extensions should not (need to) fiddle with the
+ * internal SPI state directly.
+ */
+void
+SPICleanup(void)
+{
+}
+
+/*
+ * Clean up SPI state at transaction commit or abort.
+ */
+void
+AtEOXact_SPI(bool isCommit)
+{
+ bool found = false;
+
+ /*
+ * Pop stack entries, stopping if we find one marked internal_xact (that
+ * one belongs to the caller of SPI_commit or SPI_rollback).
+ */
+ while (_SPI_connected >= 0)
+ {
+ _SPI_connection *connection = &(_SPI_stack[_SPI_connected]);
+
+ if (connection->internal_xact)
+ break;
+
+ found = true;
+
+ /*
+ * We need not release the procedure's memory contexts explicitly, as
+ * they'll go away automatically when their parent context does; see
+ * notes in SPI_connect_ext.
+ */
+
+ /*
+ * Restore outer global variables and pop the stack entry. Unlike
+ * SPI_finish(), we don't risk switching to memory contexts that might
+ * be already gone.
+ */
+ SPI_processed = connection->outer_processed;
+ SPI_tuptable = connection->outer_tuptable;
+ SPI_result = connection->outer_result;
+
+ _SPI_connected--;
+ if (_SPI_connected < 0)
+ _SPI_current = NULL;
+ else
+ _SPI_current = &(_SPI_stack[_SPI_connected]);
+ }
+
+ /* We should only find entries to pop during an ABORT. */
+ if (found && isCommit)
+ ereport(WARNING,
+ (errcode(ERRCODE_WARNING),
+ errmsg("transaction left non-empty SPI stack"),
+ errhint("Check for missing \"SPI_finish\" calls.")));
+}
+
+/*
+ * Clean up SPI state at subtransaction commit or abort.
+ *
+ * During commit, there shouldn't be any unclosed entries remaining from
+ * the current subtransaction; we emit a warning if any are found.
+ */
+void
+AtEOSubXact_SPI(bool isCommit, SubTransactionId mySubid)
+{
+ bool found = false;
+
+ while (_SPI_connected >= 0)
+ {
+ _SPI_connection *connection = &(_SPI_stack[_SPI_connected]);
+
+ if (connection->connectSubid != mySubid)
+ break; /* couldn't be any underneath it either */
+
+ if (connection->internal_xact)
+ break;
+
+ found = true;
+
+ /*
+ * Release procedure memory explicitly (see note in SPI_connect)
+ */
+ if (connection->execCxt)
+ {
+ MemoryContextDelete(connection->execCxt);
+ connection->execCxt = NULL;
+ }
+ if (connection->procCxt)
+ {
+ MemoryContextDelete(connection->procCxt);
+ connection->procCxt = NULL;
+ }
+
+ /*
+ * Restore outer global variables and pop the stack entry. Unlike
+ * SPI_finish(), we don't risk switching to memory contexts that might
+ * be already gone.
+ */
+ SPI_processed = connection->outer_processed;
+ SPI_tuptable = connection->outer_tuptable;
+ SPI_result = connection->outer_result;
+
+ _SPI_connected--;
+ if (_SPI_connected < 0)
+ _SPI_current = NULL;
+ else
+ _SPI_current = &(_SPI_stack[_SPI_connected]);
+ }
+
+ if (found && isCommit)
+ ereport(WARNING,
+ (errcode(ERRCODE_WARNING),
+ errmsg("subtransaction left non-empty SPI stack"),
+ errhint("Check for missing \"SPI_finish\" calls.")));
+
+ /*
+ * If we are aborting a subtransaction and there is an open SPI context
+ * surrounding the subxact, clean up to prevent memory leakage.
+ */
+ if (_SPI_current && !isCommit)
+ {
+ slist_mutable_iter siter;
+
+ /*
+ * Throw away executor state if current executor operation was started
+ * within current subxact (essentially, force a _SPI_end_call(true)).
+ */
+ if (_SPI_current->execSubid >= mySubid)
+ {
+ _SPI_current->execSubid = InvalidSubTransactionId;
+ MemoryContextResetAndDeleteChildren(_SPI_current->execCxt);
+ }
+
+ /* throw away any tuple tables created within current subxact */
+ slist_foreach_modify(siter, &_SPI_current->tuptables)
+ {
+ SPITupleTable *tuptable;
+
+ tuptable = slist_container(SPITupleTable, next, siter.cur);
+ if (tuptable->subid >= mySubid)
+ {
+ /*
+ * If we used SPI_freetuptable() here, its internal search of
+ * the tuptables list would make this operation O(N^2).
+ * Instead, just free the tuptable manually. This should
+ * match what SPI_freetuptable() does.
+ */
+ slist_delete_current(&siter);
+ if (tuptable == _SPI_current->tuptable)
+ _SPI_current->tuptable = NULL;
+ if (tuptable == SPI_tuptable)
+ SPI_tuptable = NULL;
+ MemoryContextDelete(tuptable->tuptabcxt);
+ }
+ }
+ }
+}
+
+/*
+ * Are we executing inside a procedure (that is, a nonatomic SPI context)?
+ */
+bool
+SPI_inside_nonatomic_context(void)
+{
+ if (_SPI_current == NULL)
+ return false; /* not in any SPI context at all */
+ if (_SPI_current->atomic)
+ return false; /* it's atomic (ie function not procedure) */
+ return true;
+}
+
+
+/* Parse, plan, and execute a query string */
+int
+SPI_execute(const char *src, bool read_only, long tcount)
+{
+ _SPI_plan plan;
+ SPIExecuteOptions options;
+ int res;
+
+ if (src == NULL || tcount < 0)
+ return SPI_ERROR_ARGUMENT;
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = RAW_PARSE_DEFAULT;
+ plan.cursor_options = CURSOR_OPT_PARALLEL_OK;
+
+ _SPI_prepare_oneshot_plan(src, &plan);
+
+ memset(&options, 0, sizeof(options));
+ options.read_only = read_only;
+ options.tcount = tcount;
+
+ res = _SPI_execute_plan(&plan, &options,
+ InvalidSnapshot, InvalidSnapshot,
+ true);
+
+ _SPI_end_call(true);
+ return res;
+}
+
+/* Obsolete version of SPI_execute */
+int
+SPI_exec(const char *src, long tcount)
+{
+ return SPI_execute(src, false, tcount);
+}
+
+/* Parse, plan, and execute a query string, with extensible options */
+int
+SPI_execute_extended(const char *src,
+ const SPIExecuteOptions *options)
+{
+ int res;
+ _SPI_plan plan;
+
+ if (src == NULL || options == NULL)
+ return SPI_ERROR_ARGUMENT;
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = RAW_PARSE_DEFAULT;
+ plan.cursor_options = CURSOR_OPT_PARALLEL_OK;
+ if (options->params)
+ {
+ plan.parserSetup = options->params->parserSetup;
+ plan.parserSetupArg = options->params->parserSetupArg;
+ }
+
+ _SPI_prepare_oneshot_plan(src, &plan);
+
+ res = _SPI_execute_plan(&plan, options,
+ InvalidSnapshot, InvalidSnapshot,
+ true);
+
+ _SPI_end_call(true);
+ return res;
+}
+
+/* Execute a previously prepared plan */
+int
+SPI_execute_plan(SPIPlanPtr plan, Datum *Values, const char *Nulls,
+ bool read_only, long tcount)
+{
+ SPIExecuteOptions options;
+ int res;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0)
+ return SPI_ERROR_ARGUMENT;
+
+ if (plan->nargs > 0 && Values == NULL)
+ return SPI_ERROR_PARAM;
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&options, 0, sizeof(options));
+ options.params = _SPI_convert_params(plan->nargs, plan->argtypes,
+ Values, Nulls);
+ options.read_only = read_only;
+ options.tcount = tcount;
+
+ res = _SPI_execute_plan(plan, &options,
+ InvalidSnapshot, InvalidSnapshot,
+ true);
+
+ _SPI_end_call(true);
+ return res;
+}
+
+/* Obsolete version of SPI_execute_plan */
+int
+SPI_execp(SPIPlanPtr plan, Datum *Values, const char *Nulls, long tcount)
+{
+ return SPI_execute_plan(plan, Values, Nulls, false, tcount);
+}
+
+/* Execute a previously prepared plan */
+int
+SPI_execute_plan_extended(SPIPlanPtr plan,
+ const SPIExecuteOptions *options)
+{
+ int res;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || options == NULL)
+ return SPI_ERROR_ARGUMENT;
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ res = _SPI_execute_plan(plan, options,
+ InvalidSnapshot, InvalidSnapshot,
+ true);
+
+ _SPI_end_call(true);
+ return res;
+}
+
+/* Execute a previously prepared plan */
+int
+SPI_execute_plan_with_paramlist(SPIPlanPtr plan, ParamListInfo params,
+ bool read_only, long tcount)
+{
+ SPIExecuteOptions options;
+ int res;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0)
+ return SPI_ERROR_ARGUMENT;
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&options, 0, sizeof(options));
+ options.params = params;
+ options.read_only = read_only;
+ options.tcount = tcount;
+
+ res = _SPI_execute_plan(plan, &options,
+ InvalidSnapshot, InvalidSnapshot,
+ true);
+
+ _SPI_end_call(true);
+ return res;
+}
+
+/*
+ * SPI_execute_snapshot -- identical to SPI_execute_plan, except that we allow
+ * the caller to specify exactly which snapshots to use, which will be
+ * registered here. Also, the caller may specify that AFTER triggers should be
+ * queued as part of the outer query rather than being fired immediately at the
+ * end of the command.
+ *
+ * This is currently not documented in spi.sgml because it is only intended
+ * for use by RI triggers.
+ *
+ * Passing snapshot == InvalidSnapshot will select the normal behavior of
+ * fetching a new snapshot for each query.
+ */
+int
+SPI_execute_snapshot(SPIPlanPtr plan,
+ Datum *Values, const char *Nulls,
+ Snapshot snapshot, Snapshot crosscheck_snapshot,
+ bool read_only, bool fire_triggers, long tcount)
+{
+ SPIExecuteOptions options;
+ int res;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0)
+ return SPI_ERROR_ARGUMENT;
+
+ if (plan->nargs > 0 && Values == NULL)
+ return SPI_ERROR_PARAM;
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&options, 0, sizeof(options));
+ options.params = _SPI_convert_params(plan->nargs, plan->argtypes,
+ Values, Nulls);
+ options.read_only = read_only;
+ options.tcount = tcount;
+
+ res = _SPI_execute_plan(plan, &options,
+ snapshot, crosscheck_snapshot,
+ fire_triggers);
+
+ _SPI_end_call(true);
+ return res;
+}
+
+/*
+ * SPI_execute_with_args -- plan and execute a query with supplied arguments
+ *
+ * This is functionally equivalent to SPI_prepare followed by
+ * SPI_execute_plan.
+ */
+int
+SPI_execute_with_args(const char *src,
+ int nargs, Oid *argtypes,
+ Datum *Values, const char *Nulls,
+ bool read_only, long tcount)
+{
+ int res;
+ _SPI_plan plan;
+ ParamListInfo paramLI;
+ SPIExecuteOptions options;
+
+ if (src == NULL || nargs < 0 || tcount < 0)
+ return SPI_ERROR_ARGUMENT;
+
+ if (nargs > 0 && (argtypes == NULL || Values == NULL))
+ return SPI_ERROR_PARAM;
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = RAW_PARSE_DEFAULT;
+ plan.cursor_options = CURSOR_OPT_PARALLEL_OK;
+ plan.nargs = nargs;
+ plan.argtypes = argtypes;
+ plan.parserSetup = NULL;
+ plan.parserSetupArg = NULL;
+
+ paramLI = _SPI_convert_params(nargs, argtypes,
+ Values, Nulls);
+
+ _SPI_prepare_oneshot_plan(src, &plan);
+
+ memset(&options, 0, sizeof(options));
+ options.params = paramLI;
+ options.read_only = read_only;
+ options.tcount = tcount;
+
+ res = _SPI_execute_plan(&plan, &options,
+ InvalidSnapshot, InvalidSnapshot,
+ true);
+
+ _SPI_end_call(true);
+ return res;
+}
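+
+/*
+ * For illustration, a one-off parameterized command (table name and value
+ * are hypothetical) could be run as:
+ *
+ *		Oid		argtypes[1] = {INT4OID};
+ *		Datum	values[1];
+ *
+ *		values[0] = Int32GetDatum(42);
+ *		SPI_execute_with_args("DELETE FROM work_queue WHERE id = $1",
+ *							  1, argtypes, values, NULL,
+ *							  false, 0);
+ *
+ * Passing Nulls = NULL means no parameter is null, per _SPI_convert_params.
+ */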
+
+SPIPlanPtr
+SPI_prepare(const char *src, int nargs, Oid *argtypes)
+{
+ return SPI_prepare_cursor(src, nargs, argtypes, 0);
+}
+
+SPIPlanPtr
+SPI_prepare_cursor(const char *src, int nargs, Oid *argtypes,
+ int cursorOptions)
+{
+ _SPI_plan plan;
+ SPIPlanPtr result;
+
+ if (src == NULL || nargs < 0 || (nargs > 0 && argtypes == NULL))
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return NULL;
+ }
+
+ SPI_result = _SPI_begin_call(true);
+ if (SPI_result < 0)
+ return NULL;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = RAW_PARSE_DEFAULT;
+ plan.cursor_options = cursorOptions;
+ plan.nargs = nargs;
+ plan.argtypes = argtypes;
+ plan.parserSetup = NULL;
+ plan.parserSetupArg = NULL;
+
+ _SPI_prepare_plan(src, &plan);
+
+ /* copy plan to procedure context */
+ result = _SPI_make_plan_non_temp(&plan);
+
+ _SPI_end_call(true);
+
+ return result;
+}
+
+SPIPlanPtr
+SPI_prepare_extended(const char *src,
+ const SPIPrepareOptions *options)
+{
+ _SPI_plan plan;
+ SPIPlanPtr result;
+
+ if (src == NULL || options == NULL)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return NULL;
+ }
+
+ SPI_result = _SPI_begin_call(true);
+ if (SPI_result < 0)
+ return NULL;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = options->parseMode;
+ plan.cursor_options = options->cursorOptions;
+ plan.nargs = 0;
+ plan.argtypes = NULL;
+ plan.parserSetup = options->parserSetup;
+ plan.parserSetupArg = options->parserSetupArg;
+
+ _SPI_prepare_plan(src, &plan);
+
+ /* copy plan to procedure context */
+ result = _SPI_make_plan_non_temp(&plan);
+
+ _SPI_end_call(true);
+
+ return result;
+}
+
+SPIPlanPtr
+SPI_prepare_params(const char *src,
+ ParserSetupHook parserSetup,
+ void *parserSetupArg,
+ int cursorOptions)
+{
+ _SPI_plan plan;
+ SPIPlanPtr result;
+
+ if (src == NULL)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return NULL;
+ }
+
+ SPI_result = _SPI_begin_call(true);
+ if (SPI_result < 0)
+ return NULL;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = RAW_PARSE_DEFAULT;
+ plan.cursor_options = cursorOptions;
+ plan.nargs = 0;
+ plan.argtypes = NULL;
+ plan.parserSetup = parserSetup;
+ plan.parserSetupArg = parserSetupArg;
+
+ _SPI_prepare_plan(src, &plan);
+
+ /* copy plan to procedure context */
+ result = _SPI_make_plan_non_temp(&plan);
+
+ _SPI_end_call(true);
+
+ return result;
+}
+
+int
+SPI_keepplan(SPIPlanPtr plan)
+{
+ ListCell *lc;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC ||
+ plan->saved || plan->oneshot)
+ return SPI_ERROR_ARGUMENT;
+
+ /*
+ * Mark it saved, reparent it under CacheMemoryContext, and mark all the
+ * component CachedPlanSources as saved. This sequence cannot fail
+ * partway through, so there's no risk of long-term memory leakage.
+ */
+ plan->saved = true;
+ MemoryContextSetParent(plan->plancxt, CacheMemoryContext);
+
+ foreach(lc, plan->plancache_list)
+ {
+ CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc);
+
+ SaveCachedPlan(plansource);
+ }
+
+ return 0;
+}
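+
+/*
+ * A minimal sketch of the prepare-once/reuse pattern this enables (query
+ * text and argument type are only illustrative):
+ *
+ *		static SPIPlanPtr saved_plan = NULL;
+ *
+ *		if (saved_plan == NULL)
+ *		{
+ *			Oid			argtypes[1] = {INT4OID};
+ *			SPIPlanPtr	plan = SPI_prepare("SELECT $1 + 1", 1, argtypes);
+ *
+ *			if (plan == NULL || SPI_keepplan(plan) != 0)
+ *				elog(ERROR, "could not prepare plan");
+ *			saved_plan = plan;
+ *		}
+ *
+ * Once SPI_keepplan succeeds, saved_plan survives SPI_finish and can be
+ * passed to SPI_execute_plan in later calls.
+ */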
+
+SPIPlanPtr
+SPI_saveplan(SPIPlanPtr plan)
+{
+ SPIPlanPtr newplan;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return NULL;
+ }
+
+ SPI_result = _SPI_begin_call(false); /* don't change context */
+ if (SPI_result < 0)
+ return NULL;
+
+ newplan = _SPI_save_plan(plan);
+
+ SPI_result = _SPI_end_call(false);
+
+ return newplan;
+}
+
+int
+SPI_freeplan(SPIPlanPtr plan)
+{
+ ListCell *lc;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC)
+ return SPI_ERROR_ARGUMENT;
+
+ /* Release the plancache entries */
+ foreach(lc, plan->plancache_list)
+ {
+ CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc);
+
+ DropCachedPlan(plansource);
+ }
+
+ /* Now get rid of the _SPI_plan and subsidiary data in its plancxt */
+ MemoryContextDelete(plan->plancxt);
+
+ return 0;
+}
+
+HeapTuple
+SPI_copytuple(HeapTuple tuple)
+{
+ MemoryContext oldcxt;
+ HeapTuple ctuple;
+
+ if (tuple == NULL)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return NULL;
+ }
+
+ if (_SPI_current == NULL)
+ {
+ SPI_result = SPI_ERROR_UNCONNECTED;
+ return NULL;
+ }
+
+ oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt);
+
+ ctuple = heap_copytuple(tuple);
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return ctuple;
+}
+
+HeapTupleHeader
+SPI_returntuple(HeapTuple tuple, TupleDesc tupdesc)
+{
+ MemoryContext oldcxt;
+ HeapTupleHeader dtup;
+
+ if (tuple == NULL || tupdesc == NULL)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return NULL;
+ }
+
+ if (_SPI_current == NULL)
+ {
+ SPI_result = SPI_ERROR_UNCONNECTED;
+ return NULL;
+ }
+
+ /* For RECORD results, make sure a typmod has been assigned */
+ if (tupdesc->tdtypeid == RECORDOID &&
+ tupdesc->tdtypmod < 0)
+ assign_record_type_typmod(tupdesc);
+
+ oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt);
+
+ dtup = DatumGetHeapTupleHeader(heap_copy_tuple_as_datum(tuple, tupdesc));
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return dtup;
+}
+
+HeapTuple
+SPI_modifytuple(Relation rel, HeapTuple tuple, int natts, int *attnum,
+ Datum *Values, const char *Nulls)
+{
+ MemoryContext oldcxt;
+ HeapTuple mtuple;
+ int numberOfAttributes;
+ Datum *v;
+ bool *n;
+ int i;
+
+ if (rel == NULL || tuple == NULL || natts < 0 || attnum == NULL || Values == NULL)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return NULL;
+ }
+
+ if (_SPI_current == NULL)
+ {
+ SPI_result = SPI_ERROR_UNCONNECTED;
+ return NULL;
+ }
+
+ oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt);
+
+ SPI_result = 0;
+
+ numberOfAttributes = rel->rd_att->natts;
+ v = (Datum *) palloc(numberOfAttributes * sizeof(Datum));
+ n = (bool *) palloc(numberOfAttributes * sizeof(bool));
+
+ /* fetch old values and nulls */
+ heap_deform_tuple(tuple, rel->rd_att, v, n);
+
+ /* replace values and nulls */
+ for (i = 0; i < natts; i++)
+ {
+ if (attnum[i] <= 0 || attnum[i] > numberOfAttributes)
+ break;
+ v[attnum[i] - 1] = Values[i];
+ n[attnum[i] - 1] = (Nulls && Nulls[i] == 'n') ? true : false;
+ }
+
+ if (i == natts) /* no errors in *attnum */
+ {
+ mtuple = heap_form_tuple(rel->rd_att, v, n);
+
+ /*
+ * copy the identification info of the old tuple: t_ctid, t_self, and
+ * OID (if any)
+ */
+ mtuple->t_data->t_ctid = tuple->t_data->t_ctid;
+ mtuple->t_self = tuple->t_self;
+ mtuple->t_tableOid = tuple->t_tableOid;
+ }
+ else
+ {
+ mtuple = NULL;
+ SPI_result = SPI_ERROR_NOATTRIBUTE;
+ }
+
+ pfree(v);
+ pfree(n);
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return mtuple;
+}
+
+int
+SPI_fnumber(TupleDesc tupdesc, const char *fname)
+{
+ int res;
+ const FormData_pg_attribute *sysatt;
+
+ for (res = 0; res < tupdesc->natts; res++)
+ {
+ Form_pg_attribute attr = TupleDescAttr(tupdesc, res);
+
+ if (namestrcmp(&attr->attname, fname) == 0 &&
+ !attr->attisdropped)
+ return res + 1;
+ }
+
+ sysatt = SystemAttributeByName(fname);
+ if (sysatt != NULL)
+ return sysatt->attnum;
+
+ /* SPI_ERROR_NOATTRIBUTE is different from all sys column numbers */
+ return SPI_ERROR_NOATTRIBUTE;
+}
+
+char *
+SPI_fname(TupleDesc tupdesc, int fnumber)
+{
+ const FormData_pg_attribute *att;
+
+ SPI_result = 0;
+
+ if (fnumber > tupdesc->natts || fnumber == 0 ||
+ fnumber <= FirstLowInvalidHeapAttributeNumber)
+ {
+ SPI_result = SPI_ERROR_NOATTRIBUTE;
+ return NULL;
+ }
+
+ if (fnumber > 0)
+ att = TupleDescAttr(tupdesc, fnumber - 1);
+ else
+ att = SystemAttributeDefinition(fnumber);
+
+ return pstrdup(NameStr(att->attname));
+}
+
+char *
+SPI_getvalue(HeapTuple tuple, TupleDesc tupdesc, int fnumber)
+{
+ Datum val;
+ bool isnull;
+ Oid typoid,
+ foutoid;
+ bool typisvarlena;
+
+ SPI_result = 0;
+
+ if (fnumber > tupdesc->natts || fnumber == 0 ||
+ fnumber <= FirstLowInvalidHeapAttributeNumber)
+ {
+ SPI_result = SPI_ERROR_NOATTRIBUTE;
+ return NULL;
+ }
+
+ val = heap_getattr(tuple, fnumber, tupdesc, &isnull);
+ if (isnull)
+ return NULL;
+
+ if (fnumber > 0)
+ typoid = TupleDescAttr(tupdesc, fnumber - 1)->atttypid;
+ else
+ typoid = (SystemAttributeDefinition(fnumber))->atttypid;
+
+ getTypeOutputInfo(typoid, &foutoid, &typisvarlena);
+
+ return OidOutputFunctionCall(foutoid, val);
+}
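+
+/*
+ * Putting these accessors together, a caller typically walks the rows of
+ * SPI_tuptable after a successful SPI_execute; the column name "relname"
+ * is only illustrative:
+ *
+ *		SPITupleTable *tuptable = SPI_tuptable;
+ *		int			colno = SPI_fnumber(tuptable->tupdesc, "relname");
+ *		uint64		i;
+ *
+ *		for (i = 0; i < SPI_processed; i++)
+ *		{
+ *			char	   *val = SPI_getvalue(tuptable->vals[i],
+ *										   tuptable->tupdesc, colno);
+ *
+ *			(val is NULL for a SQL null; otherwise it is palloc'd text)
+ *		}
+ */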
+
+Datum
+SPI_getbinval(HeapTuple tuple, TupleDesc tupdesc, int fnumber, bool *isnull)
+{
+ SPI_result = 0;
+
+ if (fnumber > tupdesc->natts || fnumber == 0 ||
+ fnumber <= FirstLowInvalidHeapAttributeNumber)
+ {
+ SPI_result = SPI_ERROR_NOATTRIBUTE;
+ *isnull = true;
+ return (Datum) NULL;
+ }
+
+ return heap_getattr(tuple, fnumber, tupdesc, isnull);
+}
+
+char *
+SPI_gettype(TupleDesc tupdesc, int fnumber)
+{
+ Oid typoid;
+ HeapTuple typeTuple;
+ char *result;
+
+ SPI_result = 0;
+
+ if (fnumber > tupdesc->natts || fnumber == 0 ||
+ fnumber <= FirstLowInvalidHeapAttributeNumber)
+ {
+ SPI_result = SPI_ERROR_NOATTRIBUTE;
+ return NULL;
+ }
+
+ if (fnumber > 0)
+ typoid = TupleDescAttr(tupdesc, fnumber - 1)->atttypid;
+ else
+ typoid = (SystemAttributeDefinition(fnumber))->atttypid;
+
+ typeTuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid));
+
+ if (!HeapTupleIsValid(typeTuple))
+ {
+ SPI_result = SPI_ERROR_TYPUNKNOWN;
+ return NULL;
+ }
+
+ result = pstrdup(NameStr(((Form_pg_type) GETSTRUCT(typeTuple))->typname));
+ ReleaseSysCache(typeTuple);
+ return result;
+}
+
+/*
+ * Get the data type OID for a column.
+ *
+ * There's nothing similar for typmod and typcollation. The rare consumers
+ * thereof should inspect the TupleDesc directly.
+ */
+Oid
+SPI_gettypeid(TupleDesc tupdesc, int fnumber)
+{
+ SPI_result = 0;
+
+ if (fnumber > tupdesc->natts || fnumber == 0 ||
+ fnumber <= FirstLowInvalidHeapAttributeNumber)
+ {
+ SPI_result = SPI_ERROR_NOATTRIBUTE;
+ return InvalidOid;
+ }
+
+ if (fnumber > 0)
+ return TupleDescAttr(tupdesc, fnumber - 1)->atttypid;
+ else
+ return (SystemAttributeDefinition(fnumber))->atttypid;
+}
+
+char *
+SPI_getrelname(Relation rel)
+{
+ return pstrdup(RelationGetRelationName(rel));
+}
+
+char *
+SPI_getnspname(Relation rel)
+{
+ return get_namespace_name(RelationGetNamespace(rel));
+}
+
+void *
+SPI_palloc(Size size)
+{
+ if (_SPI_current == NULL)
+ elog(ERROR, "SPI_palloc called while not connected to SPI");
+
+ return MemoryContextAlloc(_SPI_current->savedcxt, size);
+}
+
+void *
+SPI_repalloc(void *pointer, Size size)
+{
+ /* No longer need to worry which context chunk was in... */
+ return repalloc(pointer, size);
+}
+
+void
+SPI_pfree(void *pointer)
+{
+ /* No longer need to worry which context chunk was in... */
+ pfree(pointer);
+}
+
+Datum
+SPI_datumTransfer(Datum value, bool typByVal, int typLen)
+{
+ MemoryContext oldcxt;
+ Datum result;
+
+ if (_SPI_current == NULL)
+ elog(ERROR, "SPI_datumTransfer called while not connected to SPI");
+
+ oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt);
+
+ result = datumTransfer(value, typByVal, typLen);
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return result;
+}
+
+void
+SPI_freetuple(HeapTuple tuple)
+{
+ /* No longer need to worry which context tuple was in... */
+ heap_freetuple(tuple);
+}
+
+void
+SPI_freetuptable(SPITupleTable *tuptable)
+{
+ bool found = false;
+
+ /* ignore call if NULL pointer */
+ if (tuptable == NULL)
+ return;
+
+ /*
+ * Search only the topmost SPI context for a matching tuple table.
+ */
+ if (_SPI_current != NULL)
+ {
+ slist_mutable_iter siter;
+
+ /* find tuptable in active list, then remove it */
+ slist_foreach_modify(siter, &_SPI_current->tuptables)
+ {
+ SPITupleTable *tt;
+
+ tt = slist_container(SPITupleTable, next, siter.cur);
+ if (tt == tuptable)
+ {
+ slist_delete_current(&siter);
+ found = true;
+ break;
+ }
+ }
+ }
+
+ /*
+ * Refuse the deletion if we didn't find it in the topmost SPI context.
+ * This is primarily a guard against double deletion, but might prevent
+ * other errors as well. Since the worst consequence of not deleting a
+ * tuptable would be a transient memory leak, this is just a WARNING.
+ */
+ if (!found)
+ {
+ elog(WARNING, "attempt to delete invalid SPITupleTable %p", tuptable);
+ return;
+ }
+
+ /* for safety, reset global variables that might point at tuptable */
+ if (tuptable == _SPI_current->tuptable)
+ _SPI_current->tuptable = NULL;
+ if (tuptable == SPI_tuptable)
+ SPI_tuptable = NULL;
+
+ /* release all memory belonging to tuptable */
+ MemoryContextDelete(tuptable->tuptabcxt);
+}
+
+
+/*
+ * SPI_cursor_open()
+ *
+ * Open a prepared SPI plan as a portal
+ */
+Portal
+SPI_cursor_open(const char *name, SPIPlanPtr plan,
+ Datum *Values, const char *Nulls,
+ bool read_only)
+{
+ Portal portal;
+ ParamListInfo paramLI;
+
+ /* build transient ParamListInfo in caller's context */
+ paramLI = _SPI_convert_params(plan->nargs, plan->argtypes,
+ Values, Nulls);
+
+ portal = SPI_cursor_open_internal(name, plan, paramLI, read_only);
+
+ /* done with the transient ParamListInfo */
+ if (paramLI)
+ pfree(paramLI);
+
+ return portal;
+}
+
+
+/*
+ * SPI_cursor_open_with_args()
+ *
+ * Parse and plan a query and open it as a portal.
+ */
+Portal
+SPI_cursor_open_with_args(const char *name,
+ const char *src,
+ int nargs, Oid *argtypes,
+ Datum *Values, const char *Nulls,
+ bool read_only, int cursorOptions)
+{
+ Portal result;
+ _SPI_plan plan;
+ ParamListInfo paramLI;
+
+ if (src == NULL || nargs < 0)
+ elog(ERROR, "SPI_cursor_open_with_args called with invalid arguments");
+
+ if (nargs > 0 && (argtypes == NULL || Values == NULL))
+ elog(ERROR, "SPI_cursor_open_with_args called with missing parameters");
+
+ SPI_result = _SPI_begin_call(true);
+ if (SPI_result < 0)
+ elog(ERROR, "SPI_cursor_open_with_args called while not connected");
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = RAW_PARSE_DEFAULT;
+ plan.cursor_options = cursorOptions;
+ plan.nargs = nargs;
+ plan.argtypes = argtypes;
+ plan.parserSetup = NULL;
+ plan.parserSetupArg = NULL;
+
+ /* build transient ParamListInfo in executor context */
+ paramLI = _SPI_convert_params(nargs, argtypes,
+ Values, Nulls);
+
+ _SPI_prepare_plan(src, &plan);
+
+ /* We needn't copy the plan; SPI_cursor_open_internal will do so */
+
+ result = SPI_cursor_open_internal(name, &plan, paramLI, read_only);
+
+ /* And clean up */
+ _SPI_end_call(true);
+
+ return result;
+}
+
+
+/*
+ * SPI_cursor_open_with_paramlist()
+ *
+ * Same as SPI_cursor_open except that parameters (if any) are passed
+ * as a ParamListInfo, which supports dynamic parameter set determination
+ */
+Portal
+SPI_cursor_open_with_paramlist(const char *name, SPIPlanPtr plan,
+ ParamListInfo params, bool read_only)
+{
+ return SPI_cursor_open_internal(name, plan, params, read_only);
+}
+
+/* Parse a query and open it as a cursor */
+Portal
+SPI_cursor_parse_open(const char *name,
+ const char *src,
+ const SPIParseOpenOptions *options)
+{
+ Portal result;
+ _SPI_plan plan;
+
+ if (src == NULL || options == NULL)
+ elog(ERROR, "SPI_cursor_parse_open called with invalid arguments");
+
+ SPI_result = _SPI_begin_call(true);
+ if (SPI_result < 0)
+ elog(ERROR, "SPI_cursor_parse_open called while not connected");
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.parse_mode = RAW_PARSE_DEFAULT;
+ plan.cursor_options = options->cursorOptions;
+ if (options->params)
+ {
+ plan.parserSetup = options->params->parserSetup;
+ plan.parserSetupArg = options->params->parserSetupArg;
+ }
+
+ _SPI_prepare_plan(src, &plan);
+
+ /* We needn't copy the plan; SPI_cursor_open_internal will do so */
+
+ result = SPI_cursor_open_internal(name, &plan,
+ options->params, options->read_only);
+
+ /* And clean up */
+ _SPI_end_call(true);
+
+ return result;
+}
+
+
+/*
+ * SPI_cursor_open_internal()
+ *
+ * Common code for SPI_cursor_open variants
+ */
+static Portal
+SPI_cursor_open_internal(const char *name, SPIPlanPtr plan,
+ ParamListInfo paramLI, bool read_only)
+{
+ CachedPlanSource *plansource;
+ CachedPlan *cplan;
+ List *stmt_list;
+ char *query_string;
+ Snapshot snapshot;
+ MemoryContext oldcontext;
+ Portal portal;
+ SPICallbackArg spicallbackarg;
+ ErrorContextCallback spierrcontext;
+
+ /*
+ * Check that the plan is something the Portal code will special-case as
+ * returning one tupleset.
+ */
+ if (!SPI_is_cursor_plan(plan))
+ {
+ /* try to give a good error message */
+ const char *cmdtag;
+
+ if (list_length(plan->plancache_list) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_DEFINITION),
+ errmsg("cannot open multi-query plan as cursor")));
+ plansource = (CachedPlanSource *) linitial(plan->plancache_list);
+ /* A SELECT that fails SPI_is_cursor_plan() must be SELECT INTO */
+ if (plansource->commandTag == CMDTAG_SELECT)
+ cmdtag = "SELECT INTO";
+ else
+ cmdtag = GetCommandTagName(plansource->commandTag);
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_DEFINITION),
+ /* translator: %s is name of a SQL command, eg INSERT */
+ errmsg("cannot open %s query as cursor", cmdtag)));
+ }
+
+ Assert(list_length(plan->plancache_list) == 1);
+ plansource = (CachedPlanSource *) linitial(plan->plancache_list);
+
+ /* Push the SPI stack */
+ if (_SPI_begin_call(true) < 0)
+ elog(ERROR, "SPI_cursor_open called while not connected");
+
+ /* Reset SPI result (note we deliberately don't touch lastoid) */
+ SPI_processed = 0;
+ SPI_tuptable = NULL;
+ _SPI_current->processed = 0;
+ _SPI_current->tuptable = NULL;
+
+ /* Create the portal */
+ if (name == NULL || name[0] == '\0')
+ {
+ /* Use a random nonconflicting name */
+ portal = CreateNewPortal();
+ }
+ else
+ {
+ /* In this path, error if portal of same name already exists */
+ portal = CreatePortal(name, false, false);
+ }
+
+ /* Copy the plan's query string into the portal */
+ query_string = MemoryContextStrdup(portal->portalContext,
+ plansource->query_string);
+
+ /*
+ * Setup error traceback support for ereport(), in case GetCachedPlan
+ * throws an error.
+ */
+ spicallbackarg.query = plansource->query_string;
+ spicallbackarg.mode = plan->parse_mode;
+ spierrcontext.callback = _SPI_error_callback;
+ spierrcontext.arg = &spicallbackarg;
+ spierrcontext.previous = error_context_stack;
+ error_context_stack = &spierrcontext;
+
+ /*
+ * Note: for a saved plan, we mustn't have any failure occur between
+ * GetCachedPlan and PortalDefineQuery; that would result in leaking our
+ * plancache refcount.
+ */
+
+ /* Replan if needed, and increment plan refcount for portal */
+ cplan = GetCachedPlan(plansource, paramLI, NULL, _SPI_current->queryEnv);
+ stmt_list = cplan->stmt_list;
+
+ if (!plan->saved)
+ {
+ /*
+ * We don't want the portal to depend on an unsaved CachedPlanSource,
+ * so must copy the plan into the portal's context. An error here
+ * will result in leaking our refcount on the plan, but it doesn't
+ * matter because the plan is unsaved and hence transient anyway.
+ */
+ oldcontext = MemoryContextSwitchTo(portal->portalContext);
+ stmt_list = copyObject(stmt_list);
+ MemoryContextSwitchTo(oldcontext);
+ ReleaseCachedPlan(cplan, NULL);
+ cplan = NULL; /* portal shouldn't depend on cplan */
+ }
+
+ /*
+ * Set up the portal.
+ */
+ PortalDefineQuery(portal,
+ NULL, /* no statement name */
+ query_string,
+ plansource->commandTag,
+ stmt_list,
+ cplan);
+
+ /*
+ * Set up options for portal. Default SCROLL type is chosen the same way
+ * as PerformCursorOpen does it.
+ */
+ portal->cursorOptions = plan->cursor_options;
+ if (!(portal->cursorOptions & (CURSOR_OPT_SCROLL | CURSOR_OPT_NO_SCROLL)))
+ {
+ if (list_length(stmt_list) == 1 &&
+ linitial_node(PlannedStmt, stmt_list)->commandType != CMD_UTILITY &&
+ linitial_node(PlannedStmt, stmt_list)->rowMarks == NIL &&
+ ExecSupportsBackwardScan(linitial_node(PlannedStmt, stmt_list)->planTree))
+ portal->cursorOptions |= CURSOR_OPT_SCROLL;
+ else
+ portal->cursorOptions |= CURSOR_OPT_NO_SCROLL;
+ }
+
+ /*
+ * Disallow SCROLL with SELECT FOR UPDATE. This is not redundant with the
+ * check in transformDeclareCursorStmt because the cursor options might
+ * not have come through there.
+ */
+ if (portal->cursorOptions & CURSOR_OPT_SCROLL)
+ {
+ if (list_length(stmt_list) == 1 &&
+ linitial_node(PlannedStmt, stmt_list)->commandType != CMD_UTILITY &&
+ linitial_node(PlannedStmt, stmt_list)->rowMarks != NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("DECLARE SCROLL CURSOR ... FOR UPDATE/SHARE is not supported"),
+ errdetail("Scrollable cursors must be READ ONLY.")));
+ }
+
+ /* Make current query environment available to portal at execution time. */
+ portal->queryEnv = _SPI_current->queryEnv;
+
+ /*
+ * If told to be read-only, we'd better check for read-only queries. This
+ * can't be done earlier because we need to look at the finished, planned
+ * queries. (In particular, we don't want to do it between GetCachedPlan
+ * and PortalDefineQuery, because throwing an error between those steps
+ * would result in leaking our plancache refcount.)
+ */
+ if (read_only)
+ {
+ ListCell *lc;
+
+ foreach(lc, stmt_list)
+ {
+ PlannedStmt *pstmt = lfirst_node(PlannedStmt, lc);
+
+ if (!CommandIsReadOnly(pstmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ /* translator: %s is a SQL statement name */
+ errmsg("%s is not allowed in a non-volatile function",
+ CreateCommandName((Node *) pstmt))));
+ }
+ }
+
+ /* Set up the snapshot to use. */
+ if (read_only)
+ snapshot = GetActiveSnapshot();
+ else
+ {
+ CommandCounterIncrement();
+ snapshot = GetTransactionSnapshot();
+ }
+
+ /*
+ * If the plan has parameters, copy them into the portal. Note that this
+ * must be done after revalidating the plan, because in dynamic parameter
+ * cases the set of parameters could have changed during re-parsing.
+ */
+ if (paramLI)
+ {
+ oldcontext = MemoryContextSwitchTo(portal->portalContext);
+ paramLI = copyParamList(paramLI);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /*
+ * Start portal execution.
+ */
+ PortalStart(portal, paramLI, 0, snapshot);
+
+ Assert(portal->strategy != PORTAL_MULTI_QUERY);
+
+ /* Pop the error context stack */
+ error_context_stack = spierrcontext.previous;
+
+ /* Pop the SPI stack */
+ _SPI_end_call(true);
+
+ /* Return the created portal */
+ return portal;
+}
+
+
+/*
+ * SPI_cursor_find()
+ *
+ * Find the portal of an existing open cursor
+ */
+Portal
+SPI_cursor_find(const char *name)
+{
+ return GetPortalByName(name);
+}
+
+
+/*
+ * SPI_cursor_fetch()
+ *
+ * Fetch rows in a cursor
+ */
+void
+SPI_cursor_fetch(Portal portal, bool forward, long count)
+{
+ _SPI_cursor_operation(portal,
+ forward ? FETCH_FORWARD : FETCH_BACKWARD, count,
+ CreateDestReceiver(DestSPI));
+ /* we know that the DestSPI receiver doesn't need a destroy call */
+}
+
+
+/*
+ * SPI_cursor_move()
+ *
+ * Move in a cursor
+ */
+void
+SPI_cursor_move(Portal portal, bool forward, long count)
+{
+ _SPI_cursor_operation(portal,
+ forward ? FETCH_FORWARD : FETCH_BACKWARD, count,
+ None_Receiver);
+}
+
+
+/*
+ * SPI_scroll_cursor_fetch()
+ *
+ * Fetch rows in a scrollable cursor
+ */
+void
+SPI_scroll_cursor_fetch(Portal portal, FetchDirection direction, long count)
+{
+ _SPI_cursor_operation(portal,
+ direction, count,
+ CreateDestReceiver(DestSPI));
+ /* we know that the DestSPI receiver doesn't need a destroy call */
+}
+
+
+/*
+ * SPI_scroll_cursor_move()
+ *
+ * Move in a scrollable cursor
+ */
+void
+SPI_scroll_cursor_move(Portal portal, FetchDirection direction, long count)
+{
+ _SPI_cursor_operation(portal, direction, count, None_Receiver);
+}
+
+
+/*
+ * SPI_cursor_close()
+ *
+ * Close a cursor
+ */
+void
+SPI_cursor_close(Portal portal)
+{
+ if (!PortalIsValid(portal))
+ elog(ERROR, "invalid portal in SPI cursor operation");
+
+ PortalDrop(portal, false);
+}
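+
+/*
+ * Tying the cursor entry points together, a sketch of typical usage (query
+ * text and fetch size are only illustrative):
+ *
+ *		SPIPlanPtr	plan = SPI_prepare("SELECT relname FROM pg_class", 0, NULL);
+ *		Portal		portal = SPI_cursor_open(NULL, plan, NULL, NULL, true);
+ *
+ *		do
+ *		{
+ *			SPI_cursor_fetch(portal, true, 100);
+ *			(process SPI_processed rows from SPI_tuptable here)
+ *			SPI_freetuptable(SPI_tuptable);
+ *		} while (SPI_processed > 0);
+ *
+ *		SPI_cursor_close(portal);
+ */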
+
+/*
+ * Returns the Oid representing the type id for argument at argIndex. First
+ * parameter is at index zero.
+ */
+Oid
+SPI_getargtypeid(SPIPlanPtr plan, int argIndex)
+{
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC ||
+ argIndex < 0 || argIndex >= plan->nargs)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return InvalidOid;
+ }
+ return plan->argtypes[argIndex];
+}
+
+/*
+ * Returns the number of arguments for the prepared plan.
+ */
+int
+SPI_getargcount(SPIPlanPtr plan)
+{
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return -1;
+ }
+ return plan->nargs;
+}
+
+/*
+ * Returns true if the plan contains exactly one command
+ * and that command returns tuples to the caller (eg, SELECT or
+ * INSERT ... RETURNING, but not SELECT ... INTO). In essence,
+ * the result indicates if the command can be used with SPI_cursor_open
+ *
+ * Parameters
+ * plan: A plan previously prepared using SPI_prepare
+ */
+bool
+SPI_is_cursor_plan(SPIPlanPtr plan)
+{
+ CachedPlanSource *plansource;
+
+ if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC)
+ {
+ SPI_result = SPI_ERROR_ARGUMENT;
+ return false;
+ }
+
+ if (list_length(plan->plancache_list) != 1)
+ {
+ SPI_result = 0;
+ return false; /* not exactly 1 pre-rewrite command */
+ }
+ plansource = (CachedPlanSource *) linitial(plan->plancache_list);
+
+ /*
+ * We used to force revalidation of the cached plan here, but that seems
+ * unnecessary: invalidation could mean a change in the rowtype of the
+ * tuples returned by a plan, but not whether it returns tuples at all.
+ */
+ SPI_result = 0;
+
+ /* Does it return tuples? */
+ if (plansource->resultDesc)
+ return true;
+
+ return false;
+}
+
+/*
+ * SPI_plan_is_valid --- test whether a SPI plan is currently valid
+ * (that is, not marked as being in need of revalidation).
+ *
+ * See notes for CachedPlanIsValid before using this.
+ */
+bool
+SPI_plan_is_valid(SPIPlanPtr plan)
+{
+ ListCell *lc;
+
+ Assert(plan->magic == _SPI_PLAN_MAGIC);
+
+ foreach(lc, plan->plancache_list)
+ {
+ CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc);
+
+ if (!CachedPlanIsValid(plansource))
+ return false;
+ }
+ return true;
+}
+
+/*
+ * SPI_result_code_string --- convert any SPI return code to a string
+ *
+ * This is often useful in error messages. Most callers will probably
+ * only pass negative (error-case) codes, but for generality we recognize
+ * the success codes too.
+ */
+const char *
+SPI_result_code_string(int code)
+{
+ static char buf[64];
+
+ switch (code)
+ {
+ case SPI_ERROR_CONNECT:
+ return "SPI_ERROR_CONNECT";
+ case SPI_ERROR_COPY:
+ return "SPI_ERROR_COPY";
+ case SPI_ERROR_OPUNKNOWN:
+ return "SPI_ERROR_OPUNKNOWN";
+ case SPI_ERROR_UNCONNECTED:
+ return "SPI_ERROR_UNCONNECTED";
+ case SPI_ERROR_ARGUMENT:
+ return "SPI_ERROR_ARGUMENT";
+ case SPI_ERROR_PARAM:
+ return "SPI_ERROR_PARAM";
+ case SPI_ERROR_TRANSACTION:
+ return "SPI_ERROR_TRANSACTION";
+ case SPI_ERROR_NOATTRIBUTE:
+ return "SPI_ERROR_NOATTRIBUTE";
+ case SPI_ERROR_NOOUTFUNC:
+ return "SPI_ERROR_NOOUTFUNC";
+ case SPI_ERROR_TYPUNKNOWN:
+ return "SPI_ERROR_TYPUNKNOWN";
+ case SPI_ERROR_REL_DUPLICATE:
+ return "SPI_ERROR_REL_DUPLICATE";
+ case SPI_ERROR_REL_NOT_FOUND:
+ return "SPI_ERROR_REL_NOT_FOUND";
+ case SPI_OK_CONNECT:
+ return "SPI_OK_CONNECT";
+ case SPI_OK_FINISH:
+ return "SPI_OK_FINISH";
+ case SPI_OK_FETCH:
+ return "SPI_OK_FETCH";
+ case SPI_OK_UTILITY:
+ return "SPI_OK_UTILITY";
+ case SPI_OK_SELECT:
+ return "SPI_OK_SELECT";
+ case SPI_OK_SELINTO:
+ return "SPI_OK_SELINTO";
+ case SPI_OK_INSERT:
+ return "SPI_OK_INSERT";
+ case SPI_OK_DELETE:
+ return "SPI_OK_DELETE";
+ case SPI_OK_UPDATE:
+ return "SPI_OK_UPDATE";
+ case SPI_OK_CURSOR:
+ return "SPI_OK_CURSOR";
+ case SPI_OK_INSERT_RETURNING:
+ return "SPI_OK_INSERT_RETURNING";
+ case SPI_OK_DELETE_RETURNING:
+ return "SPI_OK_DELETE_RETURNING";
+ case SPI_OK_UPDATE_RETURNING:
+ return "SPI_OK_UPDATE_RETURNING";
+ case SPI_OK_REWRITTEN:
+ return "SPI_OK_REWRITTEN";
+ case SPI_OK_REL_REGISTER:
+ return "SPI_OK_REL_REGISTER";
+ case SPI_OK_REL_UNREGISTER:
+ return "SPI_OK_REL_UNREGISTER";
+ }
+ /* Unrecognized code ... return something useful ... */
+ sprintf(buf, "Unrecognized SPI code %d", code);
+ return buf;
+}
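+
+/*
+ * For example (at a hypothetical call site where sql is a caller-supplied
+ * string), error reports commonly read:
+ *
+ *		int		rc = SPI_execute(sql, false, 0);
+ *
+ *		if (rc < 0)
+ *			elog(ERROR, "SPI_execute failed: %s", SPI_result_code_string(rc));
+ */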
+
+/*
+ * SPI_plan_get_plan_sources --- get a SPI plan's underlying list of
+ * CachedPlanSources.
+ *
+ * This is exported so that PL/pgSQL can use it (this beats letting PL/pgSQL
+ * look directly into the SPIPlan for itself). It's not documented in
+ * spi.sgml because we'd just as soon not have too many places using this.
+ */
+List *
+SPI_plan_get_plan_sources(SPIPlanPtr plan)
+{
+ Assert(plan->magic == _SPI_PLAN_MAGIC);
+ return plan->plancache_list;
+}
+
+/*
+ * SPI_plan_get_cached_plan --- get a SPI plan's generic CachedPlan,
+ * if the SPI plan contains exactly one CachedPlanSource. If not,
+ * return NULL.
+ *
+ * The plan's refcount is incremented (and logged in CurrentResourceOwner,
+ * if it's a saved plan). Caller is responsible for doing ReleaseCachedPlan.
+ *
+ * This is exported so that PL/pgSQL can use it (this beats letting PL/pgSQL
+ * look directly into the SPIPlan for itself). It's not documented in
+ * spi.sgml because we'd just as soon not have too many places using this.
+ */
+CachedPlan *
+SPI_plan_get_cached_plan(SPIPlanPtr plan)
+{
+ CachedPlanSource *plansource;
+ CachedPlan *cplan;
+ SPICallbackArg spicallbackarg;
+ ErrorContextCallback spierrcontext;
+
+ Assert(plan->magic == _SPI_PLAN_MAGIC);
+
+ /* Can't support one-shot plans here */
+ if (plan->oneshot)
+ return NULL;
+
+ /* Must have exactly one CachedPlanSource */
+ if (list_length(plan->plancache_list) != 1)
+ return NULL;
+ plansource = (CachedPlanSource *) linitial(plan->plancache_list);
+
+ /* Setup error traceback support for ereport() */
+ spicallbackarg.query = plansource->query_string;
+ spicallbackarg.mode = plan->parse_mode;
+ spierrcontext.callback = _SPI_error_callback;
+ spierrcontext.arg = &spicallbackarg;
+ spierrcontext.previous = error_context_stack;
+ error_context_stack = &spierrcontext;
+
+ /* Get the generic plan for the query */
+ cplan = GetCachedPlan(plansource, NULL,
+ plan->saved ? CurrentResourceOwner : NULL,
+ _SPI_current->queryEnv);
+ Assert(cplan == plansource->gplan);
+
+ /* Pop the error context stack */
+ error_context_stack = spierrcontext.previous;
+
+ return cplan;
+}
+
+
+/* =================== private functions =================== */
+
+/*
+ * spi_dest_startup
+ * Initialize to receive tuples from Executor into SPITupleTable
+ * of current SPI procedure
+ */
+void
+spi_dest_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
+{
+ SPITupleTable *tuptable;
+ MemoryContext oldcxt;
+ MemoryContext tuptabcxt;
+
+ if (_SPI_current == NULL)
+ elog(ERROR, "spi_dest_startup called while not connected to SPI");
+
+ if (_SPI_current->tuptable != NULL)
+ elog(ERROR, "improper call to spi_dest_startup");
+
+ /* We create the tuple table context as a child of procCxt */
+
+ oldcxt = _SPI_procmem(); /* switch to procedure memory context */
+
+ tuptabcxt = AllocSetContextCreate(CurrentMemoryContext,
+ "SPI TupTable",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextSwitchTo(tuptabcxt);
+
+ _SPI_current->tuptable = tuptable = (SPITupleTable *)
+ palloc0(sizeof(SPITupleTable));
+ tuptable->tuptabcxt = tuptabcxt;
+ tuptable->subid = GetCurrentSubTransactionId();
+
+ /*
+ * The tuptable is now valid enough to be freed by AtEOSubXact_SPI, so put
+ * it onto the SPI context's tuptables list. This will ensure it's not
+ * leaked even in the unlikely event the following few lines fail.
+ */
+ slist_push_head(&_SPI_current->tuptables, &tuptable->next);
+
+ /* set up initial allocations */
+ tuptable->alloced = 128;
+ tuptable->vals = (HeapTuple *) palloc(tuptable->alloced * sizeof(HeapTuple));
+ tuptable->numvals = 0;
+ tuptable->tupdesc = CreateTupleDescCopy(typeinfo);
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * spi_printtup
+ * store tuple retrieved by Executor into SPITupleTable
+ * of current SPI procedure
+ */
+bool
+spi_printtup(TupleTableSlot *slot, DestReceiver *self)
+{
+ SPITupleTable *tuptable;
+ MemoryContext oldcxt;
+
+ if (_SPI_current == NULL)
+ elog(ERROR, "spi_printtup called while not connected to SPI");
+
+ tuptable = _SPI_current->tuptable;
+ if (tuptable == NULL)
+ elog(ERROR, "improper call to spi_printtup");
+
+ oldcxt = MemoryContextSwitchTo(tuptable->tuptabcxt);
+
+ if (tuptable->numvals >= tuptable->alloced)
+ {
+ /* Double the size of the pointer array */
+ uint64 newalloced = tuptable->alloced * 2;
+
+ tuptable->vals = (HeapTuple *) repalloc_huge(tuptable->vals,
+ newalloced * sizeof(HeapTuple));
+ tuptable->alloced = newalloced;
+ }
+
+ tuptable->vals[tuptable->numvals] = ExecCopySlotHeapTuple(slot);
+ (tuptable->numvals)++;
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return true;
+}
+
+/*
+ * Static functions
+ */
+
+/*
+ * Parse and analyze a querystring.
+ *
+ * At entry, plan->argtypes and plan->nargs (or alternatively plan->parserSetup
+ * and plan->parserSetupArg) must be valid, as must plan->parse_mode and
+ * plan->cursor_options.
+ *
+ * Results are stored into *plan (specifically, plan->plancache_list).
+ * Note that the result data is all in CurrentMemoryContext or child contexts
+ * thereof; in practice this means it is in the SPI executor context, and
+ * what we are creating is a "temporary" SPIPlan. Cruft generated during
+ * parsing is also left in CurrentMemoryContext.
+ */
+static void
+_SPI_prepare_plan(const char *src, SPIPlanPtr plan)
+{
+ List *raw_parsetree_list;
+ List *plancache_list;
+ ListCell *list_item;
+ SPICallbackArg spicallbackarg;
+ ErrorContextCallback spierrcontext;
+
+ /*
+ * Setup error traceback support for ereport()
+ */
+ spicallbackarg.query = src;
+ spicallbackarg.mode = plan->parse_mode;
+ spierrcontext.callback = _SPI_error_callback;
+ spierrcontext.arg = &spicallbackarg;
+ spierrcontext.previous = error_context_stack;
+ error_context_stack = &spierrcontext;
+
+ /*
+ * Parse the request string into a list of raw parse trees.
+ */
+ raw_parsetree_list = raw_parser(src, plan->parse_mode);
+
+ /*
+ * Do parse analysis and rule rewrite for each raw parsetree, storing the
+ * results into unsaved plancache entries.
+ */
+ plancache_list = NIL;
+
+ foreach(list_item, raw_parsetree_list)
+ {
+ RawStmt *parsetree = lfirst_node(RawStmt, list_item);
+ List *stmt_list;
+ CachedPlanSource *plansource;
+
+ /*
+ * Create the CachedPlanSource before we do parse analysis, since it
+ * needs to see the unmodified raw parse tree.
+ */
+ plansource = CreateCachedPlan(parsetree,
+ src,
+ CreateCommandTag(parsetree->stmt));
+
+ /*
+ * Parameter datatypes are driven by parserSetup hook if provided,
+ * otherwise we use the fixed parameter list.
+ */
+ if (plan->parserSetup != NULL)
+ {
+ Assert(plan->nargs == 0);
+ stmt_list = pg_analyze_and_rewrite_params(parsetree,
+ src,
+ plan->parserSetup,
+ plan->parserSetupArg,
+ _SPI_current->queryEnv);
+ }
+ else
+ {
+ stmt_list = pg_analyze_and_rewrite(parsetree,
+ src,
+ plan->argtypes,
+ plan->nargs,
+ _SPI_current->queryEnv);
+ }
+
+ /* Finish filling in the CachedPlanSource */
+ CompleteCachedPlan(plansource,
+ stmt_list,
+ NULL,
+ plan->argtypes,
+ plan->nargs,
+ plan->parserSetup,
+ plan->parserSetupArg,
+ plan->cursor_options,
+ false); /* not fixed result */
+
+ plancache_list = lappend(plancache_list, plansource);
+ }
+
+ plan->plancache_list = plancache_list;
+ plan->oneshot = false;
+
+ /*
+ * Pop the error context stack
+ */
+ error_context_stack = spierrcontext.previous;
+}
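
A minimal sketch of the public SPI_prepare()/SPI_execute_plan() path that ends up in this function, assuming a C extension with one int4 parameter; the function name, table, and query are illustrative only:

#include "postgres.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"

static uint64
count_big_rows(int32 threshold)
{
	Oid			argtypes[1] = {INT4OID};
	Datum		values[1];
	SPIPlanPtr	plan;
	uint64		nrows;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	/* Parses and analyzes the query via _SPI_prepare_plan() */
	plan = SPI_prepare("SELECT 1 FROM my_table WHERE x > $1", 1, argtypes);
	if (plan == NULL)
		elog(ERROR, "SPI_prepare failed: %s",
			 SPI_result_code_string(SPI_result));

	values[0] = Int32GetDatum(threshold);
	if (SPI_execute_plan(plan, values, NULL, true, 0) != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute_plan failed");

	nrows = SPI_processed;
	SPI_finish();
	return nrows;
}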
+
+/*
+ * Parse, but don't analyze, a querystring.
+ *
+ * This is a stripped-down version of _SPI_prepare_plan that only does the
+ * initial raw parsing. It creates "one shot" CachedPlanSources
+ * that still require parse analysis before execution is possible.
+ *
+ * The advantage of using the "one shot" form of CachedPlanSource is that
+ * we eliminate data copying and invalidation overhead. Postponing parse
+ * analysis also prevents issues if some of the raw parsetrees are DDL
+ * commands that affect validity of later parsetrees. Both of these
+ * attributes are good things for SPI_execute() and similar cases.
+ *
+ * Results are stored into *plan (specifically, plan->plancache_list).
+ * Note that the result data is all in CurrentMemoryContext or child contexts
+ * thereof; in practice this means it is in the SPI executor context, and
+ * what we are creating is a "temporary" SPIPlan. Cruft generated during
+ * parsing is also left in CurrentMemoryContext.
+ */
+static void
+_SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan)
+{
+ List *raw_parsetree_list;
+ List *plancache_list;
+ ListCell *list_item;
+ SPICallbackArg spicallbackarg;
+ ErrorContextCallback spierrcontext;
+
+ /*
+ * Setup error traceback support for ereport()
+ */
+ spicallbackarg.query = src;
+ spicallbackarg.mode = plan->parse_mode;
+ spierrcontext.callback = _SPI_error_callback;
+ spierrcontext.arg = &spicallbackarg;
+ spierrcontext.previous = error_context_stack;
+ error_context_stack = &spierrcontext;
+
+ /*
+ * Parse the request string into a list of raw parse trees.
+ */
+ raw_parsetree_list = raw_parser(src, plan->parse_mode);
+
+ /*
+ * Construct plancache entries, but don't do parse analysis yet.
+ */
+ plancache_list = NIL;
+
+ foreach(list_item, raw_parsetree_list)
+ {
+ RawStmt *parsetree = lfirst_node(RawStmt, list_item);
+ CachedPlanSource *plansource;
+
+ plansource = CreateOneShotCachedPlan(parsetree,
+ src,
+ CreateCommandTag(parsetree->stmt));
+
+ plancache_list = lappend(plancache_list, plansource);
+ }
+
+ plan->plancache_list = plancache_list;
+ plan->oneshot = true;
+
+ /*
+ * Pop the error context stack
+ */
+ error_context_stack = spierrcontext.previous;
+}
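
Since SPI_execute() is the main consumer of this one-shot path, a hedged sketch of the DDL-then-DML case the comment above describes, assuming it runs inside an already-connected SPI call (the table name is illustrative):

#include "postgres.h"
#include "executor/spi.h"

static void
make_scratch_table(void)
{
	int			ret;

	/*
	 * Both statements are raw-parsed up front, but analyzed and executed one
	 * at a time, so the INSERT sees the table the CREATE just made.
	 */
	ret = SPI_execute("CREATE TEMP TABLE scratch(x int); "
					  "INSERT INTO scratch VALUES (1), (2);",
					  false, 0);
	if (ret < 0)
		elog(ERROR, "SPI_execute failed: %s", SPI_result_code_string(ret));
}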
+
+/*
+ * _SPI_execute_plan: execute the given plan with the given options
+ *
+ * options contains options accessible from outside SPI:
+ * params: parameter values to pass to query
+ * read_only: true for read-only execution (no CommandCounterIncrement)
+ * allow_nonatomic: true to allow nonatomic CALL/DO execution
+ * must_return_tuples: throw error if query doesn't return tuples
+ * tcount: execution tuple-count limit, or 0 for none
+ * dest: DestReceiver to receive output, or NULL for normal SPI output
+ * owner: ResourceOwner that will be used to hold refcount on plan;
+ * if NULL, CurrentResourceOwner is used (ignored for non-saved plan)
+ *
+ * Additional, only-internally-accessible options:
+ * snapshot: query snapshot to use, or InvalidSnapshot for the normal
+ * behavior of taking a new snapshot for each query.
+ * crosscheck_snapshot: for RI use, all others pass InvalidSnapshot
+ * fire_triggers: true to fire AFTER triggers at end of query (normal case);
+ * false means any AFTER triggers are postponed to end of outer query
+ */
+static int
+_SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options,
+ Snapshot snapshot, Snapshot crosscheck_snapshot,
+ bool fire_triggers)
+{
+ int my_res = 0;
+ uint64 my_processed = 0;
+ SPITupleTable *my_tuptable = NULL;
+ int res = 0;
+ bool pushed_active_snap = false;
+ ResourceOwner plan_owner = options->owner;
+ SPICallbackArg spicallbackarg;
+ ErrorContextCallback spierrcontext;
+ CachedPlan *cplan = NULL;
+ ListCell *lc1;
+
+ /*
+ * Setup error traceback support for ereport()
+ */
+ spicallbackarg.query = NULL; /* we'll fill this below */
+ spicallbackarg.mode = plan->parse_mode;
+ spierrcontext.callback = _SPI_error_callback;
+ spierrcontext.arg = &spicallbackarg;
+ spierrcontext.previous = error_context_stack;
+ error_context_stack = &spierrcontext;
+
+ /*
+ * We support four distinct snapshot management behaviors:
+ *
+ * snapshot != InvalidSnapshot, read_only = true: use exactly the given
+ * snapshot.
+ *
+ * snapshot != InvalidSnapshot, read_only = false: use the given snapshot,
+ * modified by advancing its command ID before each querytree.
+ *
+ * snapshot == InvalidSnapshot, read_only = true: use the entry-time
+ * ActiveSnapshot, if any (if there isn't one, we run with no snapshot).
+ *
+ * snapshot == InvalidSnapshot, read_only = false: take a full new
+ * snapshot for each user command, and advance its command ID before each
+ * querytree within the command.
+ *
+ * In the first two cases, we can just push the snap onto the stack once
+ * for the whole plan list.
+ *
+ * Note that snapshot != InvalidSnapshot implies an atomic execution
+ * context.
+ */
+ if (snapshot != InvalidSnapshot)
+ {
+ Assert(!options->allow_nonatomic);
+ if (options->read_only)
+ {
+ PushActiveSnapshot(snapshot);
+ pushed_active_snap = true;
+ }
+ else
+ {
+ /* Make sure we have a private copy of the snapshot to modify */
+ PushCopiedSnapshot(snapshot);
+ pushed_active_snap = true;
+ }
+ }
+
+ /*
+ * Ensure that we have a resource owner if plan is saved, and not if it
+ * isn't.
+ */
+ if (!plan->saved)
+ plan_owner = NULL;
+ else if (plan_owner == NULL)
+ plan_owner = CurrentResourceOwner;
+
+ /*
+ * We interpret must_return_tuples as "there must be at least one query,
+ * and all of them must return tuples". This is a bit laxer than
+ * SPI_is_cursor_plan's check, but there seems no reason to enforce that
+ * there be only one query.
+ */
+ if (options->must_return_tuples && plan->plancache_list == NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("empty query does not return tuples")));
+
+ foreach(lc1, plan->plancache_list)
+ {
+ CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc1);
+ List *stmt_list;
+ ListCell *lc2;
+
+ spicallbackarg.query = plansource->query_string;
+
+ /*
+ * If this is a one-shot plan, we still need to do parse analysis.
+ */
+ if (plan->oneshot)
+ {
+ RawStmt *parsetree = plansource->raw_parse_tree;
+ const char *src = plansource->query_string;
+ List *stmt_list;
+
+ /*
+ * Parameter datatypes are driven by parserSetup hook if provided,
+ * otherwise we use the fixed parameter list.
+ */
+ if (parsetree == NULL)
+ stmt_list = NIL;
+ else if (plan->parserSetup != NULL)
+ {
+ Assert(plan->nargs == 0);
+ stmt_list = pg_analyze_and_rewrite_params(parsetree,
+ src,
+ plan->parserSetup,
+ plan->parserSetupArg,
+ _SPI_current->queryEnv);
+ }
+ else
+ {
+ stmt_list = pg_analyze_and_rewrite(parsetree,
+ src,
+ plan->argtypes,
+ plan->nargs,
+ _SPI_current->queryEnv);
+ }
+
+ /* Finish filling in the CachedPlanSource */
+ CompleteCachedPlan(plansource,
+ stmt_list,
+ NULL,
+ plan->argtypes,
+ plan->nargs,
+ plan->parserSetup,
+ plan->parserSetupArg,
+ plan->cursor_options,
+ false); /* not fixed result */
+ }
+
+ /*
+ * If asked to, complain when query does not return tuples.
+ * (Replanning can't change this, so we can check it before that.
+ * However, we can't check it till after parse analysis, so in the
+ * case of a one-shot plan this is the earliest we could check.)
+ */
+ if (options->must_return_tuples && !plansource->resultDesc)
+ {
+ /* try to give a good error message */
+ const char *cmdtag;
+
+ /* A SELECT without resultDesc must be SELECT INTO */
+ if (plansource->commandTag == CMDTAG_SELECT)
+ cmdtag = "SELECT INTO";
+ else
+ cmdtag = GetCommandTagName(plansource->commandTag);
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ /* translator: %s is name of a SQL command, eg INSERT */
+ errmsg("%s query does not return tuples", cmdtag)));
+ }
+
+ /*
+ * Replan if needed, and increment plan refcount. If it's a saved
+ * plan, the refcount must be backed by the plan_owner.
+ */
+ cplan = GetCachedPlan(plansource, options->params,
+ plan_owner, _SPI_current->queryEnv);
+
+ stmt_list = cplan->stmt_list;
+
+ /*
+ * If we weren't given a specific snapshot to use, and the statement
+ * list requires a snapshot, set that up.
+ */
+ if (snapshot == InvalidSnapshot &&
+ (list_length(stmt_list) > 1 ||
+ (list_length(stmt_list) == 1 &&
+ PlannedStmtRequiresSnapshot(linitial_node(PlannedStmt,
+ stmt_list)))))
+ {
+ /*
+ * First, ensure there's a Portal-level snapshot. This back-fills
+ * the snapshot stack in case the previous operation was a COMMIT
+ * or ROLLBACK inside a procedure or DO block. (We can't put back
+ * the Portal snapshot any sooner, or we'd break cases like doing
+ * SET or LOCK just after COMMIT.) It's enough to check once per
+ * statement list, since COMMIT/ROLLBACK/CALL/DO can't appear
+ * within a multi-statement list.
+ */
+ EnsurePortalSnapshotExists();
+
+ /*
+ * In the default non-read-only case, get a new per-statement-list
+ * snapshot, replacing any that we pushed in a previous cycle.
+ * Skip it when doing non-atomic execution, though (we rely
+ * entirely on the Portal snapshot in that case).
+ */
+ if (!options->read_only && !options->allow_nonatomic)
+ {
+ if (pushed_active_snap)
+ PopActiveSnapshot();
+ PushActiveSnapshot(GetTransactionSnapshot());
+ pushed_active_snap = true;
+ }
+ }
+
+ foreach(lc2, stmt_list)
+ {
+ PlannedStmt *stmt = lfirst_node(PlannedStmt, lc2);
+ bool canSetTag = stmt->canSetTag;
+ DestReceiver *dest;
+
+ /*
+ * Reset output state. (Note that if a non-SPI receiver is used,
+ * _SPI_current->processed will stay zero, and that's what we'll
+ * report to the caller. It's the receiver's job to count tuples
+ * in that case.)
+ */
+ _SPI_current->processed = 0;
+ _SPI_current->tuptable = NULL;
+
+ /* Check for unsupported cases. */
+ if (stmt->utilityStmt)
+ {
+ if (IsA(stmt->utilityStmt, CopyStmt))
+ {
+ CopyStmt *cstmt = (CopyStmt *) stmt->utilityStmt;
+
+ if (cstmt->filename == NULL)
+ {
+ my_res = SPI_ERROR_COPY;
+ goto fail;
+ }
+ }
+ else if (IsA(stmt->utilityStmt, TransactionStmt))
+ {
+ my_res = SPI_ERROR_TRANSACTION;
+ goto fail;
+ }
+ }
+
+ if (options->read_only && !CommandIsReadOnly(stmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ /* translator: %s is a SQL statement name */
+ errmsg("%s is not allowed in a non-volatile function",
+ CreateCommandName((Node *) stmt))));
+
+ /*
+ * If not read-only mode, advance the command counter before each
+ * command and update the snapshot. (But skip it if the snapshot
+ * isn't under our control.)
+ */
+ if (!options->read_only && pushed_active_snap)
+ {
+ CommandCounterIncrement();
+ UpdateActiveSnapshotCommandId();
+ }
+
+ /*
+ * Select appropriate tuple receiver. Output from non-canSetTag
+ * subqueries always goes to the bit bucket.
+ */
+ if (!canSetTag)
+ dest = CreateDestReceiver(DestNone);
+ else if (options->dest)
+ dest = options->dest;
+ else
+ dest = CreateDestReceiver(DestSPI);
+
+ if (stmt->utilityStmt == NULL)
+ {
+ QueryDesc *qdesc;
+ Snapshot snap;
+
+ if (ActiveSnapshotSet())
+ snap = GetActiveSnapshot();
+ else
+ snap = InvalidSnapshot;
+
+ qdesc = CreateQueryDesc(stmt,
+ plansource->query_string,
+ snap, crosscheck_snapshot,
+ dest,
+ options->params,
+ _SPI_current->queryEnv,
+ 0);
+ res = _SPI_pquery(qdesc, fire_triggers,
+ canSetTag ? options->tcount : 0);
+ FreeQueryDesc(qdesc);
+ }
+ else
+ {
+ ProcessUtilityContext context;
+ QueryCompletion qc;
+
+ /*
+ * If the SPI context is atomic, or we were not told to allow
+ * nonatomic operations, tell ProcessUtility this is an atomic
+ * execution context.
+ */
+ if (_SPI_current->atomic || !options->allow_nonatomic)
+ context = PROCESS_UTILITY_QUERY;
+ else
+ context = PROCESS_UTILITY_QUERY_NONATOMIC;
+
+ InitializeQueryCompletion(&qc);
+ ProcessUtility(stmt,
+ plansource->query_string,
+ true, /* protect plancache's node tree */
+ context,
+ options->params,
+ _SPI_current->queryEnv,
+ dest,
+ &qc);
+
+ /* Update "processed" if stmt returned tuples */
+ if (_SPI_current->tuptable)
+ _SPI_current->processed = _SPI_current->tuptable->numvals;
+
+ res = SPI_OK_UTILITY;
+
+ /*
+ * Some utility statements return a row count, even though the
+ * tuples are not returned to the caller.
+ */
+ if (IsA(stmt->utilityStmt, CreateTableAsStmt))
+ {
+ CreateTableAsStmt *ctastmt = (CreateTableAsStmt *) stmt->utilityStmt;
+
+ if (qc.commandTag == CMDTAG_SELECT)
+ _SPI_current->processed = qc.nprocessed;
+ else
+ {
+ /*
+ * Must be an IF NOT EXISTS that did nothing, or a
+ * CREATE ... WITH NO DATA.
+ */
+ Assert(ctastmt->if_not_exists ||
+ ctastmt->into->skipData);
+ _SPI_current->processed = 0;
+ }
+
+ /*
+ * For historical reasons, if CREATE TABLE AS was spelled
+ * as SELECT INTO, return a special return code.
+ */
+ if (ctastmt->is_select_into)
+ res = SPI_OK_SELINTO;
+ }
+ else if (IsA(stmt->utilityStmt, CopyStmt))
+ {
+ Assert(qc.commandTag == CMDTAG_COPY);
+ _SPI_current->processed = qc.nprocessed;
+ }
+ }
+
+ /*
+ * The last canSetTag query sets the status values returned to the
+ * caller. Be careful to free any tuptables not returned, to
+ * avoid intra-transaction memory leak.
+ */
+ if (canSetTag)
+ {
+ my_processed = _SPI_current->processed;
+ SPI_freetuptable(my_tuptable);
+ my_tuptable = _SPI_current->tuptable;
+ my_res = res;
+ }
+ else
+ {
+ SPI_freetuptable(_SPI_current->tuptable);
+ _SPI_current->tuptable = NULL;
+ }
+
+ /*
+ * We don't issue a destroy call to the receiver. The SPI and
+ * None receivers would ignore it anyway, while if the caller
+ * supplied a receiver, it's not our job to destroy it.
+ */
+
+ if (res < 0)
+ {
+ my_res = res;
+ goto fail;
+ }
+ }
+
+ /* Done with this plan, so release refcount */
+ ReleaseCachedPlan(cplan, plan_owner);
+ cplan = NULL;
+
+ /*
+ * If not read-only mode, advance the command counter after the last
+ * command. This ensures that its effects are visible, in case it was
+ * DDL that would affect the next CachedPlanSource.
+ */
+ if (!options->read_only)
+ CommandCounterIncrement();
+ }
+
+fail:
+
+ /* Pop the snapshot off the stack if we pushed one */
+ if (pushed_active_snap)
+ PopActiveSnapshot();
+
+ /* We no longer need the cached plan refcount, if any */
+ if (cplan)
+ ReleaseCachedPlan(cplan, plan_owner);
+
+ /*
+ * Pop the error context stack
+ */
+ error_context_stack = spierrcontext.previous;
+
+ /* Save results for caller */
+ SPI_processed = my_processed;
+ SPI_tuptable = my_tuptable;
+
+ /* tuptable now is caller's responsibility, not SPI's */
+ _SPI_current->tuptable = NULL;
+
+ /*
+ * If none of the queries had canSetTag, return SPI_OK_REWRITTEN. Prior to
+ * 8.4, we used to return the last query's result code (but not its
+ * auxiliary results), which was confusing.
+ */
+ if (my_res == 0)
+ my_res = SPI_OK_REWRITTEN;
+
+ return my_res;
+}
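
Outside callers reach this function through entry points such as SPI_execute_plan_extended(); a hedged sketch using the option fields listed in the header comment above, assuming "plan" is an already-prepared, kept SPIPlanPtr and the option values are illustrative:

#include "postgres.h"
#include "executor/spi.h"

static int
run_kept_plan(SPIPlanPtr plan)
{
	SPIExecuteOptions options;

	memset(&options, 0, sizeof(options));
	options.read_only = true;			/* no CommandCounterIncrement */
	options.must_return_tuples = true;	/* error out unless rows come back */
	options.tcount = 100;				/* stop after 100 tuples */

	return SPI_execute_plan_extended(plan, &options);
}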
+
+/*
+ * Convert arrays of query parameters to form wanted by planner and executor
+ */
+static ParamListInfo
+_SPI_convert_params(int nargs, Oid *argtypes,
+ Datum *Values, const char *Nulls)
+{
+ ParamListInfo paramLI;
+
+ if (nargs > 0)
+ {
+ paramLI = makeParamList(nargs);
+
+ for (int i = 0; i < nargs; i++)
+ {
+ ParamExternData *prm = &paramLI->params[i];
+
+ prm->value = Values[i];
+ prm->isnull = (Nulls && Nulls[i] == 'n');
+ prm->pflags = PARAM_FLAG_CONST;
+ prm->ptype = argtypes[i];
+ }
+ }
+ else
+ paramLI = NULL;
+ return paramLI;
+}
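
A hedged sketch of the Values/Nulls convention converted here, as seen from SPI_execute_with_args(); it assumes an already-connected SPI call, and the table and values are illustrative:

#include "postgres.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"

static void
insert_with_null(void)
{
	Oid			argtypes[2] = {INT4OID, TEXTOID};
	Datum		values[2];

	/*
	 * 'n' marks a NULL parameter, any other character (conventionally ' ')
	 * a non-NULL one; a NULL "nulls" pointer means no NULLs at all.
	 */
	char		nulls[2] = {' ', 'n'};	/* $1 set, $2 passed as NULL */

	values[0] = Int32GetDatum(42);
	values[1] = (Datum) 0;				/* ignored since nulls[1] == 'n' */

	if (SPI_execute_with_args("INSERT INTO t(a, b) VALUES ($1, $2)",
							  2, argtypes, values, nulls,
							  false, 0) != SPI_OK_INSERT)
		elog(ERROR, "SPI_execute_with_args failed");
}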
+
+static int
+_SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount)
+{
+ int operation = queryDesc->operation;
+ int eflags;
+ int res;
+
+ switch (operation)
+ {
+ case CMD_SELECT:
+ if (queryDesc->dest->mydest == DestNone)
+ {
+ /* Don't return SPI_OK_SELECT if we're discarding result */
+ res = SPI_OK_UTILITY;
+ }
+ else
+ res = SPI_OK_SELECT;
+ break;
+ case CMD_INSERT:
+ if (queryDesc->plannedstmt->hasReturning)
+ res = SPI_OK_INSERT_RETURNING;
+ else
+ res = SPI_OK_INSERT;
+ break;
+ case CMD_DELETE:
+ if (queryDesc->plannedstmt->hasReturning)
+ res = SPI_OK_DELETE_RETURNING;
+ else
+ res = SPI_OK_DELETE;
+ break;
+ case CMD_UPDATE:
+ if (queryDesc->plannedstmt->hasReturning)
+ res = SPI_OK_UPDATE_RETURNING;
+ else
+ res = SPI_OK_UPDATE;
+ break;
+ default:
+ return SPI_ERROR_OPUNKNOWN;
+ }
+
+#ifdef SPI_EXECUTOR_STATS
+ if (ShowExecutorStats)
+ ResetUsage();
+#endif
+
+ /* Select execution options */
+ if (fire_triggers)
+ eflags = 0; /* default run-to-completion flags */
+ else
+ eflags = EXEC_FLAG_SKIP_TRIGGERS;
+
+ ExecutorStart(queryDesc, eflags);
+
+ ExecutorRun(queryDesc, ForwardScanDirection, tcount, true);
+
+ _SPI_current->processed = queryDesc->estate->es_processed;
+
+ if ((res == SPI_OK_SELECT || queryDesc->plannedstmt->hasReturning) &&
+ queryDesc->dest->mydest == DestSPI)
+ {
+ if (_SPI_checktuples())
+ elog(ERROR, "consistency check on SPI tuple count failed");
+ }
+
+ ExecutorFinish(queryDesc);
+ ExecutorEnd(queryDesc);
+ /* FreeQueryDesc is done by the caller */
+
+#ifdef SPI_EXECUTOR_STATS
+ if (ShowExecutorStats)
+ ShowUsage("SPI EXECUTOR STATS");
+#endif
+
+ return res;
+}
+
+/*
+ * _SPI_error_callback
+ *
+ * Add context information when a query invoked via SPI fails
+ */
+static void
+_SPI_error_callback(void *arg)
+{
+ SPICallbackArg *carg = (SPICallbackArg *) arg;
+ const char *query = carg->query;
+ int syntaxerrposition;
+
+ if (query == NULL) /* in case arg wasn't set yet */
+ return;
+
+ /*
+ * If there is a syntax error position, convert to internal syntax error;
+ * otherwise treat the query as an item of context stack
+ */
+ syntaxerrposition = geterrposition();
+ if (syntaxerrposition > 0)
+ {
+ errposition(0);
+ internalerrposition(syntaxerrposition);
+ internalerrquery(query);
+ }
+ else
+ {
+ /* Use the parse mode to decide how to describe the query */
+ switch (carg->mode)
+ {
+ case RAW_PARSE_PLPGSQL_EXPR:
+ errcontext("SQL expression \"%s\"", query);
+ break;
+ case RAW_PARSE_PLPGSQL_ASSIGN1:
+ case RAW_PARSE_PLPGSQL_ASSIGN2:
+ case RAW_PARSE_PLPGSQL_ASSIGN3:
+ errcontext("PL/pgSQL assignment \"%s\"", query);
+ break;
+ default:
+ errcontext("SQL statement \"%s\"", query);
+ break;
+ }
+ }
+}
+
+/*
+ * _SPI_cursor_operation()
+ *
+ * Do a FETCH or MOVE in a cursor
+ */
+static void
+_SPI_cursor_operation(Portal portal, FetchDirection direction, long count,
+ DestReceiver *dest)
+{
+ uint64 nfetched;
+
+ /* Check that the portal is valid */
+ if (!PortalIsValid(portal))
+ elog(ERROR, "invalid portal in SPI cursor operation");
+
+ /* Push the SPI stack */
+ if (_SPI_begin_call(true) < 0)
+ elog(ERROR, "SPI cursor operation called while not connected");
+
+ /* Reset the SPI result (note we deliberately don't touch lastoid) */
+ SPI_processed = 0;
+ SPI_tuptable = NULL;
+ _SPI_current->processed = 0;
+ _SPI_current->tuptable = NULL;
+
+ /* Run the cursor */
+ nfetched = PortalRunFetch(portal,
+ direction,
+ count,
+ dest);
+
+ /*
+ * Think not to combine this store with the preceding function call. If
+ * the portal contains calls to functions that use SPI, then _SPI_stack is
+ * likely to move around while the portal runs. When control returns,
+ * _SPI_current will point to the correct stack entry... but the pointer
+ * may be different than it was beforehand. So we must be sure to re-fetch
+ * the pointer after the function call completes.
+ */
+ _SPI_current->processed = nfetched;
+
+ if (dest->mydest == DestSPI && _SPI_checktuples())
+ elog(ERROR, "consistency check on SPI tuple count failed");
+
+ /* Put the result into place for access by caller */
+ SPI_processed = _SPI_current->processed;
+ SPI_tuptable = _SPI_current->tuptable;
+
+ /* tuptable now is caller's responsibility, not SPI's */
+ _SPI_current->tuptable = NULL;
+
+ /* Pop the SPI stack */
+ _SPI_end_call(true);
+}
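
A minimal sketch of a caller driving this function through the public cursor API, assuming "plan" is a prepared, parameterless SPIPlanPtr and the batch size is illustrative:

#include "postgres.h"
#include "executor/spi.h"

/* Assumed to run between SPI_connect() and SPI_finish(). */
static void
scan_in_batches(SPIPlanPtr plan)
{
	Portal		portal = SPI_cursor_open(NULL, plan, NULL, NULL, true);

	for (;;)
	{
		SPI_cursor_fetch(portal, true, 50);
		if (SPI_processed == 0)
			break;

		/* inspect SPI_tuptable->vals[0 .. SPI_processed - 1] here */

		SPI_freetuptable(SPI_tuptable);
	}

	SPI_cursor_close(portal);
}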
+
+
+static MemoryContext
+_SPI_execmem(void)
+{
+ return MemoryContextSwitchTo(_SPI_current->execCxt);
+}
+
+static MemoryContext
+_SPI_procmem(void)
+{
+ return MemoryContextSwitchTo(_SPI_current->procCxt);
+}
+
+/*
+ * _SPI_begin_call: begin a SPI operation within a connected procedure
+ *
+ * use_exec is true if we intend to make use of the procedure's execCxt
+ * during this SPI operation. We'll switch into that context, and arrange
+ * for it to be cleaned up at _SPI_end_call or if an error occurs.
+ */
+static int
+_SPI_begin_call(bool use_exec)
+{
+ if (_SPI_current == NULL)
+ return SPI_ERROR_UNCONNECTED;
+
+ if (use_exec)
+ {
+ /* remember when the Executor operation started */
+ _SPI_current->execSubid = GetCurrentSubTransactionId();
+ /* switch to the Executor memory context */
+ _SPI_execmem();
+ }
+
+ return 0;
+}
+
+/*
+ * _SPI_end_call: end a SPI operation within a connected procedure
+ *
+ * use_exec must be the same as in the previous _SPI_begin_call
+ *
+ * Note: this currently has no failure return cases, so callers don't check
+ */
+static int
+_SPI_end_call(bool use_exec)
+{
+ if (use_exec)
+ {
+ /* switch to the procedure memory context */
+ _SPI_procmem();
+ /* mark Executor context no longer in use */
+ _SPI_current->execSubid = InvalidSubTransactionId;
+ /* and free Executor memory */
+ MemoryContextResetAndDeleteChildren(_SPI_current->execCxt);
+ }
+
+ return 0;
+}
+
+static bool
+_SPI_checktuples(void)
+{
+ uint64 processed = _SPI_current->processed;
+ SPITupleTable *tuptable = _SPI_current->tuptable;
+ bool failed = false;
+
+ if (tuptable == NULL) /* spi_dest_startup was not called */
+ failed = true;
+ else if (processed != tuptable->numvals)
+ failed = true;
+
+ return failed;
+}
+
+/*
+ * Convert a "temporary" SPIPlan into an "unsaved" plan.
+ *
+ * The passed _SPI_plan struct is on the stack, and all its subsidiary data
+ * is in or under the current SPI executor context. Copy the plan into the
+ * SPI procedure context so it will survive _SPI_end_call(). To minimize
+ * data copying, this destructively modifies the input plan, by taking the
+ * plancache entries away from it and reparenting them to the new SPIPlan.
+ */
+static SPIPlanPtr
+_SPI_make_plan_non_temp(SPIPlanPtr plan)
+{
+ SPIPlanPtr newplan;
+ MemoryContext parentcxt = _SPI_current->procCxt;
+ MemoryContext plancxt;
+ MemoryContext oldcxt;
+ ListCell *lc;
+
+ /* Assert the input is a temporary SPIPlan */
+ Assert(plan->magic == _SPI_PLAN_MAGIC);
+ Assert(plan->plancxt == NULL);
+ /* One-shot plans can't be saved */
+ Assert(!plan->oneshot);
+
+ /*
+ * Create a memory context for the plan, underneath the procedure context.
+ * We don't expect the plan to be very large.
+ */
+ plancxt = AllocSetContextCreate(parentcxt,
+ "SPI Plan",
+ ALLOCSET_SMALL_SIZES);
+ oldcxt = MemoryContextSwitchTo(plancxt);
+
+ /* Copy the _SPI_plan struct and subsidiary data into the new context */
+ newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan));
+ newplan->magic = _SPI_PLAN_MAGIC;
+ newplan->plancxt = plancxt;
+ newplan->parse_mode = plan->parse_mode;
+ newplan->cursor_options = plan->cursor_options;
+ newplan->nargs = plan->nargs;
+ if (plan->nargs > 0)
+ {
+ newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid));
+ memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid));
+ }
+ else
+ newplan->argtypes = NULL;
+ newplan->parserSetup = plan->parserSetup;
+ newplan->parserSetupArg = plan->parserSetupArg;
+
+ /*
+ * Reparent all the CachedPlanSources into the procedure context. In
+ * theory this could fail partway through due to the pallocs, but we don't
+ * care too much since both the procedure context and the executor context
+ * would go away on error.
+ */
+ foreach(lc, plan->plancache_list)
+ {
+ CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc);
+
+ CachedPlanSetParentContext(plansource, parentcxt);
+
+ /* Build new list, with list cells in plancxt */
+ newplan->plancache_list = lappend(newplan->plancache_list, plansource);
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+
+ /* For safety, unlink the CachedPlanSources from the temporary plan */
+ plan->plancache_list = NIL;
+
+ return newplan;
+}
+
+/*
+ * Make a "saved" copy of the given plan.
+ */
+static SPIPlanPtr
+_SPI_save_plan(SPIPlanPtr plan)
+{
+ SPIPlanPtr newplan;
+ MemoryContext plancxt;
+ MemoryContext oldcxt;
+ ListCell *lc;
+
+ /* One-shot plans can't be saved */
+ Assert(!plan->oneshot);
+
+ /*
+ * Create a memory context for the plan. We don't expect the plan to be
+ * very large, so use smaller-than-default alloc parameters. It's a
+ * transient context until we finish copying everything.
+ */
+ plancxt = AllocSetContextCreate(CurrentMemoryContext,
+ "SPI Plan",
+ ALLOCSET_SMALL_SIZES);
+ oldcxt = MemoryContextSwitchTo(plancxt);
+
+ /* Copy the SPI plan into its own context */
+ newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan));
+ newplan->magic = _SPI_PLAN_MAGIC;
+ newplan->plancxt = plancxt;
+ newplan->parse_mode = plan->parse_mode;
+ newplan->cursor_options = plan->cursor_options;
+ newplan->nargs = plan->nargs;
+ if (plan->nargs > 0)
+ {
+ newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid));
+ memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid));
+ }
+ else
+ newplan->argtypes = NULL;
+ newplan->parserSetup = plan->parserSetup;
+ newplan->parserSetupArg = plan->parserSetupArg;
+
+ /* Copy all the plancache entries */
+ foreach(lc, plan->plancache_list)
+ {
+ CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc);
+ CachedPlanSource *newsource;
+
+ newsource = CopyCachedPlan(plansource);
+ newplan->plancache_list = lappend(newplan->plancache_list, newsource);
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+
+ /*
+ * Mark it saved, reparent it under CacheMemoryContext, and mark all the
+ * component CachedPlanSources as saved. This sequence cannot fail
+ * partway through, so there's no risk of long-term memory leakage.
+ */
+ newplan->saved = true;
+ MemoryContextSetParent(newplan->plancxt, CacheMemoryContext);
+
+ foreach(lc, newplan->plancache_list)
+ {
+ CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc);
+
+ SaveCachedPlan(plansource);
+ }
+
+ return newplan;
+}
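
For context, this copy-and-save path backs SPI_saveplan(); a hedged sketch of keeping a plan across calls, using the cheaper SPI_keepplan() variant and assuming it runs inside a connected SPI call (query and names illustrative):

#include "postgres.h"
#include "executor/spi.h"

static SPIPlanPtr saved_plan = NULL;

static void
prepare_once(void)
{
	if (saved_plan == NULL)
	{
		SPIPlanPtr	plan = SPI_prepare("SELECT now()", 0, NULL);

		/* SPI_keepplan() returns 0 on success */
		if (plan == NULL || SPI_keepplan(plan) != 0)
			elog(ERROR, "failed to prepare and keep plan");
		saved_plan = plan;
	}
}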
+
+/*
+ * Internal lookup of ephemeral named relation by name.
+ */
+static EphemeralNamedRelation
+_SPI_find_ENR_by_name(const char *name)
+{
+ /* internal static function; any error is a bug in SPI itself */
+ Assert(name != NULL);
+
+ /* fast exit if no tuplestores have been added */
+ if (_SPI_current->queryEnv == NULL)
+ return NULL;
+
+ return get_ENR(_SPI_current->queryEnv, name);
+}
+
+/*
+ * Register an ephemeral named relation for use by the planner and executor on
+ * subsequent calls using this SPI connection.
+ */
+int
+SPI_register_relation(EphemeralNamedRelation enr)
+{
+ EphemeralNamedRelation match;
+ int res;
+
+ if (enr == NULL || enr->md.name == NULL)
+ return SPI_ERROR_ARGUMENT;
+
+ res = _SPI_begin_call(false); /* keep current memory context */
+ if (res < 0)
+ return res;
+
+ match = _SPI_find_ENR_by_name(enr->md.name);
+ if (match)
+ res = SPI_ERROR_REL_DUPLICATE;
+ else
+ {
+ if (_SPI_current->queryEnv == NULL)
+ _SPI_current->queryEnv = create_queryEnv();
+
+ register_ENR(_SPI_current->queryEnv, enr);
+ res = SPI_OK_REL_REGISTER;
+ }
+
+ _SPI_end_call(false);
+
+ return res;
+}
+
+/*
+ * Unregister an ephemeral named relation by name. This will probably be a
+ * rarely used function, since SPI_finish will clear it automatically.
+ */
+int
+SPI_unregister_relation(const char *name)
+{
+ EphemeralNamedRelation match;
+ int res;
+
+ if (name == NULL)
+ return SPI_ERROR_ARGUMENT;
+
+ res = _SPI_begin_call(false); /* keep current memory context */
+ if (res < 0)
+ return res;
+
+ match = _SPI_find_ENR_by_name(name);
+ if (match)
+ {
+ unregister_ENR(_SPI_current->queryEnv, match->md.name);
+ res = SPI_OK_REL_UNREGISTER;
+ }
+ else
+ res = SPI_ERROR_REL_NOT_FOUND;
+
+ _SPI_end_call(false);
+
+ return res;
+}
+
+/*
+ * Register the transient relations from 'tdata' using this SPI connection.
+ * This should be called by PL implementations' trigger handlers after
+ * connecting, in order to make transition tables visible to any queries run
+ * in this connection.
+ */
+int
+SPI_register_trigger_data(TriggerData *tdata)
+{
+ if (tdata == NULL)
+ return SPI_ERROR_ARGUMENT;
+
+ if (tdata->tg_newtable)
+ {
+ EphemeralNamedRelation enr =
+ palloc(sizeof(EphemeralNamedRelationData));
+ int rc;
+
+ enr->md.name = tdata->tg_trigger->tgnewtable;
+ enr->md.reliddesc = tdata->tg_relation->rd_id;
+ enr->md.tupdesc = NULL;
+ enr->md.enrtype = ENR_NAMED_TUPLESTORE;
+ enr->md.enrtuples = tuplestore_tuple_count(tdata->tg_newtable);
+ enr->reldata = tdata->tg_newtable;
+ rc = SPI_register_relation(enr);
+ if (rc != SPI_OK_REL_REGISTER)
+ return rc;
+ }
+
+ if (tdata->tg_oldtable)
+ {
+ EphemeralNamedRelation enr =
+ palloc(sizeof(EphemeralNamedRelationData));
+ int rc;
+
+ enr->md.name = tdata->tg_trigger->tgoldtable;
+ enr->md.reliddesc = tdata->tg_relation->rd_id;
+ enr->md.tupdesc = NULL;
+ enr->md.enrtype = ENR_NAMED_TUPLESTORE;
+ enr->md.enrtuples = tuplestore_tuple_count(tdata->tg_oldtable);
+ enr->reldata = tdata->tg_oldtable;
+ rc = SPI_register_relation(enr);
+ if (rc != SPI_OK_REL_REGISTER)
+ return rc;
+ }
+
+ return SPI_OK_TD_REGISTER;
+}
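
A minimal sketch of the trigger-handler usage the comment above prescribes, assuming an AFTER trigger declared with REFERENCING NEW TABLE AS newtab; the handler and table names are illustrative:

#include "postgres.h"
#include "commands/trigger.h"
#include "executor/spi.h"

static void
handle_after_trigger(TriggerData *trigdata)
{
	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	if (SPI_register_trigger_data(trigdata) != SPI_OK_TD_REGISTER)
		elog(ERROR, "SPI_register_trigger_data failed");

	/* the transition table is now visible under its REFERENCING name */
	if (SPI_execute("SELECT count(*) FROM newtab", true, 0) != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute failed");

	SPI_finish();
}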
diff --git a/src/backend/executor/tqueue.c b/src/backend/executor/tqueue.c
new file mode 100644
index 0000000..7af9fbe
--- /dev/null
+++ b/src/backend/executor/tqueue.c
@@ -0,0 +1,210 @@
+/*-------------------------------------------------------------------------
+ *
+ * tqueue.c
+ * Use shm_mq to send & receive tuples between parallel backends
+ *
+ * A DestReceiver of type DestTupleQueue, which is a TQueueDestReceiver
+ * under the hood, writes tuples from the executor to a shm_mq.
+ *
+ * A TupleQueueReader reads tuples from a shm_mq and returns the tuples.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/tqueue.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/tqueue.h"
+
+/*
+ * DestReceiver object's private contents
+ *
+ * queue is a pointer to data supplied by DestReceiver's caller.
+ */
+typedef struct TQueueDestReceiver
+{
+ DestReceiver pub; /* public fields */
+ shm_mq_handle *queue; /* shm_mq to send to */
+} TQueueDestReceiver;
+
+/*
+ * TupleQueueReader object's private contents
+ *
+ * queue is a pointer to data supplied by reader's caller.
+ *
+ * "typedef struct TupleQueueReader TupleQueueReader" is in tqueue.h
+ */
+struct TupleQueueReader
+{
+ shm_mq_handle *queue; /* shm_mq to receive from */
+};
+
+/*
+ * Receive a tuple from a query, and send it to the designated shm_mq.
+ *
+ * Returns true if successful, false if shm_mq has been detached.
+ */
+static bool
+tqueueReceiveSlot(TupleTableSlot *slot, DestReceiver *self)
+{
+ TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self;
+ MinimalTuple tuple;
+ shm_mq_result result;
+ bool should_free;
+
+ /* Send the tuple itself. */
+ tuple = ExecFetchSlotMinimalTuple(slot, &should_free);
+ result = shm_mq_send(tqueue->queue, tuple->t_len, tuple, false);
+
+ if (should_free)
+ pfree(tuple);
+
+ /* Check for failure. */
+ if (result == SHM_MQ_DETACHED)
+ return false;
+ else if (result != SHM_MQ_SUCCESS)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not send tuple to shared-memory queue")));
+
+ return true;
+}
+
+/*
+ * Prepare to receive tuples from executor.
+ */
+static void
+tqueueStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo)
+{
+ /* do nothing */
+}
+
+/*
+ * Clean up at end of an executor run
+ */
+static void
+tqueueShutdownReceiver(DestReceiver *self)
+{
+ TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self;
+
+ if (tqueue->queue != NULL)
+ shm_mq_detach(tqueue->queue);
+ tqueue->queue = NULL;
+}
+
+/*
+ * Destroy receiver when done with it
+ */
+static void
+tqueueDestroyReceiver(DestReceiver *self)
+{
+ TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self;
+
+ /* We probably already detached from queue, but let's be sure */
+ if (tqueue->queue != NULL)
+ shm_mq_detach(tqueue->queue);
+ pfree(self);
+}
+
+/*
+ * Create a DestReceiver that writes tuples to a tuple queue.
+ */
+DestReceiver *
+CreateTupleQueueDestReceiver(shm_mq_handle *handle)
+{
+ TQueueDestReceiver *self;
+
+ self = (TQueueDestReceiver *) palloc0(sizeof(TQueueDestReceiver));
+
+ self->pub.receiveSlot = tqueueReceiveSlot;
+ self->pub.rStartup = tqueueStartupReceiver;
+ self->pub.rShutdown = tqueueShutdownReceiver;
+ self->pub.rDestroy = tqueueDestroyReceiver;
+ self->pub.mydest = DestTupleQueue;
+ self->queue = handle;
+
+ return (DestReceiver *) self;
+}
+
+/*
+ * Create a tuple queue reader.
+ */
+TupleQueueReader *
+CreateTupleQueueReader(shm_mq_handle *handle)
+{
+ TupleQueueReader *reader = palloc0(sizeof(TupleQueueReader));
+
+ reader->queue = handle;
+
+ return reader;
+}
+
+/*
+ * Destroy a tuple queue reader.
+ *
+ * Note: cleaning up the underlying shm_mq is the caller's responsibility.
+ * We won't access it here, as it may be detached already.
+ */
+void
+DestroyTupleQueueReader(TupleQueueReader *reader)
+{
+ pfree(reader);
+}
+
+/*
+ * Fetch a tuple from a tuple queue reader.
+ *
+ * The return value is NULL if there are no remaining tuples or if
+ * nowait = true and no tuple is ready to return. *done, if not NULL,
+ * is set to true when there are no remaining tuples and otherwise to false.
+ *
+ * The returned tuple, if any, is either in shared memory or a private buffer
+ * and should not be freed. The pointer is invalid after the next call to
+ * TupleQueueReaderNext().
+ *
+ * Even when shm_mq_receive() returns SHM_MQ_WOULD_BLOCK, this can still
+ * accumulate bytes from a partially-read message, so it's useful to call
+ * this with nowait = true even if nothing is returned.
+ */
+MinimalTuple
+TupleQueueReaderNext(TupleQueueReader *reader, bool nowait, bool *done)
+{
+ MinimalTuple tuple;
+ shm_mq_result result;
+ Size nbytes;
+ void *data;
+
+ if (done != NULL)
+ *done = false;
+
+ /* Attempt to read a message. */
+ result = shm_mq_receive(reader->queue, &nbytes, &data, nowait);
+
+ /* If queue is detached, set *done and return NULL. */
+ if (result == SHM_MQ_DETACHED)
+ {
+ if (done != NULL)
+ *done = true;
+ return NULL;
+ }
+
+ /* In non-blocking mode, bail out if no message ready yet. */
+ if (result == SHM_MQ_WOULD_BLOCK)
+ return NULL;
+ Assert(result == SHM_MQ_SUCCESS);
+
+ /*
+ * Return a pointer to the queue memory directly (which had better be
+ * sufficiently aligned).
+ */
+ tuple = (MinimalTuple) data;
+ Assert(tuple->t_len == nbytes);
+
+ return tuple;
+}
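
A hedged sketch of the leader-side read loop for this queue, assuming "mqh" is a shm_mq_handle already attached to the worker's queue (queue setup not shown):

#include "postgres.h"
#include "executor/tqueue.h"

static void
drain_queue(shm_mq_handle *mqh)
{
	TupleQueueReader *reader = CreateTupleQueueReader(mqh);
	bool		done = false;

	while (!done)
	{
		/* nowait = false: block until a tuple arrives or the queue detaches */
		MinimalTuple tuple = TupleQueueReaderNext(reader, false, &done);

		if (tuple != NULL)
		{
			/* consume the tuple; it is only valid until the next call */
		}
	}

	DestroyTupleQueueReader(reader);
}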
diff --git a/src/backend/executor/tstoreReceiver.c b/src/backend/executor/tstoreReceiver.c
new file mode 100644
index 0000000..e07664f
--- /dev/null
+++ b/src/backend/executor/tstoreReceiver.c
@@ -0,0 +1,283 @@
+/*-------------------------------------------------------------------------
+ *
+ * tstoreReceiver.c
+ * An implementation of DestReceiver that stores the result tuples in
+ * a Tuplestore.
+ *
+ * Optionally, we can force detoasting (but not decompression) of out-of-line
+ * toasted values. This is to support cursors WITH HOLD, which must retain
+ * data even if the underlying table is dropped.
+ *
+ * Also optionally, we can apply a tuple conversion map before storing.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/tstoreReceiver.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/detoast.h"
+#include "access/tupconvert.h"
+#include "executor/tstoreReceiver.h"
+
+
+typedef struct
+{
+ DestReceiver pub;
+ /* parameters: */
+ Tuplestorestate *tstore; /* where to put the data */
+ MemoryContext cxt; /* context containing tstore */
+ bool detoast; /* were we told to detoast? */
+ TupleDesc target_tupdesc; /* target tupdesc, or NULL if none */
+ const char *map_failure_msg; /* tupdesc mapping failure message */
+ /* workspace: */
+ Datum *outvalues; /* values array for result tuple */
+ Datum *tofree; /* temp values to be pfree'd */
+ TupleConversionMap *tupmap; /* conversion map, if needed */
+ TupleTableSlot *mapslot; /* slot for mapped tuples */
+} TStoreState;
+
+
+static bool tstoreReceiveSlot_notoast(TupleTableSlot *slot, DestReceiver *self);
+static bool tstoreReceiveSlot_detoast(TupleTableSlot *slot, DestReceiver *self);
+static bool tstoreReceiveSlot_tupmap(TupleTableSlot *slot, DestReceiver *self);
+
+
+/*
+ * Prepare to receive tuples from executor.
+ */
+static void
+tstoreStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo)
+{
+ TStoreState *myState = (TStoreState *) self;
+ bool needtoast = false;
+ int natts = typeinfo->natts;
+ int i;
+
+ /* Check if any columns require detoast work */
+ if (myState->detoast)
+ {
+ for (i = 0; i < natts; i++)
+ {
+ Form_pg_attribute attr = TupleDescAttr(typeinfo, i);
+
+ if (attr->attisdropped)
+ continue;
+ if (attr->attlen == -1)
+ {
+ needtoast = true;
+ break;
+ }
+ }
+ }
+
+ /* Check if tuple conversion is needed */
+ if (myState->target_tupdesc)
+ myState->tupmap = convert_tuples_by_position(typeinfo,
+ myState->target_tupdesc,
+ myState->map_failure_msg);
+ else
+ myState->tupmap = NULL;
+
+ /* Set up appropriate callback */
+ if (needtoast)
+ {
+ Assert(!myState->tupmap);
+ myState->pub.receiveSlot = tstoreReceiveSlot_detoast;
+ /* Create workspace */
+ myState->outvalues = (Datum *)
+ MemoryContextAlloc(myState->cxt, natts * sizeof(Datum));
+ myState->tofree = (Datum *)
+ MemoryContextAlloc(myState->cxt, natts * sizeof(Datum));
+ myState->mapslot = NULL;
+ }
+ else if (myState->tupmap)
+ {
+ myState->pub.receiveSlot = tstoreReceiveSlot_tupmap;
+ myState->outvalues = NULL;
+ myState->tofree = NULL;
+ myState->mapslot = MakeSingleTupleTableSlot(myState->target_tupdesc,
+ &TTSOpsVirtual);
+ }
+ else
+ {
+ myState->pub.receiveSlot = tstoreReceiveSlot_notoast;
+ myState->outvalues = NULL;
+ myState->tofree = NULL;
+ myState->mapslot = NULL;
+ }
+}
+
+/*
+ * Receive a tuple from the executor and store it in the tuplestore.
+ * This is for the easy case where we don't have to detoast or map anything.
+ */
+static bool
+tstoreReceiveSlot_notoast(TupleTableSlot *slot, DestReceiver *self)
+{
+ TStoreState *myState = (TStoreState *) self;
+
+ tuplestore_puttupleslot(myState->tstore, slot);
+
+ return true;
+}
+
+/*
+ * Receive a tuple from the executor and store it in the tuplestore.
+ * This is for the case where we have to detoast any toasted values.
+ */
+static bool
+tstoreReceiveSlot_detoast(TupleTableSlot *slot, DestReceiver *self)
+{
+ TStoreState *myState = (TStoreState *) self;
+ TupleDesc typeinfo = slot->tts_tupleDescriptor;
+ int natts = typeinfo->natts;
+ int nfree;
+ int i;
+ MemoryContext oldcxt;
+
+ /* Make sure the tuple is fully deconstructed */
+ slot_getallattrs(slot);
+
+ /*
+ * Fetch back any out-of-line datums. We build the new datums array in
+ * myState->outvalues[] (but we can re-use the slot's isnull array). Also,
+ * remember the fetched values to free afterwards.
+ */
+ nfree = 0;
+ for (i = 0; i < natts; i++)
+ {
+ Datum val = slot->tts_values[i];
+ Form_pg_attribute attr = TupleDescAttr(typeinfo, i);
+
+ if (!attr->attisdropped && attr->attlen == -1 && !slot->tts_isnull[i])
+ {
+ if (VARATT_IS_EXTERNAL(DatumGetPointer(val)))
+ {
+ val = PointerGetDatum(detoast_external_attr((struct varlena *)
+ DatumGetPointer(val)));
+ myState->tofree[nfree++] = val;
+ }
+ }
+
+ myState->outvalues[i] = val;
+ }
+
+ /*
+ * Push the modified tuple into the tuplestore.
+ */
+ oldcxt = MemoryContextSwitchTo(myState->cxt);
+ tuplestore_putvalues(myState->tstore, typeinfo,
+ myState->outvalues, slot->tts_isnull);
+ MemoryContextSwitchTo(oldcxt);
+
+ /* And release any temporary detoasted values */
+ for (i = 0; i < nfree; i++)
+ pfree(DatumGetPointer(myState->tofree[i]));
+
+ return true;
+}
+
+/*
+ * Receive a tuple from the executor and store it in the tuplestore.
+ * This is for the case where we must apply a tuple conversion map.
+ */
+static bool
+tstoreReceiveSlot_tupmap(TupleTableSlot *slot, DestReceiver *self)
+{
+ TStoreState *myState = (TStoreState *) self;
+
+ execute_attr_map_slot(myState->tupmap->attrMap, slot, myState->mapslot);
+ tuplestore_puttupleslot(myState->tstore, myState->mapslot);
+
+ return true;
+}
+
+/*
+ * Clean up at end of an executor run
+ */
+static void
+tstoreShutdownReceiver(DestReceiver *self)
+{
+ TStoreState *myState = (TStoreState *) self;
+
+ /* Release workspace if any */
+ if (myState->outvalues)
+ pfree(myState->outvalues);
+ myState->outvalues = NULL;
+ if (myState->tofree)
+ pfree(myState->tofree);
+ myState->tofree = NULL;
+ if (myState->tupmap)
+ free_conversion_map(myState->tupmap);
+ myState->tupmap = NULL;
+ if (myState->mapslot)
+ ExecDropSingleTupleTableSlot(myState->mapslot);
+ myState->mapslot = NULL;
+}
+
+/*
+ * Destroy receiver when done with it
+ */
+static void
+tstoreDestroyReceiver(DestReceiver *self)
+{
+ pfree(self);
+}
+
+/*
+ * Initially create a DestReceiver object.
+ */
+DestReceiver *
+CreateTuplestoreDestReceiver(void)
+{
+ TStoreState *self = (TStoreState *) palloc0(sizeof(TStoreState));
+
+ self->pub.receiveSlot = tstoreReceiveSlot_notoast; /* might change */
+ self->pub.rStartup = tstoreStartupReceiver;
+ self->pub.rShutdown = tstoreShutdownReceiver;
+ self->pub.rDestroy = tstoreDestroyReceiver;
+ self->pub.mydest = DestTuplestore;
+
+ /* private fields will be set by SetTuplestoreDestReceiverParams */
+
+ return (DestReceiver *) self;
+}
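
A hedged sketch of typical usage, pairing this receiver with a tuplestore via the setter defined just below; the work_mem budget and the detoast choice (as for WITH HOLD cursors) are illustrative:

#include "postgres.h"
#include "executor/tstoreReceiver.h"
#include "miscadmin.h"
#include "utils/tuplestore.h"

static DestReceiver *
make_holdable_receiver(MemoryContext cxt)
{
	MemoryContext oldcxt;
	Tuplestorestate *store;
	DestReceiver *dest;

	/* create the tuplestore in the context that is passed as tContext */
	oldcxt = MemoryContextSwitchTo(cxt);
	store = tuplestore_begin_heap(true, false, work_mem);
	MemoryContextSwitchTo(oldcxt);

	dest = CreateTuplestoreDestReceiver();
	SetTuplestoreDestReceiverParams(dest, store, cxt,
									true,	/* force detoast */
									NULL,	/* no rowtype conversion */
									NULL);	/* hence no failure message */
	return dest;
}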
+
+/*
+ * Set parameters for a TuplestoreDestReceiver
+ *
+ * tStore: where to store the tuples
+ * tContext: memory context containing tStore
+ * detoast: forcibly detoast contained data?
+ * target_tupdesc: if not NULL, forcibly convert tuples to this rowtype
+ * map_failure_msg: error message to use if mapping to target_tupdesc fails
+ *
+ * We don't currently support both detoast and target_tupdesc at the same
+ * time, just because no existing caller needs that combination.
+ */
+void
+SetTuplestoreDestReceiverParams(DestReceiver *self,
+ Tuplestorestate *tStore,
+ MemoryContext tContext,
+ bool detoast,
+ TupleDesc target_tupdesc,
+ const char *map_failure_msg)
+{
+ TStoreState *myState = (TStoreState *) self;
+
+ Assert(!(detoast && target_tupdesc));
+
+ Assert(myState->pub.mydest == DestTuplestore);
+ myState->tstore = tStore;
+ myState->cxt = tContext;
+ myState->detoast = detoast;
+ myState->target_tupdesc = target_tupdesc;
+ myState->map_failure_msg = map_failure_msg;
+}