From 46651ce6fe013220ed397add242004d764fc0153 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 14:15:05 +0200 Subject: Adding upstream version 14.5. Signed-off-by: Daniel Baumann --- src/backend/executor/Makefile | 82 + src/backend/executor/README | 405 ++ src/backend/executor/execAmi.c | 662 ++++ src/backend/executor/execAsync.c | 154 + src/backend/executor/execCurrent.c | 426 +++ src/backend/executor/execExpr.c | 3965 +++++++++++++++++++ src/backend/executor/execExprInterp.c | 4373 +++++++++++++++++++++ src/backend/executor/execGrouping.c | 560 +++ src/backend/executor/execIndexing.c | 921 +++++ src/backend/executor/execJunk.c | 304 ++ src/backend/executor/execMain.c | 2886 ++++++++++++++ src/backend/executor/execParallel.c | 1498 ++++++++ src/backend/executor/execPartition.c | 2107 +++++++++++ src/backend/executor/execProcnode.c | 981 +++++ src/backend/executor/execReplication.c | 629 +++ src/backend/executor/execSRF.c | 980 +++++ src/backend/executor/execScan.c | 342 ++ src/backend/executor/execTuples.c | 2339 ++++++++++++ src/backend/executor/execUtils.c | 1351 +++++++ src/backend/executor/functions.c | 2103 +++++++++++ src/backend/executor/instrument.c | 279 ++ src/backend/executor/nodeAgg.c | 4829 ++++++++++++++++++++++++ src/backend/executor/nodeAppend.c | 1186 ++++++ src/backend/executor/nodeBitmapAnd.c | 223 ++ src/backend/executor/nodeBitmapHeapscan.c | 954 +++++ src/backend/executor/nodeBitmapIndexscan.c | 330 ++ src/backend/executor/nodeBitmapOr.c | 241 ++ src/backend/executor/nodeCtescan.c | 351 ++ src/backend/executor/nodeCustom.c | 228 ++ src/backend/executor/nodeForeignscan.c | 504 +++ src/backend/executor/nodeFunctionscan.c | 620 +++ src/backend/executor/nodeGather.c | 477 +++ src/backend/executor/nodeGatherMerge.c | 789 ++++ src/backend/executor/nodeGroup.c | 255 ++ src/backend/executor/nodeHash.c | 3434 +++++++++++++++++ src/backend/executor/nodeHashjoin.c | 1551 ++++++++ src/backend/executor/nodeIncrementalSort.c | 1257 ++++++ src/backend/executor/nodeIndexonlyscan.c | 735 ++++ src/backend/executor/nodeIndexscan.c | 1747 +++++++++ src/backend/executor/nodeLimit.c | 558 +++ src/backend/executor/nodeLockRows.c | 403 ++ src/backend/executor/nodeMaterial.c | 368 ++ src/backend/executor/nodeMemoize.c | 1225 ++++++ src/backend/executor/nodeMergeAppend.c | 389 ++ src/backend/executor/nodeMergejoin.c | 1678 ++++++++ src/backend/executor/nodeModifyTable.c | 3243 ++++++++++++++++ src/backend/executor/nodeNamedtuplestorescan.c | 201 + src/backend/executor/nodeNestloop.c | 411 ++ src/backend/executor/nodeProjectSet.c | 351 ++ src/backend/executor/nodeRecursiveunion.c | 331 ++ src/backend/executor/nodeResult.c | 272 ++ src/backend/executor/nodeSamplescan.c | 378 ++ src/backend/executor/nodeSeqscan.c | 314 ++ src/backend/executor/nodeSetOp.c | 651 ++++ src/backend/executor/nodeSort.c | 430 +++ src/backend/executor/nodeSubplan.c | 1313 +++++++ src/backend/executor/nodeSubqueryscan.c | 213 ++ src/backend/executor/nodeTableFuncscan.c | 523 +++ src/backend/executor/nodeTidrangescan.c | 413 ++ src/backend/executor/nodeTidscan.c | 558 +++ src/backend/executor/nodeUnique.c | 192 + src/backend/executor/nodeValuesscan.c | 361 ++ src/backend/executor/nodeWindowAgg.c | 3463 +++++++++++++++++ src/backend/executor/nodeWorktablescan.c | 223 ++ src/backend/executor/spi.c | 3383 +++++++++++++++++ src/backend/executor/tqueue.c | 210 ++ src/backend/executor/tstoreReceiver.c | 283 ++ 67 files changed, 69396 insertions(+) create mode 100644 src/backend/executor/Makefile create mode 
100644 src/backend/executor/README create mode 100644 src/backend/executor/execAmi.c create mode 100644 src/backend/executor/execAsync.c create mode 100644 src/backend/executor/execCurrent.c create mode 100644 src/backend/executor/execExpr.c create mode 100644 src/backend/executor/execExprInterp.c create mode 100644 src/backend/executor/execGrouping.c create mode 100644 src/backend/executor/execIndexing.c create mode 100644 src/backend/executor/execJunk.c create mode 100644 src/backend/executor/execMain.c create mode 100644 src/backend/executor/execParallel.c create mode 100644 src/backend/executor/execPartition.c create mode 100644 src/backend/executor/execProcnode.c create mode 100644 src/backend/executor/execReplication.c create mode 100644 src/backend/executor/execSRF.c create mode 100644 src/backend/executor/execScan.c create mode 100644 src/backend/executor/execTuples.c create mode 100644 src/backend/executor/execUtils.c create mode 100644 src/backend/executor/functions.c create mode 100644 src/backend/executor/instrument.c create mode 100644 src/backend/executor/nodeAgg.c create mode 100644 src/backend/executor/nodeAppend.c create mode 100644 src/backend/executor/nodeBitmapAnd.c create mode 100644 src/backend/executor/nodeBitmapHeapscan.c create mode 100644 src/backend/executor/nodeBitmapIndexscan.c create mode 100644 src/backend/executor/nodeBitmapOr.c create mode 100644 src/backend/executor/nodeCtescan.c create mode 100644 src/backend/executor/nodeCustom.c create mode 100644 src/backend/executor/nodeForeignscan.c create mode 100644 src/backend/executor/nodeFunctionscan.c create mode 100644 src/backend/executor/nodeGather.c create mode 100644 src/backend/executor/nodeGatherMerge.c create mode 100644 src/backend/executor/nodeGroup.c create mode 100644 src/backend/executor/nodeHash.c create mode 100644 src/backend/executor/nodeHashjoin.c create mode 100644 src/backend/executor/nodeIncrementalSort.c create mode 100644 src/backend/executor/nodeIndexonlyscan.c create mode 100644 src/backend/executor/nodeIndexscan.c create mode 100644 src/backend/executor/nodeLimit.c create mode 100644 src/backend/executor/nodeLockRows.c create mode 100644 src/backend/executor/nodeMaterial.c create mode 100644 src/backend/executor/nodeMemoize.c create mode 100644 src/backend/executor/nodeMergeAppend.c create mode 100644 src/backend/executor/nodeMergejoin.c create mode 100644 src/backend/executor/nodeModifyTable.c create mode 100644 src/backend/executor/nodeNamedtuplestorescan.c create mode 100644 src/backend/executor/nodeNestloop.c create mode 100644 src/backend/executor/nodeProjectSet.c create mode 100644 src/backend/executor/nodeRecursiveunion.c create mode 100644 src/backend/executor/nodeResult.c create mode 100644 src/backend/executor/nodeSamplescan.c create mode 100644 src/backend/executor/nodeSeqscan.c create mode 100644 src/backend/executor/nodeSetOp.c create mode 100644 src/backend/executor/nodeSort.c create mode 100644 src/backend/executor/nodeSubplan.c create mode 100644 src/backend/executor/nodeSubqueryscan.c create mode 100644 src/backend/executor/nodeTableFuncscan.c create mode 100644 src/backend/executor/nodeTidrangescan.c create mode 100644 src/backend/executor/nodeTidscan.c create mode 100644 src/backend/executor/nodeUnique.c create mode 100644 src/backend/executor/nodeValuesscan.c create mode 100644 src/backend/executor/nodeWindowAgg.c create mode 100644 src/backend/executor/nodeWorktablescan.c create mode 100644 src/backend/executor/spi.c create mode 100644 src/backend/executor/tqueue.c 
create mode 100644 src/backend/executor/tstoreReceiver.c (limited to 'src/backend/executor') diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile new file mode 100644 index 0000000..11118d0 --- /dev/null +++ b/src/backend/executor/Makefile @@ -0,0 +1,82 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for executor +# +# IDENTIFICATION +# src/backend/executor/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/executor +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + execAmi.o \ + execAsync.o \ + execCurrent.o \ + execExpr.o \ + execExprInterp.o \ + execGrouping.o \ + execIndexing.o \ + execJunk.o \ + execMain.o \ + execParallel.o \ + execPartition.o \ + execProcnode.o \ + execReplication.o \ + execSRF.o \ + execScan.o \ + execTuples.o \ + execUtils.o \ + functions.o \ + instrument.o \ + nodeAgg.o \ + nodeAppend.o \ + nodeBitmapAnd.o \ + nodeBitmapHeapscan.o \ + nodeBitmapIndexscan.o \ + nodeBitmapOr.o \ + nodeCtescan.o \ + nodeCustom.o \ + nodeForeignscan.o \ + nodeFunctionscan.o \ + nodeGather.o \ + nodeGatherMerge.o \ + nodeGroup.o \ + nodeHash.o \ + nodeHashjoin.o \ + nodeIncrementalSort.o \ + nodeIndexonlyscan.o \ + nodeIndexscan.o \ + nodeLimit.o \ + nodeLockRows.o \ + nodeMaterial.o \ + nodeMemoize.o \ + nodeMergeAppend.o \ + nodeMergejoin.o \ + nodeModifyTable.o \ + nodeNamedtuplestorescan.o \ + nodeNestloop.o \ + nodeProjectSet.o \ + nodeRecursiveunion.o \ + nodeResult.o \ + nodeSamplescan.o \ + nodeSeqscan.o \ + nodeSetOp.o \ + nodeSort.o \ + nodeSubplan.o \ + nodeSubqueryscan.o \ + nodeTableFuncscan.o \ + nodeTidrangescan.o \ + nodeTidscan.o \ + nodeUnique.o \ + nodeValuesscan.o \ + nodeWindowAgg.o \ + nodeWorktablescan.o \ + spi.o \ + tqueue.o \ + tstoreReceiver.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/executor/README b/src/backend/executor/README new file mode 100644 index 0000000..bf5e708 --- /dev/null +++ b/src/backend/executor/README @@ -0,0 +1,405 @@ +src/backend/executor/README + +The Postgres Executor +===================== + +The executor processes a tree of "plan nodes". The plan tree is essentially +a demand-pull pipeline of tuple processing operations. Each node, when +called, will produce the next tuple in its output sequence, or NULL if no +more tuples are available. If the node is not a primitive relation-scanning +node, it will have child node(s) that it calls in turn to obtain input +tuples. + +Refinements on this basic model include: + +* Choice of scan direction (forwards or backwards). Caution: this is not +currently well-supported. It works for primitive scan nodes, but not very +well for joins, aggregates, etc. + +* Rescan command to reset a node and make it generate its output sequence +over again. + +* Parameters that can alter a node's results. After adjusting a parameter, +the rescan command must be applied to that node and all nodes above it. +There is a moderately intelligent scheme to avoid rescanning nodes +unnecessarily (for example, Sort does not rescan its input if no parameters +of the input have changed, since it can just reread its stored sorted data). + +For a SELECT, it is only necessary to deliver the top-level result tuples +to the client. For INSERT/UPDATE/DELETE, the actual table modification +operations happen in a top-level ModifyTable plan node. 
If the query +includes a RETURNING clause, the ModifyTable node delivers the computed +RETURNING rows as output, otherwise it returns nothing. Handling INSERT +is pretty straightforward: the tuples returned from the plan tree below +ModifyTable are inserted into the correct result relation. For UPDATE, +the plan tree returns the new values of the updated columns, plus "junk" +(hidden) column(s) identifying which table row is to be updated. The +ModifyTable node must fetch that row to extract values for the unchanged +columns, combine the values into a new row, and apply the update. (For a +heap table, the row-identity junk column is a CTID, but other things may +be used for other table types.) For DELETE, the plan tree need only deliver +junk row-identity column(s), and the ModifyTable node visits each of those +rows and marks the row deleted. + +XXX a great deal more documentation needs to be written here... + + +Plan Trees and State Trees +-------------------------- + +The plan tree delivered by the planner contains a tree of Plan nodes (struct +types derived from struct Plan). During executor startup we build a parallel +tree of identical structure containing executor state nodes --- generally, +every plan node type has a corresponding executor state node type. Each node +in the state tree has a pointer to its corresponding node in the plan tree, +plus executor state data as needed to implement that node type. This +arrangement allows the plan tree to be completely read-only so far as the +executor is concerned: all data that is modified during execution is in the +state tree. Read-only plan trees make life much simpler for plan caching and +reuse. + +A corresponding executor state node may not be created during executor startup +if the executor determines that an entire subplan is not required due to +execution time partition pruning determining that no matching records will be +found there. This currently only occurs for Append and MergeAppend nodes. In +this case the non-required subplans are ignored and the executor state's +subnode array will become out of sequence to the plan's subplan list. + +Each Plan node may have expression trees associated with it, to represent +its target list, qualification conditions, etc. These trees are also +read-only to the executor, but the executor state for expression evaluation +does not mirror the Plan expression's tree shape, as explained below. +Rather, there's just one ExprState node per expression tree, although this +may have sub-nodes for some complex expression node types. + +Altogether there are four classes of nodes used in these trees: Plan nodes, +their corresponding PlanState nodes, Expr nodes, and ExprState nodes. +(Actually, there are also List nodes, which are used as "glue" in all +three tree-based representations.) + + +Expression Trees and ExprState nodes +------------------------------------ + +Expression trees, in contrast to Plan trees, are not mirrored into a +corresponding tree of state nodes. Instead each separately executable +expression tree (e.g. a Plan's qual or targetlist) is represented by one +ExprState node. The ExprState node contains the information needed to +evaluate the expression in a compact, linear form. That compact form is +stored as a flat array in ExprState->steps[] (an array of ExprEvalStep, +not ExprEvalStep *). 
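+
+For illustration only (this sketch is not part of the sources), the flat
+program stored in a compiled ExprState can be inspected by walking the
+steps[] array; ExecEvalStepOp() recovers each step's logical opcode even
+after the interpreter has replaced opcodes with computed-goto addresses:
+
+    #include "postgres.h"
+    #include "executor/execExpr.h"      /* ExprEvalStep, ExecEvalStepOp() */
+    #include "nodes/execnodes.h"        /* ExprState */
+
+    /* debugging-style sketch; assumes it runs inside the backend */
+    static void
+    print_expr_program(ExprState *state)
+    {
+        for (int i = 0; i < state->steps_len; i++)
+        {
+            ExprEvalStep *step = &state->steps[i];
+
+            elog(DEBUG1, "step %d: opcode %d",
+                 i, (int) ExecEvalStepOp(state, step));
+        }
+    }
+
+For a simple qual or arithmetic expression this typically shows a
+tuple-deforming step, a couple of variable-fetch steps, a function-call
+step, and a final EEOP_DONE step.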
+ +The reasons for choosing such a representation include: +- commonly the amount of work needed to evaluate one Expr-type node is + small enough that the overhead of having to perform a tree-walk + during evaluation is significant. +- the flat representation can be evaluated non-recursively within a single + function, reducing stack depth and function call overhead. +- such a representation is usable both for fast interpreted execution, + and for compiling into native code. + +The Plan-tree representation of an expression is compiled into an +ExprState node by ExecInitExpr(). As much complexity as possible should +be handled by ExecInitExpr() (and helpers), instead of execution time +where both interpreted and compiled versions would need to deal with the +complexity. Besides duplicating effort between execution approaches, +runtime initialization checks also have a small but noticeable cost every +time the expression is evaluated. Therefore, we allow ExecInitExpr() to +precompute information that we do not expect to vary across execution of a +single query, for example the set of CHECK constraint expressions to be +applied to a domain type. This could not be done at plan time without +greatly increasing the number of events that require plan invalidation. +(Previously, some information of this kind was rechecked on each +expression evaluation, but that seems like unnecessary overhead.) + + +Expression Initialization +------------------------- + +During ExecInitExpr() and similar routines, Expr trees are converted +into the flat representation. Each Expr node might be represented by +zero, one, or more ExprEvalSteps. + +Each ExprEvalStep's work is determined by its opcode (of enum ExprEvalOp) +and it stores the result of its work into the Datum variable and boolean +null flag variable pointed to by ExprEvalStep->resvalue/resnull. +Complex expressions are performed by chaining together several steps. +For example, "a + b" (one OpExpr, with two Var expressions) would be +represented as two steps to fetch the Var values, and one step for the +evaluation of the function underlying the + operator. The steps for the +Vars would have their resvalue/resnull pointing directly to the appropriate +args[].value .isnull elements in the FunctionCallInfoBaseData struct that +is used by the function evaluation step, thus avoiding extra work to copy +the result values around. + +The last entry in a completed ExprState->steps array is always an +EEOP_DONE step; this removes the need to test for end-of-array while +iterating. Also, if the expression contains any variable references (to +user columns of the ExprContext's INNER, OUTER, or SCAN tuples), the steps +array begins with EEOP_*_FETCHSOME steps that ensure that the relevant +tuples have been deconstructed to make the required columns directly +available (cf. slot_getsomeattrs()). This allows individual Var-fetching +steps to be little more than an array lookup. + +Most of ExecInitExpr()'s work is done by the recursive function +ExecInitExprRec() and its subroutines. ExecInitExprRec() maps one Expr +node into the steps required for execution, recursing as needed for +sub-expressions. + +Each ExecInitExprRec() call has to specify where that subexpression's +results are to be stored (via the resv/resnull parameters). 
This allows +the above scenario of evaluating a (sub-)expression directly into +fcinfo->args[].value/isnull, but also requires some care: target Datum/isnull +variables may not be shared with another ExecInitExprRec() unless the +results are only needed by steps executing before further usages of those +target Datum/isnull variables. Due to the non-recursiveness of the +ExprEvalStep representation that's usually easy to guarantee. + +ExecInitExprRec() pushes new operations into the ExprState->steps array +using ExprEvalPushStep(). To keep the steps as a consecutively laid out +array, ExprEvalPushStep() has to repalloc the entire array when there's +not enough space. Because of that it is *not* allowed to point directly +into any of the steps during expression initialization. Therefore, the +resv/resnull for a subexpression usually point to some storage that is +palloc'd separately from the steps array. For instance, the +FunctionCallInfoBaseData for a function call step is separately allocated +rather than being part of the ExprEvalStep array. The overall result +of a complete expression is typically returned into the resvalue/resnull +fields of the ExprState node itself. + +Some steps, e.g. boolean expressions, allow skipping evaluation of +certain subexpressions. In the flat representation this amounts to +jumping to some later step rather than just continuing consecutively +with the next step. The target for such a jump is represented by +the integer index in the ExprState->steps array of the step to execute +next. (Compare the EEO_NEXT and EEO_JUMP macros in execExprInterp.c.) + +Typically, ExecInitExprRec() has to push a jumping step into the steps +array, then recursively generate steps for the subexpression that might +get skipped over, then go back and fix up the jump target index using +the now-known length of the subexpression's steps. This is handled by +adjust_jumps lists in execExpr.c. + +The last step in constructing an ExprState is to apply ExecReadyExpr(), +which readies it for execution using whichever execution method has been +selected. + + +Expression Evaluation +--------------------- + +To allow for different methods of expression evaluation, and for +better branch/jump target prediction, expressions are evaluated by +calling ExprState->evalfunc (via ExecEvalExpr() and friends). + +ExecReadyExpr() can choose the method of interpretation by setting +evalfunc to an appropriate function. The default execution function, +ExecInterpExpr, is implemented in execExprInterp.c; see its header +comment for details. Special-case evalfuncs are used for certain +especially-simple expressions. + +Note that a lot of the more complex expression evaluation steps, which are +less performance-critical than the simpler ones, are implemented as +separate functions outside the fast-path of expression execution, allowing +their implementation to be shared between interpreted and compiled +expression evaluation. This means that these helper functions are not +allowed to perform expression step dispatch themselves, as the method of +dispatch will vary based on the caller. The helpers therefore cannot call +for the execution of subexpressions; all subexpression results they need +must be computed by earlier steps. And dispatch to the following +expression step must be performed after returning from the helper. + + +Targetlist Evaluation +--------------------- + +ExecBuildProjectionInfo builds an ExprState that has the effect of +evaluating a targetlist into ExprState->resultslot. 
A generic targetlist +expression is executed by evaluating it as discussed above (storing the +result into the ExprState's resvalue/resnull fields) and then using an +EEOP_ASSIGN_TMP step to move the result into the appropriate tts_values[] +and tts_isnull[] array elements of the result slot. There are special +fast-path step types (EEOP_ASSIGN_*_VAR) to handle targetlist entries that +are simple Vars using only one step instead of two. + + +Memory Management +----------------- + +A "per query" memory context is created during CreateExecutorState(); +all storage allocated during an executor invocation is allocated in that +context or a child context. This allows easy reclamation of storage +during executor shutdown --- rather than messing with retail pfree's and +probable storage leaks, we just destroy the memory context. + +In particular, the plan state trees and expression state trees described +in the previous section are allocated in the per-query memory context. + +To avoid intra-query memory leaks, most processing while a query runs +is done in "per tuple" memory contexts, which are so-called because they +are typically reset to empty once per tuple. Per-tuple contexts are usually +associated with ExprContexts, and commonly each PlanState node has its own +ExprContext to evaluate its qual and targetlist expressions in. + + +Query Processing Control Flow +----------------------------- + +This is a sketch of control flow for full query processing: + + CreateQueryDesc + + ExecutorStart + CreateExecutorState + creates per-query context + switch to per-query context to run ExecInitNode + AfterTriggerBeginQuery + ExecInitNode --- recursively scans plan tree + ExecInitNode + recurse into subsidiary nodes + CreateExprContext + creates per-tuple context + ExecInitExpr + + ExecutorRun + ExecProcNode --- recursively called in per-query context + ExecEvalExpr --- called in per-tuple context + ResetExprContext --- to free memory + + ExecutorFinish + ExecPostprocessPlan --- run any unfinished ModifyTable nodes + AfterTriggerEndQuery + + ExecutorEnd + ExecEndNode --- recursively releases resources + FreeExecutorState + frees per-query context and child contexts + + FreeQueryDesc + +Per above comments, it's not really critical for ExecEndNode to free any +memory; it'll all go away in FreeExecutorState anyway. However, we do need to +be careful to close relations, drop buffer pins, etc, so we do need to scan +the plan state tree to find these sorts of resources. + + +The executor can also be used to evaluate simple expressions without any Plan +tree ("simple" meaning "no aggregates and no sub-selects", though such might +be hidden inside function calls). 
This case has a flow of control like + + CreateExecutorState + creates per-query context + + CreateExprContext -- or use GetPerTupleExprContext(estate) + creates per-tuple context + + ExecPrepareExpr + temporarily switch to per-query context + run the expression through expression_planner + ExecInitExpr + + Repeatedly do: + ExecEvalExprSwitchContext + ExecEvalExpr --- called in per-tuple context + ResetExprContext --- to free memory + + FreeExecutorState + frees per-query context, as well as ExprContext + (a separate FreeExprContext call is not necessary) + + +EvalPlanQual (READ COMMITTED Update Checking) +--------------------------------------------- + +For simple SELECTs, the executor need only pay attention to tuples that are +valid according to the snapshot seen by the current transaction (ie, they +were inserted by a previously committed transaction, and not deleted by any +previously committed transaction). However, for UPDATE and DELETE it is not +cool to modify or delete a tuple that's been modified by an open or +concurrently-committed transaction. If we are running in SERIALIZABLE +isolation level then we just raise an error when this condition is seen to +occur. In READ COMMITTED isolation level, we must work a lot harder. + +The basic idea in READ COMMITTED mode is to take the modified tuple +committed by the concurrent transaction (after waiting for it to commit, +if need be) and re-evaluate the query qualifications to see if it would +still meet the quals. If so, we regenerate the updated tuple (if we are +doing an UPDATE) from the modified tuple, and finally update/delete the +modified tuple. SELECT FOR UPDATE/SHARE behaves similarly, except that its +action is just to lock the modified tuple and return results based on that +version of the tuple. + +To implement this checking, we actually re-run the query from scratch for +each modified tuple (or set of tuples, for SELECT FOR UPDATE), with the +relation scan nodes tweaked to return only the current tuples --- either +the original ones, or the updated (and now locked) versions of the modified +tuple(s). If this query returns a tuple, then the modified tuple(s) pass +the quals (and the query output is the suitably modified update tuple, if +we're doing UPDATE). If no tuple is returned, then the modified tuple(s) +fail the quals, so we ignore the current result tuple and continue the +original query. + +In UPDATE/DELETE, only the target relation needs to be handled this way. +In SELECT FOR UPDATE, there may be multiple relations flagged FOR UPDATE, +so we obtain lock on the current tuple version in each such relation before +executing the recheck. + +It is also possible that there are relations in the query that are not +to be locked (they are neither the UPDATE/DELETE target nor specified to +be locked in SELECT FOR UPDATE/SHARE). When re-running the test query +we want to use the same rows from these relations that were joined to +the locked rows. For ordinary relations this can be implemented relatively +cheaply by including the row TID in the join outputs and re-fetching that +TID. (The re-fetch is expensive, but we're trying to optimize the normal +case where no re-test is needed.) We have also to consider non-table +relations, such as a ValuesScan or FunctionScan. For these, since there +is no equivalent of TID, the only practical solution seems to be to include +the entire row value in the join output row. 
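+
+For illustration, the recheck described above is driven roughly like this
+by the ModifyTable node (see nodeModifyTable.c later in this patch; the
+sketch below is simplified and not verbatim).  Here "epqstate" is assumed
+to have been set up with EvalPlanQualInit() at node startup, and
+"inputslot" holds the newest committed version of the concurrently
+modified row:
+
+    TupleTableSlot *epqslot;
+
+    epqslot = EvalPlanQual(epqstate,
+                           resultRelationDesc,                /* target rel */
+                           resultRelInfo->ri_RangeTableIndex, /* its RT index */
+                           inputslot);
+    if (TupIsNull(epqslot))
+    {
+        /* row no longer passes the quals: skip the UPDATE/DELETE */
+    }
+    else
+    {
+        /* quals still pass: rebuild the new tuple from epqslot and retry */
+    }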
+ +We disallow set-returning functions in the targetlist of SELECT FOR UPDATE, +so as to ensure that at most one tuple can be returned for any particular +set of scan tuples. Otherwise we'd get duplicates due to the original +query returning the same set of scan tuples multiple times. Likewise, +SRFs are disallowed in an UPDATE's targetlist. There, they would have the +effect of the same row being updated multiple times, which is not very +useful --- and updates after the first would have no effect anyway. + + +Asynchronous Execution +---------------------- + +In cases where a node is waiting on an event external to the database system, +such as a ForeignScan awaiting network I/O, it's desirable for the node to +indicate that it cannot return any tuple immediately but may be able to do so +at a later time. A process which discovers this type of situation can always +handle it simply by blocking, but this may waste time that could be spent +executing some other part of the plan tree where progress could be made +immediately. This is particularly likely to occur when the plan tree contains +an Append node. Asynchronous execution runs multiple parts of an Append node +concurrently rather than serially to improve performance. + +For asynchronous execution, an Append node must first request a tuple from an +async-capable child node using ExecAsyncRequest. Next, it must execute the +asynchronous event loop using ExecAppendAsyncEventWait. Eventually, when a +child node to which an asynchronous request has been made produces a tuple, +the Append node will receive it from the event loop via ExecAsyncResponse. In +the current implementation of asynchronous execution, the only node type that +requests tuples from an async-capable child node is an Append, while the only +node type that might be async-capable is a ForeignScan. + +Typically, the ExecAsyncResponse callback is the only one required for nodes +that wish to request tuples asynchronously. On the other hand, async-capable +nodes generally need to implement three methods: + +1. When an asynchronous request is made, the node's ExecAsyncRequest callback + will be invoked; it should use ExecAsyncRequestPending to indicate that the + request is pending for a callback described below. Alternatively, it can + instead use ExecAsyncRequestDone if a result is available immediately. + +2. When the event loop wishes to wait or poll for file descriptor events, the + node's ExecAsyncConfigureWait callback will be invoked to configure the + file descriptor event for which the node wishes to wait. + +3. When the file descriptor becomes ready, the node's ExecAsyncNotify callback + will be invoked; like #1, it should use ExecAsyncRequestPending for another + callback or ExecAsyncRequestDone to return a result immediately. 
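+
+As a concrete (hypothetical) skeleton of those three callbacks, an
+async-capable node type might look roughly like the following.  Only
+ExecAsyncRequestDone(), ExecAsyncRequestPending() and AddWaitEventToSet()
+are real APIs here; the "MyScan" names and my_scan_* helpers are
+placeholders for node-specific code:
+
+    #include "postgres.h"
+    #include "executor/execAsync.h"   /* ExecAsyncRequestDone/Pending */
+    #include "nodes/execnodes.h"      /* AsyncRequest, PlanState */
+    #include "storage/latch.h"        /* AddWaitEventToSet, WL_SOCKET_READABLE */
+
+    /* hypothetical helpers standing in for node-specific state access */
+    extern TupleTableSlot *my_scan_try_fetch(PlanState *node);
+    extern WaitEventSet *my_scan_event_set(AsyncRequest *areq);
+    extern pgsocket my_scan_socket(AsyncRequest *areq);
+
+    /* 1. asynchronous request: return a tuple now, or declare it pending */
+    void
+    ExecAsyncMyScanRequest(AsyncRequest *areq)
+    {
+        TupleTableSlot *slot = my_scan_try_fetch(areq->requestee);
+
+        if (slot != NULL)
+            ExecAsyncRequestDone(areq, slot);
+        else
+            ExecAsyncRequestPending(areq);
+    }
+
+    /* 2. tell the event loop which file descriptor event to wait for */
+    void
+    ExecAsyncMyScanConfigureWait(AsyncRequest *areq)
+    {
+        AddWaitEventToSet(my_scan_event_set(areq), WL_SOCKET_READABLE,
+                          my_scan_socket(areq), NULL, areq);
+    }
+
+    /* 3. the file descriptor became ready: try to deliver a tuple */
+    void
+    ExecAsyncMyScanNotify(AsyncRequest *areq)
+    {
+        TupleTableSlot *slot = my_scan_try_fetch(areq->requestee);
+
+        if (slot != NULL)
+            ExecAsyncRequestDone(areq, slot);
+        else
+            ExecAsyncRequestPending(areq);
+    }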
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c new file mode 100644 index 0000000..c3aa650 --- /dev/null +++ b/src/backend/executor/execAmi.c @@ -0,0 +1,662 @@ +/*------------------------------------------------------------------------- + * + * execAmi.c + * miscellaneous executor access method routines + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/executor/execAmi.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amapi.h" +#include "access/htup_details.h" +#include "executor/execdebug.h" +#include "executor/nodeAgg.h" +#include "executor/nodeAppend.h" +#include "executor/nodeBitmapAnd.h" +#include "executor/nodeBitmapHeapscan.h" +#include "executor/nodeBitmapIndexscan.h" +#include "executor/nodeBitmapOr.h" +#include "executor/nodeCtescan.h" +#include "executor/nodeCustom.h" +#include "executor/nodeForeignscan.h" +#include "executor/nodeFunctionscan.h" +#include "executor/nodeGather.h" +#include "executor/nodeGatherMerge.h" +#include "executor/nodeGroup.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "executor/nodeLimit.h" +#include "executor/nodeLockRows.h" +#include "executor/nodeMaterial.h" +#include "executor/nodeMemoize.h" +#include "executor/nodeMergeAppend.h" +#include "executor/nodeMergejoin.h" +#include "executor/nodeModifyTable.h" +#include "executor/nodeNamedtuplestorescan.h" +#include "executor/nodeNestloop.h" +#include "executor/nodeProjectSet.h" +#include "executor/nodeRecursiveunion.h" +#include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" +#include "executor/nodeSeqscan.h" +#include "executor/nodeSetOp.h" +#include "executor/nodeSort.h" +#include "executor/nodeSubplan.h" +#include "executor/nodeSubqueryscan.h" +#include "executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" +#include "executor/nodeTidscan.h" +#include "executor/nodeUnique.h" +#include "executor/nodeValuesscan.h" +#include "executor/nodeWindowAgg.h" +#include "executor/nodeWorktablescan.h" +#include "nodes/extensible.h" +#include "nodes/nodeFuncs.h" +#include "nodes/pathnodes.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +static bool IndexSupportsBackwardScan(Oid indexid); + + +/* + * ExecReScan + * Reset a plan node so that its output can be re-scanned. + * + * Note that if the plan node has parameters that have changed value, + * the output might be different from last time. + */ +void +ExecReScan(PlanState *node) +{ + /* If collecting timing stats, update them */ + if (node->instrument) + InstrEndLoop(node->instrument); + + /* + * If we have changed parameters, propagate that info. + * + * Note: ExecReScanSetParamPlan() can add bits to node->chgParam, + * corresponding to the output param(s) that the InitPlan will update. + * Since we make only one pass over the list, that means that an InitPlan + * can depend on the output param(s) of a sibling InitPlan only if that + * sibling appears earlier in the list. This is workable for now given + * the limited ways in which one InitPlan could depend on another, but + * eventually we might need to work harder (or else make the planner + * enlarge the extParam/allParam sets to include the params of depended-on + * InitPlans). 
+ */ + if (node->chgParam != NULL) + { + ListCell *l; + + foreach(l, node->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + + if (splan->plan->extParam != NULL) /* don't care about child + * local Params */ + UpdateChangedParamSet(splan, node->chgParam); + if (splan->chgParam != NULL) + ExecReScanSetParamPlan(sstate, node); + } + foreach(l, node->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + + if (splan->plan->extParam != NULL) + UpdateChangedParamSet(splan, node->chgParam); + } + /* Well. Now set chgParam for left/right trees. */ + if (node->lefttree != NULL) + UpdateChangedParamSet(node->lefttree, node->chgParam); + if (node->righttree != NULL) + UpdateChangedParamSet(node->righttree, node->chgParam); + } + + /* Call expression callbacks */ + if (node->ps_ExprContext) + ReScanExprContext(node->ps_ExprContext); + + /* And do node-type-specific processing */ + switch (nodeTag(node)) + { + case T_ResultState: + ExecReScanResult((ResultState *) node); + break; + + case T_ProjectSetState: + ExecReScanProjectSet((ProjectSetState *) node); + break; + + case T_ModifyTableState: + ExecReScanModifyTable((ModifyTableState *) node); + break; + + case T_AppendState: + ExecReScanAppend((AppendState *) node); + break; + + case T_MergeAppendState: + ExecReScanMergeAppend((MergeAppendState *) node); + break; + + case T_RecursiveUnionState: + ExecReScanRecursiveUnion((RecursiveUnionState *) node); + break; + + case T_BitmapAndState: + ExecReScanBitmapAnd((BitmapAndState *) node); + break; + + case T_BitmapOrState: + ExecReScanBitmapOr((BitmapOrState *) node); + break; + + case T_SeqScanState: + ExecReScanSeqScan((SeqScanState *) node); + break; + + case T_SampleScanState: + ExecReScanSampleScan((SampleScanState *) node); + break; + + case T_GatherState: + ExecReScanGather((GatherState *) node); + break; + + case T_GatherMergeState: + ExecReScanGatherMerge((GatherMergeState *) node); + break; + + case T_IndexScanState: + ExecReScanIndexScan((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecReScanIndexOnlyScan((IndexOnlyScanState *) node); + break; + + case T_BitmapIndexScanState: + ExecReScanBitmapIndexScan((BitmapIndexScanState *) node); + break; + + case T_BitmapHeapScanState: + ExecReScanBitmapHeapScan((BitmapHeapScanState *) node); + break; + + case T_TidScanState: + ExecReScanTidScan((TidScanState *) node); + break; + + case T_TidRangeScanState: + ExecReScanTidRangeScan((TidRangeScanState *) node); + break; + + case T_SubqueryScanState: + ExecReScanSubqueryScan((SubqueryScanState *) node); + break; + + case T_FunctionScanState: + ExecReScanFunctionScan((FunctionScanState *) node); + break; + + case T_TableFuncScanState: + ExecReScanTableFuncScan((TableFuncScanState *) node); + break; + + case T_ValuesScanState: + ExecReScanValuesScan((ValuesScanState *) node); + break; + + case T_CteScanState: + ExecReScanCteScan((CteScanState *) node); + break; + + case T_NamedTuplestoreScanState: + ExecReScanNamedTuplestoreScan((NamedTuplestoreScanState *) node); + break; + + case T_WorkTableScanState: + ExecReScanWorkTableScan((WorkTableScanState *) node); + break; + + case T_ForeignScanState: + ExecReScanForeignScan((ForeignScanState *) node); + break; + + case T_CustomScanState: + ExecReScanCustomScan((CustomScanState *) node); + break; + + case T_NestLoopState: + ExecReScanNestLoop((NestLoopState *) node); + break; + + case T_MergeJoinState: + 
ExecReScanMergeJoin((MergeJoinState *) node); + break; + + case T_HashJoinState: + ExecReScanHashJoin((HashJoinState *) node); + break; + + case T_MaterialState: + ExecReScanMaterial((MaterialState *) node); + break; + + case T_MemoizeState: + ExecReScanMemoize((MemoizeState *) node); + break; + + case T_SortState: + ExecReScanSort((SortState *) node); + break; + + case T_IncrementalSortState: + ExecReScanIncrementalSort((IncrementalSortState *) node); + break; + + case T_GroupState: + ExecReScanGroup((GroupState *) node); + break; + + case T_AggState: + ExecReScanAgg((AggState *) node); + break; + + case T_WindowAggState: + ExecReScanWindowAgg((WindowAggState *) node); + break; + + case T_UniqueState: + ExecReScanUnique((UniqueState *) node); + break; + + case T_HashState: + ExecReScanHash((HashState *) node); + break; + + case T_SetOpState: + ExecReScanSetOp((SetOpState *) node); + break; + + case T_LockRowsState: + ExecReScanLockRows((LockRowsState *) node); + break; + + case T_LimitState: + ExecReScanLimit((LimitState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } + + if (node->chgParam != NULL) + { + bms_free(node->chgParam); + node->chgParam = NULL; + } +} + +/* + * ExecMarkPos + * + * Marks the current scan position. + * + * NOTE: mark/restore capability is currently needed only for plan nodes + * that are the immediate inner child of a MergeJoin node. Since MergeJoin + * requires sorted input, there is never any need to support mark/restore in + * node types that cannot produce sorted output. There are some cases in + * which a node can pass through sorted data from its child; if we don't + * implement mark/restore for such a node type, the planner compensates by + * inserting a Material node above that node. + */ +void +ExecMarkPos(PlanState *node) +{ + switch (nodeTag(node)) + { + case T_IndexScanState: + ExecIndexMarkPos((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecIndexOnlyMarkPos((IndexOnlyScanState *) node); + break; + + case T_CustomScanState: + ExecCustomMarkPos((CustomScanState *) node); + break; + + case T_MaterialState: + ExecMaterialMarkPos((MaterialState *) node); + break; + + case T_SortState: + ExecSortMarkPos((SortState *) node); + break; + + case T_ResultState: + ExecResultMarkPos((ResultState *) node); + break; + + default: + /* don't make hard error unless caller asks to restore... */ + elog(DEBUG2, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * ExecRestrPos + * + * restores the scan position previously saved with ExecMarkPos() + * + * NOTE: the semantics of this are that the first ExecProcNode following + * the restore operation will yield the same tuple as the first one following + * the mark operation. It is unspecified what happens to the plan node's + * result TupleTableSlot. (In most cases the result slot is unchanged by + * a restore, but the node may choose to clear it or to load it with the + * restored-to tuple.) Hence the caller should discard any previously + * returned TupleTableSlot after doing a restore. 
+ */ +void +ExecRestrPos(PlanState *node) +{ + switch (nodeTag(node)) + { + case T_IndexScanState: + ExecIndexRestrPos((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecIndexOnlyRestrPos((IndexOnlyScanState *) node); + break; + + case T_CustomScanState: + ExecCustomRestrPos((CustomScanState *) node); + break; + + case T_MaterialState: + ExecMaterialRestrPos((MaterialState *) node); + break; + + case T_SortState: + ExecSortRestrPos((SortState *) node); + break; + + case T_ResultState: + ExecResultRestrPos((ResultState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * ExecSupportsMarkRestore - does a Path support mark/restore? + * + * This is used during planning and so must accept a Path, not a Plan. + * We keep it here to be adjacent to the routines above, which also must + * know which plan types support mark/restore. + */ +bool +ExecSupportsMarkRestore(Path *pathnode) +{ + /* + * For consistency with the routines above, we do not examine the nodeTag + * but rather the pathtype, which is the Plan node type the Path would + * produce. + */ + switch (pathnode->pathtype) + { + case T_IndexScan: + case T_IndexOnlyScan: + + /* + * Not all index types support mark/restore. + */ + return castNode(IndexPath, pathnode)->indexinfo->amcanmarkpos; + + case T_Material: + case T_Sort: + return true; + + case T_CustomScan: + { + CustomPath *customPath = castNode(CustomPath, pathnode); + + if (customPath->flags & CUSTOMPATH_SUPPORT_MARK_RESTORE) + return true; + return false; + } + case T_Result: + + /* + * Result supports mark/restore iff it has a child plan that does. + * + * We have to be careful here because there is more than one Path + * type that can produce a Result plan node. + */ + if (IsA(pathnode, ProjectionPath)) + return ExecSupportsMarkRestore(((ProjectionPath *) pathnode)->subpath); + else if (IsA(pathnode, MinMaxAggPath)) + return false; /* childless Result */ + else if (IsA(pathnode, GroupResultPath)) + return false; /* childless Result */ + else + { + /* Simple RTE_RESULT base relation */ + Assert(IsA(pathnode, Path)); + return false; /* childless Result */ + } + + case T_Append: + { + AppendPath *appendPath = castNode(AppendPath, pathnode); + + /* + * If there's exactly one child, then there will be no Append + * in the final plan, so we can handle mark/restore if the + * child plan node can. + */ + if (list_length(appendPath->subpaths) == 1) + return ExecSupportsMarkRestore((Path *) linitial(appendPath->subpaths)); + /* Otherwise, Append can't handle it */ + return false; + } + + case T_MergeAppend: + { + MergeAppendPath *mapath = castNode(MergeAppendPath, pathnode); + + /* + * Like the Append case above, single-subpath MergeAppends + * won't be in the final plan, so just return the child's + * mark/restore ability. + */ + if (list_length(mapath->subpaths) == 1) + return ExecSupportsMarkRestore((Path *) linitial(mapath->subpaths)); + /* Otherwise, MergeAppend can't handle it */ + return false; + } + + default: + break; + } + + return false; +} + +/* + * ExecSupportsBackwardScan - does a plan type support backwards scanning? + * + * Ideally, all plan types would support backwards scan, but that seems + * unlikely to happen soon. In some cases, a plan node passes the backwards + * scan down to its children, and so supports backwards scan only if its + * children do. Therefore, this routine must be passed a complete plan tree. 
+ */ +bool +ExecSupportsBackwardScan(Plan *node) +{ + if (node == NULL) + return false; + + /* + * Parallel-aware nodes return a subset of the tuples in each worker, and + * in general we can't expect to have enough bookkeeping state to know + * which ones we returned in this worker as opposed to some other worker. + */ + if (node->parallel_aware) + return false; + + switch (nodeTag(node)) + { + case T_Result: + if (outerPlan(node) != NULL) + return ExecSupportsBackwardScan(outerPlan(node)); + else + return false; + + case T_Append: + { + ListCell *l; + + /* With async, tuples may be interleaved, so can't back up. */ + if (((Append *) node)->nasyncplans > 0) + return false; + + foreach(l, ((Append *) node)->appendplans) + { + if (!ExecSupportsBackwardScan((Plan *) lfirst(l))) + return false; + } + /* need not check tlist because Append doesn't evaluate it */ + return true; + } + + case T_SampleScan: + /* Simplify life for tablesample methods by disallowing this */ + return false; + + case T_Gather: + return false; + + case T_IndexScan: + return IndexSupportsBackwardScan(((IndexScan *) node)->indexid); + + case T_IndexOnlyScan: + return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid); + + case T_SubqueryScan: + return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan); + + case T_CustomScan: + { + uint32 flags = ((CustomScan *) node)->flags; + + if (flags & CUSTOMPATH_SUPPORT_BACKWARD_SCAN) + return true; + } + return false; + + case T_SeqScan: + case T_TidScan: + case T_TidRangeScan: + case T_FunctionScan: + case T_ValuesScan: + case T_CteScan: + case T_Material: + case T_Sort: + /* these don't evaluate tlist */ + return true; + + case T_IncrementalSort: + + /* + * Unlike full sort, incremental sort keeps only a single group of + * tuples in memory, so it can't scan backwards. + */ + return false; + + case T_LockRows: + case T_Limit: + return ExecSupportsBackwardScan(outerPlan(node)); + + default: + return false; + } +} + +/* + * An IndexScan or IndexOnlyScan node supports backward scan only if the + * index's AM does. + */ +static bool +IndexSupportsBackwardScan(Oid indexid) +{ + bool result; + HeapTuple ht_idxrel; + Form_pg_class idxrelrec; + IndexAmRoutine *amroutine; + + /* Fetch the pg_class tuple of the index relation */ + ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexid)); + if (!HeapTupleIsValid(ht_idxrel)) + elog(ERROR, "cache lookup failed for relation %u", indexid); + idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); + + /* Fetch the index AM's API struct */ + amroutine = GetIndexAmRoutineByAmId(idxrelrec->relam, false); + + result = amroutine->amcanbackward; + + pfree(amroutine); + ReleaseSysCache(ht_idxrel); + + return result; +} + +/* + * ExecMaterializesOutput - does a plan type materialize its output? + * + * Returns true if the plan node type is one that automatically materializes + * its output (typically by keeping it in a tuplestore). For such plans, + * a rescan without any parameter change will have zero startup cost and + * very low per-tuple cost. 
+ */ +bool +ExecMaterializesOutput(NodeTag plantype) +{ + switch (plantype) + { + case T_Material: + case T_FunctionScan: + case T_TableFuncScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_Sort: + return true; + + default: + break; + } + + return false; +} diff --git a/src/backend/executor/execAsync.c b/src/backend/executor/execAsync.c new file mode 100644 index 0000000..94a284a --- /dev/null +++ b/src/backend/executor/execAsync.c @@ -0,0 +1,154 @@ +/*------------------------------------------------------------------------- + * + * execAsync.c + * Support routines for asynchronous execution + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execAsync.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execAsync.h" +#include "executor/executor.h" +#include "executor/nodeAppend.h" +#include "executor/nodeForeignscan.h" + +/* + * Asynchronously request a tuple from a designed async-capable node. + */ +void +ExecAsyncRequest(AsyncRequest *areq) +{ + if (areq->requestee->chgParam != NULL) /* something changed? */ + ExecReScan(areq->requestee); /* let ReScan handle this */ + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStartNode(areq->requestee->instrument); + + switch (nodeTag(areq->requestee)) + { + case T_ForeignScanState: + ExecAsyncForeignScanRequest(areq); + break; + default: + /* If the node doesn't support async, caller messed up. */ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestee)); + } + + ExecAsyncResponse(areq); + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStopNode(areq->requestee->instrument, + TupIsNull(areq->result) ? 0.0 : 1.0); +} + +/* + * Give the asynchronous node a chance to configure the file descriptor event + * for which it wishes to wait. We expect the node-type specific callback to + * make a single call of the following form: + * + * AddWaitEventToSet(set, WL_SOCKET_READABLE, fd, NULL, areq); + */ +void +ExecAsyncConfigureWait(AsyncRequest *areq) +{ + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStartNode(areq->requestee->instrument); + + switch (nodeTag(areq->requestee)) + { + case T_ForeignScanState: + ExecAsyncForeignScanConfigureWait(areq); + break; + default: + /* If the node doesn't support async, caller messed up. */ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestee)); + } + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStopNode(areq->requestee->instrument, 0.0); +} + +/* + * Call the asynchronous node back when a relevant event has occurred. + */ +void +ExecAsyncNotify(AsyncRequest *areq) +{ + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStartNode(areq->requestee->instrument); + + switch (nodeTag(areq->requestee)) + { + case T_ForeignScanState: + ExecAsyncForeignScanNotify(areq); + break; + default: + /* If the node doesn't support async, caller messed up. 
*/ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestee)); + } + + ExecAsyncResponse(areq); + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStopNode(areq->requestee->instrument, + TupIsNull(areq->result) ? 0.0 : 1.0); +} + +/* + * Call the requestor back when an asynchronous node has produced a result. + */ +void +ExecAsyncResponse(AsyncRequest *areq) +{ + switch (nodeTag(areq->requestor)) + { + case T_AppendState: + ExecAsyncAppendResponse(areq); + break; + default: + /* If the node doesn't support async, caller messed up. */ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestor)); + } +} + +/* + * A requestee node should call this function to deliver the tuple to its + * requestor node. The requestee node can call this from its ExecAsyncRequest + * or ExecAsyncNotify callback. + */ +void +ExecAsyncRequestDone(AsyncRequest *areq, TupleTableSlot *result) +{ + areq->request_complete = true; + areq->result = result; +} + +/* + * A requestee node should call this function to indicate that it is pending + * for a callback. The requestee node can call this from its ExecAsyncRequest + * or ExecAsyncNotify callback. + */ +void +ExecAsyncRequestPending(AsyncRequest *areq) +{ + areq->callback_pending = true; + areq->request_complete = false; + areq->result = NULL; +} diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c new file mode 100644 index 0000000..4f430fb --- /dev/null +++ b/src/backend/executor/execCurrent.c @@ -0,0 +1,426 @@ +/*------------------------------------------------------------------------- + * + * execCurrent.c + * executor support for WHERE CURRENT OF cursor + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/executor/execCurrent.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/sysattr.h" +#include "catalog/pg_type.h" +#include "executor/executor.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/portal.h" +#include "utils/rel.h" + + +static char *fetch_cursor_param_value(ExprContext *econtext, int paramId); +static ScanState *search_plan_tree(PlanState *node, Oid table_oid, + bool *pending_rescan); + + +/* + * execCurrentOf + * + * Given a CURRENT OF expression and the OID of a table, determine which row + * of the table is currently being scanned by the cursor named by CURRENT OF, + * and return the row's TID into *current_tid. + * + * Returns true if a row was identified. Returns false if the cursor is valid + * for the table but is not currently scanning a row of the table (this is a + * legal situation in inheritance cases). Raises error if cursor is not a + * valid updatable scan of the specified table. 
+ */ +bool +execCurrentOf(CurrentOfExpr *cexpr, + ExprContext *econtext, + Oid table_oid, + ItemPointer current_tid) +{ + char *cursor_name; + char *table_name; + Portal portal; + QueryDesc *queryDesc; + + /* Get the cursor name --- may have to look up a parameter reference */ + if (cexpr->cursor_name) + cursor_name = cexpr->cursor_name; + else + cursor_name = fetch_cursor_param_value(econtext, cexpr->cursor_param); + + /* Fetch table name for possible use in error messages */ + table_name = get_rel_name(table_oid); + if (table_name == NULL) + elog(ERROR, "cache lookup failed for relation %u", table_oid); + + /* Find the cursor's portal */ + portal = GetPortalByName(cursor_name); + if (!PortalIsValid(portal)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_CURSOR), + errmsg("cursor \"%s\" does not exist", cursor_name))); + + /* + * We have to watch out for non-SELECT queries as well as held cursors, + * both of which may have null queryDesc. + */ + if (portal->strategy != PORTAL_ONE_SELECT) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a SELECT query", + cursor_name))); + queryDesc = portal->queryDesc; + if (queryDesc == NULL || queryDesc->estate == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is held from a previous transaction", + cursor_name))); + + /* + * We have two different strategies depending on whether the cursor uses + * FOR UPDATE/SHARE or not. The reason for supporting both is that the + * FOR UPDATE code is able to identify a target table in many cases where + * the other code can't, while the non-FOR-UPDATE case allows use of WHERE + * CURRENT OF with an insensitive cursor. + */ + if (queryDesc->estate->es_rowmarks) + { + ExecRowMark *erm; + Index i; + + /* + * Here, the query must have exactly one FOR UPDATE/SHARE reference to + * the target table, and we dig the ctid info out of that. + */ + erm = NULL; + for (i = 0; i < queryDesc->estate->es_range_table_size; i++) + { + ExecRowMark *thiserm = queryDesc->estate->es_rowmarks[i]; + + if (thiserm == NULL || + !RowMarkRequiresRowShareLock(thiserm->markType)) + continue; /* ignore non-FOR UPDATE/SHARE items */ + + if (thiserm->relid == table_oid) + { + if (erm) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" has multiple FOR UPDATE/SHARE references to table \"%s\"", + cursor_name, table_name))); + erm = thiserm; + } + } + + if (erm == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" does not have a FOR UPDATE/SHARE reference to table \"%s\"", + cursor_name, table_name))); + + /* + * The cursor must have a current result row: per the SQL spec, it's + * an error if not. + */ + if (portal->atStart || portal->atEnd) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not positioned on a row", + cursor_name))); + + /* Return the currently scanned TID, if there is one */ + if (ItemPointerIsValid(&(erm->curCtid))) + { + *current_tid = erm->curCtid; + return true; + } + + /* + * This table didn't produce the cursor's current row; some other + * inheritance child of the same parent must have. Signal caller to + * do nothing on this table. + */ + return false; + } + else + { + /* + * Without FOR UPDATE, we dig through the cursor's plan to find the + * scan node. Fail if it's not there or buried underneath + * aggregation. 
+ */ + ScanState *scanstate; + bool pending_rescan = false; + + scanstate = search_plan_tree(queryDesc->planstate, table_oid, + &pending_rescan); + if (!scanstate) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"", + cursor_name, table_name))); + + /* + * The cursor must have a current result row: per the SQL spec, it's + * an error if not. We test this at the top level, rather than at the + * scan node level, because in inheritance cases any one table scan + * could easily not be on a row. We want to return false, not raise + * error, if the passed-in table OID is for one of the inactive scans. + */ + if (portal->atStart || portal->atEnd) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not positioned on a row", + cursor_name))); + + /* + * Now OK to return false if we found an inactive scan. It is + * inactive either if it's not positioned on a row, or there's a + * rescan pending for it. + */ + if (TupIsNull(scanstate->ss_ScanTupleSlot) || pending_rescan) + return false; + + /* + * Extract TID of the scan's current row. The mechanism for this is + * in principle scan-type-dependent, but for most scan types, we can + * just dig the TID out of the physical scan tuple. + */ + if (IsA(scanstate, IndexOnlyScanState)) + { + /* + * For IndexOnlyScan, the tuple stored in ss_ScanTupleSlot may be + * a virtual tuple that does not have the ctid column, so we have + * to get the TID from xs_ctup.t_self. + */ + IndexScanDesc scan = ((IndexOnlyScanState *) scanstate)->ioss_ScanDesc; + + *current_tid = scan->xs_heaptid; + } + else + { + /* + * Default case: try to fetch TID from the scan node's current + * tuple. As an extra cross-check, verify tableoid in the current + * tuple. If the scan hasn't provided a physical tuple, we have + * to fail. + */ + Datum ldatum; + bool lisnull; + ItemPointer tuple_tid; + +#ifdef USE_ASSERT_CHECKING + ldatum = slot_getsysattr(scanstate->ss_ScanTupleSlot, + TableOidAttributeNumber, + &lisnull); + if (lisnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"", + cursor_name, table_name))); + Assert(DatumGetObjectId(ldatum) == table_oid); +#endif + + ldatum = slot_getsysattr(scanstate->ss_ScanTupleSlot, + SelfItemPointerAttributeNumber, + &lisnull); + if (lisnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"", + cursor_name, table_name))); + tuple_tid = (ItemPointer) DatumGetPointer(ldatum); + + *current_tid = *tuple_tid; + } + + Assert(ItemPointerIsValid(current_tid)); + + return true; + } +} + +/* + * fetch_cursor_param_value + * + * Fetch the string value of a param, verifying it is of type REFCURSOR. 
+ */ +static char * +fetch_cursor_param_value(ExprContext *econtext, int paramId) +{ + ParamListInfo paramInfo = econtext->ecxt_param_list_info; + + if (paramInfo && + paramId > 0 && paramId <= paramInfo->numParams) + { + ParamExternData *prm; + ParamExternData prmdata; + + /* give hook a chance in case parameter is dynamic */ + if (paramInfo->paramFetch != NULL) + prm = paramInfo->paramFetch(paramInfo, paramId, false, &prmdata); + else + prm = &paramInfo->params[paramId - 1]; + + if (OidIsValid(prm->ptype) && !prm->isnull) + { + /* safety check in case hook did something unexpected */ + if (prm->ptype != REFCURSOROID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("type of parameter %d (%s) does not match that when preparing the plan (%s)", + paramId, + format_type_be(prm->ptype), + format_type_be(REFCURSOROID)))); + + /* We know that refcursor uses text's I/O routines */ + return TextDatumGetCString(prm->value); + } + } + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("no value found for parameter %d", paramId))); + return NULL; +} + +/* + * search_plan_tree + * + * Search through a PlanState tree for a scan node on the specified table. + * Return NULL if not found or multiple candidates. + * + * CAUTION: this function is not charged simply with finding some candidate + * scan, but with ensuring that that scan returned the plan tree's current + * output row. That's why we must reject multiple-match cases. + * + * If a candidate is found, set *pending_rescan to true if that candidate + * or any node above it has a pending rescan action, i.e. chgParam != NULL. + * That indicates that we shouldn't consider the node to be positioned on a + * valid tuple, even if its own state would indicate that it is. (Caller + * must initialize *pending_rescan to false, and should not trust its state + * if multiple candidates are found.) + */ +static ScanState * +search_plan_tree(PlanState *node, Oid table_oid, + bool *pending_rescan) +{ + ScanState *result = NULL; + + if (node == NULL) + return NULL; + switch (nodeTag(node)) + { + /* + * Relation scan nodes can all be treated alike: check to see if + * they are scanning the specified table. + * + * ForeignScan and CustomScan might not have a currentRelation, in + * which case we just ignore them. (We dare not descend to any + * child plan nodes they might have, since we do not know the + * relationship of such a node's current output tuple to the + * children's current outputs.) + */ + case T_SeqScanState: + case T_SampleScanState: + case T_IndexScanState: + case T_IndexOnlyScanState: + case T_BitmapHeapScanState: + case T_TidScanState: + case T_TidRangeScanState: + case T_ForeignScanState: + case T_CustomScanState: + { + ScanState *sstate = (ScanState *) node; + + if (sstate->ss_currentRelation && + RelationGetRelid(sstate->ss_currentRelation) == table_oid) + result = sstate; + break; + } + + /* + * For Append, we can check each input node. It is safe to + * descend to the inputs because only the input that resulted in + * the Append's current output node could be positioned on a tuple + * at all; the other inputs are either at EOF or not yet started. + * Hence, if the desired table is scanned by some + * currently-inactive input node, we will find that node but then + * our caller will realize that it didn't emit the tuple of + * interest. + * + * We do need to watch out for multiple matches (possible if + * Append was from UNION ALL rather than an inheritance tree).
+ * + * Note: we can NOT descend through MergeAppend similarly, since + * its inputs are likely all active, and we don't know which one + * returned the current output tuple. (Perhaps that could be + * fixed if we were to let this code know more about MergeAppend's + * internal state, but it does not seem worth the trouble. Users + * should not expect plans for ORDER BY queries to be considered + * simply-updatable, since they won't be if the sorting is + * implemented by a Sort node.) + */ + case T_AppendState: + { + AppendState *astate = (AppendState *) node; + int i; + + for (i = 0; i < astate->as_nplans; i++) + { + ScanState *elem = search_plan_tree(astate->appendplans[i], + table_oid, + pending_rescan); + + if (!elem) + continue; + if (result) + return NULL; /* multiple matches */ + result = elem; + } + break; + } + + /* + * Result and Limit can be descended through (these are safe + * because they always return their input's current row) + */ + case T_ResultState: + case T_LimitState: + result = search_plan_tree(node->lefttree, + table_oid, + pending_rescan); + break; + + /* + * SubqueryScan too, but it keeps the child in a different place + */ + case T_SubqueryScanState: + result = search_plan_tree(((SubqueryScanState *) node)->subplan, + table_oid, + pending_rescan); + break; + + default: + /* Otherwise, assume we can't descend through it */ + break; + } + + /* + * If we found a candidate at or below this node, then this node's + * chgParam indicates a pending rescan that will affect the candidate. + */ + if (result && node->chgParam != NULL) + *pending_rescan = true; + + return result; +} diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c new file mode 100644 index 0000000..bec249f --- /dev/null +++ b/src/backend/executor/execExpr.c @@ -0,0 +1,3965 @@ +/*------------------------------------------------------------------------- + * + * execExpr.c + * Expression evaluation infrastructure. + * + * During executor startup, we compile each expression tree (which has + * previously been processed by the parser and planner) into an ExprState, + * using ExecInitExpr() et al. This converts the tree into a flat array + * of ExprEvalSteps, which may be thought of as instructions in a program. + * At runtime, we'll execute steps, starting with the first, until we reach + * an EEOP_DONE opcode. + * + * This file contains the "compilation" logic. It is independent of the + * specific execution technology we use (switch statement, computed goto, + * JIT compilation, etc). + * + * See src/backend/executor/README for some background, specifically the + * "Expression Trees and ExprState nodes", "Expression Initialization", + * and "Expression Evaluation" sections. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execExpr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_type.h" +#include "executor/execExpr.h" +#include "executor/nodeSubplan.h" +#include "funcapi.h" +#include "jit/jit.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "nodes/subscripting.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/typcache.h" + + +typedef struct LastAttnumInfo +{ + AttrNumber last_inner; + AttrNumber last_outer; + AttrNumber last_scan; +} LastAttnumInfo; + +static void ExecReadyExpr(ExprState *state); +static void ExecInitExprRec(Expr *node, ExprState *state, + Datum *resv, bool *resnull); +static void ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, + Oid funcid, Oid inputcollid, + ExprState *state); +static void ExecInitExprSlots(ExprState *state, Node *node); +static void ExecPushExprSlots(ExprState *state, LastAttnumInfo *info); +static bool get_last_attnums_walker(Node *node, LastAttnumInfo *info); +static bool ExecComputeSlotInfo(ExprState *state, ExprEvalStep *op); +static void ExecInitWholeRowVar(ExprEvalStep *scratch, Var *variable, + ExprState *state); +static void ExecInitSubscriptingRef(ExprEvalStep *scratch, + SubscriptingRef *sbsref, + ExprState *state, + Datum *resv, bool *resnull); +static bool isAssignmentIndirectionExpr(Expr *expr); +static void ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, + ExprState *state, + Datum *resv, bool *resnull); +static void ExecBuildAggTransCall(ExprState *state, AggState *aggstate, + ExprEvalStep *scratch, + FunctionCallInfo fcinfo, AggStatePerTrans pertrans, + int transno, int setno, int setoff, bool ishash, + bool nullcheck); + + +/* + * ExecInitExpr: prepare an expression tree for execution + * + * This function builds and returns an ExprState implementing the given + * Expr node tree. The return ExprState can then be handed to ExecEvalExpr + * for execution. Because the Expr tree itself is read-only as far as + * ExecInitExpr and ExecEvalExpr are concerned, several different executions + * of the same plan tree can occur concurrently. (But note that an ExprState + * does mutate at runtime, so it can't be re-used concurrently.) + * + * This must be called in a memory context that will last as long as repeated + * executions of the expression are needed. Typically the context will be + * the same as the per-query context of the associated ExprContext. + * + * Any Aggref, WindowFunc, or SubPlan nodes found in the tree are added to + * the lists of such nodes held by the parent PlanState. + * + * Note: there is no ExecEndExpr function; we assume that any resource + * cleanup needed will be handled by just releasing the memory context + * in which the state tree is built. Functions that require additional + * cleanup work can register a shutdown callback in the ExprContext. + * + * 'node' is the root of the expression tree to compile. + * 'parent' is the PlanState node that owns the expression. 
+ * + * 'parent' may be NULL if we are preparing an expression that is not + * associated with a plan tree. (If so, it can't have aggs or subplans.) + * Such cases should usually come through ExecPrepareExpr, not directly here. + * + * Also, if 'node' is NULL, we just return NULL. This is convenient for some + * callers that may or may not have an expression that needs to be compiled. + * Note that a NULL ExprState pointer *cannot* be handed to ExecEvalExpr, + * although ExecQual and ExecCheck will accept one (and treat it as "true"). + */ +ExprState * +ExecInitExpr(Expr *node, PlanState *parent) +{ + ExprState *state; + ExprEvalStep scratch = {0}; + + /* Special case: NULL expression produces a NULL ExprState pointer */ + if (node == NULL) + return NULL; + + /* Initialize ExprState with empty step list */ + state = makeNode(ExprState); + state->expr = node; + state->parent = parent; + state->ext_params = NULL; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) node); + + /* Compile the expression proper */ + ExecInitExprRec(node, state, &state->resvalue, &state->resnull); + + /* Finally, append a DONE step */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * ExecInitExprWithParams: prepare a standalone expression tree for execution + * + * This is the same as ExecInitExpr, except that there is no parent PlanState, + * and instead we may have a ParamListInfo describing PARAM_EXTERN Params. + */ +ExprState * +ExecInitExprWithParams(Expr *node, ParamListInfo ext_params) +{ + ExprState *state; + ExprEvalStep scratch = {0}; + + /* Special case: NULL expression produces a NULL ExprState pointer */ + if (node == NULL) + return NULL; + + /* Initialize ExprState with empty step list */ + state = makeNode(ExprState); + state->expr = node; + state->parent = NULL; + state->ext_params = ext_params; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) node); + + /* Compile the expression proper */ + ExecInitExprRec(node, state, &state->resvalue, &state->resnull); + + /* Finally, append a DONE step */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * ExecInitQual: prepare a qual for execution by ExecQual + * + * Prepares for the evaluation of a conjunctive boolean expression (qual list + * with implicit AND semantics) that returns true if none of the + * subexpressions are false. + * + * We must return true if the list is empty. Since that's a very common case, + * we optimize it a bit further by translating to a NULL ExprState pointer + * rather than setting up an ExprState that computes constant TRUE. (Some + * especially hot-spot callers of ExecQual detect this and avoid calling + * ExecQual at all.) + * + * If any of the subexpressions yield NULL, then the result of the conjunction + * is false. This makes ExecQual primarily useful for evaluating WHERE + * clauses, since SQL specifies that tuples with null WHERE results do not + * get selected. 
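+ *
+ * As an illustrative sketch (names are examples only), a typical plan
+ * node compiles its qual once at startup and then tests it per input
+ * tuple:
+ *		planstate->qual = ExecInitQual(plan->qual, (PlanState *) planstate);
+ *		...
+ *		if (!ExecQual(planstate->qual, econtext))
+ *			continue;
+ * skipping tuples whose WHERE clause does not evaluate to true; see
+ * ExecScan() and the various node Init functions for real usage.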
+ */ +ExprState * +ExecInitQual(List *qual, PlanState *parent) +{ + ExprState *state; + ExprEvalStep scratch = {0}; + List *adjust_jumps = NIL; + ListCell *lc; + + /* short-circuit (here and in ExecQual) for empty restriction list */ + if (qual == NIL) + return NULL; + + Assert(IsA(qual, List)); + + state = makeNode(ExprState); + state->expr = (Expr *) qual; + state->parent = parent; + state->ext_params = NULL; + + /* mark expression as to be used with ExecQual() */ + state->flags = EEO_FLAG_IS_QUAL; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) qual); + + /* + * ExecQual() needs to return false for an expression returning NULL. That + * allows us to short-circuit the evaluation the first time a NULL is + * encountered. As qual evaluation is a hot-path this warrants using a + * special opcode for qual evaluation that's simpler than BOOL_AND (which + * has more complex NULL handling). + */ + scratch.opcode = EEOP_QUAL; + + /* + * We can use ExprState's resvalue/resnull as target for each qual expr. + */ + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + foreach(lc, qual) + { + Expr *node = (Expr *) lfirst(lc); + + /* first evaluate expression */ + ExecInitExprRec(node, state, &state->resvalue, &state->resnull); + + /* then emit EEOP_QUAL to detect if it's false (or null) */ + scratch.d.qualexpr.jumpdone = -1; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + /* + * At the end, we don't need to do anything more. The last qual expr must + * have yielded TRUE, and since its result is stored in the desired output + * location, we're done. + */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * ExecInitCheck: prepare a check constraint for execution by ExecCheck + * + * This is much like ExecInitQual/ExecQual, except that a null result from + * the conjunction is treated as TRUE. This behavior is appropriate for + * evaluating CHECK constraints, since SQL specifies that NULL constraint + * conditions are not failures. + * + * Note that like ExecInitQual, this expects input in implicit-AND format. + * Users of ExecCheck that have expressions in normal explicit-AND format + * can just apply ExecInitExpr to produce suitable input for ExecCheck. + */ +ExprState * +ExecInitCheck(List *qual, PlanState *parent) +{ + /* short-circuit (here and in ExecCheck) for empty restriction list */ + if (qual == NIL) + return NULL; + + Assert(IsA(qual, List)); + + /* + * Just convert the implicit-AND list to an explicit AND (if there's more + * than one entry), and compile normally. Unlike ExecQual, we can't + * short-circuit on NULL results, so the regular AND behavior is needed. + */ + return ExecInitExpr(make_ands_explicit(qual), parent); +} + +/* + * Call ExecInitExpr() on a list of expressions, return a list of ExprStates. 
+ */ +List * +ExecInitExprList(List *nodes, PlanState *parent) +{ + List *result = NIL; + ListCell *lc; + + foreach(lc, nodes) + { + Expr *e = lfirst(lc); + + result = lappend(result, ExecInitExpr(e, parent)); + } + + return result; +} + +/* + * ExecBuildProjectionInfo + * + * Build a ProjectionInfo node for evaluating the given tlist in the given + * econtext, and storing the result into the tuple slot. (Caller must have + * ensured that tuple slot has a descriptor matching the tlist!) + * + * inputDesc can be NULL, but if it is not, we check to see whether simple + * Vars in the tlist match the descriptor. It is important to provide + * inputDesc for relation-scan plan nodes, as a cross check that the relation + * hasn't been changed since the plan was made. At higher levels of a plan, + * there is no need to recheck. + * + * This is implemented by internally building an ExprState that performs the + * whole projection in one go. + * + * Caution: before PG v10, the targetList was a list of ExprStates; now it + * should be the planner-created targetlist, since we do the compilation here. + */ +ProjectionInfo * +ExecBuildProjectionInfo(List *targetList, + ExprContext *econtext, + TupleTableSlot *slot, + PlanState *parent, + TupleDesc inputDesc) +{ + ProjectionInfo *projInfo = makeNode(ProjectionInfo); + ExprState *state; + ExprEvalStep scratch = {0}; + ListCell *lc; + + projInfo->pi_exprContext = econtext; + /* We embed ExprState into ProjectionInfo instead of doing extra palloc */ + projInfo->pi_state.tag = T_ExprState; + state = &projInfo->pi_state; + state->expr = (Expr *) targetList; + state->parent = parent; + state->ext_params = NULL; + + state->resultslot = slot; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) targetList); + + /* Now compile each tlist column */ + foreach(lc, targetList) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + Var *variable = NULL; + AttrNumber attnum = 0; + bool isSafeVar = false; + + /* + * If tlist expression is a safe non-system Var, use the fast-path + * ASSIGN_*_VAR opcodes. "Safe" means that we don't need to apply + * CheckVarSlotCompatibility() during plan startup. If a source slot + * was provided, we make the equivalent tests here; if a slot was not + * provided, we assume that no check is needed because we're dealing + * with a non-relation-scan-level expression. + */ + if (tle->expr != NULL && + IsA(tle->expr, Var) && + ((Var *) tle->expr)->varattno > 0) + { + /* Non-system Var, but how safe is it? */ + variable = (Var *) tle->expr; + attnum = variable->varattno; + + if (inputDesc == NULL) + isSafeVar = true; /* can't check, just assume OK */ + else if (attnum <= inputDesc->natts) + { + Form_pg_attribute attr = TupleDescAttr(inputDesc, attnum - 1); + + /* + * If user attribute is dropped or has a type mismatch, don't + * use ASSIGN_*_VAR. Instead let the normal expression + * machinery handle it (which'll possibly error out). 
+ */ + if (!attr->attisdropped && variable->vartype == attr->atttypid) + { + isSafeVar = true; + } + } + } + + if (isSafeVar) + { + /* Fast-path: just generate an EEOP_ASSIGN_*_VAR step */ + switch (variable->varno) + { + case INNER_VAR: + /* get the tuple from the inner node */ + scratch.opcode = EEOP_ASSIGN_INNER_VAR; + break; + + case OUTER_VAR: + /* get the tuple from the outer node */ + scratch.opcode = EEOP_ASSIGN_OUTER_VAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + /* get the tuple from the relation being scanned */ + scratch.opcode = EEOP_ASSIGN_SCAN_VAR; + break; + } + + scratch.d.assign_var.attnum = attnum - 1; + scratch.d.assign_var.resultnum = tle->resno - 1; + ExprEvalPushStep(state, &scratch); + } + else + { + /* + * Otherwise, compile the column expression normally. + * + * We can't tell the expression to evaluate directly into the + * result slot, as the result slot (and the exprstate for that + * matter) can change between executions. We instead evaluate + * into the ExprState's resvalue/resnull and then move. + */ + ExecInitExprRec(tle->expr, state, + &state->resvalue, &state->resnull); + + /* + * Column might be referenced multiple times in upper nodes, so + * force value to R/O - but only if it could be an expanded datum. + */ + if (get_typlen(exprType((Node *) tle->expr)) == -1) + scratch.opcode = EEOP_ASSIGN_TMP_MAKE_RO; + else + scratch.opcode = EEOP_ASSIGN_TMP; + scratch.d.assign_tmp.resultnum = tle->resno - 1; + ExprEvalPushStep(state, &scratch); + } + } + + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return projInfo; +} + +/* + * ExecBuildUpdateProjection + * + * Build a ProjectionInfo node for constructing a new tuple during UPDATE. + * The projection will be executed in the given econtext and the result will + * be stored into the given tuple slot. (Caller must have ensured that tuple + * slot has a descriptor matching the target rel!) + * + * When evalTargetList is false, targetList contains the UPDATE ... SET + * expressions that have already been computed by a subplan node; the values + * from this tlist are assumed to be available in the "outer" tuple slot. + * When evalTargetList is true, targetList contains the UPDATE ... SET + * expressions that must be computed (which could contain references to + * the outer, inner, or scan tuple slots). + * + * In either case, targetColnos contains a list of the target column numbers + * corresponding to the non-resjunk entries of targetList. The tlist values + * are assigned into these columns of the result tuple slot. Target columns + * not listed in targetColnos are filled from the UPDATE's old tuple, which + * is assumed to be available in the "scan" tuple slot. + * + * targetList can also contain resjunk columns. These must be evaluated + * if evalTargetList is true, but their values are discarded. + * + * relDesc must describe the relation we intend to update. + * + * This is basically a specialized variant of ExecBuildProjectionInfo. + * However, it also performs sanity checks equivalent to ExecCheckPlanOutput. + * Since we never make a normal tlist equivalent to the whole + * tuple-to-be-assigned, there is no convenient way to apply + * ExecCheckPlanOutput, so we must do our safety checks here. 
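+ *
+ * As an illustrative example: for "UPDATE t SET b = ..., d = ..." on a
+ * table with columns (a, b, c, d), targetColnos would be the integer
+ * list (2, 4); the new values of b and d come from targetList, while a
+ * and c are copied from the old tuple in the "scan" slot.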
+ */ +ProjectionInfo * +ExecBuildUpdateProjection(List *targetList, + bool evalTargetList, + List *targetColnos, + TupleDesc relDesc, + ExprContext *econtext, + TupleTableSlot *slot, + PlanState *parent) +{ + ProjectionInfo *projInfo = makeNode(ProjectionInfo); + ExprState *state; + int nAssignableCols; + bool sawJunk; + Bitmapset *assignedCols; + LastAttnumInfo deform = {0, 0, 0}; + ExprEvalStep scratch = {0}; + int outerattnum; + ListCell *lc, + *lc2; + + projInfo->pi_exprContext = econtext; + /* We embed ExprState into ProjectionInfo instead of doing extra palloc */ + projInfo->pi_state.tag = T_ExprState; + state = &projInfo->pi_state; + if (evalTargetList) + state->expr = (Expr *) targetList; + else + state->expr = NULL; /* not used */ + state->parent = parent; + state->ext_params = NULL; + + state->resultslot = slot; + + /* + * Examine the targetList to see how many non-junk columns there are, and + * to verify that the non-junk columns come before the junk ones. + */ + nAssignableCols = 0; + sawJunk = false; + foreach(lc, targetList) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + + if (tle->resjunk) + sawJunk = true; + else + { + if (sawJunk) + elog(ERROR, "subplan target list is out of order"); + nAssignableCols++; + } + } + + /* We should have one targetColnos entry per non-junk column */ + if (nAssignableCols != list_length(targetColnos)) + elog(ERROR, "targetColnos does not match subplan target list"); + + /* + * Build a bitmapset of the columns in targetColnos. (We could just use + * list_member_int() tests, but that risks O(N^2) behavior with many + * columns.) + */ + assignedCols = NULL; + foreach(lc, targetColnos) + { + AttrNumber targetattnum = lfirst_int(lc); + + assignedCols = bms_add_member(assignedCols, targetattnum); + } + + /* + * We need to insert EEOP_*_FETCHSOME steps to ensure the input tuples are + * sufficiently deconstructed. The scan tuple must be deconstructed at + * least as far as the last old column we need. + */ + for (int attnum = relDesc->natts; attnum > 0; attnum--) + { + Form_pg_attribute attr = TupleDescAttr(relDesc, attnum - 1); + + if (attr->attisdropped) + continue; + if (bms_is_member(attnum, assignedCols)) + continue; + deform.last_scan = attnum; + break; + } + + /* + * If we're actually evaluating the tlist, incorporate its input + * requirements too; otherwise, we'll just need to fetch the appropriate + * number of columns of the "outer" tuple. + */ + if (evalTargetList) + get_last_attnums_walker((Node *) targetList, &deform); + else + deform.last_outer = nAssignableCols; + + ExecPushExprSlots(state, &deform); + + /* + * Now generate code to evaluate the tlist's assignable expressions or + * fetch them from the outer tuple, incidentally validating that they'll + * be of the right data type. The checks above ensure that the forboth() + * will iterate over exactly the non-junk columns. + */ + outerattnum = 0; + forboth(lc, targetList, lc2, targetColnos) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + AttrNumber targetattnum = lfirst_int(lc2); + Form_pg_attribute attr; + + Assert(!tle->resjunk); + + /* + * Apply sanity checks comparable to ExecCheckPlanOutput(). 
+ */ + if (targetattnum <= 0 || targetattnum > relDesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query has too many columns."))); + attr = TupleDescAttr(relDesc, targetattnum - 1); + + if (attr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query provides a value for a dropped column at ordinal position %d.", + targetattnum))); + if (exprType((Node *) tle->expr) != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Table has type %s at ordinal position %d, but query expects %s.", + format_type_be(attr->atttypid), + targetattnum, + format_type_be(exprType((Node *) tle->expr))))); + + /* OK, generate code to perform the assignment. */ + if (evalTargetList) + { + /* + * We must evaluate the TLE's expression and assign it. We do not + * bother jumping through hoops for "safe" Vars like + * ExecBuildProjectionInfo does; this is a relatively less-used + * path and it doesn't seem worth expending code for that. + */ + ExecInitExprRec(tle->expr, state, + &state->resvalue, &state->resnull); + /* Needn't worry about read-only-ness here, either. */ + scratch.opcode = EEOP_ASSIGN_TMP; + scratch.d.assign_tmp.resultnum = targetattnum - 1; + ExprEvalPushStep(state, &scratch); + } + else + { + /* Just assign from the outer tuple. */ + scratch.opcode = EEOP_ASSIGN_OUTER_VAR; + scratch.d.assign_var.attnum = outerattnum; + scratch.d.assign_var.resultnum = targetattnum - 1; + ExprEvalPushStep(state, &scratch); + } + outerattnum++; + } + + /* + * If we're evaluating the tlist, must evaluate any resjunk columns too. + * (This matters for things like MULTIEXPR_SUBLINK SubPlans.) + */ + if (evalTargetList) + { + for_each_cell(lc, targetList, lc) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + + Assert(tle->resjunk); + ExecInitExprRec(tle->expr, state, + &state->resvalue, &state->resnull); + } + } + + /* + * Now generate code to copy over any old columns that were not assigned + * to, and to ensure that dropped columns are set to NULL. + */ + for (int attnum = 1; attnum <= relDesc->natts; attnum++) + { + Form_pg_attribute attr = TupleDescAttr(relDesc, attnum - 1); + + if (attr->attisdropped) + { + /* Put a null into the ExprState's resvalue/resnull ... */ + scratch.opcode = EEOP_CONST; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + scratch.d.constval.value = (Datum) 0; + scratch.d.constval.isnull = true; + ExprEvalPushStep(state, &scratch); + /* ... then assign it to the result slot */ + scratch.opcode = EEOP_ASSIGN_TMP; + scratch.d.assign_tmp.resultnum = attnum - 1; + ExprEvalPushStep(state, &scratch); + } + else if (!bms_is_member(attnum, assignedCols)) + { + /* Certainly the right type, so needn't check */ + scratch.opcode = EEOP_ASSIGN_SCAN_VAR; + scratch.d.assign_var.attnum = attnum - 1; + scratch.d.assign_var.resultnum = attnum - 1; + ExprEvalPushStep(state, &scratch); + } + } + + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return projInfo; +} + +/* + * ExecPrepareExpr --- initialize for expression execution outside a normal + * Plan tree context. + * + * This differs from ExecInitExpr in that we don't assume the caller is + * already running in the EState's per-query context. 
Also, we run the + * passed expression tree through expression_planner() to prepare it for + * execution. (In ordinary Plan trees the regular planning process will have + * made the appropriate transformations on expressions, but for standalone + * expressions this won't have happened.) + */ +ExprState * +ExecPrepareExpr(Expr *node, EState *estate) +{ + ExprState *result; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + node = expression_planner(node); + + result = ExecInitExpr(node, NULL); + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * ExecPrepareQual --- initialize for qual execution outside a normal + * Plan tree context. + * + * This differs from ExecInitQual in that we don't assume the caller is + * already running in the EState's per-query context. Also, we run the + * passed expression tree through expression_planner() to prepare it for + * execution. (In ordinary Plan trees the regular planning process will have + * made the appropriate transformations on expressions, but for standalone + * expressions this won't have happened.) + */ +ExprState * +ExecPrepareQual(List *qual, EState *estate) +{ + ExprState *result; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + qual = (List *) expression_planner((Expr *) qual); + + result = ExecInitQual(qual, NULL); + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * ExecPrepareCheck -- initialize check constraint for execution outside a + * normal Plan tree context. + * + * See ExecPrepareExpr() and ExecInitCheck() for details. + */ +ExprState * +ExecPrepareCheck(List *qual, EState *estate) +{ + ExprState *result; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + qual = (List *) expression_planner((Expr *) qual); + + result = ExecInitCheck(qual, NULL); + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * Call ExecPrepareExpr() on each member of a list of Exprs, and return + * a list of ExprStates. + * + * See ExecPrepareExpr() for details. + */ +List * +ExecPrepareExprList(List *nodes, EState *estate) +{ + List *result = NIL; + MemoryContext oldcontext; + ListCell *lc; + + /* Ensure that the list cell nodes are in the right context too */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + foreach(lc, nodes) + { + Expr *e = (Expr *) lfirst(lc); + + result = lappend(result, ExecPrepareExpr(e, estate)); + } + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * ExecCheck - evaluate a check constraint + * + * For check constraints, a null result is taken as TRUE, ie the constraint + * passes. + * + * The check constraint may have been prepared with ExecInitCheck + * (possibly via ExecPrepareCheck) if the caller had it in implicit-AND + * format, but a regular boolean expression prepared with ExecInitExpr or + * ExecPrepareExpr works too. + */ +bool +ExecCheck(ExprState *state, ExprContext *econtext) +{ + Datum ret; + bool isnull; + + /* short-circuit (here and in ExecInitCheck) for empty restriction list */ + if (state == NULL) + return true; + + /* verify that expression was not compiled using ExecInitQual */ + Assert(!(state->flags & EEO_FLAG_IS_QUAL)); + + ret = ExecEvalExprSwitchContext(state, econtext, &isnull); + + if (isnull) + return true; + + return DatumGetBool(ret); +} + +/* + * Prepare a compiled expression for execution. This has to be called for + * every ExprState before it can be executed. 
+ * + * NB: While this currently only calls ExecReadyInterpretedExpr(), + * this will likely get extended to further expression evaluation methods. + * Therefore this should be used instead of directly calling + * ExecReadyInterpretedExpr(). + */ +static void +ExecReadyExpr(ExprState *state) +{ + if (jit_compile_expr(state)) + return; + + ExecReadyInterpretedExpr(state); +} + +/* + * Append the steps necessary for the evaluation of node to ExprState->steps, + * possibly recursing into sub-expressions of node. + * + * node - expression to evaluate + * state - ExprState to whose ->steps to append the necessary operations + * resv / resnull - where to store the result of the node into + */ +static void +ExecInitExprRec(Expr *node, ExprState *state, + Datum *resv, bool *resnull) +{ + ExprEvalStep scratch = {0}; + + /* Guard against stack overflow due to overly complex expressions */ + check_stack_depth(); + + /* Step's output location is always what the caller gave us */ + Assert(resv != NULL && resnull != NULL); + scratch.resvalue = resv; + scratch.resnull = resnull; + + /* cases should be ordered as they are in enum NodeTag */ + switch (nodeTag(node)) + { + case T_Var: + { + Var *variable = (Var *) node; + + if (variable->varattno == InvalidAttrNumber) + { + /* whole-row Var */ + ExecInitWholeRowVar(&scratch, variable, state); + } + else if (variable->varattno <= 0) + { + /* system column */ + scratch.d.var.attnum = variable->varattno; + scratch.d.var.vartype = variable->vartype; + switch (variable->varno) + { + case INNER_VAR: + scratch.opcode = EEOP_INNER_SYSVAR; + break; + case OUTER_VAR: + scratch.opcode = EEOP_OUTER_SYSVAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + scratch.opcode = EEOP_SCAN_SYSVAR; + break; + } + } + else + { + /* regular user column */ + scratch.d.var.attnum = variable->varattno - 1; + scratch.d.var.vartype = variable->vartype; + switch (variable->varno) + { + case INNER_VAR: + scratch.opcode = EEOP_INNER_VAR; + break; + case OUTER_VAR: + scratch.opcode = EEOP_OUTER_VAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + scratch.opcode = EEOP_SCAN_VAR; + break; + } + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_Const: + { + Const *con = (Const *) node; + + scratch.opcode = EEOP_CONST; + scratch.d.constval.value = con->constvalue; + scratch.d.constval.isnull = con->constisnull; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_Param: + { + Param *param = (Param *) node; + ParamListInfo params; + + switch (param->paramkind) + { + case PARAM_EXEC: + scratch.opcode = EEOP_PARAM_EXEC; + scratch.d.param.paramid = param->paramid; + scratch.d.param.paramtype = param->paramtype; + ExprEvalPushStep(state, &scratch); + break; + case PARAM_EXTERN: + + /* + * If we have a relevant ParamCompileHook, use it; + * otherwise compile a standard EEOP_PARAM_EXTERN + * step. ext_params, if supplied, takes precedence + * over info from the parent node's EState (if any). 
+ */ + if (state->ext_params) + params = state->ext_params; + else if (state->parent && + state->parent->state) + params = state->parent->state->es_param_list_info; + else + params = NULL; + if (params && params->paramCompile) + { + params->paramCompile(params, param, state, + resv, resnull); + } + else + { + scratch.opcode = EEOP_PARAM_EXTERN; + scratch.d.param.paramid = param->paramid; + scratch.d.param.paramtype = param->paramtype; + ExprEvalPushStep(state, &scratch); + } + break; + default: + elog(ERROR, "unrecognized paramkind: %d", + (int) param->paramkind); + break; + } + break; + } + + case T_Aggref: + { + Aggref *aggref = (Aggref *) node; + + scratch.opcode = EEOP_AGGREF; + scratch.d.aggref.aggno = aggref->aggno; + + if (state->parent && IsA(state->parent, AggState)) + { + AggState *aggstate = (AggState *) state->parent; + + aggstate->aggs = lappend(aggstate->aggs, aggref); + } + else + { + /* planner messed up */ + elog(ERROR, "Aggref found in non-Agg plan node"); + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_GroupingFunc: + { + GroupingFunc *grp_node = (GroupingFunc *) node; + Agg *agg; + + if (!state->parent || !IsA(state->parent, AggState) || + !IsA(state->parent->plan, Agg)) + elog(ERROR, "GroupingFunc found in non-Agg plan node"); + + scratch.opcode = EEOP_GROUPING_FUNC; + + agg = (Agg *) (state->parent->plan); + + if (agg->groupingSets) + scratch.d.grouping_func.clauses = grp_node->cols; + else + scratch.d.grouping_func.clauses = NIL; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_WindowFunc: + { + WindowFunc *wfunc = (WindowFunc *) node; + WindowFuncExprState *wfstate = makeNode(WindowFuncExprState); + + wfstate->wfunc = wfunc; + + if (state->parent && IsA(state->parent, WindowAggState)) + { + WindowAggState *winstate = (WindowAggState *) state->parent; + int nfuncs; + + winstate->funcs = lappend(winstate->funcs, wfstate); + nfuncs = ++winstate->numfuncs; + if (wfunc->winagg) + winstate->numaggs++; + + /* for now initialize agg using old style expressions */ + wfstate->args = ExecInitExprList(wfunc->args, + state->parent); + wfstate->aggfilter = ExecInitExpr(wfunc->aggfilter, + state->parent); + + /* + * Complain if the windowfunc's arguments contain any + * windowfuncs; nested window functions are semantically + * nonsensical. (This should have been caught earlier, + * but we defend against it here anyway.) 
+ */ + if (nfuncs != winstate->numfuncs) + ereport(ERROR, + (errcode(ERRCODE_WINDOWING_ERROR), + errmsg("window function calls cannot be nested"))); + } + else + { + /* planner messed up */ + elog(ERROR, "WindowFunc found in non-WindowAgg plan node"); + } + + scratch.opcode = EEOP_WINDOW_FUNC; + scratch.d.window_func.wfstate = wfstate; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_SubscriptingRef: + { + SubscriptingRef *sbsref = (SubscriptingRef *) node; + + ExecInitSubscriptingRef(&scratch, sbsref, state, resv, resnull); + break; + } + + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) node; + + ExecInitFunc(&scratch, node, + func->args, func->funcid, func->inputcollid, + state); + ExprEvalPushStep(state, &scratch); + break; + } + + case T_OpExpr: + { + OpExpr *op = (OpExpr *) node; + + ExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + state); + ExprEvalPushStep(state, &scratch); + break; + } + + case T_DistinctExpr: + { + DistinctExpr *op = (DistinctExpr *) node; + + ExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + state); + + /* + * Change opcode of call instruction to EEOP_DISTINCT. + * + * XXX: historically we've not called the function usage + * pgstat infrastructure - that seems inconsistent given that + * we do so for normal function *and* operator evaluation. If + * we decided to do that here, we'd probably want separate + * opcodes for FUSAGE or not. + */ + scratch.opcode = EEOP_DISTINCT; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_NullIfExpr: + { + NullIfExpr *op = (NullIfExpr *) node; + + ExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + state); + + /* + * Change opcode of call instruction to EEOP_NULLIF. + * + * XXX: historically we've not called the function usage + * pgstat infrastructure - that seems inconsistent given that + * we do so for normal function *and* operator evaluation. If + * we decided to do that here, we'd probably want separate + * opcodes for FUSAGE or not. + */ + scratch.opcode = EEOP_NULLIF; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ScalarArrayOpExpr: + { + ScalarArrayOpExpr *opexpr = (ScalarArrayOpExpr *) node; + Expr *scalararg; + Expr *arrayarg; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + FmgrInfo *hash_finfo; + FunctionCallInfo hash_fcinfo; + + Assert(list_length(opexpr->args) == 2); + scalararg = (Expr *) linitial(opexpr->args); + arrayarg = (Expr *) lsecond(opexpr->args); + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(opexpr->opfuncid, + GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(opexpr->opfuncid)); + InvokeFunctionExecuteHook(opexpr->opfuncid); + + if (OidIsValid(opexpr->hashfuncid)) + { + aclresult = pg_proc_aclcheck(opexpr->hashfuncid, + GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(opexpr->hashfuncid)); + InvokeFunctionExecuteHook(opexpr->hashfuncid); + } + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(opexpr->opfuncid, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + opexpr->inputcollid, NULL, NULL); + + /* + * If hashfuncid is set, we create a EEOP_HASHED_SCALARARRAYOP + * step instead of a EEOP_SCALARARRAYOP. 
This provides much + * faster lookup performance than the normal linear search + * when the number of items in the array is anything but very + * small. + */ + if (OidIsValid(opexpr->hashfuncid)) + { + hash_finfo = palloc0(sizeof(FmgrInfo)); + hash_fcinfo = palloc0(SizeForFunctionCallInfo(1)); + fmgr_info(opexpr->hashfuncid, hash_finfo); + fmgr_info_set_expr((Node *) node, hash_finfo); + InitFunctionCallInfoData(*hash_fcinfo, hash_finfo, + 1, opexpr->inputcollid, NULL, + NULL); + + scratch.d.hashedscalararrayop.hash_finfo = hash_finfo; + scratch.d.hashedscalararrayop.hash_fcinfo_data = hash_fcinfo; + scratch.d.hashedscalararrayop.hash_fn_addr = hash_finfo->fn_addr; + + /* Evaluate scalar directly into left function argument */ + ExecInitExprRec(scalararg, state, + &fcinfo->args[0].value, &fcinfo->args[0].isnull); + + /* + * Evaluate array argument into our return value. There's + * no danger in that, because the return value is + * guaranteed to be overwritten by + * EEOP_HASHED_SCALARARRAYOP, and will not be passed to + * any other expression. + */ + ExecInitExprRec(arrayarg, state, resv, resnull); + + /* And perform the operation */ + scratch.opcode = EEOP_HASHED_SCALARARRAYOP; + scratch.d.hashedscalararrayop.finfo = finfo; + scratch.d.hashedscalararrayop.fcinfo_data = fcinfo; + scratch.d.hashedscalararrayop.fn_addr = finfo->fn_addr; + + scratch.d.hashedscalararrayop.hash_finfo = hash_finfo; + scratch.d.hashedscalararrayop.hash_fcinfo_data = hash_fcinfo; + scratch.d.hashedscalararrayop.hash_fn_addr = hash_finfo->fn_addr; + + ExprEvalPushStep(state, &scratch); + } + else + { + /* Evaluate scalar directly into left function argument */ + ExecInitExprRec(scalararg, state, + &fcinfo->args[0].value, + &fcinfo->args[0].isnull); + + /* + * Evaluate array argument into our return value. There's + * no danger in that, because the return value is + * guaranteed to be overwritten by EEOP_SCALARARRAYOP, and + * will not be passed to any other expression. + */ + ExecInitExprRec(arrayarg, state, resv, resnull); + + /* And perform the operation */ + scratch.opcode = EEOP_SCALARARRAYOP; + scratch.d.scalararrayop.element_type = InvalidOid; + scratch.d.scalararrayop.useOr = opexpr->useOr; + scratch.d.scalararrayop.finfo = finfo; + scratch.d.scalararrayop.fcinfo_data = fcinfo; + scratch.d.scalararrayop.fn_addr = finfo->fn_addr; + ExprEvalPushStep(state, &scratch); + } + break; + } + + case T_BoolExpr: + { + BoolExpr *boolexpr = (BoolExpr *) node; + int nargs = list_length(boolexpr->args); + List *adjust_jumps = NIL; + int off; + ListCell *lc; + + /* allocate scratch memory used by all steps of AND/OR */ + if (boolexpr->boolop != NOT_EXPR) + scratch.d.boolexpr.anynull = (bool *) palloc(sizeof(bool)); + + /* + * For each argument evaluate the argument itself, then + * perform the bool operation's appropriate handling. + * + * We can evaluate each argument into our result area, since + * the short-circuiting logic means we only need to remember + * previous NULL values. + * + * AND/OR is split into separate STEP_FIRST (one) / STEP (zero + * or more) / STEP_LAST (one) steps, as each of those has to + * perform different work. The FIRST/LAST split is valid + * because AND/OR have at least two arguments. 
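+ *
+ * For instance, a three-argument AND compiles roughly to
+ *		evaluate arg1; EEOP_BOOL_AND_STEP_FIRST
+ *		evaluate arg2; EEOP_BOOL_AND_STEP
+ *		evaluate arg3; EEOP_BOOL_AND_STEP_LAST
+ * where each step can jump straight to the end of the AND once a FALSE
+ * argument has been seen.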
+ */ + off = 0; + foreach(lc, boolexpr->args) + { + Expr *arg = (Expr *) lfirst(lc); + + /* Evaluate argument into our output variable */ + ExecInitExprRec(arg, state, resv, resnull); + + /* Perform the appropriate step type */ + switch (boolexpr->boolop) + { + case AND_EXPR: + Assert(nargs >= 2); + + if (off == 0) + scratch.opcode = EEOP_BOOL_AND_STEP_FIRST; + else if (off + 1 == nargs) + scratch.opcode = EEOP_BOOL_AND_STEP_LAST; + else + scratch.opcode = EEOP_BOOL_AND_STEP; + break; + case OR_EXPR: + Assert(nargs >= 2); + + if (off == 0) + scratch.opcode = EEOP_BOOL_OR_STEP_FIRST; + else if (off + 1 == nargs) + scratch.opcode = EEOP_BOOL_OR_STEP_LAST; + else + scratch.opcode = EEOP_BOOL_OR_STEP; + break; + case NOT_EXPR: + Assert(nargs == 1); + + scratch.opcode = EEOP_BOOL_NOT_STEP; + break; + default: + elog(ERROR, "unrecognized boolop: %d", + (int) boolexpr->boolop); + break; + } + + scratch.d.boolexpr.jumpdone = -1; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + off++; + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->d.boolexpr.jumpdone == -1); + as->d.boolexpr.jumpdone = state->steps_len; + } + + break; + } + + case T_SubPlan: + { + SubPlan *subplan = (SubPlan *) node; + SubPlanState *sstate; + + if (!state->parent) + elog(ERROR, "SubPlan found with no parent plan"); + + sstate = ExecInitSubPlan(subplan, state->parent); + + /* add SubPlanState nodes to state->parent->subPlan */ + state->parent->subPlan = lappend(state->parent->subPlan, + sstate); + + scratch.opcode = EEOP_SUBPLAN; + scratch.d.subplan.sstate = sstate; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_FieldSelect: + { + FieldSelect *fselect = (FieldSelect *) node; + + /* evaluate row/record argument into result area */ + ExecInitExprRec(fselect->arg, state, resv, resnull); + + /* and extract field */ + scratch.opcode = EEOP_FIELDSELECT; + scratch.d.fieldselect.fieldnum = fselect->fieldnum; + scratch.d.fieldselect.resulttype = fselect->resulttype; + scratch.d.fieldselect.rowcache.cacheptr = NULL; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_FieldStore: + { + FieldStore *fstore = (FieldStore *) node; + TupleDesc tupDesc; + ExprEvalRowtypeCache *rowcachep; + Datum *values; + bool *nulls; + int ncolumns; + ListCell *l1, + *l2; + + /* find out the number of columns in the composite type */ + tupDesc = lookup_rowtype_tupdesc(fstore->resulttype, -1); + ncolumns = tupDesc->natts; + DecrTupleDescRefCount(tupDesc); + + /* create workspace for column values */ + values = (Datum *) palloc(sizeof(Datum) * ncolumns); + nulls = (bool *) palloc(sizeof(bool) * ncolumns); + + /* create shared composite-type-lookup cache struct */ + rowcachep = palloc(sizeof(ExprEvalRowtypeCache)); + rowcachep->cacheptr = NULL; + + /* emit code to evaluate the composite input value */ + ExecInitExprRec(fstore->arg, state, resv, resnull); + + /* next, deform the input tuple into our workspace */ + scratch.opcode = EEOP_FIELDSTORE_DEFORM; + scratch.d.fieldstore.fstore = fstore; + scratch.d.fieldstore.rowcache = rowcachep; + scratch.d.fieldstore.values = values; + scratch.d.fieldstore.nulls = nulls; + scratch.d.fieldstore.ncolumns = ncolumns; + ExprEvalPushStep(state, &scratch); + + /* evaluate new field values, store in workspace columns */ + forboth(l1, fstore->newvals, l2, fstore->fieldnums) + { + Expr *e = (Expr *) lfirst(l1); + AttrNumber fieldnum = lfirst_int(l2); + Datum 
*save_innermost_caseval; + bool *save_innermost_casenull; + + if (fieldnum <= 0 || fieldnum > ncolumns) + elog(ERROR, "field number %d is out of range in FieldStore", + fieldnum); + + /* + * Use the CaseTestExpr mechanism to pass down the old + * value of the field being replaced; this is needed in + * case the newval is itself a FieldStore or + * SubscriptingRef that has to obtain and modify the old + * value. It's safe to reuse the CASE mechanism because + * there cannot be a CASE between here and where the value + * would be needed, and a field assignment can't be within + * a CASE either. (So saving and restoring + * innermost_caseval is just paranoia, but let's do it + * anyway.) + * + * Another non-obvious point is that it's safe to use the + * field's values[]/nulls[] entries as both the caseval + * source and the result address for this subexpression. + * That's okay only because (1) both FieldStore and + * SubscriptingRef evaluate their arg or refexpr inputs + * first, and (2) any such CaseTestExpr is directly the + * arg or refexpr input. So any read of the caseval will + * occur before there's a chance to overwrite it. Also, + * if multiple entries in the newvals/fieldnums lists + * target the same field, they'll effectively be applied + * left-to-right which is what we want. + */ + save_innermost_caseval = state->innermost_caseval; + save_innermost_casenull = state->innermost_casenull; + state->innermost_caseval = &values[fieldnum - 1]; + state->innermost_casenull = &nulls[fieldnum - 1]; + + ExecInitExprRec(e, state, + &values[fieldnum - 1], + &nulls[fieldnum - 1]); + + state->innermost_caseval = save_innermost_caseval; + state->innermost_casenull = save_innermost_casenull; + } + + /* finally, form result tuple */ + scratch.opcode = EEOP_FIELDSTORE_FORM; + scratch.d.fieldstore.fstore = fstore; + scratch.d.fieldstore.rowcache = rowcachep; + scratch.d.fieldstore.values = values; + scratch.d.fieldstore.nulls = nulls; + scratch.d.fieldstore.ncolumns = ncolumns; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_RelabelType: + { + /* relabel doesn't need to do anything at runtime */ + RelabelType *relabel = (RelabelType *) node; + + ExecInitExprRec(relabel->arg, state, resv, resnull); + break; + } + + case T_CoerceViaIO: + { + CoerceViaIO *iocoerce = (CoerceViaIO *) node; + Oid iofunc; + bool typisvarlena; + Oid typioparam; + FunctionCallInfo fcinfo_in; + + /* evaluate argument into step's result area */ + ExecInitExprRec(iocoerce->arg, state, resv, resnull); + + /* + * Prepare both output and input function calls, to be + * evaluated inside a single evaluation step for speed - this + * can be a very common operation. + * + * We don't check permissions here as a type's input/output + * function are assumed to be executable by everyone. 
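+ *
+ * Conceptually the coercion amounts to (a sketch only; the step uses
+ * the FunctionCallInfos prepared below rather than these convenience
+ * wrappers)
+ *		str = OutputFunctionCall(finfo_out, value);
+ *		result = InputFunctionCall(finfo_in, str, typioparam, -1);
+ * performed as a single expression step.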
+ */ + scratch.opcode = EEOP_IOCOERCE; + + /* lookup the source type's output function */ + scratch.d.iocoerce.finfo_out = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.fcinfo_data_out = palloc0(SizeForFunctionCallInfo(1)); + + getTypeOutputInfo(exprType((Node *) iocoerce->arg), + &iofunc, &typisvarlena); + fmgr_info(iofunc, scratch.d.iocoerce.finfo_out); + fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_out); + InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_out, + scratch.d.iocoerce.finfo_out, + 1, InvalidOid, NULL, NULL); + + /* lookup the result type's input function */ + scratch.d.iocoerce.finfo_in = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.fcinfo_data_in = palloc0(SizeForFunctionCallInfo(3)); + + getTypeInputInfo(iocoerce->resulttype, + &iofunc, &typioparam); + fmgr_info(iofunc, scratch.d.iocoerce.finfo_in); + fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_in); + InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_in, + scratch.d.iocoerce.finfo_in, + 3, InvalidOid, NULL, NULL); + + /* + * We can preload the second and third arguments for the input + * function, since they're constants. + */ + fcinfo_in = scratch.d.iocoerce.fcinfo_data_in; + fcinfo_in->args[1].value = ObjectIdGetDatum(typioparam); + fcinfo_in->args[1].isnull = false; + fcinfo_in->args[2].value = Int32GetDatum(-1); + fcinfo_in->args[2].isnull = false; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ArrayCoerceExpr: + { + ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; + Oid resultelemtype; + ExprState *elemstate; + + /* evaluate argument into step's result area */ + ExecInitExprRec(acoerce->arg, state, resv, resnull); + + resultelemtype = get_element_type(acoerce->resulttype); + if (!OidIsValid(resultelemtype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("target type is not an array"))); + + /* + * Construct a sub-expression for the per-element expression; + * but don't ready it until after we check it for triviality. + * We assume it hasn't any Var references, but does have a + * CaseTestExpr representing the source array element values. 
+ */ + elemstate = makeNode(ExprState); + elemstate->expr = acoerce->elemexpr; + elemstate->parent = state->parent; + elemstate->ext_params = state->ext_params; + + elemstate->innermost_caseval = (Datum *) palloc(sizeof(Datum)); + elemstate->innermost_casenull = (bool *) palloc(sizeof(bool)); + + ExecInitExprRec(acoerce->elemexpr, elemstate, + &elemstate->resvalue, &elemstate->resnull); + + if (elemstate->steps_len == 1 && + elemstate->steps[0].opcode == EEOP_CASE_TESTVAL) + { + /* Trivial, so we need no per-element work at runtime */ + elemstate = NULL; + } + else + { + /* Not trivial, so append a DONE step */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(elemstate, &scratch); + /* and ready the subexpression */ + ExecReadyExpr(elemstate); + } + + scratch.opcode = EEOP_ARRAYCOERCE; + scratch.d.arraycoerce.elemexprstate = elemstate; + scratch.d.arraycoerce.resultelemtype = resultelemtype; + + if (elemstate) + { + /* Set up workspace for array_map */ + scratch.d.arraycoerce.amstate = + (ArrayMapState *) palloc0(sizeof(ArrayMapState)); + } + else + { + /* Don't need workspace if there's no subexpression */ + scratch.d.arraycoerce.amstate = NULL; + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ConvertRowtypeExpr: + { + ConvertRowtypeExpr *convert = (ConvertRowtypeExpr *) node; + ExprEvalRowtypeCache *rowcachep; + + /* cache structs must be out-of-line for space reasons */ + rowcachep = palloc(2 * sizeof(ExprEvalRowtypeCache)); + rowcachep[0].cacheptr = NULL; + rowcachep[1].cacheptr = NULL; + + /* evaluate argument into step's result area */ + ExecInitExprRec(convert->arg, state, resv, resnull); + + /* and push conversion step */ + scratch.opcode = EEOP_CONVERT_ROWTYPE; + scratch.d.convert_rowtype.inputtype = + exprType((Node *) convert->arg); + scratch.d.convert_rowtype.outputtype = convert->resulttype; + scratch.d.convert_rowtype.incache = &rowcachep[0]; + scratch.d.convert_rowtype.outcache = &rowcachep[1]; + scratch.d.convert_rowtype.map = NULL; + + ExprEvalPushStep(state, &scratch); + break; + } + + /* note that CaseWhen expressions are handled within this block */ + case T_CaseExpr: + { + CaseExpr *caseExpr = (CaseExpr *) node; + List *adjust_jumps = NIL; + Datum *caseval = NULL; + bool *casenull = NULL; + ListCell *lc; + + /* + * If there's a test expression, we have to evaluate it and + * save the value where the CaseTestExpr placeholders can find + * it. + */ + if (caseExpr->arg != NULL) + { + /* Evaluate testexpr into caseval/casenull workspace */ + caseval = palloc(sizeof(Datum)); + casenull = palloc(sizeof(bool)); + + ExecInitExprRec(caseExpr->arg, state, + caseval, casenull); + + /* + * Since value might be read multiple times, force to R/O + * - but only if it could be an expanded datum. + */ + if (get_typlen(exprType((Node *) caseExpr->arg)) == -1) + { + /* change caseval in-place */ + scratch.opcode = EEOP_MAKE_READONLY; + scratch.resvalue = caseval; + scratch.resnull = casenull; + scratch.d.make_readonly.value = caseval; + scratch.d.make_readonly.isnull = casenull; + ExprEvalPushStep(state, &scratch); + /* restore normal settings of scratch fields */ + scratch.resvalue = resv; + scratch.resnull = resnull; + } + } + + /* + * Prepare to evaluate each of the WHEN clauses in turn; as + * soon as one is true we return the value of the + * corresponding THEN clause. If none are true then we return + * the value of the ELSE clause, or NULL if there is none. 
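+ *
+ * Roughly, each WHEN/THEN arm compiles to
+ *		evaluate WHEN condition
+ *		EEOP_JUMP_IF_NOT_TRUE	(to the next arm, or to the ELSE)
+ *		evaluate THEN result
+ *		EEOP_JUMP				(to the end of the CASE)
+ * with the ELSE expression (transformCaseExpr supplies a NULL default
+ * if none was written) evaluated last.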
+ */ + foreach(lc, caseExpr->args) + { + CaseWhen *when = (CaseWhen *) lfirst(lc); + Datum *save_innermost_caseval; + bool *save_innermost_casenull; + int whenstep; + + /* + * Make testexpr result available to CaseTestExpr nodes + * within the condition. We must save and restore prior + * setting of innermost_caseval fields, in case this node + * is itself within a larger CASE. + * + * If there's no test expression, we don't actually need + * to save and restore these fields; but it's less code to + * just do so unconditionally. + */ + save_innermost_caseval = state->innermost_caseval; + save_innermost_casenull = state->innermost_casenull; + state->innermost_caseval = caseval; + state->innermost_casenull = casenull; + + /* evaluate condition into CASE's result variables */ + ExecInitExprRec(when->expr, state, resv, resnull); + + state->innermost_caseval = save_innermost_caseval; + state->innermost_casenull = save_innermost_casenull; + + /* If WHEN result isn't true, jump to next CASE arm */ + scratch.opcode = EEOP_JUMP_IF_NOT_TRUE; + scratch.d.jump.jumpdone = -1; /* computed later */ + ExprEvalPushStep(state, &scratch); + whenstep = state->steps_len - 1; + + /* + * If WHEN result is true, evaluate THEN result, storing + * it into the CASE's result variables. + */ + ExecInitExprRec(when->result, state, resv, resnull); + + /* Emit JUMP step to jump to end of CASE's code */ + scratch.opcode = EEOP_JUMP; + scratch.d.jump.jumpdone = -1; /* computed later */ + ExprEvalPushStep(state, &scratch); + + /* + * Don't know address for that jump yet, compute once the + * whole CASE expression is built. + */ + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + + /* + * But we can set WHEN test's jump target now, to make it + * jump to the next WHEN subexpression or the ELSE. + */ + state->steps[whenstep].d.jump.jumpdone = state->steps_len; + } + + /* transformCaseExpr always adds a default */ + Assert(caseExpr->defresult); + + /* evaluate ELSE expr into CASE's result variables */ + ExecInitExprRec(caseExpr->defresult, state, + resv, resnull); + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_JUMP); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + + break; + } + + case T_CaseTestExpr: + { + /* + * Read from location identified by innermost_caseval. Note + * that innermost_caseval could be NULL, if this node isn't + * actually within a CaseExpr, ArrayCoerceExpr, etc structure. + * That can happen because some parts of the system abuse + * CaseTestExpr to cause a read of a value externally supplied + * in econtext->caseValue_datum. We'll take care of that + * scenario at runtime. + */ + scratch.opcode = EEOP_CASE_TESTVAL; + scratch.d.casetest.value = state->innermost_caseval; + scratch.d.casetest.isnull = state->innermost_casenull; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ArrayExpr: + { + ArrayExpr *arrayexpr = (ArrayExpr *) node; + int nelems = list_length(arrayexpr->elements); + ListCell *lc; + int elemoff; + + /* + * Evaluate by computing each element, and then forming the + * array. Elements are computed into scratch arrays + * associated with the ARRAYEXPR step. 
+ */ + scratch.opcode = EEOP_ARRAYEXPR; + scratch.d.arrayexpr.elemvalues = + (Datum *) palloc(sizeof(Datum) * nelems); + scratch.d.arrayexpr.elemnulls = + (bool *) palloc(sizeof(bool) * nelems); + scratch.d.arrayexpr.nelems = nelems; + + /* fill remaining fields of step */ + scratch.d.arrayexpr.multidims = arrayexpr->multidims; + scratch.d.arrayexpr.elemtype = arrayexpr->element_typeid; + + /* do one-time catalog lookup for type info */ + get_typlenbyvalalign(arrayexpr->element_typeid, + &scratch.d.arrayexpr.elemlength, + &scratch.d.arrayexpr.elembyval, + &scratch.d.arrayexpr.elemalign); + + /* prepare to evaluate all arguments */ + elemoff = 0; + foreach(lc, arrayexpr->elements) + { + Expr *e = (Expr *) lfirst(lc); + + ExecInitExprRec(e, state, + &scratch.d.arrayexpr.elemvalues[elemoff], + &scratch.d.arrayexpr.elemnulls[elemoff]); + elemoff++; + } + + /* and then collect all into an array */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_RowExpr: + { + RowExpr *rowexpr = (RowExpr *) node; + int nelems = list_length(rowexpr->args); + TupleDesc tupdesc; + int i; + ListCell *l; + + /* Build tupdesc to describe result tuples */ + if (rowexpr->row_typeid == RECORDOID) + { + /* generic record, use types of given expressions */ + tupdesc = ExecTypeFromExprList(rowexpr->args); + /* ... but adopt RowExpr's column aliases */ + ExecTypeSetColNames(tupdesc, rowexpr->colnames); + /* Bless the tupdesc so it can be looked up later */ + BlessTupleDesc(tupdesc); + } + else + { + /* it's been cast to a named type, use that */ + tupdesc = lookup_rowtype_tupdesc_copy(rowexpr->row_typeid, -1); + } + + /* + * In the named-type case, the tupdesc could have more columns + * than are in the args list, since the type might have had + * columns added since the ROW() was parsed. We want those + * extra columns to go to nulls, so we make sure that the + * workspace arrays are large enough and then initialize any + * extra columns to read as NULLs. + */ + Assert(nelems <= tupdesc->natts); + nelems = Max(nelems, tupdesc->natts); + + /* + * Evaluate by first building datums for each field, and then + * a final step forming the composite datum. + */ + scratch.opcode = EEOP_ROW; + scratch.d.row.tupdesc = tupdesc; + + /* space for the individual field datums */ + scratch.d.row.elemvalues = + (Datum *) palloc(sizeof(Datum) * nelems); + scratch.d.row.elemnulls = + (bool *) palloc(sizeof(bool) * nelems); + /* as explained above, make sure any extra columns are null */ + memset(scratch.d.row.elemnulls, true, sizeof(bool) * nelems); + + /* Set up evaluation, skipping any deleted columns */ + i = 0; + foreach(l, rowexpr->args) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + Expr *e = (Expr *) lfirst(l); + + if (!att->attisdropped) + { + /* + * Guard against ALTER COLUMN TYPE on rowtype since + * the RowExpr was created. XXX should we check + * typmod too? Not sure we can be sure it'll be the + * same. + */ + if (exprType((Node *) e) != att->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("ROW() column has type %s instead of type %s", + format_type_be(exprType((Node *) e)), + format_type_be(att->atttypid)))); + } + else + { + /* + * Ignore original expression and insert a NULL. We + * don't really care what type of NULL it is, so + * always make an int4 NULL. 
+ */ + e = (Expr *) makeNullConst(INT4OID, -1, InvalidOid); + } + + /* Evaluate column expr into appropriate workspace slot */ + ExecInitExprRec(e, state, + &scratch.d.row.elemvalues[i], + &scratch.d.row.elemnulls[i]); + i++; + } + + /* And finally build the row value */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_RowCompareExpr: + { + RowCompareExpr *rcexpr = (RowCompareExpr *) node; + int nopers = list_length(rcexpr->opnos); + List *adjust_jumps = NIL; + ListCell *l_left_expr, + *l_right_expr, + *l_opno, + *l_opfamily, + *l_inputcollid; + ListCell *lc; + + /* + * Iterate over each field, prepare comparisons. To handle + * NULL results, prepare jumps to after the expression. If a + * comparison yields a != 0 result, jump to the final step. + */ + Assert(list_length(rcexpr->largs) == nopers); + Assert(list_length(rcexpr->rargs) == nopers); + Assert(list_length(rcexpr->opfamilies) == nopers); + Assert(list_length(rcexpr->inputcollids) == nopers); + + forfive(l_left_expr, rcexpr->largs, + l_right_expr, rcexpr->rargs, + l_opno, rcexpr->opnos, + l_opfamily, rcexpr->opfamilies, + l_inputcollid, rcexpr->inputcollids) + { + Expr *left_expr = (Expr *) lfirst(l_left_expr); + Expr *right_expr = (Expr *) lfirst(l_right_expr); + Oid opno = lfirst_oid(l_opno); + Oid opfamily = lfirst_oid(l_opfamily); + Oid inputcollid = lfirst_oid(l_inputcollid); + int strategy; + Oid lefttype; + Oid righttype; + Oid proc; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + + get_op_opfamily_properties(opno, opfamily, false, + &strategy, + &lefttype, + &righttype); + proc = get_opfamily_proc(opfamily, + lefttype, + righttype, + BTORDER_PROC); + if (!OidIsValid(proc)) + elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, lefttype, righttype, opfamily); + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(proc, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + inputcollid, NULL, NULL); + + /* + * If we enforced permissions checks on index support + * functions, we'd need to make a check here. But the + * index support machinery doesn't do that, and thus + * neither does this code. + */ + + /* evaluate left and right args directly into fcinfo */ + ExecInitExprRec(left_expr, state, + &fcinfo->args[0].value, &fcinfo->args[0].isnull); + ExecInitExprRec(right_expr, state, + &fcinfo->args[1].value, &fcinfo->args[1].isnull); + + scratch.opcode = EEOP_ROWCOMPARE_STEP; + scratch.d.rowcompare_step.finfo = finfo; + scratch.d.rowcompare_step.fcinfo_data = fcinfo; + scratch.d.rowcompare_step.fn_addr = finfo->fn_addr; + /* jump targets filled below */ + scratch.d.rowcompare_step.jumpnull = -1; + scratch.d.rowcompare_step.jumpdone = -1; + + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* + * We could have a zero-column rowtype, in which case the rows + * necessarily compare equal. 
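
/*
 * The control flow compiled for RowCompareExpr above, restated as a plain
 * loop over already-evaluated field values: the first pair containing a NULL
 * makes the whole comparison NULL, the first nonzero comparison decides it,
 * and only if every pair compares equal does the result fall through as 0
 * (which also covers the zero-column case).  cmp() is a stand-in for the
 * looked-up btree support function, and long stands in for Datum.
 */
#include <stdbool.h>

typedef struct RowCmpResult
{
	bool	isnull;
	int		cmp;			/* interpreted by the final step (rctype) */
} RowCmpResult;

static RowCmpResult
compare_rows(const long *a, const bool *anull,
			 const long *b, const bool *bnull,
			 int nfields, int (*cmp) (long, long))
{
	RowCmpResult result = {false, 0};

	for (int i = 0; i < nfields; i++)
	{
		if (anull[i] || bnull[i])
		{
			/* corresponds to jumping to the "jumpnull" target */
			result.isnull = true;
			return result;
		}
		result.cmp = cmp(a[i], b[i]);
		if (result.cmp != 0)
			return result;	/* corresponds to jumping to "jumpdone" */
	}
	return result;			/* all fields compared equal */
}
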
+ */ + if (nopers == 0) + { + scratch.opcode = EEOP_CONST; + scratch.d.constval.value = Int32GetDatum(0); + scratch.d.constval.isnull = false; + ExprEvalPushStep(state, &scratch); + } + + /* Finally, examine the last comparison result */ + scratch.opcode = EEOP_ROWCOMPARE_FINAL; + scratch.d.rowcompare_final.rctype = rcexpr->rctype; + ExprEvalPushStep(state, &scratch); + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_ROWCOMPARE_STEP); + Assert(as->d.rowcompare_step.jumpdone == -1); + Assert(as->d.rowcompare_step.jumpnull == -1); + + /* jump to comparison evaluation */ + as->d.rowcompare_step.jumpdone = state->steps_len - 1; + /* jump to the following expression */ + as->d.rowcompare_step.jumpnull = state->steps_len; + } + + break; + } + + case T_CoalesceExpr: + { + CoalesceExpr *coalesce = (CoalesceExpr *) node; + List *adjust_jumps = NIL; + ListCell *lc; + + /* We assume there's at least one arg */ + Assert(coalesce->args != NIL); + + /* + * Prepare evaluation of all coalesced arguments, after each + * one push a step that short-circuits if not null. + */ + foreach(lc, coalesce->args) + { + Expr *e = (Expr *) lfirst(lc); + + /* evaluate argument, directly into result datum */ + ExecInitExprRec(e, state, resv, resnull); + + /* if it's not null, skip to end of COALESCE expr */ + scratch.opcode = EEOP_JUMP_IF_NOT_NULL; + scratch.d.jump.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, &scratch); + + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* + * No need to add a constant NULL return - we only can get to + * the end of the expression if a NULL already is being + * returned. + */ + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_JUMP_IF_NOT_NULL); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + + break; + } + + case T_MinMaxExpr: + { + MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; + int nelems = list_length(minmaxexpr->args); + TypeCacheEntry *typentry; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + ListCell *lc; + int off; + + /* Look up the btree comparison function for the datatype */ + typentry = lookup_type_cache(minmaxexpr->minmaxtype, + TYPECACHE_CMP_PROC); + if (!OidIsValid(typentry->cmp_proc)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(minmaxexpr->minmaxtype)))); + + /* + * If we enforced permissions checks on index support + * functions, we'd need to make a check here. But the index + * support machinery doesn't do that, and thus neither does + * this code. 
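
/*
 * The short-circuit shape of the COALESCE compilation above, restated as a
 * plain loop: each argument is "evaluated" into the shared result slot and
 * evaluation stops at the first non-NULL value; if every argument is NULL
 * the result simply stays NULL, which is why no trailing NULL constant has
 * to be emitted.  Nullable is a simplified stand-in for resvalue/resnull.
 */
#include <stdbool.h>

typedef struct Nullable
{
	bool	isnull;
	long	value;
} Nullable;

static Nullable
coalesce_values(const Nullable *args, int nargs)
{
	Nullable	result = {true, 0};

	for (int i = 0; i < nargs; i++)
	{
		result = args[i];	/* evaluate argument into the result slot */
		if (!result.isnull)
			break;			/* EEOP_JUMP_IF_NOT_NULL: skip to the end */
	}
	return result;
}
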
+ */ + + /* Perform function lookup */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(typentry->cmp_proc, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + minmaxexpr->inputcollid, NULL, NULL); + + scratch.opcode = EEOP_MINMAX; + /* allocate space to store arguments */ + scratch.d.minmax.values = + (Datum *) palloc(sizeof(Datum) * nelems); + scratch.d.minmax.nulls = + (bool *) palloc(sizeof(bool) * nelems); + scratch.d.minmax.nelems = nelems; + + scratch.d.minmax.op = minmaxexpr->op; + scratch.d.minmax.finfo = finfo; + scratch.d.minmax.fcinfo_data = fcinfo; + + /* evaluate expressions into minmax->values/nulls */ + off = 0; + foreach(lc, minmaxexpr->args) + { + Expr *e = (Expr *) lfirst(lc); + + ExecInitExprRec(e, state, + &scratch.d.minmax.values[off], + &scratch.d.minmax.nulls[off]); + off++; + } + + /* and push the final comparison */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_SQLValueFunction: + { + SQLValueFunction *svf = (SQLValueFunction *) node; + + scratch.opcode = EEOP_SQLVALUEFUNCTION; + scratch.d.sqlvaluefunction.svf = svf; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_XmlExpr: + { + XmlExpr *xexpr = (XmlExpr *) node; + int nnamed = list_length(xexpr->named_args); + int nargs = list_length(xexpr->args); + int off; + ListCell *arg; + + scratch.opcode = EEOP_XMLEXPR; + scratch.d.xmlexpr.xexpr = xexpr; + + /* allocate space for storing all the arguments */ + if (nnamed) + { + scratch.d.xmlexpr.named_argvalue = + (Datum *) palloc(sizeof(Datum) * nnamed); + scratch.d.xmlexpr.named_argnull = + (bool *) palloc(sizeof(bool) * nnamed); + } + else + { + scratch.d.xmlexpr.named_argvalue = NULL; + scratch.d.xmlexpr.named_argnull = NULL; + } + + if (nargs) + { + scratch.d.xmlexpr.argvalue = + (Datum *) palloc(sizeof(Datum) * nargs); + scratch.d.xmlexpr.argnull = + (bool *) palloc(sizeof(bool) * nargs); + } + else + { + scratch.d.xmlexpr.argvalue = NULL; + scratch.d.xmlexpr.argnull = NULL; + } + + /* prepare argument execution */ + off = 0; + foreach(arg, xexpr->named_args) + { + Expr *e = (Expr *) lfirst(arg); + + ExecInitExprRec(e, state, + &scratch.d.xmlexpr.named_argvalue[off], + &scratch.d.xmlexpr.named_argnull[off]); + off++; + } + + off = 0; + foreach(arg, xexpr->args) + { + Expr *e = (Expr *) lfirst(arg); + + ExecInitExprRec(e, state, + &scratch.d.xmlexpr.argvalue[off], + &scratch.d.xmlexpr.argnull[off]); + off++; + } + + /* and evaluate the actual XML expression */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_NullTest: + { + NullTest *ntest = (NullTest *) node; + + if (ntest->nulltesttype == IS_NULL) + { + if (ntest->argisrow) + scratch.opcode = EEOP_NULLTEST_ROWISNULL; + else + scratch.opcode = EEOP_NULLTEST_ISNULL; + } + else if (ntest->nulltesttype == IS_NOT_NULL) + { + if (ntest->argisrow) + scratch.opcode = EEOP_NULLTEST_ROWISNOTNULL; + else + scratch.opcode = EEOP_NULLTEST_ISNOTNULL; + } + else + { + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + } + /* initialize cache in case it's a row test */ + scratch.d.nulltest_row.rowcache.cacheptr = NULL; + + /* first evaluate argument into result variable */ + ExecInitExprRec(ntest->arg, state, + resv, resnull); + + /* then push the test of that argument */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_BooleanTest: + { + BooleanTest *btest = (BooleanTest *) node; + + /* + * Evaluate argument, directly into result datum. 
That's ok, + * because resv/resnull is definitely not used anywhere else, + * and will get overwritten by the below EEOP_BOOLTEST_IS_* + * step. + */ + ExecInitExprRec(btest->arg, state, resv, resnull); + + switch (btest->booltesttype) + { + case IS_TRUE: + scratch.opcode = EEOP_BOOLTEST_IS_TRUE; + break; + case IS_NOT_TRUE: + scratch.opcode = EEOP_BOOLTEST_IS_NOT_TRUE; + break; + case IS_FALSE: + scratch.opcode = EEOP_BOOLTEST_IS_FALSE; + break; + case IS_NOT_FALSE: + scratch.opcode = EEOP_BOOLTEST_IS_NOT_FALSE; + break; + case IS_UNKNOWN: + /* Same as scalar IS NULL test */ + scratch.opcode = EEOP_NULLTEST_ISNULL; + break; + case IS_NOT_UNKNOWN: + /* Same as scalar IS NOT NULL test */ + scratch.opcode = EEOP_NULLTEST_ISNOTNULL; + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) btest->booltesttype); + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_CoerceToDomain: + { + CoerceToDomain *ctest = (CoerceToDomain *) node; + + ExecInitCoerceToDomain(&scratch, ctest, state, + resv, resnull); + break; + } + + case T_CoerceToDomainValue: + { + /* + * Read from location identified by innermost_domainval. Note + * that innermost_domainval could be NULL, if we're compiling + * a standalone domain check rather than one embedded in a + * larger expression. In that case we must read from + * econtext->domainValue_datum. We'll take care of that + * scenario at runtime. + */ + scratch.opcode = EEOP_DOMAIN_TESTVAL; + /* we share instruction union variant with case testval */ + scratch.d.casetest.value = state->innermost_domainval; + scratch.d.casetest.isnull = state->innermost_domainnull; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_CurrentOfExpr: + { + scratch.opcode = EEOP_CURRENTOFEXPR; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_NextValueExpr: + { + NextValueExpr *nve = (NextValueExpr *) node; + + scratch.opcode = EEOP_NEXTVALUEEXPR; + scratch.d.nextvalueexpr.seqid = nve->seqid; + scratch.d.nextvalueexpr.seqtypid = nve->typeId; + + ExprEvalPushStep(state, &scratch); + break; + } + + default: + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(node)); + break; + } +} + +/* + * Add another expression evaluation step to ExprState->steps. + * + * Note that this potentially re-allocates es->steps, therefore no pointer + * into that array may be used while the expression is still being built. + */ +void +ExprEvalPushStep(ExprState *es, const ExprEvalStep *s) +{ + if (es->steps_alloc == 0) + { + es->steps_alloc = 16; + es->steps = palloc(sizeof(ExprEvalStep) * es->steps_alloc); + } + else if (es->steps_alloc == es->steps_len) + { + es->steps_alloc *= 2; + es->steps = repalloc(es->steps, + sizeof(ExprEvalStep) * es->steps_alloc); + } + + memcpy(&es->steps[es->steps_len++], s, sizeof(ExprEvalStep)); +} + +/* + * Perform setup necessary for the evaluation of a function-like expression, + * appending argument evaluation steps to the steps list in *state, and + * setting up *scratch so it is ready to be pushed. + * + * *scratch is not pushed here, so that callers may override the opcode, + * which is useful for function-like cases like DISTINCT. 
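
/*
 * A minimal sketch of the growth strategy ExprEvalPushStep uses for the
 * steps array: start with room for 16 entries and double whenever the array
 * is full.  IntVec and intvec_push are illustrative names only, and plain
 * malloc/realloc stand in for palloc/repalloc.  As with the real function, a
 * push may move the array, so pointers previously taken into it must not be
 * reused afterwards.
 */
#include <stdlib.h>

typedef struct IntVec
{
	int	   *items;
	int		len;
	int		alloc;
} IntVec;

static void
intvec_push(IntVec *v, int value)
{
	if (v->alloc == 0)
	{
		v->alloc = 16;
		v->items = malloc(sizeof(int) * v->alloc);
	}
	else if (v->alloc == v->len)
	{
		v->alloc *= 2;
		v->items = realloc(v->items, sizeof(int) * v->alloc);
	}
	v->items[v->len++] = value;
}

int
main(void)
{
	IntVec		v = {NULL, 0, 0};

	for (int i = 0; i < 40; i++)
		intvec_push(&v, i);		/* grows 16 -> 32 -> 64 */
	free(v.items);
	return 0;
}
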
+ */ +static void +ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, Oid funcid, + Oid inputcollid, ExprState *state) +{ + int nargs = list_length(args); + AclResult aclresult; + FmgrInfo *flinfo; + FunctionCallInfo fcinfo; + int argno; + ListCell *lc; + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(funcid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(funcid)); + InvokeFunctionExecuteHook(funcid); + + /* + * Safety check on nargs. Under normal circumstances this should never + * fail, as parser should check sooner. But possibly it might fail if + * server has been compiled with FUNC_MAX_ARGS smaller than some functions + * declared in pg_proc? + */ + if (nargs > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("cannot pass more than %d argument to a function", + "cannot pass more than %d arguments to a function", + FUNC_MAX_ARGS, + FUNC_MAX_ARGS))); + + /* Allocate function lookup data and parameter workspace for this call */ + scratch->d.func.finfo = palloc0(sizeof(FmgrInfo)); + scratch->d.func.fcinfo_data = palloc0(SizeForFunctionCallInfo(nargs)); + flinfo = scratch->d.func.finfo; + fcinfo = scratch->d.func.fcinfo_data; + + /* Set up the primary fmgr lookup information */ + fmgr_info(funcid, flinfo); + fmgr_info_set_expr((Node *) node, flinfo); + + /* Initialize function call parameter structure too */ + InitFunctionCallInfoData(*fcinfo, flinfo, + nargs, inputcollid, NULL, NULL); + + /* Keep extra copies of this info to save an indirection at runtime */ + scratch->d.func.fn_addr = flinfo->fn_addr; + scratch->d.func.nargs = nargs; + + /* We only support non-set functions here */ + if (flinfo->fn_retset) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"), + state->parent ? + executor_errposition(state->parent->state, + exprLocation((Node *) node)) : 0)); + + /* Build code to evaluate arguments directly into the fcinfo struct */ + argno = 0; + foreach(lc, args) + { + Expr *arg = (Expr *) lfirst(lc); + + if (IsA(arg, Const)) + { + /* + * Don't evaluate const arguments every round; especially + * interesting for constants in comparisons. + */ + Const *con = (Const *) arg; + + fcinfo->args[argno].value = con->constvalue; + fcinfo->args[argno].isnull = con->constisnull; + } + else + { + ExecInitExprRec(arg, state, + &fcinfo->args[argno].value, + &fcinfo->args[argno].isnull); + } + argno++; + } + + /* Insert appropriate opcode depending on strictness and stats level */ + if (pgstat_track_functions <= flinfo->fn_stats) + { + if (flinfo->fn_strict && nargs > 0) + scratch->opcode = EEOP_FUNCEXPR_STRICT; + else + scratch->opcode = EEOP_FUNCEXPR; + } + else + { + if (flinfo->fn_strict && nargs > 0) + scratch->opcode = EEOP_FUNCEXPR_STRICT_FUSAGE; + else + scratch->opcode = EEOP_FUNCEXPR_FUSAGE; + } +} + +/* + * Add expression steps deforming the ExprState's inner/outer/scan slots + * as much as required by the expression. + */ +static void +ExecInitExprSlots(ExprState *state, Node *node) +{ + LastAttnumInfo info = {0, 0, 0}; + + /* + * Figure out which attributes we're going to need. + */ + get_last_attnums_walker(node, &info); + + ExecPushExprSlots(state, &info); +} + +/* + * Add steps deforming the ExprState's inner/out/scan slots as much as + * indicated by info. This is useful when building an ExprState covering more + * than one expression. 
+ */ +static void +ExecPushExprSlots(ExprState *state, LastAttnumInfo *info) +{ + ExprEvalStep scratch = {0}; + + scratch.resvalue = NULL; + scratch.resnull = NULL; + + /* Emit steps as needed */ + if (info->last_inner > 0) + { + scratch.opcode = EEOP_INNER_FETCHSOME; + scratch.d.fetch.last_var = info->last_inner; + scratch.d.fetch.fixed = false; + scratch.d.fetch.kind = NULL; + scratch.d.fetch.known_desc = NULL; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + } + if (info->last_outer > 0) + { + scratch.opcode = EEOP_OUTER_FETCHSOME; + scratch.d.fetch.last_var = info->last_outer; + scratch.d.fetch.fixed = false; + scratch.d.fetch.kind = NULL; + scratch.d.fetch.known_desc = NULL; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + } + if (info->last_scan > 0) + { + scratch.opcode = EEOP_SCAN_FETCHSOME; + scratch.d.fetch.last_var = info->last_scan; + scratch.d.fetch.fixed = false; + scratch.d.fetch.kind = NULL; + scratch.d.fetch.known_desc = NULL; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + } +} + +/* + * get_last_attnums_walker: expression walker for ExecInitExprSlots + */ +static bool +get_last_attnums_walker(Node *node, LastAttnumInfo *info) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *variable = (Var *) node; + AttrNumber attnum = variable->varattno; + + switch (variable->varno) + { + case INNER_VAR: + info->last_inner = Max(info->last_inner, attnum); + break; + + case OUTER_VAR: + info->last_outer = Max(info->last_outer, attnum); + break; + + /* INDEX_VAR is handled by default case */ + + default: + info->last_scan = Max(info->last_scan, attnum); + break; + } + return false; + } + + /* + * Don't examine the arguments or filters of Aggrefs or WindowFuncs, + * because those do not represent expressions to be evaluated within the + * calling expression's econtext. GroupingFunc arguments are never + * evaluated at all. + */ + if (IsA(node, Aggref)) + return false; + if (IsA(node, WindowFunc)) + return false; + if (IsA(node, GroupingFunc)) + return false; + return expression_tree_walker(node, get_last_attnums_walker, + (void *) info); +} + +/* + * Compute additional information for EEOP_*_FETCHSOME ops. + * + * The goal is to determine whether a slot is 'fixed', that is, every + * evaluation of the expression will have the same type of slot, with an + * equivalent descriptor. + * + * Returns true if the deforming step is required, false otherwise. 
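
/*
 * A sketch of the idea behind get_last_attnums_walker and ExecPushExprSlots:
 * walk the variable references an expression contains and remember only the
 * highest attribute number used per slot, so that each FETCHSOME step needs
 * to deform its tuple only that far.  VarRef, SlotKind and LastAttnums are
 * simplified stand-ins for Var, varno, and LastAttnumInfo.
 */
typedef enum { SLOT_INNER, SLOT_OUTER, SLOT_SCAN } SlotKind;

typedef struct VarRef
{
	SlotKind	slot;
	int			attno;
} VarRef;

typedef struct LastAttnums
{
	int			last_inner;
	int			last_outer;
	int			last_scan;
} LastAttnums;

static LastAttnums
collect_last_attnums(const VarRef *vars, int nvars)
{
	LastAttnums info = {0, 0, 0};

	for (int i = 0; i < nvars; i++)
	{
		int			attno = vars[i].attno;

		switch (vars[i].slot)
		{
			case SLOT_INNER:
				info.last_inner = attno > info.last_inner ? attno : info.last_inner;
				break;
			case SLOT_OUTER:
				info.last_outer = attno > info.last_outer ? attno : info.last_outer;
				break;
			case SLOT_SCAN:
				info.last_scan = attno > info.last_scan ? attno : info.last_scan;
				break;
		}
	}
	return info;
}
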
+ */ +static bool +ExecComputeSlotInfo(ExprState *state, ExprEvalStep *op) +{ + PlanState *parent = state->parent; + TupleDesc desc = NULL; + const TupleTableSlotOps *tts_ops = NULL; + bool isfixed = false; + ExprEvalOp opcode = op->opcode; + + Assert(opcode == EEOP_INNER_FETCHSOME || + opcode == EEOP_OUTER_FETCHSOME || + opcode == EEOP_SCAN_FETCHSOME); + + if (op->d.fetch.known_desc != NULL) + { + desc = op->d.fetch.known_desc; + tts_ops = op->d.fetch.kind; + isfixed = op->d.fetch.kind != NULL; + } + else if (!parent) + { + isfixed = false; + } + else if (opcode == EEOP_INNER_FETCHSOME) + { + PlanState *is = innerPlanState(parent); + + if (parent->inneropsset && !parent->inneropsfixed) + { + isfixed = false; + } + else if (parent->inneropsset && parent->innerops) + { + isfixed = true; + tts_ops = parent->innerops; + desc = ExecGetResultType(is); + } + else if (is) + { + tts_ops = ExecGetResultSlotOps(is, &isfixed); + desc = ExecGetResultType(is); + } + } + else if (opcode == EEOP_OUTER_FETCHSOME) + { + PlanState *os = outerPlanState(parent); + + if (parent->outeropsset && !parent->outeropsfixed) + { + isfixed = false; + } + else if (parent->outeropsset && parent->outerops) + { + isfixed = true; + tts_ops = parent->outerops; + desc = ExecGetResultType(os); + } + else if (os) + { + tts_ops = ExecGetResultSlotOps(os, &isfixed); + desc = ExecGetResultType(os); + } + } + else if (opcode == EEOP_SCAN_FETCHSOME) + { + desc = parent->scandesc; + + if (parent->scanops) + tts_ops = parent->scanops; + + if (parent->scanopsset) + isfixed = parent->scanopsfixed; + } + + if (isfixed && desc != NULL && tts_ops != NULL) + { + op->d.fetch.fixed = true; + op->d.fetch.kind = tts_ops; + op->d.fetch.known_desc = desc; + } + else + { + op->d.fetch.fixed = false; + op->d.fetch.kind = NULL; + op->d.fetch.known_desc = NULL; + } + + /* if the slot is known to always virtual we never need to deform */ + if (op->d.fetch.fixed && op->d.fetch.kind == &TTSOpsVirtual) + return false; + + return true; +} + +/* + * Prepare step for the evaluation of a whole-row variable. + * The caller still has to push the step. + */ +static void +ExecInitWholeRowVar(ExprEvalStep *scratch, Var *variable, ExprState *state) +{ + PlanState *parent = state->parent; + + /* fill in all but the target */ + scratch->opcode = EEOP_WHOLEROW; + scratch->d.wholerow.var = variable; + scratch->d.wholerow.first = true; + scratch->d.wholerow.slow = false; + scratch->d.wholerow.tupdesc = NULL; /* filled at runtime */ + scratch->d.wholerow.junkFilter = NULL; + + /* + * If the input tuple came from a subquery, it might contain "resjunk" + * columns (such as GROUP BY or ORDER BY columns), which we don't want to + * keep in the whole-row result. We can get rid of such columns by + * passing the tuple through a JunkFilter --- but to make one, we have to + * lay our hands on the subquery's targetlist. Fortunately, there are not + * very many cases where this can happen, and we can identify all of them + * by examining our parent PlanState. We assume this is not an issue in + * standalone expressions that don't have parent plans. (Whole-row Vars + * can occur in such expressions, but they will always be referencing + * table rows.) 
+ */ + if (parent) + { + PlanState *subplan = NULL; + + switch (nodeTag(parent)) + { + case T_SubqueryScanState: + subplan = ((SubqueryScanState *) parent)->subplan; + break; + case T_CteScanState: + subplan = ((CteScanState *) parent)->cteplanstate; + break; + default: + break; + } + + if (subplan) + { + bool junk_filter_needed = false; + ListCell *tlist; + + /* Detect whether subplan tlist actually has any junk columns */ + foreach(tlist, subplan->plan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(tlist); + + if (tle->resjunk) + { + junk_filter_needed = true; + break; + } + } + + /* If so, build the junkfilter now */ + if (junk_filter_needed) + { + scratch->d.wholerow.junkFilter = + ExecInitJunkFilter(subplan->plan->targetlist, + ExecInitExtraTupleSlot(parent->state, NULL, + &TTSOpsVirtual)); + } + } + } +} + +/* + * Prepare evaluation of a SubscriptingRef expression. + */ +static void +ExecInitSubscriptingRef(ExprEvalStep *scratch, SubscriptingRef *sbsref, + ExprState *state, Datum *resv, bool *resnull) +{ + bool isAssignment = (sbsref->refassgnexpr != NULL); + int nupper = list_length(sbsref->refupperindexpr); + int nlower = list_length(sbsref->reflowerindexpr); + const SubscriptRoutines *sbsroutines; + SubscriptingRefState *sbsrefstate; + SubscriptExecSteps methods; + char *ptr; + List *adjust_jumps = NIL; + ListCell *lc; + int i; + + /* Look up the subscripting support methods */ + sbsroutines = getSubscriptingRoutines(sbsref->refcontainertype, NULL); + if (!sbsroutines) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("cannot subscript type %s because it does not support subscripting", + format_type_be(sbsref->refcontainertype)), + state->parent ? + executor_errposition(state->parent->state, + exprLocation((Node *) sbsref)) : 0)); + + /* Allocate sbsrefstate, with enough space for per-subscript arrays too */ + sbsrefstate = palloc0(MAXALIGN(sizeof(SubscriptingRefState)) + + (nupper + nlower) * (sizeof(Datum) + + 2 * sizeof(bool))); + + /* Fill constant fields of SubscriptingRefState */ + sbsrefstate->isassignment = isAssignment; + sbsrefstate->numupper = nupper; + sbsrefstate->numlower = nlower; + /* Set up per-subscript arrays */ + ptr = ((char *) sbsrefstate) + MAXALIGN(sizeof(SubscriptingRefState)); + sbsrefstate->upperindex = (Datum *) ptr; + ptr += nupper * sizeof(Datum); + sbsrefstate->lowerindex = (Datum *) ptr; + ptr += nlower * sizeof(Datum); + sbsrefstate->upperprovided = (bool *) ptr; + ptr += nupper * sizeof(bool); + sbsrefstate->lowerprovided = (bool *) ptr; + ptr += nlower * sizeof(bool); + sbsrefstate->upperindexnull = (bool *) ptr; + ptr += nupper * sizeof(bool); + sbsrefstate->lowerindexnull = (bool *) ptr; + /* ptr += nlower * sizeof(bool); */ + + /* + * Let the container-type-specific code have a chance. It must fill the + * "methods" struct with function pointers for us to possibly use in + * execution steps below; and it can optionally set up some data pointed + * to by the workspace field. + */ + memset(&methods, 0, sizeof(methods)); + sbsroutines->exec_setup(sbsref, sbsrefstate, &methods); + + /* + * Evaluate array input. It's safe to do so into resv/resnull, because we + * won't use that as target for any of the other subexpressions, and it'll + * be overwritten by the final EEOP_SBSREF_FETCH/ASSIGN step, which is + * pushed last. + */ + ExecInitExprRec(sbsref->refexpr, state, resv, resnull); + + /* + * If refexpr yields NULL, and the operation should be strict, then result + * is NULL. 
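
/*
 * A sketch of the single-allocation layout used for SubscriptingRefState
 * above: one allocation covers the struct itself plus the per-subscript
 * arrays, and a char pointer is bumped past each array in turn.  Plain
 * calloc and an 8-byte alignment macro stand in for palloc0 and MAXALIGN,
 * and the field set is reduced to two arrays.
 */
#include <stdlib.h>

#define ALIGN8(x)	(((x) + 7) & ~(size_t) 7)

typedef struct DemoSbsState
{
	int			nupper;
	long	   *upperindex;		/* [nupper] */
	char	   *upperprovided;	/* [nupper] */
} DemoSbsState;

static DemoSbsState *
demo_sbs_state_create(int nupper)
{
	DemoSbsState *st;
	char	   *ptr;

	st = calloc(1, ALIGN8(sizeof(DemoSbsState)) +
				nupper * (sizeof(long) + sizeof(char)));
	st->nupper = nupper;

	/* carve the trailing arrays out of the same allocation */
	ptr = (char *) st + ALIGN8(sizeof(DemoSbsState));
	st->upperindex = (long *) ptr;
	ptr += nupper * sizeof(long);
	st->upperprovided = ptr;

	return st;
}
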
We can implement this with just JUMP_IF_NULL, since we + * evaluated the array into the desired target location. + */ + if (!isAssignment && sbsroutines->fetch_strict) + { + scratch->opcode = EEOP_JUMP_IF_NULL; + scratch->d.jump.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* Evaluate upper subscripts */ + i = 0; + foreach(lc, sbsref->refupperindexpr) + { + Expr *e = (Expr *) lfirst(lc); + + /* When slicing, individual subscript bounds can be omitted */ + if (!e) + { + sbsrefstate->upperprovided[i] = false; + sbsrefstate->upperindexnull[i] = true; + } + else + { + sbsrefstate->upperprovided[i] = true; + /* Each subscript is evaluated into appropriate array entry */ + ExecInitExprRec(e, state, + &sbsrefstate->upperindex[i], + &sbsrefstate->upperindexnull[i]); + } + i++; + } + + /* Evaluate lower subscripts similarly */ + i = 0; + foreach(lc, sbsref->reflowerindexpr) + { + Expr *e = (Expr *) lfirst(lc); + + /* When slicing, individual subscript bounds can be omitted */ + if (!e) + { + sbsrefstate->lowerprovided[i] = false; + sbsrefstate->lowerindexnull[i] = true; + } + else + { + sbsrefstate->lowerprovided[i] = true; + /* Each subscript is evaluated into appropriate array entry */ + ExecInitExprRec(e, state, + &sbsrefstate->lowerindex[i], + &sbsrefstate->lowerindexnull[i]); + } + i++; + } + + /* SBSREF_SUBSCRIPTS checks and converts all the subscripts at once */ + if (methods.sbs_check_subscripts) + { + scratch->opcode = EEOP_SBSREF_SUBSCRIPTS; + scratch->d.sbsref_subscript.subscriptfunc = methods.sbs_check_subscripts; + scratch->d.sbsref_subscript.state = sbsrefstate; + scratch->d.sbsref_subscript.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + if (isAssignment) + { + Datum *save_innermost_caseval; + bool *save_innermost_casenull; + + /* Check for unimplemented methods */ + if (!methods.sbs_assign) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("type %s does not support subscripted assignment", + format_type_be(sbsref->refcontainertype)))); + + /* + * We might have a nested-assignment situation, in which the + * refassgnexpr is itself a FieldStore or SubscriptingRef that needs + * to obtain and modify the previous value of the array element or + * slice being replaced. If so, we have to extract that value from + * the array and pass it down via the CaseTestExpr mechanism. It's + * safe to reuse the CASE mechanism because there cannot be a CASE + * between here and where the value would be needed, and an array + * assignment can't be within a CASE either. (So saving and restoring + * innermost_caseval is just paranoia, but let's do it anyway.) + * + * Since fetching the old element might be a nontrivial expense, do it + * only if the argument actually needs it. 
+ */ + if (isAssignmentIndirectionExpr(sbsref->refassgnexpr)) + { + if (!methods.sbs_fetch_old) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("type %s does not support subscripted assignment", + format_type_be(sbsref->refcontainertype)))); + scratch->opcode = EEOP_SBSREF_OLD; + scratch->d.sbsref.subscriptfunc = methods.sbs_fetch_old; + scratch->d.sbsref.state = sbsrefstate; + ExprEvalPushStep(state, scratch); + } + + /* SBSREF_OLD puts extracted value into prevvalue/prevnull */ + save_innermost_caseval = state->innermost_caseval; + save_innermost_casenull = state->innermost_casenull; + state->innermost_caseval = &sbsrefstate->prevvalue; + state->innermost_casenull = &sbsrefstate->prevnull; + + /* evaluate replacement value into replacevalue/replacenull */ + ExecInitExprRec(sbsref->refassgnexpr, state, + &sbsrefstate->replacevalue, &sbsrefstate->replacenull); + + state->innermost_caseval = save_innermost_caseval; + state->innermost_casenull = save_innermost_casenull; + + /* and perform the assignment */ + scratch->opcode = EEOP_SBSREF_ASSIGN; + scratch->d.sbsref.subscriptfunc = methods.sbs_assign; + scratch->d.sbsref.state = sbsrefstate; + ExprEvalPushStep(state, scratch); + } + else + { + /* array fetch is much simpler */ + scratch->opcode = EEOP_SBSREF_FETCH; + scratch->d.sbsref.subscriptfunc = methods.sbs_fetch; + scratch->d.sbsref.state = sbsrefstate; + ExprEvalPushStep(state, scratch); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + if (as->opcode == EEOP_SBSREF_SUBSCRIPTS) + { + Assert(as->d.sbsref_subscript.jumpdone == -1); + as->d.sbsref_subscript.jumpdone = state->steps_len; + } + else + { + Assert(as->opcode == EEOP_JUMP_IF_NULL); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + } +} + +/* + * Helper for preparing SubscriptingRef expressions for evaluation: is expr + * a nested FieldStore or SubscriptingRef that needs the old element value + * passed down? + * + * (We could use this in FieldStore too, but in that case passing the old + * value is so cheap there's no need.) + * + * Note: it might seem that this needs to recurse, but in most cases it does + * not; the CaseTestExpr, if any, will be directly the arg or refexpr of the + * top-level node. Nested-assignment situations give rise to expression + * trees in which each level of assignment has its own CaseTestExpr, and the + * recursive structure appears within the newvals or refassgnexpr field. + * There is an exception, though: if the array is an array-of-domain, we will + * have a CoerceToDomain as the refassgnexpr, and we need to be able to look + * through that. + */ +static bool +isAssignmentIndirectionExpr(Expr *expr) +{ + if (expr == NULL) + return false; /* just paranoia */ + if (IsA(expr, FieldStore)) + { + FieldStore *fstore = (FieldStore *) expr; + + if (fstore->arg && IsA(fstore->arg, CaseTestExpr)) + return true; + } + else if (IsA(expr, SubscriptingRef)) + { + SubscriptingRef *sbsRef = (SubscriptingRef *) expr; + + if (sbsRef->refexpr && IsA(sbsRef->refexpr, CaseTestExpr)) + return true; + } + else if (IsA(expr, CoerceToDomain)) + { + CoerceToDomain *cd = (CoerceToDomain *) expr; + + return isAssignmentIndirectionExpr(cd->arg); + } + return false; +} + +/* + * Prepare evaluation of a CoerceToDomain expression. 
+ */ +static void +ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, + ExprState *state, Datum *resv, bool *resnull) +{ + DomainConstraintRef *constraint_ref; + Datum *domainval = NULL; + bool *domainnull = NULL; + ListCell *l; + + scratch->d.domaincheck.resulttype = ctest->resulttype; + /* we'll allocate workspace only if needed */ + scratch->d.domaincheck.checkvalue = NULL; + scratch->d.domaincheck.checknull = NULL; + + /* + * Evaluate argument - it's fine to directly store it into resv/resnull, + * if there's constraint failures there'll be errors, otherwise it's what + * needs to be returned. + */ + ExecInitExprRec(ctest->arg, state, resv, resnull); + + /* + * Note: if the argument is of varlena type, it could be a R/W expanded + * object. We want to return the R/W pointer as the final result, but we + * have to pass a R/O pointer as the value to be tested by any functions + * in check expressions. We don't bother to emit a MAKE_READONLY step + * unless there's actually at least one check expression, though. Until + * we've tested that, domainval/domainnull are NULL. + */ + + /* + * Collect the constraints associated with the domain. + * + * Note: before PG v10 we'd recheck the set of constraints during each + * evaluation of the expression. Now we bake them into the ExprState + * during executor initialization. That means we don't need typcache.c to + * provide compiled exprs. + */ + constraint_ref = (DomainConstraintRef *) + palloc(sizeof(DomainConstraintRef)); + InitDomainConstraintRef(ctest->resulttype, + constraint_ref, + CurrentMemoryContext, + false); + + /* + * Compile code to check each domain constraint. NOTNULL constraints can + * just be applied on the resv/resnull value, but for CHECK constraints we + * need more pushups. + */ + foreach(l, constraint_ref->constraints) + { + DomainConstraintState *con = (DomainConstraintState *) lfirst(l); + Datum *save_innermost_domainval; + bool *save_innermost_domainnull; + + scratch->d.domaincheck.constraintname = con->name; + + switch (con->constrainttype) + { + case DOM_CONSTRAINT_NOTNULL: + scratch->opcode = EEOP_DOMAIN_NOTNULL; + ExprEvalPushStep(state, scratch); + break; + case DOM_CONSTRAINT_CHECK: + /* Allocate workspace for CHECK output if we didn't yet */ + if (scratch->d.domaincheck.checkvalue == NULL) + { + scratch->d.domaincheck.checkvalue = + (Datum *) palloc(sizeof(Datum)); + scratch->d.domaincheck.checknull = + (bool *) palloc(sizeof(bool)); + } + + /* + * If first time through, determine where CoerceToDomainValue + * nodes should read from. + */ + if (domainval == NULL) + { + /* + * Since value might be read multiple times, force to R/O + * - but only if it could be an expanded datum. + */ + if (get_typlen(ctest->resulttype) == -1) + { + ExprEvalStep scratch2 = {0}; + + /* Yes, so make output workspace for MAKE_READONLY */ + domainval = (Datum *) palloc(sizeof(Datum)); + domainnull = (bool *) palloc(sizeof(bool)); + + /* Emit MAKE_READONLY */ + scratch2.opcode = EEOP_MAKE_READONLY; + scratch2.resvalue = domainval; + scratch2.resnull = domainnull; + scratch2.d.make_readonly.value = resv; + scratch2.d.make_readonly.isnull = resnull; + ExprEvalPushStep(state, &scratch2); + } + else + { + /* No, so it's fine to read from resv/resnull */ + domainval = resv; + domainnull = resnull; + } + } + + /* + * Set up value to be returned by CoerceToDomainValue nodes. + * We must save and restore innermost_domainval/null fields, + * in case this node is itself within a check expression for + * another domain. 
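
/*
 * The test applied above, both for CASE test values and for domain check
 * values, when deciding whether to emit a MAKE_READONLY step: only varlena
 * types (typlen == -1) can carry expanded, read-write datums, so only values
 * of those types need to be forced read-only before being read more than
 * once.  Here typlen is passed in directly instead of being fetched with
 * get_typlen().
 */
#include <stdbool.h>

static bool
multiple_reads_need_readonly_copy(int typlen)
{
	/* only a varlena value can be an expanded (read-write) datum */
	return typlen == -1;
}
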
+ */ + save_innermost_domainval = state->innermost_domainval; + save_innermost_domainnull = state->innermost_domainnull; + state->innermost_domainval = domainval; + state->innermost_domainnull = domainnull; + + /* evaluate check expression value */ + ExecInitExprRec(con->check_expr, state, + scratch->d.domaincheck.checkvalue, + scratch->d.domaincheck.checknull); + + state->innermost_domainval = save_innermost_domainval; + state->innermost_domainnull = save_innermost_domainnull; + + /* now test result */ + scratch->opcode = EEOP_DOMAIN_CHECK; + ExprEvalPushStep(state, scratch); + + break; + default: + elog(ERROR, "unrecognized constraint type: %d", + (int) con->constrainttype); + break; + } + } +} + +/* + * Build transition/combine function invocations for all aggregate transition + * / combination function invocations in a grouping sets phase. This has to + * invoke all sort based transitions in a phase (if doSort is true), all hash + * based transitions (if doHash is true), or both (both true). + * + * The resulting expression will, for each set of transition values, first + * check for filters, evaluate aggregate input, check that that input is not + * NULL for a strict transition function, and then finally invoke the + * transition for each of the concurrently computed grouping sets. + * + * If nullcheck is true, the generated code will check for a NULL pointer to + * the array of AggStatePerGroup, and skip evaluation if so. + */ +ExprState * +ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase, + bool doSort, bool doHash, bool nullcheck) +{ + ExprState *state = makeNode(ExprState); + PlanState *parent = &aggstate->ss.ps; + ExprEvalStep scratch = {0}; + bool isCombine = DO_AGGSPLIT_COMBINE(aggstate->aggsplit); + LastAttnumInfo deform = {0, 0, 0}; + + state->expr = (Expr *) aggstate; + state->parent = parent; + + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + /* + * First figure out which slots, and how many columns from each, we're + * going to need. + */ + for (int transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + + get_last_attnums_walker((Node *) pertrans->aggref->aggdirectargs, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->args, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->aggorder, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->aggdistinct, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->aggfilter, + &deform); + } + ExecPushExprSlots(state, &deform); + + /* + * Emit instructions for each transition value / grouping set combination. + */ + for (int transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + FunctionCallInfo trans_fcinfo = pertrans->transfn_fcinfo; + List *adjust_bailout = NIL; + NullableDatum *strictargs = NULL; + bool *strictnulls = NULL; + int argno; + ListCell *bail; + + /* + * If filter present, emit. Do so before evaluating the input, to + * avoid potentially unneeded computations, or even worse, unintended + * side-effects. When combining, all the necessary filtering has + * already been done. 
+ */ + if (pertrans->aggref->aggfilter && !isCombine) + { + /* evaluate filter expression */ + ExecInitExprRec(pertrans->aggref->aggfilter, state, + &state->resvalue, &state->resnull); + /* and jump out if false */ + scratch.opcode = EEOP_JUMP_IF_NOT_TRUE; + scratch.d.jump.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, &scratch); + adjust_bailout = lappend_int(adjust_bailout, + state->steps_len - 1); + } + + /* + * Evaluate arguments to aggregate/combine function. + */ + argno = 0; + if (isCombine) + { + /* + * Combining two aggregate transition values. Instead of directly + * coming from a tuple the input is a, potentially deserialized, + * transition value. + */ + TargetEntry *source_tle; + + Assert(pertrans->numSortCols == 0); + Assert(list_length(pertrans->aggref->args) == 1); + + strictargs = trans_fcinfo->args + 1; + source_tle = (TargetEntry *) linitial(pertrans->aggref->args); + + /* + * deserialfn_oid will be set if we must deserialize the input + * state before calling the combine function. + */ + if (!OidIsValid(pertrans->deserialfn_oid)) + { + /* + * Start from 1, since the 0th arg will be the transition + * value + */ + ExecInitExprRec(source_tle->expr, state, + &trans_fcinfo->args[argno + 1].value, + &trans_fcinfo->args[argno + 1].isnull); + } + else + { + FunctionCallInfo ds_fcinfo = pertrans->deserialfn_fcinfo; + + /* evaluate argument */ + ExecInitExprRec(source_tle->expr, state, + &ds_fcinfo->args[0].value, + &ds_fcinfo->args[0].isnull); + + /* Dummy second argument for type-safety reasons */ + ds_fcinfo->args[1].value = PointerGetDatum(NULL); + ds_fcinfo->args[1].isnull = false; + + /* + * Don't call a strict deserialization function with NULL + * input + */ + if (pertrans->deserialfn.fn_strict) + scratch.opcode = EEOP_AGG_STRICT_DESERIALIZE; + else + scratch.opcode = EEOP_AGG_DESERIALIZE; + + scratch.d.agg_deserialize.fcinfo_data = ds_fcinfo; + scratch.d.agg_deserialize.jumpnull = -1; /* adjust later */ + scratch.resvalue = &trans_fcinfo->args[argno + 1].value; + scratch.resnull = &trans_fcinfo->args[argno + 1].isnull; + + ExprEvalPushStep(state, &scratch); + /* don't add an adjustment unless the function is strict */ + if (pertrans->deserialfn.fn_strict) + adjust_bailout = lappend_int(adjust_bailout, + state->steps_len - 1); + + /* restore normal settings of scratch fields */ + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + } + argno++; + } + else if (pertrans->numSortCols == 0) + { + ListCell *arg; + + /* + * Normal transition function without ORDER BY / DISTINCT. + */ + strictargs = trans_fcinfo->args + 1; + + foreach(arg, pertrans->aggref->args) + { + TargetEntry *source_tle = (TargetEntry *) lfirst(arg); + + /* + * Start from 1, since the 0th arg will be the transition + * value + */ + ExecInitExprRec(source_tle->expr, state, + &trans_fcinfo->args[argno + 1].value, + &trans_fcinfo->args[argno + 1].isnull); + argno++; + } + } + else if (pertrans->numInputs == 1) + { + /* + * DISTINCT and/or ORDER BY case, with a single column sorted on. + */ + TargetEntry *source_tle = + (TargetEntry *) linitial(pertrans->aggref->args); + + Assert(list_length(pertrans->aggref->args) == 1); + + ExecInitExprRec(source_tle->expr, state, + &state->resvalue, + &state->resnull); + strictnulls = &state->resnull; + argno++; + } + else + { + /* + * DISTINCT and/or ORDER BY case, with multiple columns sorted on. 
+ */ + Datum *values = pertrans->sortslot->tts_values; + bool *nulls = pertrans->sortslot->tts_isnull; + ListCell *arg; + + strictnulls = nulls; + + foreach(arg, pertrans->aggref->args) + { + TargetEntry *source_tle = (TargetEntry *) lfirst(arg); + + ExecInitExprRec(source_tle->expr, state, + &values[argno], &nulls[argno]); + argno++; + } + } + Assert(pertrans->numInputs == argno); + + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. This is true for both plain and + * sorted/distinct aggregates. + */ + if (trans_fcinfo->flinfo->fn_strict && pertrans->numTransInputs > 0) + { + if (strictnulls) + scratch.opcode = EEOP_AGG_STRICT_INPUT_CHECK_NULLS; + else + scratch.opcode = EEOP_AGG_STRICT_INPUT_CHECK_ARGS; + scratch.d.agg_strict_input_check.nulls = strictnulls; + scratch.d.agg_strict_input_check.args = strictargs; + scratch.d.agg_strict_input_check.jumpnull = -1; /* adjust later */ + scratch.d.agg_strict_input_check.nargs = pertrans->numTransInputs; + ExprEvalPushStep(state, &scratch); + adjust_bailout = lappend_int(adjust_bailout, + state->steps_len - 1); + } + + /* + * Call transition function (once for each concurrently evaluated + * grouping set). Do so for both sort and hash based computations, as + * applicable. + */ + if (doSort) + { + int processGroupingSets = Max(phase->numsets, 1); + int setoff = 0; + + for (int setno = 0; setno < processGroupingSets; setno++) + { + ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo, + pertrans, transno, setno, setoff, false, + nullcheck); + setoff++; + } + } + + if (doHash) + { + int numHashes = aggstate->num_hashes; + int setoff; + + /* in MIXED mode, there'll be preceding transition values */ + if (aggstate->aggstrategy != AGG_HASHED) + setoff = aggstate->maxsets; + else + setoff = 0; + + for (int setno = 0; setno < numHashes; setno++) + { + ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo, + pertrans, transno, setno, setoff, true, + nullcheck); + setoff++; + } + } + + /* adjust early bail out jump target(s) */ + foreach(bail, adjust_bailout) + { + ExprEvalStep *as = &state->steps[lfirst_int(bail)]; + + if (as->opcode == EEOP_JUMP_IF_NOT_TRUE) + { + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + else if (as->opcode == EEOP_AGG_STRICT_INPUT_CHECK_ARGS || + as->opcode == EEOP_AGG_STRICT_INPUT_CHECK_NULLS) + { + Assert(as->d.agg_strict_input_check.jumpnull == -1); + as->d.agg_strict_input_check.jumpnull = state->steps_len; + } + else if (as->opcode == EEOP_AGG_STRICT_DESERIALIZE) + { + Assert(as->d.agg_deserialize.jumpnull == -1); + as->d.agg_deserialize.jumpnull = state->steps_len; + } + else + Assert(false); + } + } + + scratch.resvalue = NULL; + scratch.resnull = NULL; + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * Build transition/combine function invocation for a single transition + * value. This is separated from ExecBuildAggTrans() because there are + * multiple callsites (hash and sort in some grouping set cases). + */ +static void +ExecBuildAggTransCall(ExprState *state, AggState *aggstate, + ExprEvalStep *scratch, + FunctionCallInfo fcinfo, AggStatePerTrans pertrans, + int transno, int setno, int setoff, bool ishash, + bool nullcheck) +{ + ExprContext *aggcontext; + int adjust_jumpnull = -1; + + if (ishash) + aggcontext = aggstate->hashcontext; + else + aggcontext = aggstate->aggcontexts[setno]; + + /* add check for NULL pointer? 
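
/*
 * The strict-input rule emitted above, restated as a helper: for a strict
 * transition function, any NULL among the aggregate's inputs means the
 * transition call is skipped and the previous transition value is kept.
 * Here the NullableDatum / nulls arrays are reduced to a bare array of
 * isnull flags.
 */
#include <stdbool.h>

static bool
skip_strict_transition(const bool *argnulls, int nargs)
{
	for (int i = 0; i < nargs; i++)
	{
		if (argnulls[i])
			return true;	/* jump past the transition-function call */
	}
	return false;
}
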
*/ + if (nullcheck) + { + scratch->opcode = EEOP_AGG_PLAIN_PERGROUP_NULLCHECK; + scratch->d.agg_plain_pergroup_nullcheck.setoff = setoff; + /* adjust later */ + scratch->d.agg_plain_pergroup_nullcheck.jumpnull = -1; + ExprEvalPushStep(state, scratch); + adjust_jumpnull = state->steps_len - 1; + } + + /* + * Determine appropriate transition implementation. + * + * For non-ordered aggregates: + * + * If the initial value for the transition state doesn't exist in the + * pg_aggregate table then we will let the first non-NULL value returned + * from the outer procNode become the initial value. (This is useful for + * aggregates like max() and min().) The noTransValue flag signals that we + * need to do so. If true, generate a + * EEOP_AGG_INIT_STRICT_PLAIN_TRANS{,_BYVAL} step. This step also needs to + * do the work described next: + * + * If the function is strict, but does have an initial value, choose + * EEOP_AGG_STRICT_PLAIN_TRANS{,_BYVAL}, which skips the transition + * function if the transition value has become NULL (because a previous + * transition function returned NULL). This step also needs to do the work + * described next: + * + * Otherwise we call EEOP_AGG_PLAIN_TRANS{,_BYVAL}, which does not have to + * perform either of the above checks. + * + * Having steps with overlapping responsibilities is not nice, but + * aggregations are very performance sensitive, making this worthwhile. + * + * For ordered aggregates: + * + * Only need to choose between the faster path for a single ordered + * column, and the one between multiple columns. Checking strictness etc + * is done when finalizing the aggregate. See + * process_ordered_aggregate_{single, multi} and + * advance_transition_function. + */ + if (pertrans->numSortCols == 0) + { + if (pertrans->transtypeByVal) + { + if (fcinfo->flinfo->fn_strict && + pertrans->initValueIsNull) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL; + else if (fcinfo->flinfo->fn_strict) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL; + else + scratch->opcode = EEOP_AGG_PLAIN_TRANS_BYVAL; + } + else + { + if (fcinfo->flinfo->fn_strict && + pertrans->initValueIsNull) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF; + else if (fcinfo->flinfo->fn_strict) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_STRICT_BYREF; + else + scratch->opcode = EEOP_AGG_PLAIN_TRANS_BYREF; + } + } + else if (pertrans->numInputs == 1) + scratch->opcode = EEOP_AGG_ORDERED_TRANS_DATUM; + else + scratch->opcode = EEOP_AGG_ORDERED_TRANS_TUPLE; + + scratch->d.agg_trans.pertrans = pertrans; + scratch->d.agg_trans.setno = setno; + scratch->d.agg_trans.setoff = setoff; + scratch->d.agg_trans.transno = transno; + scratch->d.agg_trans.aggcontext = aggcontext; + ExprEvalPushStep(state, scratch); + + /* fix up jumpnull */ + if (adjust_jumpnull != -1) + { + ExprEvalStep *as = &state->steps[adjust_jumpnull]; + + Assert(as->opcode == EEOP_AGG_PLAIN_PERGROUP_NULLCHECK); + Assert(as->d.agg_plain_pergroup_nullcheck.jumpnull == -1); + as->d.agg_plain_pergroup_nullcheck.jumpnull = state->steps_len; + } +} + +/* + * Build equality expression that can be evaluated using ExecQual(), returning + * true if the expression context's inner/outer tuple are NOT DISTINCT. I.e + * two nulls match, a null and a not-null don't match. 
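
/*
 * The opcode choice made in ExecBuildAggTransCall above for non-ordered
 * aggregates, distilled into a standalone decision function: strictness
 * combined with a missing initial value selects the INIT_STRICT variant,
 * strictness alone the STRICT variant, and the by-value/by-reference
 * property of the transition type picks the BYVAL or BYREF flavor.  The
 * enum values are simplified stand-ins for the EEOP_AGG_PLAIN_TRANS_*
 * opcodes.
 */
#include <stdbool.h>

typedef enum PlainTransOp
{
	PLAIN_TRANS_INIT_STRICT_BYVAL,
	PLAIN_TRANS_STRICT_BYVAL,
	PLAIN_TRANS_BYVAL,
	PLAIN_TRANS_INIT_STRICT_BYREF,
	PLAIN_TRANS_STRICT_BYREF,
	PLAIN_TRANS_BYREF
} PlainTransOp;

static PlainTransOp
choose_plain_trans_op(bool fn_strict, bool init_value_is_null, bool byval)
{
	if (byval)
	{
		if (fn_strict && init_value_is_null)
			return PLAIN_TRANS_INIT_STRICT_BYVAL;
		if (fn_strict)
			return PLAIN_TRANS_STRICT_BYVAL;
		return PLAIN_TRANS_BYVAL;
	}
	if (fn_strict && init_value_is_null)
		return PLAIN_TRANS_INIT_STRICT_BYREF;
	if (fn_strict)
		return PLAIN_TRANS_STRICT_BYREF;
	return PLAIN_TRANS_BYREF;
}
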
+ * + * desc: tuple descriptor of the to-be-compared tuples + * numCols: the number of attributes to be examined + * keyColIdx: array of attribute column numbers + * eqFunctions: array of function oids of the equality functions to use + * parent: parent executor node + */ +ExprState * +ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc, + const TupleTableSlotOps *lops, const TupleTableSlotOps *rops, + int numCols, + const AttrNumber *keyColIdx, + const Oid *eqfunctions, + const Oid *collations, + PlanState *parent) +{ + ExprState *state = makeNode(ExprState); + ExprEvalStep scratch = {0}; + int maxatt = -1; + List *adjust_jumps = NIL; + ListCell *lc; + + /* + * When no columns are actually compared, the result's always true. See + * special case in ExecQual(). + */ + if (numCols == 0) + return NULL; + + state->expr = NULL; + state->flags = EEO_FLAG_IS_QUAL; + state->parent = parent; + + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + /* compute max needed attribute */ + for (int natt = 0; natt < numCols; natt++) + { + int attno = keyColIdx[natt]; + + if (attno > maxatt) + maxatt = attno; + } + Assert(maxatt >= 0); + + /* push deform steps */ + scratch.opcode = EEOP_INNER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = ldesc; + scratch.d.fetch.kind = lops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + scratch.opcode = EEOP_OUTER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = rdesc; + scratch.d.fetch.kind = rops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + /* + * Start comparing at the last field (least significant sort key). That's + * the most likely to be different if we are dealing with sorted input. 
+ */ + for (int natt = numCols; --natt >= 0;) + { + int attno = keyColIdx[natt]; + Form_pg_attribute latt = TupleDescAttr(ldesc, attno - 1); + Form_pg_attribute ratt = TupleDescAttr(rdesc, attno - 1); + Oid foid = eqfunctions[natt]; + Oid collid = collations[natt]; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid)); + + InvokeFunctionExecuteHook(foid); + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(foid, finfo); + fmgr_info_set_expr(NULL, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + collid, NULL, NULL); + + /* left arg */ + scratch.opcode = EEOP_INNER_VAR; + scratch.d.var.attnum = attno - 1; + scratch.d.var.vartype = latt->atttypid; + scratch.resvalue = &fcinfo->args[0].value; + scratch.resnull = &fcinfo->args[0].isnull; + ExprEvalPushStep(state, &scratch); + + /* right arg */ + scratch.opcode = EEOP_OUTER_VAR; + scratch.d.var.attnum = attno - 1; + scratch.d.var.vartype = ratt->atttypid; + scratch.resvalue = &fcinfo->args[1].value; + scratch.resnull = &fcinfo->args[1].isnull; + ExprEvalPushStep(state, &scratch); + + /* evaluate distinctness */ + scratch.opcode = EEOP_NOT_DISTINCT; + scratch.d.func.finfo = finfo; + scratch.d.func.fcinfo_data = fcinfo; + scratch.d.func.fn_addr = finfo->fn_addr; + scratch.d.func.nargs = 2; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + + /* then emit EEOP_QUAL to detect if result is false (or null) */ + scratch.opcode = EEOP_QUAL; + scratch.d.qualexpr.jumpdone = -1; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + scratch.resvalue = NULL; + scratch.resnull = NULL; + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * Build equality expression that can be evaluated using ExecQual(), returning + * true if the expression context's inner/outer tuples are equal. Datums in + * the inner/outer slots are assumed to be in the same order and quantity as + * the 'eqfunctions' parameter. NULLs are treated as equal. + * + * desc: tuple descriptor of the to-be-compared tuples + * lops: the slot ops for the inner tuple slots + * rops: the slot ops for the outer tuple slots + * eqFunctions: array of function oids of the equality functions to use + * this must be the same length as the 'param_exprs' list. + * collations: collation Oids to use for equality comparison. Must be the + * same length as the 'param_exprs' list. 
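
/*
 * The NOT DISTINCT semantics the grouping-equality expression implements,
 * restated as a tiny helper: two NULLs match, a NULL and a non-NULL do not,
 * and two non-NULL values are compared with the ordinary equality function.
 * The eq callback and the long arguments are stand-ins for the looked-up
 * equality proc and Datum.
 */
#include <stdbool.h>

static bool
not_distinct(bool anull, long a, bool bnull, long b,
			 bool (*eq) (long, long))
{
	if (anull && bnull)
		return true;		/* both NULL: not distinct */
	if (anull || bnull)
		return false;		/* exactly one NULL: distinct */
	return eq(a, b);		/* neither NULL: use ordinary equality */
}
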
+ * parent: parent executor node + */ +ExprState * +ExecBuildParamSetEqual(TupleDesc desc, + const TupleTableSlotOps *lops, + const TupleTableSlotOps *rops, + const Oid *eqfunctions, + const Oid *collations, + const List *param_exprs, + PlanState *parent) +{ + ExprState *state = makeNode(ExprState); + ExprEvalStep scratch = {0}; + int maxatt = list_length(param_exprs); + List *adjust_jumps = NIL; + ListCell *lc; + + state->expr = NULL; + state->flags = EEO_FLAG_IS_QUAL; + state->parent = parent; + + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + /* push deform steps */ + scratch.opcode = EEOP_INNER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = desc; + scratch.d.fetch.kind = lops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + scratch.opcode = EEOP_OUTER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = desc; + scratch.d.fetch.kind = rops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + for (int attno = 0; attno < maxatt; attno++) + { + Form_pg_attribute att = TupleDescAttr(desc, attno); + Oid foid = eqfunctions[attno]; + Oid collid = collations[attno]; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid)); + + InvokeFunctionExecuteHook(foid); + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(foid, finfo); + fmgr_info_set_expr(NULL, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + collid, NULL, NULL); + + /* left arg */ + scratch.opcode = EEOP_INNER_VAR; + scratch.d.var.attnum = attno; + scratch.d.var.vartype = att->atttypid; + scratch.resvalue = &fcinfo->args[0].value; + scratch.resnull = &fcinfo->args[0].isnull; + ExprEvalPushStep(state, &scratch); + + /* right arg */ + scratch.opcode = EEOP_OUTER_VAR; + scratch.d.var.attnum = attno; + scratch.d.var.vartype = att->atttypid; + scratch.resvalue = &fcinfo->args[1].value; + scratch.resnull = &fcinfo->args[1].isnull; + ExprEvalPushStep(state, &scratch); + + /* evaluate distinctness */ + scratch.opcode = EEOP_NOT_DISTINCT; + scratch.d.func.finfo = finfo; + scratch.d.func.fcinfo_data = fcinfo; + scratch.d.func.fn_addr = finfo->fn_addr; + scratch.d.func.nargs = 2; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + + /* then emit EEOP_QUAL to detect if result is false (or null) */ + scratch.opcode = EEOP_QUAL; + scratch.d.qualexpr.jumpdone = -1; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + scratch.resvalue = NULL; + scratch.resnull = NULL; + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c new file mode 
100644 index 0000000..6b63f93 --- /dev/null +++ b/src/backend/executor/execExprInterp.c @@ -0,0 +1,4373 @@ +/*------------------------------------------------------------------------- + * + * execExprInterp.c + * Interpreted evaluation of an expression step list. + * + * This file provides either a "direct threaded" (for gcc, clang and + * compatible) or a "switch threaded" (for all compilers) implementation of + * expression evaluation. The former is amongst the fastest known methods + * of interpreting programs without resorting to assembly level work, or + * just-in-time compilation, but it requires support for computed gotos. + * The latter is amongst the fastest approaches doable in standard C. + * + * In either case we use ExprEvalStep->opcode to dispatch to the code block + * within ExecInterpExpr() that implements the specific opcode type. + * + * Switch-threading uses a plain switch() statement to perform the + * dispatch. This has the advantages of being plain C and allowing the + * compiler to warn if implementation of a specific opcode has been forgotten. + * The disadvantage is that dispatches will, as commonly implemented by + * compilers, happen from a single location, requiring more jumps and causing + * bad branch prediction. + * + * In direct threading, we use gcc's label-as-values extension - also adopted + * by some other compilers - to replace ExprEvalStep->opcode with the address + * of the block implementing the instruction. Dispatch to the next instruction + * is done by a "computed goto". This allows for better branch prediction + * (as the jumps are happening from different locations) and fewer jumps + * (as no preparatory jump to a common dispatch location is needed). + * + * When using direct threading, ExecReadyInterpretedExpr will replace + * each step's opcode field with the address of the relevant code block and + * ExprState->flags will contain EEO_FLAG_DIRECT_THREADED to remember that + * that's been done. + * + * For very simple instructions the overhead of the full interpreter + * "startup", as minimal as it is, is noticeable. Therefore + * ExecReadyInterpretedExpr will choose to implement certain simple + * opcode patterns using special fast-path routines (ExecJust*). + * + * Complex or uncommon instructions are not implemented in-line in + * ExecInterpExpr(), rather we call out to a helper function appearing later + * in this file. For one reason, there'd not be a noticeable performance + * benefit, but more importantly those complex routines are intended to be + * shared between different expression evaluation approaches. For instance + * a JIT compiler would generate calls to them. (This is why they are + * exported rather than being "static" in this file.) 
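The header comment above contrasts the two dispatch strategies. The standalone sketch below shows both on a three-opcode toy bytecode (names and opcodes are invented for illustration): a central switch loop, and, where the GNU labels-as-values extension is available, a direct-threaded loop in which every opcode ends with a computed goto straight to the next one.

#include <stdio.h>

/* Toy opcodes: load a constant, add the top two values, stop. */
enum { OP_CONST, OP_ADD, OP_DONE };

typedef struct Instr
{
    int op;
    int arg;
} Instr;

/* Switch-threaded: one central switch dispatches every step. */
static int
run_switch(const Instr *prog)
{
    int stack[16], sp = 0;

    for (const Instr *ip = prog;; ip++)
    {
        switch (ip->op)
        {
            case OP_CONST:
                stack[sp++] = ip->arg;
                break;
            case OP_ADD:
                sp--;
                stack[sp - 1] += stack[sp];
                break;
            case OP_DONE:
                return stack[sp - 1];
        }
    }
}

#if defined(__GNUC__)
/*
 * Direct-threaded: each instruction jumps directly to the label of the
 * next instruction, so there is no central dispatch point at all.
 */
static int
run_threaded(const Instr *prog)
{
    static void *const dispatch[] = {&&do_const, &&do_add, &&do_done};
    int         stack[16], sp = 0;
    const Instr *ip = prog;

    goto *dispatch[ip->op];

do_const:
    stack[sp++] = ip->arg;
    ip++;
    goto *dispatch[ip->op];
do_add:
    sp--;
    stack[sp - 1] += stack[sp];
    ip++;
    goto *dispatch[ip->op];
do_done:
    return stack[sp - 1];
}
#endif

int
main(void)
{
    Instr prog[] = {{OP_CONST, 2}, {OP_CONST, 3}, {OP_ADD, 0}, {OP_DONE, 0}};

    printf("switch:   %d\n", run_switch(prog));
#if defined(__GNUC__)
    printf("threaded: %d\n", run_threaded(prog));
#endif
    return 0;
}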
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execExprInterp.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heaptoast.h" +#include "catalog/pg_type.h" +#include "commands/sequence.h" +#include "executor/execExpr.h" +#include "executor/nodeSubplan.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "parser/parsetree.h" +#include "pgstat.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/date.h" +#include "utils/datum.h" +#include "utils/expandedrecord.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" +#include "utils/typcache.h" +#include "utils/xml.h" + +/* + * Use computed-goto-based opcode dispatch when computed gotos are available. + * But use a separate symbol so that it's easy to adjust locally in this file + * for development and testing. + */ +#ifdef HAVE_COMPUTED_GOTO +#define EEO_USE_COMPUTED_GOTO +#endif /* HAVE_COMPUTED_GOTO */ + +/* + * Macros for opcode dispatch. + * + * EEO_SWITCH - just hides the switch if not in use. + * EEO_CASE - labels the implementation of named expression step type. + * EEO_DISPATCH - jump to the implementation of the step type for 'op'. + * EEO_OPCODE - compute opcode required by used expression evaluation method. + * EEO_NEXT - increment 'op' and jump to correct next step type. + * EEO_JUMP - jump to the specified step number within the current expression. + */ +#if defined(EEO_USE_COMPUTED_GOTO) + +/* struct for jump target -> opcode lookup table */ +typedef struct ExprEvalOpLookup +{ + const void *opcode; + ExprEvalOp op; +} ExprEvalOpLookup; + +/* to make dispatch_table accessible outside ExecInterpExpr() */ +static const void **dispatch_table = NULL; + +/* jump target -> opcode lookup table */ +static ExprEvalOpLookup reverse_dispatch_table[EEOP_LAST]; + +#define EEO_SWITCH() +#define EEO_CASE(name) CASE_##name: +#define EEO_DISPATCH() goto *((void *) op->opcode) +#define EEO_OPCODE(opcode) ((intptr_t) dispatch_table[opcode]) + +#else /* !EEO_USE_COMPUTED_GOTO */ + +#define EEO_SWITCH() starteval: switch ((ExprEvalOp) op->opcode) +#define EEO_CASE(name) case name: +#define EEO_DISPATCH() goto starteval +#define EEO_OPCODE(opcode) (opcode) + +#endif /* EEO_USE_COMPUTED_GOTO */ + +#define EEO_NEXT() \ + do { \ + op++; \ + EEO_DISPATCH(); \ + } while (0) + +#define EEO_JUMP(stepno) \ + do { \ + op = &state->steps[stepno]; \ + EEO_DISPATCH(); \ + } while (0) + + +static Datum ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull); +static void ExecInitInterpreter(void); + +/* support functions */ +static void CheckVarSlotCompatibility(TupleTableSlot *slot, int attnum, Oid vartype); +static void CheckOpSlotCompatibility(ExprEvalStep *op, TupleTableSlot *slot); +static TupleDesc get_cached_rowtype(Oid type_id, int32 typmod, + ExprEvalRowtypeCache *rowcache, + bool *changed); +static void ExecEvalRowNullInt(ExprState *state, ExprEvalStep *op, + ExprContext *econtext, bool checkisnull); + +/* fast-path evaluation functions */ +static Datum ExecJustInnerVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustOuterVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustScanVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum 
ExecJustAssignInnerVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignOuterVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignScanVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustApplyFuncToCase(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustConst(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); + +/* execution helper functions */ +static pg_attribute_always_inline void ExecAggPlainTransByVal(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, + int setno); +static pg_attribute_always_inline void ExecAggPlainTransByRef(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, + int setno); + +/* + * ScalarArrayOpExprHashEntry + * Hash table entry type used during EEOP_HASHED_SCALARARRAYOP + */ +typedef struct ScalarArrayOpExprHashEntry +{ + Datum key; + uint32 status; /* hash status */ + uint32 hash; /* hash value (cached) */ +} ScalarArrayOpExprHashEntry; + +#define SH_PREFIX saophash +#define SH_ELEMENT_TYPE ScalarArrayOpExprHashEntry +#define SH_KEY_TYPE Datum +#define SH_SCOPE static inline +#define SH_DECLARE +#include "lib/simplehash.h" + +static bool saop_hash_element_match(struct saophash_hash *tb, Datum key1, + Datum key2); +static uint32 saop_element_hash(struct saophash_hash *tb, Datum key); + +/* + * ScalarArrayOpExprHashTable + * Hash table for EEOP_HASHED_SCALARARRAYOP + */ +typedef struct ScalarArrayOpExprHashTable +{ + saophash_hash *hashtab; /* underlying hash table */ + struct ExprEvalStep *op; +} ScalarArrayOpExprHashTable; + +/* Define parameters for ScalarArrayOpExpr hash table code generation. */ +#define SH_PREFIX saophash +#define SH_ELEMENT_TYPE ScalarArrayOpExprHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY key +#define SH_HASH_KEY(tb, key) saop_element_hash(tb, key) +#define SH_EQUAL(tb, a, b) saop_hash_element_match(tb, a, b) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + +/* + * Prepare ExprState for interpreted execution. + */ +void +ExecReadyInterpretedExpr(ExprState *state) +{ + /* Ensure one-time interpreter setup has been done */ + ExecInitInterpreter(); + + /* Simple validity checks on expression */ + Assert(state->steps_len >= 1); + Assert(state->steps[state->steps_len - 1].opcode == EEOP_DONE); + + /* + * Don't perform redundant initialization. This is unreachable in current + * cases, but might be hit if there's additional expression evaluation + * methods that rely on interpreted execution to work. + */ + if (state->flags & EEO_FLAG_INTERPRETER_INITIALIZED) + return; + + /* + * First time through, check whether attribute matches Var. Might not be + * ok anymore, due to schema changes. 
We do that by setting up a callback + * that does checking on the first call, which then sets the evalfunc + * callback to the actual method of execution. + */ + state->evalfunc = ExecInterpExprStillValid; + + /* DIRECT_THREADED should not already be set */ + Assert((state->flags & EEO_FLAG_DIRECT_THREADED) == 0); + + /* + * There shouldn't be any errors before the expression is fully + * initialized, and even if so, it'd lead to the expression being + * abandoned. So we can set the flag now and save some code. + */ + state->flags |= EEO_FLAG_INTERPRETER_INITIALIZED; + + /* + * Select fast-path evalfuncs for very simple expressions. "Starting up" + * the full interpreter is a measurable overhead for these, and these + * patterns occur often enough to be worth optimizing. + */ + if (state->steps_len == 3) + { + ExprEvalOp step0 = state->steps[0].opcode; + ExprEvalOp step1 = state->steps[1].opcode; + + if (step0 == EEOP_INNER_FETCHSOME && + step1 == EEOP_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustInnerVar; + return; + } + else if (step0 == EEOP_OUTER_FETCHSOME && + step1 == EEOP_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustOuterVar; + return; + } + else if (step0 == EEOP_SCAN_FETCHSOME && + step1 == EEOP_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustScanVar; + return; + } + else if (step0 == EEOP_INNER_FETCHSOME && + step1 == EEOP_ASSIGN_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignInnerVar; + return; + } + else if (step0 == EEOP_OUTER_FETCHSOME && + step1 == EEOP_ASSIGN_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignOuterVar; + return; + } + else if (step0 == EEOP_SCAN_FETCHSOME && + step1 == EEOP_ASSIGN_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignScanVar; + return; + } + else if (step0 == EEOP_CASE_TESTVAL && + step1 == EEOP_FUNCEXPR_STRICT && + state->steps[0].d.casetest.value) + { + state->evalfunc_private = (void *) ExecJustApplyFuncToCase; + return; + } + } + else if (state->steps_len == 2) + { + ExprEvalOp step0 = state->steps[0].opcode; + + if (step0 == EEOP_CONST) + { + state->evalfunc_private = (void *) ExecJustConst; + return; + } + else if (step0 == EEOP_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustInnerVarVirt; + return; + } + else if (step0 == EEOP_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustOuterVarVirt; + return; + } + else if (step0 == EEOP_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustScanVarVirt; + return; + } + else if (step0 == EEOP_ASSIGN_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignInnerVarVirt; + return; + } + else if (step0 == EEOP_ASSIGN_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignOuterVarVirt; + return; + } + else if (step0 == EEOP_ASSIGN_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignScanVarVirt; + return; + } + } + +#if defined(EEO_USE_COMPUTED_GOTO) + + /* + * In the direct-threaded implementation, replace each opcode with the + * address to jump to. (Use ExecEvalStepOp() to get back the opcode.) + */ + for (int off = 0; off < state->steps_len; off++) + { + ExprEvalStep *op = &state->steps[off]; + + op->opcode = EEO_OPCODE(op->opcode); + } + + state->flags |= EEO_FLAG_DIRECT_THREADED; +#endif /* EEO_USE_COMPUTED_GOTO */ + + state->evalfunc_private = (void *) ExecInterpExpr; +} + + +/* + * Evaluate expression identified by "state" in the execution context + * given by "econtext". 
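ExecReadyInterpretedExpr above recognizes a handful of two- and three-step shapes and installs a specialized ExecJust* evaluator instead of the general interpreter. A small standalone sketch of that select-a-fast-path-by-pattern idea, with invented step names and a toy state struct:

#include <stdio.h>

enum { STEP_FETCH, STEP_VAR, STEP_CONST, STEP_DONE };

typedef struct ToyState ToyState;
typedef int (*EvalFn) (const ToyState *);

struct ToyState
{
    int    steps[4];            /* opcode list, terminated by STEP_DONE */
    int    nsteps;
    int    constval;
    EvalFn evalfn;              /* chosen evaluator */
};

/* general path: would walk the whole step list (details elided) */
static int
eval_general(const ToyState *s)
{
    return s->constval;
}

/* fast path for the two-step pattern CONST, DONE */
static int
eval_just_const(const ToyState *s)
{
    return s->constval;
}

static void
ready(ToyState *s)
{
    /* recognize a trivial shape and install the specialized evaluator */
    if (s->nsteps == 2 && s->steps[0] == STEP_CONST)
        s->evalfn = eval_just_const;
    else
        s->evalfn = eval_general;
}

int
main(void)
{
    ToyState s = {{STEP_CONST, STEP_DONE}, 2, 41, NULL};

    ready(&s);
    printf("%d (via %s)\n", s.evalfn(&s),
           s.evalfn == eval_just_const ? "fast path" : "general path");
    return 0;
}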
*isnull is set to the is-null flag for the result, + * and the Datum value is the function result. + * + * As a special case, return the dispatch table's address if state is NULL. + * This is used by ExecInitInterpreter to set up the dispatch_table global. + * (Only applies when EEO_USE_COMPUTED_GOTO is defined.) + */ +static Datum +ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) +{ + ExprEvalStep *op; + TupleTableSlot *resultslot; + TupleTableSlot *innerslot; + TupleTableSlot *outerslot; + TupleTableSlot *scanslot; + + /* + * This array has to be in the same order as enum ExprEvalOp. + */ +#if defined(EEO_USE_COMPUTED_GOTO) + static const void *const dispatch_table[] = { + &&CASE_EEOP_DONE, + &&CASE_EEOP_INNER_FETCHSOME, + &&CASE_EEOP_OUTER_FETCHSOME, + &&CASE_EEOP_SCAN_FETCHSOME, + &&CASE_EEOP_INNER_VAR, + &&CASE_EEOP_OUTER_VAR, + &&CASE_EEOP_SCAN_VAR, + &&CASE_EEOP_INNER_SYSVAR, + &&CASE_EEOP_OUTER_SYSVAR, + &&CASE_EEOP_SCAN_SYSVAR, + &&CASE_EEOP_WHOLEROW, + &&CASE_EEOP_ASSIGN_INNER_VAR, + &&CASE_EEOP_ASSIGN_OUTER_VAR, + &&CASE_EEOP_ASSIGN_SCAN_VAR, + &&CASE_EEOP_ASSIGN_TMP, + &&CASE_EEOP_ASSIGN_TMP_MAKE_RO, + &&CASE_EEOP_CONST, + &&CASE_EEOP_FUNCEXPR, + &&CASE_EEOP_FUNCEXPR_STRICT, + &&CASE_EEOP_FUNCEXPR_FUSAGE, + &&CASE_EEOP_FUNCEXPR_STRICT_FUSAGE, + &&CASE_EEOP_BOOL_AND_STEP_FIRST, + &&CASE_EEOP_BOOL_AND_STEP, + &&CASE_EEOP_BOOL_AND_STEP_LAST, + &&CASE_EEOP_BOOL_OR_STEP_FIRST, + &&CASE_EEOP_BOOL_OR_STEP, + &&CASE_EEOP_BOOL_OR_STEP_LAST, + &&CASE_EEOP_BOOL_NOT_STEP, + &&CASE_EEOP_QUAL, + &&CASE_EEOP_JUMP, + &&CASE_EEOP_JUMP_IF_NULL, + &&CASE_EEOP_JUMP_IF_NOT_NULL, + &&CASE_EEOP_JUMP_IF_NOT_TRUE, + &&CASE_EEOP_NULLTEST_ISNULL, + &&CASE_EEOP_NULLTEST_ISNOTNULL, + &&CASE_EEOP_NULLTEST_ROWISNULL, + &&CASE_EEOP_NULLTEST_ROWISNOTNULL, + &&CASE_EEOP_BOOLTEST_IS_TRUE, + &&CASE_EEOP_BOOLTEST_IS_NOT_TRUE, + &&CASE_EEOP_BOOLTEST_IS_FALSE, + &&CASE_EEOP_BOOLTEST_IS_NOT_FALSE, + &&CASE_EEOP_PARAM_EXEC, + &&CASE_EEOP_PARAM_EXTERN, + &&CASE_EEOP_PARAM_CALLBACK, + &&CASE_EEOP_CASE_TESTVAL, + &&CASE_EEOP_MAKE_READONLY, + &&CASE_EEOP_IOCOERCE, + &&CASE_EEOP_DISTINCT, + &&CASE_EEOP_NOT_DISTINCT, + &&CASE_EEOP_NULLIF, + &&CASE_EEOP_SQLVALUEFUNCTION, + &&CASE_EEOP_CURRENTOFEXPR, + &&CASE_EEOP_NEXTVALUEEXPR, + &&CASE_EEOP_ARRAYEXPR, + &&CASE_EEOP_ARRAYCOERCE, + &&CASE_EEOP_ROW, + &&CASE_EEOP_ROWCOMPARE_STEP, + &&CASE_EEOP_ROWCOMPARE_FINAL, + &&CASE_EEOP_MINMAX, + &&CASE_EEOP_FIELDSELECT, + &&CASE_EEOP_FIELDSTORE_DEFORM, + &&CASE_EEOP_FIELDSTORE_FORM, + &&CASE_EEOP_SBSREF_SUBSCRIPTS, + &&CASE_EEOP_SBSREF_OLD, + &&CASE_EEOP_SBSREF_ASSIGN, + &&CASE_EEOP_SBSREF_FETCH, + &&CASE_EEOP_DOMAIN_TESTVAL, + &&CASE_EEOP_DOMAIN_NOTNULL, + &&CASE_EEOP_DOMAIN_CHECK, + &&CASE_EEOP_CONVERT_ROWTYPE, + &&CASE_EEOP_SCALARARRAYOP, + &&CASE_EEOP_HASHED_SCALARARRAYOP, + &&CASE_EEOP_XMLEXPR, + &&CASE_EEOP_AGGREF, + &&CASE_EEOP_GROUPING_FUNC, + &&CASE_EEOP_WINDOW_FUNC, + &&CASE_EEOP_SUBPLAN, + &&CASE_EEOP_AGG_STRICT_DESERIALIZE, + &&CASE_EEOP_AGG_DESERIALIZE, + &&CASE_EEOP_AGG_STRICT_INPUT_CHECK_ARGS, + &&CASE_EEOP_AGG_STRICT_INPUT_CHECK_NULLS, + &&CASE_EEOP_AGG_PLAIN_PERGROUP_NULLCHECK, + &&CASE_EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL, + &&CASE_EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL, + &&CASE_EEOP_AGG_PLAIN_TRANS_BYVAL, + &&CASE_EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF, + &&CASE_EEOP_AGG_PLAIN_TRANS_STRICT_BYREF, + &&CASE_EEOP_AGG_PLAIN_TRANS_BYREF, + &&CASE_EEOP_AGG_ORDERED_TRANS_DATUM, + &&CASE_EEOP_AGG_ORDERED_TRANS_TUPLE, + &&CASE_EEOP_LAST + }; + + StaticAssertStmt(EEOP_LAST + 1 == lengthof(dispatch_table), + 
"dispatch_table out of whack with ExprEvalOp"); + + if (unlikely(state == NULL)) + return PointerGetDatum(dispatch_table); +#else + Assert(state != NULL); +#endif /* EEO_USE_COMPUTED_GOTO */ + + /* setup state */ + op = state->steps; + resultslot = state->resultslot; + innerslot = econtext->ecxt_innertuple; + outerslot = econtext->ecxt_outertuple; + scanslot = econtext->ecxt_scantuple; + +#if defined(EEO_USE_COMPUTED_GOTO) + EEO_DISPATCH(); +#endif + + EEO_SWITCH() + { + EEO_CASE(EEOP_DONE) + { + goto out; + } + + EEO_CASE(EEOP_INNER_FETCHSOME) + { + CheckOpSlotCompatibility(op, innerslot); + + slot_getsomeattrs(innerslot, op->d.fetch.last_var); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_OUTER_FETCHSOME) + { + CheckOpSlotCompatibility(op, outerslot); + + slot_getsomeattrs(outerslot, op->d.fetch.last_var); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCAN_FETCHSOME) + { + CheckOpSlotCompatibility(op, scanslot); + + slot_getsomeattrs(scanslot, op->d.fetch.last_var); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_INNER_VAR) + { + int attnum = op->d.var.attnum; + + /* + * Since we already extracted all referenced columns from the + * tuple with a FETCHSOME step, we can just grab the value + * directly out of the slot's decomposed-data arrays. But let's + * have an Assert to check that that did happen. + */ + Assert(attnum >= 0 && attnum < innerslot->tts_nvalid); + *op->resvalue = innerslot->tts_values[attnum]; + *op->resnull = innerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_OUTER_VAR) + { + int attnum = op->d.var.attnum; + + /* See EEOP_INNER_VAR comments */ + + Assert(attnum >= 0 && attnum < outerslot->tts_nvalid); + *op->resvalue = outerslot->tts_values[attnum]; + *op->resnull = outerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCAN_VAR) + { + int attnum = op->d.var.attnum; + + /* See EEOP_INNER_VAR comments */ + + Assert(attnum >= 0 && attnum < scanslot->tts_nvalid); + *op->resvalue = scanslot->tts_values[attnum]; + *op->resnull = scanslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_INNER_SYSVAR) + { + ExecEvalSysVar(state, op, econtext, innerslot); + EEO_NEXT(); + } + + EEO_CASE(EEOP_OUTER_SYSVAR) + { + ExecEvalSysVar(state, op, econtext, outerslot); + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCAN_SYSVAR) + { + ExecEvalSysVar(state, op, econtext, scanslot); + EEO_NEXT(); + } + + EEO_CASE(EEOP_WHOLEROW) + { + /* too complex for an inline implementation */ + ExecEvalWholeRowVar(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_INNER_VAR) + { + int resultnum = op->d.assign_var.resultnum; + int attnum = op->d.assign_var.attnum; + + /* + * We do not need CheckVarSlotCompatibility here; that was taken + * care of at compilation time. But see EEOP_INNER_VAR comments. + */ + Assert(attnum >= 0 && attnum < innerslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = innerslot->tts_values[attnum]; + resultslot->tts_isnull[resultnum] = innerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_OUTER_VAR) + { + int resultnum = op->d.assign_var.resultnum; + int attnum = op->d.assign_var.attnum; + + /* + * We do not need CheckVarSlotCompatibility here; that was taken + * care of at compilation time. But see EEOP_INNER_VAR comments. 
+ */ + Assert(attnum >= 0 && attnum < outerslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = outerslot->tts_values[attnum]; + resultslot->tts_isnull[resultnum] = outerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_SCAN_VAR) + { + int resultnum = op->d.assign_var.resultnum; + int attnum = op->d.assign_var.attnum; + + /* + * We do not need CheckVarSlotCompatibility here; that was taken + * care of at compilation time. But see EEOP_INNER_VAR comments. + */ + Assert(attnum >= 0 && attnum < scanslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = scanslot->tts_values[attnum]; + resultslot->tts_isnull[resultnum] = scanslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_TMP) + { + int resultnum = op->d.assign_tmp.resultnum; + + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = state->resvalue; + resultslot->tts_isnull[resultnum] = state->resnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_TMP_MAKE_RO) + { + int resultnum = op->d.assign_tmp.resultnum; + + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_isnull[resultnum] = state->resnull; + if (!resultslot->tts_isnull[resultnum]) + resultslot->tts_values[resultnum] = + MakeExpandedObjectReadOnlyInternal(state->resvalue); + else + resultslot->tts_values[resultnum] = state->resvalue; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_CONST) + { + *op->resnull = op->d.constval.isnull; + *op->resvalue = op->d.constval.value; + + EEO_NEXT(); + } + + /* + * Function-call implementations. Arguments have previously been + * evaluated directly into fcinfo->args. + * + * As both STRICT checks and function-usage are noticeable performance + * wise, and function calls are a very hot-path (they also back + * operators!), it's worth having so many separate opcodes. + * + * Note: the reason for using a temporary variable "d", here and in + * other places, is that some compilers think "*op->resvalue = f();" + * requires them to evaluate op->resvalue into a register before + * calling f(), just in case f() is able to modify op->resvalue + * somehow. The extra line of code can save a useless register spill + * and reload across the function call. 
+ */ + EEO_CASE(EEOP_FUNCEXPR) + { + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + Datum d; + + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FUNCEXPR_STRICT) + { + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + NullableDatum *args = fcinfo->args; + int nargs = op->d.func.nargs; + Datum d; + + /* strict function, so check for NULL args */ + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + { + *op->resnull = true; + goto strictfail; + } + } + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + strictfail: + EEO_NEXT(); + } + + EEO_CASE(EEOP_FUNCEXPR_FUSAGE) + { + /* not common enough to inline */ + ExecEvalFuncExprFusage(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FUNCEXPR_STRICT_FUSAGE) + { + /* not common enough to inline */ + ExecEvalFuncExprStrictFusage(state, op, econtext); + + EEO_NEXT(); + } + + /* + * If any of its clauses is FALSE, an AND's result is FALSE regardless + * of the states of the rest of the clauses, so we can stop evaluating + * and return FALSE immediately. If none are FALSE and one or more is + * NULL, we return NULL; otherwise we return TRUE. This makes sense + * when you interpret NULL as "don't know": perhaps one of the "don't + * knows" would have been FALSE if we'd known its value. Only when + * all the inputs are known to be TRUE can we state confidently that + * the AND's result is TRUE. + */ + EEO_CASE(EEOP_BOOL_AND_STEP_FIRST) + { + *op->d.boolexpr.anynull = false; + + /* + * EEOP_BOOL_AND_STEP_FIRST resets anynull, otherwise it's the + * same as EEOP_BOOL_AND_STEP - so fall through to that. + */ + + /* FALL THROUGH */ + } + + EEO_CASE(EEOP_BOOL_AND_STEP) + { + if (*op->resnull) + { + *op->d.boolexpr.anynull = true; + } + else if (!DatumGetBool(*op->resvalue)) + { + /* result is already set to FALSE, need not change it */ + /* bail out early */ + EEO_JUMP(op->d.boolexpr.jumpdone); + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOL_AND_STEP_LAST) + { + if (*op->resnull) + { + /* result is already set to NULL, need not change it */ + } + else if (!DatumGetBool(*op->resvalue)) + { + /* result is already set to FALSE, need not change it */ + + /* + * No point jumping early to jumpdone - would be same target + * (as this is the last argument to the AND expression), + * except more expensive. + */ + } + else if (*op->d.boolexpr.anynull) + { + *op->resvalue = (Datum) 0; + *op->resnull = true; + } + else + { + /* result is already set to TRUE, need not change it */ + } + + EEO_NEXT(); + } + + /* + * If any of its clauses is TRUE, an OR's result is TRUE regardless of + * the states of the rest of the clauses, so we can stop evaluating + * and return TRUE immediately. If none are TRUE and one or more is + * NULL, we return NULL; otherwise we return FALSE. This makes sense + * when you interpret NULL as "don't know": perhaps one of the "don't + * knows" would have been TRUE if we'd known its value. Only when all + * the inputs are known to be FALSE can we state confidently that the + * OR's result is FALSE. + */ + EEO_CASE(EEOP_BOOL_OR_STEP_FIRST) + { + *op->d.boolexpr.anynull = false; + + /* + * EEOP_BOOL_OR_STEP_FIRST resets anynull, otherwise it's the same + * as EEOP_BOOL_OR_STEP - so fall through to that. 
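The comment above spells out SQL's three-valued AND: any FALSE clause forces FALSE immediately, otherwise any NULL clause forces NULL, and only all-TRUE inputs yield TRUE; the steps track this with the anynull flag plus an early-exit jump. A self-contained sketch of the same rule over a toy tri-valued type:

#include <stdbool.h>
#include <stdio.h>

typedef enum { TRI_FALSE, TRI_TRUE, TRI_NULL } TriBool;

/* AND over a clause list: FALSE wins immediately, then NULL, then TRUE */
static TriBool
tri_and(const TriBool *clauses, int n)
{
    bool anynull = false;

    for (int i = 0; i < n; i++)
    {
        if (clauses[i] == TRI_FALSE)
            return TRI_FALSE;   /* early exit, like EEO_JUMP(jumpdone)      */
        if (clauses[i] == TRI_NULL)
            anynull = true;     /* remember, like *op->d.boolexpr.anynull   */
    }
    return anynull ? TRI_NULL : TRI_TRUE;
}

int
main(void)
{
    TriBool a[] = {TRI_TRUE, TRI_NULL, TRI_TRUE};
    TriBool b[] = {TRI_TRUE, TRI_NULL, TRI_FALSE};

    printf("%d %d\n", tri_and(a, 3), tri_and(b, 3));    /* NULL, FALSE */
    return 0;
}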
+ */ + + /* FALL THROUGH */ + } + + EEO_CASE(EEOP_BOOL_OR_STEP) + { + if (*op->resnull) + { + *op->d.boolexpr.anynull = true; + } + else if (DatumGetBool(*op->resvalue)) + { + /* result is already set to TRUE, need not change it */ + /* bail out early */ + EEO_JUMP(op->d.boolexpr.jumpdone); + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOL_OR_STEP_LAST) + { + if (*op->resnull) + { + /* result is already set to NULL, need not change it */ + } + else if (DatumGetBool(*op->resvalue)) + { + /* result is already set to TRUE, need not change it */ + + /* + * No point jumping to jumpdone - would be same target (as + * this is the last argument to the AND expression), except + * more expensive. + */ + } + else if (*op->d.boolexpr.anynull) + { + *op->resvalue = (Datum) 0; + *op->resnull = true; + } + else + { + /* result is already set to FALSE, need not change it */ + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOL_NOT_STEP) + { + /* + * Evaluation of 'not' is simple... if expr is false, then return + * 'true' and vice versa. It's safe to do this even on a + * nominally null value, so we ignore resnull; that means that + * NULL in produces NULL out, which is what we want. + */ + *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue)); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_QUAL) + { + /* simplified version of BOOL_AND_STEP for use by ExecQual() */ + + /* If argument (also result) is false or null ... */ + if (*op->resnull || + !DatumGetBool(*op->resvalue)) + { + /* ... bail out early, returning FALSE */ + *op->resnull = false; + *op->resvalue = BoolGetDatum(false); + EEO_JUMP(op->d.qualexpr.jumpdone); + } + + /* + * Otherwise, leave the TRUE value in place, in case this is the + * last qual. Then, TRUE is the correct answer. + */ + + EEO_NEXT(); + } + + EEO_CASE(EEOP_JUMP) + { + /* Unconditionally jump to target step */ + EEO_JUMP(op->d.jump.jumpdone); + } + + EEO_CASE(EEOP_JUMP_IF_NULL) + { + /* Transfer control if current result is null */ + if (*op->resnull) + EEO_JUMP(op->d.jump.jumpdone); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_JUMP_IF_NOT_NULL) + { + /* Transfer control if current result is non-null */ + if (!*op->resnull) + EEO_JUMP(op->d.jump.jumpdone); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_JUMP_IF_NOT_TRUE) + { + /* Transfer control if current result is null or false */ + if (*op->resnull || !DatumGetBool(*op->resvalue)) + EEO_JUMP(op->d.jump.jumpdone); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ISNULL) + { + *op->resvalue = BoolGetDatum(*op->resnull); + *op->resnull = false; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ISNOTNULL) + { + *op->resvalue = BoolGetDatum(!*op->resnull); + *op->resnull = false; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ROWISNULL) + { + /* out of line implementation: too large */ + ExecEvalRowNull(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ROWISNOTNULL) + { + /* out of line implementation: too large */ + ExecEvalRowNotNull(state, op, econtext); + + EEO_NEXT(); + } + + /* BooleanTest implementations for all booltesttypes */ + + EEO_CASE(EEOP_BOOLTEST_IS_TRUE) + { + if (*op->resnull) + { + *op->resvalue = BoolGetDatum(false); + *op->resnull = false; + } + /* else, input value is the correct output as well */ + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOLTEST_IS_NOT_TRUE) + { + if (*op->resnull) + { + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + else + *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue)); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOLTEST_IS_FALSE) + { + if (*op->resnull) + { + *op->resvalue = 
BoolGetDatum(false); + *op->resnull = false; + } + else + *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue)); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOLTEST_IS_NOT_FALSE) + { + if (*op->resnull) + { + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + /* else, input value is the correct output as well */ + + EEO_NEXT(); + } + + EEO_CASE(EEOP_PARAM_EXEC) + { + /* out of line implementation: too large */ + ExecEvalParamExec(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_PARAM_EXTERN) + { + /* out of line implementation: too large */ + ExecEvalParamExtern(state, op, econtext); + EEO_NEXT(); + } + + EEO_CASE(EEOP_PARAM_CALLBACK) + { + /* allow an extension module to supply a PARAM_EXTERN value */ + op->d.cparam.paramfunc(state, op, econtext); + EEO_NEXT(); + } + + EEO_CASE(EEOP_CASE_TESTVAL) + { + /* + * Normally upper parts of the expression tree have setup the + * values to be returned here, but some parts of the system + * currently misuse {caseValue,domainValue}_{datum,isNull} to set + * run-time data. So if no values have been set-up, use + * ExprContext's. This isn't pretty, but also not *that* ugly, + * and this is unlikely to be performance sensitive enough to + * worry about an extra branch. + */ + if (op->d.casetest.value) + { + *op->resvalue = *op->d.casetest.value; + *op->resnull = *op->d.casetest.isnull; + } + else + { + *op->resvalue = econtext->caseValue_datum; + *op->resnull = econtext->caseValue_isNull; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DOMAIN_TESTVAL) + { + /* + * See EEOP_CASE_TESTVAL comment. + */ + if (op->d.casetest.value) + { + *op->resvalue = *op->d.casetest.value; + *op->resnull = *op->d.casetest.isnull; + } + else + { + *op->resvalue = econtext->domainValue_datum; + *op->resnull = econtext->domainValue_isNull; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_MAKE_READONLY) + { + /* + * Force a varlena value that might be read multiple times to R/O + */ + if (!*op->d.make_readonly.isnull) + *op->resvalue = + MakeExpandedObjectReadOnlyInternal(*op->d.make_readonly.value); + *op->resnull = *op->d.make_readonly.isnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_IOCOERCE) + { + /* + * Evaluate a CoerceViaIO node. This can be quite a hot path, so + * inline as much work as possible. The source value is in our + * result variable. 
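The BOOLTEST_* cases above implement IS [NOT] TRUE/FALSE, which never return NULL: a NULL input simply fails IS TRUE and IS FALSE, and passes their negations. A small standalone sketch of those rules (toy SQL-boolean type, illustrative only):

#include <stdbool.h>
#include <stdio.h>

typedef enum { B_FALSE, B_TRUE, B_NULL } SqlBool;

/* x IS TRUE: a NULL input yields false, never NULL */
static bool
is_true(SqlBool x)
{
    return x == B_TRUE;
}

/* x IS NOT FALSE: a NULL input yields true */
static bool
is_not_false(SqlBool x)
{
    return x != B_FALSE;
}

int
main(void)
{
    SqlBool vals[] = {B_FALSE, B_TRUE, B_NULL};

    for (int i = 0; i < 3; i++)
        printf("input=%d  IS TRUE=%d  IS NOT FALSE=%d\n",
               vals[i], is_true(vals[i]), is_not_false(vals[i]));
    return 0;
}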
+ */ + char *str; + + /* call output function (similar to OutputFunctionCall) */ + if (*op->resnull) + { + /* output functions are not called on nulls */ + str = NULL; + } + else + { + FunctionCallInfo fcinfo_out; + + fcinfo_out = op->d.iocoerce.fcinfo_data_out; + fcinfo_out->args[0].value = *op->resvalue; + fcinfo_out->args[0].isnull = false; + + fcinfo_out->isnull = false; + str = DatumGetCString(FunctionCallInvoke(fcinfo_out)); + + /* OutputFunctionCall assumes result isn't null */ + Assert(!fcinfo_out->isnull); + } + + /* call input function (similar to InputFunctionCall) */ + if (!op->d.iocoerce.finfo_in->fn_strict || str != NULL) + { + FunctionCallInfo fcinfo_in; + Datum d; + + fcinfo_in = op->d.iocoerce.fcinfo_data_in; + fcinfo_in->args[0].value = PointerGetDatum(str); + fcinfo_in->args[0].isnull = *op->resnull; + /* second and third arguments are already set up */ + + fcinfo_in->isnull = false; + d = FunctionCallInvoke(fcinfo_in); + *op->resvalue = d; + + /* Should get null result if and only if str is NULL */ + if (str == NULL) + { + Assert(*op->resnull); + Assert(fcinfo_in->isnull); + } + else + { + Assert(!*op->resnull); + Assert(!fcinfo_in->isnull); + } + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DISTINCT) + { + /* + * IS DISTINCT FROM must evaluate arguments (already done into + * fcinfo->args) to determine whether they are NULL; if either is + * NULL then the result is determined. If neither is NULL, then + * proceed to evaluate the comparison function, which is just the + * type's standard equality operator. We need not care whether + * that function is strict. Because the handling of nulls is + * different, we can't just reuse EEOP_FUNCEXPR. + */ + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + + /* check function arguments for NULLness */ + if (fcinfo->args[0].isnull && fcinfo->args[1].isnull) + { + /* Both NULL? Then is not distinct... */ + *op->resvalue = BoolGetDatum(false); + *op->resnull = false; + } + else if (fcinfo->args[0].isnull || fcinfo->args[1].isnull) + { + /* Only one is NULL? Then is distinct... */ + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + else + { + /* Neither null, so apply the equality function */ + Datum eqresult; + + fcinfo->isnull = false; + eqresult = op->d.func.fn_addr(fcinfo); + /* Must invert result of "="; safe to do even if null */ + *op->resvalue = BoolGetDatum(!DatumGetBool(eqresult)); + *op->resnull = fcinfo->isnull; + } + + EEO_NEXT(); + } + + /* see EEOP_DISTINCT for comments, this is just inverted */ + EEO_CASE(EEOP_NOT_DISTINCT) + { + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + + if (fcinfo->args[0].isnull && fcinfo->args[1].isnull) + { + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + else if (fcinfo->args[0].isnull || fcinfo->args[1].isnull) + { + *op->resvalue = BoolGetDatum(false); + *op->resnull = false; + } + else + { + Datum eqresult; + + fcinfo->isnull = false; + eqresult = op->d.func.fn_addr(fcinfo); + *op->resvalue = eqresult; + *op->resnull = fcinfo->isnull; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLIF) + { + /* + * The arguments are already evaluated into fcinfo->args. 
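EEOP_DISTINCT and EEOP_NOT_DISTINCT treat NULLs as comparable values: two NULLs are not distinct, exactly one NULL makes the pair distinct, and only two non-NULL inputs reach the type's equality function. A standalone sketch over nullable ints (illustrative types, not Datums):

#include <stdbool.h>
#include <stdio.h>

typedef struct NullableInt
{
    int  value;
    bool isnull;
} NullableInt;

/* a IS NOT DISTINCT FROM b: NULLs compare equal to each other */
static bool
not_distinct(NullableInt a, NullableInt b)
{
    if (a.isnull && b.isnull)
        return true;            /* both NULL: not distinct           */
    if (a.isnull || b.isnull)
        return false;           /* exactly one NULL: distinct        */
    return a.value == b.value;  /* neither NULL: ordinary equality   */
}

int
main(void)
{
    NullableInt n = {0, true}, one = {1, false}, also_one = {1, false};

    printf("%d %d %d\n",
           not_distinct(n, n),              /* 1: NULL vs NULL */
           not_distinct(n, one),            /* 0: NULL vs 1    */
           not_distinct(one, also_one));    /* 1: 1 vs 1       */
    return 0;
}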
+ */ + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + + /* if either argument is NULL they can't be equal */ + if (!fcinfo->args[0].isnull && !fcinfo->args[1].isnull) + { + Datum result; + + fcinfo->isnull = false; + result = op->d.func.fn_addr(fcinfo); + + /* if the arguments are equal return null */ + if (!fcinfo->isnull && DatumGetBool(result)) + { + *op->resvalue = (Datum) 0; + *op->resnull = true; + + EEO_NEXT(); + } + } + + /* Arguments aren't equal, so return the first one */ + *op->resvalue = fcinfo->args[0].value; + *op->resnull = fcinfo->args[0].isnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SQLVALUEFUNCTION) + { + /* + * Doesn't seem worthwhile to have an inline implementation + * efficiency-wise. + */ + ExecEvalSQLValueFunction(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_CURRENTOFEXPR) + { + /* error invocation uses space, and shouldn't ever occur */ + ExecEvalCurrentOfExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NEXTVALUEEXPR) + { + /* + * Doesn't seem worthwhile to have an inline implementation + * efficiency-wise. + */ + ExecEvalNextValueExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ARRAYEXPR) + { + /* too complex for an inline implementation */ + ExecEvalArrayExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ARRAYCOERCE) + { + /* too complex for an inline implementation */ + ExecEvalArrayCoerce(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ROW) + { + /* too complex for an inline implementation */ + ExecEvalRow(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ROWCOMPARE_STEP) + { + FunctionCallInfo fcinfo = op->d.rowcompare_step.fcinfo_data; + Datum d; + + /* force NULL result if strict fn and NULL input */ + if (op->d.rowcompare_step.finfo->fn_strict && + (fcinfo->args[0].isnull || fcinfo->args[1].isnull)) + { + *op->resnull = true; + EEO_JUMP(op->d.rowcompare_step.jumpnull); + } + + /* Apply comparison function */ + fcinfo->isnull = false; + d = op->d.rowcompare_step.fn_addr(fcinfo); + *op->resvalue = d; + + /* force NULL result if NULL function result */ + if (fcinfo->isnull) + { + *op->resnull = true; + EEO_JUMP(op->d.rowcompare_step.jumpnull); + } + *op->resnull = false; + + /* If unequal, no need to compare remaining columns */ + if (DatumGetInt32(*op->resvalue) != 0) + { + EEO_JUMP(op->d.rowcompare_step.jumpdone); + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ROWCOMPARE_FINAL) + { + int32 cmpresult = DatumGetInt32(*op->resvalue); + RowCompareType rctype = op->d.rowcompare_final.rctype; + + *op->resnull = false; + switch (rctype) + { + /* EQ and NE cases aren't allowed here */ + case ROWCOMPARE_LT: + *op->resvalue = BoolGetDatum(cmpresult < 0); + break; + case ROWCOMPARE_LE: + *op->resvalue = BoolGetDatum(cmpresult <= 0); + break; + case ROWCOMPARE_GE: + *op->resvalue = BoolGetDatum(cmpresult >= 0); + break; + case ROWCOMPARE_GT: + *op->resvalue = BoolGetDatum(cmpresult > 0); + break; + default: + Assert(false); + break; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_MINMAX) + { + /* too complex for an inline implementation */ + ExecEvalMinMax(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FIELDSELECT) + { + /* too complex for an inline implementation */ + ExecEvalFieldSelect(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FIELDSTORE_DEFORM) + { + /* too complex for an inline implementation */ + ExecEvalFieldStoreDeForm(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FIELDSTORE_FORM) + { + /* too complex for an inline implementation */ + ExecEvalFieldStoreForm(state, op, econtext); + + EEO_NEXT(); + } + 
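EEOP_ROWCOMPARE_STEP compares one column pair at a time and jumps out as soon as a pair is unequal (or a NULL makes the result unknown); EEOP_ROWCOMPARE_FINAL then maps the sign of the last comparison onto the requested <, <=, >= or > answer. A compact standalone sketch of that pairwise, early-exit scheme with plain ints and no NULL handling:

#include <stdbool.h>
#include <stdio.h>

/* compare rows column by column; stop at the first unequal pair */
static int
row_cmp(const int *a, const int *b, int ncols)
{
    for (int i = 0; i < ncols; i++)
    {
        int c = (a[i] > b[i]) - (a[i] < b[i]);

        if (c != 0)
            return c;   /* like EEO_JUMP(jumpdone): later columns ignored */
    }
    return 0;
}

/* ROWCOMPARE_FINAL analogue: turn the sign into the requested operator */
static bool
row_less_equal(const int *a, const int *b, int ncols)
{
    return row_cmp(a, b, ncols) <= 0;
}

int
main(void)
{
    int r1[] = {1, 2, 3};
    int r2[] = {1, 3, 0};

    printf("cmp=%d  (r1 <= r2)=%d\n",
           row_cmp(r1, r2, 3), row_less_equal(r1, r2, 3));
    return 0;
}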
+ EEO_CASE(EEOP_SBSREF_SUBSCRIPTS) + { + /* Precheck SubscriptingRef subscript(s) */ + if (op->d.sbsref_subscript.subscriptfunc(state, op, econtext)) + { + EEO_NEXT(); + } + else + { + /* Subscript is null, short-circuit SubscriptingRef to NULL */ + EEO_JUMP(op->d.sbsref_subscript.jumpdone); + } + } + + EEO_CASE(EEOP_SBSREF_OLD) + EEO_CASE(EEOP_SBSREF_ASSIGN) + EEO_CASE(EEOP_SBSREF_FETCH) + { + /* Perform a SubscriptingRef fetch or assignment */ + op->d.sbsref.subscriptfunc(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_CONVERT_ROWTYPE) + { + /* too complex for an inline implementation */ + ExecEvalConvertRowtype(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCALARARRAYOP) + { + /* too complex for an inline implementation */ + ExecEvalScalarArrayOp(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_HASHED_SCALARARRAYOP) + { + /* too complex for an inline implementation */ + ExecEvalHashedScalarArrayOp(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DOMAIN_NOTNULL) + { + /* too complex for an inline implementation */ + ExecEvalConstraintNotNull(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DOMAIN_CHECK) + { + /* too complex for an inline implementation */ + ExecEvalConstraintCheck(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_XMLEXPR) + { + /* too complex for an inline implementation */ + ExecEvalXmlExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_AGGREF) + { + /* + * Returns a Datum whose value is the precomputed aggregate value + * found in the given expression context. + */ + int aggno = op->d.aggref.aggno; + + Assert(econtext->ecxt_aggvalues != NULL); + + *op->resvalue = econtext->ecxt_aggvalues[aggno]; + *op->resnull = econtext->ecxt_aggnulls[aggno]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_GROUPING_FUNC) + { + /* too complex/uncommon for an inline implementation */ + ExecEvalGroupingFunc(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_WINDOW_FUNC) + { + /* + * Like Aggref, just return a precomputed value from the econtext. + */ + WindowFuncExprState *wfunc = op->d.window_func.wfstate; + + Assert(econtext->ecxt_aggvalues != NULL); + + *op->resvalue = econtext->ecxt_aggvalues[wfunc->wfuncno]; + *op->resnull = econtext->ecxt_aggnulls[wfunc->wfuncno]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SUBPLAN) + { + /* too complex for an inline implementation */ + ExecEvalSubPlan(state, op, econtext); + + EEO_NEXT(); + } + + /* evaluate a strict aggregate deserialization function */ + EEO_CASE(EEOP_AGG_STRICT_DESERIALIZE) + { + /* Don't call a strict deserialization function with NULL input */ + if (op->d.agg_deserialize.fcinfo_data->args[0].isnull) + EEO_JUMP(op->d.agg_deserialize.jumpnull); + + /* fallthrough */ + } + + /* evaluate aggregate deserialization function (non-strict portion) */ + EEO_CASE(EEOP_AGG_DESERIALIZE) + { + FunctionCallInfo fcinfo = op->d.agg_deserialize.fcinfo_data; + AggState *aggstate = castNode(AggState, state->parent); + MemoryContext oldContext; + + /* + * We run the deserialization functions in per-input-tuple memory + * context. + */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + fcinfo->isnull = false; + *op->resvalue = FunctionCallInvoke(fcinfo); + *op->resnull = fcinfo->isnull; + MemoryContextSwitchTo(oldContext); + + EEO_NEXT(); + } + + /* + * Check that a strict aggregate transition / combination function's + * input is not NULL. 
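EEOP_AGG_DESERIALIZE above runs the deserialization call inside the per-input-tuple memory context and restores the previous context afterwards. The real code uses MemoryContextSwitchTo(); the standalone sketch below only mimics that save-switch-restore discipline with a toy "current context" pointer:

#include <stdio.h>

/* toy stand-ins for MemoryContext and MemoryContextSwitchTo() */
typedef struct ToyContext { const char *name; } ToyContext;

static ToyContext *CurrentToyContext;

static ToyContext *
toy_switch_to(ToyContext *ctx)
{
    ToyContext *old = CurrentToyContext;

    CurrentToyContext = ctx;
    return old;
}

static void
toy_work(void)
{
    printf("allocating in: %s\n", CurrentToyContext->name);
}

int
main(void)
{
    ToyContext  per_query = {"per-query"};
    ToyContext  per_tuple = {"per-tuple"};
    ToyContext *old;

    CurrentToyContext = &per_query;

    /* switch for the duration of the call, then restore the old context */
    old = toy_switch_to(&per_tuple);
    toy_work();
    toy_switch_to(old);

    printf("back in: %s\n", CurrentToyContext->name);
    return 0;
}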
+ */ + + EEO_CASE(EEOP_AGG_STRICT_INPUT_CHECK_ARGS) + { + NullableDatum *args = op->d.agg_strict_input_check.args; + int nargs = op->d.agg_strict_input_check.nargs; + + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + EEO_JUMP(op->d.agg_strict_input_check.jumpnull); + } + EEO_NEXT(); + } + + EEO_CASE(EEOP_AGG_STRICT_INPUT_CHECK_NULLS) + { + bool *nulls = op->d.agg_strict_input_check.nulls; + int nargs = op->d.agg_strict_input_check.nargs; + + for (int argno = 0; argno < nargs; argno++) + { + if (nulls[argno]) + EEO_JUMP(op->d.agg_strict_input_check.jumpnull); + } + EEO_NEXT(); + } + + /* + * Check for a NULL pointer to the per-group states. + */ + + EEO_CASE(EEOP_AGG_PLAIN_PERGROUP_NULLCHECK) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerGroup pergroup_allaggs = + aggstate->all_pergroups[op->d.agg_plain_pergroup_nullcheck.setoff]; + + if (pergroup_allaggs == NULL) + EEO_JUMP(op->d.agg_plain_pergroup_nullcheck.jumpnull); + + EEO_NEXT(); + } + + /* + * Different types of aggregate transition functions are implemented + * as different types of steps, to avoid incurring unnecessary + * overhead. There's a step type for each valid combination of having + * a by value / by reference transition type, [not] needing to the + * initialize the transition value for the first row in a group from + * input, and [not] strict transition function. + * + * Could optimize further by splitting off by-reference for + * fixed-length types, but currently that doesn't seem worth it. + */ + + EEO_CASE(EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(pertrans->transtypeByVal); + + if (pergroup->noTransValue) + { + /* If transValue has not yet been initialized, do so now. 
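The comment above explains why one step type exists per combination of by-value/by-reference transition type, first-row initialization, and strictness: those properties are fixed when the expression is built, so re-testing them on every input row would be wasted work. A rough standalone analogue, in which the variant is chosen once and the per-row loop carries no such test (toy accumulator, not an aggregate transition):

#include <stdbool.h>
#include <stdio.h>

typedef struct NullableLong
{
    long value;
    bool isnull;
} NullableLong;

typedef long (*TransFn) (long acc, NullableLong input);

/* strict flavour: NULL inputs leave the accumulator untouched */
static long
trans_strict(long acc, NullableLong input)
{
    return input.isnull ? acc : acc + input.value;
}

/* non-strict flavour: the function itself decides what NULL means */
static long
trans_nonstrict(long acc, NullableLong input)
{
    return acc + (input.isnull ? 0 : input.value);
}

int
main(void)
{
    NullableLong rows[] = {{1, false}, {0, true}, {2, false}};
    bool    func_is_strict = true;  /* known when the expression is built */
    TransFn fn = func_is_strict ? trans_strict : trans_nonstrict;
    long    acc = 0;

    /* the per-row loop no longer inspects the strictness property */
    for (int i = 0; i < 3; i++)
        acc = fn(acc, rows[i]);

    printf("acc = %ld\n", acc);
    return 0;
}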
*/ + ExecAggInitGroup(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext); + /* copied trans value from input, done this round */ + } + else if (likely(!pergroup->transValueIsNull)) + { + /* invoke transition function, unless prevented by strictness */ + ExecAggPlainTransByVal(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + } + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(pertrans->transtypeByVal); + + if (likely(!pergroup->transValueIsNull)) + ExecAggPlainTransByVal(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_BYVAL) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(pertrans->transtypeByVal); + + ExecAggPlainTransByVal(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(!pertrans->transtypeByVal); + + if (pergroup->noTransValue) + ExecAggInitGroup(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext); + else if (likely(!pergroup->transValueIsNull)) + ExecAggPlainTransByRef(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_STRICT_BYREF) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(!pertrans->transtypeByVal); + + if (likely(!pergroup->transValueIsNull)) + ExecAggPlainTransByRef(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_BYREF) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(!pertrans->transtypeByVal); + + ExecAggPlainTransByRef(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* process single-column ordered aggregate datum */ + EEO_CASE(EEOP_AGG_ORDERED_TRANS_DATUM) + { + /* too complex for an inline implementation */ + ExecEvalAggOrderedTransDatum(state, op, econtext); + + EEO_NEXT(); + } + + /* process multi-column ordered aggregate tuple */ + EEO_CASE(EEOP_AGG_ORDERED_TRANS_TUPLE) + { + /* too complex for an inline implementation */ 
+ ExecEvalAggOrderedTransTuple(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_LAST) + { + /* unreachable */ + Assert(false); + goto out; + } + } + +out: + *isnull = state->resnull; + return state->resvalue; +} + +/* + * Expression evaluation callback that performs extra checks before executing + * the expression. Declared extern so other methods of execution can use it + * too. + */ +Datum +ExecInterpExprStillValid(ExprState *state, ExprContext *econtext, bool *isNull) +{ + /* + * First time through, check whether attribute matches Var. Might not be + * ok anymore, due to schema changes. + */ + CheckExprStillValid(state, econtext); + + /* skip the check during further executions */ + state->evalfunc = (ExprStateEvalFunc) state->evalfunc_private; + + /* and actually execute */ + return state->evalfunc(state, econtext, isNull); +} + +/* + * Check that an expression is still valid in the face of potential schema + * changes since the plan has been created. + */ +void +CheckExprStillValid(ExprState *state, ExprContext *econtext) +{ + TupleTableSlot *innerslot; + TupleTableSlot *outerslot; + TupleTableSlot *scanslot; + + innerslot = econtext->ecxt_innertuple; + outerslot = econtext->ecxt_outertuple; + scanslot = econtext->ecxt_scantuple; + + for (int i = 0; i < state->steps_len; i++) + { + ExprEvalStep *op = &state->steps[i]; + + switch (ExecEvalStepOp(state, op)) + { + case EEOP_INNER_VAR: + { + int attnum = op->d.var.attnum; + + CheckVarSlotCompatibility(innerslot, attnum + 1, op->d.var.vartype); + break; + } + + case EEOP_OUTER_VAR: + { + int attnum = op->d.var.attnum; + + CheckVarSlotCompatibility(outerslot, attnum + 1, op->d.var.vartype); + break; + } + + case EEOP_SCAN_VAR: + { + int attnum = op->d.var.attnum; + + CheckVarSlotCompatibility(scanslot, attnum + 1, op->d.var.vartype); + break; + } + default: + break; + } + } +} + +/* + * Check whether a user attribute in a slot can be referenced by a Var + * expression. This should succeed unless there have been schema changes + * since the expression tree has been created. + */ +static void +CheckVarSlotCompatibility(TupleTableSlot *slot, int attnum, Oid vartype) +{ + /* + * What we have to check for here is the possibility of an attribute + * having been dropped or changed in type since the plan tree was created. + * Ideally the plan will get invalidated and not re-used, but just in + * case, we keep these defenses. Fortunately it's sufficient to check + * once on the first time through. + * + * Note: ideally we'd check typmod as well as typid, but that seems + * impractical at the moment: in many cases the tupdesc will have been + * generated by ExecTypeFromTL(), and that can't guarantee to generate an + * accurate typmod in all cases, because some expression node types don't + * carry typmod. Fortunately, for precisely that reason, there should be + * no places with a critical dependency on the typmod of a value. + * + * System attributes don't require checking since their types never + * change. 
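ExecInterpExprStillValid above is a first-call trampoline: it runs CheckExprStillValid once, then overwrites state->evalfunc with the real evaluator so later calls skip the checks entirely. A standalone sketch of that install-then-replace callback pattern on a toy state struct:

#include <stdio.h>

typedef struct ToyExpr ToyExpr;
typedef int (*ToyEvalFn) (ToyExpr *);

struct ToyExpr
{
    ToyEvalFn evalfunc;         /* what callers invoke   */
    ToyEvalFn evalfunc_private; /* the real evaluator    */
    int       value;
};

static int
eval_real(ToyExpr *e)
{
    return e->value;
}

/* first-call trampoline: validate once, then get out of the way */
static int
eval_still_valid(ToyExpr *e)
{
    printf("running one-time validity checks\n");
    e->evalfunc = e->evalfunc_private;  /* future calls go direct */
    return e->evalfunc(e);
}

int
main(void)
{
    ToyExpr e = {eval_still_valid, eval_real, 7};

    printf("%d\n", e.evalfunc(&e));     /* checks, then evaluate  */
    printf("%d\n", e.evalfunc(&e));     /* straight to eval_real  */
    return 0;
}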
+ */ + if (attnum > 0) + { + TupleDesc slot_tupdesc = slot->tts_tupleDescriptor; + Form_pg_attribute attr; + + if (attnum > slot_tupdesc->natts) /* should never happen */ + elog(ERROR, "attribute number %d exceeds number of columns %d", + attnum, slot_tupdesc->natts); + + attr = TupleDescAttr(slot_tupdesc, attnum - 1); + + if (attr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("attribute %d of type %s has been dropped", + attnum, format_type_be(slot_tupdesc->tdtypeid)))); + + if (vartype != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("attribute %d of type %s has wrong type", + attnum, format_type_be(slot_tupdesc->tdtypeid)), + errdetail("Table has type %s, but query expects %s.", + format_type_be(attr->atttypid), + format_type_be(vartype)))); + } +} + +/* + * Verify that the slot is compatible with a EEOP_*_FETCHSOME operation. + */ +static void +CheckOpSlotCompatibility(ExprEvalStep *op, TupleTableSlot *slot) +{ +#ifdef USE_ASSERT_CHECKING + /* there's nothing to check */ + if (!op->d.fetch.fixed) + return; + + /* + * Should probably fixed at some point, but for now it's easier to allow + * buffer and heap tuples to be used interchangeably. + */ + if (slot->tts_ops == &TTSOpsBufferHeapTuple && + op->d.fetch.kind == &TTSOpsHeapTuple) + return; + if (slot->tts_ops == &TTSOpsHeapTuple && + op->d.fetch.kind == &TTSOpsBufferHeapTuple) + return; + + /* + * At the moment we consider it OK if a virtual slot is used instead of a + * specific type of slot, as a virtual slot never needs to be deformed. + */ + if (slot->tts_ops == &TTSOpsVirtual) + return; + + Assert(op->d.fetch.kind == slot->tts_ops); +#endif +} + +/* + * get_cached_rowtype: utility function to lookup a rowtype tupdesc + * + * type_id, typmod: identity of the rowtype + * rowcache: space for caching identity info + * (rowcache->cacheptr must be initialized to NULL) + * changed: if not NULL, *changed is set to true on any update + * + * The returned TupleDesc is not guaranteed pinned; caller must pin it + * to use it across any operation that might incur cache invalidation. + * (The TupleDesc is always refcounted, so just use IncrTupleDescRefCount.) + * + * NOTE: because composite types can change contents, we must be prepared + * to re-do this during any node execution; cannot call just once during + * expression initialization. + */ +static TupleDesc +get_cached_rowtype(Oid type_id, int32 typmod, + ExprEvalRowtypeCache *rowcache, + bool *changed) +{ + if (type_id != RECORDOID) + { + /* + * It's a named composite type, so use the regular typcache. Do a + * lookup first time through, or if the composite type changed. Note: + * "tupdesc_id == 0" may look redundant, but it protects against the + * admittedly-theoretical possibility that type_id was RECORDOID the + * last time through, so that the cacheptr isn't TypeCacheEntry *. 
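get_cached_rowtype, whose comment continues just below, keeps the last descriptor it looked up together with an identifier and repeats the lookup only when that identifier no longer matches, since composite types can change. A generic standalone sketch of the cache-with-version-token pattern (a toy catalog, not the typcache):

#include <stdint.h>
#include <stdio.h>

/* toy "catalog" entry: a descriptor plus a version that bumps on change */
typedef struct ToyDesc
{
    uint64_t    version;
    const char *layout;
} ToyDesc;

typedef struct ToyCache
{
    const ToyDesc *cached;          /* like rowcache->cacheptr   */
    uint64_t       cached_version;  /* like rowcache->tupdesc_id */
} ToyCache;

static const ToyDesc *
lookup_desc(const ToyDesc *catalog)
{
    printf("(expensive lookup)\n");
    return catalog;
}

static const ToyDesc *
get_cached_desc(ToyCache *cache, const ToyDesc *catalog)
{
    /* redo the lookup only if we have nothing cached or the version moved */
    if (cache->cached == NULL || cache->cached_version != catalog->version)
    {
        cache->cached = lookup_desc(catalog);
        cache->cached_version = catalog->version;
    }
    return cache->cached;
}

int
main(void)
{
    ToyDesc  catalog = {1, "(a int, b text)"};
    ToyCache cache = {NULL, 0};

    get_cached_desc(&cache, &catalog);      /* miss: does the lookup */
    get_cached_desc(&cache, &catalog);      /* hit: no lookup        */
    catalog.version = 2;                    /* the type was altered  */
    catalog.layout = "(a int, b text, c date)";
    printf("%s\n", get_cached_desc(&cache, &catalog)->layout);
    return 0;
}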
+ */ + TypeCacheEntry *typentry = (TypeCacheEntry *) rowcache->cacheptr; + + if (unlikely(typentry == NULL || + rowcache->tupdesc_id == 0 || + typentry->tupDesc_identifier != rowcache->tupdesc_id)) + { + typentry = lookup_type_cache(type_id, TYPECACHE_TUPDESC); + if (typentry->tupDesc == NULL) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("type %s is not composite", + format_type_be(type_id)))); + rowcache->cacheptr = (void *) typentry; + rowcache->tupdesc_id = typentry->tupDesc_identifier; + if (changed) + *changed = true; + } + return typentry->tupDesc; + } + else + { + /* + * A RECORD type, once registered, doesn't change for the life of the + * backend. So we don't need a typcache entry as such, which is good + * because there isn't one. It's possible that the caller is asking + * about a different type than before, though. + */ + TupleDesc tupDesc = (TupleDesc) rowcache->cacheptr; + + if (unlikely(tupDesc == NULL || + rowcache->tupdesc_id != 0 || + type_id != tupDesc->tdtypeid || + typmod != tupDesc->tdtypmod)) + { + tupDesc = lookup_rowtype_tupdesc(type_id, typmod); + /* Drop pin acquired by lookup_rowtype_tupdesc */ + ReleaseTupleDesc(tupDesc); + rowcache->cacheptr = (void *) tupDesc; + rowcache->tupdesc_id = 0; /* not a valid value for non-RECORD */ + if (changed) + *changed = true; + } + return tupDesc; + } +} + + +/* + * Fast-path functions, for very simple expressions + */ + +/* implementation of ExecJust(Inner|Outer|Scan)Var */ +static pg_attribute_always_inline Datum +ExecJustVarImpl(ExprState *state, TupleTableSlot *slot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[1]; + int attnum = op->d.var.attnum + 1; + + CheckOpSlotCompatibility(&state->steps[0], slot); + + /* + * Since we use slot_getattr(), we don't need to implement the FETCHSOME + * step explicitly, and we also needn't Assert that the attnum is in range + * --- slot_getattr() will take care of any problems. + */ + return slot_getattr(slot, attnum, isnull); +} + +/* Simple reference to inner Var */ +static Datum +ExecJustInnerVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Simple reference to outer Var */ +static Datum +ExecJustOuterVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Simple reference to scan Var */ +static Datum +ExecJustScanVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarImpl(state, econtext->ecxt_scantuple, isnull); +} + +/* implementation of ExecJustAssign(Inner|Outer|Scan)Var */ +static pg_attribute_always_inline Datum +ExecJustAssignVarImpl(ExprState *state, TupleTableSlot *inslot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[1]; + int attnum = op->d.assign_var.attnum + 1; + int resultnum = op->d.assign_var.resultnum; + TupleTableSlot *outslot = state->resultslot; + + CheckOpSlotCompatibility(&state->steps[0], inslot); + + /* + * We do not need CheckVarSlotCompatibility here; that was taken care of + * at compilation time. + * + * Since we use slot_getattr(), we don't need to implement the FETCHSOME + * step explicitly, and we also needn't Assert that the attnum is in range + * --- slot_getattr() will take care of any problems. Nonetheless, check + * that resultnum is in range. 
+ */ + Assert(resultnum >= 0 && resultnum < outslot->tts_tupleDescriptor->natts); + outslot->tts_values[resultnum] = + slot_getattr(inslot, attnum, &outslot->tts_isnull[resultnum]); + return 0; +} + +/* Evaluate inner Var and assign to appropriate column of result tuple */ +static Datum +ExecJustAssignInnerVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Evaluate outer Var and assign to appropriate column of result tuple */ +static Datum +ExecJustAssignOuterVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Evaluate scan Var and assign to appropriate column of result tuple */ +static Datum +ExecJustAssignScanVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarImpl(state, econtext->ecxt_scantuple, isnull); +} + +/* Evaluate CASE_TESTVAL and apply a strict function to it */ +static Datum +ExecJustApplyFuncToCase(ExprState *state, ExprContext *econtext, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + FunctionCallInfo fcinfo; + NullableDatum *args; + int nargs; + Datum d; + + /* + * XXX with some redesign of the CaseTestExpr mechanism, maybe we could + * get rid of this data shuffling? + */ + *op->resvalue = *op->d.casetest.value; + *op->resnull = *op->d.casetest.isnull; + + op++; + + nargs = op->d.func.nargs; + fcinfo = op->d.func.fcinfo_data; + args = fcinfo->args; + + /* strict function, so check for NULL args */ + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + { + *isnull = true; + return (Datum) 0; + } + } + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *isnull = fcinfo->isnull; + return d; +} + +/* Simple Const expression */ +static Datum +ExecJustConst(ExprState *state, ExprContext *econtext, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + + *isnull = op->d.constval.isnull; + return op->d.constval.value; +} + +/* implementation of ExecJust(Inner|Outer|Scan)VarVirt */ +static pg_attribute_always_inline Datum +ExecJustVarVirtImpl(ExprState *state, TupleTableSlot *slot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + int attnum = op->d.var.attnum; + + /* + * As it is guaranteed that a virtual slot is used, there never is a need + * to perform tuple deforming (nor would it be possible). Therefore + * execExpr.c has not emitted an EEOP_*_FETCHSOME step. Verify, as much as + * possible, that that determination was accurate. 
+ */ + Assert(TTS_IS_VIRTUAL(slot)); + Assert(TTS_FIXED(slot)); + Assert(attnum >= 0 && attnum < slot->tts_nvalid); + + *isnull = slot->tts_isnull[attnum]; + + return slot->tts_values[attnum]; +} + +/* Like ExecJustInnerVar, optimized for virtual slots */ +static Datum +ExecJustInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarVirtImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Like ExecJustOuterVar, optimized for virtual slots */ +static Datum +ExecJustOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarVirtImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Like ExecJustScanVar, optimized for virtual slots */ +static Datum +ExecJustScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarVirtImpl(state, econtext->ecxt_scantuple, isnull); +} + +/* implementation of ExecJustAssign(Inner|Outer|Scan)VarVirt */ +static pg_attribute_always_inline Datum +ExecJustAssignVarVirtImpl(ExprState *state, TupleTableSlot *inslot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + int attnum = op->d.assign_var.attnum; + int resultnum = op->d.assign_var.resultnum; + TupleTableSlot *outslot = state->resultslot; + + /* see ExecJustVarVirtImpl for comments */ + + Assert(TTS_IS_VIRTUAL(inslot)); + Assert(TTS_FIXED(inslot)); + Assert(attnum >= 0 && attnum < inslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < outslot->tts_tupleDescriptor->natts); + + outslot->tts_values[resultnum] = inslot->tts_values[attnum]; + outslot->tts_isnull[resultnum] = inslot->tts_isnull[attnum]; + + return 0; +} + +/* Like ExecJustAssignInnerVar, optimized for virtual slots */ +static Datum +ExecJustAssignInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarVirtImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Like ExecJustAssignOuterVar, optimized for virtual slots */ +static Datum +ExecJustAssignOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarVirtImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Like ExecJustAssignScanVar, optimized for virtual slots */ +static Datum +ExecJustAssignScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarVirtImpl(state, econtext->ecxt_scantuple, isnull); +} + +#if defined(EEO_USE_COMPUTED_GOTO) +/* + * Comparator used when building address->opcode lookup table for + * ExecEvalStepOp() in the threaded dispatch case. + */ +static int +dispatch_compare_ptr(const void *a, const void *b) +{ + const ExprEvalOpLookup *la = (const ExprEvalOpLookup *) a; + const ExprEvalOpLookup *lb = (const ExprEvalOpLookup *) b; + + if (la->opcode < lb->opcode) + return -1; + else if (la->opcode > lb->opcode) + return 1; + return 0; +} +#endif + +/* + * Do one-time initialization of interpretation machinery. 
+ */ +static void +ExecInitInterpreter(void) +{ +#if defined(EEO_USE_COMPUTED_GOTO) + /* Set up externally-visible pointer to dispatch table */ + if (dispatch_table == NULL) + { + dispatch_table = (const void **) + DatumGetPointer(ExecInterpExpr(NULL, NULL, NULL)); + + /* build reverse lookup table */ + for (int i = 0; i < EEOP_LAST; i++) + { + reverse_dispatch_table[i].opcode = dispatch_table[i]; + reverse_dispatch_table[i].op = (ExprEvalOp) i; + } + + /* make it bsearch()able */ + qsort(reverse_dispatch_table, + EEOP_LAST /* nmembers */ , + sizeof(ExprEvalOpLookup), + dispatch_compare_ptr); + } +#endif +} + +/* + * Function to return the opcode of an expression step. + * + * When direct-threading is in use, ExprState->opcode isn't easily + * decipherable. This function returns the appropriate enum member. + */ +ExprEvalOp +ExecEvalStepOp(ExprState *state, ExprEvalStep *op) +{ +#if defined(EEO_USE_COMPUTED_GOTO) + if (state->flags & EEO_FLAG_DIRECT_THREADED) + { + ExprEvalOpLookup key; + ExprEvalOpLookup *res; + + key.opcode = (void *) op->opcode; + res = bsearch(&key, + reverse_dispatch_table, + EEOP_LAST /* nmembers */ , + sizeof(ExprEvalOpLookup), + dispatch_compare_ptr); + Assert(res); /* unknown ops shouldn't get looked up */ + return res->op; + } +#endif + return (ExprEvalOp) op->opcode; +} + + +/* + * Out-of-line helper functions for complex instructions. + */ + +/* + * Evaluate EEOP_FUNCEXPR_FUSAGE + */ +void +ExecEvalFuncExprFusage(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + PgStat_FunctionCallUsage fcusage; + Datum d; + + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + pgstat_end_function_usage(&fcusage, true); +} + +/* + * Evaluate EEOP_FUNCEXPR_STRICT_FUSAGE + */ +void +ExecEvalFuncExprStrictFusage(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + PgStat_FunctionCallUsage fcusage; + NullableDatum *args = fcinfo->args; + int nargs = op->d.func.nargs; + Datum d; + + /* strict function, so check for NULL args */ + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + { + *op->resnull = true; + return; + } + } + + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + pgstat_end_function_usage(&fcusage, true); +} + +/* + * Evaluate a PARAM_EXEC parameter. + * + * PARAM_EXEC params (internal executor parameters) are stored in the + * ecxt_param_exec_vals array, and can be accessed by array index. + */ +void +ExecEvalParamExec(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ParamExecData *prm; + + prm = &(econtext->ecxt_param_exec_vals[op->d.param.paramid]); + if (unlikely(prm->execPlan != NULL)) + { + /* Parameter not evaluated yet, so go do it */ + ExecSetParamPlan(prm->execPlan, econtext); + /* ExecSetParamPlan should have processed this param... */ + Assert(prm->execPlan == NULL); + } + *op->resvalue = prm->value; + *op->resnull = prm->isnull; +} + +/* + * Evaluate a PARAM_EXTERN parameter. + * + * PARAM_EXTERN parameters must be sought in ecxt_param_list_info. 
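+ *
+ * For example (hypothetical object names), the $1 in
+ *     PREPARE q(int) AS SELECT * FROM tab WHERE col = $1;
+ * is a PARAM_EXTERN parameter; its value is supplied at EXECUTE time via
+ * the ParamListInfo hung on the expression context.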
+ */
+void
+ExecEvalParamExtern(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+    ParamListInfo paramInfo = econtext->ecxt_param_list_info;
+    int paramId = op->d.param.paramid;
+
+    if (likely(paramInfo &&
+               paramId > 0 && paramId <= paramInfo->numParams))
+    {
+        ParamExternData *prm;
+        ParamExternData prmdata;
+
+        /* give hook a chance in case parameter is dynamic */
+        if (paramInfo->paramFetch != NULL)
+            prm = paramInfo->paramFetch(paramInfo, paramId, false, &prmdata);
+        else
+            prm = &paramInfo->params[paramId - 1];
+
+        if (likely(OidIsValid(prm->ptype)))
+        {
+            /* safety check in case hook did something unexpected */
+            if (unlikely(prm->ptype != op->d.param.paramtype))
+                ereport(ERROR,
+                        (errcode(ERRCODE_DATATYPE_MISMATCH),
+                         errmsg("type of parameter %d (%s) does not match that when preparing the plan (%s)",
+                                paramId,
+                                format_type_be(prm->ptype),
+                                format_type_be(op->d.param.paramtype))));
+            *op->resvalue = prm->value;
+            *op->resnull = prm->isnull;
+            return;
+        }
+    }
+
+    ereport(ERROR,
+            (errcode(ERRCODE_UNDEFINED_OBJECT),
+             errmsg("no value found for parameter %d", paramId)));
+}
+
+/*
+ * Evaluate a SQLValueFunction expression.
+ */
+void
+ExecEvalSQLValueFunction(ExprState *state, ExprEvalStep *op)
+{
+    LOCAL_FCINFO(fcinfo, 0);
+    SQLValueFunction *svf = op->d.sqlvaluefunction.svf;
+
+    *op->resnull = false;
+
+    /*
+     * Note: current_schema() can return NULL. current_user() etc currently
+     * cannot, but might as well code those cases the same way for safety.
+     */
+    switch (svf->op)
+    {
+        case SVFOP_CURRENT_DATE:
+            *op->resvalue = DateADTGetDatum(GetSQLCurrentDate());
+            break;
+        case SVFOP_CURRENT_TIME:
+        case SVFOP_CURRENT_TIME_N:
+            *op->resvalue = TimeTzADTPGetDatum(GetSQLCurrentTime(svf->typmod));
+            break;
+        case SVFOP_CURRENT_TIMESTAMP:
+        case SVFOP_CURRENT_TIMESTAMP_N:
+            *op->resvalue = TimestampTzGetDatum(GetSQLCurrentTimestamp(svf->typmod));
+            break;
+        case SVFOP_LOCALTIME:
+        case SVFOP_LOCALTIME_N:
+            *op->resvalue = TimeADTGetDatum(GetSQLLocalTime(svf->typmod));
+            break;
+        case SVFOP_LOCALTIMESTAMP:
+        case SVFOP_LOCALTIMESTAMP_N:
+            *op->resvalue = TimestampGetDatum(GetSQLLocalTimestamp(svf->typmod));
+            break;
+        case SVFOP_CURRENT_ROLE:
+        case SVFOP_CURRENT_USER:
+        case SVFOP_USER:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = current_user(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+        case SVFOP_SESSION_USER:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = session_user(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+        case SVFOP_CURRENT_CATALOG:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = current_database(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+        case SVFOP_CURRENT_SCHEMA:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = current_schema(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+    }
+}
+
+/*
+ * Raise error if a CURRENT OF expression is evaluated.
+ *
+ * The planner should convert CURRENT OF into a TidScan qualification, or some
+ * other special handling in a ForeignScan node. So we have to be able to do
+ * ExecInitExpr on a CurrentOfExpr, but we shouldn't ever actually execute it.
+ * If we get here, we suppose we must be dealing with CURRENT OF on a foreign
+ * table whose FDW doesn't handle it, and complain accordingly.
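+ *
+ * Illustrative example (hypothetical names): an
+ *     UPDATE ft SET x = 0 WHERE CURRENT OF cur;
+ * against a foreign table whose FDW does not absorb the CURRENT OF
+ * qualification ends up raising the error below.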
+ */ +void +ExecEvalCurrentOfExpr(ExprState *state, ExprEvalStep *op) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("WHERE CURRENT OF is not supported for this table type"))); +} + +/* + * Evaluate NextValueExpr. + */ +void +ExecEvalNextValueExpr(ExprState *state, ExprEvalStep *op) +{ + int64 newval = nextval_internal(op->d.nextvalueexpr.seqid, false); + + switch (op->d.nextvalueexpr.seqtypid) + { + case INT2OID: + *op->resvalue = Int16GetDatum((int16) newval); + break; + case INT4OID: + *op->resvalue = Int32GetDatum((int32) newval); + break; + case INT8OID: + *op->resvalue = Int64GetDatum((int64) newval); + break; + default: + elog(ERROR, "unsupported sequence type %u", + op->d.nextvalueexpr.seqtypid); + } + *op->resnull = false; +} + +/* + * Evaluate NullTest / IS NULL for rows. + */ +void +ExecEvalRowNull(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ExecEvalRowNullInt(state, op, econtext, true); +} + +/* + * Evaluate NullTest / IS NOT NULL for rows. + */ +void +ExecEvalRowNotNull(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ExecEvalRowNullInt(state, op, econtext, false); +} + +/* Common code for IS [NOT] NULL on a row value */ +static void +ExecEvalRowNullInt(ExprState *state, ExprEvalStep *op, + ExprContext *econtext, bool checkisnull) +{ + Datum value = *op->resvalue; + bool isnull = *op->resnull; + HeapTupleHeader tuple; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + HeapTupleData tmptup; + + *op->resnull = false; + + /* NULL row variables are treated just as NULL scalar columns */ + if (isnull) + { + *op->resvalue = BoolGetDatum(checkisnull); + return; + } + + /* + * The SQL standard defines IS [NOT] NULL for a non-null rowtype argument + * as: + * + * "R IS NULL" is true if every field is the null value. + * + * "R IS NOT NULL" is true if no field is the null value. + * + * This definition is (apparently intentionally) not recursive; so our + * tests on the fields are primitive attisnull tests, not recursive checks + * to see if they are all-nulls or no-nulls rowtypes. + * + * The standard does not consider the possibility of zero-field rows, but + * here we consider them to vacuously satisfy both predicates. + */ + + tuple = DatumGetHeapTupleHeader(value); + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + + /* Lookup tupdesc if first time through or if type changes */ + tupDesc = get_cached_rowtype(tupType, tupTypmod, + &op->d.nulltest_row.rowcache, NULL); + + /* + * heap_attisnull needs a HeapTuple not a bare HeapTupleHeader. + */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + tmptup.t_data = tuple; + + for (int att = 1; att <= tupDesc->natts; att++) + { + /* ignore dropped columns */ + if (TupleDescAttr(tupDesc, att - 1)->attisdropped) + continue; + if (heap_attisnull(&tmptup, att, tupDesc)) + { + /* null field disproves IS NOT NULL */ + if (!checkisnull) + { + *op->resvalue = BoolGetDatum(false); + return; + } + } + else + { + /* non-null field disproves IS NULL */ + if (checkisnull) + { + *op->resvalue = BoolGetDatum(false); + return; + } + } + } + + *op->resvalue = BoolGetDatum(true); +} + +/* + * Evaluate an ARRAY[] expression. + * + * The individual array elements (or subarrays) have already been evaluated + * into op->d.arrayexpr.elemvalues[]/elemnulls[]. 
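+ *
+ * For example, ARRAY[1,2,3] is handled by the simple (non-multidims)
+ * branch below, while ARRAY[ARRAY[1,2],ARRAY[3,4]] takes the nested-array
+ * path that stitches the already-built sub-arrays into a 2-D result.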
+ */ +void +ExecEvalArrayExpr(ExprState *state, ExprEvalStep *op) +{ + ArrayType *result; + Oid element_type = op->d.arrayexpr.elemtype; + int nelems = op->d.arrayexpr.nelems; + int ndims = 0; + int dims[MAXDIM]; + int lbs[MAXDIM]; + + /* Set non-null as default */ + *op->resnull = false; + + if (!op->d.arrayexpr.multidims) + { + /* Elements are presumably of scalar type */ + Datum *dvalues = op->d.arrayexpr.elemvalues; + bool *dnulls = op->d.arrayexpr.elemnulls; + + /* setup for 1-D array of the given length */ + ndims = 1; + dims[0] = nelems; + lbs[0] = 1; + + result = construct_md_array(dvalues, dnulls, ndims, dims, lbs, + element_type, + op->d.arrayexpr.elemlength, + op->d.arrayexpr.elembyval, + op->d.arrayexpr.elemalign); + } + else + { + /* Must be nested array expressions */ + int nbytes = 0; + int nitems = 0; + int outer_nelems = 0; + int elem_ndims = 0; + int *elem_dims = NULL; + int *elem_lbs = NULL; + bool firstone = true; + bool havenulls = false; + bool haveempty = false; + char **subdata; + bits8 **subbitmaps; + int *subbytes; + int *subnitems; + int32 dataoffset; + char *dat; + int iitem; + + subdata = (char **) palloc(nelems * sizeof(char *)); + subbitmaps = (bits8 **) palloc(nelems * sizeof(bits8 *)); + subbytes = (int *) palloc(nelems * sizeof(int)); + subnitems = (int *) palloc(nelems * sizeof(int)); + + /* loop through and get data area from each element */ + for (int elemoff = 0; elemoff < nelems; elemoff++) + { + Datum arraydatum; + bool eisnull; + ArrayType *array; + int this_ndims; + + arraydatum = op->d.arrayexpr.elemvalues[elemoff]; + eisnull = op->d.arrayexpr.elemnulls[elemoff]; + + /* temporarily ignore null subarrays */ + if (eisnull) + { + haveempty = true; + continue; + } + + array = DatumGetArrayTypeP(arraydatum); + + /* run-time double-check on element type */ + if (element_type != ARR_ELEMTYPE(array)) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("cannot merge incompatible arrays"), + errdetail("Array with element type %s cannot be " + "included in ARRAY construct with element type %s.", + format_type_be(ARR_ELEMTYPE(array)), + format_type_be(element_type)))); + + this_ndims = ARR_NDIM(array); + /* temporarily ignore zero-dimensional subarrays */ + if (this_ndims <= 0) + { + haveempty = true; + continue; + } + + if (firstone) + { + /* Get sub-array details from first member */ + elem_ndims = this_ndims; + ndims = elem_ndims + 1; + if (ndims <= 0 || ndims > MAXDIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("number of array dimensions (%d) exceeds the maximum allowed (%d)", + ndims, MAXDIM))); + + elem_dims = (int *) palloc(elem_ndims * sizeof(int)); + memcpy(elem_dims, ARR_DIMS(array), elem_ndims * sizeof(int)); + elem_lbs = (int *) palloc(elem_ndims * sizeof(int)); + memcpy(elem_lbs, ARR_LBOUND(array), elem_ndims * sizeof(int)); + + firstone = false; + } + else + { + /* Check other sub-arrays are compatible */ + if (elem_ndims != this_ndims || + memcmp(elem_dims, ARR_DIMS(array), + elem_ndims * sizeof(int)) != 0 || + memcmp(elem_lbs, ARR_LBOUND(array), + elem_ndims * sizeof(int)) != 0) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("multidimensional arrays must have array " + "expressions with matching dimensions"))); + } + + subdata[outer_nelems] = ARR_DATA_PTR(array); + subbitmaps[outer_nelems] = ARR_NULLBITMAP(array); + subbytes[outer_nelems] = ARR_SIZE(array) - ARR_DATA_OFFSET(array); + nbytes += subbytes[outer_nelems]; + subnitems[outer_nelems] = ArrayGetNItems(this_ndims, + 
ARR_DIMS(array)); + nitems += subnitems[outer_nelems]; + havenulls |= ARR_HASNULL(array); + outer_nelems++; + } + + /* + * If all items were null or empty arrays, return an empty array; + * otherwise, if some were and some weren't, raise error. (Note: we + * must special-case this somehow to avoid trying to generate a 1-D + * array formed from empty arrays. It's not ideal...) + */ + if (haveempty) + { + if (ndims == 0) /* didn't find any nonempty array */ + { + *op->resvalue = PointerGetDatum(construct_empty_array(element_type)); + return; + } + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("multidimensional arrays must have array " + "expressions with matching dimensions"))); + } + + /* setup for multi-D array */ + dims[0] = outer_nelems; + lbs[0] = 1; + for (int i = 1; i < ndims; i++) + { + dims[i] = elem_dims[i - 1]; + lbs[i] = elem_lbs[i - 1]; + } + + /* check for subscript overflow */ + (void) ArrayGetNItems(ndims, dims); + ArrayCheckBounds(ndims, dims, lbs); + + if (havenulls) + { + dataoffset = ARR_OVERHEAD_WITHNULLS(ndims, nitems); + nbytes += dataoffset; + } + else + { + dataoffset = 0; /* marker for no null bitmap */ + nbytes += ARR_OVERHEAD_NONULLS(ndims); + } + + result = (ArrayType *) palloc(nbytes); + SET_VARSIZE(result, nbytes); + result->ndim = ndims; + result->dataoffset = dataoffset; + result->elemtype = element_type; + memcpy(ARR_DIMS(result), dims, ndims * sizeof(int)); + memcpy(ARR_LBOUND(result), lbs, ndims * sizeof(int)); + + dat = ARR_DATA_PTR(result); + iitem = 0; + for (int i = 0; i < outer_nelems; i++) + { + memcpy(dat, subdata[i], subbytes[i]); + dat += subbytes[i]; + if (havenulls) + array_bitmap_copy(ARR_NULLBITMAP(result), iitem, + subbitmaps[i], 0, + subnitems[i]); + iitem += subnitems[i]; + } + } + + *op->resvalue = PointerGetDatum(result); +} + +/* + * Evaluate an ArrayCoerceExpr expression. + * + * Source array is in step's result variable. + */ +void +ExecEvalArrayCoerce(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + Datum arraydatum; + + /* NULL array -> NULL result */ + if (*op->resnull) + return; + + arraydatum = *op->resvalue; + + /* + * If it's binary-compatible, modify the element type in the array header, + * but otherwise leave the array as we received it. + */ + if (op->d.arraycoerce.elemexprstate == NULL) + { + /* Detoast input array if necessary, and copy in any case */ + ArrayType *array = DatumGetArrayTypePCopy(arraydatum); + + ARR_ELEMTYPE(array) = op->d.arraycoerce.resultelemtype; + *op->resvalue = PointerGetDatum(array); + return; + } + + /* + * Use array_map to apply the sub-expression to each array element. + */ + *op->resvalue = array_map(arraydatum, + op->d.arraycoerce.elemexprstate, + econtext, + op->d.arraycoerce.resultelemtype, + op->d.arraycoerce.amstate); +} + +/* + * Evaluate a ROW() expression. + * + * The individual columns have already been evaluated into + * op->d.row.elemvalues[]/elemnulls[]. + */ +void +ExecEvalRow(ExprState *state, ExprEvalStep *op) +{ + HeapTuple tuple; + + /* build tuple from evaluated field values */ + tuple = heap_form_tuple(op->d.row.tupdesc, + op->d.row.elemvalues, + op->d.row.elemnulls); + + *op->resvalue = HeapTupleGetDatum(tuple); + *op->resnull = false; +} + +/* + * Evaluate GREATEST() or LEAST() expression (note this is *not* MIN()/MAX()). + * + * All of the to-be-compared expressions have already been evaluated into + * op->d.minmax.values[]/nulls[]. 
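+ *
+ * For example, GREATEST(1, NULL, 3) yields 3: NULL inputs are simply
+ * skipped, and the result is NULL only when every input is NULL.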
+ */ +void +ExecEvalMinMax(ExprState *state, ExprEvalStep *op) +{ + Datum *values = op->d.minmax.values; + bool *nulls = op->d.minmax.nulls; + FunctionCallInfo fcinfo = op->d.minmax.fcinfo_data; + MinMaxOp operator = op->d.minmax.op; + + /* set at initialization */ + Assert(fcinfo->args[0].isnull == false); + Assert(fcinfo->args[1].isnull == false); + + /* default to null result */ + *op->resnull = true; + + for (int off = 0; off < op->d.minmax.nelems; off++) + { + /* ignore NULL inputs */ + if (nulls[off]) + continue; + + if (*op->resnull) + { + /* first nonnull input, adopt value */ + *op->resvalue = values[off]; + *op->resnull = false; + } + else + { + int cmpresult; + + /* apply comparison function */ + fcinfo->args[0].value = *op->resvalue; + fcinfo->args[1].value = values[off]; + + fcinfo->isnull = false; + cmpresult = DatumGetInt32(FunctionCallInvoke(fcinfo)); + if (fcinfo->isnull) /* probably should not happen */ + continue; + + if (cmpresult > 0 && operator == IS_LEAST) + *op->resvalue = values[off]; + else if (cmpresult < 0 && operator == IS_GREATEST) + *op->resvalue = values[off]; + } + } +} + +/* + * Evaluate a FieldSelect node. + * + * Source record is in step's result variable. + */ +void +ExecEvalFieldSelect(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + AttrNumber fieldnum = op->d.fieldselect.fieldnum; + Datum tupDatum; + HeapTupleHeader tuple; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + Form_pg_attribute attr; + HeapTupleData tmptup; + + /* NULL record -> NULL result */ + if (*op->resnull) + return; + + tupDatum = *op->resvalue; + + /* We can special-case expanded records for speed */ + if (VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(tupDatum))) + { + ExpandedRecordHeader *erh = (ExpandedRecordHeader *) DatumGetEOHP(tupDatum); + + Assert(erh->er_magic == ER_MAGIC); + + /* Extract record's TupleDesc */ + tupDesc = expanded_record_get_tupdesc(erh); + + /* + * Find field's attr record. Note we don't support system columns + * here: a datum tuple doesn't have valid values for most of the + * interesting system columns anyway. + */ + if (fieldnum <= 0) /* should never happen */ + elog(ERROR, "unsupported reference to system column %d in FieldSelect", + fieldnum); + if (fieldnum > tupDesc->natts) /* should never happen */ + elog(ERROR, "attribute number %d exceeds number of columns %d", + fieldnum, tupDesc->natts); + attr = TupleDescAttr(tupDesc, fieldnum - 1); + + /* Check for dropped column, and force a NULL result if so */ + if (attr->attisdropped) + { + *op->resnull = true; + return; + } + + /* Check for type mismatch --- possible after ALTER COLUMN TYPE? */ + /* As in CheckVarSlotCompatibility, we should but can't check typmod */ + if (op->d.fieldselect.resulttype != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("attribute %d has wrong type", fieldnum), + errdetail("Table has type %s, but query expects %s.", + format_type_be(attr->atttypid), + format_type_be(op->d.fieldselect.resulttype)))); + + /* extract the field */ + *op->resvalue = expanded_record_get_field(erh, fieldnum, + op->resnull); + } + else + { + /* Get the composite datum and extract its type fields */ + tuple = DatumGetHeapTupleHeader(tupDatum); + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + + /* Lookup tupdesc if first time through or if type changes */ + tupDesc = get_cached_rowtype(tupType, tupTypmod, + &op->d.fieldselect.rowcache, NULL); + + /* + * Find field's attr record. 
Note we don't support system columns + * here: a datum tuple doesn't have valid values for most of the + * interesting system columns anyway. + */ + if (fieldnum <= 0) /* should never happen */ + elog(ERROR, "unsupported reference to system column %d in FieldSelect", + fieldnum); + if (fieldnum > tupDesc->natts) /* should never happen */ + elog(ERROR, "attribute number %d exceeds number of columns %d", + fieldnum, tupDesc->natts); + attr = TupleDescAttr(tupDesc, fieldnum - 1); + + /* Check for dropped column, and force a NULL result if so */ + if (attr->attisdropped) + { + *op->resnull = true; + return; + } + + /* Check for type mismatch --- possible after ALTER COLUMN TYPE? */ + /* As in CheckVarSlotCompatibility, we should but can't check typmod */ + if (op->d.fieldselect.resulttype != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("attribute %d has wrong type", fieldnum), + errdetail("Table has type %s, but query expects %s.", + format_type_be(attr->atttypid), + format_type_be(op->d.fieldselect.resulttype)))); + + /* heap_getattr needs a HeapTuple not a bare HeapTupleHeader */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + tmptup.t_data = tuple; + + /* extract the field */ + *op->resvalue = heap_getattr(&tmptup, + fieldnum, + tupDesc, + op->resnull); + } +} + +/* + * Deform source tuple, filling in the step's values/nulls arrays, before + * evaluating individual new values as part of a FieldStore expression. + * Subsequent steps will overwrite individual elements of the values/nulls + * arrays with the new field values, and then FIELDSTORE_FORM will build the + * new tuple value. + * + * Source record is in step's result variable. + */ +void +ExecEvalFieldStoreDeForm(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + TupleDesc tupDesc; + + /* Lookup tupdesc if first time through or if type changes */ + tupDesc = get_cached_rowtype(op->d.fieldstore.fstore->resulttype, -1, + op->d.fieldstore.rowcache, NULL); + + /* Check that current tupdesc doesn't have more fields than we allocated */ + if (unlikely(tupDesc->natts > op->d.fieldstore.ncolumns)) + elog(ERROR, "too many columns in composite type %u", + op->d.fieldstore.fstore->resulttype); + + if (*op->resnull) + { + /* Convert null input tuple into an all-nulls row */ + memset(op->d.fieldstore.nulls, true, + op->d.fieldstore.ncolumns * sizeof(bool)); + } + else + { + /* + * heap_deform_tuple needs a HeapTuple not a bare HeapTupleHeader. We + * set all the fields in the struct just in case. + */ + Datum tupDatum = *op->resvalue; + HeapTupleHeader tuphdr; + HeapTupleData tmptup; + + tuphdr = DatumGetHeapTupleHeader(tupDatum); + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuphdr); + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tuphdr; + + heap_deform_tuple(&tmptup, tupDesc, + op->d.fieldstore.values, + op->d.fieldstore.nulls); + } +} + +/* + * Compute the new composite datum after each individual field value of a + * FieldStore expression has been evaluated. 
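+ *
+ * FieldStore expressions arise from statements such as
+ *     UPDATE t SET composite_col.some_field = 42;
+ * (hypothetical names): the DEFORM step above filled values[]/nulls[],
+ * the per-field assignments overwrote selected entries, and this step
+ * re-forms the tuple.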
+ */ +void +ExecEvalFieldStoreForm(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + TupleDesc tupDesc; + HeapTuple tuple; + + /* Lookup tupdesc (should be valid already) */ + tupDesc = get_cached_rowtype(op->d.fieldstore.fstore->resulttype, -1, + op->d.fieldstore.rowcache, NULL); + + tuple = heap_form_tuple(tupDesc, + op->d.fieldstore.values, + op->d.fieldstore.nulls); + + *op->resvalue = HeapTupleGetDatum(tuple); + *op->resnull = false; +} + +/* + * Evaluate a rowtype coercion operation. + * This may require rearranging field positions. + * + * Source record is in step's result variable. + */ +void +ExecEvalConvertRowtype(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + HeapTuple result; + Datum tupDatum; + HeapTupleHeader tuple; + HeapTupleData tmptup; + TupleDesc indesc, + outdesc; + bool changed = false; + + /* NULL in -> NULL out */ + if (*op->resnull) + return; + + tupDatum = *op->resvalue; + tuple = DatumGetHeapTupleHeader(tupDatum); + + /* + * Lookup tupdescs if first time through or if type changes. We'd better + * pin them since type conversion functions could do catalog lookups and + * hence cause cache invalidation. + */ + indesc = get_cached_rowtype(op->d.convert_rowtype.inputtype, -1, + op->d.convert_rowtype.incache, + &changed); + IncrTupleDescRefCount(indesc); + outdesc = get_cached_rowtype(op->d.convert_rowtype.outputtype, -1, + op->d.convert_rowtype.outcache, + &changed); + IncrTupleDescRefCount(outdesc); + + /* + * We used to be able to assert that incoming tuples are marked with + * exactly the rowtype of indesc. However, now that ExecEvalWholeRowVar + * might change the tuples' marking to plain RECORD due to inserting + * aliases, we can only make this weak test: + */ + Assert(HeapTupleHeaderGetTypeId(tuple) == indesc->tdtypeid || + HeapTupleHeaderGetTypeId(tuple) == RECORDOID); + + /* if first time through, or after change, initialize conversion map */ + if (changed) + { + MemoryContext old_cxt; + + /* allocate map in long-lived memory context */ + old_cxt = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* prepare map from old to new attribute numbers */ + op->d.convert_rowtype.map = convert_tuples_by_name(indesc, outdesc); + + MemoryContextSwitchTo(old_cxt); + } + + /* Following steps need a HeapTuple not a bare HeapTupleHeader */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + tmptup.t_data = tuple; + + if (op->d.convert_rowtype.map != NULL) + { + /* Full conversion with attribute rearrangement needed */ + result = execute_attr_map_tuple(&tmptup, op->d.convert_rowtype.map); + /* Result already has appropriate composite-datum header fields */ + *op->resvalue = HeapTupleGetDatum(result); + } + else + { + /* + * The tuple is physically compatible as-is, but we need to insert the + * destination rowtype OID in its composite-datum header field, so we + * have to copy it anyway. heap_copy_tuple_as_datum() is convenient + * for this since it will both make the physical copy and insert the + * correct composite header fields. Note that we aren't expecting to + * have to flatten any toasted fields: the input was a composite + * datum, so it shouldn't contain any. So heap_copy_tuple_as_datum() + * is overkill here, but its check for external fields is cheap. + */ + *op->resvalue = heap_copy_tuple_as_datum(&tmptup, outdesc); + } + + DecrTupleDescRefCount(indesc); + DecrTupleDescRefCount(outdesc); +} + +/* + * Evaluate "scalar op ANY/ALL (array)". 
+ * + * Source array is in our result area, scalar arg is already evaluated into + * fcinfo->args[0]. + * + * The operator always yields boolean, and we combine the results across all + * array elements using OR and AND (for ANY and ALL respectively). Of course + * we short-circuit as soon as the result is known. + */ +void +ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) +{ + FunctionCallInfo fcinfo = op->d.scalararrayop.fcinfo_data; + bool useOr = op->d.scalararrayop.useOr; + bool strictfunc = op->d.scalararrayop.finfo->fn_strict; + ArrayType *arr; + int nitems; + Datum result; + bool resultnull; + int16 typlen; + bool typbyval; + char typalign; + char *s; + bits8 *bitmap; + int bitmask; + + /* + * If the array is NULL then we return NULL --- it's not very meaningful + * to do anything else, even if the operator isn't strict. + */ + if (*op->resnull) + return; + + /* Else okay to fetch and detoast the array */ + arr = DatumGetArrayTypeP(*op->resvalue); + + /* + * If the array is empty, we return either FALSE or TRUE per the useOr + * flag. This is correct even if the scalar is NULL; since we would + * evaluate the operator zero times, it matters not whether it would want + * to return NULL. + */ + nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + if (nitems <= 0) + { + *op->resvalue = BoolGetDatum(!useOr); + *op->resnull = false; + return; + } + + /* + * If the scalar is NULL, and the function is strict, return NULL; no + * point in iterating the loop. + */ + if (fcinfo->args[0].isnull && strictfunc) + { + *op->resnull = true; + return; + } + + /* + * We arrange to look up info about the element type only once per series + * of calls, assuming the element type doesn't change underneath us. + */ + if (op->d.scalararrayop.element_type != ARR_ELEMTYPE(arr)) + { + get_typlenbyvalalign(ARR_ELEMTYPE(arr), + &op->d.scalararrayop.typlen, + &op->d.scalararrayop.typbyval, + &op->d.scalararrayop.typalign); + op->d.scalararrayop.element_type = ARR_ELEMTYPE(arr); + } + + typlen = op->d.scalararrayop.typlen; + typbyval = op->d.scalararrayop.typbyval; + typalign = op->d.scalararrayop.typalign; + + /* Initialize result appropriately depending on useOr */ + result = BoolGetDatum(!useOr); + resultnull = false; + + /* Loop over the array elements */ + s = (char *) ARR_DATA_PTR(arr); + bitmap = ARR_NULLBITMAP(arr); + bitmask = 1; + + for (int i = 0; i < nitems; i++) + { + Datum elt; + Datum thisresult; + + /* Get array element, checking for NULL */ + if (bitmap && (*bitmap & bitmask) == 0) + { + fcinfo->args[1].value = (Datum) 0; + fcinfo->args[1].isnull = true; + } + else + { + elt = fetch_att(s, typbyval, typlen); + s = att_addlength_pointer(s, typlen, s); + s = (char *) att_align_nominal(s, typalign); + fcinfo->args[1].value = elt; + fcinfo->args[1].isnull = false; + } + + /* Call comparison function */ + if (fcinfo->args[1].isnull && strictfunc) + { + fcinfo->isnull = true; + thisresult = (Datum) 0; + } + else + { + fcinfo->isnull = false; + thisresult = op->d.scalararrayop.fn_addr(fcinfo); + } + + /* Combine results per OR or AND semantics */ + if (fcinfo->isnull) + resultnull = true; + else if (useOr) + { + if (DatumGetBool(thisresult)) + { + result = BoolGetDatum(true); + resultnull = false; + break; /* needn't look at any more elements */ + } + } + else + { + if (!DatumGetBool(thisresult)) + { + result = BoolGetDatum(false); + resultnull = false; + break; /* needn't look at any more elements */ + } + } + + /* advance bitmap pointer if any */ + if (bitmap) + { + bitmask <<= 1; + if 
(bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + + *op->resvalue = result; + *op->resnull = resultnull; +} + +/* + * Hash function for scalar array hash op elements. + * + * We use the element type's default hash opclass, and the column collation + * if the type is collation-sensitive. + */ +static uint32 +saop_element_hash(struct saophash_hash *tb, Datum key) +{ + ScalarArrayOpExprHashTable *elements_tab = (ScalarArrayOpExprHashTable *) tb->private_data; + FunctionCallInfo fcinfo = elements_tab->op->d.hashedscalararrayop.hash_fcinfo_data; + Datum hash; + + fcinfo->args[0].value = key; + fcinfo->args[0].isnull = false; + + hash = elements_tab->op->d.hashedscalararrayop.hash_fn_addr(fcinfo); + + return DatumGetUInt32(hash); +} + +/* + * Matching function for scalar array hash op elements, to be used in hashtable + * lookups. + */ +static bool +saop_hash_element_match(struct saophash_hash *tb, Datum key1, Datum key2) +{ + Datum result; + + ScalarArrayOpExprHashTable *elements_tab = (ScalarArrayOpExprHashTable *) tb->private_data; + FunctionCallInfo fcinfo = elements_tab->op->d.hashedscalararrayop.fcinfo_data; + + fcinfo->args[0].value = key1; + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = key2; + fcinfo->args[1].isnull = false; + + result = elements_tab->op->d.hashedscalararrayop.fn_addr(fcinfo); + + return DatumGetBool(result); +} + +/* + * Evaluate "scalar op ANY (const array)". + * + * Similar to ExecEvalScalarArrayOp, but optimized for faster repeat lookups + * by building a hashtable on the first lookup. This hashtable will be reused + * by subsequent lookups. Unlike ExecEvalScalarArrayOp, this version only + * supports OR semantics. + * + * Source array is in our result area, scalar arg is already evaluated into + * fcinfo->args[0]. + * + * The operator always yields boolean. + */ +void +ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ScalarArrayOpExprHashTable *elements_tab = op->d.hashedscalararrayop.elements_tab; + FunctionCallInfo fcinfo = op->d.hashedscalararrayop.fcinfo_data; + bool strictfunc = op->d.hashedscalararrayop.finfo->fn_strict; + Datum scalar = fcinfo->args[0].value; + bool scalar_isnull = fcinfo->args[0].isnull; + Datum result; + bool resultnull; + bool hashfound; + + /* We don't setup a hashed scalar array op if the array const is null. */ + Assert(!*op->resnull); + + /* + * If the scalar is NULL, and the function is strict, return NULL; no + * point in executing the search. + */ + if (fcinfo->args[0].isnull && strictfunc) + { + *op->resnull = true; + return; + } + + /* Build the hash table on first evaluation */ + if (elements_tab == NULL) + { + int16 typlen; + bool typbyval; + char typalign; + int nitems; + bool has_nulls = false; + char *s; + bits8 *bitmap; + int bitmask; + MemoryContext oldcontext; + ArrayType *arr; + + arr = DatumGetArrayTypeP(*op->resvalue); + nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + + get_typlenbyvalalign(ARR_ELEMTYPE(arr), + &typlen, + &typbyval, + &typalign); + + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + elements_tab = (ScalarArrayOpExprHashTable *) + palloc(sizeof(ScalarArrayOpExprHashTable)); + op->d.hashedscalararrayop.elements_tab = elements_tab; + elements_tab->op = op; + + /* + * Create the hash table sizing it according to the number of elements + * in the array. This does assume that the array has no duplicates. 
+ * If the array happens to contain many duplicate values then it'll + * just mean that we sized the table a bit on the large side. + */ + elements_tab->hashtab = saophash_create(CurrentMemoryContext, nitems, + elements_tab); + + MemoryContextSwitchTo(oldcontext); + + s = (char *) ARR_DATA_PTR(arr); + bitmap = ARR_NULLBITMAP(arr); + bitmask = 1; + for (int i = 0; i < nitems; i++) + { + /* Get array element, checking for NULL. */ + if (bitmap && (*bitmap & bitmask) == 0) + { + has_nulls = true; + } + else + { + Datum element; + + element = fetch_att(s, typbyval, typlen); + s = att_addlength_pointer(s, typlen, s); + s = (char *) att_align_nominal(s, typalign); + + saophash_insert(elements_tab->hashtab, element, &hashfound); + } + + /* Advance bitmap pointer if any. */ + if (bitmap) + { + bitmask <<= 1; + if (bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + + /* + * Remember if we had any nulls so that we know if we need to execute + * non-strict functions with a null lhs value if no match is found. + */ + op->d.hashedscalararrayop.has_nulls = has_nulls; + } + + /* Check the hash to see if we have a match. */ + hashfound = NULL != saophash_lookup(elements_tab->hashtab, scalar); + + result = BoolGetDatum(hashfound); + resultnull = false; + + /* + * If we didn't find a match in the array, we still might need to handle + * the possibility of null values. We didn't put any NULLs into the + * hashtable, but instead marked if we found any when building the table + * in has_nulls. + */ + if (!DatumGetBool(result) && op->d.hashedscalararrayop.has_nulls) + { + if (strictfunc) + { + + /* + * We have nulls in the array so a non-null lhs and no match must + * yield NULL. + */ + result = (Datum) 0; + resultnull = true; + } + else + { + /* + * Execute function will null rhs just once. + * + * The hash lookup path will have scribbled on the lhs argument so + * we need to set it up also (even though we entered this function + * with it already set). + */ + fcinfo->args[0].value = scalar; + fcinfo->args[0].isnull = scalar_isnull; + fcinfo->args[1].value = (Datum) 0; + fcinfo->args[1].isnull = true; + + result = op->d.hashedscalararrayop.fn_addr(fcinfo); + resultnull = fcinfo->isnull; + } + } + + *op->resvalue = result; + *op->resnull = resultnull; +} + +/* + * Evaluate a NOT NULL domain constraint. + */ +void +ExecEvalConstraintNotNull(ExprState *state, ExprEvalStep *op) +{ + if (*op->resnull) + ereport(ERROR, + (errcode(ERRCODE_NOT_NULL_VIOLATION), + errmsg("domain %s does not allow null values", + format_type_be(op->d.domaincheck.resulttype)), + errdatatype(op->d.domaincheck.resulttype))); +} + +/* + * Evaluate a CHECK domain constraint. + */ +void +ExecEvalConstraintCheck(ExprState *state, ExprEvalStep *op) +{ + if (!*op->d.domaincheck.checknull && + !DatumGetBool(*op->d.domaincheck.checkvalue)) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("value for domain %s violates check constraint \"%s\"", + format_type_be(op->d.domaincheck.resulttype), + op->d.domaincheck.constraintname), + errdomainconstraint(op->d.domaincheck.resulttype, + op->d.domaincheck.constraintname))); +} + +/* + * Evaluate the various forms of XmlExpr. + * + * Arguments have been evaluated into named_argvalue/named_argnull + * and/or argvalue/argnull arrays. 
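+ *
+ * For example, XMLCONCAT('<a/>'::xml, NULL, '<b/>'::xml) concatenates only
+ * the non-NULL arguments (yielding <a/><b/>), because the IS_XMLCONCAT
+ * case below skips any argument whose argnull[] flag is set.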
+ */
+void
+ExecEvalXmlExpr(ExprState *state, ExprEvalStep *op)
+{
+    XmlExpr *xexpr = op->d.xmlexpr.xexpr;
+    Datum value;
+
+    *op->resnull = true;        /* until we get a result */
+    *op->resvalue = (Datum) 0;
+
+    switch (xexpr->op)
+    {
+        case IS_XMLCONCAT:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+                List *values = NIL;
+
+                for (int i = 0; i < list_length(xexpr->args); i++)
+                {
+                    if (!argnull[i])
+                        values = lappend(values, DatumGetPointer(argvalue[i]));
+                }
+
+                if (values != NIL)
+                {
+                    *op->resvalue = PointerGetDatum(xmlconcat(values));
+                    *op->resnull = false;
+                }
+            }
+            break;
+
+        case IS_XMLFOREST:
+            {
+                Datum *argvalue = op->d.xmlexpr.named_argvalue;
+                bool *argnull = op->d.xmlexpr.named_argnull;
+                StringInfoData buf;
+                ListCell *lc;
+                ListCell *lc2;
+                int i;
+
+                initStringInfo(&buf);
+
+                i = 0;
+                forboth(lc, xexpr->named_args, lc2, xexpr->arg_names)
+                {
+                    Expr *e = (Expr *) lfirst(lc);
+                    char *argname = strVal(lfirst(lc2));
+
+                    if (!argnull[i])
+                    {
+                        value = argvalue[i];
+                        appendStringInfo(&buf, "<%s>%s</%s>",
+                                         argname,
+                                         map_sql_value_to_xml_value(value,
+                                                                    exprType((Node *) e), true),
+                                         argname);
+                        *op->resnull = false;
+                    }
+                    i++;
+                }
+
+                if (!*op->resnull)
+                {
+                    text *result;
+
+                    result = cstring_to_text_with_len(buf.data, buf.len);
+                    *op->resvalue = PointerGetDatum(result);
+                }
+
+                pfree(buf.data);
+            }
+            break;
+
+        case IS_XMLELEMENT:
+            *op->resvalue = PointerGetDatum(xmlelement(xexpr,
+                                                       op->d.xmlexpr.named_argvalue,
+                                                       op->d.xmlexpr.named_argnull,
+                                                       op->d.xmlexpr.argvalue,
+                                                       op->d.xmlexpr.argnull));
+            *op->resnull = false;
+            break;
+
+        case IS_XMLPARSE:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+                text *data;
+                bool preserve_whitespace;
+
+                /* arguments are known to be text, bool */
+                Assert(list_length(xexpr->args) == 2);
+
+                if (argnull[0])
+                    return;
+                value = argvalue[0];
+                data = DatumGetTextPP(value);
+
+                if (argnull[1])    /* probably can't happen */
+                    return;
+                value = argvalue[1];
+                preserve_whitespace = DatumGetBool(value);
+
+                *op->resvalue = PointerGetDatum(xmlparse(data,
+                                                         xexpr->xmloption,
+                                                         preserve_whitespace));
+                *op->resnull = false;
+            }
+            break;
+
+        case IS_XMLPI:
+            {
+                text *arg;
+                bool isnull;
+
+                /* optional argument is known to be text */
+                Assert(list_length(xexpr->args) <= 1);
+
+                if (xexpr->args)
+                {
+                    isnull = op->d.xmlexpr.argnull[0];
+                    if (isnull)
+                        arg = NULL;
+                    else
+                        arg = DatumGetTextPP(op->d.xmlexpr.argvalue[0]);
+                }
+                else
+                {
+                    arg = NULL;
+                    isnull = false;
+                }
+
+                *op->resvalue = PointerGetDatum(xmlpi(xexpr->name,
+                                                      arg,
+                                                      isnull,
+                                                      op->resnull));
+            }
+            break;
+
+        case IS_XMLROOT:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+                xmltype *data;
+                text *version;
+                int standalone;
+
+                /* arguments are known to be xml, text, int */
+                Assert(list_length(xexpr->args) == 3);
+
+                if (argnull[0])
+                    return;
+                data = DatumGetXmlP(argvalue[0]);
+
+                if (argnull[1])
+                    version = NULL;
+                else
+                    version = DatumGetTextPP(argvalue[1]);
+
+                Assert(!argnull[2]);    /* always present */
+                standalone = DatumGetInt32(argvalue[2]);
+
+                *op->resvalue = PointerGetDatum(xmlroot(data,
+                                                        version,
+                                                        standalone));
+                *op->resnull = false;
+            }
+            break;
+
+        case IS_XMLSERIALIZE:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+
+                /* argument type is known to be xml */
+                Assert(list_length(xexpr->args) == 1);
+
+                if (argnull[0])
+                    return;
+                value = argvalue[0];
+
+                *op->resvalue =
PointerGetDatum(xmltotext_with_xmloption(DatumGetXmlP(value), + xexpr->xmloption)); + *op->resnull = false; + } + break; + + case IS_DOCUMENT: + { + Datum *argvalue = op->d.xmlexpr.argvalue; + bool *argnull = op->d.xmlexpr.argnull; + + /* optional argument is known to be xml */ + Assert(list_length(xexpr->args) == 1); + + if (argnull[0]) + return; + value = argvalue[0]; + + *op->resvalue = + BoolGetDatum(xml_is_document(DatumGetXmlP(value))); + *op->resnull = false; + } + break; + + default: + elog(ERROR, "unrecognized XML operation"); + break; + } +} + +/* + * ExecEvalGroupingFunc + * + * Computes a bitmask with a bit for each (unevaluated) argument expression + * (rightmost arg is least significant bit). + * + * A bit is set if the corresponding expression is NOT part of the set of + * grouping expressions in the current grouping set. + */ +void +ExecEvalGroupingFunc(ExprState *state, ExprEvalStep *op) +{ + AggState *aggstate = castNode(AggState, state->parent); + int result = 0; + Bitmapset *grouped_cols = aggstate->grouped_cols; + ListCell *lc; + + foreach(lc, op->d.grouping_func.clauses) + { + int attnum = lfirst_int(lc); + + result <<= 1; + + if (!bms_is_member(attnum, grouped_cols)) + result |= 1; + } + + *op->resvalue = Int32GetDatum(result); + *op->resnull = false; +} + +/* + * Hand off evaluation of a subplan to nodeSubplan.c + */ +void +ExecEvalSubPlan(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + SubPlanState *sstate = op->d.subplan.sstate; + + /* could potentially be nested, so make sure there's enough stack */ + check_stack_depth(); + + *op->resvalue = ExecSubPlan(sstate, econtext, op->resnull); +} + +/* + * Evaluate a wholerow Var expression. + * + * Returns a Datum whose value is the value of a whole-row range variable + * with respect to given expression context. + */ +void +ExecEvalWholeRowVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + Var *variable = op->d.wholerow.var; + TupleTableSlot *slot; + TupleDesc output_tupdesc; + MemoryContext oldcontext; + HeapTupleHeader dtuple; + HeapTuple tuple; + + /* This was checked by ExecInitExpr */ + Assert(variable->varattno == InvalidAttrNumber); + + /* Get the input slot we want */ + switch (variable->varno) + { + case INNER_VAR: + /* get the tuple from the inner node */ + slot = econtext->ecxt_innertuple; + break; + + case OUTER_VAR: + /* get the tuple from the outer node */ + slot = econtext->ecxt_outertuple; + break; + + /* INDEX_VAR is handled by default case */ + + default: + /* get the tuple from the relation being scanned */ + slot = econtext->ecxt_scantuple; + break; + } + + /* Apply the junkfilter if any */ + if (op->d.wholerow.junkFilter != NULL) + slot = ExecFilterJunk(op->d.wholerow.junkFilter, slot); + + /* + * If first time through, obtain tuple descriptor and check compatibility. + * + * XXX: It'd be great if this could be moved to the expression + * initialization phase, but due to using slots that's currently not + * feasible. + */ + if (op->d.wholerow.first) + { + /* optimistically assume we don't need slow path */ + op->d.wholerow.slow = false; + + /* + * If the Var identifies a named composite type, we must check that + * the actual tuple type is compatible with it. + */ + if (variable->vartype != RECORDOID) + { + TupleDesc var_tupdesc; + TupleDesc slot_tupdesc; + + /* + * We really only care about numbers of attributes and data types. 
+ * Also, we can ignore type mismatch on columns that are dropped + * in the destination type, so long as (1) the physical storage + * matches or (2) the actual column value is NULL. Case (1) is + * helpful in some cases involving out-of-date cached plans, while + * case (2) is expected behavior in situations such as an INSERT + * into a table with dropped columns (the planner typically + * generates an INT4 NULL regardless of the dropped column type). + * If we find a dropped column and cannot verify that case (1) + * holds, we have to use the slow path to check (2) for each row. + * + * If vartype is a domain over composite, just look through that + * to the base composite type. + */ + var_tupdesc = lookup_rowtype_tupdesc_domain(variable->vartype, + -1, false); + + slot_tupdesc = slot->tts_tupleDescriptor; + + if (var_tupdesc->natts != slot_tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail_plural("Table row contains %d attribute, but query expects %d.", + "Table row contains %d attributes, but query expects %d.", + slot_tupdesc->natts, + slot_tupdesc->natts, + var_tupdesc->natts))); + + for (int i = 0; i < var_tupdesc->natts; i++) + { + Form_pg_attribute vattr = TupleDescAttr(var_tupdesc, i); + Form_pg_attribute sattr = TupleDescAttr(slot_tupdesc, i); + + if (vattr->atttypid == sattr->atttypid) + continue; /* no worries */ + if (!vattr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Table has type %s at ordinal position %d, but query expects %s.", + format_type_be(sattr->atttypid), + i + 1, + format_type_be(vattr->atttypid)))); + + if (vattr->attlen != sattr->attlen || + vattr->attalign != sattr->attalign) + op->d.wholerow.slow = true; /* need to check for nulls */ + } + + /* + * Use the variable's declared rowtype as the descriptor for the + * output values. In particular, we *must* absorb any + * attisdropped markings. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + output_tupdesc = CreateTupleDescCopy(var_tupdesc); + MemoryContextSwitchTo(oldcontext); + + ReleaseTupleDesc(var_tupdesc); + } + else + { + /* + * In the RECORD case, we use the input slot's rowtype as the + * descriptor for the output values, modulo possibly assigning new + * column names below. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + output_tupdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor); + MemoryContextSwitchTo(oldcontext); + + /* + * It's possible that the input slot is a relation scan slot and + * so is marked with that relation's rowtype. But we're supposed + * to be returning RECORD, so reset to that. + */ + output_tupdesc->tdtypeid = RECORDOID; + output_tupdesc->tdtypmod = -1; + + /* + * We already got the correct physical datatype info above, but + * now we should try to find the source RTE and adopt its column + * aliases, since it's unlikely that the input slot has the + * desired names. + * + * If we can't locate the RTE, assume the column names we've got + * are OK. (As of this writing, the only cases where we can't + * locate the RTE are in execution of trigger WHEN clauses, and + * then the Var will have the trigger's relation's rowtype, so its + * names are fine.) Also, if the creator of the RTE didn't bother + * to fill in an eref field, assume our column names are OK. (This + * happens in COPY, and perhaps other places.) 
+ */ + if (econtext->ecxt_estate && + variable->varno <= econtext->ecxt_estate->es_range_table_size) + { + RangeTblEntry *rte = exec_rt_fetch(variable->varno, + econtext->ecxt_estate); + + if (rte->eref) + ExecTypeSetColNames(output_tupdesc, rte->eref->colnames); + } + } + + /* Bless the tupdesc if needed, and save it in the execution state */ + op->d.wholerow.tupdesc = BlessTupleDesc(output_tupdesc); + + op->d.wholerow.first = false; + } + + /* + * Make sure all columns of the slot are accessible in the slot's + * Datum/isnull arrays. + */ + slot_getallattrs(slot); + + if (op->d.wholerow.slow) + { + /* Check to see if any dropped attributes are non-null */ + TupleDesc tupleDesc = slot->tts_tupleDescriptor; + TupleDesc var_tupdesc = op->d.wholerow.tupdesc; + + Assert(var_tupdesc->natts == tupleDesc->natts); + + for (int i = 0; i < var_tupdesc->natts; i++) + { + Form_pg_attribute vattr = TupleDescAttr(var_tupdesc, i); + Form_pg_attribute sattr = TupleDescAttr(tupleDesc, i); + + if (!vattr->attisdropped) + continue; /* already checked non-dropped cols */ + if (slot->tts_isnull[i]) + continue; /* null is always okay */ + if (vattr->attlen != sattr->attlen || + vattr->attalign != sattr->attalign) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Physical storage mismatch on dropped attribute at ordinal position %d.", + i + 1))); + } + } + + /* + * Build a composite datum, making sure any toasted fields get detoasted. + * + * (Note: it is critical that we not change the slot's state here.) + */ + tuple = toast_build_flattened_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + dtuple = tuple->t_data; + + /* + * Label the datum with the composite type info we identified before. + * + * (Note: we could skip doing this by passing op->d.wholerow.tupdesc to + * the tuple build step; but that seems a tad risky so let's not.) + */ + HeapTupleHeaderSetTypeId(dtuple, op->d.wholerow.tupdesc->tdtypeid); + HeapTupleHeaderSetTypMod(dtuple, op->d.wholerow.tupdesc->tdtypmod); + + *op->resvalue = PointerGetDatum(dtuple); + *op->resnull = false; +} + +void +ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext, + TupleTableSlot *slot) +{ + Datum d; + + /* slot_getsysattr has sufficient defenses against bad attnums */ + d = slot_getsysattr(slot, + op->d.var.attnum, + op->resnull); + *op->resvalue = d; + /* this ought to be unreachable, but it's cheap enough to check */ + if (unlikely(*op->resnull)) + elog(ERROR, "failed to fetch attribute from slot"); +} + +/* + * Transition value has not been initialized. This is the first non-NULL input + * value for a group. We use it as the initial value for transValue. + */ +void +ExecAggInitGroup(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroup, + ExprContext *aggcontext) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + + /* + * We must copy the datum into aggcontext if it is pass-by-ref. We do not + * need to pfree the old transValue, since it's NULL. (We already checked + * that the agg's input type is binary-compatible with its transtype, so + * straight copy here is OK.) 
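+ *
+ * For instance, an aggregate such as max(), whose transition function is
+ * strict and which has no initcond, reaches this function for the first
+ * non-NULL input of each group; that value is datumCopy'd to become the
+ * starting transition value.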
+ */ + oldContext = MemoryContextSwitchTo(aggcontext->ecxt_per_tuple_memory); + pergroup->transValue = datumCopy(fcinfo->args[1].value, + pertrans->transtypeByVal, + pertrans->transtypeLen); + pergroup->transValueIsNull = false; + pergroup->noTransValue = false; + MemoryContextSwitchTo(oldContext); +} + +/* + * Ensure that the current transition value is a child of the aggcontext, + * rather than the per-tuple context. + * + * NB: This can change the current memory context. + */ +Datum +ExecAggTransReparent(AggState *aggstate, AggStatePerTrans pertrans, + Datum newValue, bool newValueIsNull, + Datum oldValue, bool oldValueIsNull) +{ + Assert(newValue != oldValue); + + if (!newValueIsNull) + { + MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory); + if (DatumIsReadWriteExpandedObject(newValue, + false, + pertrans->transtypeLen) && + MemoryContextGetParent(DatumGetEOHP(newValue)->eoh_context) == CurrentMemoryContext) + /* do nothing */ ; + else + newValue = datumCopy(newValue, + pertrans->transtypeByVal, + pertrans->transtypeLen); + } + else + { + /* + * Ensure that AggStatePerGroup->transValue ends up being 0, so + * callers can safely compare newValue/oldValue without having to + * check their respective nullness. + */ + newValue = (Datum) 0; + } + + if (!oldValueIsNull) + { + if (DatumIsReadWriteExpandedObject(oldValue, + false, + pertrans->transtypeLen)) + DeleteExpandedObject(oldValue); + else + pfree(DatumGetPointer(oldValue)); + } + + return newValue; +} + +/* + * Invoke ordered transition function, with a datum argument. + */ +void +ExecEvalAggOrderedTransDatum(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + int setno = op->d.agg_trans.setno; + + tuplesort_putdatum(pertrans->sortstates[setno], + *op->resvalue, *op->resnull); +} + +/* + * Invoke ordered transition function, with a tuple argument. + */ +void +ExecEvalAggOrderedTransTuple(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + int setno = op->d.agg_trans.setno; + + ExecClearTuple(pertrans->sortslot); + pertrans->sortslot->tts_nvalid = pertrans->numInputs; + ExecStoreVirtualTuple(pertrans->sortslot); + tuplesort_puttupleslot(pertrans->sortstates[setno], pertrans->sortslot); +} + +/* implementation of transition function invocation for byval types */ +static pg_attribute_always_inline void +ExecAggPlainTransByVal(AggState *aggstate, AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, int setno) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + Datum newVal; + + /* cf. 
select_current_set() */ + aggstate->curaggcontext = aggcontext; + aggstate->current_set = setno; + + /* set up aggstate->curpertrans for AggGetAggref() */ + aggstate->curpertrans = pertrans; + + /* invoke transition function in per-tuple context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + fcinfo->args[0].value = pergroup->transValue; + fcinfo->args[0].isnull = pergroup->transValueIsNull; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + newVal = FunctionCallInvoke(fcinfo); + + pergroup->transValue = newVal; + pergroup->transValueIsNull = fcinfo->isnull; + + MemoryContextSwitchTo(oldContext); +} + +/* implementation of transition function invocation for byref types */ +static pg_attribute_always_inline void +ExecAggPlainTransByRef(AggState *aggstate, AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, int setno) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + Datum newVal; + + /* cf. select_current_set() */ + aggstate->curaggcontext = aggcontext; + aggstate->current_set = setno; + + /* set up aggstate->curpertrans for AggGetAggref() */ + aggstate->curpertrans = pertrans; + + /* invoke transition function in per-tuple context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + fcinfo->args[0].value = pergroup->transValue; + fcinfo->args[0].isnull = pergroup->transValueIsNull; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + newVal = FunctionCallInvoke(fcinfo); + + /* + * For pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. Also, if transfn returned a + * pointer to a R/W expanded object that is already a child of the + * aggcontext, assume we can adopt that value without copying it. + * + * It's safe to compare newVal with pergroup->transValue without regard + * for either being NULL, because ExecAggTransReparent() takes care to set + * transValue to 0 when NULL. Otherwise we could end up accidentally not + * reparenting, when the transValue has the same numerical value as + * newValue, despite being NULL. This is a somewhat hot path, making it + * undesirable to instead solve this with another branch for the common + * case of the transition function returning its (modified) input + * argument. 
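+	 *
+	 * To make the "returned a pointer to its first input" case concrete,
+	 * here is a hypothetical pass-by-ref transition function (not one that
+	 * exists in core; assume a STRICT aggregate over text, so neither
+	 * argument is NULL here).  If it hands back the very same pointer it
+	 * was given, the pointer test below skips the copy; otherwise
+	 * ExecAggTransReparent() copies the returned value into the aggcontext
+	 * and frees the old one:
+	 *
+	 *		Datum
+	 *		longest_text_transfn(PG_FUNCTION_ARGS)
+	 *		{
+	 *			text   *state = PG_GETARG_TEXT_PP(0);
+	 *			text   *value = PG_GETARG_TEXT_PP(1);
+	 *
+	 *			if (VARSIZE_ANY_EXHDR(state) >= VARSIZE_ANY_EXHDR(value))
+	 *				PG_RETURN_TEXT_P(state);
+	 *			PG_RETURN_TEXT_P(value);
+	 *		}
+	 *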
+ */ + if (DatumGetPointer(newVal) != DatumGetPointer(pergroup->transValue)) + newVal = ExecAggTransReparent(aggstate, pertrans, + newVal, fcinfo->isnull, + pergroup->transValue, + pergroup->transValueIsNull); + + pergroup->transValue = newVal; + pergroup->transValueIsNull = fcinfo->isnull; + + MemoryContextSwitchTo(oldContext); +} diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c new file mode 100644 index 0000000..c11427a --- /dev/null +++ b/src/backend/executor/execGrouping.c @@ -0,0 +1,560 @@ +/*------------------------------------------------------------------------- + * + * execGrouping.c + * executor utility routines for grouping, hashing, and aggregation + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execGrouping.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/parallel.h" +#include "common/hashfn.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2); +static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb, + const MinimalTuple tuple); +static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable, + TupleTableSlot *slot, + bool *isnew, uint32 hash); + +/* + * Define parameters for tuple hash table code generation. The interface is + * *also* declared in execnodes.h (to generate the types, which are externally + * visible). + */ +#define SH_PREFIX tuplehash +#define SH_ELEMENT_TYPE TupleHashEntryData +#define SH_KEY_TYPE MinimalTuple +#define SH_KEY firstTuple +#define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key) +#define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0 +#define SH_SCOPE extern +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + + +/***************************************************************************** + * Utility routines for grouping tuples together + *****************************************************************************/ + +/* + * execTuplesMatchPrepare + * Build expression that can be evaluated using ExecQual(), returning + * whether an ExprContext's inner/outer tuples are NOT DISTINCT + */ +ExprState * +execTuplesMatchPrepare(TupleDesc desc, + int numCols, + const AttrNumber *keyColIdx, + const Oid *eqOperators, + const Oid *collations, + PlanState *parent) +{ + Oid *eqFunctions = (Oid *) palloc(numCols * sizeof(Oid)); + int i; + ExprState *expr; + + if (numCols == 0) + return NULL; + + /* lookup equality functions */ + for (i = 0; i < numCols; i++) + eqFunctions[i] = get_opcode(eqOperators[i]); + + /* build actual expression */ + expr = ExecBuildGroupingEqual(desc, desc, NULL, NULL, + numCols, keyColIdx, eqFunctions, collations, + parent); + + return expr; +} + +/* + * execTuplesHashPrepare + * Look up the equality and hashing functions needed for a TupleHashTable. + * + * This is similar to execTuplesMatchPrepare, but we also need to find the + * hash functions associated with the equality operators. *eqFunctions and + * *hashFunctions receive the palloc'd result arrays. + * + * Note: we expect that the given operators are not cross-type comparisons. 
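+ *
+ * As a rough usage sketch (hypothetical caller: dupOperators, dupCollations,
+ * keyColIdx, 'node' and the memory contexts are assumed to come from the
+ * plan node's state), the two prepare steps typically feed the hash table
+ * constructor below:
+ *
+ *		Oid		   *eqfuncoids;
+ *		FmgrInfo   *hashfunctions;
+ *
+ *		execTuplesHashPrepare(numCols, dupOperators,
+ *							  &eqfuncoids, &hashfunctions);
+ *		hashtable = BuildTupleHashTableExt(&node->ps, inputDesc,
+ *										   numCols, keyColIdx,
+ *										   eqfuncoids, hashfunctions,
+ *										   dupCollations, nbuckets, 0,
+ *										   metacxt, tablecxt, tempcxt,
+ *										   false);
+ *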
+ */ +void +execTuplesHashPrepare(int numCols, + const Oid *eqOperators, + Oid **eqFuncOids, + FmgrInfo **hashFunctions) +{ + int i; + + *eqFuncOids = (Oid *) palloc(numCols * sizeof(Oid)); + *hashFunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo)); + + for (i = 0; i < numCols; i++) + { + Oid eq_opr = eqOperators[i]; + Oid eq_function; + Oid left_hash_function; + Oid right_hash_function; + + eq_function = get_opcode(eq_opr); + if (!get_op_hash_functions(eq_opr, + &left_hash_function, &right_hash_function)) + elog(ERROR, "could not find hash function for hash operator %u", + eq_opr); + /* We're not supporting cross-type cases here */ + Assert(left_hash_function == right_hash_function); + (*eqFuncOids)[i] = eq_function; + fmgr_info(right_hash_function, &(*hashFunctions)[i]); + } +} + + +/***************************************************************************** + * Utility routines for all-in-memory hash tables + * + * These routines build hash tables for grouping tuples together (eg, for + * hash aggregation). There is one entry for each not-distinct set of tuples + * presented. + *****************************************************************************/ + +/* + * Construct an empty TupleHashTable + * + * numCols, keyColIdx: identify the tuple fields to use as lookup key + * eqfunctions: equality comparison functions to use + * hashfunctions: datatype-specific hashing functions to use + * nbuckets: initial estimate of hashtable size + * additionalsize: size of data stored in ->additional + * metacxt: memory context for long-lived allocation, but not per-entry data + * tablecxt: memory context in which to store table entries + * tempcxt: short-lived context for evaluation hash and comparison functions + * + * The function arrays may be made with execTuplesHashPrepare(). Note they + * are not cross-type functions, but expect to see the table datatype(s) + * on both sides. + * + * Note that keyColIdx, eqfunctions, and hashfunctions must be allocated in + * storage that will live as long as the hashtable does. + */ +TupleHashTable +BuildTupleHashTableExt(PlanState *parent, + TupleDesc inputDesc, + int numCols, AttrNumber *keyColIdx, + const Oid *eqfuncoids, + FmgrInfo *hashfunctions, + Oid *collations, + long nbuckets, Size additionalsize, + MemoryContext metacxt, + MemoryContext tablecxt, + MemoryContext tempcxt, + bool use_variable_hash_iv) +{ + TupleHashTable hashtable; + Size entrysize = sizeof(TupleHashEntryData) + additionalsize; + Size hash_mem_limit; + MemoryContext oldcontext; + bool allow_jit; + + Assert(nbuckets > 0); + + /* Limit initial table size request to not more than hash_mem */ + hash_mem_limit = get_hash_memory_limit() / entrysize; + if (nbuckets > hash_mem_limit) + nbuckets = hash_mem_limit; + + oldcontext = MemoryContextSwitchTo(metacxt); + + hashtable = (TupleHashTable) palloc(sizeof(TupleHashTableData)); + + hashtable->numCols = numCols; + hashtable->keyColIdx = keyColIdx; + hashtable->tab_hash_funcs = hashfunctions; + hashtable->tab_collations = collations; + hashtable->tablecxt = tablecxt; + hashtable->tempcxt = tempcxt; + hashtable->entrysize = entrysize; + hashtable->tableslot = NULL; /* will be made on first lookup */ + hashtable->inputslot = NULL; + hashtable->in_hash_funcs = NULL; + hashtable->cur_eq_func = NULL; + + /* + * If parallelism is in use, even if the leader backend is performing the + * scan itself, we don't want to create the hashtable exactly the same way + * in all workers. 
As hashtables are iterated over in keyspace-order, + * doing so in all processes in the same way is likely to lead to + * "unbalanced" hashtables when the table size initially is + * underestimated. + */ + if (use_variable_hash_iv) + hashtable->hash_iv = murmurhash32(ParallelWorkerNumber); + else + hashtable->hash_iv = 0; + + hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable); + + /* + * We copy the input tuple descriptor just for safety --- we assume all + * input tuples will have equivalent descriptors. + */ + hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc), + &TTSOpsMinimalTuple); + + /* + * If the old reset interface is used (i.e. BuildTupleHashTable, rather + * than BuildTupleHashTableExt), allowing JIT would lead to the generated + * functions to a) live longer than the query b) be re-generated each time + * the table is being reset. Therefore prevent JIT from being used in that + * case, by not providing a parent node (which prevents accessing the + * JitContext in the EState). + */ + allow_jit = metacxt != tablecxt; + + /* build comparator for all columns */ + /* XXX: should we support non-minimal tuples for the inputslot? */ + hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc, + &TTSOpsMinimalTuple, &TTSOpsMinimalTuple, + numCols, + keyColIdx, eqfuncoids, collations, + allow_jit ? parent : NULL); + + /* + * While not pretty, it's ok to not shut down this context, but instead + * rely on the containing memory context being reset, as + * ExecBuildGroupingEqual() only builds a very simple expression calling + * functions (i.e. nothing that'd employ RegisterExprContextCallback()). + */ + hashtable->exprcontext = CreateStandaloneExprContext(); + + MemoryContextSwitchTo(oldcontext); + + return hashtable; +} + +/* + * BuildTupleHashTable is a backwards-compatibilty wrapper for + * BuildTupleHashTableExt(), that allocates the hashtable's metadata in + * tablecxt. Note that hashtables created this way cannot be reset leak-free + * with ResetTupleHashTable(). + */ +TupleHashTable +BuildTupleHashTable(PlanState *parent, + TupleDesc inputDesc, + int numCols, AttrNumber *keyColIdx, + const Oid *eqfuncoids, + FmgrInfo *hashfunctions, + Oid *collations, + long nbuckets, Size additionalsize, + MemoryContext tablecxt, + MemoryContext tempcxt, + bool use_variable_hash_iv) +{ + return BuildTupleHashTableExt(parent, + inputDesc, + numCols, keyColIdx, + eqfuncoids, + hashfunctions, + collations, + nbuckets, additionalsize, + tablecxt, + tablecxt, + tempcxt, + use_variable_hash_iv); +} + +/* + * Reset contents of the hashtable to be empty, preserving all the non-content + * state. Note that the tablecxt passed to BuildTupleHashTableExt() should + * also be reset, otherwise there will be leaks. + */ +void +ResetTupleHashTable(TupleHashTable hashtable) +{ + tuplehash_reset(hashtable->hashtab); +} + +/* + * Find or create a hashtable entry for the tuple group containing the + * given tuple. The tuple must be the same type as the hashtable entries. + * + * If isnew is NULL, we do not create new entries; we return NULL if no + * match is found. + * + * If hash is not NULL, we set it to the calculated hash value. This allows + * callers access to the hash value even if no entry is returned. + * + * If isnew isn't NULL, then a new entry is created if no existing entry + * matches. On return, *isnew is true if the entry is newly created, + * false if it existed already. ->additional_data in the new entry has + * been zeroed. 
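+ *
+ * Rough caller sketch; the per-group struct name is invented for the
+ * example, and real callers size it via 'additionalsize' when building
+ * the table:
+ *
+ *		entry = LookupTupleHashEntry(hashtable, slot, &isnew, NULL);
+ *		if (isnew)
+ *			entry->additional =
+ *				MemoryContextAllocZero(hashtable->tablecxt,
+ *									   sizeof(MyPerGroupData));
+ *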
+ */ +TupleHashEntry +LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, + bool *isnew, uint32 *hash) +{ + TupleHashEntry entry; + MemoryContext oldContext; + uint32 local_hash; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + /* set up data needed by hash and match functions */ + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + hashtable->cur_eq_func = hashtable->tab_eq_func; + + local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL); + entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash); + + if (hash != NULL) + *hash = local_hash; + + Assert(entry == NULL || entry->hash == local_hash); + + MemoryContextSwitchTo(oldContext); + + return entry; +} + +/* + * Compute the hash value for a tuple + */ +uint32 +TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot) +{ + MemoryContext oldContext; + uint32 hash; + + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + hash = TupleHashTableHash_internal(hashtable->hashtab, NULL); + + MemoryContextSwitchTo(oldContext); + + return hash; +} + +/* + * A variant of LookupTupleHashEntry for callers that have already computed + * the hash value. + */ +TupleHashEntry +LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot, + bool *isnew, uint32 hash) +{ + TupleHashEntry entry; + MemoryContext oldContext; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + /* set up data needed by hash and match functions */ + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + hashtable->cur_eq_func = hashtable->tab_eq_func; + + entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash); + Assert(entry == NULL || entry->hash == hash); + + MemoryContextSwitchTo(oldContext); + + return entry; +} + +/* + * Search for a hashtable entry matching the given tuple. No entry is + * created if there's not a match. This is similar to the non-creating + * case of LookupTupleHashEntry, except that it supports cross-type + * comparisons, in which the given tuple is not of the same type as the + * table entries. The caller must provide the hash functions to use for + * the input tuple, as well as the equality functions, since these may be + * different from the table's internal functions. + */ +TupleHashEntry +FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, + ExprState *eqcomp, + FmgrInfo *hashfunctions) +{ + TupleHashEntry entry; + MemoryContext oldContext; + MinimalTuple key; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + /* Set up data needed by hash and match functions */ + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashfunctions; + hashtable->cur_eq_func = eqcomp; + + /* Search the hash table */ + key = NULL; /* flag to reference inputslot */ + entry = tuplehash_lookup(hashtable->hashtab, key); + MemoryContextSwitchTo(oldContext); + + return entry; +} + +/* + * If tuple is NULL, use the input slot instead. This convention avoids the + * need to materialize virtual input tuples unless they actually need to get + * copied into the table. 
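+ *
+ * (Usage note for the lookup routines above, as a rough sketch: a caller
+ * that asks LookupTupleHashEntry() for the computed hash can feed it back
+ * to LookupTupleHashEntryHash() later to avoid recomputing it, e.g. when a
+ * tuple is stashed somewhere and looked up again afterwards:
+ *
+ *		entry = LookupTupleHashEntry(hashtable, slot, &isnew, &hash);
+ *		... remember 'hash' alongside the stashed tuple ...
+ *		entry = LookupTupleHashEntryHash(hashtable, slot, &isnew, hash);
+ *
+ * This is the sort of pattern hash aggregation uses for spilled tuples.)
+ *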
+ * + * Also, the caller must select an appropriate memory context for running + * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.) + */ +static uint32 +TupleHashTableHash_internal(struct tuplehash_hash *tb, + const MinimalTuple tuple) +{ + TupleHashTable hashtable = (TupleHashTable) tb->private_data; + int numCols = hashtable->numCols; + AttrNumber *keyColIdx = hashtable->keyColIdx; + uint32 hashkey = hashtable->hash_iv; + TupleTableSlot *slot; + FmgrInfo *hashfunctions; + int i; + + if (tuple == NULL) + { + /* Process the current input tuple for the table */ + slot = hashtable->inputslot; + hashfunctions = hashtable->in_hash_funcs; + } + else + { + /* + * Process a tuple already stored in the table. + * + * (this case never actually occurs due to the way simplehash.h is + * used, as the hash-value is stored in the entries) + */ + slot = hashtable->tableslot; + ExecStoreMinimalTuple(tuple, slot, false); + hashfunctions = hashtable->tab_hash_funcs; + } + + for (i = 0; i < numCols; i++) + { + AttrNumber att = keyColIdx[i]; + Datum attr; + bool isNull; + + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + attr = slot_getattr(slot, att, &isNull); + + if (!isNull) /* treat nulls as having hash key 0 */ + { + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], + hashtable->tab_collations[i], + attr)); + hashkey ^= hkey; + } + } + + /* + * The way hashes are combined above, among each other and with the IV, + * doesn't lead to good bit perturbation. As the IV's goal is to lead to + * achieve that, perform a round of hashing of the combined hash - + * resulting in near perfect perturbation. + */ + return murmurhash32(hashkey); +} + +/* + * Does the work of LookupTupleHashEntry and LookupTupleHashEntryHash. Useful + * so that we can avoid switching the memory context multiple times for + * LookupTupleHashEntry. + * + * NB: This function may or may not change the memory context. Caller is + * expected to change it back. + */ +static inline TupleHashEntry +LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, + bool *isnew, uint32 hash) +{ + TupleHashEntryData *entry; + bool found; + MinimalTuple key; + + key = NULL; /* flag to reference inputslot */ + + if (isnew) + { + entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found); + + if (found) + { + /* found pre-existing entry */ + *isnew = false; + } + else + { + /* created new entry */ + *isnew = true; + /* zero caller data */ + entry->additional = NULL; + MemoryContextSwitchTo(hashtable->tablecxt); + /* Copy the first tuple into the table context */ + entry->firstTuple = ExecCopySlotMinimalTuple(slot); + } + } + else + { + entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash); + } + + return entry; +} + +/* + * See whether two tuples (presumably of the same hash value) match + */ +static int +TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2) +{ + TupleTableSlot *slot1; + TupleTableSlot *slot2; + TupleHashTable hashtable = (TupleHashTable) tb->private_data; + ExprContext *econtext = hashtable->exprcontext; + + /* + * We assume that simplehash.h will only ever call us with the first + * argument being an actual table entry, and the second argument being + * LookupTupleHashEntry's dummy TupleHashEntryData. The other direction + * could be supported too, but is not currently required. 
+ */ + Assert(tuple1 != NULL); + slot1 = hashtable->tableslot; + ExecStoreMinimalTuple(tuple1, slot1, false); + Assert(tuple2 == NULL); + slot2 = hashtable->inputslot; + + /* For crosstype comparisons, the inputslot must be first */ + econtext->ecxt_innertuple = slot2; + econtext->ecxt_outertuple = slot1; + return !ExecQualAndReset(hashtable->cur_eq_func, econtext); +} diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c new file mode 100644 index 0000000..74becdc --- /dev/null +++ b/src/backend/executor/execIndexing.c @@ -0,0 +1,921 @@ +/*------------------------------------------------------------------------- + * + * execIndexing.c + * routines for inserting index tuples and enforcing unique and + * exclusion constraints. + * + * ExecInsertIndexTuples() is the main entry point. It's called after + * inserting a tuple to the heap, and it inserts corresponding index tuples + * into all indexes. At the same time, it enforces any unique and + * exclusion constraints: + * + * Unique Indexes + * -------------- + * + * Enforcing a unique constraint is straightforward. When the index AM + * inserts the tuple to the index, it also checks that there are no + * conflicting tuples in the index already. It does so atomically, so that + * even if two backends try to insert the same key concurrently, only one + * of them will succeed. All the logic to ensure atomicity, and to wait + * for in-progress transactions to finish, is handled by the index AM. + * + * If a unique constraint is deferred, we request the index AM to not + * throw an error if a conflict is found. Instead, we make note that there + * was a conflict and return the list of indexes with conflicts to the + * caller. The caller must re-check them later, by calling index_insert() + * with the UNIQUE_CHECK_EXISTING option. + * + * Exclusion Constraints + * --------------------- + * + * Exclusion constraints are different from unique indexes in that when the + * tuple is inserted to the index, the index AM does not check for + * duplicate keys at the same time. After the insertion, we perform a + * separate scan on the index to check for conflicting tuples, and if one + * is found, we throw an error and the transaction is aborted. If the + * conflicting tuple's inserter or deleter is in-progress, we wait for it + * to finish first. + * + * There is a chance of deadlock, if two backends insert a tuple at the + * same time, and then perform the scan to check for conflicts. They will + * find each other's tuple, and both try to wait for each other. The + * deadlock detector will detect that, and abort one of the transactions. + * That's fairly harmless, as one of them was bound to abort with a + * "duplicate key error" anyway, although you get a different error + * message. + * + * If an exclusion constraint is deferred, we still perform the conflict + * checking scan immediately after inserting the index tuple. But instead + * of throwing an error if a conflict is found, we return that information + * to the caller. The caller must re-check them later by calling + * check_exclusion_constraint(). + * + * Speculative insertion + * --------------------- + * + * Speculative insertion is a two-phase mechanism used to implement + * INSERT ... ON CONFLICT DO UPDATE/NOTHING. The tuple is first inserted + * to the heap and update the indexes as usual, but if a constraint is + * violated, we can still back out the insertion without aborting the whole + * transaction. In an INSERT ... 
ON CONFLICT statement, if a conflict is + * detected, the inserted tuple is backed out and the ON CONFLICT action is + * executed instead. + * + * Insertion to a unique index works as usual: the index AM checks for + * duplicate keys atomically with the insertion. But instead of throwing + * an error on a conflict, the speculatively inserted heap tuple is backed + * out. + * + * Exclusion constraints are slightly more complicated. As mentioned + * earlier, there is a risk of deadlock when two backends insert the same + * key concurrently. That was not a problem for regular insertions, when + * one of the transactions has to be aborted anyway, but with a speculative + * insertion we cannot let a deadlock happen, because we only want to back + * out the speculatively inserted tuple on conflict, not abort the whole + * transaction. + * + * When a backend detects that the speculative insertion conflicts with + * another in-progress tuple, it has two options: + * + * 1. back out the speculatively inserted tuple, then wait for the other + * transaction, and retry. Or, + * 2. wait for the other transaction, with the speculatively inserted tuple + * still in place. + * + * If two backends insert at the same time, and both try to wait for each + * other, they will deadlock. So option 2 is not acceptable. Option 1 + * avoids the deadlock, but it is prone to a livelock instead. Both + * transactions will wake up immediately as the other transaction backs + * out. Then they both retry, and conflict with each other again, lather, + * rinse, repeat. + * + * To avoid the livelock, one of the backends must back out first, and then + * wait, while the other one waits without backing out. It doesn't matter + * which one backs out, so we employ an arbitrary rule that the transaction + * with the higher XID backs out. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execIndexing.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/index.h" +#include "executor/executor.h" +#include "nodes/nodeFuncs.h" +#include "storage/lmgr.h" +#include "utils/snapmgr.h" + +/* waitMode argument to check_exclusion_or_unique_constraint() */ +typedef enum +{ + CEOUC_WAIT, + CEOUC_NOWAIT, + CEOUC_LIVELOCK_PREVENTING_WAIT +} CEOUC_WAIT_MODE; + +static bool check_exclusion_or_unique_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex, + CEOUC_WAIT_MODE waitMode, + bool errorOK, + ItemPointer conflictTid); + +static bool index_recheck_constraint(Relation index, Oid *constr_procs, + Datum *existing_values, bool *existing_isnull, + Datum *new_values); + +/* ---------------------------------------------------------------- + * ExecOpenIndices + * + * Find the indices associated with a result relation, open them, + * and save information about them in the result ResultRelInfo. + * + * At entry, caller has already opened and locked + * resultRelInfo->ri_RelationDesc. 
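+ *
+ *		As a rough caller sketch (names such as 'onconflict' stand in for
+ *		the caller's own state), the usual lifecycle around an insertion is:
+ *
+ *			ExecOpenIndices(resultRelInfo, onconflict != ONCONFLICT_NONE);
+ *			... insert the heap tuple, e.g. via table_tuple_insert() ...
+ *			recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot,
+ *												   estate, false, false,
+ *												   NULL, NIL);
+ *			...
+ *			ExecCloseIndices(resultRelInfo);
+ *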
+ * ---------------------------------------------------------------- + */ +void +ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative) +{ + Relation resultRelation = resultRelInfo->ri_RelationDesc; + List *indexoidlist; + ListCell *l; + int len, + i; + RelationPtr relationDescs; + IndexInfo **indexInfoArray; + + resultRelInfo->ri_NumIndices = 0; + + /* fast path if no indexes */ + if (!RelationGetForm(resultRelation)->relhasindex) + return; + + /* + * Get cached list of index OIDs + */ + indexoidlist = RelationGetIndexList(resultRelation); + len = list_length(indexoidlist); + if (len == 0) + return; + + /* + * allocate space for result arrays + */ + relationDescs = (RelationPtr) palloc(len * sizeof(Relation)); + indexInfoArray = (IndexInfo **) palloc(len * sizeof(IndexInfo *)); + + resultRelInfo->ri_NumIndices = len; + resultRelInfo->ri_IndexRelationDescs = relationDescs; + resultRelInfo->ri_IndexRelationInfo = indexInfoArray; + + /* + * For each index, open the index relation and save pg_index info. We + * acquire RowExclusiveLock, signifying we will update the index. + * + * Note: we do this even if the index is not indisready; it's not worth + * the trouble to optimize for the case where it isn't. + */ + i = 0; + foreach(l, indexoidlist) + { + Oid indexOid = lfirst_oid(l); + Relation indexDesc; + IndexInfo *ii; + + indexDesc = index_open(indexOid, RowExclusiveLock); + + /* extract index key information from the index's pg_index info */ + ii = BuildIndexInfo(indexDesc); + + /* + * If the indexes are to be used for speculative insertion, add extra + * information required by unique index entries. + */ + if (speculative && ii->ii_Unique) + BuildSpeculativeIndexInfo(indexDesc, ii); + + relationDescs[i] = indexDesc; + indexInfoArray[i] = ii; + i++; + } + + list_free(indexoidlist); +} + +/* ---------------------------------------------------------------- + * ExecCloseIndices + * + * Close the index relations stored in resultRelInfo + * ---------------------------------------------------------------- + */ +void +ExecCloseIndices(ResultRelInfo *resultRelInfo) +{ + int i; + int numIndices; + RelationPtr indexDescs; + + numIndices = resultRelInfo->ri_NumIndices; + indexDescs = resultRelInfo->ri_IndexRelationDescs; + + for (i = 0; i < numIndices; i++) + { + if (indexDescs[i] == NULL) + continue; /* shouldn't happen? */ + + /* Drop lock acquired by ExecOpenIndices */ + index_close(indexDescs[i], RowExclusiveLock); + } + + /* + * XXX should free indexInfo array here too? Currently we assume that + * such stuff will be cleaned up automatically in FreeExecutorState. + */ +} + +/* ---------------------------------------------------------------- + * ExecInsertIndexTuples + * + * This routine takes care of inserting index tuples + * into all the relations indexing the result relation + * when a heap tuple is inserted into the result relation. + * + * When 'update' is true, executor is performing an UPDATE + * that could not use an optimization like heapam's HOT (in + * more general terms a call to table_tuple_update() took + * place and set 'update_indexes' to true). Receiving this + * hint makes us consider if we should pass down the + * 'indexUnchanged' hint in turn. That's something that we + * figure out for each index_insert() call iff 'update' is + * true. (When 'update' is false we already know not to pass + * the hint to any index.) + * + * Unique and exclusion constraints are enforced at the same + * time. 
This returns a list of index OIDs for any unique or + * exclusion constraints that are deferred and that had + * potential (unconfirmed) conflicts. (if noDupErr == true, + * the same is done for non-deferred constraints, but report + * if conflict was speculative or deferred conflict to caller) + * + * If 'arbiterIndexes' is nonempty, noDupErr applies only to + * those indexes. NIL means noDupErr applies to all indexes. + * ---------------------------------------------------------------- + */ +List * +ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate, + bool update, + bool noDupErr, + bool *specConflict, + List *arbiterIndexes) +{ + ItemPointer tupleid = &slot->tts_tid; + List *result = NIL; + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + + Assert(ItemPointerIsValid(tupleid)); + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool applyNoDupErr; + IndexUniqueCheck checkUnique; + bool indexUnchanged; + bool satisfiesConstraint; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Check whether to apply noDupErr to this index */ + applyNoDupErr = noDupErr && + (arbiterIndexes == NIL || + list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)); + + /* + * The index AM does the actual insertion, plus uniqueness checking. + * + * For an immediate-mode unique index, we just tell the index AM to + * throw error if not unique. + * + * For a deferrable unique index, we tell the index AM to just detect + * possible non-uniqueness, and we add the index OID to the result + * list if further checking is needed. + * + * For a speculative insertion (used by INSERT ... ON CONFLICT), do + * the same as for a deferrable unique index. 
+ */ + if (!indexRelation->rd_index->indisunique) + checkUnique = UNIQUE_CHECK_NO; + else if (applyNoDupErr) + checkUnique = UNIQUE_CHECK_PARTIAL; + else if (indexRelation->rd_index->indimmediate) + checkUnique = UNIQUE_CHECK_YES; + else + checkUnique = UNIQUE_CHECK_PARTIAL; + + /* + * There's definitely going to be an index_insert() call for this + * index. If we're being called as part of an UPDATE statement, + * consider if the 'indexUnchanged' = true hint should be passed. + * + * XXX We always assume that the hint should be passed for an UPDATE. + * This is a workaround for a bug in PostgreSQL 14. In practice this + * won't make much difference for current users of the hint. + */ + indexUnchanged = update; + + satisfiesConstraint = + index_insert(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexUnchanged, /* UPDATE without logical change? */ + indexInfo); /* index AM may need this */ + + /* + * If the index has an associated exclusion constraint, check that. + * This is simpler than the process for uniqueness checks since we + * always insert first and then check. If the constraint is deferred, + * we check now anyway, but don't throw error on violation or wait for + * a conclusive outcome from a concurrent insertion; instead we'll + * queue a recheck event. Similarly, noDupErr callers (speculative + * inserters) will recheck later, and wait for a conclusive outcome + * then. + * + * An index for an exclusion constraint can't also be UNIQUE (not an + * essential property, we just don't allow it in the grammar), so no + * need to preserve the prior state of satisfiesConstraint. + */ + if (indexInfo->ii_ExclusionOps != NULL) + { + bool violationOK; + CEOUC_WAIT_MODE waitMode; + + if (applyNoDupErr) + { + violationOK = true; + waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT; + } + else if (!indexRelation->rd_index->indimmediate) + { + violationOK = true; + waitMode = CEOUC_NOWAIT; + } + else + { + violationOK = false; + waitMode = CEOUC_WAIT; + } + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, + indexRelation, indexInfo, + tupleid, values, isnull, + estate, false, + waitMode, violationOK, NULL); + } + + if ((checkUnique == UNIQUE_CHECK_PARTIAL || + indexInfo->ii_ExclusionOps != NULL) && + !satisfiesConstraint) + { + /* + * The tuple potentially violates the uniqueness or exclusion + * constraint, so make a note of the index so that we can re-check + * it later. Speculative inserters are told if there was a + * speculative conflict, since that always requires a restart. + */ + result = lappend_oid(result, RelationGetRelid(indexRelation)); + if (indexRelation->rd_index->indimmediate && specConflict) + *specConflict = true; + } + } + + return result; +} + +/* ---------------------------------------------------------------- + * ExecCheckIndexConstraints + * + * This routine checks if a tuple violates any unique or + * exclusion constraints. Returns true if there is no conflict. + * Otherwise returns false, and the TID of the conflicting + * tuple is returned in *conflictTid. + * + * If 'arbiterIndexes' is given, only those indexes are checked. + * NIL means all indexes. + * + * Note that this doesn't lock the values in any way, so it's + * possible that a conflicting tuple is inserted immediately + * after this returns. But this can be used for a pre-check + * before insertion. 
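+ *
+ *		Rough sketch of the ON CONFLICT pre-check as a caller might use it
+ *		(the DO NOTHING / DO UPDATE handling is only indicated in outline):
+ *
+ *			ItemPointerData conflictTid;
+ *
+ *			if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate,
+ *										   &conflictTid, arbiterIndexes))
+ *			{
+ *				... DO NOTHING: skip the insertion entirely ...
+ *				... DO UPDATE: lock and update the tuple at conflictTid ...
+ *			}
+ *			... otherwise proceed with the (speculative) insertion ...
+ *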
+ * ---------------------------------------------------------------- + */ +bool +ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate, ItemPointer conflictTid, + List *arbiterIndexes) +{ + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ItemPointerData invalidItemPtr; + bool checkedIndex = false; + + ItemPointerSetInvalid(conflictTid); + ItemPointerSetInvalid(&invalidItemPtr); + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * For each index, form index tuple and check if it satisfies the + * constraint. + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool satisfiesConstraint; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + if (!indexInfo->ii_Unique && !indexInfo->ii_ExclusionOps) + continue; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* When specific arbiter indexes requested, only examine them */ + if (arbiterIndexes != NIL && + !list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)) + continue; + + if (!indexRelation->rd_index->indimmediate) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("ON CONFLICT does not support deferrable unique constraints/exclusion constraints as arbiters"), + errtableconstraint(heapRelation, + RelationGetRelationName(indexRelation)))); + + checkedIndex = true; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. 
+ */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, indexRelation, + indexInfo, &invalidItemPtr, + values, isnull, estate, false, + CEOUC_WAIT, true, + conflictTid); + if (!satisfiesConstraint) + return false; + } + + if (arbiterIndexes != NIL && !checkedIndex) + elog(ERROR, "unexpected failure to find arbiter index"); + + return true; +} + +/* + * Check for violation of an exclusion or unique constraint + * + * heap: the table containing the new tuple + * index: the index supporting the constraint + * indexInfo: info about the index, including the exclusion properties + * tupleid: heap TID of the new tuple we have just inserted (invalid if we + * haven't inserted a new tuple yet) + * values, isnull: the *index* column values computed for the new tuple + * estate: an EState we can do evaluation in + * newIndex: if true, we are trying to build a new index (this affects + * only the wording of error messages) + * waitMode: whether to wait for concurrent inserters/deleters + * violationOK: if true, don't throw error for violation + * conflictTid: if not-NULL, the TID of the conflicting tuple is returned here + * + * Returns true if OK, false if actual or potential violation + * + * 'waitMode' determines what happens if a conflict is detected with a tuple + * that was inserted or deleted by a transaction that's still running. + * CEOUC_WAIT means that we wait for the transaction to commit, before + * throwing an error or returning. CEOUC_NOWAIT means that we report the + * violation immediately; so the violation is only potential, and the caller + * must recheck sometime later. This behavior is convenient for deferred + * exclusion checks; we need not bother queuing a deferred event if there is + * definitely no conflict at insertion time. + * + * CEOUC_LIVELOCK_PREVENTING_WAIT is like CEOUC_NOWAIT, but we will sometimes + * wait anyway, to prevent livelocking if two transactions try inserting at + * the same time. This is used with speculative insertions, for INSERT ON + * CONFLICT statements. (See notes in file header) + * + * If violationOK is true, we just report the potential or actual violation to + * the caller by returning 'false'. Otherwise we throw a descriptive error + * message here. When violationOK is false, a false result is impossible. + * + * Note: The indexam is normally responsible for checking unique constraints, + * so this normally only needs to be used for exclusion constraints. But this + * function is also called when doing a "pre-check" for conflicts on a unique + * constraint, when doing speculative insertion. Caller may use the returned + * conflict TID to take further steps. 
+ */ +static bool +check_exclusion_or_unique_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex, + CEOUC_WAIT_MODE waitMode, + bool violationOK, + ItemPointer conflictTid) +{ + Oid *constr_procs; + uint16 *constr_strats; + Oid *index_collations = index->rd_indcollation; + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index); + IndexScanDesc index_scan; + ScanKeyData scankeys[INDEX_MAX_KEYS]; + SnapshotData DirtySnapshot; + int i; + bool conflict; + bool found_self; + ExprContext *econtext; + TupleTableSlot *existing_slot; + TupleTableSlot *save_scantuple; + + if (indexInfo->ii_ExclusionOps) + { + constr_procs = indexInfo->ii_ExclusionProcs; + constr_strats = indexInfo->ii_ExclusionStrats; + } + else + { + constr_procs = indexInfo->ii_UniqueProcs; + constr_strats = indexInfo->ii_UniqueStrats; + } + + /* + * If any of the input values are NULL, the constraint check is assumed to + * pass (i.e., we assume the operators are strict). + */ + for (i = 0; i < indnkeyatts; i++) + { + if (isnull[i]) + return true; + } + + /* + * Search the tuples that are in the index for any violations, including + * tuples that aren't visible yet. + */ + InitDirtySnapshot(DirtySnapshot); + + for (i = 0; i < indnkeyatts; i++) + { + ScanKeyEntryInitialize(&scankeys[i], + 0, + i + 1, + constr_strats[i], + InvalidOid, + index_collations[i], + constr_procs[i], + values[i]); + } + + /* + * Need a TupleTableSlot to put existing tuples in. + * + * To use FormIndexDatum, we have to make the econtext's scantuple point + * to this slot. Be sure to save and restore caller's value for + * scantuple. + */ + existing_slot = table_slot_create(heap, NULL); + + econtext = GetPerTupleExprContext(estate); + save_scantuple = econtext->ecxt_scantuple; + econtext->ecxt_scantuple = existing_slot; + + /* + * May have to restart scan from this point if a potential conflict is + * found. + */ +retry: + conflict = false; + found_self = false; + index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0); + index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); + + while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) + { + TransactionId xwait; + XLTW_Oper reason_wait; + Datum existing_values[INDEX_MAX_KEYS]; + bool existing_isnull[INDEX_MAX_KEYS]; + char *error_new; + char *error_existing; + + /* + * Ignore the entry for the tuple we're trying to check. + */ + if (ItemPointerIsValid(tupleid) && + ItemPointerEquals(tupleid, &existing_slot->tts_tid)) + { + if (found_self) /* should not happen */ + elog(ERROR, "found self tuple multiple times in index \"%s\"", + RelationGetRelationName(index)); + found_self = true; + continue; + } + + /* + * Extract the index column values and isnull flags from the existing + * tuple. + */ + FormIndexDatum(indexInfo, existing_slot, estate, + existing_values, existing_isnull); + + /* If lossy indexscan, must recheck the condition */ + if (index_scan->xs_recheck) + { + if (!index_recheck_constraint(index, + constr_procs, + existing_values, + existing_isnull, + values)) + continue; /* tuple doesn't actually match, so no + * conflict */ + } + + /* + * At this point we have either a conflict or a potential conflict. + * + * If an in-progress transaction is affecting the visibility of this + * tuple, we need to wait for it to complete and then recheck (unless + * the caller requested not to). 
For simplicity we do rechecking by + * just restarting the whole scan --- this case probably doesn't + * happen often enough to be worth trying harder, and anyway we don't + * want to hold any index internal locks while waiting. + */ + xwait = TransactionIdIsValid(DirtySnapshot.xmin) ? + DirtySnapshot.xmin : DirtySnapshot.xmax; + + if (TransactionIdIsValid(xwait) && + (waitMode == CEOUC_WAIT || + (waitMode == CEOUC_LIVELOCK_PREVENTING_WAIT && + DirtySnapshot.speculativeToken && + TransactionIdPrecedes(GetCurrentTransactionId(), xwait)))) + { + reason_wait = indexInfo->ii_ExclusionOps ? + XLTW_RecheckExclusionConstr : XLTW_InsertIndex; + index_endscan(index_scan); + if (DirtySnapshot.speculativeToken) + SpeculativeInsertionWait(DirtySnapshot.xmin, + DirtySnapshot.speculativeToken); + else + XactLockTableWait(xwait, heap, + &existing_slot->tts_tid, reason_wait); + goto retry; + } + + /* + * We have a definite conflict (or a potential one, but the caller + * didn't want to wait). Return it to caller, or report it. + */ + if (violationOK) + { + conflict = true; + if (conflictTid) + *conflictTid = existing_slot->tts_tid; + break; + } + + error_new = BuildIndexValueDescription(index, values, isnull); + error_existing = BuildIndexValueDescription(index, existing_values, + existing_isnull); + if (newIndex) + ereport(ERROR, + (errcode(ERRCODE_EXCLUSION_VIOLATION), + errmsg("could not create exclusion constraint \"%s\"", + RelationGetRelationName(index)), + error_new && error_existing ? + errdetail("Key %s conflicts with key %s.", + error_new, error_existing) : + errdetail("Key conflicts exist."), + errtableconstraint(heap, + RelationGetRelationName(index)))); + else + ereport(ERROR, + (errcode(ERRCODE_EXCLUSION_VIOLATION), + errmsg("conflicting key value violates exclusion constraint \"%s\"", + RelationGetRelationName(index)), + error_new && error_existing ? + errdetail("Key %s conflicts with existing key %s.", + error_new, error_existing) : + errdetail("Key conflicts with existing key."), + errtableconstraint(heap, + RelationGetRelationName(index)))); + } + + index_endscan(index_scan); + + /* + * Ordinarily, at this point the search should have found the originally + * inserted tuple (if any), unless we exited the loop early because of + * conflict. However, it is possible to define exclusion constraints for + * which that wouldn't be true --- for instance, if the operator is <>. So + * we no longer complain if found_self is still false. + */ + + econtext->ecxt_scantuple = save_scantuple; + + ExecDropSingleTupleTableSlot(existing_slot); + + return !conflict; +} + +/* + * Check for violation of an exclusion constraint + * + * This is a dumbed down version of check_exclusion_or_unique_constraint + * for external callers. They don't need all the special modes. + */ +void +check_exclusion_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex) +{ + (void) check_exclusion_or_unique_constraint(heap, index, indexInfo, tupleid, + values, isnull, + estate, newIndex, + CEOUC_WAIT, false, NULL); +} + +/* + * Check existing tuple's index values to see if it really matches the + * exclusion condition against the new_values. Returns true if conflict. 
+ */ +static bool +index_recheck_constraint(Relation index, Oid *constr_procs, + Datum *existing_values, bool *existing_isnull, + Datum *new_values) +{ + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index); + int i; + + for (i = 0; i < indnkeyatts; i++) + { + /* Assume the exclusion operators are strict */ + if (existing_isnull[i]) + return false; + + if (!DatumGetBool(OidFunctionCall2Coll(constr_procs[i], + index->rd_indcollation[i], + existing_values[i], + new_values[i]))) + return false; + } + + return true; +} diff --git a/src/backend/executor/execJunk.c b/src/backend/executor/execJunk.c new file mode 100644 index 0000000..9741897 --- /dev/null +++ b/src/backend/executor/execJunk.c @@ -0,0 +1,304 @@ +/*------------------------------------------------------------------------- + * + * execJunk.c + * Junk attribute support stuff.... + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execJunk.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "executor/executor.h" + +/*------------------------------------------------------------------------- + * XXX this stuff should be rewritten to take advantage + * of ExecProject() and the ProjectionInfo node. + * -cim 6/3/91 + * + * An attribute of a tuple living inside the executor, can be + * either a normal attribute or a "junk" attribute. "junk" attributes + * never make it out of the executor, i.e. they are never printed, + * returned or stored on disk. Their only purpose in life is to + * store some information useful only to the executor, mainly the values + * of system attributes like "ctid", or sort key columns that are not to + * be output. + * + * The general idea is the following: A target list consists of a list of + * TargetEntry nodes containing expressions. Each TargetEntry has a field + * called 'resjunk'. If the value of this field is true then the + * corresponding attribute is a "junk" attribute. + * + * When we initialize a plan we call ExecInitJunkFilter to create a filter. + * + * We then execute the plan, treating the resjunk attributes like any others. + * + * Finally, when at the top level we get back a tuple, we can call + * ExecFindJunkAttribute/ExecGetJunkAttribute to retrieve the values of the + * junk attributes we are interested in, and ExecFilterJunk to remove all the + * junk attributes from a tuple. This new "clean" tuple is then printed, + * inserted, or updated. + * + *------------------------------------------------------------------------- + */ + +/* + * ExecInitJunkFilter + * + * Initialize the Junk filter. + * + * The source targetlist is passed in. The output tuple descriptor is + * built from the non-junk tlist entries. + * An optional resultSlot can be passed as well; otherwise, we create one. + */ +JunkFilter * +ExecInitJunkFilter(List *targetList, TupleTableSlot *slot) +{ + JunkFilter *junkfilter; + TupleDesc cleanTupType; + int cleanLength; + AttrNumber *cleanMap; + + /* + * Compute the tuple descriptor for the cleaned tuple. + */ + cleanTupType = ExecCleanTypeFromTL(targetList); + + /* + * Use the given slot, or make a new slot if we weren't given one. 
+ */ + if (slot) + ExecSetSlotDescriptor(slot, cleanTupType); + else + slot = MakeSingleTupleTableSlot(cleanTupType, &TTSOpsVirtual); + + /* + * Now calculate the mapping between the original tuple's attributes and + * the "clean" tuple's attributes. + * + * The "map" is an array of "cleanLength" attribute numbers, i.e. one + * entry for every attribute of the "clean" tuple. The value of this entry + * is the attribute number of the corresponding attribute of the + * "original" tuple. (Zero indicates a NULL output attribute, but we do + * not use that feature in this routine.) + */ + cleanLength = cleanTupType->natts; + if (cleanLength > 0) + { + AttrNumber cleanResno; + ListCell *t; + + cleanMap = (AttrNumber *) palloc(cleanLength * sizeof(AttrNumber)); + cleanResno = 0; + foreach(t, targetList) + { + TargetEntry *tle = lfirst(t); + + if (!tle->resjunk) + { + cleanMap[cleanResno] = tle->resno; + cleanResno++; + } + } + Assert(cleanResno == cleanLength); + } + else + cleanMap = NULL; + + /* + * Finally create and initialize the JunkFilter struct. + */ + junkfilter = makeNode(JunkFilter); + + junkfilter->jf_targetList = targetList; + junkfilter->jf_cleanTupType = cleanTupType; + junkfilter->jf_cleanMap = cleanMap; + junkfilter->jf_resultSlot = slot; + + return junkfilter; +} + +/* + * ExecInitJunkFilterConversion + * + * Initialize a JunkFilter for rowtype conversions. + * + * Here, we are given the target "clean" tuple descriptor rather than + * inferring it from the targetlist. The target descriptor can contain + * deleted columns. It is assumed that the caller has checked that the + * non-deleted columns match up with the non-junk columns of the targetlist. + */ +JunkFilter * +ExecInitJunkFilterConversion(List *targetList, + TupleDesc cleanTupType, + TupleTableSlot *slot) +{ + JunkFilter *junkfilter; + int cleanLength; + AttrNumber *cleanMap; + ListCell *t; + int i; + + /* + * Use the given slot, or make a new slot if we weren't given one. + */ + if (slot) + ExecSetSlotDescriptor(slot, cleanTupType); + else + slot = MakeSingleTupleTableSlot(cleanTupType, &TTSOpsVirtual); + + /* + * Calculate the mapping between the original tuple's attributes and the + * "clean" tuple's attributes. + * + * The "map" is an array of "cleanLength" attribute numbers, i.e. one + * entry for every attribute of the "clean" tuple. The value of this entry + * is the attribute number of the corresponding attribute of the + * "original" tuple. We store zero for any deleted attributes, marking + * that a NULL is needed in the output tuple. + */ + cleanLength = cleanTupType->natts; + if (cleanLength > 0) + { + cleanMap = (AttrNumber *) palloc0(cleanLength * sizeof(AttrNumber)); + t = list_head(targetList); + for (i = 0; i < cleanLength; i++) + { + if (TupleDescAttr(cleanTupType, i)->attisdropped) + continue; /* map entry is already zero */ + for (;;) + { + TargetEntry *tle = lfirst(t); + + t = lnext(targetList, t); + if (!tle->resjunk) + { + cleanMap[i] = tle->resno; + break; + } + } + } + } + else + cleanMap = NULL; + + /* + * Finally create and initialize the JunkFilter struct. + */ + junkfilter = makeNode(JunkFilter); + + junkfilter->jf_targetList = targetList; + junkfilter->jf_cleanTupType = cleanTupType; + junkfilter->jf_cleanMap = cleanMap; + junkfilter->jf_resultSlot = slot; + + return junkfilter; +} + +/* + * ExecFindJunkAttribute + * + * Locate the specified junk attribute in the junk filter's targetlist, + * and return its resno. Returns InvalidAttrNumber if not found. 
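+ *
+ * Rough usage sketch: a caller that fetches a "ctid" junk column and then
+ * projects the clean tuple (ExecGetJunkAttribute is a small helper declared
+ * in executor.h):
+ *
+ *		junkattno = ExecFindJunkAttribute(junkfilter, "ctid");
+ *		...
+ *		datum = ExecGetJunkAttribute(slot, junkattno, &isNull);
+ *		if (!isNull)
+ *			tupleid = (ItemPointer) DatumGetPointer(datum);
+ *		cleanSlot = ExecFilterJunk(junkfilter, slot);
+ *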
+ */ +AttrNumber +ExecFindJunkAttribute(JunkFilter *junkfilter, const char *attrName) +{ + return ExecFindJunkAttributeInTlist(junkfilter->jf_targetList, attrName); +} + +/* + * ExecFindJunkAttributeInTlist + * + * Find a junk attribute given a subplan's targetlist (not necessarily + * part of a JunkFilter). + */ +AttrNumber +ExecFindJunkAttributeInTlist(List *targetlist, const char *attrName) +{ + ListCell *t; + + foreach(t, targetlist) + { + TargetEntry *tle = lfirst(t); + + if (tle->resjunk && tle->resname && + (strcmp(tle->resname, attrName) == 0)) + { + /* We found it ! */ + return tle->resno; + } + } + + return InvalidAttrNumber; +} + +/* + * ExecFilterJunk + * + * Construct and return a slot with all the junk attributes removed. + */ +TupleTableSlot * +ExecFilterJunk(JunkFilter *junkfilter, TupleTableSlot *slot) +{ + TupleTableSlot *resultSlot; + AttrNumber *cleanMap; + TupleDesc cleanTupType; + int cleanLength; + int i; + Datum *values; + bool *isnull; + Datum *old_values; + bool *old_isnull; + + /* + * Extract all the values of the old tuple. + */ + slot_getallattrs(slot); + old_values = slot->tts_values; + old_isnull = slot->tts_isnull; + + /* + * get info from the junk filter + */ + cleanTupType = junkfilter->jf_cleanTupType; + cleanLength = cleanTupType->natts; + cleanMap = junkfilter->jf_cleanMap; + resultSlot = junkfilter->jf_resultSlot; + + /* + * Prepare to build a virtual result tuple. + */ + ExecClearTuple(resultSlot); + values = resultSlot->tts_values; + isnull = resultSlot->tts_isnull; + + /* + * Transpose data into proper fields of the new tuple. + */ + for (i = 0; i < cleanLength; i++) + { + int j = cleanMap[i]; + + if (j == 0) + { + values[i] = (Datum) 0; + isnull[i] = true; + } + else + { + values[i] = old_values[j - 1]; + isnull[i] = old_isnull[j - 1]; + } + } + + /* + * And return the virtual tuple. + */ + return ExecStoreVirtualTuple(resultSlot); +} diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c new file mode 100644 index 0000000..b3ce4ba --- /dev/null +++ b/src/backend/executor/execMain.c @@ -0,0 +1,2886 @@ +/*------------------------------------------------------------------------- + * + * execMain.c + * top level executor interface routines + * + * INTERFACE ROUTINES + * ExecutorStart() + * ExecutorRun() + * ExecutorFinish() + * ExecutorEnd() + * + * These four procedures are the external interface to the executor. + * In each case, the query descriptor is required as an argument. + * + * ExecutorStart must be called at the beginning of execution of any + * query plan and ExecutorEnd must always be called at the end of + * execution of a plan (unless it is aborted due to error). + * + * ExecutorRun accepts direction and count arguments that specify whether + * the plan is to be executed forwards, backwards, and for how many tuples. + * In some cases ExecutorRun may be called multiple times to process all + * the tuples for a plan. It is also acceptable to stop short of executing + * the whole plan (but only if it is a SELECT). + * + * ExecutorFinish must be called after the final ExecutorRun call and + * before ExecutorEnd. This can be omitted only in case of EXPLAIN, + * which should also omit ExecutorRun. 
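+ *
+ * A minimal calling sketch (illustrative only; the eflags, count and
+ * execute_once values shown are assumptions, not requirements): a caller
+ * running a plan to completion in a single pass would issue, in order,
+ *
+ *     ExecutorStart(queryDesc, 0);
+ *     ExecutorRun(queryDesc, ForwardScanDirection, 0, true);
+ *     ExecutorFinish(queryDesc);
+ *     ExecutorEnd(queryDesc);
+ *
+ * where a count of zero asks ExecutorRun to run the plan to completion.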
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execMain.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/namespace.h" +#include "catalog/pg_publication.h" +#include "commands/matview.h" +#include "commands/trigger.h" +#include "executor/execdebug.h" +#include "executor/nodeSubplan.h" +#include "foreign/fdwapi.h" +#include "jit/jit.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "parser/parsetree.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "tcop/utility.h" +#include "utils/acl.h" +#include "utils/backend_status.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/partcache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" +#include "utils/snapmgr.h" + + +/* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */ +ExecutorStart_hook_type ExecutorStart_hook = NULL; +ExecutorRun_hook_type ExecutorRun_hook = NULL; +ExecutorFinish_hook_type ExecutorFinish_hook = NULL; +ExecutorEnd_hook_type ExecutorEnd_hook = NULL; + +/* Hook for plugin to get control in ExecCheckRTPerms() */ +ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; + +/* decls for local routines only used within this module */ +static void InitPlan(QueryDesc *queryDesc, int eflags); +static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecPostprocessPlan(EState *estate); +static void ExecEndPlan(PlanState *planstate, EState *estate); +static void ExecutePlan(EState *estate, PlanState *planstate, + bool use_parallel_mode, + CmdType operation, + bool sendTuples, + uint64 numberTuples, + ScanDirection direction, + DestReceiver *dest, + bool execute_once); +static bool ExecCheckRTEPerms(RangeTblEntry *rte); +static bool ExecCheckRTEPermsModified(Oid relOid, Oid userid, + Bitmapset *modifiedCols, + AclMode requiredPerms); +static void ExecCheckXactReadOnly(PlannedStmt *plannedstmt); +static char *ExecBuildSlotValueDescription(Oid reloid, + TupleTableSlot *slot, + TupleDesc tupdesc, + Bitmapset *modifiedCols, + int maxfieldlen); +static void EvalPlanQualStart(EPQState *epqstate, Plan *planTree); + +/* end of local decls */ + + +/* ---------------------------------------------------------------- + * ExecutorStart + * + * This routine must be called at the beginning of any execution of any + * query plan + * + * Takes a QueryDesc previously created by CreateQueryDesc (which is separate + * only because some places use QueryDescs for utility commands). The tupDesc + * field of the QueryDesc is filled in to describe the tuples that will be + * returned, and the internal fields (estate and planstate) are set up. + * + * eflags contains flag bits as described in executor.h. + * + * NB: the CurrentMemoryContext when this is called will become the parent + * of the per-query context used for this Executor invocation. + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorStart is called. Such a plugin would + * normally call standard_ExecutorStart(). 
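+ *
+ * A hook-chaining sketch (prev_ExecutorStart and my_ExecutorStart are
+ * hypothetical extension symbols, not part of this file): an extension's
+ * _PG_init() would typically save and replace the hook,
+ *
+ *     prev_ExecutorStart = ExecutorStart_hook;
+ *     ExecutorStart_hook = my_ExecutorStart;
+ *
+ * and my_ExecutorStart() would finish by calling prev_ExecutorStart if it
+ * is set, else standard_ExecutorStart().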
+ * + * ---------------------------------------------------------------- + */ +void +ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + /* + * In some cases (e.g. an EXECUTE statement) a query execution will skip + * parse analysis, which means that the query_id won't be reported. Note + * that it's harmless to report the query_id multiple time, as the call + * will be ignored if the top level query_id has already been reported. + */ + pgstat_report_query_id(queryDesc->plannedstmt->queryId, false); + + if (ExecutorStart_hook) + (*ExecutorStart_hook) (queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); +} + +void +standard_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks: queryDesc must not be started already */ + Assert(queryDesc != NULL); + Assert(queryDesc->estate == NULL); + + /* + * If the transaction is read-only, we need to check if any writes are + * planned to non-temporary tables. EXPLAIN is considered read-only. + * + * Don't allow writes in parallel mode. Supporting UPDATE and DELETE + * would require (a) storing the combo CID hash in shared memory, rather + * than synchronizing it just once at the start of parallelism, and (b) an + * alternative to heap_update()'s reliance on xmax for mutual exclusion. + * INSERT may have no such troubles, but we forbid it to simplify the + * checks. + * + * We have lower-level defenses in CommandCounterIncrement and elsewhere + * against performing unsafe operations in parallel mode, but this gives a + * more user-friendly error message. + */ + if ((XactReadOnly || IsInParallelMode()) && + !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + ExecCheckXactReadOnly(queryDesc->plannedstmt); + + /* + * Build EState, switch into per-query memory context for startup. + */ + estate = CreateExecutorState(); + queryDesc->estate = estate; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* + * Fill in external parameters, if any, from queryDesc; and allocate + * workspace for internal parameters + */ + estate->es_param_list_info = queryDesc->params; + + if (queryDesc->plannedstmt->paramExecTypes != NIL) + { + int nParamExec; + + nParamExec = list_length(queryDesc->plannedstmt->paramExecTypes); + estate->es_param_exec_vals = (ParamExecData *) + palloc0(nParamExec * sizeof(ParamExecData)); + } + + /* We now require all callers to provide sourceText */ + Assert(queryDesc->sourceText != NULL); + estate->es_sourceText = queryDesc->sourceText; + + /* + * Fill in the query environment, if any, from queryDesc. + */ + estate->es_queryEnv = queryDesc->queryEnv; + + /* + * If non-read-only query, set the command ID to mark output tuples with + */ + switch (queryDesc->operation) + { + case CMD_SELECT: + + /* + * SELECT FOR [KEY] UPDATE/SHARE and modifying CTEs need to mark + * tuples + */ + if (queryDesc->plannedstmt->rowMarks != NIL || + queryDesc->plannedstmt->hasModifyingCTE) + estate->es_output_cid = GetCurrentCommandId(true); + + /* + * A SELECT without modifying CTEs can't possibly queue triggers, + * so force skip-triggers mode. This is just a marginal efficiency + * hack, since AfterTriggerBeginQuery/AfterTriggerEndQuery aren't + * all that expensive, but we might as well do it. 
+ */ + if (!queryDesc->plannedstmt->hasModifyingCTE) + eflags |= EXEC_FLAG_SKIP_TRIGGERS; + break; + + case CMD_INSERT: + case CMD_DELETE: + case CMD_UPDATE: + estate->es_output_cid = GetCurrentCommandId(true); + break; + + default: + elog(ERROR, "unrecognized operation code: %d", + (int) queryDesc->operation); + break; + } + + /* + * Copy other important information into the EState + */ + estate->es_snapshot = RegisterSnapshot(queryDesc->snapshot); + estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot); + estate->es_top_eflags = eflags; + estate->es_instrument = queryDesc->instrument_options; + estate->es_jit_flags = queryDesc->plannedstmt->jitFlags; + + /* + * Set up an AFTER-trigger statement context, unless told not to, or + * unless it's EXPLAIN-only mode (when ExecutorFinish won't be called). + */ + if (!(eflags & (EXEC_FLAG_SKIP_TRIGGERS | EXEC_FLAG_EXPLAIN_ONLY))) + AfterTriggerBeginQuery(); + + /* + * Initialize the plan state tree + */ + InitPlan(queryDesc, eflags); + + MemoryContextSwitchTo(oldcontext); +} + +/* ---------------------------------------------------------------- + * ExecutorRun + * + * This is the main routine of the executor module. It accepts + * the query descriptor from the traffic cop and executes the + * query plan. + * + * ExecutorStart must have been called already. + * + * If direction is NoMovementScanDirection then nothing is done + * except to start up/shut down the destination. Otherwise, + * we retrieve up to 'count' tuples in the specified direction. + * + * Note: count = 0 is interpreted as no portal limit, i.e., run to + * completion. Also note that the count limit is only applied to + * retrieved tuples, not for instance to those inserted/updated/deleted + * by a ModifyTable plan node. + * + * There is no return value, but output tuples (if any) are sent to + * the destination receiver specified in the QueryDesc; and the number + * of tuples processed at the top level can be found in + * estate->es_processed. + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorRun is called. Such a plugin would + * normally call standard_ExecutorRun(). + * + * ---------------------------------------------------------------- + */ +void +ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, uint64 count, + bool execute_once) +{ + if (ExecutorRun_hook) + (*ExecutorRun_hook) (queryDesc, direction, count, execute_once); + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); +} + +void +standard_ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, uint64 count, bool execute_once) +{ + EState *estate; + CmdType operation; + DestReceiver *dest; + bool sendTuples; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); + + /* + * Switch into per-query memory context + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* Allow instrumentation of Executor overall runtime */ + if (queryDesc->totaltime) + InstrStartNode(queryDesc->totaltime); + + /* + * extract information from the query descriptor and the query feature. 
+ */ + operation = queryDesc->operation; + dest = queryDesc->dest; + + /* + * startup tuple receiver, if we will be emitting tuples + */ + estate->es_processed = 0; + + sendTuples = (operation == CMD_SELECT || + queryDesc->plannedstmt->hasReturning); + + if (sendTuples) + dest->rStartup(dest, operation, queryDesc->tupDesc); + + /* + * run plan + */ + if (!ScanDirectionIsNoMovement(direction)) + { + if (execute_once && queryDesc->already_executed) + elog(ERROR, "can't re-execute query flagged for single execution"); + queryDesc->already_executed = true; + + ExecutePlan(estate, + queryDesc->planstate, + queryDesc->plannedstmt->parallelModeNeeded, + operation, + sendTuples, + count, + direction, + dest, + execute_once); + } + + /* + * shutdown tuple receiver, if we started it + */ + if (sendTuples) + dest->rShutdown(dest); + + if (queryDesc->totaltime) + InstrStopNode(queryDesc->totaltime, estate->es_processed); + + MemoryContextSwitchTo(oldcontext); +} + +/* ---------------------------------------------------------------- + * ExecutorFinish + * + * This routine must be called after the last ExecutorRun call. + * It performs cleanup such as firing AFTER triggers. It is + * separate from ExecutorEnd because EXPLAIN ANALYZE needs to + * include these actions in the total runtime. + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorFinish is called. Such a plugin would + * normally call standard_ExecutorFinish(). + * + * ---------------------------------------------------------------- + */ +void +ExecutorFinish(QueryDesc *queryDesc) +{ + if (ExecutorFinish_hook) + (*ExecutorFinish_hook) (queryDesc); + else + standard_ExecutorFinish(queryDesc); +} + +void +standard_ExecutorFinish(QueryDesc *queryDesc) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); + + /* This should be run once and only once per Executor instance */ + Assert(!estate->es_finished); + + /* Switch into per-query memory context */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* Allow instrumentation of Executor overall runtime */ + if (queryDesc->totaltime) + InstrStartNode(queryDesc->totaltime); + + /* Run ModifyTable nodes to completion */ + ExecPostprocessPlan(estate); + + /* Execute queued AFTER triggers, unless told not to */ + if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) + AfterTriggerEndQuery(estate); + + if (queryDesc->totaltime) + InstrStopNode(queryDesc->totaltime, 0); + + MemoryContextSwitchTo(oldcontext); + + estate->es_finished = true; +} + +/* ---------------------------------------------------------------- + * ExecutorEnd + * + * This routine must be called at the end of execution of any + * query plan + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorEnd is called. Such a plugin would + * normally call standard_ExecutorEnd(). 
+ * + * ---------------------------------------------------------------- + */ +void +ExecutorEnd(QueryDesc *queryDesc) +{ + if (ExecutorEnd_hook) + (*ExecutorEnd_hook) (queryDesc); + else + standard_ExecutorEnd(queryDesc); +} + +void +standard_ExecutorEnd(QueryDesc *queryDesc) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + + /* + * Check that ExecutorFinish was called, unless in EXPLAIN-only mode. This + * Assert is needed because ExecutorFinish is new as of 9.1, and callers + * might forget to call it. + */ + Assert(estate->es_finished || + (estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); + + /* + * Switch into per-query memory context to run ExecEndPlan + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + ExecEndPlan(queryDesc->planstate, estate); + + /* do away with our snapshots */ + UnregisterSnapshot(estate->es_snapshot); + UnregisterSnapshot(estate->es_crosscheck_snapshot); + + /* + * Must switch out of context before destroying it + */ + MemoryContextSwitchTo(oldcontext); + + /* + * Release EState and per-query memory context. This should release + * everything the executor has allocated. + */ + FreeExecutorState(estate); + + /* Reset queryDesc fields that no longer point to anything */ + queryDesc->tupDesc = NULL; + queryDesc->estate = NULL; + queryDesc->planstate = NULL; + queryDesc->totaltime = NULL; +} + +/* ---------------------------------------------------------------- + * ExecutorRewind + * + * This routine may be called on an open queryDesc to rewind it + * to the start. + * ---------------------------------------------------------------- + */ +void +ExecutorRewind(QueryDesc *queryDesc) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + + /* It's probably not sensible to rescan updating queries */ + Assert(queryDesc->operation == CMD_SELECT); + + /* + * Switch into per-query memory context + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* + * rescan plan + */ + ExecReScan(queryDesc->planstate); + + MemoryContextSwitchTo(oldcontext); +} + + +/* + * ExecCheckRTPerms + * Check access permissions for all relations listed in a range table. + * + * Returns true if permissions are adequate. Otherwise, throws an appropriate + * error if ereport_on_violation is true, or simply returns false otherwise. + * + * Note that this does NOT address row-level security policies (aka: RLS). If + * rows will be returned to the user as a result of this permission check + * passing, then RLS also needs to be consulted (and check_enable_rls()). + * + * See rewrite/rowsecurity.c. + */ +bool +ExecCheckRTPerms(List *rangeTable, bool ereport_on_violation) +{ + ListCell *l; + bool result = true; + + foreach(l, rangeTable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + + result = ExecCheckRTEPerms(rte); + if (!result) + { + Assert(rte->rtekind == RTE_RELATION); + if (ereport_on_violation) + aclcheck_error(ACLCHECK_NO_PRIV, get_relkind_objtype(get_rel_relkind(rte->relid)), + get_rel_name(rte->relid)); + return false; + } + } + + if (ExecutorCheckPerms_hook) + result = (*ExecutorCheckPerms_hook) (rangeTable, + ereport_on_violation); + return result; +} + +/* + * ExecCheckRTEPerms + * Check access permissions for a single RTE. 
+ */ +static bool +ExecCheckRTEPerms(RangeTblEntry *rte) +{ + AclMode requiredPerms; + AclMode relPerms; + AclMode remainingPerms; + Oid relOid; + Oid userid; + + /* + * Only plain-relation RTEs need to be checked here. Function RTEs are + * checked when the function is prepared for execution. Join, subquery, + * and special RTEs need no checks. + */ + if (rte->rtekind != RTE_RELATION) + return true; + + /* + * No work if requiredPerms is empty. + */ + requiredPerms = rte->requiredPerms; + if (requiredPerms == 0) + return true; + + relOid = rte->relid; + + /* + * userid to check as: current user unless we have a setuid indication. + * + * Note: GetUserId() is presently fast enough that there's no harm in + * calling it separately for each RTE. If that stops being true, we could + * call it once in ExecCheckRTPerms and pass the userid down from there. + * But for now, no need for the extra clutter. + */ + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + /* + * We must have *all* the requiredPerms bits, but some of the bits can be + * satisfied from column-level rather than relation-level permissions. + * First, remove any bits that are satisfied by relation permissions. + */ + relPerms = pg_class_aclmask(relOid, userid, requiredPerms, ACLMASK_ALL); + remainingPerms = requiredPerms & ~relPerms; + if (remainingPerms != 0) + { + int col = -1; + + /* + * If we lack any permissions that exist only as relation permissions, + * we can fail straight away. + */ + if (remainingPerms & ~(ACL_SELECT | ACL_INSERT | ACL_UPDATE)) + return false; + + /* + * Check to see if we have the needed privileges at column level. + * + * Note: failures just report a table-level error; it would be nicer + * to report a column-level error if we have some but not all of the + * column privileges. + */ + if (remainingPerms & ACL_SELECT) + { + /* + * When the query doesn't explicitly reference any columns (for + * example, SELECT COUNT(*) FROM table), allow the query if we + * have SELECT on any column of the rel, as per SQL spec. + */ + if (bms_is_empty(rte->selectedCols)) + { + if (pg_attribute_aclcheck_all(relOid, userid, ACL_SELECT, + ACLMASK_ANY) != ACLCHECK_OK) + return false; + } + + while ((col = bms_next_member(rte->selectedCols, col)) >= 0) + { + /* bit #s are offset by FirstLowInvalidHeapAttributeNumber */ + AttrNumber attno = col + FirstLowInvalidHeapAttributeNumber; + + if (attno == InvalidAttrNumber) + { + /* Whole-row reference, must have priv on all cols */ + if (pg_attribute_aclcheck_all(relOid, userid, ACL_SELECT, + ACLMASK_ALL) != ACLCHECK_OK) + return false; + } + else + { + if (pg_attribute_aclcheck(relOid, attno, userid, + ACL_SELECT) != ACLCHECK_OK) + return false; + } + } + } + + /* + * Basically the same for the mod columns, for both INSERT and UPDATE + * privilege as specified by remainingPerms. + */ + if (remainingPerms & ACL_INSERT && !ExecCheckRTEPermsModified(relOid, + userid, + rte->insertedCols, + ACL_INSERT)) + return false; + + if (remainingPerms & ACL_UPDATE && !ExecCheckRTEPermsModified(relOid, + userid, + rte->updatedCols, + ACL_UPDATE)) + return false; + } + return true; +} + +/* + * ExecCheckRTEPermsModified + * Check INSERT or UPDATE access permissions for a single RTE (these + * are processed uniformly). 
+ */ +static bool +ExecCheckRTEPermsModified(Oid relOid, Oid userid, Bitmapset *modifiedCols, + AclMode requiredPerms) +{ + int col = -1; + + /* + * When the query doesn't explicitly update any columns, allow the query + * if we have permission on any column of the rel. This is to handle + * SELECT FOR UPDATE as well as possible corner cases in UPDATE. + */ + if (bms_is_empty(modifiedCols)) + { + if (pg_attribute_aclcheck_all(relOid, userid, requiredPerms, + ACLMASK_ANY) != ACLCHECK_OK) + return false; + } + + while ((col = bms_next_member(modifiedCols, col)) >= 0) + { + /* bit #s are offset by FirstLowInvalidHeapAttributeNumber */ + AttrNumber attno = col + FirstLowInvalidHeapAttributeNumber; + + if (attno == InvalidAttrNumber) + { + /* whole-row reference can't happen here */ + elog(ERROR, "whole-row update is not implemented"); + } + else + { + if (pg_attribute_aclcheck(relOid, attno, userid, + requiredPerms) != ACLCHECK_OK) + return false; + } + } + return true; +} + +/* + * Check that the query does not imply any writes to non-temp tables; + * unless we're in parallel mode, in which case don't even allow writes + * to temp tables. + * + * Note: in a Hot Standby this would need to reject writes to temp + * tables just as we do in parallel mode; but an HS standby can't have created + * any temp tables in the first place, so no need to check that. + */ +static void +ExecCheckXactReadOnly(PlannedStmt *plannedstmt) +{ + ListCell *l; + + /* + * Fail if write permissions are requested in parallel mode for table + * (temp or non-temp), otherwise fail for any non-temp table. + */ + foreach(l, plannedstmt->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind != RTE_RELATION) + continue; + + if ((rte->requiredPerms & (~ACL_SELECT)) == 0) + continue; + + if (isTempNamespace(get_rel_namespace(rte->relid))) + continue; + + PreventCommandIfReadOnly(CreateCommandName((Node *) plannedstmt)); + } + + if (plannedstmt->commandType != CMD_SELECT || plannedstmt->hasModifyingCTE) + PreventCommandIfParallelMode(CreateCommandName((Node *) plannedstmt)); +} + + +/* ---------------------------------------------------------------- + * InitPlan + * + * Initializes the query plan: open files, allocate storage + * and start up the rule manager + * ---------------------------------------------------------------- + */ +static void +InitPlan(QueryDesc *queryDesc, int eflags) +{ + CmdType operation = queryDesc->operation; + PlannedStmt *plannedstmt = queryDesc->plannedstmt; + Plan *plan = plannedstmt->planTree; + List *rangeTable = plannedstmt->rtable; + EState *estate = queryDesc->estate; + PlanState *planstate; + TupleDesc tupType; + ListCell *l; + int i; + + /* + * Do permissions checks + */ + ExecCheckRTPerms(rangeTable, true); + + /* + * initialize the node's execution state + */ + ExecInitRangeTable(estate, rangeTable); + + estate->es_plannedstmt = plannedstmt; + + /* + * Next, build the ExecRowMark array from the PlanRowMark(s), if any. 
+ */ + if (plannedstmt->rowMarks) + { + estate->es_rowmarks = (ExecRowMark **) + palloc0(estate->es_range_table_size * sizeof(ExecRowMark *)); + foreach(l, plannedstmt->rowMarks) + { + PlanRowMark *rc = (PlanRowMark *) lfirst(l); + Oid relid; + Relation relation; + ExecRowMark *erm; + + /* ignore "parent" rowmarks; they are irrelevant at runtime */ + if (rc->isParent) + continue; + + /* get relation's OID (will produce InvalidOid if subquery) */ + relid = exec_rt_fetch(rc->rti, estate)->relid; + + /* open relation, if we need to access it for this mark type */ + switch (rc->markType) + { + case ROW_MARK_EXCLUSIVE: + case ROW_MARK_NOKEYEXCLUSIVE: + case ROW_MARK_SHARE: + case ROW_MARK_KEYSHARE: + case ROW_MARK_REFERENCE: + relation = ExecGetRangeTableRelation(estate, rc->rti); + break; + case ROW_MARK_COPY: + /* no physical table access is required */ + relation = NULL; + break; + default: + elog(ERROR, "unrecognized markType: %d", rc->markType); + relation = NULL; /* keep compiler quiet */ + break; + } + + /* Check that relation is a legal target for marking */ + if (relation) + CheckValidRowMarkRel(relation, rc->markType); + + erm = (ExecRowMark *) palloc(sizeof(ExecRowMark)); + erm->relation = relation; + erm->relid = relid; + erm->rti = rc->rti; + erm->prti = rc->prti; + erm->rowmarkId = rc->rowmarkId; + erm->markType = rc->markType; + erm->strength = rc->strength; + erm->waitPolicy = rc->waitPolicy; + erm->ermActive = false; + ItemPointerSetInvalid(&(erm->curCtid)); + erm->ermExtra = NULL; + + Assert(erm->rti > 0 && erm->rti <= estate->es_range_table_size && + estate->es_rowmarks[erm->rti - 1] == NULL); + + estate->es_rowmarks[erm->rti - 1] = erm; + } + } + + /* + * Initialize the executor's tuple table to empty. + */ + estate->es_tupleTable = NIL; + + /* signal that this EState is not used for EPQ */ + estate->es_epq_active = NULL; + + /* + * Initialize private state information for each SubPlan. We must do this + * before running ExecInitNode on the main query tree, since + * ExecInitSubPlan expects to be able to find these entries. + */ + Assert(estate->es_subplanstates == NIL); + i = 1; /* subplan indices count from 1 */ + foreach(l, plannedstmt->subplans) + { + Plan *subplan = (Plan *) lfirst(l); + PlanState *subplanstate; + int sp_eflags; + + /* + * A subplan will never need to do BACKWARD scan nor MARK/RESTORE. If + * it is a parameterless subplan (not initplan), we suggest that it be + * prepared to handle REWIND efficiently; otherwise there is no need. + */ + sp_eflags = eflags + & (EXEC_FLAG_EXPLAIN_ONLY | EXEC_FLAG_WITH_NO_DATA); + if (bms_is_member(i, plannedstmt->rewindPlanIDs)) + sp_eflags |= EXEC_FLAG_REWIND; + + subplanstate = ExecInitNode(subplan, estate, sp_eflags); + + estate->es_subplanstates = lappend(estate->es_subplanstates, + subplanstate); + + i++; + } + + /* + * Initialize the private state information for all the nodes in the query + * tree. This opens files, allocates storage and leaves us ready to start + * processing tuples. + */ + planstate = ExecInitNode(plan, estate, eflags); + + /* + * Get the tuple descriptor describing the type of tuples to return. + */ + tupType = ExecGetResultType(planstate); + + /* + * Initialize the junk filter if needed. SELECT queries need a filter if + * there are any junk attrs in the top-level tlist. 
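+ * (For example, a sort key added for an ORDER BY expression that is not
+ * in the SELECT output list, or the row-identity columns added for
+ * SELECT FOR UPDATE, appear here as resjunk tlist entries.)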
+ */ + if (operation == CMD_SELECT) + { + bool junk_filter_needed = false; + ListCell *tlist; + + foreach(tlist, plan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(tlist); + + if (tle->resjunk) + { + junk_filter_needed = true; + break; + } + } + + if (junk_filter_needed) + { + JunkFilter *j; + TupleTableSlot *slot; + + slot = ExecInitExtraTupleSlot(estate, NULL, &TTSOpsVirtual); + j = ExecInitJunkFilter(planstate->plan->targetlist, + slot); + estate->es_junkFilter = j; + + /* Want to return the cleaned tuple type */ + tupType = j->jf_cleanTupType; + } + } + + queryDesc->tupDesc = tupType; + queryDesc->planstate = planstate; +} + +/* + * Check that a proposed result relation is a legal target for the operation + * + * Generally the parser and/or planner should have noticed any such mistake + * already, but let's make sure. + * + * Note: when changing this function, you probably also need to look at + * CheckValidRowMarkRel. + */ +void +CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation) +{ + Relation resultRel = resultRelInfo->ri_RelationDesc; + TriggerDesc *trigDesc = resultRel->trigdesc; + FdwRoutine *fdwroutine; + + switch (resultRel->rd_rel->relkind) + { + case RELKIND_RELATION: + case RELKIND_PARTITIONED_TABLE: + CheckCmdReplicaIdentity(resultRel, operation); + break; + case RELKIND_SEQUENCE: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change sequence \"%s\"", + RelationGetRelationName(resultRel)))); + break; + case RELKIND_TOASTVALUE: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change TOAST relation \"%s\"", + RelationGetRelationName(resultRel)))); + break; + case RELKIND_VIEW: + + /* + * Okay only if there's a suitable INSTEAD OF trigger. Messages + * here should match rewriteHandler.c's rewriteTargetView and + * RewriteQuery, except that we omit errdetail because we haven't + * got the information handy (and given that we really shouldn't + * get here anyway, it's not worth great exertion to get). 
+ */ + switch (operation) + { + case CMD_INSERT: + if (!trigDesc || !trigDesc->trig_insert_instead_row) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot insert into view \"%s\"", + RelationGetRelationName(resultRel)), + errhint("To enable inserting into the view, provide an INSTEAD OF INSERT trigger or an unconditional ON INSERT DO INSTEAD rule."))); + break; + case CMD_UPDATE: + if (!trigDesc || !trigDesc->trig_update_instead_row) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot update view \"%s\"", + RelationGetRelationName(resultRel)), + errhint("To enable updating the view, provide an INSTEAD OF UPDATE trigger or an unconditional ON UPDATE DO INSTEAD rule."))); + break; + case CMD_DELETE: + if (!trigDesc || !trigDesc->trig_delete_instead_row) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot delete from view \"%s\"", + RelationGetRelationName(resultRel)), + errhint("To enable deleting from the view, provide an INSTEAD OF DELETE trigger or an unconditional ON DELETE DO INSTEAD rule."))); + break; + default: + elog(ERROR, "unrecognized CmdType: %d", (int) operation); + break; + } + break; + case RELKIND_MATVIEW: + if (!MatViewIncrementalMaintenanceIsEnabled()) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change materialized view \"%s\"", + RelationGetRelationName(resultRel)))); + break; + case RELKIND_FOREIGN_TABLE: + /* Okay only if the FDW supports it */ + fdwroutine = resultRelInfo->ri_FdwRoutine; + switch (operation) + { + case CMD_INSERT: + if (fdwroutine->ExecForeignInsert == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot insert into foreign table \"%s\"", + RelationGetRelationName(resultRel)))); + if (fdwroutine->IsForeignRelUpdatable != NULL && + (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_INSERT)) == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign table \"%s\" does not allow inserts", + RelationGetRelationName(resultRel)))); + break; + case CMD_UPDATE: + if (fdwroutine->ExecForeignUpdate == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot update foreign table \"%s\"", + RelationGetRelationName(resultRel)))); + if (fdwroutine->IsForeignRelUpdatable != NULL && + (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_UPDATE)) == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign table \"%s\" does not allow updates", + RelationGetRelationName(resultRel)))); + break; + case CMD_DELETE: + if (fdwroutine->ExecForeignDelete == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot delete from foreign table \"%s\"", + RelationGetRelationName(resultRel)))); + if (fdwroutine->IsForeignRelUpdatable != NULL && + (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_DELETE)) == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign table \"%s\" does not allow deletes", + RelationGetRelationName(resultRel)))); + break; + default: + elog(ERROR, "unrecognized CmdType: %d", (int) operation); + break; + } + break; + default: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change relation \"%s\"", + RelationGetRelationName(resultRel)))); + break; + } +} + +/* + * Check that a proposed rowmark target relation is a legal target + * + * In most cases parser and/or planner should have noticed this 
already, but + * they don't cover all cases. + */ +static void +CheckValidRowMarkRel(Relation rel, RowMarkType markType) +{ + FdwRoutine *fdwroutine; + + switch (rel->rd_rel->relkind) + { + case RELKIND_RELATION: + case RELKIND_PARTITIONED_TABLE: + /* OK */ + break; + case RELKIND_SEQUENCE: + /* Must disallow this because we don't vacuum sequences */ + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in sequence \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_TOASTVALUE: + /* We could allow this, but there seems no good reason to */ + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in TOAST relation \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_VIEW: + /* Should not get here; planner should have expanded the view */ + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in view \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_MATVIEW: + /* Allow referencing a matview, but not actual locking clauses */ + if (markType != ROW_MARK_REFERENCE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in materialized view \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_FOREIGN_TABLE: + /* Okay only if the FDW supports it */ + fdwroutine = GetFdwRoutineForRelation(rel, false); + if (fdwroutine->RefetchForeignRow == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot lock rows in foreign table \"%s\"", + RelationGetRelationName(rel)))); + break; + default: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in relation \"%s\"", + RelationGetRelationName(rel)))); + break; + } +} + +/* + * Initialize ResultRelInfo data for one result relation + * + * Caution: before Postgres 9.1, this function included the relkind checking + * that's now in CheckValidResultRel, and it also did ExecOpenIndices if + * appropriate. Be sure callers cover those needs. + */ +void +InitResultRelInfo(ResultRelInfo *resultRelInfo, + Relation resultRelationDesc, + Index resultRelationIndex, + ResultRelInfo *partition_root_rri, + int instrument_options) +{ + MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); + resultRelInfo->type = T_ResultRelInfo; + resultRelInfo->ri_RangeTableIndex = resultRelationIndex; + resultRelInfo->ri_RelationDesc = resultRelationDesc; + resultRelInfo->ri_NumIndices = 0; + resultRelInfo->ri_IndexRelationDescs = NULL; + resultRelInfo->ri_IndexRelationInfo = NULL; + /* make a copy so as not to depend on relcache info not changing... 
*/ + resultRelInfo->ri_TrigDesc = CopyTriggerDesc(resultRelationDesc->trigdesc); + if (resultRelInfo->ri_TrigDesc) + { + int n = resultRelInfo->ri_TrigDesc->numtriggers; + + resultRelInfo->ri_TrigFunctions = (FmgrInfo *) + palloc0(n * sizeof(FmgrInfo)); + resultRelInfo->ri_TrigWhenExprs = (ExprState **) + palloc0(n * sizeof(ExprState *)); + if (instrument_options) + resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + } + else + { + resultRelInfo->ri_TrigFunctions = NULL; + resultRelInfo->ri_TrigWhenExprs = NULL; + resultRelInfo->ri_TrigInstrument = NULL; + } + if (resultRelationDesc->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + resultRelInfo->ri_FdwRoutine = GetFdwRoutineForRelation(resultRelationDesc, true); + else + resultRelInfo->ri_FdwRoutine = NULL; + + /* The following fields are set later if needed */ + resultRelInfo->ri_RowIdAttNo = 0; + resultRelInfo->ri_projectNew = NULL; + resultRelInfo->ri_newTupleSlot = NULL; + resultRelInfo->ri_oldTupleSlot = NULL; + resultRelInfo->ri_projectNewInfoValid = false; + resultRelInfo->ri_FdwState = NULL; + resultRelInfo->ri_usesFdwDirectModify = false; + resultRelInfo->ri_ConstraintExprs = NULL; + resultRelInfo->ri_GeneratedExprs = NULL; + resultRelInfo->ri_projectReturning = NULL; + resultRelInfo->ri_onConflictArbiterIndexes = NIL; + resultRelInfo->ri_onConflict = NULL; + resultRelInfo->ri_ReturningSlot = NULL; + resultRelInfo->ri_TrigOldSlot = NULL; + resultRelInfo->ri_TrigNewSlot = NULL; + + /* + * Only ExecInitPartitionInfo() and ExecInitPartitionDispatchInfo() pass + * non-NULL partition_root_rri. For child relations that are part of the + * initial query rather than being dynamically added by tuple routing, + * this field is filled in ExecInitModifyTable(). + */ + resultRelInfo->ri_RootResultRelInfo = partition_root_rri; + resultRelInfo->ri_RootToPartitionMap = NULL; /* set by + * ExecInitRoutingInfo */ + resultRelInfo->ri_PartitionTupleSlot = NULL; /* ditto */ + resultRelInfo->ri_ChildToRootMap = NULL; + resultRelInfo->ri_ChildToRootMapValid = false; + resultRelInfo->ri_CopyMultiInsertBuffer = NULL; +} + +/* + * ExecGetTriggerResultRel + * Get a ResultRelInfo for a trigger target relation. + * + * Most of the time, triggers are fired on one of the result relations of the + * query, and so we can just return a member of the es_result_relations array, + * or the es_tuple_routing_result_relations list (if any). (Note: in self-join + * situations there might be multiple members with the same OID; if so it + * doesn't matter which one we pick.) + * + * However, it is sometimes necessary to fire triggers on other relations; + * this happens mainly when an RI update trigger queues additional triggers + * on other relations, which will be processed in the context of the outer + * query. For efficiency's sake, we want to have a ResultRelInfo for those + * triggers too; that can avoid repeated re-opening of the relation. (It + * also provides a way for EXPLAIN ANALYZE to report the runtimes of such + * triggers.) So we make additional ResultRelInfo's as needed, and save them + * in es_trig_target_relations. 
+ */ +ResultRelInfo * +ExecGetTriggerResultRel(EState *estate, Oid relid) +{ + ResultRelInfo *rInfo; + ListCell *l; + Relation rel; + MemoryContext oldcontext; + + /* Search through the query result relations */ + foreach(l, estate->es_opened_result_relations) + { + rInfo = lfirst(l); + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + return rInfo; + } + + /* + * Search through the result relations that were created during tuple + * routing, if any. + */ + foreach(l, estate->es_tuple_routing_result_relations) + { + rInfo = (ResultRelInfo *) lfirst(l); + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + return rInfo; + } + + /* Nope, but maybe we already made an extra ResultRelInfo for it */ + foreach(l, estate->es_trig_target_relations) + { + rInfo = (ResultRelInfo *) lfirst(l); + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + return rInfo; + } + /* Nope, so we need a new one */ + + /* + * Open the target relation's relcache entry. We assume that an + * appropriate lock is still held by the backend from whenever the trigger + * event got queued, so we need take no new lock here. Also, we need not + * recheck the relkind, so no need for CheckValidResultRel. + */ + rel = table_open(relid, NoLock); + + /* + * Make the new entry in the right context. + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + rInfo = makeNode(ResultRelInfo); + InitResultRelInfo(rInfo, + rel, + 0, /* dummy rangetable index */ + NULL, + estate->es_instrument); + estate->es_trig_target_relations = + lappend(estate->es_trig_target_relations, rInfo); + MemoryContextSwitchTo(oldcontext); + + /* + * Currently, we don't need any index information in ResultRelInfos used + * only for triggers, so no need to call ExecOpenIndices. + */ + + return rInfo; +} + +/* ---------------------------------------------------------------- + * ExecPostprocessPlan + * + * Give plan nodes a final chance to execute before shutdown + * ---------------------------------------------------------------- + */ +static void +ExecPostprocessPlan(EState *estate) +{ + ListCell *lc; + + /* + * Make sure nodes run forward. + */ + estate->es_direction = ForwardScanDirection; + + /* + * Run any secondary ModifyTable nodes to completion, in case the main + * query did not fetch all rows from them. (We do this to ensure that + * such nodes have predictable results.) + */ + foreach(lc, estate->es_auxmodifytables) + { + PlanState *ps = (PlanState *) lfirst(lc); + + for (;;) + { + TupleTableSlot *slot; + + /* Reset the per-output-tuple exprcontext each time */ + ResetPerTupleExprContext(estate); + + slot = ExecProcNode(ps); + + if (TupIsNull(slot)) + break; + } + } +} + +/* ---------------------------------------------------------------- + * ExecEndPlan + * + * Cleans up the query plan -- closes files and frees up storage + * + * NOTE: we are no longer very worried about freeing storage per se + * in this code; FreeExecutorState should be guaranteed to release all + * memory that needs to be released. What we are worried about doing + * is closing relations and dropping buffer pins. Thus, for example, + * tuple tables must be cleared or dropped to ensure pins are released. 
+ * ---------------------------------------------------------------- + */ +static void +ExecEndPlan(PlanState *planstate, EState *estate) +{ + ListCell *l; + + /* + * shut down the node-type-specific query processing + */ + ExecEndNode(planstate); + + /* + * for subplans too + */ + foreach(l, estate->es_subplanstates) + { + PlanState *subplanstate = (PlanState *) lfirst(l); + + ExecEndNode(subplanstate); + } + + /* + * destroy the executor's tuple table. Actually we only care about + * releasing buffer pins and tupdesc refcounts; there's no need to pfree + * the TupleTableSlots, since the containing memory context is about to go + * away anyway. + */ + ExecResetTupleTable(estate->es_tupleTable, false); + + /* + * Close any Relations that have been opened for range table entries or + * result relations. + */ + ExecCloseResultRelations(estate); + ExecCloseRangeTableRelations(estate); +} + +/* + * Close any relations that have been opened for ResultRelInfos. + */ +void +ExecCloseResultRelations(EState *estate) +{ + ListCell *l; + + /* + * close indexes of result relation(s) if any. (Rels themselves are + * closed in ExecCloseRangeTableRelations()) + */ + foreach(l, estate->es_opened_result_relations) + { + ResultRelInfo *resultRelInfo = lfirst(l); + + ExecCloseIndices(resultRelInfo); + } + + /* Close any relations that have been opened by ExecGetTriggerResultRel(). */ + foreach(l, estate->es_trig_target_relations) + { + ResultRelInfo *resultRelInfo = (ResultRelInfo *) lfirst(l); + + /* + * Assert this is a "dummy" ResultRelInfo, see above. Otherwise we + * might be issuing a duplicate close against a Relation opened by + * ExecGetRangeTableRelation. + */ + Assert(resultRelInfo->ri_RangeTableIndex == 0); + + /* + * Since ExecGetTriggerResultRel doesn't call ExecOpenIndices for + * these rels, we needn't call ExecCloseIndices either. + */ + Assert(resultRelInfo->ri_NumIndices == 0); + + table_close(resultRelInfo->ri_RelationDesc, NoLock); + } +} + +/* + * Close all relations opened by ExecGetRangeTableRelation(). + * + * We do not release any locks we might hold on those rels. + */ +void +ExecCloseRangeTableRelations(EState *estate) +{ + int i; + + for (i = 0; i < estate->es_range_table_size; i++) + { + if (estate->es_relations[i]) + table_close(estate->es_relations[i], NoLock); + } +} + +/* ---------------------------------------------------------------- + * ExecutePlan + * + * Processes the query plan until we have retrieved 'numberTuples' tuples, + * moving in the specified direction. + * + * Runs to completion if numberTuples is 0 + * + * Note: the ctid attribute is a 'junk' attribute that is removed before the + * user can see it + * ---------------------------------------------------------------- + */ +static void +ExecutePlan(EState *estate, + PlanState *planstate, + bool use_parallel_mode, + CmdType operation, + bool sendTuples, + uint64 numberTuples, + ScanDirection direction, + DestReceiver *dest, + bool execute_once) +{ + TupleTableSlot *slot; + uint64 current_tuple_count; + + /* + * initialize local variables + */ + current_tuple_count = 0; + + /* + * Set the direction. + */ + estate->es_direction = direction; + + /* + * If the plan might potentially be executed multiple times, we must force + * it to run without parallelism, because we might exit early. 
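+ * (For instance, a cursor that is fetched from across several
+ * ExecutorRun calls reaches here with execute_once = false and so runs
+ * without parallel workers.)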
+ */ + if (!execute_once) + use_parallel_mode = false; + + estate->es_use_parallel_mode = use_parallel_mode; + if (use_parallel_mode) + EnterParallelMode(); + + /* + * Loop until we've processed the proper number of tuples from the plan. + */ + for (;;) + { + /* Reset the per-output-tuple exprcontext */ + ResetPerTupleExprContext(estate); + + /* + * Execute the plan and obtain a tuple + */ + slot = ExecProcNode(planstate); + + /* + * if the tuple is null, then we assume there is nothing more to + * process so we just end the loop... + */ + if (TupIsNull(slot)) + break; + + /* + * If we have a junk filter, then project a new tuple with the junk + * removed. + * + * Store this new "clean" tuple in the junkfilter's resultSlot. + * (Formerly, we stored it back over the "dirty" tuple, which is WRONG + * because that tuple slot has the wrong descriptor.) + */ + if (estate->es_junkFilter != NULL) + slot = ExecFilterJunk(estate->es_junkFilter, slot); + + /* + * If we are supposed to send the tuple somewhere, do so. (In + * practice, this is probably always the case at this point.) + */ + if (sendTuples) + { + /* + * If we are not able to send the tuple, we assume the destination + * has closed and no more tuples can be sent. If that's the case, + * end the loop. + */ + if (!dest->receiveSlot(slot, dest)) + break; + } + + /* + * Count tuples processed, if this is a SELECT. (For other operation + * types, the ModifyTable plan node must count the appropriate + * events.) + */ + if (operation == CMD_SELECT) + (estate->es_processed)++; + + /* + * check our tuple count.. if we've processed the proper number then + * quit, else loop again and process more tuples. Zero numberTuples + * means no limit. + */ + current_tuple_count++; + if (numberTuples && numberTuples == current_tuple_count) + break; + } + + /* + * If we know we won't need to back up, we can release resources at this + * point. + */ + if (!(estate->es_top_eflags & EXEC_FLAG_BACKWARD)) + (void) ExecShutdownNode(planstate); + + if (use_parallel_mode) + ExitParallelMode(); +} + + +/* + * ExecRelCheck --- check that tuple meets constraints for result relation + * + * Returns NULL if OK, else name of failed check constraint + */ +static const char * +ExecRelCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + int ncheck = rel->rd_att->constr->num_check; + ConstrCheck *check = rel->rd_att->constr->check; + ExprContext *econtext; + MemoryContext oldContext; + int i; + + /* + * CheckConstraintFetch let this pass with only a warning, but now we + * should fail rather than possibly failing to enforce an important + * constraint. + */ + if (ncheck != rel->rd_rel->relchecks) + elog(ERROR, "%d pg_constraint record(s) missing for relation \"%s\"", + rel->rd_rel->relchecks - ncheck, RelationGetRelationName(rel)); + + /* + * If first time through for this result relation, build expression + * nodetrees for rel's constraint expressions. Keep them in the per-query + * memory context so they'll survive throughout the query. 
+ */ + if (resultRelInfo->ri_ConstraintExprs == NULL) + { + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + resultRelInfo->ri_ConstraintExprs = + (ExprState **) palloc(ncheck * sizeof(ExprState *)); + for (i = 0; i < ncheck; i++) + { + Expr *checkconstr; + + checkconstr = stringToNode(check[i].ccbin); + resultRelInfo->ri_ConstraintExprs[i] = + ExecPrepareExpr(checkconstr, estate); + } + MemoryContextSwitchTo(oldContext); + } + + /* + * We will use the EState's per-tuple context for evaluating constraint + * expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* And evaluate the constraints */ + for (i = 0; i < ncheck; i++) + { + ExprState *checkconstr = resultRelInfo->ri_ConstraintExprs[i]; + + /* + * NOTE: SQL specifies that a NULL result from a constraint expression + * is not to be treated as a failure. Therefore, use ExecCheck not + * ExecQual. + */ + if (!ExecCheck(checkconstr, econtext)) + return check[i].ccname; + } + + /* NULL result means no error */ + return NULL; +} + +/* + * ExecPartitionCheck --- check that tuple meets the partition constraint. + * + * Returns true if it meets the partition constraint. If the constraint + * fails and we're asked to emit an error, do so and don't return; otherwise + * return false. + */ +bool +ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate, bool emitError) +{ + ExprContext *econtext; + bool success; + + /* + * If first time through, build expression state tree for the partition + * check expression. (In the corner case where the partition check + * expression is empty, ie there's a default partition and nothing else, + * we'll be fooled into executing this code each time through. But it's + * pretty darn cheap in that case, so we don't worry about it.) + */ + if (resultRelInfo->ri_PartitionCheckExpr == NULL) + { + /* + * Ensure that the qual tree and prepared expression are in the + * query-lifespan context. + */ + MemoryContext oldcxt = MemoryContextSwitchTo(estate->es_query_cxt); + List *qual = RelationGetPartitionQual(resultRelInfo->ri_RelationDesc); + + resultRelInfo->ri_PartitionCheckExpr = ExecPrepareCheck(qual, estate); + MemoryContextSwitchTo(oldcxt); + } + + /* + * We will use the EState's per-tuple context for evaluating constraint + * expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * As in case of the catalogued constraints, we treat a NULL result as + * success here, not a failure. + */ + success = ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); + + /* if asked to emit error, don't actually return on failure */ + if (!success && emitError) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + return success; +} + +/* + * ExecPartitionCheckEmitError - Form and emit an error message after a failed + * partition constraint check. + */ +void +ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate) +{ + Oid root_relid; + TupleDesc tupdesc; + char *val_desc; + Bitmapset *modifiedCols; + + /* + * If the tuple has been routed, it's been converted to the partition's + * rowtype, which might differ from the root table's. 
We must convert it + * back to the root table's rowtype so that val_desc in the error message + * matches the input tuple. + */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + TupleDesc old_tupdesc; + AttrMap *map; + + root_relid = RelationGetRelid(rootrel->ri_RelationDesc); + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + + old_tupdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(old_tupdesc, tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, so allocate a + * new one. + */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + } + else + { + root_relid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + tupdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + } + + val_desc = ExecBuildSlotValueDescription(root_relid, + slot, + tupdesc, + modifiedCols, + 64); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("new row for relation \"%s\" violates partition constraint", + RelationGetRelationName(resultRelInfo->ri_RelationDesc)), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0, + errtable(resultRelInfo->ri_RelationDesc))); +} + +/* + * ExecConstraints - check constraints of the tuple in 'slot' + * + * This checks the traditional NOT NULL and check constraints. + * + * The partition constraint is *NOT* checked. + * + * Note: 'slot' contains the tuple to check the constraints of, which may + * have been converted from the original input tuple after tuple routing. + * 'resultRelInfo' is the final result relation, after tuple routing. + */ +void +ExecConstraints(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + TupleDesc tupdesc = RelationGetDescr(rel); + TupleConstr *constr = tupdesc->constr; + Bitmapset *modifiedCols; + + Assert(constr); /* we should not be called otherwise */ + + if (constr->has_not_null) + { + int natts = tupdesc->natts; + int attrChk; + + for (attrChk = 1; attrChk <= natts; attrChk++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, attrChk - 1); + + if (att->attnotnull && slot_attisnull(slot, attrChk)) + { + char *val_desc; + Relation orig_rel = rel; + TupleDesc orig_tupdesc = RelationGetDescr(rel); + + /* + * If the tuple has been routed, it's been converted to the + * partition's rowtype, which might differ from the root + * table's. We must convert it back to the root table's + * rowtype so that val_desc shown error message matches the + * input tuple. + */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + AttrMap *map; + + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(orig_tupdesc, + tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, so + * allocate a new one. 
+ */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + rel = rootrel->ri_RelationDesc; + } + else + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + + ereport(ERROR, + (errcode(ERRCODE_NOT_NULL_VIOLATION), + errmsg("null value in column \"%s\" of relation \"%s\" violates not-null constraint", + NameStr(att->attname), + RelationGetRelationName(orig_rel)), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0, + errtablecol(orig_rel, attrChk))); + } + } + } + + if (rel->rd_rel->relchecks > 0) + { + const char *failed; + + if ((failed = ExecRelCheck(resultRelInfo, slot, estate)) != NULL) + { + char *val_desc; + Relation orig_rel = rel; + + /* See the comment above. */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + TupleDesc old_tupdesc = RelationGetDescr(rel); + AttrMap *map; + + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(old_tupdesc, + tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, so + * allocate a new one. + */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + rel = rootrel->ri_RelationDesc; + } + else + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("new row for relation \"%s\" violates check constraint \"%s\"", + RelationGetRelationName(orig_rel), failed), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0, + errtableconstraint(orig_rel, failed))); + } + } +} + +/* + * ExecWithCheckOptions -- check that tuple satisfies any WITH CHECK OPTIONs + * of the specified kind. + * + * Note that this needs to be called multiple times to ensure that all kinds of + * WITH CHECK OPTIONs are handled (both those from views which have the WITH + * CHECK OPTION set and from row-level security policies). See ExecInsert() + * and ExecUpdate(). + */ +void +ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + TupleDesc tupdesc = RelationGetDescr(rel); + ExprContext *econtext; + ListCell *l1, + *l2; + + /* + * We will use the EState's per-tuple context for evaluating constraint + * expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check each of the constraints */ + forboth(l1, resultRelInfo->ri_WithCheckOptions, + l2, resultRelInfo->ri_WithCheckOptionExprs) + { + WithCheckOption *wco = (WithCheckOption *) lfirst(l1); + ExprState *wcoExpr = (ExprState *) lfirst(l2); + + /* + * Skip any WCOs which are not the kind we are looking for at this + * time. 
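+ *
+ * For example, an INSERT into an auto-updatable view that also
+ * has row-level security enabled may reach this function twice,
+ * once with WCO_RLS_INSERT_CHECK and once with WCO_VIEW_CHECK;
+ * each call evaluates only the matching subset of the list.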
+ */ + if (wco->kind != kind) + continue; + + /* + * WITH CHECK OPTION checks are intended to ensure that the new tuple + * is visible (in the case of a view) or that it passes the + * 'with-check' policy (in the case of row security). If the qual + * evaluates to NULL or FALSE, then the new tuple won't be included in + * the view or doesn't pass the 'with-check' policy for the table. + */ + if (!ExecQual(wcoExpr, econtext)) + { + char *val_desc; + Bitmapset *modifiedCols; + + switch (wco->kind) + { + /* + * For WITH CHECK OPTIONs coming from views, we might be + * able to provide the details on the row, depending on + * the permissions on the relation (that is, if the user + * could view it directly anyway). For RLS violations, we + * don't include the data since we don't know if the user + * should be able to view the tuple as that depends on the + * USING policy. + */ + case WCO_VIEW_CHECK: + /* See the comment in ExecConstraints(). */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + TupleDesc old_tupdesc = RelationGetDescr(rel); + AttrMap *map; + + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(old_tupdesc, + tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, + * so allocate a new one. + */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + rel = rootrel->ri_RelationDesc; + } + else + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + + ereport(ERROR, + (errcode(ERRCODE_WITH_CHECK_OPTION_VIOLATION), + errmsg("new row violates check option for view \"%s\"", + wco->relname), + val_desc ? errdetail("Failing row contains %s.", + val_desc) : 0)); + break; + case WCO_RLS_INSERT_CHECK: + case WCO_RLS_UPDATE_CHECK: + if (wco->polname != NULL) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy \"%s\" for table \"%s\"", + wco->polname, wco->relname))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy for table \"%s\"", + wco->relname))); + break; + case WCO_RLS_CONFLICT_CHECK: + if (wco->polname != NULL) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy \"%s\" (USING expression) for table \"%s\"", + wco->polname, wco->relname))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy (USING expression) for table \"%s\"", + wco->relname))); + break; + default: + elog(ERROR, "unrecognized WCO kind: %u", wco->kind); + break; + } + } + } +} + +/* + * ExecBuildSlotValueDescription -- construct a string representing a tuple + * + * This is intentionally very similar to BuildIndexValueDescription, but + * unlike that function, we truncate long field values (to at most maxfieldlen + * bytes). That seems necessary here since heap field values could be very + * long, whereas index entries typically aren't so wide. + * + * Also, unlike the case with index entries, we need to be prepared to ignore + * dropped columns. 
We used to use the slot's tuple descriptor to decode the + * data, but the slot's descriptor doesn't identify dropped columns, so we + * now need to be passed the relation's descriptor. + * + * Note that, like BuildIndexValueDescription, if the user does not have + * permission to view any of the columns involved, a NULL is returned. Unlike + * BuildIndexValueDescription, if the user has access to view a subset of the + * column involved, that subset will be returned with a key identifying which + * columns they are. + */ +static char * +ExecBuildSlotValueDescription(Oid reloid, + TupleTableSlot *slot, + TupleDesc tupdesc, + Bitmapset *modifiedCols, + int maxfieldlen) +{ + StringInfoData buf; + StringInfoData collist; + bool write_comma = false; + bool write_comma_collist = false; + int i; + AclResult aclresult; + bool table_perm = false; + bool any_perm = false; + + /* + * Check if RLS is enabled and should be active for the relation; if so, + * then don't return anything. Otherwise, go through normal permission + * checks. + */ + if (check_enable_rls(reloid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + initStringInfo(&buf); + + appendStringInfoChar(&buf, '('); + + /* + * Check if the user has permissions to see the row. Table-level SELECT + * allows access to all columns. If the user does not have table-level + * SELECT then we check each column and include those the user has SELECT + * rights on. Additionally, we always include columns the user provided + * data for. + */ + aclresult = pg_class_aclcheck(reloid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* Set up the buffer for the column list */ + initStringInfo(&collist); + appendStringInfoChar(&collist, '('); + } + else + table_perm = any_perm = true; + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + for (i = 0; i < tupdesc->natts; i++) + { + bool column_perm = false; + char *val; + int vallen; + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + + /* ignore dropped columns */ + if (att->attisdropped) + continue; + + if (!table_perm) + { + /* + * No table-level SELECT, so need to make sure they either have + * SELECT rights on the column or that they have provided the data + * for the column. If not, omit this column from the error + * message. + */ + aclresult = pg_attribute_aclcheck(reloid, att->attnum, + GetUserId(), ACL_SELECT); + if (bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber, + modifiedCols) || aclresult == ACLCHECK_OK) + { + column_perm = any_perm = true; + + if (write_comma_collist) + appendStringInfoString(&collist, ", "); + else + write_comma_collist = true; + + appendStringInfoString(&collist, NameStr(att->attname)); + } + } + + if (table_perm || column_perm) + { + if (slot->tts_isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(att->atttypid, + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, slot->tts_values[i]); + } + + if (write_comma) + appendStringInfoString(&buf, ", "); + else + write_comma = true; + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendBinaryStringInfo(&buf, val, vallen); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + } + + /* If we end up with zero columns being returned, then return NULL. 
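+ *
+ * Otherwise the result has the form "(val1, val2, ...)" when the
+ * user has table-level SELECT, or "(col1, col2) = (val1, val2)"
+ * when only a subset of the columns could be shown.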
*/ + if (!any_perm) + return NULL; + + appendStringInfoChar(&buf, ')'); + + if (!table_perm) + { + appendStringInfoString(&collist, ") = "); + appendBinaryStringInfo(&collist, buf.data, buf.len); + + return collist.data; + } + + return buf.data; +} + + +/* + * ExecUpdateLockMode -- find the appropriate UPDATE tuple lock mode for a + * given ResultRelInfo + */ +LockTupleMode +ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo) +{ + Bitmapset *keyCols; + Bitmapset *updatedCols; + + /* + * Compute lock mode to use. If columns that are part of the key have not + * been modified, then we can use a weaker lock, allowing for better + * concurrency. + */ + updatedCols = ExecGetAllUpdatedCols(relinfo, estate); + keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc, + INDEX_ATTR_BITMAP_KEY); + + if (bms_overlap(keyCols, updatedCols)) + return LockTupleExclusive; + + return LockTupleNoKeyExclusive; +} + +/* + * ExecFindRowMark -- find the ExecRowMark struct for given rangetable index + * + * If no such struct, either return NULL or throw error depending on missing_ok + */ +ExecRowMark * +ExecFindRowMark(EState *estate, Index rti, bool missing_ok) +{ + if (rti > 0 && rti <= estate->es_range_table_size && + estate->es_rowmarks != NULL) + { + ExecRowMark *erm = estate->es_rowmarks[rti - 1]; + + if (erm) + return erm; + } + if (!missing_ok) + elog(ERROR, "failed to find ExecRowMark for rangetable index %u", rti); + return NULL; +} + +/* + * ExecBuildAuxRowMark -- create an ExecAuxRowMark struct + * + * Inputs are the underlying ExecRowMark struct and the targetlist of the + * input plan node (not planstate node!). We need the latter to find out + * the column numbers of the resjunk columns. + */ +ExecAuxRowMark * +ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) +{ + ExecAuxRowMark *aerm = (ExecAuxRowMark *) palloc0(sizeof(ExecAuxRowMark)); + char resname[32]; + + aerm->rowmark = erm; + + /* Look up the resjunk columns associated with this rowmark */ + if (erm->markType != ROW_MARK_COPY) + { + /* need ctid for all methods other than COPY */ + snprintf(resname, sizeof(resname), "ctid%u", erm->rowmarkId); + aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->ctidAttNo)) + elog(ERROR, "could not find junk %s column", resname); + } + else + { + /* need wholerow if COPY */ + snprintf(resname, sizeof(resname), "wholerow%u", erm->rowmarkId); + aerm->wholeAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->wholeAttNo)) + elog(ERROR, "could not find junk %s column", resname); + } + + /* if child rel, need tableoid */ + if (erm->rti != erm->prti) + { + snprintf(resname, sizeof(resname), "tableoid%u", erm->rowmarkId); + aerm->toidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->toidAttNo)) + elog(ERROR, "could not find junk %s column", resname); + } + + return aerm; +} + + +/* + * EvalPlanQual logic --- recheck modified tuple(s) to see if we want to + * process the updated version under READ COMMITTED rules. + * + * See backend/executor/README for some info about how this works. + */ + + +/* + * Check the updated version of a tuple to see if we want to process it under + * READ COMMITTED rules. 
+ * + * epqstate - state for EvalPlanQual rechecking + * relation - table containing tuple + * rti - rangetable index of table containing tuple + * inputslot - tuple for processing - this can be the slot from + * EvalPlanQualSlot(), for the increased efficiency. + * + * This tests whether the tuple in inputslot still matches the relevant + * quals. For that result to be useful, typically the input tuple has to be + * last row version (otherwise the result isn't particularly useful) and + * locked (otherwise the result might be out of date). That's typically + * achieved by using table_tuple_lock() with the + * TUPLE_LOCK_FLAG_FIND_LAST_VERSION flag. + * + * Returns a slot containing the new candidate update/delete tuple, or + * NULL if we determine we shouldn't process the row. + */ +TupleTableSlot * +EvalPlanQual(EPQState *epqstate, Relation relation, + Index rti, TupleTableSlot *inputslot) +{ + TupleTableSlot *slot; + TupleTableSlot *testslot; + + Assert(rti > 0); + + /* + * Need to run a recheck subquery. Initialize or reinitialize EPQ state. + */ + EvalPlanQualBegin(epqstate); + + /* + * Callers will often use the EvalPlanQualSlot to store the tuple to avoid + * an unnecessary copy. + */ + testslot = EvalPlanQualSlot(epqstate, relation, rti); + if (testslot != inputslot) + ExecCopySlot(testslot, inputslot); + + /* + * Run the EPQ query. We assume it will return at most one tuple. + */ + slot = EvalPlanQualNext(epqstate); + + /* + * If we got a tuple, force the slot to materialize the tuple so that it + * is not dependent on any local state in the EPQ query (in particular, + * it's highly likely that the slot contains references to any pass-by-ref + * datums that may be present in copyTuple). As with the next step, this + * is to guard against early re-use of the EPQ query. + */ + if (!TupIsNull(slot)) + ExecMaterializeSlot(slot); + + /* + * Clear out the test tuple. This is needed in case the EPQ query is + * re-used to test a tuple for a different relation. (Not clear that can + * really happen, but let's be safe.) + */ + ExecClearTuple(testslot); + + return slot; +} + +/* + * EvalPlanQualInit -- initialize during creation of a plan state node + * that might need to invoke EPQ processing. + * + * Note: subplan/auxrowmarks can be NULL/NIL if they will be set later + * with EvalPlanQualSetPlan. + */ +void +EvalPlanQualInit(EPQState *epqstate, EState *parentestate, + Plan *subplan, List *auxrowmarks, int epqParam) +{ + Index rtsize = parentestate->es_range_table_size; + + /* initialize data not changing over EPQState's lifetime */ + epqstate->parentestate = parentestate; + epqstate->epqParam = epqParam; + + /* + * Allocate space to reference a slot for each potential rti - do so now + * rather than in EvalPlanQualBegin(), as done for other dynamically + * allocated resources, so EvalPlanQualSlot() can be used to hold tuples + * that *may* need EPQ later, without forcing the overhead of + * EvalPlanQualBegin(). + */ + epqstate->tuple_table = NIL; + epqstate->relsubs_slot = (TupleTableSlot **) + palloc0(rtsize * sizeof(TupleTableSlot *)); + + /* ... and remember data that EvalPlanQualBegin will need */ + epqstate->plan = subplan; + epqstate->arowMarks = auxrowmarks; + + /* ... and mark the EPQ state inactive */ + epqstate->origslot = NULL; + epqstate->recheckestate = NULL; + epqstate->recheckplanstate = NULL; + epqstate->relsubs_rowmark = NULL; + epqstate->relsubs_done = NULL; +} + +/* + * EvalPlanQualSetPlan -- set or change subplan of an EPQState. 
+ * + * We used to need this so that ModifyTable could deal with multiple subplans. + * It could now be refactored out of existence. + */ +void +EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) +{ + /* If we have a live EPQ query, shut it down */ + EvalPlanQualEnd(epqstate); + /* And set/change the plan pointer */ + epqstate->plan = subplan; + /* The rowmarks depend on the plan, too */ + epqstate->arowMarks = auxrowmarks; +} + +/* + * Return, and create if necessary, a slot for an EPQ test tuple. + * + * Note this only requires EvalPlanQualInit() to have been called, + * EvalPlanQualBegin() is not necessary. + */ +TupleTableSlot * +EvalPlanQualSlot(EPQState *epqstate, + Relation relation, Index rti) +{ + TupleTableSlot **slot; + + Assert(relation); + Assert(rti > 0 && rti <= epqstate->parentestate->es_range_table_size); + slot = &epqstate->relsubs_slot[rti - 1]; + + if (*slot == NULL) + { + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(epqstate->parentestate->es_query_cxt); + *slot = table_slot_create(relation, &epqstate->tuple_table); + MemoryContextSwitchTo(oldcontext); + } + + return *slot; +} + +/* + * Fetch the current row value for a non-locked relation, identified by rti, + * that needs to be scanned by an EvalPlanQual operation. origslot must have + * been set to contain the current result row (top-level row) that we need to + * recheck. Returns true if a substitution tuple was found, false if not. + */ +bool +EvalPlanQualFetchRowMark(EPQState *epqstate, Index rti, TupleTableSlot *slot) +{ + ExecAuxRowMark *earm = epqstate->relsubs_rowmark[rti - 1]; + ExecRowMark *erm = earm->rowmark; + Datum datum; + bool isNull; + + Assert(earm != NULL); + Assert(epqstate->origslot != NULL); + + if (RowMarkRequiresRowShareLock(erm->markType)) + elog(ERROR, "EvalPlanQual doesn't support locking rowmarks"); + + /* if child rel, must check whether it produced this row */ + if (erm->rti != erm->prti) + { + Oid tableoid; + + datum = ExecGetJunkAttribute(epqstate->origslot, + earm->toidAttNo, + &isNull); + /* non-locked rels could be on the inside of outer joins */ + if (isNull) + return false; + + tableoid = DatumGetObjectId(datum); + + Assert(OidIsValid(erm->relid)); + if (tableoid != erm->relid) + { + /* this child is inactive right now */ + return false; + } + } + + if (erm->markType == ROW_MARK_REFERENCE) + { + Assert(erm->relation != NULL); + + /* fetch the tuple's ctid */ + datum = ExecGetJunkAttribute(epqstate->origslot, + earm->ctidAttNo, + &isNull); + /* non-locked rels could be on the inside of outer joins */ + if (isNull) + return false; + + /* fetch requests on foreign tables must be passed to their FDW */ + if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + { + FdwRoutine *fdwroutine; + bool updated = false; + + fdwroutine = GetFdwRoutineForRelation(erm->relation, false); + /* this should have been checked already, but let's be safe */ + if (fdwroutine->RefetchForeignRow == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot lock rows in foreign table \"%s\"", + RelationGetRelationName(erm->relation)))); + + fdwroutine->RefetchForeignRow(epqstate->recheckestate, + erm, + datum, + slot, + &updated); + if (TupIsNull(slot)) + elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); + + /* + * Ideally we'd insist on updated == false here, but that assumes + * that FDWs can track that exactly, which they might not be able + * to. So just ignore the flag. 
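+ *
+ * Either way, the slot now holds whatever row version the FDW
+ * handed back, and that is the version the EPQ recheck will be
+ * applied to.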
+ */ + return true; + } + else + { + /* ordinary table, fetch the tuple */ + if (!table_tuple_fetch_row_version(erm->relation, + (ItemPointer) DatumGetPointer(datum), + SnapshotAny, slot)) + elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); + return true; + } + } + else + { + Assert(erm->markType == ROW_MARK_COPY); + + /* fetch the whole-row Var for the relation */ + datum = ExecGetJunkAttribute(epqstate->origslot, + earm->wholeAttNo, + &isNull); + /* non-locked rels could be on the inside of outer joins */ + if (isNull) + return false; + + ExecStoreHeapTupleDatum(datum, slot); + return true; + } +} + +/* + * Fetch the next row (if any) from EvalPlanQual testing + * + * (In practice, there should never be more than one row...) + */ +TupleTableSlot * +EvalPlanQualNext(EPQState *epqstate) +{ + MemoryContext oldcontext; + TupleTableSlot *slot; + + oldcontext = MemoryContextSwitchTo(epqstate->recheckestate->es_query_cxt); + slot = ExecProcNode(epqstate->recheckplanstate); + MemoryContextSwitchTo(oldcontext); + + return slot; +} + +/* + * Initialize or reset an EvalPlanQual state tree + */ +void +EvalPlanQualBegin(EPQState *epqstate) +{ + EState *parentestate = epqstate->parentestate; + EState *recheckestate = epqstate->recheckestate; + + if (recheckestate == NULL) + { + /* First time through, so create a child EState */ + EvalPlanQualStart(epqstate, epqstate->plan); + } + else + { + /* + * We already have a suitable child EPQ tree, so just reset it. + */ + Index rtsize = parentestate->es_range_table_size; + PlanState *rcplanstate = epqstate->recheckplanstate; + + MemSet(epqstate->relsubs_done, 0, rtsize * sizeof(bool)); + + /* Recopy current values of parent parameters */ + if (parentestate->es_plannedstmt->paramExecTypes != NIL) + { + int i; + + /* + * Force evaluation of any InitPlan outputs that could be needed + * by the subplan, just in case they got reset since + * EvalPlanQualStart (see comments therein). + */ + ExecSetParamPlanMulti(rcplanstate->plan->extParam, + GetPerTupleExprContext(parentestate)); + + i = list_length(parentestate->es_plannedstmt->paramExecTypes); + + while (--i >= 0) + { + /* copy value if any, but not execPlan link */ + recheckestate->es_param_exec_vals[i].value = + parentestate->es_param_exec_vals[i].value; + recheckestate->es_param_exec_vals[i].isnull = + parentestate->es_param_exec_vals[i].isnull; + } + } + + /* + * Mark child plan tree as needing rescan at all scan nodes. The + * first ExecProcNode will take care of actually doing the rescan. + */ + rcplanstate->chgParam = bms_add_member(rcplanstate->chgParam, + epqstate->epqParam); + } +} + +/* + * Start execution of an EvalPlanQual plan tree. + * + * This is a cut-down version of ExecutorStart(): we copy some state from + * the top-level estate rather than initializing it fresh. + */ +static void +EvalPlanQualStart(EPQState *epqstate, Plan *planTree) +{ + EState *parentestate = epqstate->parentestate; + Index rtsize = parentestate->es_range_table_size; + EState *rcestate; + MemoryContext oldcontext; + ListCell *l; + + epqstate->recheckestate = rcestate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(rcestate->es_query_cxt); + + /* signal that this is an EState for executing EPQ */ + rcestate->es_epq_active = epqstate; + + /* + * Child EPQ EStates share the parent's copy of unchanging state such as + * the snapshot, rangetable, and external Param info. They need their own + * copies of local state, including a tuple table, es_param_exec_vals, + * result-rel info, etc. 
+ */ + rcestate->es_direction = ForwardScanDirection; + rcestate->es_snapshot = parentestate->es_snapshot; + rcestate->es_crosscheck_snapshot = parentestate->es_crosscheck_snapshot; + rcestate->es_range_table = parentestate->es_range_table; + rcestate->es_range_table_size = parentestate->es_range_table_size; + rcestate->es_relations = parentestate->es_relations; + rcestate->es_queryEnv = parentestate->es_queryEnv; + rcestate->es_rowmarks = parentestate->es_rowmarks; + rcestate->es_plannedstmt = parentestate->es_plannedstmt; + rcestate->es_junkFilter = parentestate->es_junkFilter; + rcestate->es_output_cid = parentestate->es_output_cid; + + /* + * ResultRelInfos needed by subplans are initialized from scratch when the + * subplans themselves are initialized. + */ + rcestate->es_result_relations = NULL; + /* es_trig_target_relations must NOT be copied */ + rcestate->es_top_eflags = parentestate->es_top_eflags; + rcestate->es_instrument = parentestate->es_instrument; + /* es_auxmodifytables must NOT be copied */ + + /* + * The external param list is simply shared from parent. The internal + * param workspace has to be local state, but we copy the initial values + * from the parent, so as to have access to any param values that were + * already set from other parts of the parent's plan tree. + */ + rcestate->es_param_list_info = parentestate->es_param_list_info; + if (parentestate->es_plannedstmt->paramExecTypes != NIL) + { + int i; + + /* + * Force evaluation of any InitPlan outputs that could be needed by + * the subplan. (With more complexity, maybe we could postpone this + * till the subplan actually demands them, but it doesn't seem worth + * the trouble; this is a corner case already, since usually the + * InitPlans would have been evaluated before reaching EvalPlanQual.) + * + * This will not touch output params of InitPlans that occur somewhere + * within the subplan tree, only those that are attached to the + * ModifyTable node or above it and are referenced within the subplan. + * That's OK though, because the planner would only attach such + * InitPlans to a lower-level SubqueryScan node, and EPQ execution + * will not descend into a SubqueryScan. + * + * The EState's per-output-tuple econtext is sufficiently short-lived + * for this, since it should get reset before there is any chance of + * doing EvalPlanQual again. + */ + ExecSetParamPlanMulti(planTree->extParam, + GetPerTupleExprContext(parentestate)); + + /* now make the internal param workspace ... */ + i = list_length(parentestate->es_plannedstmt->paramExecTypes); + rcestate->es_param_exec_vals = (ParamExecData *) + palloc0(i * sizeof(ParamExecData)); + /* ... and copy down all values, whether really needed or not */ + while (--i >= 0) + { + /* copy value if any, but not execPlan link */ + rcestate->es_param_exec_vals[i].value = + parentestate->es_param_exec_vals[i].value; + rcestate->es_param_exec_vals[i].isnull = + parentestate->es_param_exec_vals[i].isnull; + } + } + + /* + * Initialize private state information for each SubPlan. We must do this + * before running ExecInitNode on the main query tree, since + * ExecInitSubPlan expects to be able to find these entries. Some of the + * SubPlans might not be used in the part of the plan tree we intend to + * run, but since it's not easy to tell which, we just initialize them + * all. 
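+ *
+ * (SubPlan expressions look up their plan state by position in
+ * this list, using plan_id as an index, so the list must be
+ * complete and keep the same order as in the parent estate.)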
+ */ + Assert(rcestate->es_subplanstates == NIL); + foreach(l, parentestate->es_plannedstmt->subplans) + { + Plan *subplan = (Plan *) lfirst(l); + PlanState *subplanstate; + + subplanstate = ExecInitNode(subplan, rcestate, 0); + rcestate->es_subplanstates = lappend(rcestate->es_subplanstates, + subplanstate); + } + + /* + * Build an RTI indexed array of rowmarks, so that + * EvalPlanQualFetchRowMark() can efficiently access the to be fetched + * rowmark. + */ + epqstate->relsubs_rowmark = (ExecAuxRowMark **) + palloc0(rtsize * sizeof(ExecAuxRowMark *)); + foreach(l, epqstate->arowMarks) + { + ExecAuxRowMark *earm = (ExecAuxRowMark *) lfirst(l); + + epqstate->relsubs_rowmark[earm->rowmark->rti - 1] = earm; + } + + /* + * Initialize per-relation EPQ tuple states to not-fetched. + */ + epqstate->relsubs_done = (bool *) + palloc0(rtsize * sizeof(bool)); + + /* + * Initialize the private state information for all the nodes in the part + * of the plan tree we need to run. This opens files, allocates storage + * and leaves us ready to start processing tuples. + */ + epqstate->recheckplanstate = ExecInitNode(planTree, rcestate, 0); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * EvalPlanQualEnd -- shut down at termination of parent plan state node, + * or if we are done with the current EPQ child. + * + * This is a cut-down version of ExecutorEnd(); basically we want to do most + * of the normal cleanup, but *not* close result relations (which we are + * just sharing from the outer query). We do, however, have to close any + * result and trigger target relations that got opened, since those are not + * shared. (There probably shouldn't be any of the latter, but just in + * case...) + */ +void +EvalPlanQualEnd(EPQState *epqstate) +{ + EState *estate = epqstate->recheckestate; + Index rtsize; + MemoryContext oldcontext; + ListCell *l; + + rtsize = epqstate->parentestate->es_range_table_size; + + /* + * We may have a tuple table, even if EPQ wasn't started, because we allow + * use of EvalPlanQualSlot() without calling EvalPlanQualBegin(). + */ + if (epqstate->tuple_table != NIL) + { + memset(epqstate->relsubs_slot, 0, + rtsize * sizeof(TupleTableSlot *)); + ExecResetTupleTable(epqstate->tuple_table, true); + epqstate->tuple_table = NIL; + } + + /* EPQ wasn't started, nothing further to do */ + if (estate == NULL) + return; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + ExecEndNode(epqstate->recheckplanstate); + + foreach(l, estate->es_subplanstates) + { + PlanState *subplanstate = (PlanState *) lfirst(l); + + ExecEndNode(subplanstate); + } + + /* throw away the per-estate tuple table, some node may have used it */ + ExecResetTupleTable(estate->es_tupleTable, false); + + /* Close any result and trigger target relations attached to this EState */ + ExecCloseResultRelations(estate); + + MemoryContextSwitchTo(oldcontext); + + FreeExecutorState(estate); + + /* Mark EPQState idle */ + epqstate->origslot = NULL; + epqstate->recheckestate = NULL; + epqstate->recheckplanstate = NULL; + epqstate->relsubs_rowmark = NULL; + epqstate->relsubs_done = NULL; +} diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c new file mode 100644 index 0000000..f8a4a40 --- /dev/null +++ b/src/backend/executor/execParallel.c @@ -0,0 +1,1498 @@ +/*------------------------------------------------------------------------- + * + * execParallel.c + * Support routines for parallel execution. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * This file contains routines that are intended to support setting up, + * using, and tearing down a ParallelContext from within the PostgreSQL + * executor. The ParallelContext machinery will handle starting the + * workers and ensuring that their state generally matches that of the + * leader; see src/backend/access/transam/README.parallel for details. + * However, we must save and restore relevant executor state, such as + * any ParamListInfo associated with the query, buffer/WAL usage info, and + * the actual plan to be passed down to the worker. + * + * IDENTIFICATION + * src/backend/executor/execParallel.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execParallel.h" +#include "executor/executor.h" +#include "executor/nodeAgg.h" +#include "executor/nodeAppend.h" +#include "executor/nodeBitmapHeapscan.h" +#include "executor/nodeCustom.h" +#include "executor/nodeForeignscan.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "executor/nodeMemoize.h" +#include "executor/nodeSeqscan.h" +#include "executor/nodeSort.h" +#include "executor/nodeSubplan.h" +#include "executor/tqueue.h" +#include "jit/jit.h" +#include "nodes/nodeFuncs.h" +#include "pgstat.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/datum.h" +#include "utils/dsa.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + +/* + * Magic numbers for parallel executor communication. We use constants + * greater than any 32-bit integer here so that values < 2^32 can be used + * by individual parallel nodes to store their own state. + */ +#define PARALLEL_KEY_EXECUTOR_FIXED UINT64CONST(0xE000000000000001) +#define PARALLEL_KEY_PLANNEDSTMT UINT64CONST(0xE000000000000002) +#define PARALLEL_KEY_PARAMLISTINFO UINT64CONST(0xE000000000000003) +#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xE000000000000004) +#define PARALLEL_KEY_TUPLE_QUEUE UINT64CONST(0xE000000000000005) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000006) +#define PARALLEL_KEY_DSA UINT64CONST(0xE000000000000007) +#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xE000000000000008) +#define PARALLEL_KEY_JIT_INSTRUMENTATION UINT64CONST(0xE000000000000009) +#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xE00000000000000A) + +#define PARALLEL_TUPLE_QUEUE_SIZE 65536 + +/* + * Fixed-size random stuff that we need to pass to parallel workers. + */ +typedef struct FixedParallelExecutorState +{ + int64 tuples_needed; /* tuple bound, see ExecSetTupleBound */ + dsa_pointer param_exec; + int eflags; + int jit_flags; +} FixedParallelExecutorState; + +/* + * DSM structure for accumulating per-PlanState instrumentation. + * + * instrument_options: Same meaning here as in instrument.c. + * + * instrument_offset: Offset, relative to the start of this structure, + * of the first Instrumentation object. This will depend on the length of + * the plan_node_id array. + * + * num_workers: Number of workers. + * + * num_plan_nodes: Number of plan nodes. + * + * plan_node_id: Array of plan nodes for which we are gathering instrumentation + * from parallel workers. The length of this array is given by num_plan_nodes. 
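+ *
+ * The Instrumentation objects that follow are laid out plan-node-major:
+ * the entry for the plan node at index i of plan_node_id and for worker
+ * w is GetInstrumentationArray(sei)[i * num_workers + w].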
+ */ +struct SharedExecutorInstrumentation +{ + int instrument_options; + int instrument_offset; + int num_workers; + int num_plan_nodes; + int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; + /* array of num_plan_nodes * num_workers Instrumentation objects follows */ +}; +#define GetInstrumentationArray(sei) \ + (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ + (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + +/* Context object for ExecParallelEstimate. */ +typedef struct ExecParallelEstimateContext +{ + ParallelContext *pcxt; + int nnodes; +} ExecParallelEstimateContext; + +/* Context object for ExecParallelInitializeDSM. */ +typedef struct ExecParallelInitializeDSMContext +{ + ParallelContext *pcxt; + SharedExecutorInstrumentation *instrumentation; + int nnodes; +} ExecParallelInitializeDSMContext; + +/* Helper functions that run in the parallel leader. */ +static char *ExecSerializePlan(Plan *plan, EState *estate); +static bool ExecParallelEstimate(PlanState *node, + ExecParallelEstimateContext *e); +static bool ExecParallelInitializeDSM(PlanState *node, + ExecParallelInitializeDSMContext *d); +static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt, + bool reinitialize); +static bool ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt); +static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, + SharedExecutorInstrumentation *instrumentation); + +/* Helper function that runs in the parallel worker. */ +static DestReceiver *ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc); + +/* + * Create a serialized representation of the plan to be sent to each worker. + */ +static char * +ExecSerializePlan(Plan *plan, EState *estate) +{ + PlannedStmt *pstmt; + ListCell *lc; + + /* We can't scribble on the original plan, so make a copy. */ + plan = copyObject(plan); + + /* + * The worker will start its own copy of the executor, and that copy will + * insert a junk filter if the toplevel node has any resjunk entries. We + * don't want that to happen, because while resjunk columns shouldn't be + * sent back to the user, here the tuples are coming back to another + * backend which may very well need them. So mutate the target list + * accordingly. This is sort of a hack; there might be better ways to do + * this... + */ + foreach(lc, plan->targetlist) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + + tle->resjunk = false; + } + + /* + * Create a dummy PlannedStmt. Most of the fields don't need to be valid + * for our purposes, but the worker will need at least a minimal + * PlannedStmt to start the executor. + */ + pstmt = makeNode(PlannedStmt); + pstmt->commandType = CMD_SELECT; + pstmt->queryId = pgstat_get_my_query_id(); + pstmt->hasReturning = false; + pstmt->hasModifyingCTE = false; + pstmt->canSetTag = true; + pstmt->transientPlan = false; + pstmt->dependsOnRole = false; + pstmt->parallelModeNeeded = false; + pstmt->planTree = plan; + pstmt->rtable = estate->es_range_table; + pstmt->resultRelations = NIL; + pstmt->appendRelations = NIL; + + /* + * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list + * for unsafe ones (so that the list indexes of the safe ones are + * preserved). This positively ensures that the worker won't try to run, + * or even do ExecInitNode on, an unsafe subplan. That's important to + * protect, eg, non-parallel-aware FDWs from getting into trouble. 
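+ *
+ * (The worker's InitPlan still walks this list, but ExecInitNode
+ * simply returns NULL for the holes, so they are harmless there.)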
+ */ + pstmt->subplans = NIL; + foreach(lc, estate->es_plannedstmt->subplans) + { + Plan *subplan = (Plan *) lfirst(lc); + + if (subplan && !subplan->parallel_safe) + subplan = NULL; + pstmt->subplans = lappend(pstmt->subplans, subplan); + } + + pstmt->rewindPlanIDs = NULL; + pstmt->rowMarks = NIL; + pstmt->relationOids = NIL; + pstmt->invalItems = NIL; /* workers can't replan anyway... */ + pstmt->paramExecTypes = estate->es_plannedstmt->paramExecTypes; + pstmt->utilityStmt = NULL; + pstmt->stmt_location = -1; + pstmt->stmt_len = -1; + + /* Return serialized copy of our dummy PlannedStmt. */ + return nodeToString(pstmt); +} + +/* + * Parallel-aware plan nodes (and occasionally others) may need some state + * which is shared across all parallel workers. Before we size the DSM, give + * them a chance to call shm_toc_estimate_chunk or shm_toc_estimate_keys on + * &pcxt->estimator. + * + * While we're at it, count the number of PlanState nodes in the tree, so + * we know how many Instrumentation structures we need. + */ +static bool +ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) +{ + if (planstate == NULL) + return false; + + /* Count this node. */ + e->nnodes++; + + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanEstimate((SeqScanState *) planstate, + e->pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanEstimate((IndexScanState *) planstate, + e->pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanEstimate((IndexOnlyScanState *) planstate, + e->pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanEstimate((ForeignScanState *) planstate, + e->pcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendEstimate((AppendState *) planstate, + e->pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanEstimate((CustomScanState *) planstate, + e->pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate, + e->pcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinEstimate((HashJoinState *) planstate, + e->pcxt); + break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashEstimate((HashState *) planstate, e->pcxt); + break; + case T_SortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSortEstimate((SortState *) planstate, e->pcxt); + break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt); + break; + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggEstimate((AggState *) planstate, e->pcxt); + break; + case T_MemoizeState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecMemoizeEstimate((MemoizeState *) planstate, e->pcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelEstimate, e); +} + +/* + * Estimate the amount of space required to serialize the indicated parameters. 
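+ *
+ * The computed size must track the format written by
+ * SerializeParamExecParams below: a leading 4-byte parameter count,
+ * then for each parameter a 4-byte paramid followed by the datum as
+ * laid out by datumSerialize().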
+ */ +static Size +EstimateParamExecSpace(EState *estate, Bitmapset *params) +{ + int paramid; + Size sz = sizeof(int); + + paramid = -1; + while ((paramid = bms_next_member(params, paramid)) >= 0) + { + Oid typeOid; + int16 typLen; + bool typByVal; + ParamExecData *prm; + + prm = &(estate->es_param_exec_vals[paramid]); + typeOid = list_nth_oid(estate->es_plannedstmt->paramExecTypes, + paramid); + + sz = add_size(sz, sizeof(int)); /* space for paramid */ + + /* space for datum/isnull */ + if (OidIsValid(typeOid)) + get_typlenbyval(typeOid, &typLen, &typByVal); + else + { + /* If no type OID, assume by-value, like copyParamList does. */ + typLen = sizeof(Datum); + typByVal = true; + } + sz = add_size(sz, + datumEstimateSpace(prm->value, prm->isnull, + typByVal, typLen)); + } + return sz; +} + +/* + * Serialize specified PARAM_EXEC parameters. + * + * We write the number of parameters first, as a 4-byte integer, and then + * write details for each parameter in turn. The details for each parameter + * consist of a 4-byte paramid (location of param in execution time internal + * parameter array) and then the datum as serialized by datumSerialize(). + */ +static dsa_pointer +SerializeParamExecParams(EState *estate, Bitmapset *params, dsa_area *area) +{ + Size size; + int nparams; + int paramid; + ParamExecData *prm; + dsa_pointer handle; + char *start_address; + + /* Allocate enough space for the current parameter values. */ + size = EstimateParamExecSpace(estate, params); + handle = dsa_allocate(area, size); + start_address = dsa_get_address(area, handle); + + /* First write the number of parameters as a 4-byte integer. */ + nparams = bms_num_members(params); + memcpy(start_address, &nparams, sizeof(int)); + start_address += sizeof(int); + + /* Write details for each parameter in turn. */ + paramid = -1; + while ((paramid = bms_next_member(params, paramid)) >= 0) + { + Oid typeOid; + int16 typLen; + bool typByVal; + + prm = &(estate->es_param_exec_vals[paramid]); + typeOid = list_nth_oid(estate->es_plannedstmt->paramExecTypes, + paramid); + + /* Write paramid. */ + memcpy(start_address, ¶mid, sizeof(int)); + start_address += sizeof(int); + + /* Write datum/isnull */ + if (OidIsValid(typeOid)) + get_typlenbyval(typeOid, &typLen, &typByVal); + else + { + /* If no type OID, assume by-value, like copyParamList does. */ + typLen = sizeof(Datum); + typByVal = true; + } + datumSerialize(prm->value, prm->isnull, typByVal, typLen, + &start_address); + } + + return handle; +} + +/* + * Restore specified PARAM_EXEC parameters. + */ +static void +RestoreParamExecParams(char *start_address, EState *estate) +{ + int nparams; + int i; + int paramid; + + memcpy(&nparams, start_address, sizeof(int)); + start_address += sizeof(int); + + for (i = 0; i < nparams; i++) + { + ParamExecData *prm; + + /* Read paramid */ + memcpy(¶mid, start_address, sizeof(int)); + start_address += sizeof(int); + prm = &(estate->es_param_exec_vals[paramid]); + + /* Read datum/isnull. */ + prm->value = datumRestore(&start_address, &prm->isnull); + prm->execPlan = NULL; + } +} + +/* + * Initialize the dynamic shared memory segment that will be used to control + * parallel execution. + */ +static bool +ExecParallelInitializeDSM(PlanState *planstate, + ExecParallelInitializeDSMContext *d) +{ + if (planstate == NULL) + return false; + + /* If instrumentation is enabled, initialize slot for this node. 
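+ *
+ * The slot index is d->nnodes, i.e. nodes are recorded in the same
+ * depth-first order in which ExecParallelEstimate counted them, which
+ * keeps the two node counts consistent for the cross-check in
+ * ExecInitParallelPlan.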
*/ + if (d->instrumentation != NULL) + d->instrumentation->plan_node_id[d->nnodes] = + planstate->plan->plan_node_id; + + /* Count this node. */ + d->nnodes++; + + /* + * Call initializers for DSM-using plan nodes. + * + * Most plan nodes won't do anything here, but plan nodes that allocated + * DSM may need to initialize shared state in the DSM before parallel + * workers are launched. They can allocate the space they previously + * estimated using shm_toc_allocate, and add the keys they previously + * estimated using shm_toc_insert, in each case targeting pcxt->toc. + */ + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanInitializeDSM((SeqScanState *) planstate, + d->pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanInitializeDSM((IndexScanState *) planstate, + d->pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanInitializeDSM((IndexOnlyScanState *) planstate, + d->pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanInitializeDSM((ForeignScanState *) planstate, + d->pcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendInitializeDSM((AppendState *) planstate, + d->pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanInitializeDSM((CustomScanState *) planstate, + d->pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, + d->pcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinInitializeDSM((HashJoinState *) planstate, + d->pcxt); + break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeDSM((HashState *) planstate, d->pcxt); + break; + case T_SortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSortInitializeDSM((SortState *) planstate, d->pcxt); + break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt); + break; + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeDSM((AggState *) planstate, d->pcxt); + break; + case T_MemoizeState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecMemoizeInitializeDSM((MemoizeState *) planstate, d->pcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); +} + +/* + * It sets up the response queues for backend workers to return tuples + * to the main backend and start the workers. + */ +static shm_mq_handle ** +ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize) +{ + shm_mq_handle **responseq; + char *tqueuespace; + int i; + + /* Skip this if no workers. */ + if (pcxt->nworkers == 0) + return NULL; + + /* Allocate memory for shared memory queue handles. */ + responseq = (shm_mq_handle **) + palloc(pcxt->nworkers * sizeof(shm_mq_handle *)); + + /* + * If not reinitializing, allocate space from the DSM for the queues; + * otherwise, find the already allocated space. 
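+ *
+ * Each worker gets its own PARALLEL_TUPLE_QUEUE_SIZE bytes of this
+ * space for its queue; the leader attaches to every queue as the
+ * receiver.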
+ */ + if (!reinitialize) + tqueuespace = + shm_toc_allocate(pcxt->toc, + mul_size(PARALLEL_TUPLE_QUEUE_SIZE, + pcxt->nworkers)); + else + tqueuespace = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_TUPLE_QUEUE, false); + + /* Create the queues, and become the receiver for each. */ + for (i = 0; i < pcxt->nworkers; ++i) + { + shm_mq *mq; + + mq = shm_mq_create(tqueuespace + + ((Size) i) * PARALLEL_TUPLE_QUEUE_SIZE, + (Size) PARALLEL_TUPLE_QUEUE_SIZE); + + shm_mq_set_receiver(mq, MyProc); + responseq[i] = shm_mq_attach(mq, pcxt->seg, NULL); + } + + /* Add array of queues to shm_toc, so others can find it. */ + if (!reinitialize) + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLE_QUEUE, tqueuespace); + + /* Return array of handles. */ + return responseq; +} + +/* + * Sets up the required infrastructure for backend workers to perform + * execution and return results to the main backend. + */ +ParallelExecutorInfo * +ExecInitParallelPlan(PlanState *planstate, EState *estate, + Bitmapset *sendParams, int nworkers, + int64 tuples_needed) +{ + ParallelExecutorInfo *pei; + ParallelContext *pcxt; + ExecParallelEstimateContext e; + ExecParallelInitializeDSMContext d; + FixedParallelExecutorState *fpes; + char *pstmt_data; + char *pstmt_space; + char *paramlistinfo_space; + BufferUsage *bufusage_space; + WalUsage *walusage_space; + SharedExecutorInstrumentation *instrumentation = NULL; + SharedJitInstrumentation *jit_instrumentation = NULL; + int pstmt_len; + int paramlistinfo_len; + int instrumentation_len = 0; + int jit_instrumentation_len = 0; + int instrument_offset = 0; + Size dsa_minsize = dsa_minimum_size(); + char *query_string; + int query_len; + + /* + * Force any initplan outputs that we're going to pass to workers to be + * evaluated, if they weren't already. + * + * For simplicity, we use the EState's per-output-tuple ExprContext here. + * That risks intra-query memory leakage, since we might pass through here + * many times before that ExprContext gets reset; but ExecSetParamPlan + * doesn't normally leak any memory in the context (see its comments), so + * it doesn't seem worth complicating this function's API to pass it a + * shorter-lived ExprContext. This might need to change someday. + */ + ExecSetParamPlanMulti(sendParams, GetPerTupleExprContext(estate)); + + /* Allocate object for return value. */ + pei = palloc0(sizeof(ParallelExecutorInfo)); + pei->finished = false; + pei->planstate = planstate; + + /* Fix up and serialize plan to be sent to workers. */ + pstmt_data = ExecSerializePlan(planstate->plan, estate); + + /* Create a parallel context. */ + pcxt = CreateParallelContext("postgres", "ParallelQueryMain", nworkers); + pei->pcxt = pcxt; + + /* + * Before telling the parallel context to create a dynamic shared memory + * segment, we need to figure out how big it should be. Estimate space + * for the various things we need to store. + */ + + /* Estimate space for fixed-size state. */ + shm_toc_estimate_chunk(&pcxt->estimator, + sizeof(FixedParallelExecutorState)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for query text. */ + query_len = strlen(estate->es_sourceText); + shm_toc_estimate_chunk(&pcxt->estimator, query_len + 1); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for serialized PlannedStmt. */ + pstmt_len = strlen(pstmt_data) + 1; + shm_toc_estimate_chunk(&pcxt->estimator, pstmt_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for serialized ParamListInfo. 
*/ + paramlistinfo_len = EstimateParamListSpace(estate->es_param_list_info); + shm_toc_estimate_chunk(&pcxt->estimator, paramlistinfo_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Estimate space for BufferUsage. + * + * If EXPLAIN is not in use and there are no extensions loaded that care, + * we could skip this. But we have no way of knowing whether anyone's + * looking at pgBufferUsage, so do it unconditionally. + */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Same thing for WalUsage. + */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for tuple queues. */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(PARALLEL_TUPLE_QUEUE_SIZE, pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Give parallel-aware nodes a chance to add to the estimates, and get a + * count of how many PlanState nodes there are. + */ + e.pcxt = pcxt; + e.nnodes = 0; + ExecParallelEstimate(planstate, &e); + + /* Estimate space for instrumentation, if required. */ + if (estate->es_instrument) + { + instrumentation_len = + offsetof(SharedExecutorInstrumentation, plan_node_id) + + sizeof(int) * e.nnodes; + instrumentation_len = MAXALIGN(instrumentation_len); + instrument_offset = instrumentation_len; + instrumentation_len += + mul_size(sizeof(Instrumentation), + mul_size(e.nnodes, nworkers)); + shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for JIT instrumentation, if required. */ + if (estate->es_jit_flags != PGJIT_NONE) + { + jit_instrumentation_len = + offsetof(SharedJitInstrumentation, jit_instr) + + sizeof(JitInstrumentation) * nworkers; + shm_toc_estimate_chunk(&pcxt->estimator, jit_instrumentation_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + } + + /* Estimate space for DSA area. */ + shm_toc_estimate_chunk(&pcxt->estimator, dsa_minsize); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Everyone's had a chance to ask for space, so now create the DSM. */ + InitializeParallelDSM(pcxt); + + /* + * OK, now we have a dynamic shared memory segment, and it should be big + * enough to store all of the data we estimated we would want to put into + * it, plus whatever general stuff (not specifically executor-related) the + * ParallelContext itself needs to store there. None of the space we + * asked for has been allocated or initialized yet, though, so do that. + */ + + /* Store fixed-size state. */ + fpes = shm_toc_allocate(pcxt->toc, sizeof(FixedParallelExecutorState)); + fpes->tuples_needed = tuples_needed; + fpes->param_exec = InvalidDsaPointer; + fpes->eflags = estate->es_top_eflags; + fpes->jit_flags = estate->es_jit_flags; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, fpes); + + /* Store query string */ + query_string = shm_toc_allocate(pcxt->toc, query_len + 1); + memcpy(query_string, estate->es_sourceText, query_len + 1); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, query_string); + + /* Store serialized PlannedStmt. */ + pstmt_space = shm_toc_allocate(pcxt->toc, pstmt_len); + memcpy(pstmt_space, pstmt_data, pstmt_len); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_PLANNEDSTMT, pstmt_space); + + /* Store serialized ParamListInfo. 
*/ + paramlistinfo_space = shm_toc_allocate(pcxt->toc, paramlistinfo_len); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_PARAMLISTINFO, paramlistinfo_space); + SerializeParamList(estate->es_param_list_info, ¶mlistinfo_space); + + /* Allocate space for each worker's BufferUsage; no need to initialize. */ + bufusage_space = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufusage_space); + pei->buffer_usage = bufusage_space; + + /* Same for WalUsage. */ + walusage_space = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage_space); + pei->wal_usage = walusage_space; + + /* Set up the tuple queues that the workers will write into. */ + pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false); + + /* We don't need the TupleQueueReaders yet, though. */ + pei->reader = NULL; + + /* + * If instrumentation options were supplied, allocate space for the data. + * It only gets partially initialized here; the rest happens during + * ExecParallelInitializeDSM. + */ + if (estate->es_instrument) + { + Instrumentation *instrument; + int i; + + instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); + instrumentation->instrument_options = estate->es_instrument; + instrumentation->instrument_offset = instrument_offset; + instrumentation->num_workers = nworkers; + instrumentation->num_plan_nodes = e.nnodes; + instrument = GetInstrumentationArray(instrumentation); + for (i = 0; i < nworkers * e.nnodes; ++i) + InstrInit(&instrument[i], estate->es_instrument); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, + instrumentation); + pei->instrumentation = instrumentation; + + if (estate->es_jit_flags != PGJIT_NONE) + { + jit_instrumentation = shm_toc_allocate(pcxt->toc, + jit_instrumentation_len); + jit_instrumentation->num_workers = nworkers; + memset(jit_instrumentation->jit_instr, 0, + sizeof(JitInstrumentation) * nworkers); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_JIT_INSTRUMENTATION, + jit_instrumentation); + pei->jit_instrumentation = jit_instrumentation; + } + } + + /* + * Create a DSA area that can be used by the leader and all workers. + * (However, if we failed to create a DSM and are using private memory + * instead, then skip this.) + */ + if (pcxt->seg != NULL) + { + char *area_space; + + area_space = shm_toc_allocate(pcxt->toc, dsa_minsize); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_DSA, area_space); + pei->area = dsa_create_in_place(area_space, dsa_minsize, + LWTRANCHE_PARALLEL_QUERY_DSA, + pcxt->seg); + + /* + * Serialize parameters, if any, using DSA storage. We don't dare use + * the main parallel query DSM for this because we might relaunch + * workers after the values have changed (and thus the amount of + * storage required has changed). + */ + if (!bms_is_empty(sendParams)) + { + pei->param_exec = SerializeParamExecParams(estate, sendParams, + pei->area); + fpes->param_exec = pei->param_exec; + } + } + + /* + * Give parallel-aware nodes a chance to initialize their shared data. + * This also initializes the elements of instrumentation->ps_instrument, + * if it exists. + */ + d.pcxt = pcxt; + d.instrumentation = instrumentation; + d.nnodes = 0; + + /* Install our DSA area while initializing the plan. */ + estate->es_query_dsa = pei->area; + ExecParallelInitializeDSM(planstate, &d); + estate->es_query_dsa = NULL; + + /* + * Make sure that the world hasn't shifted under our feet. 
This could + * probably just be an Assert(), but let's be conservative for now. + */ + if (e.nnodes != d.nnodes) + elog(ERROR, "inconsistent count of PlanState nodes"); + + /* OK, we're ready to rock and roll. */ + return pei; +} + +/* + * Set up tuple queue readers to read the results of a parallel subplan. + * + * This is separate from ExecInitParallelPlan() because we can launch the + * worker processes and let them start doing something before we do this. + */ +void +ExecParallelCreateReaders(ParallelExecutorInfo *pei) +{ + int nworkers = pei->pcxt->nworkers_launched; + int i; + + Assert(pei->reader == NULL); + + if (nworkers > 0) + { + pei->reader = (TupleQueueReader **) + palloc(nworkers * sizeof(TupleQueueReader *)); + + for (i = 0; i < nworkers; i++) + { + shm_mq_set_handle(pei->tqueue[i], + pei->pcxt->worker[i].bgwhandle); + pei->reader[i] = CreateTupleQueueReader(pei->tqueue[i]); + } + } +} + +/* + * Re-initialize the parallel executor shared memory state before launching + * a fresh batch of workers. + */ +void +ExecParallelReinitialize(PlanState *planstate, + ParallelExecutorInfo *pei, + Bitmapset *sendParams) +{ + EState *estate = planstate->state; + FixedParallelExecutorState *fpes; + + /* Old workers must already be shut down */ + Assert(pei->finished); + + /* + * Force any initplan outputs that we're going to pass to workers to be + * evaluated, if they weren't already (see comments in + * ExecInitParallelPlan). + */ + ExecSetParamPlanMulti(sendParams, GetPerTupleExprContext(estate)); + + ReinitializeParallelDSM(pei->pcxt); + pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true); + pei->reader = NULL; + pei->finished = false; + + fpes = shm_toc_lookup(pei->pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, false); + + /* Free any serialized parameters from the last round. */ + if (DsaPointerIsValid(fpes->param_exec)) + { + dsa_free(pei->area, fpes->param_exec); + fpes->param_exec = InvalidDsaPointer; + } + + /* Serialize current parameter values if required. */ + if (!bms_is_empty(sendParams)) + { + pei->param_exec = SerializeParamExecParams(estate, sendParams, + pei->area); + fpes->param_exec = pei->param_exec; + } + + /* Traverse plan tree and let each child node reset associated state. */ + estate->es_query_dsa = pei->area; + ExecParallelReInitializeDSM(planstate, pei->pcxt); + estate->es_query_dsa = NULL; +} + +/* + * Traverse plan tree to reinitialize per-node dynamic shared memory state + */ +static bool +ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt) +{ + if (planstate == NULL) + return false; + + /* + * Call reinitializers for DSM-using plan nodes. 
+ */ + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanReInitializeDSM((SeqScanState *) planstate, + pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanReInitializeDSM((IndexScanState *) planstate, + pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanReInitializeDSM((IndexOnlyScanState *) planstate, + pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanReInitializeDSM((ForeignScanState *) planstate, + pcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendReInitializeDSM((AppendState *) planstate, pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanReInitializeDSM((CustomScanState *) planstate, + pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate, + pcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinReInitializeDSM((HashJoinState *) planstate, + pcxt); + break; + case T_HashState: + case T_SortState: + case T_IncrementalSortState: + case T_MemoizeState: + /* these nodes have DSM state, but no reinitialization is required */ + break; + + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelReInitializeDSM, pcxt); +} + +/* + * Copy instrumentation information about this node and its descendants from + * dynamic shared memory. + */ +static bool +ExecParallelRetrieveInstrumentation(PlanState *planstate, + SharedExecutorInstrumentation *instrumentation) +{ + Instrumentation *instrument; + int i; + int n; + int ibytes; + int plan_node_id = planstate->plan->plan_node_id; + MemoryContext oldcontext; + + /* Find the instrumentation for this node. */ + for (i = 0; i < instrumentation->num_plan_nodes; ++i) + if (instrumentation->plan_node_id[i] == plan_node_id) + break; + if (i >= instrumentation->num_plan_nodes) + elog(ERROR, "plan node %d not found", plan_node_id); + + /* Accumulate the statistics from all workers. */ + instrument = GetInstrumentationArray(instrumentation); + instrument += i * instrumentation->num_workers; + for (n = 0; n < instrumentation->num_workers; ++n) + InstrAggNode(planstate->instrument, &instrument[n]); + + /* + * Also store the per-worker detail. + * + * Worker instrumentation should be allocated in the same context as the + * regular instrumentation information, which is the per-query context. + * Switch into per-query memory context. + */ + oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); + ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + planstate->worker_instrument = + palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); + MemoryContextSwitchTo(oldcontext); + + planstate->worker_instrument->num_workers = instrumentation->num_workers; + memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); + + /* Perform any node-type-specific work that needs to be done. 
*/ + switch (nodeTag(planstate)) + { + case T_SortState: + ExecSortRetrieveInstrumentation((SortState *) planstate); + break; + case T_IncrementalSortState: + ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate); + break; + case T_HashState: + ExecHashRetrieveInstrumentation((HashState *) planstate); + break; + case T_AggState: + ExecAggRetrieveInstrumentation((AggState *) planstate); + break; + case T_MemoizeState: + ExecMemoizeRetrieveInstrumentation((MemoizeState *) planstate); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation, + instrumentation); +} + +/* + * Add up the workers' JIT instrumentation from dynamic shared memory. + */ +static void +ExecParallelRetrieveJitInstrumentation(PlanState *planstate, + SharedJitInstrumentation *shared_jit) +{ + JitInstrumentation *combined; + int ibytes; + + int n; + + /* + * Accumulate worker JIT instrumentation into the combined JIT + * instrumentation, allocating it if required. + */ + if (!planstate->state->es_jit_worker_instr) + planstate->state->es_jit_worker_instr = + MemoryContextAllocZero(planstate->state->es_query_cxt, sizeof(JitInstrumentation)); + combined = planstate->state->es_jit_worker_instr; + + /* Accumulate all the workers' instrumentations. */ + for (n = 0; n < shared_jit->num_workers; ++n) + InstrJitAgg(combined, &shared_jit->jit_instr[n]); + + /* + * Store the per-worker detail. + * + * Similar to ExecParallelRetrieveInstrumentation(), allocate the + * instrumentation in per-query context. + */ + ibytes = offsetof(SharedJitInstrumentation, jit_instr) + + mul_size(shared_jit->num_workers, sizeof(JitInstrumentation)); + planstate->worker_jit_instrument = + MemoryContextAlloc(planstate->state->es_query_cxt, ibytes); + + memcpy(planstate->worker_jit_instrument, shared_jit, ibytes); +} + +/* + * Finish parallel execution. We wait for parallel workers to finish, and + * accumulate their buffer/WAL usage. + */ +void +ExecParallelFinish(ParallelExecutorInfo *pei) +{ + int nworkers = pei->pcxt->nworkers_launched; + int i; + + /* Make this be a no-op if called twice in a row. */ + if (pei->finished) + return; + + /* + * Detach from tuple queues ASAP, so that any still-active workers will + * notice that no further results are wanted. + */ + if (pei->tqueue != NULL) + { + for (i = 0; i < nworkers; i++) + shm_mq_detach(pei->tqueue[i]); + pfree(pei->tqueue); + pei->tqueue = NULL; + } + + /* + * While we're waiting for the workers to finish, let's get rid of the + * tuple queue readers. (Any other local cleanup could be done here too.) + */ + if (pei->reader != NULL) + { + for (i = 0; i < nworkers; i++) + DestroyTupleQueueReader(pei->reader[i]); + pfree(pei->reader); + pei->reader = NULL; + } + + /* Now wait for the workers to finish. */ + WaitForParallelWorkersToFinish(pei->pcxt); + + /* + * Next, accumulate buffer/WAL usage. (This must wait for the workers to + * finish, or we might get incomplete data.) + */ + for (i = 0; i < nworkers; i++) + InstrAccumParallelQuery(&pei->buffer_usage[i], &pei->wal_usage[i]); + + pei->finished = true; +} + +/* + * Accumulate instrumentation, and then clean up whatever ParallelExecutorInfo + * resources still exist after ExecParallelFinish. We separate these + * routines because someone might want to examine the contents of the DSM + * after ExecParallelFinish and before calling this routine. + */ +void +ExecParallelCleanup(ParallelExecutorInfo *pei) +{ + /* Accumulate instrumentation, if any. 
*/ + if (pei->instrumentation) + ExecParallelRetrieveInstrumentation(pei->planstate, + pei->instrumentation); + + /* Accumulate JIT instrumentation, if any. */ + if (pei->jit_instrumentation) + ExecParallelRetrieveJitInstrumentation(pei->planstate, + pei->jit_instrumentation); + + /* Free any serialized parameters. */ + if (DsaPointerIsValid(pei->param_exec)) + { + dsa_free(pei->area, pei->param_exec); + pei->param_exec = InvalidDsaPointer; + } + if (pei->area != NULL) + { + dsa_detach(pei->area); + pei->area = NULL; + } + if (pei->pcxt != NULL) + { + DestroyParallelContext(pei->pcxt); + pei->pcxt = NULL; + } + pfree(pei); +} + +/* + * Create a DestReceiver to write tuples we produce to the shm_mq designated + * for that purpose. + */ +static DestReceiver * +ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc) +{ + char *mqspace; + shm_mq *mq; + + mqspace = shm_toc_lookup(toc, PARALLEL_KEY_TUPLE_QUEUE, false); + mqspace += ParallelWorkerNumber * PARALLEL_TUPLE_QUEUE_SIZE; + mq = (shm_mq *) mqspace; + shm_mq_set_sender(mq, MyProc); + return CreateTupleQueueDestReceiver(shm_mq_attach(mq, seg, NULL)); +} + +/* + * Create a QueryDesc for the PlannedStmt we are to execute, and return it. + */ +static QueryDesc * +ExecParallelGetQueryDesc(shm_toc *toc, DestReceiver *receiver, + int instrument_options) +{ + char *pstmtspace; + char *paramspace; + PlannedStmt *pstmt; + ParamListInfo paramLI; + char *queryString; + + /* Get the query string from shared memory */ + queryString = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false); + + /* Reconstruct leader-supplied PlannedStmt. */ + pstmtspace = shm_toc_lookup(toc, PARALLEL_KEY_PLANNEDSTMT, false); + pstmt = (PlannedStmt *) stringToNode(pstmtspace); + + /* Reconstruct ParamListInfo. */ + paramspace = shm_toc_lookup(toc, PARALLEL_KEY_PARAMLISTINFO, false); + paramLI = RestoreParamList(&paramspace); + + /* Create a QueryDesc for the query. */ + return CreateQueryDesc(pstmt, + queryString, + GetActiveSnapshot(), InvalidSnapshot, + receiver, paramLI, NULL, instrument_options); +} + +/* + * Copy instrumentation information from this node and its descendants into + * dynamic shared memory, so that the parallel leader can retrieve it. + */ +static bool +ExecParallelReportInstrumentation(PlanState *planstate, + SharedExecutorInstrumentation *instrumentation) +{ + int i; + int plan_node_id = planstate->plan->plan_node_id; + Instrumentation *instrument; + + InstrEndLoop(planstate->instrument); + + /* + * If we shuffled the plan_node_id values in ps_instrument into sorted + * order, we could use binary search here. This might matter someday if + * we're pushing down sufficiently large plan trees. For now, do it the + * slow, dumb way. + */ + for (i = 0; i < instrumentation->num_plan_nodes; ++i) + if (instrumentation->plan_node_id[i] == plan_node_id) + break; + if (i >= instrumentation->num_plan_nodes) + elog(ERROR, "plan node %d not found", plan_node_id); + + /* + * Add our statistics to the per-node, per-worker totals. It's possible + * that this could happen more than once if we relaunched workers.
+ */ + instrument = GetInstrumentationArray(instrumentation); + instrument += i * instrumentation->num_workers; + Assert(IsParallelWorker()); + Assert(ParallelWorkerNumber < instrumentation->num_workers); + InstrAggNode(&instrument[ParallelWorkerNumber], planstate->instrument); + + return planstate_tree_walker(planstate, ExecParallelReportInstrumentation, + instrumentation); +} + +/* + * Initialize the PlanState and its descendants with the information + * retrieved from shared memory. This has to be done once the PlanState + * is allocated and initialized by executor; that is, after ExecutorStart(). + */ +static bool +ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) +{ + if (planstate == NULL) + return false; + + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanInitializeWorker((IndexScanState *) planstate, + pwcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, + pwcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanInitializeWorker((ForeignScanState *) planstate, + pwcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendInitializeWorker((AppendState *) planstate, pwcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanInitializeWorker((CustomScanState *) planstate, + pwcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, + pwcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinInitializeWorker((HashJoinState *) planstate, + pwcxt); + break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeWorker((HashState *) planstate, pwcxt); + break; + case T_SortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSortInitializeWorker((SortState *) planstate, pwcxt); + break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate, + pwcxt); + break; + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeWorker((AggState *) planstate, pwcxt); + break; + case T_MemoizeState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecMemoizeInitializeWorker((MemoizeState *) planstate, pwcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelInitializeWorker, + pwcxt); +} + +/* + * Main entrypoint for parallel query worker processes. + * + * We reach this function from ParallelWorkerMain, so the setup necessary to + * create a sensible parallel environment has already been done; + * ParallelWorkerMain worries about stuff like the transaction state, combo + * CID mappings, and GUC values, so we don't need to deal with any of that + * here. + * + * Our job is to deal with concerns specific to the executor. The parallel + * group leader will have stored a serialized PlannedStmt, and it's our job + * to execute that plan and write the resulting tuples to the appropriate + * tuple queue. 
Various bits of supporting information that we need in order + * to do this are also stored in the dsm_segment and can be accessed through + * the shm_toc. + */ +void +ParallelQueryMain(dsm_segment *seg, shm_toc *toc) +{ + FixedParallelExecutorState *fpes; + BufferUsage *buffer_usage; + WalUsage *wal_usage; + DestReceiver *receiver; + QueryDesc *queryDesc; + SharedExecutorInstrumentation *instrumentation; + SharedJitInstrumentation *jit_instrumentation; + int instrument_options = 0; + void *area_space; + dsa_area *area; + ParallelWorkerContext pwcxt; + + /* Get fixed-size state. */ + fpes = shm_toc_lookup(toc, PARALLEL_KEY_EXECUTOR_FIXED, false); + + /* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */ + receiver = ExecParallelGetReceiver(seg, toc); + instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, true); + if (instrumentation != NULL) + instrument_options = instrumentation->instrument_options; + jit_instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_JIT_INSTRUMENTATION, + true); + queryDesc = ExecParallelGetQueryDesc(toc, receiver, instrument_options); + + /* Setting debug_query_string for individual workers */ + debug_query_string = queryDesc->sourceText; + + /* Report workers' query and queryId for monitoring purposes */ + pgstat_report_activity(STATE_RUNNING, debug_query_string); + + /* Attach to the dynamic shared memory area. */ + area_space = shm_toc_lookup(toc, PARALLEL_KEY_DSA, false); + area = dsa_attach_in_place(area_space, seg); + + /* Start up the executor */ + queryDesc->plannedstmt->jitFlags = fpes->jit_flags; + ExecutorStart(queryDesc, fpes->eflags); + + /* Special executor initialization steps for parallel workers */ + queryDesc->planstate->state->es_query_dsa = area; + if (DsaPointerIsValid(fpes->param_exec)) + { + char *paramexec_space; + + paramexec_space = dsa_get_address(area, fpes->param_exec); + RestoreParamExecParams(paramexec_space, queryDesc->estate); + + } + pwcxt.toc = toc; + pwcxt.seg = seg; + ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); + + /* Pass down any tuple bound */ + ExecSetTupleBound(fpes->tuples_needed, queryDesc->planstate); + + /* + * Prepare to track buffer/WAL usage during query execution. + * + * We do this after starting up the executor to match what happens in the + * leader, which also doesn't count buffer accesses and WAL activity that + * occur during executor startup. + */ + InstrStartParallelQuery(); + + /* + * Run the plan. If we specified a tuple bound, be careful not to demand + * more tuples than that. + */ + ExecutorRun(queryDesc, + ForwardScanDirection, + fpes->tuples_needed < 0 ? (int64) 0 : fpes->tuples_needed, + true); + + /* Shut down the executor */ + ExecutorFinish(queryDesc); + + /* Report buffer/WAL usage during parallel execution. */ + buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); + wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); + InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + &wal_usage[ParallelWorkerNumber]); + + /* Report instrumentation data if any instrumentation options are set. 
*/ + if (instrumentation != NULL) + ExecParallelReportInstrumentation(queryDesc->planstate, + instrumentation); + + /* Report JIT instrumentation data if any */ + if (queryDesc->estate->es_jit && jit_instrumentation != NULL) + { + Assert(ParallelWorkerNumber < jit_instrumentation->num_workers); + jit_instrumentation->jit_instr[ParallelWorkerNumber] = + queryDesc->estate->es_jit->instr; + } + + /* Must do this after capturing instrumentation. */ + ExecutorEnd(queryDesc); + + /* Cleanup. */ + dsa_detach(area); + FreeQueryDesc(queryDesc); + receiver->rDestroy(receiver); +} diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c new file mode 100644 index 0000000..606c920 --- /dev/null +++ b/src/backend/executor/execPartition.c @@ -0,0 +1,2107 @@ +/*------------------------------------------------------------------------- + * + * execPartition.c + * Support routines for partitioning. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execPartition.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/table.h" +#include "access/tableam.h" +#include "catalog/partition.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_type.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "foreign/fdwapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "partitioning/partbounds.h" +#include "partitioning/partdesc.h" +#include "partitioning/partprune.h" +#include "rewrite/rewriteManip.h" +#include "utils/acl.h" +#include "utils/lsyscache.h" +#include "utils/partcache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" + + +/*----------------------- + * PartitionTupleRouting - Encapsulates all information required to + * route a tuple inserted into a partitioned table to one of its leaf + * partitions. + * + * partition_root + * The partitioned table that's the target of the command. + * + * partition_dispatch_info + * Array of 'max_dispatch' elements containing a pointer to a + * PartitionDispatch object for every partitioned table touched by tuple + * routing. The entry for the target partitioned table is *always* + * present in the 0th element of this array. See comment for + * PartitionDispatchData->indexes for details on how this array is + * indexed. + * + * nonleaf_partitions + * Array of 'max_dispatch' elements containing pointers to fake + * ResultRelInfo objects for nonleaf partitions, useful for checking + * the partition constraint. + * + * num_dispatch + * The current number of items stored in the 'partition_dispatch_info' + * array. Also serves as the index of the next free array element for + * new PartitionDispatch objects that need to be stored. + * + * max_dispatch + * The current allocated size of the 'partition_dispatch_info' array. + * + * partitions + * Array of 'max_partitions' elements containing a pointer to a + * ResultRelInfo for every leaf partition touched by tuple routing. + * Some of these are pointers to ResultRelInfos which are borrowed out of + * the owning ModifyTableState node. The remainder have been built + * especially for tuple routing. See comment for + * PartitionDispatchData->indexes for details on how this array is + * indexed. 
+ * + * is_borrowed_rel + * Array of 'max_partitions' booleans recording whether a given entry + * in 'partitions' is a ResultRelInfo pointer borrowed from the owning + * ModifyTableState node, rather than being built here. + * + * num_partitions + * The current number of items stored in the 'partitions' array. Also + * serves as the index of the next free array element for new + * ResultRelInfo objects that need to be stored. + * + * max_partitions + * The current allocated size of the 'partitions' array. + * + * memcxt + * Memory context used to allocate subsidiary structs. + *----------------------- + */ +struct PartitionTupleRouting +{ + Relation partition_root; + PartitionDispatch *partition_dispatch_info; + ResultRelInfo **nonleaf_partitions; + int num_dispatch; + int max_dispatch; + ResultRelInfo **partitions; + bool *is_borrowed_rel; + int num_partitions; + int max_partitions; + MemoryContext memcxt; +}; + +/*----------------------- + * PartitionDispatch - information about one partitioned table in a partition + * hierarchy required to route a tuple to any of its partitions. A + * PartitionDispatch is always encapsulated inside a PartitionTupleRouting + * struct and stored inside its 'partition_dispatch_info' array. + * + * reldesc + * Relation descriptor of the table + * + * key + * Partition key information of the table + * + * keystate + * Execution state required for expressions in the partition key + * + * partdesc + * Partition descriptor of the table + * + * tupslot + * A standalone TupleTableSlot initialized with this table's tuple + * descriptor, or NULL if no tuple conversion between the parent is + * required. + * + * tupmap + * TupleConversionMap to convert from the parent's rowtype to this table's + * rowtype (when extracting the partition key of a tuple just before + * routing it through this table). A NULL value is stored if no tuple + * conversion is required. + * + * indexes + * Array of partdesc->nparts elements. For leaf partitions the index + * corresponds to the partition's ResultRelInfo in the encapsulating + * PartitionTupleRouting's partitions array. For partitioned partitions, + * the index corresponds to the PartitionDispatch for it in its + * partition_dispatch_info array. -1 indicates we've not yet allocated + * anything in PartitionTupleRouting for the partition. 
+ *----------------------- + */ +typedef struct PartitionDispatchData +{ + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + AttrMap *tupmap; + int indexes[FLEXIBLE_ARRAY_MEMBER]; +} PartitionDispatchData; + + +static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, + EState *estate, PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *rootResultRelInfo, + int partidx); +static void ExecInitRoutingInfo(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *partRelInfo, + int partidx, + bool is_borrowed_rel); +static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate, + PartitionTupleRouting *proute, + Oid partoid, PartitionDispatch parent_pd, + int partidx, ResultRelInfo *rootResultRelInfo); +static void FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static int get_partition_for_tuple(PartitionDispatch pd, Datum *values, + bool *isnull); +static char *ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen); +static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri); +static void ExecInitPruningContext(PartitionPruneContext *context, + List *pruning_steps, + PartitionDesc partdesc, + PartitionKey partkey, + PlanState *planstate); +static void find_matching_subplans_recurse(PartitionPruningData *prunedata, + PartitionedRelPruningData *pprune, + bool initial_prune, + Bitmapset **validsubplans); + + +/* + * ExecSetupPartitionTupleRouting - sets up information needed during + * tuple routing for partitioned tables, encapsulates it in + * PartitionTupleRouting, and returns it. + * + * Callers must use the returned PartitionTupleRouting during calls to + * ExecFindPartition(). The actual ResultRelInfo for a partition is only + * allocated when the partition is found for the first time. + * + * The current memory context is used to allocate this struct and all + * subsidiary structs that will be allocated from it later on. Typically + * it should be estate->es_query_cxt. + */ +PartitionTupleRouting * +ExecSetupPartitionTupleRouting(EState *estate, Relation rel) +{ + PartitionTupleRouting *proute; + + /* + * Here we attempt to expend as little effort as possible in setting up + * the PartitionTupleRouting. Each partition's ResultRelInfo is built on + * demand, only when we actually need to route a tuple to that partition. + * The reason for this is that a common case is for INSERT to insert a + * single tuple into a partitioned table and this must be fast. + */ + proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); + proute->partition_root = rel; + proute->memcxt = CurrentMemoryContext; + /* Rest of members initialized by zeroing */ + + /* + * Initialize this table's PartitionDispatch object. Here we pass in the + * parent as NULL as we don't need to care about any parent of the target + * partitioned table. + */ + ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel), + NULL, 0, NULL); + + return proute; +} + +/* + * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that + * the tuple contained in *slot should belong to. + * + * If the partition's ResultRelInfo does not yet exist in 'proute' then we set + * one up or reuse one from mtstate's resultRelInfo array. 
When reusing a + * ResultRelInfo from the mtstate we verify that the relation is a valid + * target for INSERTs and initialize tuple routing information. + * + * rootResultRelInfo is the relation named in the query. + * + * estate must be non-NULL; we'll need it to compute any expressions in the + * partition keys. Also, its per-tuple contexts are used as evaluation + * scratch space. + * + * If no leaf partition is found, this routine errors out with the appropriate + * error message. An error may also be raised if the found target partition + * is not a valid target for an INSERT. + */ +ResultRelInfo * +ExecFindPartition(ModifyTableState *mtstate, + ResultRelInfo *rootResultRelInfo, + PartitionTupleRouting *proute, + TupleTableSlot *slot, EState *estate) +{ + PartitionDispatch *pd = proute->partition_dispatch_info; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + Relation rel; + PartitionDispatch dispatch; + PartitionDesc partdesc; + ExprContext *ecxt = GetPerTupleExprContext(estate); + TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple; + TupleTableSlot *rootslot = slot; + TupleTableSlot *myslot = NULL; + MemoryContext oldcxt; + ResultRelInfo *rri = NULL; + + /* use per-tuple context here to avoid leaking memory */ + oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + /* + * First check the root table's partition constraint, if any. No point in + * routing the tuple if it doesn't belong in the root table itself. + */ + if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition) + ExecPartitionCheck(rootResultRelInfo, slot, estate, true); + + /* start with the root partitioned table */ + dispatch = pd[0]; + while (dispatch != NULL) + { + int partidx = -1; + bool is_leaf; + + CHECK_FOR_INTERRUPTS(); + + rel = dispatch->reldesc; + partdesc = dispatch->partdesc; + + /* + * Extract partition key from tuple. Expression evaluation machinery + * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to + * point to the correct tuple slot. The slot might have changed from + * what was used for the parent table if the table of the current + * partitioning level has different tuple descriptor from the parent. + * So update ecxt_scantuple accordingly. + */ + ecxt->ecxt_scantuple = slot; + FormPartitionKeyDatum(dispatch, slot, estate, values, isnull); + + /* + * If this partitioned table has no partitions or no partition for + * these values, error out. + */ + if (partdesc->nparts == 0 || + (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0) + { + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? + errdetail("Partition key of the failing row contains %s.", + val_desc) : 0, + errtable(rel))); + } + + is_leaf = partdesc->is_leaf[partidx]; + if (is_leaf) + { + /* + * We've reached the leaf -- hurray, we're done. Look to see if + * we've already got a ResultRelInfo for this partition. + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* ResultRelInfo already built */ + Assert(dispatch->indexes[partidx] < proute->num_partitions); + rri = proute->partitions[dispatch->indexes[partidx]]; + } + else + { + /* + * If the partition is known in the owning ModifyTableState + * node, we can re-use that ResultRelInfo instead of creating + * a new one with ExecInitPartitionInfo(). 
+ */ + rri = ExecLookupResultRelByOid(mtstate, + partdesc->oids[partidx], + true, false); + if (rri) + { + /* Verify this ResultRelInfo allows INSERTs */ + CheckValidResultRel(rri, CMD_INSERT); + + /* + * Initialize information needed to insert this and + * subsequent tuples routed to this partition. + */ + ExecInitRoutingInfo(mtstate, estate, proute, dispatch, + rri, partidx, true); + } + else + { + /* We need to create a new one. */ + rri = ExecInitPartitionInfo(mtstate, estate, proute, + dispatch, + rootResultRelInfo, partidx); + } + } + Assert(rri != NULL); + + /* Signal to terminate the loop */ + dispatch = NULL; + } + else + { + /* + * Partition is a sub-partitioned table; get the PartitionDispatch + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* Already built. */ + Assert(dispatch->indexes[partidx] < proute->num_dispatch); + + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; + + /* + * Move down to the next partition level and search again + * until we find a leaf partition that matches this tuple + */ + dispatch = pd[dispatch->indexes[partidx]]; + } + else + { + /* Not yet built. Do that now. */ + PartitionDispatch subdispatch; + + /* + * Create the new PartitionDispatch. We pass the current one + * in as the parent PartitionDispatch + */ + subdispatch = ExecInitPartitionDispatchInfo(estate, + proute, + partdesc->oids[partidx], + dispatch, partidx, + mtstate->rootResultRelInfo); + Assert(dispatch->indexes[partidx] >= 0 && + dispatch->indexes[partidx] < proute->num_dispatch); + + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; + dispatch = subdispatch; + } + + /* + * Convert the tuple to the new parent's layout, if different from + * the previous parent. + */ + if (dispatch->tupslot) + { + AttrMap *map = dispatch->tupmap; + TupleTableSlot *tempslot = myslot; + + myslot = dispatch->tupslot; + slot = execute_attr_map_slot(map, slot, myslot); + + if (tempslot != NULL) + ExecClearTuple(tempslot); + } + } + + /* + * If this partition is the default one, we must check its partition + * constraint now, which may have changed concurrently due to + * partitions being added to the parent. + * + * (We do this here, and do not rely on ExecInsert doing it, because + * we don't want to miss doing it for non-leaf partitions.) + */ + if (partidx == partdesc->boundinfo->default_index) + { + /* + * The tuple must match the partition's layout for the constraint + * expression to be evaluated successfully. If the partition is + * sub-partitioned, that would already be the case due to the code + * above, but for a leaf partition the tuple still matches the + * parent's layout. + * + * Note that we have a map to convert from root to current + * partition, but not from immediate parent to current partition. + * So if we have to convert, do it from the root slot; if not, use + * the root slot as-is. + */ + if (is_leaf) + { + TupleConversionMap *map = rri->ri_RootToPartitionMap; + + if (map) + slot = execute_attr_map_slot(map->attrMap, rootslot, + rri->ri_PartitionTupleSlot); + else + slot = rootslot; + } + + ExecPartitionCheck(rri, slot, estate, true); + } + } + + /* Release the tuple in the lowest parent's dedicated slot. */ + if (myslot != NULL) + ExecClearTuple(myslot); + /* and restore ecxt's scantuple */ + ecxt->ecxt_scantuple = ecxt_scantuple_saved; + MemoryContextSwitchTo(oldcxt); + + return rri; +} + +/* + * ExecInitPartitionInfo + * Lock the partition and initialize ResultRelInfo. 
Also setup other + * information for the partition and store it in the next empty slot in + * the proute->partitions array. + * + * Returns the ResultRelInfo + */ +static ResultRelInfo * +ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, + PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *rootResultRelInfo, + int partidx) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Oid partOid = dispatch->partdesc->oids[partidx]; + Relation partrel; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + ResultRelInfo *leaf_part_rri; + MemoryContext oldcxt; + AttrMap *part_attmap = NULL; + bool found_whole_row; + + oldcxt = MemoryContextSwitchTo(proute->memcxt); + + partrel = table_open(partOid, RowExclusiveLock); + + leaf_part_rri = makeNode(ResultRelInfo); + InitResultRelInfo(leaf_part_rri, + partrel, + 0, + rootResultRelInfo, + estate->es_instrument); + + /* + * Verify result relation is a valid target for an INSERT. An UPDATE of a + * partition-key becomes a DELETE+INSERT operation, so this check is still + * required when the operation is CMD_UPDATE. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Open partition indices. The user may have asked to check for conflicts + * within this leaf partition and do "nothing" instead of throwing an + * error. Be prepared in that case by initializing the index information + * needed by ExecInsert() to perform speculative insertions. + */ + if (partrel->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, + (node != NULL && + node->onConflictAction != ONCONFLICT_NONE)); + + /* + * Build WITH CHECK OPTION constraints for the partition. Note that we + * didn't build the withCheckOptionList for partitions within the planner, + * but simple translation of varattnos will suffice. This only occurs for + * the INSERT case or in the case of UPDATE tuple routing where we didn't + * find a result rel to reuse. + */ + if (node && node->withCheckOptionLists != NIL) + { + List *wcoList; + List *wcoExprs = NIL; + ListCell *ll; + + /* + * In the case of INSERT on a partitioned table, there is only one + * plan. Likewise, there is only one WCO list, not one per partition. + * For UPDATE, there are as many WCO lists as there are plans. + */ + Assert((node->operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + list_length(node->resultRelations) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->withCheckOptionLists) == + list_length(node->resultRelations))); + + /* + * Use the WCO list of the first plan as a reference to calculate + * attno's for the WCO list of this partition. In the INSERT case, + * that refers to the root partitioned table, whereas in the UPDATE + * tuple routing case, that refers to the first partition in the + * mtstate->resultRelInfo array. In any case, both that relation and + * this partition should have the same columns, so we should be able + * to map attributes successfully. + */ + wcoList = linitial(node->withCheckOptionLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel)); + wcoList = (List *) + map_variable_attnos((Node *) wcoList, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. 
*/ + + foreach(ll, wcoList) + { + WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + &mtstate->ps); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + leaf_part_rri->ri_WithCheckOptions = wcoList; + leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs; + } + + /* + * Build the RETURNING projection for the partition. Note that we didn't + * build the returningList for partitions within the planner, but simple + * translation of varattnos will suffice. This only occurs for the INSERT + * case or in the case of UPDATE tuple routing where we didn't find a + * result rel to reuse. + */ + if (node && node->returningLists != NIL) + { + TupleTableSlot *slot; + ExprContext *econtext; + List *returningList; + + /* See the comment above for WCO lists. */ + Assert((node->operation == CMD_INSERT && + list_length(node->returningLists) == 1 && + list_length(node->resultRelations) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->returningLists) == + list_length(node->resultRelations))); + + /* + * Use the RETURNING list of the first plan as a reference to + * calculate attno's for the RETURNING list of this partition. See + * the comment above for WCO lists for more details on why this is + * okay. + */ + returningList = linitial(node->returningLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel)); + returningList = (List *) + map_variable_attnos((Node *) returningList, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + + leaf_part_rri->ri_returningList = returningList; + + /* + * Initialize the projection itself. + * + * Use the slot and the expression context that would have been set up + * in ExecInitModifyTable() for projection's output. + */ + Assert(mtstate->ps.ps_ResultTupleSlot != NULL); + slot = mtstate->ps.ps_ResultTupleSlot; + Assert(mtstate->ps.ps_ExprContext != NULL); + econtext = mtstate->ps.ps_ExprContext; + leaf_part_rri->ri_projectReturning = + ExecBuildProjectionInfo(returningList, econtext, slot, + &mtstate->ps, RelationGetDescr(partrel)); + } + + /* Set up information needed for routing tuples to the partition. */ + ExecInitRoutingInfo(mtstate, estate, proute, dispatch, + leaf_part_rri, partidx, false); + + /* + * If there is an ON CONFLICT clause, initialize state for it. + */ + if (node && node->onConflictAction != ONCONFLICT_NONE) + { + TupleDesc partrelDesc = RelationGetDescr(partrel); + ExprContext *econtext = mtstate->ps.ps_ExprContext; + ListCell *lc; + List *arbiterIndexes = NIL; + + /* + * If there is a list of arbiter indexes, map it to a list of indexes + * in the partition. We do that by scanning the partition's index + * list and searching for ancestry relationships to each index in the + * ancestor table. 
+ */ + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0) + { + List *childIdxs; + + childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc); + + foreach(lc, childIdxs) + { + Oid childIdx = lfirst_oid(lc); + List *ancestors; + ListCell *lc2; + + ancestors = get_partition_ancestors(childIdx); + foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes) + { + if (list_member_oid(ancestors, lfirst_oid(lc2))) + arbiterIndexes = lappend_oid(arbiterIndexes, childIdx); + } + list_free(ancestors); + } + } + + /* + * If the resulting lists are of inequal length, something is wrong. + * (This shouldn't happen, since arbiter index selection should not + * pick up an invalid index.) + */ + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != + list_length(arbiterIndexes)) + elog(ERROR, "invalid arbiter index list"); + leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; + + /* + * In the DO UPDATE case, we have some more state to initialize. + */ + if (node->onConflictAction == ONCONFLICT_UPDATE) + { + OnConflictSetState *onconfl = makeNode(OnConflictSetState); + TupleConversionMap *map; + + map = leaf_part_rri->ri_RootToPartitionMap; + + Assert(node->onConflictSet != NIL); + Assert(rootResultRelInfo->ri_onConflict != NULL); + + leaf_part_rri->ri_onConflict = onconfl; + + /* + * Need a separate existing slot for each partition, as the + * partition could be of a different AM, even if the tuple + * descriptors match. + */ + onconfl->oc_Existing = + table_slot_create(leaf_part_rri->ri_RelationDesc, + &mtstate->ps.state->es_tupleTable); + + /* + * If the partition's tuple descriptor matches exactly the root + * parent (the common case), we can re-use most of the parent's ON + * CONFLICT SET state, skipping a bunch of work. Otherwise, we + * need to create state specific to this partition. + */ + if (map == NULL) + { + /* + * It's safe to reuse these from the partition root, as we + * only process one tuple at a time (therefore we won't + * overwrite needed data in slots), and the results of + * projections are independent of the underlying storage. + * Projections and where clauses themselves don't store state + * / are independent of the underlying storage. + */ + onconfl->oc_ProjSlot = + rootResultRelInfo->ri_onConflict->oc_ProjSlot; + onconfl->oc_ProjInfo = + rootResultRelInfo->ri_onConflict->oc_ProjInfo; + onconfl->oc_WhereClause = + rootResultRelInfo->ri_onConflict->oc_WhereClause; + } + else + { + List *onconflset; + List *onconflcols; + bool found_whole_row; + + /* + * Translate expressions in onConflictSet to account for + * different attribute numbers. For that, map partition + * varattnos twice: first to catch the EXCLUDED + * pseudo-relation (INNER_VAR), and second to handle the main + * target relation (firstVarno). + */ + onconflset = copyObject(node->onConflictSet); + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel)); + onconflset = (List *) + map_variable_attnos((Node *) onconflset, + INNER_VAR, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + onconflset = (List *) + map_variable_attnos((Node *) onconflset, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + + /* Finally, adjust the target colnos to match the partition. 
*/ + onconflcols = adjust_partition_colnos(node->onConflictCols, + leaf_part_rri); + + /* create the tuple slot for the UPDATE SET projection */ + onconfl->oc_ProjSlot = + table_slot_create(partrel, + &mtstate->ps.state->es_tupleTable); + + /* build UPDATE SET projection state */ + onconfl->oc_ProjInfo = + ExecBuildUpdateProjection(onconflset, + true, + onconflcols, + partrelDesc, + econtext, + onconfl->oc_ProjSlot, + &mtstate->ps); + + /* + * If there is a WHERE clause, initialize state where it will + * be evaluated, mapping the attribute numbers appropriately. + * As with onConflictSet, we need to map partition varattnos + * to the partition's tupdesc. + */ + if (node->onConflictWhere) + { + List *clause; + + clause = copyObject((List *) node->onConflictWhere); + clause = (List *) + map_variable_attnos((Node *) clause, + INNER_VAR, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + clause = (List *) + map_variable_attnos((Node *) clause, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + onconfl->oc_WhereClause = + ExecInitQual((List *) clause, &mtstate->ps); + } + } + } + } + + /* + * Since we've just initialized this ResultRelInfo, it's not in any list + * attached to the estate as yet. Add it, so that it can be found later. + * + * Note that the entries in this list appear in no predetermined order, + * because partition result rels are initialized as and when they're + * needed. + */ + MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_tuple_routing_result_relations = + lappend(estate->es_tuple_routing_result_relations, + leaf_part_rri); + + MemoryContextSwitchTo(oldcxt); + + return leaf_part_rri; +} + +/* + * ExecInitRoutingInfo + * Set up information needed for translating tuples between root + * partitioned table format and partition format, and keep track of it + * in PartitionTupleRouting. + */ +static void +ExecInitRoutingInfo(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *partRelInfo, + int partidx, + bool is_borrowed_rel) +{ + ResultRelInfo *rootRelInfo = partRelInfo->ri_RootResultRelInfo; + MemoryContext oldcxt; + int rri_index; + + oldcxt = MemoryContextSwitchTo(proute->memcxt); + + /* + * Set up a tuple conversion map to convert a tuple routed to the + * partition from the parent's type to the partition's. + */ + partRelInfo->ri_RootToPartitionMap = + convert_tuples_by_name(RelationGetDescr(rootRelInfo->ri_RelationDesc), + RelationGetDescr(partRelInfo->ri_RelationDesc)); + + /* + * If a partition has a different rowtype than the root parent, initialize + * a slot dedicated to storing this partition's tuples. The slot is used + * for various operations that are applied to tuples after routing, such + * as checking constraints. + */ + if (partRelInfo->ri_RootToPartitionMap != NULL) + { + Relation partrel = partRelInfo->ri_RelationDesc; + + /* + * Initialize the slot itself setting its descriptor to this + * partition's TupleDesc; TupleDesc reference will be released at the + * end of the command. + */ + partRelInfo->ri_PartitionTupleSlot = + table_slot_create(partrel, &estate->es_tupleTable); + } + else + partRelInfo->ri_PartitionTupleSlot = NULL; + + /* + * If the partition is a foreign table, let the FDW init itself for + * routing tuples to the partition. 
+ */ + if (partRelInfo->ri_FdwRoutine != NULL && + partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) + partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo); + + /* + * Determine if the FDW supports batch insert and determine the batch size + * (a FDW may support batching, but it may be disabled for the + * server/table or for this particular query). + * + * If the FDW does not support batching, we set the batch size to 1. + */ + if (mtstate->operation == CMD_INSERT && + partRelInfo->ri_FdwRoutine != NULL && + partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize && + partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert) + partRelInfo->ri_BatchSize = + partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo); + else + partRelInfo->ri_BatchSize = 1; + + Assert(partRelInfo->ri_BatchSize >= 1); + + partRelInfo->ri_CopyMultiInsertBuffer = NULL; + + /* + * Keep track of it in the PartitionTupleRouting->partitions array. + */ + Assert(dispatch->indexes[partidx] == -1); + + rri_index = proute->num_partitions++; + + /* Allocate or enlarge the array, as needed */ + if (proute->num_partitions >= proute->max_partitions) + { + if (proute->max_partitions == 0) + { + proute->max_partitions = 8; + proute->partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * proute->max_partitions); + proute->is_borrowed_rel = (bool *) + palloc(sizeof(bool) * proute->max_partitions); + } + else + { + proute->max_partitions *= 2; + proute->partitions = (ResultRelInfo **) + repalloc(proute->partitions, sizeof(ResultRelInfo *) * + proute->max_partitions); + proute->is_borrowed_rel = (bool *) + repalloc(proute->is_borrowed_rel, sizeof(bool) * + proute->max_partitions); + } + } + + proute->partitions[rri_index] = partRelInfo; + proute->is_borrowed_rel[rri_index] = is_borrowed_rel; + dispatch->indexes[partidx] = rri_index; + + MemoryContextSwitchTo(oldcxt); +} + +/* + * ExecInitPartitionDispatchInfo + * Lock the partitioned table (if not locked already) and initialize + * PartitionDispatch for a partitioned table and store it in the next + * available slot in the proute->partition_dispatch_info array. Also, + * record the index into this array in the parent_pd->indexes[] array in + * the partidx element so that we can properly retrieve the newly created + * PartitionDispatch later. + */ +static PartitionDispatch +ExecInitPartitionDispatchInfo(EState *estate, + PartitionTupleRouting *proute, Oid partoid, + PartitionDispatch parent_pd, int partidx, + ResultRelInfo *rootResultRelInfo) +{ + Relation rel; + PartitionDesc partdesc; + PartitionDispatch pd; + int dispatchidx; + MemoryContext oldcxt; + + /* + * For data modification, it is better that executor does not include + * partitions being detached, except when running in snapshot-isolation + * mode. This means that a read-committed transaction immediately gets a + * "no partition for tuple" error when a tuple is inserted into a + * partition that's being detached concurrently, but a transaction in + * repeatable-read mode can still use such a partition. + */ + if (estate->es_partition_directory == NULL) + estate->es_partition_directory = + CreatePartitionDirectory(estate->es_query_cxt, + !IsolationUsesXactSnapshot()); + + oldcxt = MemoryContextSwitchTo(proute->memcxt); + + /* + * Only sub-partitioned tables need to be locked here. The root + * partitioned table will already have been locked as it's referenced in + * the query's rtable. 
+ */ + if (partoid != RelationGetRelid(proute->partition_root)) + rel = table_open(partoid, RowExclusiveLock); + else + rel = proute->partition_root; + partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel); + + pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) + + partdesc->nparts * sizeof(int)); + pd->reldesc = rel; + pd->key = RelationGetPartitionKey(rel); + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent_pd != NULL) + { + TupleDesc tupdesc = RelationGetDescr(rel); + + /* + * For sub-partitioned tables where the column order differs from its + * direct parent partitioned table, we must store a tuple table slot + * initialized with its tuple descriptor and a tuple conversion map to + * convert a tuple from its parent's rowtype to its own. This is to + * make sure that we are looking at the correct row using the correct + * tuple descriptor when computing its partition key for tuple + * routing. + */ + pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc), + tupdesc); + pd->tupslot = pd->tupmap ? + MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL; + } + else + { + /* Not required for the root partitioned table */ + pd->tupmap = NULL; + pd->tupslot = NULL; + } + + /* + * Initialize with -1 to signify that the corresponding partition's + * ResultRelInfo or PartitionDispatch has not been created yet. + */ + memset(pd->indexes, -1, sizeof(int) * partdesc->nparts); + + /* Track in PartitionTupleRouting for later use */ + dispatchidx = proute->num_dispatch++; + + /* Allocate or enlarge the array, as needed */ + if (proute->num_dispatch >= proute->max_dispatch) + { + if (proute->max_dispatch == 0) + { + proute->max_dispatch = 4; + proute->partition_dispatch_info = (PartitionDispatch *) + palloc(sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * proute->max_dispatch); + } + else + { + proute->max_dispatch *= 2; + proute->partition_dispatch_info = (PartitionDispatch *) + repalloc(proute->partition_dispatch_info, + sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + repalloc(proute->nonleaf_partitions, + sizeof(ResultRelInfo *) * proute->max_dispatch); + } + } + proute->partition_dispatch_info[dispatchidx] = pd; + + /* + * If setting up a PartitionDispatch for a sub-partitioned table, we may + * also need a minimally valid ResultRelInfo for checking the partition + * constraint later; set that up now. + */ + if (parent_pd) + { + ResultRelInfo *rri = makeNode(ResultRelInfo); + + InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0); + proute->nonleaf_partitions[dispatchidx] = rri; + } + else + proute->nonleaf_partitions[dispatchidx] = NULL; + + /* + * Finally, if setting up a PartitionDispatch for a sub-partitioned table, + * install a downlink in the parent to allow quick descent. + */ + if (parent_pd) + { + Assert(parent_pd->indexes[partidx] == -1); + parent_pd->indexes[partidx] = dispatchidx; + } + + MemoryContextSwitchTo(oldcxt); + + return pd; +} + +/* + * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple + * routing. + * + * Close all the partitioned tables, leaf partitions, and their indices. 
+ */ +void +ExecCleanupTupleRouting(ModifyTableState *mtstate, + PartitionTupleRouting *proute) +{ + int i; + + /* + * Remember, proute->partition_dispatch_info[0] corresponds to the root + * partitioned table, which we must not try to close, because it is the + * main target table of the query that will be closed by callers such as + * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root + * partitioned table. + */ + for (i = 1; i < proute->num_dispatch; i++) + { + PartitionDispatch pd = proute->partition_dispatch_info[i]; + + table_close(pd->reldesc, NoLock); + + if (pd->tupslot) + ExecDropSingleTupleTableSlot(pd->tupslot); + } + + for (i = 0; i < proute->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = proute->partitions[i]; + + /* Allow any FDWs to shut down */ + if (resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state, + resultRelInfo); + + /* + * Close it if it's not one of the result relations borrowed from the + * owning ModifyTableState; those will be closed by ExecEndPlan(). + */ + if (proute->is_borrowed_rel[i]) + continue; + + ExecCloseIndices(resultRelInfo); + table_close(resultRelInfo->ri_RelationDesc, NoLock); + } +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. + * + * pd Partition dispatch object of the partitioned table + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * the ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in. + * ---------------- + */ +static void +FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pd->key->partexprs != NIL && pd->keystate == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + } + + partexpr_item = list_head(pd->keystate); + for (i = 0; i < pd->key->partnatts; i++) + { + AttrNumber keycol = pd->key->partattrs[i]; + Datum datum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + datum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull); + partexpr_item = lnext(pd->keystate, partexpr_item); + } + values[i] = datum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * get_partition_for_tuple + * Finds partition of relation which accepts the partition key specified + * in values and isnull + * + * Return value is index of the partition (>= 0 and < partdesc->nparts) if one + * found or -1 if none found. 
+ */ +static int +get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) +{ + int bound_offset; + int part_index = -1; + PartitionKey key = pd->key; + PartitionDesc partdesc = pd->partdesc; + PartitionBoundInfo boundinfo = partdesc->boundinfo; + + /* Route as appropriate based on partitioning strategy. */ + switch (key->strategy) + { + case PARTITION_STRATEGY_HASH: + { + uint64 rowHash; + + rowHash = compute_partition_hash_value(key->partnatts, + key->partsupfunc, + key->partcollation, + values, isnull); + + part_index = boundinfo->indexes[rowHash % boundinfo->nindexes]; + } + break; + + case PARTITION_STRATEGY_LIST: + if (isnull[0]) + { + if (partition_bound_accepts_nulls(boundinfo)) + part_index = boundinfo->null_index; + } + else + { + bool equal = false; + + bound_offset = partition_list_bsearch(key->partsupfunc, + key->partcollation, + boundinfo, + values[0], &equal); + if (bound_offset >= 0 && equal) + part_index = boundinfo->indexes[bound_offset]; + } + break; + + case PARTITION_STRATEGY_RANGE: + { + bool equal = false, + range_partkey_has_null = false; + int i; + + /* + * No range includes NULL, so this will be accepted by the + * default partition if there is one, and otherwise rejected. + */ + for (i = 0; i < key->partnatts; i++) + { + if (isnull[i]) + { + range_partkey_has_null = true; + break; + } + } + + if (!range_partkey_has_null) + { + bound_offset = partition_range_datum_bsearch(key->partsupfunc, + key->partcollation, + boundinfo, + key->partnatts, + values, + &equal); + + /* + * The bound at bound_offset is less than or equal to the + * tuple value, so the bound at offset+1 is the upper + * bound of the partition we're looking for, if there + * actually exists one. + */ + part_index = boundinfo->indexes[bound_offset + 1]; + } + } + break; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + /* + * part_index < 0 means we failed to find a partition of this parent. Use + * the default partition, if there is one. + */ + if (part_index < 0) + part_index = boundinfo->default_index; + + return part_index; +} + +/* + * ExecBuildSlotPartitionKeyDescription + * + * This works very much like BuildIndexValueDescription() and is currently + * used for building error messages when ExecFindPartition() fails to find + * partition for a row. + */ +static char * +ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen) +{ + StringInfoData buf; + PartitionKey key = RelationGetPartitionKey(rel); + int partnatts = get_partition_natts(key); + int i; + Oid relid = RelationGetRelid(rel); + AclResult aclresult; + + if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + /* If the user has table-level access, just go build the description. */ + aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* + * Step through the columns of the partition key and make sure the + * user has SELECT rights on all of them. + */ + for (i = 0; i < partnatts; i++) + { + AttrNumber attnum = get_partition_col_attnum(key, i); + + /* + * If this partition key column is an expression, we return no + * detail rather than try to figure out what column(s) the + * expression includes and if the user has SELECT rights on them. 
+ */ + if (attnum == InvalidAttrNumber || + pg_attribute_aclcheck(relid, attnum, GetUserId(), + ACL_SELECT) != ACLCHECK_OK) + return NULL; + } + } + + initStringInfo(&buf); + appendStringInfo(&buf, "(%s) = (", + pg_get_partkeydef_columns(relid, true)); + + for (i = 0; i < partnatts; i++) + { + char *val; + int vallen; + + if (isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(get_partition_col_typid(key, i), + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, values[i]); + } + + if (i > 0) + appendStringInfoString(&buf, ", "); + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendBinaryStringInfo(&buf, val, vallen); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + + appendStringInfoChar(&buf, ')'); + + return buf.data; +} + +/* + * adjust_partition_colnos + * Adjust the list of UPDATE target column numbers to account for + * attribute differences between the parent and the partition. + */ +static List * +adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri) +{ + List *new_colnos = NIL; + TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri); + AttrMap *attrMap; + ListCell *lc; + + Assert(map != NULL); /* else we shouldn't be here */ + attrMap = map->attrMap; + + foreach(lc, colnos) + { + AttrNumber parentattrno = lfirst_int(lc); + + if (parentattrno <= 0 || + parentattrno > attrMap->maplen || + attrMap->attnums[parentattrno - 1] == 0) + elog(ERROR, "unexpected attno %d in target column list", + parentattrno); + new_colnos = lappend_int(new_colnos, + attrMap->attnums[parentattrno - 1]); + } + + return new_colnos; +} + +/*------------------------------------------------------------------------- + * Run-Time Partition Pruning Support. + * + * The following series of functions exist to support the removal of unneeded + * subplans for queries against partitioned tables. The supporting functions + * here are designed to work with any plan type which supports an arbitrary + * number of subplans, e.g. Append, MergeAppend. + * + * When pruning involves comparison of a partition key to a constant, it's + * done by the planner. However, if we have a comparison to a non-constant + * but not volatile expression, that presents an opportunity for run-time + * pruning by the executor, allowing irrelevant partitions to be skipped + * dynamically. + * + * We must distinguish expressions containing PARAM_EXEC Params from + * expressions that don't contain those. Even though a PARAM_EXEC Param is + * considered to be a stable expression, it can change value from one plan + * node scan to the next during query execution. Stable comparison + * expressions that don't involve such Params allow partition pruning to be + * done once during executor startup. Expressions that do involve such Params + * require us to prune separately for each scan of the parent plan node. + * + * Note that pruning away unneeded subplans during executor startup has the + * added benefit of not having to initialize the unneeded subplans at all. + * + * + * Functions: + * + * ExecCreatePartitionPruneState: + * Creates the PartitionPruneState required by each of the two pruning + * functions. Details stored include how to map the partition index + * returned by the partition pruning code into subplan indexes. + * + * ExecFindInitialMatchingSubPlans: + * Returns indexes of matching subplans. 
Partition pruning is attempted + * without any evaluation of expressions containing PARAM_EXEC Params. + * This function must be called during executor startup for the parent + * plan before the subplans themselves are initialized. Subplans which + * are found not to match by this function must be removed from the + * plan's list of subplans during execution, as this function performs a + * remap of the partition index to subplan index map and the newly + * created map provides indexes only for subplans which remain after + * calling this function. + * + * ExecFindMatchingSubPlans: + * Returns indexes of matching subplans after evaluating all available + * expressions. This function can only be called during execution and + * must be called again each time the value of a Param listed in + * PartitionPruneState's 'execparamids' changes. + *------------------------------------------------------------------------- + */ + +/* + * ExecCreatePartitionPruneState + * Build the data structure required for calling + * ExecFindInitialMatchingSubPlans and ExecFindMatchingSubPlans. + * + * 'planstate' is the parent plan node's execution state. + * + * 'partitionpruneinfo' is a PartitionPruneInfo as generated by + * make_partition_pruneinfo. Here we build a PartitionPruneState containing a + * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of + * partitionpruneinfo->prune_infos), each of which contains a + * PartitionedRelPruningData for each PartitionedRelPruneInfo appearing in + * that sublist. This two-level system is needed to keep from confusing the + * different hierarchies when a UNION ALL contains multiple partitioned tables + * as children. The data stored in each PartitionedRelPruningData can be + * re-used each time we re-evaluate which partitions match the pruning steps + * provided in each PartitionedRelPruneInfo. + */ +PartitionPruneState * +ExecCreatePartitionPruneState(PlanState *planstate, + PartitionPruneInfo *partitionpruneinfo) +{ + EState *estate = planstate->state; + PartitionPruneState *prunestate; + int n_part_hierarchies; + ListCell *lc; + int i; + + /* For data reading, executor always omits detached partitions */ + if (estate->es_partition_directory == NULL) + estate->es_partition_directory = + CreatePartitionDirectory(estate->es_query_cxt, false); + + n_part_hierarchies = list_length(partitionpruneinfo->prune_infos); + Assert(n_part_hierarchies > 0); + + /* + * Allocate the data structure + */ + prunestate = (PartitionPruneState *) + palloc(offsetof(PartitionPruneState, partprunedata) + + sizeof(PartitionPruningData *) * n_part_hierarchies); + + prunestate->execparamids = NULL; + /* other_subplans can change at runtime, so we need our own copy */ + prunestate->other_subplans = bms_copy(partitionpruneinfo->other_subplans); + prunestate->do_initial_prune = false; /* may be set below */ + prunestate->do_exec_prune = false; /* may be set below */ + prunestate->num_partprunedata = n_part_hierarchies; + + /* + * Create a short-term memory context which we'll use when making calls to + * the partition pruning functions. This avoids possible memory leaks, + * since the pruning functions call comparison functions that aren't under + * our control. 
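+ *
+ * (Both ExecFindInitialMatchingSubPlans() and ExecFindMatchingSubPlans()
+ * below switch into this context around the pruning calls, bms_copy()
+ * their result back out, and then reset the context.)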
+ */ + prunestate->prune_context = + AllocSetContextCreate(CurrentMemoryContext, + "Partition Prune", + ALLOCSET_DEFAULT_SIZES); + + i = 0; + foreach(lc, partitionpruneinfo->prune_infos) + { + List *partrelpruneinfos = lfirst_node(List, lc); + int npartrelpruneinfos = list_length(partrelpruneinfos); + PartitionPruningData *prunedata; + ListCell *lc2; + int j; + + prunedata = (PartitionPruningData *) + palloc(offsetof(PartitionPruningData, partrelprunedata) + + npartrelpruneinfos * sizeof(PartitionedRelPruningData)); + prunestate->partprunedata[i] = prunedata; + prunedata->num_partrelprunedata = npartrelpruneinfos; + + j = 0; + foreach(lc2, partrelpruneinfos) + { + PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2); + PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j]; + Relation partrel; + PartitionDesc partdesc; + PartitionKey partkey; + + /* + * We can rely on the copies of the partitioned table's partition + * key and partition descriptor appearing in its relcache entry, + * because that entry will be held open and locked for the + * duration of this executor run. + */ + partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex); + partkey = RelationGetPartitionKey(partrel); + partdesc = PartitionDirectoryLookup(estate->es_partition_directory, + partrel); + + /* + * Initialize the subplan_map and subpart_map. + * + * Because we request detached partitions to be included, and + * detaching waits for old transactions, it is safe to assume that + * no partitions have disappeared since this query was planned. + * + * However, new partitions may have been added. + */ + Assert(partdesc->nparts >= pinfo->nparts); + pprune->nparts = partdesc->nparts; + pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts); + if (partdesc->nparts == pinfo->nparts) + { + /* + * There are no new partitions, so this is simple. We can + * simply point to the subpart_map from the plan, but we must + * copy the subplan_map since we may change it later. + */ + pprune->subpart_map = pinfo->subpart_map; + memcpy(pprune->subplan_map, pinfo->subplan_map, + sizeof(int) * pinfo->nparts); + + /* + * Double-check that the list of unpruned relations has not + * changed. (Pruned partitions are not in relid_map[].) + */ +#ifdef USE_ASSERT_CHECKING + for (int k = 0; k < pinfo->nparts; k++) + { + Assert(partdesc->oids[k] == pinfo->relid_map[k] || + pinfo->subplan_map[k] == -1); + } +#endif + } + else + { + int pd_idx = 0; + int pp_idx; + + /* + * Some new partitions have appeared since plan time, and + * those are reflected in our PartitionDesc but were not + * present in the one used to construct subplan_map and + * subpart_map. So we must construct new and longer arrays + * where the partitions that were originally present map to + * the same sub-structures, and any added partitions map to + * -1, as if the new partitions had been pruned. + * + * Note: pinfo->relid_map[] may contain InvalidOid entries for + * partitions pruned by the planner. We cannot tell exactly + * which of the partdesc entries these correspond to, but we + * don't have to; just skip over them. The non-pruned + * relid_map entries, however, had better be a subset of the + * partdesc entries and in the same order. 
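+ *
+ * A hypothetical example (OIDs invented purely for illustration): say
+ * partdesc->oids is now {1001, 1002, 1003, 1004}, but at plan time only
+ * 1001 and 1003 existed and the planner pruned 1003, so that
+ * pinfo->relid_map is {1001, InvalidOid}.  The loop below then keeps the
+ * original subplan_map and subpart_map entries for 1001, skips over the
+ * InvalidOid entry without knowing which partdesc entry it stood for,
+ * and maps 1002, 1003 and 1004 all to -1, exactly as if they had been
+ * pruned.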
+ */ + pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts); + for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++) + { + /* Skip any InvalidOid relid_map entries */ + while (pd_idx < pinfo->nparts && + !OidIsValid(pinfo->relid_map[pd_idx])) + pd_idx++; + + if (pd_idx < pinfo->nparts && + pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx]) + { + /* match... */ + pprune->subplan_map[pp_idx] = + pinfo->subplan_map[pd_idx]; + pprune->subpart_map[pp_idx] = + pinfo->subpart_map[pd_idx]; + pd_idx++; + } + else + { + /* this partdesc entry is not in the plan */ + pprune->subplan_map[pp_idx] = -1; + pprune->subpart_map[pp_idx] = -1; + } + } + + /* + * It might seem that we need to skip any trailing InvalidOid + * entries in pinfo->relid_map before checking that we scanned + * all of the relid_map. But we will have skipped them above, + * because they must correspond to some partdesc->oids + * entries; we just couldn't tell which. + */ + if (pd_idx != pinfo->nparts) + elog(ERROR, "could not match partition child tables to plan elements"); + } + + /* present_parts is also subject to later modification */ + pprune->present_parts = bms_copy(pinfo->present_parts); + + /* + * Initialize pruning contexts as needed. + */ + pprune->initial_pruning_steps = pinfo->initial_pruning_steps; + if (pinfo->initial_pruning_steps) + { + ExecInitPruningContext(&pprune->initial_context, + pinfo->initial_pruning_steps, + partdesc, partkey, planstate); + /* Record whether initial pruning is needed at any level */ + prunestate->do_initial_prune = true; + } + pprune->exec_pruning_steps = pinfo->exec_pruning_steps; + if (pinfo->exec_pruning_steps) + { + ExecInitPruningContext(&pprune->exec_context, + pinfo->exec_pruning_steps, + partdesc, partkey, planstate); + /* Record whether exec pruning is needed at any level */ + prunestate->do_exec_prune = true; + } + + /* + * Accumulate the IDs of all PARAM_EXEC Params affecting the + * partitioning decisions at this plan node. + */ + prunestate->execparamids = bms_add_members(prunestate->execparamids, + pinfo->execparamids); + + j++; + } + i++; + } + + return prunestate; +} + +/* + * Initialize a PartitionPruneContext for the given list of pruning steps. 
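+ *
+ * The stepcmpfuncs and exprstates arrays built here are laid out with one
+ * slot per (pruning step, partition key column) pair, addressed via
+ * PruneCxtStateIdx(); so, for example, three pruning steps over a
+ * two-column partition key yield six slots in each array.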
+ */ +static void +ExecInitPruningContext(PartitionPruneContext *context, + List *pruning_steps, + PartitionDesc partdesc, + PartitionKey partkey, + PlanState *planstate) +{ + int n_steps; + int partnatts; + ListCell *lc; + + n_steps = list_length(pruning_steps); + + context->strategy = partkey->strategy; + context->partnatts = partnatts = partkey->partnatts; + context->nparts = partdesc->nparts; + context->boundinfo = partdesc->boundinfo; + context->partcollation = partkey->partcollation; + context->partsupfunc = partkey->partsupfunc; + + /* We'll look up type-specific support functions as needed */ + context->stepcmpfuncs = (FmgrInfo *) + palloc0(sizeof(FmgrInfo) * n_steps * partnatts); + + context->ppccontext = CurrentMemoryContext; + context->planstate = planstate; + + /* Initialize expression state for each expression we need */ + context->exprstates = (ExprState **) + palloc0(sizeof(ExprState *) * n_steps * partnatts); + foreach(lc, pruning_steps) + { + PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc); + ListCell *lc2; + int keyno; + + /* not needed for other step kinds */ + if (!IsA(step, PartitionPruneStepOp)) + continue; + + Assert(list_length(step->exprs) <= partnatts); + + keyno = 0; + foreach(lc2, step->exprs) + { + Expr *expr = (Expr *) lfirst(lc2); + + /* not needed for Consts */ + if (!IsA(expr, Const)) + { + int stateidx = PruneCxtStateIdx(partnatts, + step->step.step_id, + keyno); + + context->exprstates[stateidx] = + ExecInitExpr(expr, context->planstate); + } + keyno++; + } + } +} + +/* + * ExecFindInitialMatchingSubPlans + * Identify the set of subplans that cannot be eliminated by initial + * pruning, disregarding any pruning constraints involving PARAM_EXEC + * Params. + * + * If additional pruning passes will be required (because of PARAM_EXEC + * Params), we must also update the translation data that allows conversion + * of partition indexes into subplan indexes to account for the unneeded + * subplans having been removed. + * + * Must only be called once per 'prunestate', and only if initial pruning + * is required. + * + * 'nsubplans' must be passed as the total number of unpruned subplans. + */ +Bitmapset * +ExecFindInitialMatchingSubPlans(PartitionPruneState *prunestate, int nsubplans) +{ + Bitmapset *result = NULL; + MemoryContext oldcontext; + int i; + + /* Caller error if we get here without do_initial_prune */ + Assert(prunestate->do_initial_prune); + + /* + * Switch to a temp context to avoid leaking memory in the executor's + * query-lifespan memory context. + */ + oldcontext = MemoryContextSwitchTo(prunestate->prune_context); + + /* + * For each hierarchy, do the pruning tests, and add nondeletable + * subplans' indexes to "result". 
+ */ + for (i = 0; i < prunestate->num_partprunedata; i++) + { + PartitionPruningData *prunedata; + PartitionedRelPruningData *pprune; + + prunedata = prunestate->partprunedata[i]; + pprune = &prunedata->partrelprunedata[0]; + + /* Perform pruning without using PARAM_EXEC Params */ + find_matching_subplans_recurse(prunedata, pprune, true, &result); + + /* Expression eval may have used space in node's ps_ExprContext too */ + if (pprune->initial_pruning_steps) + ResetExprContext(pprune->initial_context.planstate->ps_ExprContext); + } + + /* Add in any subplans that partition pruning didn't account for */ + result = bms_add_members(result, prunestate->other_subplans); + + MemoryContextSwitchTo(oldcontext); + + /* Copy result out of the temp context before we reset it */ + result = bms_copy(result); + + MemoryContextReset(prunestate->prune_context); + + /* + * If exec-time pruning is required and we pruned subplans above, then we + * must re-sequence the subplan indexes so that ExecFindMatchingSubPlans + * properly returns the indexes from the subplans which will remain after + * execution of this function. + * + * We can safely skip this when !do_exec_prune, even though that leaves + * invalid data in prunestate, because that data won't be consulted again + * (cf initial Assert in ExecFindMatchingSubPlans). + */ + if (prunestate->do_exec_prune && bms_num_members(result) < nsubplans) + { + int *new_subplan_indexes; + Bitmapset *new_other_subplans; + int i; + int newidx; + + /* + * First we must build a temporary array which maps old subplan + * indexes to new ones. For convenience of initialization, we use + * 1-based indexes in this array and leave pruned items as 0. + */ + new_subplan_indexes = (int *) palloc0(sizeof(int) * nsubplans); + newidx = 1; + i = -1; + while ((i = bms_next_member(result, i)) >= 0) + { + Assert(i < nsubplans); + new_subplan_indexes[i] = newidx++; + } + + /* + * Now we can update each PartitionedRelPruneInfo's subplan_map with + * new subplan indexes. We must also recompute its present_parts + * bitmap. + */ + for (i = 0; i < prunestate->num_partprunedata; i++) + { + PartitionPruningData *prunedata = prunestate->partprunedata[i]; + int j; + + /* + * Within each hierarchy, we perform this loop in back-to-front + * order so that we determine present_parts for the lowest-level + * partitioned tables first. This way we can tell whether a + * sub-partitioned table's partitions were entirely pruned so we + * can exclude it from the current level's present_parts. + */ + for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--) + { + PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j]; + int nparts = pprune->nparts; + int k; + + /* We just rebuild present_parts from scratch */ + bms_free(pprune->present_parts); + pprune->present_parts = NULL; + + for (k = 0; k < nparts; k++) + { + int oldidx = pprune->subplan_map[k]; + int subidx; + + /* + * If this partition existed as a subplan then change the + * old subplan index to the new subplan index. The new + * index may become -1 if the partition was pruned above, + * or it may just come earlier in the subplan list due to + * some subplans being removed earlier in the list. If + * it's a subpartition, add it to present_parts unless + * it's entirely pruned. 
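+ *
+ * A small invented example: with nsubplans = 4 and only subplans 0 and 2
+ * surviving, new_subplan_indexes is {1, 0, 2, 0} (1-based, 0 meaning
+ * pruned), so a subplan_map entry of 0 stays 0, an entry of 2 becomes 1,
+ * and entries 1 and 3 become -1.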
+ */ + if (oldidx >= 0) + { + Assert(oldidx < nsubplans); + pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1; + + if (new_subplan_indexes[oldidx] > 0) + pprune->present_parts = + bms_add_member(pprune->present_parts, k); + } + else if ((subidx = pprune->subpart_map[k]) >= 0) + { + PartitionedRelPruningData *subprune; + + subprune = &prunedata->partrelprunedata[subidx]; + + if (!bms_is_empty(subprune->present_parts)) + pprune->present_parts = + bms_add_member(pprune->present_parts, k); + } + } + } + } + + /* + * We must also recompute the other_subplans set, since indexes in it + * may change. + */ + new_other_subplans = NULL; + i = -1; + while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0) + new_other_subplans = bms_add_member(new_other_subplans, + new_subplan_indexes[i] - 1); + + bms_free(prunestate->other_subplans); + prunestate->other_subplans = new_other_subplans; + + pfree(new_subplan_indexes); + } + + return result; +} + +/* + * ExecFindMatchingSubPlans + * Determine which subplans match the pruning steps detailed in + * 'prunestate' for the current comparison expression values. + * + * Here we assume we may evaluate PARAM_EXEC Params. + */ +Bitmapset * +ExecFindMatchingSubPlans(PartitionPruneState *prunestate) +{ + Bitmapset *result = NULL; + MemoryContext oldcontext; + int i; + + /* + * If !do_exec_prune, we've got problems because + * ExecFindInitialMatchingSubPlans will not have bothered to update + * prunestate for whatever pruning it did. + */ + Assert(prunestate->do_exec_prune); + + /* + * Switch to a temp context to avoid leaking memory in the executor's + * query-lifespan memory context. + */ + oldcontext = MemoryContextSwitchTo(prunestate->prune_context); + + /* + * For each hierarchy, do the pruning tests, and add nondeletable + * subplans' indexes to "result". + */ + for (i = 0; i < prunestate->num_partprunedata; i++) + { + PartitionPruningData *prunedata; + PartitionedRelPruningData *pprune; + + prunedata = prunestate->partprunedata[i]; + pprune = &prunedata->partrelprunedata[0]; + + find_matching_subplans_recurse(prunedata, pprune, false, &result); + + /* Expression eval may have used space in node's ps_ExprContext too */ + if (pprune->exec_pruning_steps) + ResetExprContext(pprune->exec_context.planstate->ps_ExprContext); + } + + /* Add in any subplans that partition pruning didn't account for */ + result = bms_add_members(result, prunestate->other_subplans); + + MemoryContextSwitchTo(oldcontext); + + /* Copy result out of the temp context before we reset it */ + result = bms_copy(result); + + MemoryContextReset(prunestate->prune_context); + + return result; +} + +/* + * find_matching_subplans_recurse + * Recursive worker function for ExecFindMatchingSubPlans and + * ExecFindInitialMatchingSubPlans + * + * Adds valid (non-prunable) subplan IDs to *validsubplans + */ +static void +find_matching_subplans_recurse(PartitionPruningData *prunedata, + PartitionedRelPruningData *pprune, + bool initial_prune, + Bitmapset **validsubplans) +{ + Bitmapset *partset; + int i; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + /* Only prune if pruning would be useful at this level. 
*/ + if (initial_prune && pprune->initial_pruning_steps) + { + partset = get_matching_partitions(&pprune->initial_context, + pprune->initial_pruning_steps); + } + else if (!initial_prune && pprune->exec_pruning_steps) + { + partset = get_matching_partitions(&pprune->exec_context, + pprune->exec_pruning_steps); + } + else + { + /* + * If no pruning is to be done, just include all partitions at this + * level. + */ + partset = pprune->present_parts; + } + + /* Translate partset into subplan indexes */ + i = -1; + while ((i = bms_next_member(partset, i)) >= 0) + { + if (pprune->subplan_map[i] >= 0) + *validsubplans = bms_add_member(*validsubplans, + pprune->subplan_map[i]); + else + { + int partidx = pprune->subpart_map[i]; + + if (partidx >= 0) + find_matching_subplans_recurse(prunedata, + &prunedata->partrelprunedata[partidx], + initial_prune, validsubplans); + else + { + /* + * We get here if the planner already pruned all the sub- + * partitions for this partition. Silently ignore this + * partition in this case. The end result is the same: we + * would have pruned all partitions just the same, but we + * don't have any pruning steps to execute to verify this. + */ + } + } + } +} diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c new file mode 100644 index 0000000..1752b9b --- /dev/null +++ b/src/backend/executor/execProcnode.c @@ -0,0 +1,981 @@ +/*------------------------------------------------------------------------- + * + * execProcnode.c + * contains dispatch functions which call the appropriate "initialize", + * "get a tuple", and "cleanup" routines for the given node type. + * If the node has children, then it will presumably call ExecInitNode, + * ExecProcNode, or ExecEndNode on its subnodes and do the appropriate + * processing. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execProcnode.c + * + *------------------------------------------------------------------------- + */ +/* + * NOTES + * This used to be three files. It is now all combined into + * one file so that it is easier to keep the dispatch routines + * in sync when new nodes are added. + * + * EXAMPLE + * Suppose we want the age of the manager of the shoe department and + * the number of employees in that department. So we have the query: + * + * select DEPT.no_emps, EMP.age + * from DEPT, EMP + * where EMP.name = DEPT.mgr and + * DEPT.name = "shoe" + * + * Suppose the planner gives us the following plan: + * + * Nest Loop (DEPT.mgr = EMP.name) + * / \ + * / \ + * Seq Scan Seq Scan + * DEPT EMP + * (name = "shoe") + * + * ExecutorStart() is called first. + * It calls InitPlan() which calls ExecInitNode() on + * the root of the plan -- the nest loop node. + * + * * ExecInitNode() notices that it is looking at a nest loop and + * as the code below demonstrates, it calls ExecInitNestLoop(). + * Eventually this calls ExecInitNode() on the right and left subplans + * and so forth until the entire plan is initialized. The result + * of ExecInitNode() is a plan state tree built with the same structure + * as the underlying plan tree. + * + * * Then when ExecutorRun() is called, it calls ExecutePlan() which calls + * ExecProcNode() repeatedly on the top node of the plan state tree. + * Each time this happens, ExecProcNode() will end up calling + * ExecNestLoop(), which calls ExecProcNode() on its subplans. 
+ * Each of these subplans is a sequential scan so ExecSeqScan() is + * called. The slots returned by ExecSeqScan() may contain + * tuples which contain the attributes ExecNestLoop() uses to + * form the tuples it returns. + * + * * Eventually ExecSeqScan() stops returning tuples and the nest + * loop join ends. Lastly, ExecutorEnd() calls ExecEndNode() which + * calls ExecEndNestLoop() which in turn calls ExecEndNode() on + * its subplans which result in ExecEndSeqScan(). + * + * This should show how the executor works by having + * ExecInitNode(), ExecProcNode() and ExecEndNode() dispatch + * their work to the appropriate node support routines which may + * in turn call these routines themselves on their subplans. + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeAgg.h" +#include "executor/nodeAppend.h" +#include "executor/nodeBitmapAnd.h" +#include "executor/nodeBitmapHeapscan.h" +#include "executor/nodeBitmapIndexscan.h" +#include "executor/nodeBitmapOr.h" +#include "executor/nodeCtescan.h" +#include "executor/nodeCustom.h" +#include "executor/nodeForeignscan.h" +#include "executor/nodeFunctionscan.h" +#include "executor/nodeGather.h" +#include "executor/nodeGatherMerge.h" +#include "executor/nodeGroup.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "executor/nodeLimit.h" +#include "executor/nodeLockRows.h" +#include "executor/nodeMaterial.h" +#include "executor/nodeMemoize.h" +#include "executor/nodeMergeAppend.h" +#include "executor/nodeMergejoin.h" +#include "executor/nodeModifyTable.h" +#include "executor/nodeNamedtuplestorescan.h" +#include "executor/nodeNestloop.h" +#include "executor/nodeProjectSet.h" +#include "executor/nodeRecursiveunion.h" +#include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" +#include "executor/nodeSeqscan.h" +#include "executor/nodeSetOp.h" +#include "executor/nodeSort.h" +#include "executor/nodeSubplan.h" +#include "executor/nodeSubqueryscan.h" +#include "executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" +#include "executor/nodeTidscan.h" +#include "executor/nodeUnique.h" +#include "executor/nodeValuesscan.h" +#include "executor/nodeWindowAgg.h" +#include "executor/nodeWorktablescan.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" + +static TupleTableSlot *ExecProcNodeFirst(PlanState *node); +static TupleTableSlot *ExecProcNodeInstr(PlanState *node); + + +/* ------------------------------------------------------------------------ + * ExecInitNode + * + * Recursively initializes all the nodes in the plan tree rooted + * at 'node'. + * + * Inputs: + * 'node' is the current node of the plan produced by the query planner + * 'estate' is the shared execution state for the plan tree + * 'eflags' is a bitwise OR of flag bits described in executor.h + * + * Returns a PlanState node corresponding to the given Plan node. + * ------------------------------------------------------------------------ + */ +PlanState * +ExecInitNode(Plan *node, EState *estate, int eflags) +{ + PlanState *result; + List *subps; + ListCell *l; + + /* + * do nothing when we get to the end of a leaf on tree. + */ + if (node == NULL) + return NULL; + + /* + * Make sure there's enough stack available. Need to check here, in + * addition to ExecProcNode() (via ExecProcNodeFirst()), to ensure the + * stack isn't overrun while initializing the node tree. 
+ */ + check_stack_depth(); + + switch (nodeTag(node)) + { + /* + * control nodes + */ + case T_Result: + result = (PlanState *) ExecInitResult((Result *) node, + estate, eflags); + break; + + case T_ProjectSet: + result = (PlanState *) ExecInitProjectSet((ProjectSet *) node, + estate, eflags); + break; + + case T_ModifyTable: + result = (PlanState *) ExecInitModifyTable((ModifyTable *) node, + estate, eflags); + break; + + case T_Append: + result = (PlanState *) ExecInitAppend((Append *) node, + estate, eflags); + break; + + case T_MergeAppend: + result = (PlanState *) ExecInitMergeAppend((MergeAppend *) node, + estate, eflags); + break; + + case T_RecursiveUnion: + result = (PlanState *) ExecInitRecursiveUnion((RecursiveUnion *) node, + estate, eflags); + break; + + case T_BitmapAnd: + result = (PlanState *) ExecInitBitmapAnd((BitmapAnd *) node, + estate, eflags); + break; + + case T_BitmapOr: + result = (PlanState *) ExecInitBitmapOr((BitmapOr *) node, + estate, eflags); + break; + + /* + * scan nodes + */ + case T_SeqScan: + result = (PlanState *) ExecInitSeqScan((SeqScan *) node, + estate, eflags); + break; + + case T_SampleScan: + result = (PlanState *) ExecInitSampleScan((SampleScan *) node, + estate, eflags); + break; + + case T_IndexScan: + result = (PlanState *) ExecInitIndexScan((IndexScan *) node, + estate, eflags); + break; + + case T_IndexOnlyScan: + result = (PlanState *) ExecInitIndexOnlyScan((IndexOnlyScan *) node, + estate, eflags); + break; + + case T_BitmapIndexScan: + result = (PlanState *) ExecInitBitmapIndexScan((BitmapIndexScan *) node, + estate, eflags); + break; + + case T_BitmapHeapScan: + result = (PlanState *) ExecInitBitmapHeapScan((BitmapHeapScan *) node, + estate, eflags); + break; + + case T_TidScan: + result = (PlanState *) ExecInitTidScan((TidScan *) node, + estate, eflags); + break; + + case T_TidRangeScan: + result = (PlanState *) ExecInitTidRangeScan((TidRangeScan *) node, + estate, eflags); + break; + + case T_SubqueryScan: + result = (PlanState *) ExecInitSubqueryScan((SubqueryScan *) node, + estate, eflags); + break; + + case T_FunctionScan: + result = (PlanState *) ExecInitFunctionScan((FunctionScan *) node, + estate, eflags); + break; + + case T_TableFuncScan: + result = (PlanState *) ExecInitTableFuncScan((TableFuncScan *) node, + estate, eflags); + break; + + case T_ValuesScan: + result = (PlanState *) ExecInitValuesScan((ValuesScan *) node, + estate, eflags); + break; + + case T_CteScan: + result = (PlanState *) ExecInitCteScan((CteScan *) node, + estate, eflags); + break; + + case T_NamedTuplestoreScan: + result = (PlanState *) ExecInitNamedTuplestoreScan((NamedTuplestoreScan *) node, + estate, eflags); + break; + + case T_WorkTableScan: + result = (PlanState *) ExecInitWorkTableScan((WorkTableScan *) node, + estate, eflags); + break; + + case T_ForeignScan: + result = (PlanState *) ExecInitForeignScan((ForeignScan *) node, + estate, eflags); + break; + + case T_CustomScan: + result = (PlanState *) ExecInitCustomScan((CustomScan *) node, + estate, eflags); + break; + + /* + * join nodes + */ + case T_NestLoop: + result = (PlanState *) ExecInitNestLoop((NestLoop *) node, + estate, eflags); + break; + + case T_MergeJoin: + result = (PlanState *) ExecInitMergeJoin((MergeJoin *) node, + estate, eflags); + break; + + case T_HashJoin: + result = (PlanState *) ExecInitHashJoin((HashJoin *) node, + estate, eflags); + break; + + /* + * materialization nodes + */ + case T_Material: + result = (PlanState *) ExecInitMaterial((Material *) node, + estate, 
eflags); + break; + + case T_Sort: + result = (PlanState *) ExecInitSort((Sort *) node, + estate, eflags); + break; + + case T_IncrementalSort: + result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node, + estate, eflags); + break; + + case T_Memoize: + result = (PlanState *) ExecInitMemoize((Memoize *) node, estate, + eflags); + break; + + case T_Group: + result = (PlanState *) ExecInitGroup((Group *) node, + estate, eflags); + break; + + case T_Agg: + result = (PlanState *) ExecInitAgg((Agg *) node, + estate, eflags); + break; + + case T_WindowAgg: + result = (PlanState *) ExecInitWindowAgg((WindowAgg *) node, + estate, eflags); + break; + + case T_Unique: + result = (PlanState *) ExecInitUnique((Unique *) node, + estate, eflags); + break; + + case T_Gather: + result = (PlanState *) ExecInitGather((Gather *) node, + estate, eflags); + break; + + case T_GatherMerge: + result = (PlanState *) ExecInitGatherMerge((GatherMerge *) node, + estate, eflags); + break; + + case T_Hash: + result = (PlanState *) ExecInitHash((Hash *) node, + estate, eflags); + break; + + case T_SetOp: + result = (PlanState *) ExecInitSetOp((SetOp *) node, + estate, eflags); + break; + + case T_LockRows: + result = (PlanState *) ExecInitLockRows((LockRows *) node, + estate, eflags); + break; + + case T_Limit: + result = (PlanState *) ExecInitLimit((Limit *) node, + estate, eflags); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + result = NULL; /* keep compiler quiet */ + break; + } + + ExecSetExecProcNode(result, result->ExecProcNode); + + /* + * Initialize any initPlans present in this node. The planner put them in + * a separate list for us. + */ + subps = NIL; + foreach(l, node->initPlan) + { + SubPlan *subplan = (SubPlan *) lfirst(l); + SubPlanState *sstate; + + Assert(IsA(subplan, SubPlan)); + sstate = ExecInitSubPlan(subplan, result); + subps = lappend(subps, sstate); + } + result->initPlan = subps; + + /* Set up instrumentation for this node if requested */ + if (estate->es_instrument) + result->instrument = InstrAlloc(1, estate->es_instrument, + result->async_capable); + + return result; +} + + +/* + * If a node wants to change its ExecProcNode function after ExecInitNode() + * has finished, it should do so with this function. That way any wrapper + * functions can be reinstalled, without the node having to know how that + * works. + */ +void +ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function) +{ + /* + * Add a wrapper around the ExecProcNode callback that checks stack depth + * during the first execution and maybe adds an instrumentation wrapper. + * When the callback is changed after execution has already begun that + * means we'll superfluously execute ExecProcNodeFirst, but that seems ok. + */ + node->ExecProcNodeReal = function; + node->ExecProcNode = ExecProcNodeFirst; +} + + +/* + * ExecProcNode wrapper that performs some one-time checks, before calling + * the relevant node method (possibly via an instrumentation wrapper). + */ +static TupleTableSlot * +ExecProcNodeFirst(PlanState *node) +{ + /* + * Perform stack depth check during the first execution of the node. We + * only do so the first time round because it turns out to not be cheap on + * some common architectures (eg. x86). This relies on the assumption + * that ExecProcNode calls for a given plan node will always be made at + * roughly the same stack depth. 
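+ *
+ * (The typical top-level driver is a plain loop along the lines of
+ * "slot = ExecProcNode(planstate); if (TupIsNull(slot)) break;", as in
+ * ExecutePlan(), so after this first call every further iteration goes
+ * straight to the swapped-in callback.)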
+ */ + check_stack_depth(); + + /* + * If instrumentation is required, change the wrapper to one that just + * does instrumentation. Otherwise we can dispense with all wrappers and + * have ExecProcNode() directly call the relevant function from now on. + */ + if (node->instrument) + node->ExecProcNode = ExecProcNodeInstr; + else + node->ExecProcNode = node->ExecProcNodeReal; + + return node->ExecProcNode(node); +} + + +/* + * ExecProcNode wrapper that performs instrumentation calls. By keeping + * this a separate function, we avoid overhead in the normal case where + * no instrumentation is wanted. + */ +static TupleTableSlot * +ExecProcNodeInstr(PlanState *node) +{ + TupleTableSlot *result; + + InstrStartNode(node->instrument); + + result = node->ExecProcNodeReal(node); + + InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0); + + return result; +} + + +/* ---------------------------------------------------------------- + * MultiExecProcNode + * + * Execute a node that doesn't return individual tuples + * (it might return a hashtable, bitmap, etc). Caller should + * check it got back the expected kind of Node. + * + * This has essentially the same responsibilities as ExecProcNode, + * but it does not do InstrStartNode/InstrStopNode (mainly because + * it can't tell how many returned tuples to count). Each per-node + * function must provide its own instrumentation support. + * ---------------------------------------------------------------- + */ +Node * +MultiExecProcNode(PlanState *node) +{ + Node *result; + + check_stack_depth(); + + CHECK_FOR_INTERRUPTS(); + + if (node->chgParam != NULL) /* something changed */ + ExecReScan(node); /* let ReScan handle this */ + + switch (nodeTag(node)) + { + /* + * Only node types that actually support multiexec will be listed + */ + + case T_HashState: + result = MultiExecHash((HashState *) node); + break; + + case T_BitmapIndexScanState: + result = MultiExecBitmapIndexScan((BitmapIndexScanState *) node); + break; + + case T_BitmapAndState: + result = MultiExecBitmapAnd((BitmapAndState *) node); + break; + + case T_BitmapOrState: + result = MultiExecBitmapOr((BitmapOrState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + result = NULL; + break; + } + + return result; +} + + +/* ---------------------------------------------------------------- + * ExecEndNode + * + * Recursively cleans up all the nodes in the plan rooted + * at 'node'. + * + * After this operation, the query plan will not be able to be + * processed any further. This should be called only after + * the query plan has been fully executed. + * ---------------------------------------------------------------- + */ +void +ExecEndNode(PlanState *node) +{ + /* + * do nothing when we get to the end of a leaf on tree. + */ + if (node == NULL) + return; + + /* + * Make sure there's enough stack available. Need to check here, in + * addition to ExecProcNode() (via ExecProcNodeFirst()), because it's not + * guaranteed that ExecProcNode() is reached for all nodes. 
+ */ + check_stack_depth(); + + if (node->chgParam != NULL) + { + bms_free(node->chgParam); + node->chgParam = NULL; + } + + switch (nodeTag(node)) + { + /* + * control nodes + */ + case T_ResultState: + ExecEndResult((ResultState *) node); + break; + + case T_ProjectSetState: + ExecEndProjectSet((ProjectSetState *) node); + break; + + case T_ModifyTableState: + ExecEndModifyTable((ModifyTableState *) node); + break; + + case T_AppendState: + ExecEndAppend((AppendState *) node); + break; + + case T_MergeAppendState: + ExecEndMergeAppend((MergeAppendState *) node); + break; + + case T_RecursiveUnionState: + ExecEndRecursiveUnion((RecursiveUnionState *) node); + break; + + case T_BitmapAndState: + ExecEndBitmapAnd((BitmapAndState *) node); + break; + + case T_BitmapOrState: + ExecEndBitmapOr((BitmapOrState *) node); + break; + + /* + * scan nodes + */ + case T_SeqScanState: + ExecEndSeqScan((SeqScanState *) node); + break; + + case T_SampleScanState: + ExecEndSampleScan((SampleScanState *) node); + break; + + case T_GatherState: + ExecEndGather((GatherState *) node); + break; + + case T_GatherMergeState: + ExecEndGatherMerge((GatherMergeState *) node); + break; + + case T_IndexScanState: + ExecEndIndexScan((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecEndIndexOnlyScan((IndexOnlyScanState *) node); + break; + + case T_BitmapIndexScanState: + ExecEndBitmapIndexScan((BitmapIndexScanState *) node); + break; + + case T_BitmapHeapScanState: + ExecEndBitmapHeapScan((BitmapHeapScanState *) node); + break; + + case T_TidScanState: + ExecEndTidScan((TidScanState *) node); + break; + + case T_TidRangeScanState: + ExecEndTidRangeScan((TidRangeScanState *) node); + break; + + case T_SubqueryScanState: + ExecEndSubqueryScan((SubqueryScanState *) node); + break; + + case T_FunctionScanState: + ExecEndFunctionScan((FunctionScanState *) node); + break; + + case T_TableFuncScanState: + ExecEndTableFuncScan((TableFuncScanState *) node); + break; + + case T_ValuesScanState: + ExecEndValuesScan((ValuesScanState *) node); + break; + + case T_CteScanState: + ExecEndCteScan((CteScanState *) node); + break; + + case T_NamedTuplestoreScanState: + ExecEndNamedTuplestoreScan((NamedTuplestoreScanState *) node); + break; + + case T_WorkTableScanState: + ExecEndWorkTableScan((WorkTableScanState *) node); + break; + + case T_ForeignScanState: + ExecEndForeignScan((ForeignScanState *) node); + break; + + case T_CustomScanState: + ExecEndCustomScan((CustomScanState *) node); + break; + + /* + * join nodes + */ + case T_NestLoopState: + ExecEndNestLoop((NestLoopState *) node); + break; + + case T_MergeJoinState: + ExecEndMergeJoin((MergeJoinState *) node); + break; + + case T_HashJoinState: + ExecEndHashJoin((HashJoinState *) node); + break; + + /* + * materialization nodes + */ + case T_MaterialState: + ExecEndMaterial((MaterialState *) node); + break; + + case T_SortState: + ExecEndSort((SortState *) node); + break; + + case T_IncrementalSortState: + ExecEndIncrementalSort((IncrementalSortState *) node); + break; + + case T_MemoizeState: + ExecEndMemoize((MemoizeState *) node); + break; + + case T_GroupState: + ExecEndGroup((GroupState *) node); + break; + + case T_AggState: + ExecEndAgg((AggState *) node); + break; + + case T_WindowAggState: + ExecEndWindowAgg((WindowAggState *) node); + break; + + case T_UniqueState: + ExecEndUnique((UniqueState *) node); + break; + + case T_HashState: + ExecEndHash((HashState *) node); + break; + + case T_SetOpState: + ExecEndSetOp((SetOpState *) node); + break; 
+ + case T_LockRowsState: + ExecEndLockRows((LockRowsState *) node); + break; + + case T_LimitState: + ExecEndLimit((LimitState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * ExecShutdownNode + * + * Give execution nodes a chance to stop asynchronous resource consumption + * and release any resources still held. + */ +bool +ExecShutdownNode(PlanState *node) +{ + if (node == NULL) + return false; + + check_stack_depth(); + + /* + * Treat the node as running while we shut it down, but only if it's run + * at least once already. We don't expect much CPU consumption during + * node shutdown, but in the case of Gather or Gather Merge, we may shut + * down workers at this stage. If so, their buffer usage will get + * propagated into pgBufferUsage at this point, and we want to make sure + * that it gets associated with the Gather node. We skip this if the node + * has never been executed, so as to avoid incorrectly making it appear + * that it has. + */ + if (node->instrument && node->instrument->running) + InstrStartNode(node->instrument); + + planstate_tree_walker(node, ExecShutdownNode, NULL); + + switch (nodeTag(node)) + { + case T_GatherState: + ExecShutdownGather((GatherState *) node); + break; + case T_ForeignScanState: + ExecShutdownForeignScan((ForeignScanState *) node); + break; + case T_CustomScanState: + ExecShutdownCustomScan((CustomScanState *) node); + break; + case T_GatherMergeState: + ExecShutdownGatherMerge((GatherMergeState *) node); + break; + case T_HashState: + ExecShutdownHash((HashState *) node); + break; + case T_HashJoinState: + ExecShutdownHashJoin((HashJoinState *) node); + break; + default: + break; + } + + /* Stop the node if we started it above, reporting 0 tuples. */ + if (node->instrument && node->instrument->running) + InstrStopNode(node->instrument, 0); + + return false; +} + +/* + * ExecSetTupleBound + * + * Set a tuple bound for a planstate node. This lets child plan nodes + * optimize based on the knowledge that the maximum number of tuples that + * their parent will demand is limited. The tuple bound for a node may + * only be changed between scans (i.e., after node initialization or just + * before an ExecReScan call). + * + * Any negative tuples_needed value means "no limit", which should be the + * default assumption when this is not called at all for a particular node. + * + * Note: if this is called repeatedly on a plan tree, the exact same set + * of nodes must be updated with the new limit each time; be careful that + * only unchanging conditions are tested here. + */ +void +ExecSetTupleBound(int64 tuples_needed, PlanState *child_node) +{ + /* + * Since this function recurses, in principle we should check stack depth + * here. In practice, it's probably pointless since the earlier node + * initialization tree traversal would surely have consumed more stack. + */ + + if (IsA(child_node, SortState)) + { + /* + * If it is a Sort node, notify it that it can use bounded sort. + * + * Note: it is the responsibility of nodeSort.c to react properly to + * changes of these parameters. If we ever redesign this, it'd be a + * good idea to integrate this signaling with the parameter-change + * mechanism. 
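+ *
+ * (To illustrate the payoff: once "bounded" is set, ExecSort() hands the
+ * bound to tuplesort_set_bound(), letting tuplesort keep just the top N
+ * tuples in a bounded heap rather than sorting the entire input.)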
+ */ + SortState *sortState = (SortState *) child_node; + + if (tuples_needed < 0) + { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } + else + { + sortState->bounded = true; + sortState->bound = tuples_needed; + } + } + else if (IsA(child_node, IncrementalSortState)) + { + /* + * If it is an IncrementalSort node, notify it that it can use bounded + * sort. + * + * Note: it is the responsibility of nodeIncrementalSort.c to react + * properly to changes of these parameters. If we ever redesign this, + * it'd be a good idea to integrate this signaling with the + * parameter-change mechanism. + */ + IncrementalSortState *sortState = (IncrementalSortState *) child_node; + + if (tuples_needed < 0) + { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } + else + { + sortState->bounded = true; + sortState->bound = tuples_needed; + } + } + else if (IsA(child_node, AppendState)) + { + /* + * If it is an Append, we can apply the bound to any nodes that are + * children of the Append, since the Append surely need read no more + * than that many tuples from any one input. + */ + AppendState *aState = (AppendState *) child_node; + int i; + + for (i = 0; i < aState->as_nplans; i++) + ExecSetTupleBound(tuples_needed, aState->appendplans[i]); + } + else if (IsA(child_node, MergeAppendState)) + { + /* + * If it is a MergeAppend, we can apply the bound to any nodes that + * are children of the MergeAppend, since the MergeAppend surely need + * read no more than that many tuples from any one input. + */ + MergeAppendState *maState = (MergeAppendState *) child_node; + int i; + + for (i = 0; i < maState->ms_nplans; i++) + ExecSetTupleBound(tuples_needed, maState->mergeplans[i]); + } + else if (IsA(child_node, ResultState)) + { + /* + * Similarly, for a projecting Result, we can apply the bound to its + * child node. + * + * If Result supported qual checking, we'd have to punt on seeing a + * qual. Note that having a resconstantqual is not a showstopper: if + * that condition succeeds it affects nothing, while if it fails, no + * rows will be demanded from the Result child anyway. + */ + if (outerPlanState(child_node)) + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + else if (IsA(child_node, SubqueryScanState)) + { + /* + * We can also descend through SubqueryScan, but only if it has no + * qual (otherwise it might discard rows). + */ + SubqueryScanState *subqueryState = (SubqueryScanState *) child_node; + + if (subqueryState->ss.ps.qual == NULL) + ExecSetTupleBound(tuples_needed, subqueryState->subplan); + } + else if (IsA(child_node, GatherState)) + { + /* + * A Gather node can propagate the bound to its workers. As with + * MergeAppend, no one worker could possibly need to return more + * tuples than the Gather itself needs to. + * + * Note: As with Sort, the Gather node is responsible for reacting + * properly to changes to this parameter. 
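+ *
+ * (Roughly speaking, the stored tuples_needed is later handed to the
+ * parallel-query machinery via ExecInitParallelPlan(), so that worker
+ * processes can apply the same bound to their copies of the plan.)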
+ */ + GatherState *gstate = (GatherState *) child_node; + + gstate->tuples_needed = tuples_needed; + + /* Also pass down the bound to our own copy of the child plan */ + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + else if (IsA(child_node, GatherMergeState)) + { + /* Same comments as for Gather */ + GatherMergeState *gstate = (GatherMergeState *) child_node; + + gstate->tuples_needed = tuples_needed; + + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + + /* + * In principle we could descend through any plan node type that is + * certain not to discard or combine input rows; but on seeing a node that + * can do that, we can't propagate the bound any further. For the moment + * it's unclear that any other cases are worth checking here. + */ +} diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c new file mode 100644 index 0000000..1e285e0 --- /dev/null +++ b/src/backend/executor/execReplication.c @@ -0,0 +1,629 @@ +/*------------------------------------------------------------------------- + * + * execReplication.c + * miscellaneous executor routines for logical replication + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execReplication.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "commands/trigger.h" +#include "executor/executor.h" +#include "executor/nodeModifyTable.h" +#include "nodes/nodeFuncs.h" +#include "parser/parse_relation.h" +#include "parser/parsetree.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + + +/* + * Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that + * is setup to match 'rel' (*NOT* idxrel!). + * + * Returns whether any column contains NULLs. + * + * This is not generic routine, it expects the idxrel to be replication + * identity of a rel and meet all limitations associated with that. + */ +static bool +build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel, + TupleTableSlot *searchslot) +{ + int attoff; + bool isnull; + Datum indclassDatum; + oidvector *opclass; + int2vector *indkey = &idxrel->rd_index->indkey; + bool hasnulls = false; + + Assert(RelationGetReplicaIndex(rel) == RelationGetRelid(idxrel) || + RelationGetPrimaryKeyIndex(rel) == RelationGetRelid(idxrel)); + + indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple, + Anum_pg_index_indclass, &isnull); + Assert(!isnull); + opclass = (oidvector *) DatumGetPointer(indclassDatum); + + /* Build scankey for every attribute in the index. */ + for (attoff = 0; attoff < IndexRelationGetNumberOfKeyAttributes(idxrel); attoff++) + { + Oid operator; + Oid opfamily; + RegProcedure regop; + int pkattno = attoff + 1; + int mainattno = indkey->values[attoff]; + Oid optype = get_opclass_input_type(opclass->values[attoff]); + + /* + * Load the operator info. We need this to get the equality operator + * function for the scan key. 
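+ *
+ * For example (purely illustrative), for an int4 key column with the
+ * default btree opclass this resolves BTEqualStrategyNumber within the
+ * integer_ops opfamily to the "=" operator, and get_opcode() then yields
+ * the underlying equality function (int4eq) for ScanKeyInit() to use.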
+ */ + opfamily = get_opclass_family(opclass->values[attoff]); + + operator = get_opfamily_member(opfamily, optype, + optype, + BTEqualStrategyNumber); + if (!OidIsValid(operator)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + BTEqualStrategyNumber, optype, optype, opfamily); + + regop = get_opcode(operator); + + /* Initialize the scankey. */ + ScanKeyInit(&skey[attoff], + pkattno, + BTEqualStrategyNumber, + regop, + searchslot->tts_values[mainattno - 1]); + + skey[attoff].sk_collation = idxrel->rd_indcollation[attoff]; + + /* Check for null value. */ + if (searchslot->tts_isnull[mainattno - 1]) + { + hasnulls = true; + skey[attoff].sk_flags |= SK_ISNULL; + } + } + + return hasnulls; +} + +/* + * Search the relation 'rel' for tuple using the index. + * + * If a matching tuple is found, lock it with lockmode, fill the slot with its + * contents, and return true. Return false otherwise. + */ +bool +RelationFindReplTupleByIndex(Relation rel, Oid idxoid, + LockTupleMode lockmode, + TupleTableSlot *searchslot, + TupleTableSlot *outslot) +{ + ScanKeyData skey[INDEX_MAX_KEYS]; + IndexScanDesc scan; + SnapshotData snap; + TransactionId xwait; + Relation idxrel; + bool found; + + /* Open the index. */ + idxrel = index_open(idxoid, RowExclusiveLock); + + /* Start an index scan. */ + InitDirtySnapshot(snap); + scan = index_beginscan(rel, idxrel, &snap, + IndexRelationGetNumberOfKeyAttributes(idxrel), + 0); + + /* Build scan key. */ + build_replindex_scan_key(skey, rel, idxrel, searchslot); + +retry: + found = false; + + index_rescan(scan, skey, IndexRelationGetNumberOfKeyAttributes(idxrel), NULL, 0); + + /* Try to find the tuple */ + if (index_getnext_slot(scan, ForwardScanDirection, outslot)) + { + found = true; + ExecMaterializeSlot(outslot); + + xwait = TransactionIdIsValid(snap.xmin) ? + snap.xmin : snap.xmax; + + /* + * If the tuple is locked, wait for locking transaction to finish and + * retry. + */ + if (TransactionIdIsValid(xwait)) + { + XactLockTableWait(xwait, NULL, NULL, XLTW_None); + goto retry; + } + } + + /* Found tuple, try to lock it in the lockmode. */ + if (found) + { + TM_FailureData tmfd; + TM_Result res; + + PushActiveSnapshot(GetLatestSnapshot()); + + res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + outslot, + GetCurrentCommandId(false), + lockmode, + LockWaitBlock, + 0 /* don't follow updates */ , + &tmfd); + + PopActiveSnapshot(); + + switch (res) + { + case TM_Ok: + break; + case TM_Updated: + /* XXX: Improve handling here */ + if (ItemPointerIndicatesMovedPartitions(&tmfd.ctid)) + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying"))); + else + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent update, retrying"))); + goto retry; + case TM_Deleted: + /* XXX: Improve handling here */ + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent delete, retrying"))); + goto retry; + case TM_Invisible: + elog(ERROR, "attempted to lock invisible tuple"); + break; + default: + elog(ERROR, "unexpected table_tuple_lock status: %u", res); + break; + } + } + + index_endscan(scan); + + /* Don't release lock until commit. */ + index_close(idxrel, NoLock); + + return found; +} + +/* + * Compare the tuples in the slots by checking if they have equal values. 
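+ *
+ * (RelationFindReplTupleSeq() below relies on this to match rows column
+ * by column when no usable replica identity index exists, e.g. for
+ * tables using REPLICA IDENTITY FULL.)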
+ */ +static bool +tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, + TypeCacheEntry **eq) +{ + int attrnum; + + Assert(slot1->tts_tupleDescriptor->natts == + slot2->tts_tupleDescriptor->natts); + + slot_getallattrs(slot1); + slot_getallattrs(slot2); + + /* Check equality of the attributes. */ + for (attrnum = 0; attrnum < slot1->tts_tupleDescriptor->natts; attrnum++) + { + Form_pg_attribute att; + TypeCacheEntry *typentry; + + /* + * If one value is NULL and other is not, then they are certainly not + * equal + */ + if (slot1->tts_isnull[attrnum] != slot2->tts_isnull[attrnum]) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (slot1->tts_isnull[attrnum] || slot2->tts_isnull[attrnum]) + continue; + + att = TupleDescAttr(slot1->tts_tupleDescriptor, attrnum); + + typentry = eq[attrnum]; + if (typentry == NULL) + { + typentry = lookup_type_cache(att->atttypid, + TYPECACHE_EQ_OPR_FINFO); + if (!OidIsValid(typentry->eq_opr_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify an equality operator for type %s", + format_type_be(att->atttypid)))); + eq[attrnum] = typentry; + } + + if (!DatumGetBool(FunctionCall2Coll(&typentry->eq_opr_finfo, + att->attcollation, + slot1->tts_values[attrnum], + slot2->tts_values[attrnum]))) + return false; + } + + return true; +} + +/* + * Search the relation 'rel' for tuple using the sequential scan. + * + * If a matching tuple is found, lock it with lockmode, fill the slot with its + * contents, and return true. Return false otherwise. + * + * Note that this stops on the first matching tuple. + * + * This can obviously be quite slow on tables that have more than few rows. + */ +bool +RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, + TupleTableSlot *searchslot, TupleTableSlot *outslot) +{ + TupleTableSlot *scanslot; + TableScanDesc scan; + SnapshotData snap; + TypeCacheEntry **eq; + TransactionId xwait; + bool found; + TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel); + + Assert(equalTupleDescs(desc, outslot->tts_tupleDescriptor)); + + eq = palloc0(sizeof(*eq) * outslot->tts_tupleDescriptor->natts); + + /* Start a heap scan. */ + InitDirtySnapshot(snap); + scan = table_beginscan(rel, &snap, 0, NULL); + scanslot = table_slot_create(rel, NULL); + +retry: + found = false; + + table_rescan(scan, NULL); + + /* Try to find the tuple */ + while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot)) + { + if (!tuples_equal(scanslot, searchslot, eq)) + continue; + + found = true; + ExecCopySlot(outslot, scanslot); + + xwait = TransactionIdIsValid(snap.xmin) ? + snap.xmin : snap.xmax; + + /* + * If the tuple is locked, wait for locking transaction to finish and + * retry. + */ + if (TransactionIdIsValid(xwait)) + { + XactLockTableWait(xwait, NULL, NULL, XLTW_None); + goto retry; + } + + /* Found our tuple and it's not locked */ + break; + } + + /* Found tuple, try to lock it in the lockmode. 
*/ + if (found) + { + TM_FailureData tmfd; + TM_Result res; + + PushActiveSnapshot(GetLatestSnapshot()); + + res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + outslot, + GetCurrentCommandId(false), + lockmode, + LockWaitBlock, + 0 /* don't follow updates */ , + &tmfd); + + PopActiveSnapshot(); + + switch (res) + { + case TM_Ok: + break; + case TM_Updated: + /* XXX: Improve handling here */ + if (ItemPointerIndicatesMovedPartitions(&tmfd.ctid)) + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying"))); + else + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent update, retrying"))); + goto retry; + case TM_Deleted: + /* XXX: Improve handling here */ + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent delete, retrying"))); + goto retry; + case TM_Invisible: + elog(ERROR, "attempted to lock invisible tuple"); + break; + default: + elog(ERROR, "unexpected table_tuple_lock status: %u", res); + break; + } + } + + table_endscan(scan); + ExecDropSingleTupleTableSlot(scanslot); + + return found; +} + +/* + * Insert tuple represented in the slot to the relation, update the indexes, + * and execute any constraints and per-row triggers. + * + * Caller is responsible for opening the indexes. + */ +void +ExecSimpleRelationInsert(ResultRelInfo *resultRelInfo, + EState *estate, TupleTableSlot *slot) +{ + bool skip_tuple = false; + Relation rel = resultRelInfo->ri_RelationDesc; + + /* For now we support only tables. */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION); + + CheckCmdReplicaIdentity(rel, CMD_INSERT); + + /* BEFORE ROW INSERT Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row) + { + if (!ExecBRInsertTriggers(estate, resultRelInfo, slot)) + skip_tuple = true; /* "do nothing" */ + } + + if (!skip_tuple) + { + List *recheckIndexes = NIL; + + /* Compute stored generated columns */ + if (rel->rd_att->constr && + rel->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_INSERT); + + /* Check the constraints of the tuple */ + if (rel->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + if (rel->rd_rel->relispartition) + ExecPartitionCheck(resultRelInfo, slot, estate, true); + + /* OK, store the tuple and create index entries for it */ + simple_table_tuple_insert(resultRelInfo->ri_RelationDesc, slot); + + if (resultRelInfo->ri_NumIndices > 0) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, false, + NULL, NIL); + + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, slot, + recheckIndexes, NULL); + + /* + * XXX we should in theory pass a TransitionCaptureState object to the + * above to capture transition tuples, but after statement triggers + * don't actually get fired by replication yet anyway + */ + + list_free(recheckIndexes); + } +} + +/* + * Find the searchslot tuple and update it with data in the slot, + * update the indexes, and execute any constraints and per-row triggers. + * + * Caller is responsible for opening the indexes. 
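+ * As a rough illustration (hypothetical slot names; error handling and
+ * EPQ setup omitted), a caller is expected to do something like:
+ *
+ *		ExecOpenIndices(resultRelInfo, false);
+ *		ExecSimpleRelationUpdate(resultRelInfo, estate, &epqstate,
+ *								 searchslot, newslot);
+ *		ExecCloseIndices(resultRelInfo);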
+ */ +void +ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, + EState *estate, EPQState *epqstate, + TupleTableSlot *searchslot, TupleTableSlot *slot) +{ + bool skip_tuple = false; + Relation rel = resultRelInfo->ri_RelationDesc; + ItemPointer tid = &(searchslot->tts_tid); + + /* For now we support only tables. */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION); + + CheckCmdReplicaIdentity(rel, CMD_UPDATE); + + /* BEFORE ROW UPDATE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row) + { + if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, + tid, NULL, slot)) + skip_tuple = true; /* "do nothing" */ + } + + if (!skip_tuple) + { + List *recheckIndexes = NIL; + bool update_indexes; + + /* Compute stored generated columns */ + if (rel->rd_att->constr && + rel->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_UPDATE); + + /* Check the constraints of the tuple */ + if (rel->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + if (rel->rd_rel->relispartition) + ExecPartitionCheck(resultRelInfo, slot, estate, true); + + simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, + &update_indexes); + + if (resultRelInfo->ri_NumIndices > 0 && update_indexes) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, true, false, + NULL, NIL); + + /* AFTER ROW UPDATE Triggers */ + ExecARUpdateTriggers(estate, resultRelInfo, + tid, NULL, slot, + recheckIndexes, NULL); + + list_free(recheckIndexes); + } +} + +/* + * Find the searchslot tuple and delete it, and execute any constraints + * and per-row triggers. + * + * Caller is responsible for opening the indexes. + */ +void +ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, + EState *estate, EPQState *epqstate, + TupleTableSlot *searchslot) +{ + bool skip_tuple = false; + Relation rel = resultRelInfo->ri_RelationDesc; + ItemPointer tid = &searchslot->tts_tid; + + CheckCmdReplicaIdentity(rel, CMD_DELETE); + + /* BEFORE ROW DELETE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_before_row) + { + skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, + tid, NULL, NULL); + + } + + if (!skip_tuple) + { + /* OK, delete the tuple */ + simple_table_tuple_delete(rel, tid, estate->es_snapshot); + + /* AFTER ROW DELETE Triggers */ + ExecARDeleteTriggers(estate, resultRelInfo, + tid, NULL, NULL); + } +} + +/* + * Check if command can be executed with current replica identity. + */ +void +CheckCmdReplicaIdentity(Relation rel, CmdType cmd) +{ + PublicationActions *pubactions; + + /* We only need to do checks for UPDATE and DELETE. */ + if (cmd != CMD_UPDATE && cmd != CMD_DELETE) + return; + + /* If relation has replica identity we are always good. */ + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + OidIsValid(RelationGetReplicaIndex(rel))) + return; + + /* + * This is either UPDATE OR DELETE and there is no replica identity. + * + * Check if the table publishes UPDATES or DELETES. 
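+ * If it does, we must reject the command; the errhint below points the
+ * user at the fix, e.g. (illustrative table name)
+ *		ALTER TABLE mytab REPLICA IDENTITY FULL;
+ * or REPLICA IDENTITY USING INDEX with a suitable unique index.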
+ */ + pubactions = GetRelationPublicationActions(rel); + if (cmd == CMD_UPDATE && pubactions->pubupdate) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot update table \"%s\" because it does not have a replica identity and publishes updates", + RelationGetRelationName(rel)), + errhint("To enable updating the table, set REPLICA IDENTITY using ALTER TABLE."))); + else if (cmd == CMD_DELETE && pubactions->pubdelete) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot delete from table \"%s\" because it does not have a replica identity and publishes deletes", + RelationGetRelationName(rel)), + errhint("To enable deleting from the table, set REPLICA IDENTITY using ALTER TABLE."))); +} + + +/* + * Check if we support writing into specific relkind. + * + * The nspname and relname are only needed for error reporting. + */ +void +CheckSubscriptionRelkind(char relkind, const char *nspname, + const char *relname) +{ + /* + * Give a more specific error for foreign tables. + */ + if (relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot use relation \"%s.%s\" as logical replication target", + nspname, relname), + errdetail("\"%s.%s\" is a foreign table.", + nspname, relname))); + + if (relkind != RELKIND_RELATION && relkind != RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot use relation \"%s.%s\" as logical replication target", + nspname, relname), + errdetail("\"%s.%s\" is not a table.", + nspname, relname))); +} diff --git a/src/backend/executor/execSRF.c b/src/backend/executor/execSRF.c new file mode 100644 index 0000000..545b6c1 --- /dev/null +++ b/src/backend/executor/execSRF.c @@ -0,0 +1,980 @@ +/*------------------------------------------------------------------------- + * + * execSRF.c + * Routines implementing the API for set-returning functions + * + * This file serves nodeFunctionscan.c and nodeProjectSet.c, providing + * common code for calling set-returning functions according to the + * ReturnSetInfo API. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execSRF.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/objectaccess.h" +#include "executor/execdebug.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "parser/parse_coerce.h" +#include "pgstat.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/typcache.h" + + +/* static function decls */ +static void init_sexpr(Oid foid, Oid input_collation, Expr *node, + SetExprState *sexpr, PlanState *parent, + MemoryContext sexprCxt, bool allowSRF, bool needDescForSRF); +static void ShutdownSetExpr(Datum arg); +static void ExecEvalFuncArgs(FunctionCallInfo fcinfo, + List *argList, ExprContext *econtext); +static void ExecPrepareTuplestoreResult(SetExprState *sexpr, + ExprContext *econtext, + Tuplestorestate *resultStore, + TupleDesc resultDesc); +static void tupledesc_match(TupleDesc dst_tupdesc, TupleDesc src_tupdesc); + + +/* + * Prepare function call in FROM (ROWS FROM) for execution. + * + * This is used by nodeFunctionscan.c. 
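+ * Such expressions arise from set-returning functions placed in FROM, as
+ * in these illustrative queries:
+ *
+ *		SELECT * FROM generate_series(1, 3) AS g(i);
+ *		SELECT * FROM ROWS FROM (generate_series(1, 3),
+ *								 unnest('{a,b}'::text[])) AS t(i, v);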
+ */ +SetExprState * +ExecInitTableFunctionResult(Expr *expr, + ExprContext *econtext, PlanState *parent) +{ + SetExprState *state = makeNode(SetExprState); + + state->funcReturnsSet = false; + state->expr = expr; + state->func.fn_oid = InvalidOid; + + /* + * Normally the passed expression tree will be a FuncExpr, since the + * grammar only allows a function call at the top level of a table + * function reference. However, if the function doesn't return set then + * the planner might have replaced the function call via constant-folding + * or inlining. So if we see any other kind of expression node, execute + * it via the general ExecEvalExpr() code. That code path will not + * support set-returning functions buried in the expression, though. + */ + if (IsA(expr, FuncExpr)) + { + FuncExpr *func = (FuncExpr *) expr; + + state->funcReturnsSet = func->funcretset; + state->args = ExecInitExprList(func->args, parent); + + init_sexpr(func->funcid, func->inputcollid, expr, state, parent, + econtext->ecxt_per_query_memory, func->funcretset, false); + } + else + { + state->elidedFuncState = ExecInitExpr(expr, parent); + } + + return state; +} + +/* + * ExecMakeTableFunctionResult + * + * Evaluate a table function, producing a materialized result in a Tuplestore + * object. + * + * This is used by nodeFunctionscan.c. + */ +Tuplestorestate * +ExecMakeTableFunctionResult(SetExprState *setexpr, + ExprContext *econtext, + MemoryContext argContext, + TupleDesc expectedDesc, + bool randomAccess) +{ + Tuplestorestate *tupstore = NULL; + TupleDesc tupdesc = NULL; + Oid funcrettype; + bool returnsTuple; + bool returnsSet = false; + FunctionCallInfo fcinfo; + PgStat_FunctionCallUsage fcusage; + ReturnSetInfo rsinfo; + HeapTupleData tmptup; + MemoryContext callerContext; + bool first_time = true; + + /* + * Execute per-tablefunc actions in appropriate context. + * + * The FunctionCallInfo needs to live across all the calls to a + * ValuePerCall function, so it can't be allocated in the per-tuple + * context. Similarly, the function arguments need to be evaluated in a + * context that is longer lived than the per-tuple context: The argument + * values would otherwise disappear when we reset that context in the + * inner loop. As the caller's CurrentMemoryContext is typically a + * query-lifespan context, we don't want to leak memory there. We require + * the caller to pass a separate memory context that can be used for this, + * and can be reset each time through to avoid bloat. + */ + MemoryContextReset(argContext); + callerContext = MemoryContextSwitchTo(argContext); + + funcrettype = exprType((Node *) setexpr->expr); + + returnsTuple = type_is_rowtype(funcrettype); + + /* + * Prepare a resultinfo node for communication. We always do this even if + * not expecting a set result, so that we can pass expectedDesc. In the + * generic-expression case, the expression doesn't actually get to see the + * resultinfo, but set it up anyway because we use some of the fields as + * our own state variables. 
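+ * (The fields used that way are returnMode, isDone, setResult and setDesc,
+ * which the loop below consults even when no function ever sees rsinfo.)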
+ */ + rsinfo.type = T_ReturnSetInfo; + rsinfo.econtext = econtext; + rsinfo.expectedDesc = expectedDesc; + rsinfo.allowedModes = (int) (SFRM_ValuePerCall | SFRM_Materialize | SFRM_Materialize_Preferred); + if (randomAccess) + rsinfo.allowedModes |= (int) SFRM_Materialize_Random; + rsinfo.returnMode = SFRM_ValuePerCall; + /* isDone is filled below */ + rsinfo.setResult = NULL; + rsinfo.setDesc = NULL; + + fcinfo = palloc(SizeForFunctionCallInfo(list_length(setexpr->args))); + + /* + * Normally the passed expression tree will be a SetExprState, since the + * grammar only allows a function call at the top level of a table + * function reference. However, if the function doesn't return set then + * the planner might have replaced the function call via constant-folding + * or inlining. So if we see any other kind of expression node, execute + * it via the general ExecEvalExpr() code; the only difference is that we + * don't get a chance to pass a special ReturnSetInfo to any functions + * buried in the expression. + */ + if (!setexpr->elidedFuncState) + { + /* + * This path is similar to ExecMakeFunctionResultSet. + */ + returnsSet = setexpr->funcReturnsSet; + InitFunctionCallInfoData(*fcinfo, &(setexpr->func), + list_length(setexpr->args), + setexpr->fcinfo->fncollation, + NULL, (Node *) &rsinfo); + /* evaluate the function's argument list */ + Assert(CurrentMemoryContext == argContext); + ExecEvalFuncArgs(fcinfo, setexpr->args, econtext); + + /* + * If function is strict, and there are any NULL arguments, skip + * calling the function and act like it returned NULL (or an empty + * set, in the returns-set case). + */ + if (setexpr->func.fn_strict) + { + int i; + + for (i = 0; i < fcinfo->nargs; i++) + { + if (fcinfo->args[i].isnull) + goto no_function_result; + } + } + } + else + { + /* Treat setexpr as a generic expression */ + InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL); + } + + /* + * Switch to short-lived context for calling the function or expression. + */ + MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Loop to handle the ValuePerCall protocol (which is also the same + * behavior needed in the generic ExecEvalExpr path). + */ + for (;;) + { + Datum result; + + CHECK_FOR_INTERRUPTS(); + + /* + * Reset per-tuple memory context before each call of the function or + * expression. This cleans up any local memory the function may leak + * when called. + */ + ResetExprContext(econtext); + + /* Call the function or expression one time */ + if (!setexpr->elidedFuncState) + { + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + rsinfo.isDone = ExprSingleResult; + result = FunctionCallInvoke(fcinfo); + + pgstat_end_function_usage(&fcusage, + rsinfo.isDone != ExprMultipleResult); + } + else + { + result = + ExecEvalExpr(setexpr->elidedFuncState, econtext, &fcinfo->isnull); + rsinfo.isDone = ExprSingleResult; + } + + /* Which protocol does function want to use? */ + if (rsinfo.returnMode == SFRM_ValuePerCall) + { + /* + * Check for end of result set. + */ + if (rsinfo.isDone == ExprEndResult) + break; + + /* + * If first time through, build tuplestore for result. For a + * scalar function result type, also make a suitable tupdesc. 
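+ * (For a composite result type we leave tupdesc NULL here and derive it
+ * from the first non-NULL row the function returns, further below.)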
+ */ + if (first_time) + { + MemoryContext oldcontext = + MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + tupstore = tuplestore_begin_heap(randomAccess, false, work_mem); + rsinfo.setResult = tupstore; + if (!returnsTuple) + { + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, + (AttrNumber) 1, + "column", + funcrettype, + -1, + 0); + rsinfo.setDesc = tupdesc; + } + MemoryContextSwitchTo(oldcontext); + } + + /* + * Store current resultset item. + */ + if (returnsTuple) + { + if (!fcinfo->isnull) + { + HeapTupleHeader td = DatumGetHeapTupleHeader(result); + + if (tupdesc == NULL) + { + MemoryContext oldcontext = + MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * This is the first non-NULL result from the + * function. Use the type info embedded in the + * rowtype Datum to look up the needed tupdesc. Make + * a copy for the query. + */ + tupdesc = lookup_rowtype_tupdesc_copy(HeapTupleHeaderGetTypeId(td), + HeapTupleHeaderGetTypMod(td)); + rsinfo.setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + } + else + { + /* + * Verify all later returned rows have same subtype; + * necessary in case the type is RECORD. + */ + if (HeapTupleHeaderGetTypeId(td) != tupdesc->tdtypeid || + HeapTupleHeaderGetTypMod(td) != tupdesc->tdtypmod) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("rows returned by function are not all of the same row type"))); + } + + /* + * tuplestore_puttuple needs a HeapTuple not a bare + * HeapTupleHeader, but it doesn't need all the fields. + */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(td); + tmptup.t_data = td; + + tuplestore_puttuple(tupstore, &tmptup); + } + else + { + /* + * NULL result from a tuple-returning function; expand it + * to a row of all nulls. We rely on the expectedDesc to + * form such rows. (Note: this would be problematic if + * tuplestore_putvalues saved the tdtypeid/tdtypmod from + * the provided descriptor, since that might not match + * what we get from the function itself. But it doesn't.) + */ + int natts = expectedDesc->natts; + bool *nullflags; + + nullflags = (bool *) palloc(natts * sizeof(bool)); + memset(nullflags, true, natts * sizeof(bool)); + tuplestore_putvalues(tupstore, expectedDesc, NULL, nullflags); + } + } + else + { + /* Scalar-type case: just store the function result */ + tuplestore_putvalues(tupstore, tupdesc, &result, &fcinfo->isnull); + } + + /* + * Are we done? + */ + if (rsinfo.isDone != ExprMultipleResult) + break; + + /* + * Check that set-returning functions were properly declared. + * (Note: for historical reasons, we don't complain if a non-SRF + * returns ExprEndResult; that's treated as returning NULL.) 
+ */ + if (!returnsSet) + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("table-function protocol for value-per-call mode was not followed"))); + } + else if (rsinfo.returnMode == SFRM_Materialize) + { + /* check we're on the same page as the function author */ + if (!first_time || rsinfo.isDone != ExprSingleResult || !returnsSet) + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("table-function protocol for materialize mode was not followed"))); + /* Done evaluating the set result */ + break; + } + else + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("unrecognized table-function returnMode: %d", + (int) rsinfo.returnMode))); + + first_time = false; + } + +no_function_result: + + /* + * If we got nothing from the function (ie, an empty-set or NULL result), + * we have to create the tuplestore to return, and if it's a + * non-set-returning function then insert a single all-nulls row. As + * above, we depend on the expectedDesc to manufacture the dummy row. + */ + if (rsinfo.setResult == NULL) + { + MemoryContext oldcontext = + MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + tupstore = tuplestore_begin_heap(randomAccess, false, work_mem); + rsinfo.setResult = tupstore; + MemoryContextSwitchTo(oldcontext); + + if (!returnsSet) + { + int natts = expectedDesc->natts; + bool *nullflags; + + nullflags = (bool *) palloc(natts * sizeof(bool)); + memset(nullflags, true, natts * sizeof(bool)); + tuplestore_putvalues(tupstore, expectedDesc, NULL, nullflags); + } + } + + /* + * If function provided a tupdesc, cross-check it. We only really need to + * do this for functions returning RECORD, but might as well do it always. + */ + if (rsinfo.setDesc) + { + tupledesc_match(expectedDesc, rsinfo.setDesc); + + /* + * If it is a dynamically-allocated TupleDesc, free it: it is + * typically allocated in a per-query context, so we must avoid + * leaking it across multiple usages. + */ + if (rsinfo.setDesc->tdrefcount == -1) + FreeTupleDesc(rsinfo.setDesc); + } + + MemoryContextSwitchTo(callerContext); + + /* All done, pass back the tuplestore */ + return rsinfo.setResult; +} + + +/* + * Prepare targetlist SRF function call for execution. + * + * This is used by nodeProjectSet.c. + */ +SetExprState * +ExecInitFunctionResultSet(Expr *expr, + ExprContext *econtext, PlanState *parent) +{ + SetExprState *state = makeNode(SetExprState); + + state->funcReturnsSet = true; + state->expr = expr; + state->func.fn_oid = InvalidOid; + + /* + * Initialize metadata. The expression node could be either a FuncExpr or + * an OpExpr. + */ + if (IsA(expr, FuncExpr)) + { + FuncExpr *func = (FuncExpr *) expr; + + state->args = ExecInitExprList(func->args, parent); + init_sexpr(func->funcid, func->inputcollid, expr, state, parent, + econtext->ecxt_per_query_memory, true, true); + } + else if (IsA(expr, OpExpr)) + { + OpExpr *op = (OpExpr *) expr; + + state->args = ExecInitExprList(op->args, parent); + init_sexpr(op->opfuncid, op->inputcollid, expr, state, parent, + econtext->ecxt_per_query_memory, true, true); + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(expr)); + + /* shouldn't get here unless the selected function returns set */ + Assert(state->func.fn_retset); + + return state; +} + +/* + * ExecMakeFunctionResultSet + * + * Evaluate the arguments to a set-returning function and then call the + * function itself. 
The argument expressions may not contain set-returning + * functions (the planner is supposed to have separated evaluation for those). + * + * This should be called in a short-lived (per-tuple) context, argContext + * needs to live until all rows have been returned (i.e. *isDone set to + * ExprEndResult or ExprSingleResult). + * + * This is used by nodeProjectSet.c. + */ +Datum +ExecMakeFunctionResultSet(SetExprState *fcache, + ExprContext *econtext, + MemoryContext argContext, + bool *isNull, + ExprDoneCond *isDone) +{ + List *arguments; + Datum result; + FunctionCallInfo fcinfo; + PgStat_FunctionCallUsage fcusage; + ReturnSetInfo rsinfo; + bool callit; + int i; + +restart: + + /* Guard against stack overflow due to overly complex expressions */ + check_stack_depth(); + + /* + * If a previous call of the function returned a set result in the form of + * a tuplestore, continue reading rows from the tuplestore until it's + * empty. + */ + if (fcache->funcResultStore) + { + TupleTableSlot *slot = fcache->funcResultSlot; + MemoryContext oldContext; + bool foundTup; + + /* + * Have to make sure tuple in slot lives long enough, otherwise + * clearing the slot could end up trying to free something already + * freed. + */ + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + foundTup = tuplestore_gettupleslot(fcache->funcResultStore, true, false, + fcache->funcResultSlot); + MemoryContextSwitchTo(oldContext); + + if (foundTup) + { + *isDone = ExprMultipleResult; + if (fcache->funcReturnsTuple) + { + /* We must return the whole tuple as a Datum. */ + *isNull = false; + return ExecFetchSlotHeapTupleDatum(fcache->funcResultSlot); + } + else + { + /* Extract the first column and return it as a scalar. */ + return slot_getattr(fcache->funcResultSlot, 1, isNull); + } + } + /* Exhausted the tuplestore, so clean up */ + tuplestore_end(fcache->funcResultStore); + fcache->funcResultStore = NULL; + *isDone = ExprEndResult; + *isNull = true; + return (Datum) 0; + } + + /* + * arguments is a list of expressions to evaluate before passing to the + * function manager. We skip the evaluation if it was already done in the + * previous call (ie, we are continuing the evaluation of a set-valued + * function). Otherwise, collect the current argument values into fcinfo. + * + * The arguments have to live in a context that lives at least until all + * rows from this SRF have been returned, otherwise ValuePerCall SRFs + * would reference freed memory after the first returned row. + */ + fcinfo = fcache->fcinfo; + arguments = fcache->args; + if (!fcache->setArgsValid) + { + MemoryContext oldContext = MemoryContextSwitchTo(argContext); + + ExecEvalFuncArgs(fcinfo, arguments, econtext); + MemoryContextSwitchTo(oldContext); + } + else + { + /* Reset flag (we may set it again below) */ + fcache->setArgsValid = false; + } + + /* + * Now call the function, passing the evaluated parameter values. + */ + + /* Prepare a resultinfo node for communication. */ + fcinfo->resultinfo = (Node *) &rsinfo; + rsinfo.type = T_ReturnSetInfo; + rsinfo.econtext = econtext; + rsinfo.expectedDesc = fcache->funcResultDesc; + rsinfo.allowedModes = (int) (SFRM_ValuePerCall | SFRM_Materialize); + /* note we do not set SFRM_Materialize_Random or _Preferred */ + rsinfo.returnMode = SFRM_ValuePerCall; + /* isDone is filled below */ + rsinfo.setResult = NULL; + rsinfo.setDesc = NULL; + + /* + * If function is strict, and there are any NULL arguments, skip calling + * the function. 
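+ * The result is then an empty set rather than an error; for example
+ * (illustrative, assuming the function is declared strict)
+ *		SELECT generate_series(1, NULL::int4);
+ * returns no rows.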
+ */ + callit = true; + if (fcache->func.fn_strict) + { + for (i = 0; i < fcinfo->nargs; i++) + { + if (fcinfo->args[i].isnull) + { + callit = false; + break; + } + } + } + + if (callit) + { + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + rsinfo.isDone = ExprSingleResult; + result = FunctionCallInvoke(fcinfo); + *isNull = fcinfo->isnull; + *isDone = rsinfo.isDone; + + pgstat_end_function_usage(&fcusage, + rsinfo.isDone != ExprMultipleResult); + } + else + { + /* for a strict SRF, result for NULL is an empty set */ + result = (Datum) 0; + *isNull = true; + *isDone = ExprEndResult; + } + + /* Which protocol does function want to use? */ + if (rsinfo.returnMode == SFRM_ValuePerCall) + { + if (*isDone != ExprEndResult) + { + /* + * Save the current argument values to re-use on the next call. + */ + if (*isDone == ExprMultipleResult) + { + fcache->setArgsValid = true; + /* Register cleanup callback if we didn't already */ + if (!fcache->shutdown_reg) + { + RegisterExprContextCallback(econtext, + ShutdownSetExpr, + PointerGetDatum(fcache)); + fcache->shutdown_reg = true; + } + } + } + } + else if (rsinfo.returnMode == SFRM_Materialize) + { + /* check we're on the same page as the function author */ + if (rsinfo.isDone != ExprSingleResult) + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("table-function protocol for materialize mode was not followed"))); + if (rsinfo.setResult != NULL) + { + /* prepare to return values from the tuplestore */ + ExecPrepareTuplestoreResult(fcache, econtext, + rsinfo.setResult, + rsinfo.setDesc); + /* loop back to top to start returning from tuplestore */ + goto restart; + } + /* if setResult was left null, treat it as empty set */ + *isDone = ExprEndResult; + *isNull = true; + result = (Datum) 0; + } + else + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("unrecognized table-function returnMode: %d", + (int) rsinfo.returnMode))); + + return result; +} + + +/* + * init_sexpr - initialize a SetExprState node during first use + */ +static void +init_sexpr(Oid foid, Oid input_collation, Expr *node, + SetExprState *sexpr, PlanState *parent, + MemoryContext sexprCxt, bool allowSRF, bool needDescForSRF) +{ + AclResult aclresult; + size_t numargs = list_length(sexpr->args); + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid)); + InvokeFunctionExecuteHook(foid); + + /* + * Safety check on nargs. Under normal circumstances this should never + * fail, as parser should check sooner. But possibly it might fail if + * server has been compiled with FUNC_MAX_ARGS smaller than some functions + * declared in pg_proc? 
+ */ + if (list_length(sexpr->args) > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("cannot pass more than %d argument to a function", + "cannot pass more than %d arguments to a function", + FUNC_MAX_ARGS, + FUNC_MAX_ARGS))); + + /* Set up the primary fmgr lookup information */ + fmgr_info_cxt(foid, &(sexpr->func), sexprCxt); + fmgr_info_set_expr((Node *) sexpr->expr, &(sexpr->func)); + + /* Initialize the function call parameter struct as well */ + sexpr->fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(numargs)); + InitFunctionCallInfoData(*sexpr->fcinfo, &(sexpr->func), + numargs, + input_collation, NULL, NULL); + + /* If function returns set, check if that's allowed by caller */ + if (sexpr->func.fn_retset && !allowSRF) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"), + parent ? executor_errposition(parent->state, + exprLocation((Node *) node)) : 0)); + + /* Otherwise, caller should have marked the sexpr correctly */ + Assert(sexpr->func.fn_retset == sexpr->funcReturnsSet); + + /* If function returns set, prepare expected tuple descriptor */ + if (sexpr->func.fn_retset && needDescForSRF) + { + TypeFuncClass functypclass; + Oid funcrettype; + TupleDesc tupdesc; + MemoryContext oldcontext; + + functypclass = get_expr_result_type(sexpr->func.fn_expr, + &funcrettype, + &tupdesc); + + /* Must save tupdesc in sexpr's context */ + oldcontext = MemoryContextSwitchTo(sexprCxt); + + if (functypclass == TYPEFUNC_COMPOSITE || + functypclass == TYPEFUNC_COMPOSITE_DOMAIN) + { + /* Composite data type, e.g. a table's row type */ + Assert(tupdesc); + /* Must copy it out of typcache for safety */ + sexpr->funcResultDesc = CreateTupleDescCopy(tupdesc); + sexpr->funcReturnsTuple = true; + } + else if (functypclass == TYPEFUNC_SCALAR) + { + /* Base data type, i.e. scalar */ + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, + (AttrNumber) 1, + NULL, + funcrettype, + -1, + 0); + sexpr->funcResultDesc = tupdesc; + sexpr->funcReturnsTuple = false; + } + else if (functypclass == TYPEFUNC_RECORD) + { + /* This will work if function doesn't need an expectedDesc */ + sexpr->funcResultDesc = NULL; + sexpr->funcReturnsTuple = true; + } + else + { + /* Else, we will fail if function needs an expectedDesc */ + sexpr->funcResultDesc = NULL; + } + + MemoryContextSwitchTo(oldcontext); + } + else + sexpr->funcResultDesc = NULL; + + /* Initialize additional state */ + sexpr->funcResultStore = NULL; + sexpr->funcResultSlot = NULL; + sexpr->shutdown_reg = false; +} + +/* + * callback function in case a SetExprState needs to be shut down before it + * has been run to completion + */ +static void +ShutdownSetExpr(Datum arg) +{ + SetExprState *sexpr = castNode(SetExprState, DatumGetPointer(arg)); + + /* If we have a slot, make sure it's let go of any tuplestore pointer */ + if (sexpr->funcResultSlot) + ExecClearTuple(sexpr->funcResultSlot); + + /* Release any open tuplestore */ + if (sexpr->funcResultStore) + tuplestore_end(sexpr->funcResultStore); + sexpr->funcResultStore = NULL; + + /* Clear any active set-argument state */ + sexpr->setArgsValid = false; + + /* execUtils will deregister the callback... */ + sexpr->shutdown_reg = false; +} + +/* + * Evaluate arguments for a function. 
+ */ +static void +ExecEvalFuncArgs(FunctionCallInfo fcinfo, + List *argList, + ExprContext *econtext) +{ + int i; + ListCell *arg; + + i = 0; + foreach(arg, argList) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + fcinfo->args[i].value = ExecEvalExpr(argstate, + econtext, + &fcinfo->args[i].isnull); + i++; + } + + Assert(i == fcinfo->nargs); +} + +/* + * ExecPrepareTuplestoreResult + * + * Subroutine for ExecMakeFunctionResultSet: prepare to extract rows from a + * tuplestore function result. We must set up a funcResultSlot (unless + * already done in a previous call cycle) and verify that the function + * returned the expected tuple descriptor. + */ +static void +ExecPrepareTuplestoreResult(SetExprState *sexpr, + ExprContext *econtext, + Tuplestorestate *resultStore, + TupleDesc resultDesc) +{ + sexpr->funcResultStore = resultStore; + + if (sexpr->funcResultSlot == NULL) + { + /* Create a slot so we can read data out of the tuplestore */ + TupleDesc slotDesc; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(sexpr->func.fn_mcxt); + + /* + * If we were not able to determine the result rowtype from context, + * and the function didn't return a tupdesc, we have to fail. + */ + if (sexpr->funcResultDesc) + slotDesc = sexpr->funcResultDesc; + else if (resultDesc) + { + /* don't assume resultDesc is long-lived */ + slotDesc = CreateTupleDescCopy(resultDesc); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning setof record called in " + "context that cannot accept type record"))); + slotDesc = NULL; /* keep compiler quiet */ + } + + sexpr->funcResultSlot = MakeSingleTupleTableSlot(slotDesc, + &TTSOpsMinimalTuple); + MemoryContextSwitchTo(oldcontext); + } + + /* + * If function provided a tupdesc, cross-check it. We only really need to + * do this for functions returning RECORD, but might as well do it always. + */ + if (resultDesc) + { + if (sexpr->funcResultDesc) + tupledesc_match(sexpr->funcResultDesc, resultDesc); + + /* + * If it is a dynamically-allocated TupleDesc, free it: it is + * typically allocated in a per-query context, so we must avoid + * leaking it across multiple usages. + */ + if (resultDesc->tdrefcount == -1) + FreeTupleDesc(resultDesc); + } + + /* Register cleanup callback if we didn't already */ + if (!sexpr->shutdown_reg) + { + RegisterExprContextCallback(econtext, + ShutdownSetExpr, + PointerGetDatum(sexpr)); + sexpr->shutdown_reg = true; + } +} + +/* + * Check that function result tuple type (src_tupdesc) matches or can + * be considered to match what the query expects (dst_tupdesc). If + * they don't match, ereport. + * + * We really only care about number of attributes and data type. + * Also, we can ignore type mismatch on columns that are dropped in the + * destination type, so long as the physical storage matches. This is + * helpful in some cases involving out-of-date cached plans. 
+ */ +static void +tupledesc_match(TupleDesc dst_tupdesc, TupleDesc src_tupdesc) +{ + int i; + + if (dst_tupdesc->natts != src_tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("function return row and query-specified return row do not match"), + errdetail_plural("Returned row contains %d attribute, but query expects %d.", + "Returned row contains %d attributes, but query expects %d.", + src_tupdesc->natts, + src_tupdesc->natts, dst_tupdesc->natts))); + + for (i = 0; i < dst_tupdesc->natts; i++) + { + Form_pg_attribute dattr = TupleDescAttr(dst_tupdesc, i); + Form_pg_attribute sattr = TupleDescAttr(src_tupdesc, i); + + if (IsBinaryCoercible(sattr->atttypid, dattr->atttypid)) + continue; /* no worries */ + if (!dattr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("function return row and query-specified return row do not match"), + errdetail("Returned type %s at ordinal position %d, but query expects %s.", + format_type_be(sattr->atttypid), + i + 1, + format_type_be(dattr->atttypid)))); + + if (dattr->attlen != sattr->attlen || + dattr->attalign != sattr->attalign) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("function return row and query-specified return row do not match"), + errdetail("Physical storage mismatch on dropped attribute at ordinal position %d.", + i + 1))); + } +} diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c new file mode 100644 index 0000000..69ab345 --- /dev/null +++ b/src/backend/executor/execScan.c @@ -0,0 +1,342 @@ +/*------------------------------------------------------------------------- + * + * execScan.c + * This code provides support for generalized relation scans. ExecScan + * is passed a node and a pointer to a function to "do the right thing" + * and return a tuple from the relation. ExecScan then does the tedious + * stuff - checking the qualification and projecting the tuple + * appropriately. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execScan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + + +/* + * ExecScanFetch -- check interrupts & fetch next potential tuple + * + * This routine is concerned with substituting a test tuple if we are + * inside an EvalPlanQual recheck. If we aren't, just execute + * the access method's next-tuple routine. + */ +static inline TupleTableSlot * +ExecScanFetch(ScanState *node, + ExecScanAccessMtd accessMtd, + ExecScanRecheckMtd recheckMtd) +{ + EState *estate = node->ps.state; + + CHECK_FOR_INTERRUPTS(); + + if (estate->es_epq_active != NULL) + { + EPQState *epqstate = estate->es_epq_active; + + /* + * We are inside an EvalPlanQual recheck. Return the test tuple if + * one is available, after rechecking any access-method-specific + * conditions. + */ + Index scanrelid = ((Scan *) node->ps.plan)->scanrelid; + + if (scanrelid == 0) + { + /* + * This is a ForeignScan or CustomScan which has pushed down a + * join to the remote side. The recheck method is responsible not + * only for rechecking the scan/join quals but also for storing + * the correct tuple in the slot. 
+ */ + + TupleTableSlot *slot = node->ss_ScanTupleSlot; + + if (!(*recheckMtd) (node, slot)) + ExecClearTuple(slot); /* would not be returned by scan */ + return slot; + } + else if (epqstate->relsubs_done[scanrelid - 1]) + { + /* + * Return empty slot, as we already performed an EPQ substitution + * for this relation. + */ + + TupleTableSlot *slot = node->ss_ScanTupleSlot; + + /* Return empty slot, as we already returned a tuple */ + return ExecClearTuple(slot); + } + else if (epqstate->relsubs_slot[scanrelid - 1] != NULL) + { + /* + * Return replacement tuple provided by the EPQ caller. + */ + + TupleTableSlot *slot = epqstate->relsubs_slot[scanrelid - 1]; + + Assert(epqstate->relsubs_rowmark[scanrelid - 1] == NULL); + + /* Mark to remember that we shouldn't return more */ + epqstate->relsubs_done[scanrelid - 1] = true; + + /* Return empty slot if we haven't got a test tuple */ + if (TupIsNull(slot)) + return NULL; + + /* Check if it meets the access-method conditions */ + if (!(*recheckMtd) (node, slot)) + return ExecClearTuple(slot); /* would not be returned by + * scan */ + return slot; + } + else if (epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* + * Fetch and return replacement tuple using a non-locking rowmark. + */ + + TupleTableSlot *slot = node->ss_ScanTupleSlot; + + /* Mark to remember that we shouldn't return more */ + epqstate->relsubs_done[scanrelid - 1] = true; + + if (!EvalPlanQualFetchRowMark(epqstate, scanrelid, slot)) + return NULL; + + /* Return empty slot if we haven't got a test tuple */ + if (TupIsNull(slot)) + return NULL; + + /* Check if it meets the access-method conditions */ + if (!(*recheckMtd) (node, slot)) + return ExecClearTuple(slot); /* would not be returned by + * scan */ + return slot; + } + } + + /* + * Run the node-type-specific access method function to get the next tuple + */ + return (*accessMtd) (node); +} + +/* ---------------------------------------------------------------- + * ExecScan + * + * Scans the relation using the 'access method' indicated and + * returns the next qualifying tuple. + * The access method returns the next tuple and ExecScan() is + * responsible for checking the tuple returned against the qual-clause. + * + * A 'recheck method' must also be provided that can check an + * arbitrary tuple of the relation against any qual conditions + * that are implemented internal to the access method. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for scanning so that the + * "cursor" is positioned before the first qualifying tuple. + * ---------------------------------------------------------------- + */ +TupleTableSlot * +ExecScan(ScanState *node, + ExecScanAccessMtd accessMtd, /* function returning a tuple */ + ExecScanRecheckMtd recheckMtd) +{ + ExprContext *econtext; + ExprState *qual; + ProjectionInfo *projInfo; + + /* + * Fetch data from node + */ + qual = node->ps.qual; + projInfo = node->ps.ps_ProjInfo; + econtext = node->ps.ps_ExprContext; + + /* interrupt checks are in ExecScanFetch */ + + /* + * If we have neither a qual to check nor a projection to do, just skip + * all the overhead and return the raw scan tuple. + */ + if (!qual && !projInfo) + { + ResetExprContext(econtext); + return ExecScanFetch(node, accessMtd, recheckMtd); + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. 
+ */ + ResetExprContext(econtext); + + /* + * get a tuple from the access method. Loop until we obtain a tuple that + * passes the qualification. + */ + for (;;) + { + TupleTableSlot *slot; + + slot = ExecScanFetch(node, accessMtd, recheckMtd); + + /* + * if the slot returned by the accessMtd contains NULL, then it means + * there is nothing more to scan so we just return an empty slot, + * being careful to use the projection result slot so it has correct + * tupleDesc. + */ + if (TupIsNull(slot)) + { + if (projInfo) + return ExecClearTuple(projInfo->pi_state.resultslot); + else + return slot; + } + + /* + * place the current tuple into the expr context + */ + econtext->ecxt_scantuple = slot; + + /* + * check that the current tuple satisfies the qual-clause + * + * check for non-null qual here to avoid a function call to ExecQual() + * when the qual is null ... saves only a few cycles, but they add up + * ... + */ + if (qual == NULL || ExecQual(qual, econtext)) + { + /* + * Found a satisfactory scan tuple. + */ + if (projInfo) + { + /* + * Form a projection tuple, store it in the result tuple slot + * and return it. + */ + return ExecProject(projInfo); + } + else + { + /* + * Here, we aren't projecting, so just return scan tuple. + */ + return slot; + } + } + else + InstrCountFiltered1(node, 1); + + /* + * Tuple fails qual, so free per-tuple memory and try again. + */ + ResetExprContext(econtext); + } +} + +/* + * ExecAssignScanProjectionInfo + * Set up projection info for a scan node, if necessary. + * + * We can avoid a projection step if the requested tlist exactly matches + * the underlying tuple type. If so, we just set ps_ProjInfo to NULL. + * Note that this case occurs not only for simple "SELECT * FROM ...", but + * also in most cases where there are joins or other processing nodes above + * the scan node, because the planner will preferentially generate a matching + * tlist. + * + * The scan slot's descriptor must have been set already. + */ +void +ExecAssignScanProjectionInfo(ScanState *node) +{ + Scan *scan = (Scan *) node->ps.plan; + TupleDesc tupdesc = node->ss_ScanTupleSlot->tts_tupleDescriptor; + + ExecConditionalAssignProjectionInfo(&node->ps, tupdesc, scan->scanrelid); +} + +/* + * ExecAssignScanProjectionInfoWithVarno + * As above, but caller can specify varno expected in Vars in the tlist. + */ +void +ExecAssignScanProjectionInfoWithVarno(ScanState *node, Index varno) +{ + TupleDesc tupdesc = node->ss_ScanTupleSlot->tts_tupleDescriptor; + + ExecConditionalAssignProjectionInfo(&node->ps, tupdesc, varno); +} + +/* + * ExecScanReScan + * + * This must be called within the ReScan function of any plan node type + * that uses ExecScan(). + */ +void +ExecScanReScan(ScanState *node) +{ + EState *estate = node->ps.state; + + /* + * We must clear the scan tuple so that observers (e.g., execCurrent.c) + * can tell that this plan node is not positioned on a tuple. + */ + ExecClearTuple(node->ss_ScanTupleSlot); + + /* Rescan EvalPlanQual tuple if we're inside an EvalPlanQual recheck */ + if (estate->es_epq_active != NULL) + { + EPQState *epqstate = estate->es_epq_active; + Index scanrelid = ((Scan *) node->ps.plan)->scanrelid; + + if (scanrelid > 0) + epqstate->relsubs_done[scanrelid - 1] = false; + else + { + Bitmapset *relids; + int rtindex = -1; + + /* + * If an FDW or custom scan provider has replaced the join with a + * scan, there are multiple RTIs; reset the epqScanDone flag for + * all of them. 
+ */ + if (IsA(node->ps.plan, ForeignScan)) + relids = ((ForeignScan *) node->ps.plan)->fs_relids; + else if (IsA(node->ps.plan, CustomScan)) + relids = ((CustomScan *) node->ps.plan)->custom_relids; + else + elog(ERROR, "unexpected scan node: %d", + (int) nodeTag(node->ps.plan)); + + while ((rtindex = bms_next_member(relids, rtindex)) >= 0) + { + Assert(rtindex > 0); + epqstate->relsubs_done[rtindex - 1] = false; + } + } + } +} diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c new file mode 100644 index 0000000..5004b3b --- /dev/null +++ b/src/backend/executor/execTuples.c @@ -0,0 +1,2339 @@ +/*------------------------------------------------------------------------- + * + * execTuples.c + * Routines dealing with TupleTableSlots. These are used for resource + * management associated with tuples (eg, releasing buffer pins for + * tuples in disk buffers, or freeing the memory occupied by transient + * tuples). Slots also provide access abstraction that lets us implement + * "virtual" tuples to reduce data-copying overhead. + * + * Routines dealing with the type information for tuples. Currently, + * the type information for a tuple is an array of FormData_pg_attribute. + * This information is needed by routines manipulating tuples + * (getattribute, formtuple, etc.). + * + * + * EXAMPLE OF HOW TABLE ROUTINES WORK + * Suppose we have a query such as SELECT emp.name FROM emp and we have + * a single SeqScan node in the query plan. + * + * At ExecutorStart() + * ---------------- + * + * - ExecInitSeqScan() calls ExecInitScanTupleSlot() to construct a + * TupleTableSlots for the tuples returned by the access method, and + * ExecInitResultTypeTL() to define the node's return + * type. ExecAssignScanProjectionInfo() will, if necessary, create + * another TupleTableSlot for the tuples resulting from performing + * target list projections. + * + * During ExecutorRun() + * ---------------- + * - SeqNext() calls ExecStoreBufferHeapTuple() to place the tuple + * returned by the access method into the scan tuple slot. + * + * - ExecSeqScan() (via ExecScan), if necessary, calls ExecProject(), + * putting the result of the projection in the result tuple slot. If + * not necessary, it directly returns the slot returned by SeqNext(). + * + * - ExecutePlan() calls the output function. + * + * The important thing to watch in the executor code is how pointers + * to the slots containing tuples are passed instead of the tuples + * themselves. This facilitates the communication of related information + * (such as whether or not a tuple should be pfreed, what buffer contains + * this tuple, the tuple's tuple descriptor, etc). It also allows us + * to avoid physically constructing projection tuples in many cases. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execTuples.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/tupdesc_details.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/expandeddatum.h" +#include "utils/lsyscache.h" +#include "utils/typcache.h" + +static TupleDesc ExecTypeFromTLInternal(List *targetList, + bool skipjunk); +static pg_attribute_always_inline void slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp, + int natts); +static inline void tts_buffer_heap_store_tuple(TupleTableSlot *slot, + HeapTuple tuple, + Buffer buffer, + bool transfer_pin); +static void tts_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, bool shouldFree); + + +const TupleTableSlotOps TTSOpsVirtual; +const TupleTableSlotOps TTSOpsHeapTuple; +const TupleTableSlotOps TTSOpsMinimalTuple; +const TupleTableSlotOps TTSOpsBufferHeapTuple; + + +/* + * TupleTableSlotOps implementations. + */ + +/* + * TupleTableSlotOps implementation for VirtualTupleTableSlot. + */ +static void +tts_virtual_init(TupleTableSlot *slot) +{ +} + +static void +tts_virtual_release(TupleTableSlot *slot) +{ +} + +static void +tts_virtual_clear(TupleTableSlot *slot) +{ + if (unlikely(TTS_SHOULDFREE(slot))) + { + VirtualTupleTableSlot *vslot = (VirtualTupleTableSlot *) slot; + + pfree(vslot->data); + vslot->data = NULL; + + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); +} + +/* + * VirtualTupleTableSlots always have fully populated tts_values and + * tts_isnull arrays. So this function should never be called. + */ +static void +tts_virtual_getsomeattrs(TupleTableSlot *slot, int natts) +{ + elog(ERROR, "getsomeattrs is not required to be called on a virtual tuple table slot"); +} + +/* + * VirtualTupleTableSlots never provide system attributes (except those + * handled generically, such as tableoid). We generally shouldn't get + * here, but provide a user-friendly message if we do. + */ +static Datum +tts_virtual_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + Assert(!TTS_EMPTY(slot)); + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return 0; /* silence compiler warnings */ +} + +/* + * To materialize a virtual slot all the datums that aren't passed by value + * have to be copied into the slot's memory context. To do so, compute the + * required size, and allocate enough memory to store all attributes. That's + * good for cache hit ratio, but more importantly requires only memory + * allocation/deallocation. 
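+ * (Before materialization a virtual slot's tts_values/tts_isnull may point
+ * at data owned by other slots or by per-tuple memory; afterwards the slot
+ * is self-contained, which is why tts_virtual_copyslot() below finishes
+ * with a materialize step.)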
+ */ +static void +tts_virtual_materialize(TupleTableSlot *slot) +{ + VirtualTupleTableSlot *vslot = (VirtualTupleTableSlot *) slot; + TupleDesc desc = slot->tts_tupleDescriptor; + Size sz = 0; + char *data; + + /* already materialized */ + if (TTS_SHOULDFREE(slot)) + return; + + /* compute size of memory required */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. + */ + sz = att_align_nominal(sz, att->attalign); + sz += EOH_get_flat_size(DatumGetEOHP(val)); + } + else + { + sz = att_align_nominal(sz, att->attalign); + sz = att_addlength_datum(sz, att->attlen, val); + } + } + + /* all data is byval */ + if (sz == 0) + return; + + /* allocate memory */ + vslot->data = data = MemoryContextAlloc(slot->tts_mcxt, sz); + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + /* and copy all attributes into the pre-allocated space */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + Size data_length; + + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. + */ + ExpandedObjectHeader *eoh = DatumGetEOHP(val); + + data = (char *) att_align_nominal(data, + att->attalign); + data_length = EOH_get_flat_size(eoh); + EOH_flatten_into(eoh, data, data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + else + { + Size data_length = 0; + + data = (char *) att_align_nominal(data, att->attalign); + data_length = att_addlength_datum(data_length, att->attlen, val); + + memcpy(data, DatumGetPointer(val), data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + } +} + +static void +tts_virtual_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + TupleDesc srcdesc = srcslot->tts_tupleDescriptor; + + Assert(srcdesc->natts <= dstslot->tts_tupleDescriptor->natts); + + tts_virtual_clear(dstslot); + + slot_getallattrs(srcslot); + + for (int natt = 0; natt < srcdesc->natts; natt++) + { + dstslot->tts_values[natt] = srcslot->tts_values[natt]; + dstslot->tts_isnull[natt] = srcslot->tts_isnull[natt]; + } + + dstslot->tts_nvalid = srcdesc->natts; + dstslot->tts_flags &= ~TTS_FLAG_EMPTY; + + /* make sure storage doesn't depend on external memory */ + tts_virtual_materialize(dstslot); +} + +static HeapTuple +tts_virtual_copy_heap_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); +} + +static MinimalTuple +tts_virtual_copy_minimal_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_minimal_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); +} + + +/* + * TupleTableSlotOps implementation for HeapTupleTableSlot. 
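+ * A HeapTupleTableSlot holds a palloc'd HeapTuple that is not associated
+ * with any shared buffer; tuples still residing in a buffer use the
+ * BufferHeapTupleTableSlot variant (TTSOpsBufferHeapTuple) instead.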
+ */ + +static void +tts_heap_init(TupleTableSlot *slot) +{ +} + +static void +tts_heap_release(TupleTableSlot *slot) +{ +} + +static void +tts_heap_clear(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + /* Free the memory for the heap tuple if it's allowed. */ + if (TTS_SHOULDFREE(slot)) + { + heap_freetuple(hslot->tuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); + hslot->off = 0; + hslot->tuple = NULL; +} + +static void +tts_heap_getsomeattrs(TupleTableSlot *slot, int natts) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + slot_deform_heap_tuple(slot, hslot->tuple, &hslot->off, natts); +} + +static Datum +tts_heap_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + /* + * In some code paths it's possible to get here with a non-materialized + * slot, in which case we can't retrieve system columns. + */ + if (!hslot->tuple) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return heap_getsysattr(hslot->tuple, attnum, + slot->tts_tupleDescriptor, isnull); +} + +static void +tts_heap_materialize(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + MemoryContext oldContext; + + Assert(!TTS_EMPTY(slot)); + + /* If slot has its tuple already materialized, nothing to do. */ + if (TTS_SHOULDFREE(slot)) + return; + + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + + /* + * Have to deform from scratch, otherwise tts_values[] entries could point + * into the non-materialized tuple (which might be gone when accessed). + */ + slot->tts_nvalid = 0; + hslot->off = 0; + + if (!hslot->tuple) + hslot->tuple = heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + else + { + /* + * The tuple contained in this slot is not allocated in the memory + * context of the given slot (else it would have TTS_SHOULDFREE set). + * Copy the tuple into the given slot's memory context. 
+ */ + hslot->tuple = heap_copytuple(hslot->tuple); + } + + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + MemoryContextSwitchTo(oldContext); +} + +static void +tts_heap_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + HeapTuple tuple; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(dstslot->tts_mcxt); + tuple = ExecCopySlotHeapTuple(srcslot); + MemoryContextSwitchTo(oldcontext); + + ExecStoreHeapTuple(tuple, dstslot, true); +} + +static HeapTuple +tts_heap_get_heap_tuple(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + if (!hslot->tuple) + tts_heap_materialize(slot); + + return hslot->tuple; +} + +static HeapTuple +tts_heap_copy_heap_tuple(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + if (!hslot->tuple) + tts_heap_materialize(slot); + + return heap_copytuple(hslot->tuple); +} + +static MinimalTuple +tts_heap_copy_minimal_tuple(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + if (!hslot->tuple) + tts_heap_materialize(slot); + + return minimal_tuple_from_heap_tuple(hslot->tuple); +} + +static void +tts_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, bool shouldFree) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + tts_heap_clear(slot); + + slot->tts_nvalid = 0; + hslot->tuple = tuple; + hslot->off = 0; + slot->tts_flags &= ~(TTS_FLAG_EMPTY | TTS_FLAG_SHOULDFREE); + slot->tts_tid = tuple->t_self; + + if (shouldFree) + slot->tts_flags |= TTS_FLAG_SHOULDFREE; +} + + +/* + * TupleTableSlotOps implementation for MinimalTupleTableSlot. + */ + +static void +tts_minimal_init(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + /* + * Initialize the heap tuple pointer to access attributes of the minimal + * tuple contained in the slot as if its a heap tuple. + */ + mslot->tuple = &mslot->minhdr; +} + +static void +tts_minimal_release(TupleTableSlot *slot) +{ +} + +static void +tts_minimal_clear(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (TTS_SHOULDFREE(slot)) + { + heap_free_minimal_tuple(mslot->mintuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); + mslot->off = 0; + mslot->mintuple = NULL; +} + +static void +tts_minimal_getsomeattrs(TupleTableSlot *slot, int natts) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + slot_deform_heap_tuple(slot, mslot->tuple, &mslot->off, natts); +} + +static Datum +tts_minimal_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + Assert(!TTS_EMPTY(slot)); + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return 0; /* silence compiler warnings */ +} + +static void +tts_minimal_materialize(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + MemoryContext oldContext; + + Assert(!TTS_EMPTY(slot)); + + /* If slot has its tuple already materialized, nothing to do. */ + if (TTS_SHOULDFREE(slot)) + return; + + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + + /* + * Have to deform from scratch, otherwise tts_values[] entries could point + * into the non-materialized tuple (which might be gone when accessed). 
+ */ + slot->tts_nvalid = 0; + mslot->off = 0; + + if (!mslot->mintuple) + { + mslot->mintuple = heap_form_minimal_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + } + else + { + /* + * The minimal tuple contained in this slot is not allocated in the + * memory context of the given slot (else it would have TTS_SHOULDFREE + * set). Copy the minimal tuple into the given slot's memory context. + */ + mslot->mintuple = heap_copy_minimal_tuple(mslot->mintuple); + } + + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + Assert(mslot->tuple == &mslot->minhdr); + + mslot->minhdr.t_len = mslot->mintuple->t_len + MINIMAL_TUPLE_OFFSET; + mslot->minhdr.t_data = (HeapTupleHeader) ((char *) mslot->mintuple - MINIMAL_TUPLE_OFFSET); + + MemoryContextSwitchTo(oldContext); +} + +static void +tts_minimal_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + MemoryContext oldcontext; + MinimalTuple mintuple; + + oldcontext = MemoryContextSwitchTo(dstslot->tts_mcxt); + mintuple = ExecCopySlotMinimalTuple(srcslot); + MemoryContextSwitchTo(oldcontext); + + ExecStoreMinimalTuple(mintuple, dstslot, true); +} + +static MinimalTuple +tts_minimal_get_minimal_tuple(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (!mslot->mintuple) + tts_minimal_materialize(slot); + + return mslot->mintuple; +} + +static HeapTuple +tts_minimal_copy_heap_tuple(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (!mslot->mintuple) + tts_minimal_materialize(slot); + + return heap_tuple_from_minimal_tuple(mslot->mintuple); +} + +static MinimalTuple +tts_minimal_copy_minimal_tuple(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (!mslot->mintuple) + tts_minimal_materialize(slot); + + return heap_copy_minimal_tuple(mslot->mintuple); +} + +static void +tts_minimal_store_tuple(TupleTableSlot *slot, MinimalTuple mtup, bool shouldFree) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + tts_minimal_clear(slot); + + Assert(!TTS_SHOULDFREE(slot)); + Assert(TTS_EMPTY(slot)); + + slot->tts_flags &= ~TTS_FLAG_EMPTY; + slot->tts_nvalid = 0; + mslot->off = 0; + + mslot->mintuple = mtup; + Assert(mslot->tuple == &mslot->minhdr); + mslot->minhdr.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET; + mslot->minhdr.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET); + /* no need to set t_self or t_tableOid since we won't allow access */ + + if (shouldFree) + slot->tts_flags |= TTS_FLAG_SHOULDFREE; +} + + +/* + * TupleTableSlotOps implementation for BufferHeapTupleTableSlot. + */ + +static void +tts_buffer_heap_init(TupleTableSlot *slot) +{ +} + +static void +tts_buffer_heap_release(TupleTableSlot *slot) +{ +} + +static void +tts_buffer_heap_clear(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + /* + * Free the memory for heap tuple if allowed. A tuple coming from buffer + * can never be freed. But we may have materialized a tuple from buffer. + * Such a tuple can be freed. + */ + if (TTS_SHOULDFREE(slot)) + { + /* We should have unpinned the buffer while materializing the tuple. 
*/ + Assert(!BufferIsValid(bslot->buffer)); + + heap_freetuple(bslot->base.tuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + if (BufferIsValid(bslot->buffer)) + ReleaseBuffer(bslot->buffer); + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); + bslot->base.tuple = NULL; + bslot->base.off = 0; + bslot->buffer = InvalidBuffer; +} + +static void +tts_buffer_heap_getsomeattrs(TupleTableSlot *slot, int natts) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + slot_deform_heap_tuple(slot, bslot->base.tuple, &bslot->base.off, natts); +} + +static Datum +tts_buffer_heap_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + /* + * In some code paths it's possible to get here with a non-materialized + * slot, in which case we can't retrieve system columns. + */ + if (!bslot->base.tuple) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return heap_getsysattr(bslot->base.tuple, attnum, + slot->tts_tupleDescriptor, isnull); +} + +static void +tts_buffer_heap_materialize(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + MemoryContext oldContext; + + Assert(!TTS_EMPTY(slot)); + + /* If slot has its tuple already materialized, nothing to do. */ + if (TTS_SHOULDFREE(slot)) + return; + + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + + /* + * Have to deform from scratch, otherwise tts_values[] entries could point + * into the non-materialized tuple (which might be gone when accessed). + */ + bslot->base.off = 0; + slot->tts_nvalid = 0; + + if (!bslot->base.tuple) + { + /* + * Normally BufferHeapTupleTableSlot should have a tuple + buffer + * associated with it, unless it's materialized (which would've + * returned above). But when it's useful to allow storing virtual + * tuples in a buffer slot, which then also needs to be + * materializable. + */ + bslot->base.tuple = heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + } + else + { + bslot->base.tuple = heap_copytuple(bslot->base.tuple); + + /* + * A heap tuple stored in a BufferHeapTupleTableSlot should have a + * buffer associated with it, unless it's materialized or virtual. + */ + if (likely(BufferIsValid(bslot->buffer))) + ReleaseBuffer(bslot->buffer); + bslot->buffer = InvalidBuffer; + } + + /* + * We don't set TTS_FLAG_SHOULDFREE until after releasing the buffer, if + * any. This avoids having a transient state that would fall foul of our + * assertions that a slot with TTS_FLAG_SHOULDFREE doesn't own a buffer. + * In the unlikely event that ReleaseBuffer() above errors out, we'd + * effectively leak the copied tuple, but that seems fairly harmless. + */ + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + MemoryContextSwitchTo(oldContext); +} + +static void +tts_buffer_heap_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + BufferHeapTupleTableSlot *bsrcslot = (BufferHeapTupleTableSlot *) srcslot; + BufferHeapTupleTableSlot *bdstslot = (BufferHeapTupleTableSlot *) dstslot; + + /* + * If the source slot is of a different kind, or is a buffer slot that has + * been materialized / is virtual, make a new copy of the tuple. Otherwise + * make a new reference to the in-buffer tuple. 
+ */ + if (dstslot->tts_ops != srcslot->tts_ops || + TTS_SHOULDFREE(srcslot) || + !bsrcslot->base.tuple) + { + MemoryContext oldContext; + + ExecClearTuple(dstslot); + dstslot->tts_flags &= ~TTS_FLAG_EMPTY; + oldContext = MemoryContextSwitchTo(dstslot->tts_mcxt); + bdstslot->base.tuple = ExecCopySlotHeapTuple(srcslot); + dstslot->tts_flags |= TTS_FLAG_SHOULDFREE; + MemoryContextSwitchTo(oldContext); + } + else + { + Assert(BufferIsValid(bsrcslot->buffer)); + + tts_buffer_heap_store_tuple(dstslot, bsrcslot->base.tuple, + bsrcslot->buffer, false); + + /* + * The HeapTupleData portion of the source tuple might be shorter + * lived than the destination slot. Therefore copy the HeapTuple into + * our slot's tupdata, which is guaranteed to live long enough (but + * will still point into the buffer). + */ + memcpy(&bdstslot->base.tupdata, bdstslot->base.tuple, sizeof(HeapTupleData)); + bdstslot->base.tuple = &bdstslot->base.tupdata; + } +} + +static HeapTuple +tts_buffer_heap_get_heap_tuple(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + if (!bslot->base.tuple) + tts_buffer_heap_materialize(slot); + + return bslot->base.tuple; +} + +static HeapTuple +tts_buffer_heap_copy_heap_tuple(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + if (!bslot->base.tuple) + tts_buffer_heap_materialize(slot); + + return heap_copytuple(bslot->base.tuple); +} + +static MinimalTuple +tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + if (!bslot->base.tuple) + tts_buffer_heap_materialize(slot); + + return minimal_tuple_from_heap_tuple(bslot->base.tuple); +} + +static inline void +tts_buffer_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, + Buffer buffer, bool transfer_pin) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + if (TTS_SHOULDFREE(slot)) + { + /* materialized slot shouldn't have a buffer to release */ + Assert(!BufferIsValid(bslot->buffer)); + + heap_freetuple(bslot->base.tuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_flags &= ~TTS_FLAG_EMPTY; + slot->tts_nvalid = 0; + bslot->base.tuple = tuple; + bslot->base.off = 0; + slot->tts_tid = tuple->t_self; + + /* + * If tuple is on a disk page, keep the page pinned as long as we hold a + * pointer into it. We assume the caller already has such a pin. If + * transfer_pin is true, we'll transfer that pin to this slot, if not + * we'll pin it again ourselves. + * + * This is coded to optimize the case where the slot previously held a + * tuple on the same disk page: in that case releasing and re-acquiring + * the pin is a waste of cycles. This is a common situation during + * seqscans, so it's worth troubling over. + */ + if (bslot->buffer != buffer) + { + if (BufferIsValid(bslot->buffer)) + ReleaseBuffer(bslot->buffer); + + bslot->buffer = buffer; + + if (!transfer_pin && BufferIsValid(buffer)) + IncrBufferRefCount(buffer); + } + else if (transfer_pin && BufferIsValid(buffer)) + { + /* + * In transfer_pin mode the caller won't know about the same-page + * optimization, so we gotta release its pin. + */ + ReleaseBuffer(buffer); + } +} + +/* + * slot_deform_heap_tuple + * Given a TupleTableSlot, extract data from the slot's physical tuple + * into its Datum/isnull arrays. 
Data is extracted up through the + * natts'th column (caller must ensure this is a legal column number). + * + * This is essentially an incremental version of heap_deform_tuple: + * on each call we extract attributes up to the one needed, without + * re-computing information about previously extracted attributes. + * slot->tts_nvalid is the number of attributes already extracted. + * + * This is marked as always inline, so the different offp for different types + * of slots gets optimized away. + */ +static pg_attribute_always_inline void +slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp, + int natts) +{ + TupleDesc tupleDesc = slot->tts_tupleDescriptor; + Datum *values = slot->tts_values; + bool *isnull = slot->tts_isnull; + HeapTupleHeader tup = tuple->t_data; + bool hasnulls = HeapTupleHasNulls(tuple); + int attnum; + char *tp; /* ptr to tuple data */ + uint32 off; /* offset in tuple data */ + bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */ + bool slow; /* can we use/set attcacheoff? */ + + /* We can only fetch as many attributes as the tuple has. */ + natts = Min(HeapTupleHeaderGetNatts(tuple->t_data), natts); + + /* + * Check whether the first call for this tuple, and initialize or restore + * loop state. + */ + attnum = slot->tts_nvalid; + if (attnum == 0) + { + /* Start from the first attribute */ + off = 0; + slow = false; + } + else + { + /* Restore state from previous execution */ + off = *offp; + slow = TTS_SLOW(slot); + } + + tp = (char *) tup + tup->t_hoff; + + for (; attnum < natts; attnum++) + { + Form_pg_attribute thisatt = TupleDescAttr(tupleDesc, attnum); + + if (hasnulls && att_isnull(attnum, bp)) + { + values[attnum] = (Datum) 0; + isnull[attnum] = true; + slow = true; /* can't use attcacheoff anymore */ + continue; + } + + isnull[attnum] = false; + + if (!slow && thisatt->attcacheoff >= 0) + off = thisatt->attcacheoff; + else if (thisatt->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be no + * pad bytes in any case: then the offset will be valid for either + * an aligned or unaligned value. + */ + if (!slow && + off == att_align_nominal(off, thisatt->attalign)) + thisatt->attcacheoff = off; + else + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + slow = true; + } + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + + if (!slow) + thisatt->attcacheoff = off; + } + + values[attnum] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + + if (thisatt->attlen <= 0) + slow = true; /* can't use attcacheoff anymore */ + } + + /* + * Save state for next execution + */ + slot->tts_nvalid = attnum; + *offp = off; + if (slow) + slot->tts_flags |= TTS_FLAG_SLOW; + else + slot->tts_flags &= ~TTS_FLAG_SLOW; +} + + +const TupleTableSlotOps TTSOpsVirtual = { + .base_slot_size = sizeof(VirtualTupleTableSlot), + .init = tts_virtual_init, + .release = tts_virtual_release, + .clear = tts_virtual_clear, + .getsomeattrs = tts_virtual_getsomeattrs, + .getsysattr = tts_virtual_getsysattr, + .materialize = tts_virtual_materialize, + .copyslot = tts_virtual_copyslot, + + /* + * A virtual tuple table slot can not "own" a heap tuple or a minimal + * tuple. 
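+	 * Fetching either kind therefore always goes through the copy
+	 * callbacks below, which form a fresh tuple from the slot's
+	 * tts_values/tts_isnull arrays.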
+ */ + .get_heap_tuple = NULL, + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_virtual_copy_heap_tuple, + .copy_minimal_tuple = tts_virtual_copy_minimal_tuple +}; + +const TupleTableSlotOps TTSOpsHeapTuple = { + .base_slot_size = sizeof(HeapTupleTableSlot), + .init = tts_heap_init, + .release = tts_heap_release, + .clear = tts_heap_clear, + .getsomeattrs = tts_heap_getsomeattrs, + .getsysattr = tts_heap_getsysattr, + .materialize = tts_heap_materialize, + .copyslot = tts_heap_copyslot, + .get_heap_tuple = tts_heap_get_heap_tuple, + + /* A heap tuple table slot can not "own" a minimal tuple. */ + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_heap_copy_heap_tuple, + .copy_minimal_tuple = tts_heap_copy_minimal_tuple +}; + +const TupleTableSlotOps TTSOpsMinimalTuple = { + .base_slot_size = sizeof(MinimalTupleTableSlot), + .init = tts_minimal_init, + .release = tts_minimal_release, + .clear = tts_minimal_clear, + .getsomeattrs = tts_minimal_getsomeattrs, + .getsysattr = tts_minimal_getsysattr, + .materialize = tts_minimal_materialize, + .copyslot = tts_minimal_copyslot, + + /* A minimal tuple table slot can not "own" a heap tuple. */ + .get_heap_tuple = NULL, + .get_minimal_tuple = tts_minimal_get_minimal_tuple, + .copy_heap_tuple = tts_minimal_copy_heap_tuple, + .copy_minimal_tuple = tts_minimal_copy_minimal_tuple +}; + +const TupleTableSlotOps TTSOpsBufferHeapTuple = { + .base_slot_size = sizeof(BufferHeapTupleTableSlot), + .init = tts_buffer_heap_init, + .release = tts_buffer_heap_release, + .clear = tts_buffer_heap_clear, + .getsomeattrs = tts_buffer_heap_getsomeattrs, + .getsysattr = tts_buffer_heap_getsysattr, + .materialize = tts_buffer_heap_materialize, + .copyslot = tts_buffer_heap_copyslot, + .get_heap_tuple = tts_buffer_heap_get_heap_tuple, + + /* A buffer heap tuple table slot can not "own" a minimal tuple. */ + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_buffer_heap_copy_heap_tuple, + .copy_minimal_tuple = tts_buffer_heap_copy_minimal_tuple +}; + + +/* ---------------------------------------------------------------- + * tuple table create/delete functions + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * MakeTupleTableSlot + * + * Basic routine to make an empty TupleTableSlot of given + * TupleTableSlotType. If tupleDesc is specified the slot's descriptor is + * fixed for its lifetime, gaining some efficiency. If that's + * undesirable, pass NULL. + * -------------------------------- + */ +TupleTableSlot * +MakeTupleTableSlot(TupleDesc tupleDesc, + const TupleTableSlotOps *tts_ops) +{ + Size basesz, + allocsz; + TupleTableSlot *slot; + + basesz = tts_ops->base_slot_size; + + /* + * When a fixed descriptor is specified, we can reduce overhead by + * allocating the entire slot in one go. 
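+	 * The single palloc then holds, in order, MAXALIGN(base_slot_size)
+	 * bytes for the slot struct itself, MAXALIGN(natts * sizeof(Datum))
+	 * bytes for tts_values, and MAXALIGN(natts * sizeof(bool)) bytes for
+	 * tts_isnull; the array pointers are set up below accordingly.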
+ */ + if (tupleDesc) + allocsz = MAXALIGN(basesz) + + MAXALIGN(tupleDesc->natts * sizeof(Datum)) + + MAXALIGN(tupleDesc->natts * sizeof(bool)); + else + allocsz = basesz; + + slot = palloc0(allocsz); + /* const for optimization purposes, OK to modify at allocation time */ + *((const TupleTableSlotOps **) &slot->tts_ops) = tts_ops; + slot->type = T_TupleTableSlot; + slot->tts_flags |= TTS_FLAG_EMPTY; + if (tupleDesc != NULL) + slot->tts_flags |= TTS_FLAG_FIXED; + slot->tts_tupleDescriptor = tupleDesc; + slot->tts_mcxt = CurrentMemoryContext; + slot->tts_nvalid = 0; + + if (tupleDesc != NULL) + { + slot->tts_values = (Datum *) + (((char *) slot) + + MAXALIGN(basesz)); + slot->tts_isnull = (bool *) + (((char *) slot) + + MAXALIGN(basesz) + + MAXALIGN(tupleDesc->natts * sizeof(Datum))); + + PinTupleDesc(tupleDesc); + } + + /* + * And allow slot type specific initialization. + */ + slot->tts_ops->init(slot); + + return slot; +} + +/* -------------------------------- + * ExecAllocTableSlot + * + * Create a tuple table slot within a tuple table (which is just a List). + * -------------------------------- + */ +TupleTableSlot * +ExecAllocTableSlot(List **tupleTable, TupleDesc desc, + const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot = MakeTupleTableSlot(desc, tts_ops); + + *tupleTable = lappend(*tupleTable, slot); + + return slot; +} + +/* -------------------------------- + * ExecResetTupleTable + * + * This releases any resources (buffer pins, tupdesc refcounts) + * held by the tuple table, and optionally releases the memory + * occupied by the tuple table data structure. + * It is expected that this routine be called by ExecEndPlan(). + * -------------------------------- + */ +void +ExecResetTupleTable(List *tupleTable, /* tuple table */ + bool shouldFree) /* true if we should free memory */ +{ + ListCell *lc; + + foreach(lc, tupleTable) + { + TupleTableSlot *slot = lfirst_node(TupleTableSlot, lc); + + /* Always release resources and reset the slot to empty */ + ExecClearTuple(slot); + slot->tts_ops->release(slot); + if (slot->tts_tupleDescriptor) + { + ReleaseTupleDesc(slot->tts_tupleDescriptor); + slot->tts_tupleDescriptor = NULL; + } + + /* If shouldFree, release memory occupied by the slot itself */ + if (shouldFree) + { + if (!TTS_FIXED(slot)) + { + if (slot->tts_values) + pfree(slot->tts_values); + if (slot->tts_isnull) + pfree(slot->tts_isnull); + } + pfree(slot); + } + } + + /* If shouldFree, release the list structure */ + if (shouldFree) + list_free(tupleTable); +} + +/* -------------------------------- + * MakeSingleTupleTableSlot + * + * This is a convenience routine for operations that need a standalone + * TupleTableSlot not gotten from the main executor tuple table. It makes + * a single slot of given TupleTableSlotType and initializes it to use the + * given tuple descriptor. + * -------------------------------- + */ +TupleTableSlot * +MakeSingleTupleTableSlot(TupleDesc tupdesc, + const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot = MakeTupleTableSlot(tupdesc, tts_ops); + + return slot; +} + +/* -------------------------------- + * ExecDropSingleTupleTableSlot + * + * Release a TupleTableSlot made with MakeSingleTupleTableSlot. + * DON'T use this on a slot that's part of a tuple table list! 
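+ *
+ * A typical standalone slot lifecycle looks like this (an illustrative
+ * sketch only; "tupdesc" is assumed to be a valid TupleDesc supplied by
+ * the caller):
+ *
+ *		TupleTableSlot *slot;
+ *
+ *		slot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual);
+ *		... store tuples into and read attributes from the slot ...
+ *		ExecDropSingleTupleTableSlot(slot);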
+ * -------------------------------- + */ +void +ExecDropSingleTupleTableSlot(TupleTableSlot *slot) +{ + /* This should match ExecResetTupleTable's processing of one slot */ + Assert(IsA(slot, TupleTableSlot)); + ExecClearTuple(slot); + slot->tts_ops->release(slot); + if (slot->tts_tupleDescriptor) + ReleaseTupleDesc(slot->tts_tupleDescriptor); + if (!TTS_FIXED(slot)) + { + if (slot->tts_values) + pfree(slot->tts_values); + if (slot->tts_isnull) + pfree(slot->tts_isnull); + } + pfree(slot); +} + + +/* ---------------------------------------------------------------- + * tuple table slot accessor functions + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * ExecSetSlotDescriptor + * + * This function is used to set the tuple descriptor associated + * with the slot's tuple. The passed descriptor must have lifespan + * at least equal to the slot's. If it is a reference-counted descriptor + * then the reference count is incremented for as long as the slot holds + * a reference. + * -------------------------------- + */ +void +ExecSetSlotDescriptor(TupleTableSlot *slot, /* slot to change */ + TupleDesc tupdesc) /* new tuple descriptor */ +{ + Assert(!TTS_FIXED(slot)); + + /* For safety, make sure slot is empty before changing it */ + ExecClearTuple(slot); + + /* + * Release any old descriptor. Also release old Datum/isnull arrays if + * present (we don't bother to check if they could be re-used). + */ + if (slot->tts_tupleDescriptor) + ReleaseTupleDesc(slot->tts_tupleDescriptor); + + if (slot->tts_values) + pfree(slot->tts_values); + if (slot->tts_isnull) + pfree(slot->tts_isnull); + + /* + * Install the new descriptor; if it's refcounted, bump its refcount. + */ + slot->tts_tupleDescriptor = tupdesc; + PinTupleDesc(tupdesc); + + /* + * Allocate Datum/isnull arrays of the appropriate size. These must have + * the same lifetime as the slot, so allocate in the slot's own context. + */ + slot->tts_values = (Datum *) + MemoryContextAlloc(slot->tts_mcxt, tupdesc->natts * sizeof(Datum)); + slot->tts_isnull = (bool *) + MemoryContextAlloc(slot->tts_mcxt, tupdesc->natts * sizeof(bool)); +} + +/* -------------------------------- + * ExecStoreHeapTuple + * + * This function is used to store an on-the-fly physical tuple into a specified + * slot in the tuple table. + * + * tuple: tuple to store + * slot: TTSOpsHeapTuple type slot to store it in + * shouldFree: true if ExecClearTuple should pfree() the tuple + * when done with it + * + * shouldFree is normally set 'true' for tuples constructed on-the-fly. But it + * can be 'false' when the referenced tuple is held in a tuple table slot + * belonging to a lower-level executor Proc node. In this case the lower-level + * slot retains ownership and responsibility for eventually releasing the + * tuple. When this method is used, we must be certain that the upper-level + * Proc node will lose interest in the tuple sooner than the lower-level one + * does! If you're not certain, copy the lower-level tuple with heap_copytuple + * and let the upper-level table slot assume ownership of the copy! + * + * Return value is just the passed-in slot pointer. + * + * If the target slot is not guaranteed to be TTSOpsHeapTuple type slot, use + * the, more expensive, ExecForceStoreHeapTuple(). 
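+ *
+ * For example, a caller that builds its own tuple might do the following
+ * (an illustrative sketch; "tupdesc", "values" and "isnull" are assumed
+ * to be supplied by the caller):
+ *
+ *		slot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple);
+ *		tuple = heap_form_tuple(tupdesc, values, isnull);
+ *		ExecStoreHeapTuple(tuple, slot, true);
+ *
+ * after which clearing the slot will also free the tuple.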
+ * -------------------------------- + */ +TupleTableSlot * +ExecStoreHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + bool shouldFree) +{ + /* + * sanity checks + */ + Assert(tuple != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + if (unlikely(!TTS_IS_HEAPTUPLE(slot))) + elog(ERROR, "trying to store a heap tuple into wrong type of slot"); + tts_heap_store_tuple(slot, tuple, shouldFree); + + slot->tts_tableOid = tuple->t_tableOid; + + return slot; +} + +/* -------------------------------- + * ExecStoreBufferHeapTuple + * + * This function is used to store an on-disk physical tuple from a buffer + * into a specified slot in the tuple table. + * + * tuple: tuple to store + * slot: TTSOpsBufferHeapTuple type slot to store it in + * buffer: disk buffer if tuple is in a disk page, else InvalidBuffer + * + * The tuple table code acquires a pin on the buffer which is held until the + * slot is cleared, so that the tuple won't go away on us. + * + * Return value is just the passed-in slot pointer. + * + * If the target slot is not guaranteed to be TTSOpsBufferHeapTuple type slot, + * use the, more expensive, ExecForceStoreHeapTuple(). + * -------------------------------- + */ +TupleTableSlot * +ExecStoreBufferHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + Buffer buffer) +{ + /* + * sanity checks + */ + Assert(tuple != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + Assert(BufferIsValid(buffer)); + + if (unlikely(!TTS_IS_BUFFERTUPLE(slot))) + elog(ERROR, "trying to store an on-disk heap tuple into wrong type of slot"); + tts_buffer_heap_store_tuple(slot, tuple, buffer, false); + + slot->tts_tableOid = tuple->t_tableOid; + + return slot; +} + +/* + * Like ExecStoreBufferHeapTuple, but transfer an existing pin from the caller + * to the slot, i.e. the caller doesn't need to, and may not, release the pin. + */ +TupleTableSlot * +ExecStorePinnedBufferHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + Buffer buffer) +{ + /* + * sanity checks + */ + Assert(tuple != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + Assert(BufferIsValid(buffer)); + + if (unlikely(!TTS_IS_BUFFERTUPLE(slot))) + elog(ERROR, "trying to store an on-disk heap tuple into wrong type of slot"); + tts_buffer_heap_store_tuple(slot, tuple, buffer, true); + + slot->tts_tableOid = tuple->t_tableOid; + + return slot; +} + +/* + * Store a minimal tuple into TTSOpsMinimalTuple type slot. + * + * If the target slot is not guaranteed to be TTSOpsMinimalTuple type slot, + * use the, more expensive, ExecForceStoreMinimalTuple(). + */ +TupleTableSlot * +ExecStoreMinimalTuple(MinimalTuple mtup, + TupleTableSlot *slot, + bool shouldFree) +{ + /* + * sanity checks + */ + Assert(mtup != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + if (unlikely(!TTS_IS_MINIMALTUPLE(slot))) + elog(ERROR, "trying to store a minimal tuple into wrong type of slot"); + tts_minimal_store_tuple(slot, mtup, shouldFree); + + return slot; +} + +/* + * Store a HeapTuple into any kind of slot, performing conversion if + * necessary. 
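+ *
+ * Heap slots take the tuple directly, buffer slots get a copy
+ * materialized into the slot's memory context, and any other slot type
+ * receives the deformed (virtual) representation.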
+ */ +void +ExecForceStoreHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + bool shouldFree) +{ + if (TTS_IS_HEAPTUPLE(slot)) + { + ExecStoreHeapTuple(tuple, slot, shouldFree); + } + else if (TTS_IS_BUFFERTUPLE(slot)) + { + MemoryContext oldContext; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + ExecClearTuple(slot); + slot->tts_flags &= ~TTS_FLAG_EMPTY; + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + bslot->base.tuple = heap_copytuple(tuple); + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + MemoryContextSwitchTo(oldContext); + + if (shouldFree) + pfree(tuple); + } + else + { + ExecClearTuple(slot); + heap_deform_tuple(tuple, slot->tts_tupleDescriptor, + slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); + + if (shouldFree) + { + ExecMaterializeSlot(slot); + pfree(tuple); + } + } +} + +/* + * Store a MinimalTuple into any kind of slot, performing conversion if + * necessary. + */ +void +ExecForceStoreMinimalTuple(MinimalTuple mtup, + TupleTableSlot *slot, + bool shouldFree) +{ + if (TTS_IS_MINIMALTUPLE(slot)) + { + tts_minimal_store_tuple(slot, mtup, shouldFree); + } + else + { + HeapTupleData htup; + + ExecClearTuple(slot); + + htup.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET); + heap_deform_tuple(&htup, slot->tts_tupleDescriptor, + slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); + + if (shouldFree) + { + ExecMaterializeSlot(slot); + pfree(mtup); + } + } +} + +/* -------------------------------- + * ExecStoreVirtualTuple + * Mark a slot as containing a virtual tuple. + * + * The protocol for loading a slot with virtual tuple data is: + * * Call ExecClearTuple to mark the slot empty. + * * Store data into the Datum/isnull arrays. + * * Call ExecStoreVirtualTuple to mark the slot valid. + * This is a bit unclean but it avoids one round of data copying. + * -------------------------------- + */ +TupleTableSlot * +ExecStoreVirtualTuple(TupleTableSlot *slot) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + Assert(TTS_EMPTY(slot)); + + slot->tts_flags &= ~TTS_FLAG_EMPTY; + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + + return slot; +} + +/* -------------------------------- + * ExecStoreAllNullTuple + * Set up the slot to contain a null in every column. + * + * At first glance this might sound just like ExecClearTuple, but it's + * entirely different: the slot ends up full, not empty. + * -------------------------------- + */ +TupleTableSlot * +ExecStoreAllNullTuple(TupleTableSlot *slot) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + /* Clear any old contents */ + ExecClearTuple(slot); + + /* + * Fill all the columns of the virtual tuple with nulls + */ + MemSet(slot->tts_values, 0, + slot->tts_tupleDescriptor->natts * sizeof(Datum)); + memset(slot->tts_isnull, true, + slot->tts_tupleDescriptor->natts * sizeof(bool)); + + return ExecStoreVirtualTuple(slot); +} + +/* + * Store a HeapTuple in datum form, into a slot. That always requires + * deforming it and storing it in virtual form. + * + * Until the slot is materialized, the contents of the slot depend on the + * datum. 
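+ *
+ * A caller that needs the slot's contents to outlive the datum should
+ * therefore materialize it afterwards, roughly (a sketch; "d" is assumed
+ * to be a composite Datum matching the slot's descriptor):
+ *
+ *		ExecStoreHeapTupleDatum(d, slot);
+ *		ExecMaterializeSlot(slot);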
+ */ +void +ExecStoreHeapTupleDatum(Datum data, TupleTableSlot *slot) +{ + HeapTupleData tuple = {0}; + HeapTupleHeader td; + + td = DatumGetHeapTupleHeader(data); + + tuple.t_len = HeapTupleHeaderGetDatumLength(td); + tuple.t_self = td->t_ctid; + tuple.t_data = td; + + ExecClearTuple(slot); + + heap_deform_tuple(&tuple, slot->tts_tupleDescriptor, + slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); +} + +/* + * ExecFetchSlotHeapTuple - fetch HeapTuple representing the slot's content + * + * The returned HeapTuple represents the slot's content as closely as + * possible. + * + * If materialize is true, the contents of the slots will be made independent + * from the underlying storage (i.e. all buffer pins are released, memory is + * allocated in the slot's context). + * + * If shouldFree is not-NULL it'll be set to true if the returned tuple has + * been allocated in the calling memory context, and must be freed by the + * caller (via explicit pfree() or a memory context reset). + * + * NB: If materialize is true, modifications of the returned tuple are + * allowed. But it depends on the type of the slot whether such modifications + * will also affect the slot's contents. While that is not the nicest + * behaviour, all such modifications are in the process of being removed. + */ +HeapTuple +ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(!TTS_EMPTY(slot)); + + /* Materialize the tuple so that the slot "owns" it, if requested. */ + if (materialize) + slot->tts_ops->materialize(slot); + + if (slot->tts_ops->get_heap_tuple == NULL) + { + if (shouldFree) + *shouldFree = true; + return slot->tts_ops->copy_heap_tuple(slot); + } + else + { + if (shouldFree) + *shouldFree = false; + return slot->tts_ops->get_heap_tuple(slot); + } +} + +/* -------------------------------- + * ExecFetchSlotMinimalTuple + * Fetch the slot's minimal physical tuple. + * + * If the given tuple table slot can hold a minimal tuple, indicated by a + * non-NULL get_minimal_tuple callback, the function returns the minimal + * tuple returned by that callback. It assumes that the minimal tuple + * returned by the callback is "owned" by the slot i.e. the slot is + * responsible for freeing the memory consumed by the tuple. Hence it sets + * *shouldFree to false, indicating that the caller should not free the + * memory consumed by the minimal tuple. In this case the returned minimal + * tuple should be considered as read-only. + * + * If that callback is not supported, it calls copy_minimal_tuple callback + * which is expected to return a copy of minimal tuple representing the + * contents of the slot. In this case *shouldFree is set to true, + * indicating the caller that it should free the memory consumed by the + * minimal tuple. In this case the returned minimal tuple may be written + * up. + * -------------------------------- + */ +MinimalTuple +ExecFetchSlotMinimalTuple(TupleTableSlot *slot, + bool *shouldFree) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(!TTS_EMPTY(slot)); + + if (slot->tts_ops->get_minimal_tuple) + { + if (shouldFree) + *shouldFree = false; + return slot->tts_ops->get_minimal_tuple(slot); + } + else + { + if (shouldFree) + *shouldFree = true; + return slot->tts_ops->copy_minimal_tuple(slot); + } +} + +/* -------------------------------- + * ExecFetchSlotHeapTupleDatum + * Fetch the slot's tuple as a composite-type Datum. 
+ * + * The result is always freshly palloc'd in the caller's memory context. + * -------------------------------- + */ +Datum +ExecFetchSlotHeapTupleDatum(TupleTableSlot *slot) +{ + HeapTuple tup; + TupleDesc tupdesc; + bool shouldFree; + Datum ret; + + /* Fetch slot's contents in regular-physical-tuple form */ + tup = ExecFetchSlotHeapTuple(slot, false, &shouldFree); + tupdesc = slot->tts_tupleDescriptor; + + /* Convert to Datum form */ + ret = heap_copy_tuple_as_datum(tup, tupdesc); + + if (shouldFree) + pfree(tup); + + return ret; +} + +/* ---------------------------------------------------------------- + * convenience initialization routines + * ---------------------------------------------------------------- + */ + +/* ---------------- + * ExecInitResultTypeTL + * + * Initialize result type, using the plan node's targetlist. + * ---------------- + */ +void +ExecInitResultTypeTL(PlanState *planstate) +{ + TupleDesc tupDesc = ExecTypeFromTL(planstate->plan->targetlist); + + planstate->ps_ResultTupleDesc = tupDesc; +} + +/* -------------------------------- + * ExecInit{Result,Scan,Extra}TupleSlot[TL] + * + * These are convenience routines to initialize the specified slot + * in nodes inheriting the appropriate state. ExecInitExtraTupleSlot + * is used for initializing special-purpose slots. + * -------------------------------- + */ + +/* ---------------- + * ExecInitResultTupleSlotTL + * + * Initialize result tuple slot, using the tuple descriptor previously + * computed with ExecInitResultTypeTL(). + * ---------------- + */ +void +ExecInitResultSlot(PlanState *planstate, const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot; + + slot = ExecAllocTableSlot(&planstate->state->es_tupleTable, + planstate->ps_ResultTupleDesc, tts_ops); + planstate->ps_ResultTupleSlot = slot; + + planstate->resultopsfixed = planstate->ps_ResultTupleDesc != NULL; + planstate->resultops = tts_ops; + planstate->resultopsset = true; +} + +/* ---------------- + * ExecInitResultTupleSlotTL + * + * Initialize result tuple slot, using the plan node's targetlist. + * ---------------- + */ +void +ExecInitResultTupleSlotTL(PlanState *planstate, + const TupleTableSlotOps *tts_ops) +{ + ExecInitResultTypeTL(planstate); + ExecInitResultSlot(planstate, tts_ops); +} + +/* ---------------- + * ExecInitScanTupleSlot + * ---------------- + */ +void +ExecInitScanTupleSlot(EState *estate, ScanState *scanstate, + TupleDesc tupledesc, const TupleTableSlotOps *tts_ops) +{ + scanstate->ss_ScanTupleSlot = ExecAllocTableSlot(&estate->es_tupleTable, + tupledesc, tts_ops); + scanstate->ps.scandesc = tupledesc; + scanstate->ps.scanopsfixed = tupledesc != NULL; + scanstate->ps.scanops = tts_ops; + scanstate->ps.scanopsset = true; +} + +/* ---------------- + * ExecInitExtraTupleSlot + * + * Return a newly created slot. If tupledesc is non-NULL the slot will have + * that as its fixed tupledesc. Otherwise the caller needs to use + * ExecSetSlotDescriptor() to set the descriptor before use. + * ---------------- + */ +TupleTableSlot * +ExecInitExtraTupleSlot(EState *estate, + TupleDesc tupledesc, + const TupleTableSlotOps *tts_ops) +{ + return ExecAllocTableSlot(&estate->es_tupleTable, tupledesc, tts_ops); +} + +/* ---------------- + * ExecInitNullTupleSlot + * + * Build a slot containing an all-nulls tuple of the given type. + * This is used as a substitute for an input tuple when performing an + * outer join. 
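+ *
+ * For instance, a join node can set up a null-substitute for its inner
+ * side roughly like this (a sketch; "innerDesc" stands for the inner
+ * plan's result descriptor):
+ *
+ *		nullslot = ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);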
+ * ---------------- + */ +TupleTableSlot * +ExecInitNullTupleSlot(EState *estate, TupleDesc tupType, + const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot = ExecInitExtraTupleSlot(estate, tupType, tts_ops); + + return ExecStoreAllNullTuple(slot); +} + +/* --------------------------------------------------------------- + * Routines for setting/accessing attributes in a slot. + * --------------------------------------------------------------- + */ + +/* + * Fill in missing values for a TupleTableSlot. + * + * This is only exposed because it's needed for JIT compiled tuple + * deforming. That exception aside, there should be no callers outside of this + * file. + */ +void +slot_getmissingattrs(TupleTableSlot *slot, int startAttNum, int lastAttNum) +{ + AttrMissing *attrmiss = NULL; + + if (slot->tts_tupleDescriptor->constr) + attrmiss = slot->tts_tupleDescriptor->constr->missing; + + if (!attrmiss) + { + /* no missing values array at all, so just fill everything in as NULL */ + memset(slot->tts_values + startAttNum, 0, + (lastAttNum - startAttNum) * sizeof(Datum)); + memset(slot->tts_isnull + startAttNum, 1, + (lastAttNum - startAttNum) * sizeof(bool)); + } + else + { + int missattnum; + + /* if there is a missing values array we must process them one by one */ + for (missattnum = startAttNum; + missattnum < lastAttNum; + missattnum++) + { + slot->tts_values[missattnum] = attrmiss[missattnum].am_value; + slot->tts_isnull[missattnum] = !attrmiss[missattnum].am_present; + } + } +} + +/* + * slot_getsomeattrs_int - workhorse for slot_getsomeattrs() + */ +void +slot_getsomeattrs_int(TupleTableSlot *slot, int attnum) +{ + /* Check for caller errors */ + Assert(slot->tts_nvalid < attnum); /* checked in slot_getsomeattrs */ + Assert(attnum > 0); + + if (unlikely(attnum > slot->tts_tupleDescriptor->natts)) + elog(ERROR, "invalid attribute number %d", attnum); + + /* Fetch as many attributes as possible from the underlying tuple. */ + slot->tts_ops->getsomeattrs(slot, attnum); + + /* + * If the underlying tuple doesn't have enough attributes, tuple + * descriptor must have the missing attributes. + */ + if (unlikely(slot->tts_nvalid < attnum)) + { + slot_getmissingattrs(slot, slot->tts_nvalid, attnum); + slot->tts_nvalid = attnum; + } +} + +/* ---------------------------------------------------------------- + * ExecTypeFromTL + * + * Generate a tuple descriptor for the result tuple of a targetlist. + * (A parse/plan tlist must be passed, not an ExprState tlist.) + * Note that resjunk columns, if any, are included in the result. + * + * Currently there are about 4 different places where we create + * TupleDescriptors. They should all be merged, or perhaps + * be rewritten to call BuildDesc(). + * ---------------------------------------------------------------- + */ +TupleDesc +ExecTypeFromTL(List *targetList) +{ + return ExecTypeFromTLInternal(targetList, false); +} + +/* ---------------------------------------------------------------- + * ExecCleanTypeFromTL + * + * Same as above, but resjunk columns are omitted from the result. 
+ * ---------------------------------------------------------------- + */ +TupleDesc +ExecCleanTypeFromTL(List *targetList) +{ + return ExecTypeFromTLInternal(targetList, true); +} + +static TupleDesc +ExecTypeFromTLInternal(List *targetList, bool skipjunk) +{ + TupleDesc typeInfo; + ListCell *l; + int len; + int cur_resno = 1; + + if (skipjunk) + len = ExecCleanTargetListLength(targetList); + else + len = ExecTargetListLength(targetList); + typeInfo = CreateTemplateTupleDesc(len); + + foreach(l, targetList) + { + TargetEntry *tle = lfirst(l); + + if (skipjunk && tle->resjunk) + continue; + TupleDescInitEntry(typeInfo, + cur_resno, + tle->resname, + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), + 0); + TupleDescInitEntryCollation(typeInfo, + cur_resno, + exprCollation((Node *) tle->expr)); + cur_resno++; + } + + return typeInfo; +} + +/* + * ExecTypeFromExprList - build a tuple descriptor from a list of Exprs + * + * This is roughly like ExecTypeFromTL, but we work from bare expressions + * not TargetEntrys. No names are attached to the tupledesc's columns. + */ +TupleDesc +ExecTypeFromExprList(List *exprList) +{ + TupleDesc typeInfo; + ListCell *lc; + int cur_resno = 1; + + typeInfo = CreateTemplateTupleDesc(list_length(exprList)); + + foreach(lc, exprList) + { + Node *e = lfirst(lc); + + TupleDescInitEntry(typeInfo, + cur_resno, + NULL, + exprType(e), + exprTypmod(e), + 0); + TupleDescInitEntryCollation(typeInfo, + cur_resno, + exprCollation(e)); + cur_resno++; + } + + return typeInfo; +} + +/* + * ExecTypeSetColNames - set column names in a RECORD TupleDesc + * + * Column names must be provided as an alias list (list of String nodes). + */ +void +ExecTypeSetColNames(TupleDesc typeInfo, List *namesList) +{ + int colno = 0; + ListCell *lc; + + /* It's only OK to change col names in a not-yet-blessed RECORD type */ + Assert(typeInfo->tdtypeid == RECORDOID); + Assert(typeInfo->tdtypmod < 0); + + foreach(lc, namesList) + { + char *cname = strVal(lfirst(lc)); + Form_pg_attribute attr; + + /* Guard against too-long names list (probably can't happen) */ + if (colno >= typeInfo->natts) + break; + attr = TupleDescAttr(typeInfo, colno); + colno++; + + /* + * Do nothing for empty aliases or dropped columns (these cases + * probably can't arise in RECORD types, either) + */ + if (cname[0] == '\0' || attr->attisdropped) + continue; + + /* OK, assign the column name */ + namestrcpy(&(attr->attname), cname); + } +} + +/* + * BlessTupleDesc - make a completed tuple descriptor useful for SRFs + * + * Rowtype Datums returned by a function must contain valid type information. + * This happens "for free" if the tupdesc came from a relcache entry, but + * not if we have manufactured a tupdesc for a transient RECORD datatype. + * In that case we have to notify typcache.c of the existence of the type. + */ +TupleDesc +BlessTupleDesc(TupleDesc tupdesc) +{ + if (tupdesc->tdtypeid == RECORDOID && + tupdesc->tdtypmod < 0) + assign_record_type_typmod(tupdesc); + + return tupdesc; /* just for notational convenience */ +} + +/* + * TupleDescGetAttInMetadata - Build an AttInMetadata structure based on the + * supplied TupleDesc. AttInMetadata can be used in conjunction with C strings + * to produce a properly formed tuple. 
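+ *
+ * The usual pattern (an illustrative sketch; "tupdesc" and the array of
+ * C strings "values" are assumed to be prepared by the caller) is:
+ *
+ *		attinmeta = TupleDescGetAttInMetadata(tupdesc);
+ *		tuple = BuildTupleFromCStrings(attinmeta, values);
+ *		result = HeapTupleGetDatum(tuple);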
+ */ +AttInMetadata * +TupleDescGetAttInMetadata(TupleDesc tupdesc) +{ + int natts = tupdesc->natts; + int i; + Oid atttypeid; + Oid attinfuncid; + FmgrInfo *attinfuncinfo; + Oid *attioparams; + int32 *atttypmods; + AttInMetadata *attinmeta; + + attinmeta = (AttInMetadata *) palloc(sizeof(AttInMetadata)); + + /* "Bless" the tupledesc so that we can make rowtype datums with it */ + attinmeta->tupdesc = BlessTupleDesc(tupdesc); + + /* + * Gather info needed later to call the "in" function for each attribute + */ + attinfuncinfo = (FmgrInfo *) palloc0(natts * sizeof(FmgrInfo)); + attioparams = (Oid *) palloc0(natts * sizeof(Oid)); + atttypmods = (int32 *) palloc0(natts * sizeof(int32)); + + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + + /* Ignore dropped attributes */ + if (!att->attisdropped) + { + atttypeid = att->atttypid; + getTypeInputInfo(atttypeid, &attinfuncid, &attioparams[i]); + fmgr_info(attinfuncid, &attinfuncinfo[i]); + atttypmods[i] = att->atttypmod; + } + } + attinmeta->attinfuncs = attinfuncinfo; + attinmeta->attioparams = attioparams; + attinmeta->atttypmods = atttypmods; + + return attinmeta; +} + +/* + * BuildTupleFromCStrings - build a HeapTuple given user data in C string form. + * values is an array of C strings, one for each attribute of the return tuple. + * A NULL string pointer indicates we want to create a NULL field. + */ +HeapTuple +BuildTupleFromCStrings(AttInMetadata *attinmeta, char **values) +{ + TupleDesc tupdesc = attinmeta->tupdesc; + int natts = tupdesc->natts; + Datum *dvalues; + bool *nulls; + int i; + HeapTuple tuple; + + dvalues = (Datum *) palloc(natts * sizeof(Datum)); + nulls = (bool *) palloc(natts * sizeof(bool)); + + /* + * Call the "in" function for each non-dropped attribute, even for nulls, + * to support domains. + */ + for (i = 0; i < natts; i++) + { + if (!TupleDescAttr(tupdesc, i)->attisdropped) + { + /* Non-dropped attributes */ + dvalues[i] = InputFunctionCall(&attinmeta->attinfuncs[i], + values[i], + attinmeta->attioparams[i], + attinmeta->atttypmods[i]); + if (values[i] != NULL) + nulls[i] = false; + else + nulls[i] = true; + } + else + { + /* Handle dropped attributes by setting to NULL */ + dvalues[i] = (Datum) 0; + nulls[i] = true; + } + } + + /* + * Form a tuple + */ + tuple = heap_form_tuple(tupdesc, dvalues, nulls); + + /* + * Release locally palloc'd space. XXX would probably be good to pfree + * values of pass-by-reference datums, as well. + */ + pfree(dvalues); + pfree(nulls); + + return tuple; +} + +/* + * HeapTupleHeaderGetDatum - convert a HeapTupleHeader pointer to a Datum. + * + * This must *not* get applied to an on-disk tuple; the tuple should be + * freshly made by heap_form_tuple or some wrapper routine for it (such as + * BuildTupleFromCStrings). Be sure also that the tupledesc used to build + * the tuple has a properly "blessed" rowtype. + * + * Formerly this was a macro equivalent to PointerGetDatum, relying on the + * fact that heap_form_tuple fills in the appropriate tuple header fields + * for a composite Datum. However, we now require that composite Datums not + * contain any external TOAST pointers. We do not want heap_form_tuple itself + * to enforce that; more specifically, the rule applies only to actual Datums + * and not to HeapTuple structures. Therefore, HeapTupleHeaderGetDatum is + * now a function that detects whether there are externally-toasted fields + * and constructs a new tuple with inlined fields if so. 
We still need + * heap_form_tuple to insert the Datum header fields, because otherwise this + * code would have no way to obtain a tupledesc for the tuple. + * + * Note that if we do build a new tuple, it's palloc'd in the current + * memory context. Beware of code that changes context between the initial + * heap_form_tuple/etc call and calling HeapTuple(Header)GetDatum. + * + * For performance-critical callers, it could be worthwhile to take extra + * steps to ensure that there aren't TOAST pointers in the output of + * heap_form_tuple to begin with. It's likely however that the costs of the + * typcache lookup and tuple disassembly/reassembly are swamped by TOAST + * dereference costs, so that the benefits of such extra effort would be + * minimal. + * + * XXX it would likely be better to create wrapper functions that produce + * a composite Datum from the field values in one step. However, there's + * enough code using the existing APIs that we couldn't get rid of this + * hack anytime soon. + */ +Datum +HeapTupleHeaderGetDatum(HeapTupleHeader tuple) +{ + Datum result; + TupleDesc tupDesc; + + /* No work if there are no external TOAST pointers in the tuple */ + if (!HeapTupleHeaderHasExternal(tuple)) + return PointerGetDatum(tuple); + + /* Use the type data saved by heap_form_tuple to look up the rowtype */ + tupDesc = lookup_rowtype_tupdesc(HeapTupleHeaderGetTypeId(tuple), + HeapTupleHeaderGetTypMod(tuple)); + + /* And do the flattening */ + result = toast_flatten_tuple_to_datum(tuple, + HeapTupleHeaderGetDatumLength(tuple), + tupDesc); + + ReleaseTupleDesc(tupDesc); + + return result; +} + + +/* + * Functions for sending tuples to the frontend (or other specified destination) + * as though it is a SELECT result. These are used by utility commands that + * need to project directly to the destination and don't need or want full + * table function capability. Currently used by EXPLAIN and SHOW ALL. + */ +TupOutputState * +begin_tup_output_tupdesc(DestReceiver *dest, + TupleDesc tupdesc, + const TupleTableSlotOps *tts_ops) +{ + TupOutputState *tstate; + + tstate = (TupOutputState *) palloc(sizeof(TupOutputState)); + + tstate->slot = MakeSingleTupleTableSlot(tupdesc, tts_ops); + tstate->dest = dest; + + tstate->dest->rStartup(tstate->dest, (int) CMD_SELECT, tupdesc); + + return tstate; +} + +/* + * write a single tuple + */ +void +do_tup_output(TupOutputState *tstate, Datum *values, bool *isnull) +{ + TupleTableSlot *slot = tstate->slot; + int natts = slot->tts_tupleDescriptor->natts; + + /* make sure the slot is clear */ + ExecClearTuple(slot); + + /* insert data */ + memcpy(slot->tts_values, values, natts * sizeof(Datum)); + memcpy(slot->tts_isnull, isnull, natts * sizeof(bool)); + + /* mark slot as containing a virtual tuple */ + ExecStoreVirtualTuple(slot); + + /* send the tuple to the receiver */ + (void) tstate->dest->receiveSlot(slot, tstate->dest); + + /* clean up */ + ExecClearTuple(slot); +} + +/* + * write a chunk of text, breaking at newline characters + * + * Should only be used with a single-TEXT-attribute tupdesc. 
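+ *
+ * Typical use by a utility command looks roughly like this (a sketch;
+ * "dest", the single-TEXT-column "tupdesc" and the text "txt" are
+ * assumed to be prepared by the caller):
+ *
+ *		tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
+ *		do_text_output_multiline(tstate, txt);
+ *		end_tup_output(tstate);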
+ */ +void +do_text_output_multiline(TupOutputState *tstate, const char *txt) +{ + Datum values[1]; + bool isnull[1] = {false}; + + while (*txt) + { + const char *eol; + int len; + + eol = strchr(txt, '\n'); + if (eol) + { + len = eol - txt; + eol++; + } + else + { + len = strlen(txt); + eol = txt + len; + } + + values[0] = PointerGetDatum(cstring_to_text_with_len(txt, len)); + do_tup_output(tstate, values, isnull); + pfree(DatumGetPointer(values[0])); + txt = eol; + } +} + +void +end_tup_output(TupOutputState *tstate) +{ + tstate->dest->rShutdown(tstate->dest); + /* note that destroying the dest is not ours to do */ + ExecDropSingleTupleTableSlot(tstate->slot); + pfree(tstate); +} diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c new file mode 100644 index 0000000..ad11392 --- /dev/null +++ b/src/backend/executor/execUtils.c @@ -0,0 +1,1351 @@ +/*------------------------------------------------------------------------- + * + * execUtils.c + * miscellaneous executor utility routines + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execUtils.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * CreateExecutorState Create/delete executor working state + * FreeExecutorState + * CreateExprContext + * CreateStandaloneExprContext + * FreeExprContext + * ReScanExprContext + * + * ExecAssignExprContext Common code for plan node init routines. + * etc + * + * ExecOpenScanRelation Common code for scan node init routines. + * + * ExecInitRangeTable Set up executor's range-table-related data. + * + * ExecGetRangeTableRelation Fetch Relation for a rangetable entry. + * + * executor_errposition Report syntactic position of an error. + * + * RegisterExprContextCallback Register function shutdown callback + * UnregisterExprContextCallback Deregister function shutdown callback + * + * GetAttributeByName Runtime extraction of columns from tuples. + * GetAttributeByNum + * + * NOTES + * This file has traditionally been the place to stick misc. + * executor support stuff that doesn't really go anyplace else. + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "access/relscan.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "executor/executor.h" +#include "executor/execPartition.h" +#include "jit/jit.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "parser/parsetree.h" +#include "partitioning/partdesc.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/typcache.h" + + +static bool tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc); +static void ShutdownExprContext(ExprContext *econtext, bool isCommit); + + +/* ---------------------------------------------------------------- + * Executor state and memory management functions + * ---------------------------------------------------------------- + */ + +/* ---------------- + * CreateExecutorState + * + * Create and initialize an EState node, which is the root of + * working storage for an entire Executor invocation. + * + * Principally, this creates the per-query memory context that will be + * used to hold all working data that lives till the end of the query. 
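+ *
+ * A minimal standalone lifecycle for expression evaluation looks roughly
+ * like this (an illustrative sketch; error handling omitted):
+ *
+ *		EState	   *estate = CreateExecutorState();
+ *		ExprContext *econtext = CreateExprContext(estate);
+ *
+ *		... evaluate expressions using econtext ...
+ *
+ *		FreeExecutorState(estate);
+ *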
+ * Note that the per-query context will become a child of the caller's + * CurrentMemoryContext. + * ---------------- + */ +EState * +CreateExecutorState(void) +{ + EState *estate; + MemoryContext qcontext; + MemoryContext oldcontext; + + /* + * Create the per-query context for this Executor run. + */ + qcontext = AllocSetContextCreate(CurrentMemoryContext, + "ExecutorState", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the EState node within the per-query context. This way, we don't + * need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(qcontext); + + estate = makeNode(EState); + + /* + * Initialize all fields of the Executor State structure + */ + estate->es_direction = ForwardScanDirection; + estate->es_snapshot = InvalidSnapshot; /* caller must initialize this */ + estate->es_crosscheck_snapshot = InvalidSnapshot; /* no crosscheck */ + estate->es_range_table = NIL; + estate->es_range_table_size = 0; + estate->es_relations = NULL; + estate->es_rowmarks = NULL; + estate->es_plannedstmt = NULL; + + estate->es_junkFilter = NULL; + + estate->es_output_cid = (CommandId) 0; + + estate->es_result_relations = NULL; + estate->es_opened_result_relations = NIL; + estate->es_tuple_routing_result_relations = NIL; + estate->es_trig_target_relations = NIL; + + estate->es_param_list_info = NULL; + estate->es_param_exec_vals = NULL; + + estate->es_queryEnv = NULL; + + estate->es_query_cxt = qcontext; + + estate->es_tupleTable = NIL; + + estate->es_processed = 0; + + estate->es_top_eflags = 0; + estate->es_instrument = 0; + estate->es_finished = false; + + estate->es_exprcontexts = NIL; + + estate->es_subplanstates = NIL; + + estate->es_auxmodifytables = NIL; + + estate->es_per_tuple_exprcontext = NULL; + + estate->es_sourceText = NULL; + + estate->es_use_parallel_mode = false; + + estate->es_jit_flags = 0; + estate->es_jit = NULL; + + /* + * Return the executor state structure + */ + MemoryContextSwitchTo(oldcontext); + + return estate; +} + +/* ---------------- + * FreeExecutorState + * + * Release an EState along with all remaining working storage. + * + * Note: this is not responsible for releasing non-memory resources, such as + * open relations or buffer pins. But it will shut down any still-active + * ExprContexts within the EState and deallocate associated JITed expressions. + * That is sufficient cleanup for situations where the EState has only been + * used for expression evaluation, and not to run a complete Plan. + * + * This can be called in any memory context ... so long as it's not one + * of the ones to be freed. + * ---------------- + */ +void +FreeExecutorState(EState *estate) +{ + /* + * Shut down and free any remaining ExprContexts. We do this explicitly + * to ensure that any remaining shutdown callbacks get called (since they + * might need to release resources that aren't simply memory within the + * per-query memory context). + */ + while (estate->es_exprcontexts) + { + /* + * XXX: seems there ought to be a faster way to implement this than + * repeated list_delete(), no? 
+ */ + FreeExprContext((ExprContext *) linitial(estate->es_exprcontexts), + true); + /* FreeExprContext removed the list link for us */ + } + + /* release JIT context, if allocated */ + if (estate->es_jit) + { + jit_release_context(estate->es_jit); + estate->es_jit = NULL; + } + + /* release partition directory, if allocated */ + if (estate->es_partition_directory) + { + DestroyPartitionDirectory(estate->es_partition_directory); + estate->es_partition_directory = NULL; + } + + /* + * Free the per-query memory context, thereby releasing all working + * memory, including the EState node itself. + */ + MemoryContextDelete(estate->es_query_cxt); +} + +/* + * Internal implementation for CreateExprContext() and CreateWorkExprContext() + * that allows control over the AllocSet parameters. + */ +static ExprContext * +CreateExprContextInternal(EState *estate, Size minContextSize, + Size initBlockSize, Size maxBlockSize) +{ + ExprContext *econtext; + MemoryContext oldcontext; + + /* Create the ExprContext node within the per-query memory context */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + econtext = makeNode(ExprContext); + + /* Initialize fields of ExprContext */ + econtext->ecxt_scantuple = NULL; + econtext->ecxt_innertuple = NULL; + econtext->ecxt_outertuple = NULL; + + econtext->ecxt_per_query_memory = estate->es_query_cxt; + + /* + * Create working memory for expression evaluation in this context. + */ + econtext->ecxt_per_tuple_memory = + AllocSetContextCreate(estate->es_query_cxt, + "ExprContext", + minContextSize, + initBlockSize, + maxBlockSize); + + econtext->ecxt_param_exec_vals = estate->es_param_exec_vals; + econtext->ecxt_param_list_info = estate->es_param_list_info; + + econtext->ecxt_aggvalues = NULL; + econtext->ecxt_aggnulls = NULL; + + econtext->caseValue_datum = (Datum) 0; + econtext->caseValue_isNull = true; + + econtext->domainValue_datum = (Datum) 0; + econtext->domainValue_isNull = true; + + econtext->ecxt_estate = estate; + + econtext->ecxt_callbacks = NULL; + + /* + * Link the ExprContext into the EState to ensure it is shut down when the + * EState is freed. Because we use lcons(), shutdowns will occur in + * reverse order of creation, which may not be essential but can't hurt. + */ + estate->es_exprcontexts = lcons(econtext, estate->es_exprcontexts); + + MemoryContextSwitchTo(oldcontext); + + return econtext; +} + +/* ---------------- + * CreateExprContext + * + * Create a context for expression evaluation within an EState. + * + * An executor run may require multiple ExprContexts (we usually make one + * for each Plan node, and a separate one for per-output-tuple processing + * such as constraint checking). Each ExprContext has its own "per-tuple" + * memory context. + * + * Note we make no assumption about the caller's memory context. + * ---------------- + */ +ExprContext * +CreateExprContext(EState *estate) +{ + return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_SIZES); +} + + +/* ---------------- + * CreateWorkExprContext + * + * Like CreateExprContext, but specifies the AllocSet sizes to be reasonable + * in proportion to work_mem. If the maximum block allocation size is too + * large, it's easy to skip right past work_mem with a single allocation. 
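+ *
+ * A worked example (illustration only): with work_mem = 4096kB, the loop
+ * below starts maxBlockSize at ALLOCSET_DEFAULT_MAXSIZE (8MB) and halves
+ * it until 16 * maxBlockSize <= 4096 * 1024 bytes, stopping at 256kB; so
+ * no single allocated block can exceed 1/16 of work_mem.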
+ * ---------------- + */ +ExprContext * +CreateWorkExprContext(EState *estate) +{ + Size minContextSize = ALLOCSET_DEFAULT_MINSIZE; + Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE; + Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; + + /* choose the maxBlockSize to be no larger than 1/16 of work_mem */ + while (16 * maxBlockSize > work_mem * 1024L) + maxBlockSize >>= 1; + + if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE) + maxBlockSize = ALLOCSET_DEFAULT_INITSIZE; + + return CreateExprContextInternal(estate, minContextSize, + initBlockSize, maxBlockSize); +} + +/* ---------------- + * CreateStandaloneExprContext + * + * Create a context for standalone expression evaluation. + * + * An ExprContext made this way can be used for evaluation of expressions + * that contain no Params, subplans, or Var references (it might work to + * put tuple references into the scantuple field, but it seems unwise). + * + * The ExprContext struct is allocated in the caller's current memory + * context, which also becomes its "per query" context. + * + * It is caller's responsibility to free the ExprContext when done, + * or at least ensure that any shutdown callbacks have been called + * (ReScanExprContext() is suitable). Otherwise, non-memory resources + * might be leaked. + * ---------------- + */ +ExprContext * +CreateStandaloneExprContext(void) +{ + ExprContext *econtext; + + /* Create the ExprContext node within the caller's memory context */ + econtext = makeNode(ExprContext); + + /* Initialize fields of ExprContext */ + econtext->ecxt_scantuple = NULL; + econtext->ecxt_innertuple = NULL; + econtext->ecxt_outertuple = NULL; + + econtext->ecxt_per_query_memory = CurrentMemoryContext; + + /* + * Create working memory for expression evaluation in this context. + */ + econtext->ecxt_per_tuple_memory = + AllocSetContextCreate(CurrentMemoryContext, + "ExprContext", + ALLOCSET_DEFAULT_SIZES); + + econtext->ecxt_param_exec_vals = NULL; + econtext->ecxt_param_list_info = NULL; + + econtext->ecxt_aggvalues = NULL; + econtext->ecxt_aggnulls = NULL; + + econtext->caseValue_datum = (Datum) 0; + econtext->caseValue_isNull = true; + + econtext->domainValue_datum = (Datum) 0; + econtext->domainValue_isNull = true; + + econtext->ecxt_estate = NULL; + + econtext->ecxt_callbacks = NULL; + + return econtext; +} + +/* ---------------- + * FreeExprContext + * + * Free an expression context, including calling any remaining + * shutdown callbacks. + * + * Since we free the temporary context used for expression evaluation, + * any previously computed pass-by-reference expression result will go away! + * + * If isCommit is false, we are being called in error cleanup, and should + * not call callbacks but only release memory. (It might be better to call + * the callbacks and pass the isCommit flag to them, but that would require + * more invasive code changes than currently seems justified.) + * + * Note we make no assumption about the caller's memory context. 
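+ *
+ * Hedged pairing sketch (names invented): a context made with
+ * CreateStandaloneExprContext() above can simply be handed to this
+ * function when done, since its ecxt_estate field is NULL:
+ *
+ *		ExprContext *econtext = CreateStandaloneExprContext();
+ *		... evaluate expressions ...
+ *		FreeExprContext(econtext, true);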
+ * ---------------- + */ +void +FreeExprContext(ExprContext *econtext, bool isCommit) +{ + EState *estate; + + /* Call any registered callbacks */ + ShutdownExprContext(econtext, isCommit); + /* And clean up the memory used */ + MemoryContextDelete(econtext->ecxt_per_tuple_memory); + /* Unlink self from owning EState, if any */ + estate = econtext->ecxt_estate; + if (estate) + estate->es_exprcontexts = list_delete_ptr(estate->es_exprcontexts, + econtext); + /* And delete the ExprContext node */ + pfree(econtext); +} + +/* + * ReScanExprContext + * + * Reset an expression context in preparation for a rescan of its + * plan node. This requires calling any registered shutdown callbacks, + * since any partially complete set-returning-functions must be canceled. + * + * Note we make no assumption about the caller's memory context. + */ +void +ReScanExprContext(ExprContext *econtext) +{ + /* Call any registered callbacks */ + ShutdownExprContext(econtext, true); + /* And clean up the memory used */ + MemoryContextReset(econtext->ecxt_per_tuple_memory); +} + +/* + * Build a per-output-tuple ExprContext for an EState. + * + * This is normally invoked via GetPerTupleExprContext() macro, + * not directly. + */ +ExprContext * +MakePerTupleExprContext(EState *estate) +{ + if (estate->es_per_tuple_exprcontext == NULL) + estate->es_per_tuple_exprcontext = CreateExprContext(estate); + + return estate->es_per_tuple_exprcontext; +} + + +/* ---------------------------------------------------------------- + * miscellaneous node-init support functions + * + * Note: all of these are expected to be called with CurrentMemoryContext + * equal to the per-query memory context. + * ---------------------------------------------------------------- + */ + +/* ---------------- + * ExecAssignExprContext + * + * This initializes the ps_ExprContext field. It is only necessary + * to do this for nodes which use ExecQual or ExecProject + * because those routines require an econtext. Other nodes that + * don't have to evaluate expressions don't need to do this. 
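+ *
+ * Sketch of the typical call pattern in a node's ExecInitFoo routine
+ * (a hypothetical node, shown only to illustrate the convention):
+ *
+ *		fooState->ps.plan = (Plan *) node;
+ *		fooState->ps.state = estate;
+ *		ExecAssignExprContext(estate, &fooState->ps);
+ *		fooState->ps.qual =
+ *			ExecInitQual(node->plan.qual, (PlanState *) fooState);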
+ * ---------------- + */ +void +ExecAssignExprContext(EState *estate, PlanState *planstate) +{ + planstate->ps_ExprContext = CreateExprContext(estate); +} + +/* ---------------- + * ExecGetResultType + * ---------------- + */ +TupleDesc +ExecGetResultType(PlanState *planstate) +{ + return planstate->ps_ResultTupleDesc; +} + +/* + * ExecGetResultSlotOps - information about node's type of result slot + */ +const TupleTableSlotOps * +ExecGetResultSlotOps(PlanState *planstate, bool *isfixed) +{ + if (planstate->resultopsset && planstate->resultops) + { + if (isfixed) + *isfixed = planstate->resultopsfixed; + return planstate->resultops; + } + + if (isfixed) + { + if (planstate->resultopsset) + *isfixed = planstate->resultopsfixed; + else if (planstate->ps_ResultTupleSlot) + *isfixed = TTS_FIXED(planstate->ps_ResultTupleSlot); + else + *isfixed = false; + } + + if (!planstate->ps_ResultTupleSlot) + return &TTSOpsVirtual; + + return planstate->ps_ResultTupleSlot->tts_ops; +} + + +/* ---------------- + * ExecAssignProjectionInfo + * + * forms the projection information from the node's targetlist + * + * Notes for inputDesc are same as for ExecBuildProjectionInfo: supply it + * for a relation-scan node, can pass NULL for upper-level nodes + * ---------------- + */ +void +ExecAssignProjectionInfo(PlanState *planstate, + TupleDesc inputDesc) +{ + planstate->ps_ProjInfo = + ExecBuildProjectionInfo(planstate->plan->targetlist, + planstate->ps_ExprContext, + planstate->ps_ResultTupleSlot, + planstate, + inputDesc); +} + + +/* ---------------- + * ExecConditionalAssignProjectionInfo + * + * as ExecAssignProjectionInfo, but store NULL rather than building projection + * info if no projection is required + * ---------------- + */ +void +ExecConditionalAssignProjectionInfo(PlanState *planstate, TupleDesc inputDesc, + Index varno) +{ + if (tlist_matches_tupdesc(planstate, + planstate->plan->targetlist, + varno, + inputDesc)) + { + planstate->ps_ProjInfo = NULL; + planstate->resultopsset = planstate->scanopsset; + planstate->resultopsfixed = planstate->scanopsfixed; + planstate->resultops = planstate->scanops; + } + else + { + if (!planstate->ps_ResultTupleSlot) + { + ExecInitResultSlot(planstate, &TTSOpsVirtual); + planstate->resultops = &TTSOpsVirtual; + planstate->resultopsfixed = true; + planstate->resultopsset = true; + } + ExecAssignProjectionInfo(planstate, inputDesc); + } +} + +static bool +tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc) +{ + int numattrs = tupdesc->natts; + int attrno; + ListCell *tlist_item = list_head(tlist); + + /* Check the tlist attributes */ + for (attrno = 1; attrno <= numattrs; attrno++) + { + Form_pg_attribute att_tup = TupleDescAttr(tupdesc, attrno - 1); + Var *var; + + if (tlist_item == NULL) + return false; /* tlist too short */ + var = (Var *) ((TargetEntry *) lfirst(tlist_item))->expr; + if (!var || !IsA(var, Var)) + return false; /* tlist item not a Var */ + /* if these Asserts fail, planner messed up */ + Assert(var->varno == varno); + Assert(var->varlevelsup == 0); + if (var->varattno != attrno) + return false; /* out of order */ + if (att_tup->attisdropped) + return false; /* table contains dropped columns */ + if (att_tup->atthasmissing) + return false; /* table contains cols with missing values */ + + /* + * Note: usually the Var's type should match the tupdesc exactly, but + * in situations involving unions of columns that have different + * typmods, the Var may have come from above the union and hence have + * typmod -1. 
This is a legitimate situation since the Var still + * describes the column, just not as exactly as the tupdesc does. We + * could change the planner to prevent it, but it'd then insert + * projection steps just to convert from specific typmod to typmod -1, + * which is pretty silly. + */ + if (var->vartype != att_tup->atttypid || + (var->vartypmod != att_tup->atttypmod && + var->vartypmod != -1)) + return false; /* type mismatch */ + + tlist_item = lnext(tlist, tlist_item); + } + + if (tlist_item) + return false; /* tlist too long */ + + return true; +} + +/* ---------------- + * ExecFreeExprContext + * + * A plan node's ExprContext should be freed explicitly during executor + * shutdown because there may be shutdown callbacks to call. (Other resources + * made by the above routines, such as projection info, don't need to be freed + * explicitly because they're just memory in the per-query memory context.) + * + * However ... there is no particular need to do it during ExecEndNode, + * because FreeExecutorState will free any remaining ExprContexts within + * the EState. Letting FreeExecutorState do it allows the ExprContexts to + * be freed in reverse order of creation, rather than order of creation as + * will happen if we delete them here, which saves O(N^2) work in the list + * cleanup inside FreeExprContext. + * ---------------- + */ +void +ExecFreeExprContext(PlanState *planstate) +{ + /* + * Per above discussion, don't actually delete the ExprContext. We do + * unlink it from the plan node, though. + */ + planstate->ps_ExprContext = NULL; +} + + +/* ---------------------------------------------------------------- + * Scan node support + * ---------------------------------------------------------------- + */ + +/* ---------------- + * ExecAssignScanType + * ---------------- + */ +void +ExecAssignScanType(ScanState *scanstate, TupleDesc tupDesc) +{ + TupleTableSlot *slot = scanstate->ss_ScanTupleSlot; + + ExecSetSlotDescriptor(slot, tupDesc); +} + +/* ---------------- + * ExecCreateScanSlotFromOuterPlan + * ---------------- + */ +void +ExecCreateScanSlotFromOuterPlan(EState *estate, + ScanState *scanstate, + const TupleTableSlotOps *tts_ops) +{ + PlanState *outerPlan; + TupleDesc tupDesc; + + outerPlan = outerPlanState(scanstate); + tupDesc = ExecGetResultType(outerPlan); + + ExecInitScanTupleSlot(estate, scanstate, tupDesc, tts_ops); +} + +/* ---------------------------------------------------------------- + * ExecRelationIsTargetRelation + * + * Detect whether a relation (identified by rangetable index) + * is one of the target relations of the query. + * + * Note: This is currently no longer used in core. We keep it around + * because FDWs may wish to use it to determine if their foreign table + * is a target relation. + * ---------------------------------------------------------------- + */ +bool +ExecRelationIsTargetRelation(EState *estate, Index scanrelid) +{ + return list_member_int(estate->es_plannedstmt->resultRelations, scanrelid); +} + +/* ---------------------------------------------------------------- + * ExecOpenScanRelation + * + * Open the heap relation to be scanned by a base-level scan plan node. + * This should be called during the node's ExecInit routine. + * ---------------------------------------------------------------- + */ +Relation +ExecOpenScanRelation(EState *estate, Index scanrelid, int eflags) +{ + Relation rel; + + /* Open the relation. 
*/ + rel = ExecGetRangeTableRelation(estate, scanrelid); + + /* + * Complain if we're attempting a scan of an unscannable relation, except + * when the query won't actually be run. This is a slightly klugy place + * to do this, perhaps, but there is no better place. + */ + if ((eflags & (EXEC_FLAG_EXPLAIN_ONLY | EXEC_FLAG_WITH_NO_DATA)) == 0 && + !RelationIsScannable(rel)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("materialized view \"%s\" has not been populated", + RelationGetRelationName(rel)), + errhint("Use the REFRESH MATERIALIZED VIEW command."))); + + return rel; +} + +/* + * ExecInitRangeTable + * Set up executor's range-table-related data + * + * In addition to the range table proper, initialize arrays that are + * indexed by rangetable index. + */ +void +ExecInitRangeTable(EState *estate, List *rangeTable) +{ + /* Remember the range table List as-is */ + estate->es_range_table = rangeTable; + + /* Set size of associated arrays */ + estate->es_range_table_size = list_length(rangeTable); + + /* + * Allocate an array to store an open Relation corresponding to each + * rangetable entry, and initialize entries to NULL. Relations are opened + * and stored here as needed. + */ + estate->es_relations = (Relation *) + palloc0(estate->es_range_table_size * sizeof(Relation)); + + /* + * es_result_relations and es_rowmarks are also parallel to + * es_range_table, but are allocated only if needed. + */ + estate->es_result_relations = NULL; + estate->es_rowmarks = NULL; +} + +/* + * ExecGetRangeTableRelation + * Open the Relation for a range table entry, if not already done + * + * The Relations will be closed again in ExecEndPlan(). + */ +Relation +ExecGetRangeTableRelation(EState *estate, Index rti) +{ + Relation rel; + + Assert(rti > 0 && rti <= estate->es_range_table_size); + + rel = estate->es_relations[rti - 1]; + if (rel == NULL) + { + /* First time through, so open the relation */ + RangeTblEntry *rte = exec_rt_fetch(rti, estate); + + Assert(rte->rtekind == RTE_RELATION); + + if (!IsParallelWorker()) + { + /* + * In a normal query, we should already have the appropriate lock, + * but verify that through an Assert. Since there's already an + * Assert inside table_open that insists on holding some lock, it + * seems sufficient to check this only when rellockmode is higher + * than the minimum. + */ + rel = table_open(rte->relid, NoLock); + Assert(rte->rellockmode == AccessShareLock || + CheckRelationLockedByMe(rel, rte->rellockmode, false)); + } + else + { + /* + * If we are a parallel worker, we need to obtain our own local + * lock on the relation. This ensures sane behavior in case the + * parent process exits before we do. + */ + rel = table_open(rte->relid, rte->rellockmode); + } + + estate->es_relations[rti - 1] = rel; + } + + return rel; +} + +/* + * ExecInitResultRelation + * Open relation given by the passed-in RT index and fill its + * ResultRelInfo node + * + * Here, we also save the ResultRelInfo in estate->es_result_relations array + * such that it can be accessed later using the RT index. 
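+ *
+ * Hedged illustration of the expected call pattern (the loop variables
+ * are invented, not taken from any particular caller):
+ *
+ *		foreach(lc, node->resultRelations)
+ *		{
+ *			Index		rti = lfirst_int(lc);
+ *
+ *			ExecInitResultRelation(estate, resultRelInfo, rti);
+ *			resultRelInfo++;
+ *		}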
+ */ +void +ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, + Index rti) +{ + Relation resultRelationDesc; + + resultRelationDesc = ExecGetRangeTableRelation(estate, rti); + InitResultRelInfo(resultRelInfo, + resultRelationDesc, + rti, + NULL, + estate->es_instrument); + + if (estate->es_result_relations == NULL) + estate->es_result_relations = (ResultRelInfo **) + palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *)); + estate->es_result_relations[rti - 1] = resultRelInfo; + + /* + * Saving in the list allows to avoid needlessly traversing the whole + * array when only a few of its entries are possibly non-NULL. + */ + estate->es_opened_result_relations = + lappend(estate->es_opened_result_relations, resultRelInfo); +} + +/* + * UpdateChangedParamSet + * Add changed parameters to a plan node's chgParam set + */ +void +UpdateChangedParamSet(PlanState *node, Bitmapset *newchg) +{ + Bitmapset *parmset; + + /* + * The plan node only depends on params listed in its allParam set. Don't + * include anything else into its chgParam set. + */ + parmset = bms_intersect(node->plan->allParam, newchg); + + /* + * Keep node->chgParam == NULL if there's not actually any members; this + * allows the simplest possible tests in executor node files. + */ + if (!bms_is_empty(parmset)) + node->chgParam = bms_join(node->chgParam, parmset); + else + bms_free(parmset); +} + +/* + * executor_errposition + * Report an execution-time cursor position, if possible. + * + * This is expected to be used within an ereport() call. The return value + * is a dummy (always 0, in fact). + * + * The locations stored in parsetrees are byte offsets into the source string. + * We have to convert them to 1-based character indexes for reporting to + * clients. (We do things this way to avoid unnecessary overhead in the + * normal non-error case: computing character indexes would be much more + * expensive than storing token offsets.) + */ +int +executor_errposition(EState *estate, int location) +{ + int pos; + + /* No-op if location was not provided */ + if (location < 0) + return 0; + /* Can't do anything if source text is not available */ + if (estate == NULL || estate->es_sourceText == NULL) + return 0; + /* Convert offset to character number */ + pos = pg_mbstrlen_with_len(estate->es_sourceText, location) + 1; + /* And pass it to the ereport mechanism */ + return errposition(pos); +} + +/* + * Register a shutdown callback in an ExprContext. + * + * Shutdown callbacks will be called (in reverse order of registration) + * when the ExprContext is deleted or rescanned. This provides a hook + * for functions called in the context to do any cleanup needed --- it's + * particularly useful for functions returning sets. Note that the + * callback will *not* be called in the event that execution is aborted + * by an error. + */ +void +RegisterExprContextCallback(ExprContext *econtext, + ExprContextCallbackFunction function, + Datum arg) +{ + ExprContext_CB *ecxt_callback; + + /* Save the info in appropriate memory context */ + ecxt_callback = (ExprContext_CB *) + MemoryContextAlloc(econtext->ecxt_per_query_memory, + sizeof(ExprContext_CB)); + + ecxt_callback->function = function; + ecxt_callback->arg = arg; + + /* link to front of list for appropriate execution order */ + ecxt_callback->next = econtext->ecxt_callbacks; + econtext->ecxt_callbacks = ecxt_callback; +} + +/* + * Deregister a shutdown callback in an ExprContext. + * + * Any list entries matching the function and arg will be removed. 
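+ *
+ * A hedged pairing sketch (the state struct, tuplestore, and function
+ * names are invented for illustration): a set-returning C function
+ * typically registers a cleanup callback and, if it finishes early,
+ * removes it again with the function below:
+ *
+ *		static void
+ *		my_srf_cleanup(Datum arg)
+ *		{
+ *			MyState    *state = (MyState *) DatumGetPointer(arg);
+ *
+ *			tuplestore_end(state->tstore);
+ *		}
+ *
+ *		RegisterExprContextCallback(econtext, my_srf_cleanup,
+ *									PointerGetDatum(state));
+ *		...
+ *		UnregisterExprContextCallback(econtext, my_srf_cleanup,
+ *									  PointerGetDatum(state));
+ *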
+ * This can be used if it's no longer necessary to call the callback. + */ +void +UnregisterExprContextCallback(ExprContext *econtext, + ExprContextCallbackFunction function, + Datum arg) +{ + ExprContext_CB **prev_callback; + ExprContext_CB *ecxt_callback; + + prev_callback = &econtext->ecxt_callbacks; + + while ((ecxt_callback = *prev_callback) != NULL) + { + if (ecxt_callback->function == function && ecxt_callback->arg == arg) + { + *prev_callback = ecxt_callback->next; + pfree(ecxt_callback); + } + else + prev_callback = &ecxt_callback->next; + } +} + +/* + * Call all the shutdown callbacks registered in an ExprContext. + * + * The callback list is emptied (important in case this is only a rescan + * reset, and not deletion of the ExprContext). + * + * If isCommit is false, just clean the callback list but don't call 'em. + * (See comment for FreeExprContext.) + */ +static void +ShutdownExprContext(ExprContext *econtext, bool isCommit) +{ + ExprContext_CB *ecxt_callback; + MemoryContext oldcontext; + + /* Fast path in normal case where there's nothing to do. */ + if (econtext->ecxt_callbacks == NULL) + return; + + /* + * Call the callbacks in econtext's per-tuple context. This ensures that + * any memory they might leak will get cleaned up. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Call each callback function in reverse registration order. + */ + while ((ecxt_callback = econtext->ecxt_callbacks) != NULL) + { + econtext->ecxt_callbacks = ecxt_callback->next; + if (isCommit) + ecxt_callback->function(ecxt_callback->arg); + pfree(ecxt_callback); + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * GetAttributeByName + * GetAttributeByNum + * + * These functions return the value of the requested attribute + * out of the given tuple Datum. + * C functions which take a tuple as an argument are expected + * to use these. Ex: overpaid(EMP) might call GetAttributeByNum(). + * Note: these are actually rather slow because they do a typcache + * lookup on each call. + */ +Datum +GetAttributeByName(HeapTupleHeader tuple, const char *attname, bool *isNull) +{ + AttrNumber attrno; + Datum result; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + HeapTupleData tmptup; + int i; + + if (attname == NULL) + elog(ERROR, "invalid attribute name"); + + if (isNull == NULL) + elog(ERROR, "a NULL isNull pointer was passed"); + + if (tuple == NULL) + { + /* Kinda bogus but compatible with old behavior... */ + *isNull = true; + return (Datum) 0; + } + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + tupDesc = lookup_rowtype_tupdesc(tupType, tupTypmod); + + attrno = InvalidAttrNumber; + for (i = 0; i < tupDesc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(tupDesc, i); + + if (namestrcmp(&(att->attname), attname) == 0) + { + attrno = att->attnum; + break; + } + } + + if (attrno == InvalidAttrNumber) + elog(ERROR, "attribute \"%s\" does not exist", attname); + + /* + * heap_getattr needs a HeapTuple not a bare HeapTupleHeader. We set all + * the fields in the struct just in case user tries to inspect system + * columns. 
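+ *
+ * Caller-side sketch, echoing the overpaid() example mentioned above
+ * (adapted from the documentation's pattern, not code in this file; the
+ * column name and threshold are hypothetical):
+ *
+ *		Datum
+ *		c_overpaid(PG_FUNCTION_ARGS)
+ *		{
+ *			HeapTupleHeader t = PG_GETARG_HEAPTUPLEHEADER(0);
+ *			int32		limit = PG_GETARG_INT32(1);
+ *			bool		isnull;
+ *			Datum		salary;
+ *
+ *			salary = GetAttributeByName(t, "salary", &isnull);
+ *			if (isnull)
+ *				PG_RETURN_BOOL(false);
+ *			PG_RETURN_BOOL(DatumGetInt32(salary) > limit);
+ *		}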
+ */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tuple; + + result = heap_getattr(&tmptup, + attrno, + tupDesc, + isNull); + + ReleaseTupleDesc(tupDesc); + + return result; +} + +Datum +GetAttributeByNum(HeapTupleHeader tuple, + AttrNumber attrno, + bool *isNull) +{ + Datum result; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + HeapTupleData tmptup; + + if (!AttributeNumberIsValid(attrno)) + elog(ERROR, "invalid attribute number %d", attrno); + + if (isNull == NULL) + elog(ERROR, "a NULL isNull pointer was passed"); + + if (tuple == NULL) + { + /* Kinda bogus but compatible with old behavior... */ + *isNull = true; + return (Datum) 0; + } + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + tupDesc = lookup_rowtype_tupdesc(tupType, tupTypmod); + + /* + * heap_getattr needs a HeapTuple not a bare HeapTupleHeader. We set all + * the fields in the struct just in case user tries to inspect system + * columns. + */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tuple; + + result = heap_getattr(&tmptup, + attrno, + tupDesc, + isNull); + + ReleaseTupleDesc(tupDesc); + + return result; +} + +/* + * Number of items in a tlist (including any resjunk items!) + */ +int +ExecTargetListLength(List *targetlist) +{ + /* This used to be more complex, but fjoins are dead */ + return list_length(targetlist); +} + +/* + * Number of items in a tlist, not including any resjunk items + */ +int +ExecCleanTargetListLength(List *targetlist) +{ + int len = 0; + ListCell *tl; + + foreach(tl, targetlist) + { + TargetEntry *curTle = lfirst_node(TargetEntry, tl); + + if (!curTle->resjunk) + len++; + } + return len; +} + +/* + * Return a relInfo's tuple slot for a trigger's OLD tuples. + */ +TupleTableSlot * +ExecGetTriggerOldSlot(EState *estate, ResultRelInfo *relInfo) +{ + if (relInfo->ri_TrigOldSlot == NULL) + { + Relation rel = relInfo->ri_RelationDesc; + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + relInfo->ri_TrigOldSlot = + ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel), + table_slot_callbacks(rel)); + + MemoryContextSwitchTo(oldcontext); + } + + return relInfo->ri_TrigOldSlot; +} + +/* + * Return a relInfo's tuple slot for a trigger's NEW tuples. + */ +TupleTableSlot * +ExecGetTriggerNewSlot(EState *estate, ResultRelInfo *relInfo) +{ + if (relInfo->ri_TrigNewSlot == NULL) + { + Relation rel = relInfo->ri_RelationDesc; + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + relInfo->ri_TrigNewSlot = + ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel), + table_slot_callbacks(rel)); + + MemoryContextSwitchTo(oldcontext); + } + + return relInfo->ri_TrigNewSlot; +} + +/* + * Return a relInfo's tuple slot for processing returning tuples. 
+ */ +TupleTableSlot * +ExecGetReturningSlot(EState *estate, ResultRelInfo *relInfo) +{ + if (relInfo->ri_ReturningSlot == NULL) + { + Relation rel = relInfo->ri_RelationDesc; + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + relInfo->ri_ReturningSlot = + ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel), + table_slot_callbacks(rel)); + + MemoryContextSwitchTo(oldcontext); + } + + return relInfo->ri_ReturningSlot; +} + +/* + * Return the map needed to convert given child result relation's tuples to + * the rowtype of the query's main target ("root") relation. Note that a + * NULL result is valid and means that no conversion is needed. + */ +TupleConversionMap * +ExecGetChildToRootMap(ResultRelInfo *resultRelInfo) +{ + /* If we didn't already do so, compute the map for this child. */ + if (!resultRelInfo->ri_ChildToRootMapValid) + { + ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo; + + if (rootRelInfo) + resultRelInfo->ri_ChildToRootMap = + convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc), + RelationGetDescr(rootRelInfo->ri_RelationDesc)); + else /* this isn't a child result rel */ + resultRelInfo->ri_ChildToRootMap = NULL; + + resultRelInfo->ri_ChildToRootMapValid = true; + } + + return resultRelInfo->ri_ChildToRootMap; +} + +/* Return a bitmap representing columns being inserted */ +Bitmapset * +ExecGetInsertedCols(ResultRelInfo *relinfo, EState *estate) +{ + /* + * The columns are stored in the range table entry. If this ResultRelInfo + * represents a partition routing target, and doesn't have an entry of its + * own in the range table, fetch the parent's RTE and map the columns to + * the order they are in the partition. + */ + if (relinfo->ri_RangeTableIndex != 0) + { + RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate); + + return rte->insertedCols; + } + else if (relinfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo; + RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate); + + if (relinfo->ri_RootToPartitionMap != NULL) + return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap, + rte->insertedCols); + else + return rte->insertedCols; + } + else + { + /* + * The relation isn't in the range table and it isn't a partition + * routing target. This ResultRelInfo must've been created only for + * firing triggers and the relation is not being inserted into. (See + * ExecGetTriggerResultRel.) 
+ */ + return NULL; + } +} + +/* Return a bitmap representing columns being updated */ +Bitmapset * +ExecGetUpdatedCols(ResultRelInfo *relinfo, EState *estate) +{ + /* see ExecGetInsertedCols() */ + if (relinfo->ri_RangeTableIndex != 0) + { + RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate); + + return rte->updatedCols; + } + else if (relinfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo; + RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate); + + if (relinfo->ri_RootToPartitionMap != NULL) + return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap, + rte->updatedCols); + else + return rte->updatedCols; + } + else + return NULL; +} + +/* Return a bitmap representing generated columns being updated */ +Bitmapset * +ExecGetExtraUpdatedCols(ResultRelInfo *relinfo, EState *estate) +{ + /* see ExecGetInsertedCols() */ + if (relinfo->ri_RangeTableIndex != 0) + { + RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate); + + return rte->extraUpdatedCols; + } + else if (relinfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo; + RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate); + + if (relinfo->ri_RootToPartitionMap != NULL) + return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap, + rte->extraUpdatedCols); + else + return rte->extraUpdatedCols; + } + else + return NULL; +} + +/* Return columns being updated, including generated columns */ +Bitmapset * +ExecGetAllUpdatedCols(ResultRelInfo *relinfo, EState *estate) +{ + return bms_union(ExecGetUpdatedCols(relinfo, estate), + ExecGetExtraUpdatedCols(relinfo, estate)); +} diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c new file mode 100644 index 0000000..296e54e --- /dev/null +++ b/src/backend/executor/functions.c @@ -0,0 +1,2103 @@ +/*------------------------------------------------------------------------- + * + * functions.c + * Execution of SQL-language functions + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/functions.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "executor/functions.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "parser/parse_coerce.h" +#include "parser/parse_collate.h" +#include "parser/parse_func.h" +#include "rewrite/rewriteHandler.h" +#include "storage/proc.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + + +/* + * Specialized DestReceiver for collecting query output in a SQL function + */ +typedef struct +{ + DestReceiver pub; /* publicly-known function pointers */ + Tuplestorestate *tstore; /* where to put result tuples */ + MemoryContext cxt; /* context containing tstore */ + JunkFilter *filter; /* filter to convert tuple type */ +} DR_sqlfunction; + +/* + * We have an execution_state record for each query in a function. Each + * record contains a plantree for its query. 
If the query is currently in + * F_EXEC_RUN state then there's a QueryDesc too. + * + * The "next" fields chain together all the execution_state records generated + * from a single original parsetree. (There will only be more than one in + * case of rule expansion of the original parsetree.) + */ +typedef enum +{ + F_EXEC_START, F_EXEC_RUN, F_EXEC_DONE +} ExecStatus; + +typedef struct execution_state +{ + struct execution_state *next; + ExecStatus status; + bool setsResult; /* true if this query produces func's result */ + bool lazyEval; /* true if should fetch one row at a time */ + PlannedStmt *stmt; /* plan for this query */ + QueryDesc *qd; /* null unless status == RUN */ +} execution_state; + + +/* + * An SQLFunctionCache record is built during the first call, + * and linked to from the fn_extra field of the FmgrInfo struct. + * + * Note that currently this has only the lifespan of the calling query. + * Someday we should rewrite this code to use plancache.c to save parse/plan + * results for longer than that. + * + * Physically, though, the data has the lifespan of the FmgrInfo that's used + * to call the function, and there are cases (particularly with indexes) + * where the FmgrInfo might survive across transactions. We cannot assume + * that the parse/plan trees are good for longer than the (sub)transaction in + * which parsing was done, so we must mark the record with the LXID/subxid of + * its creation time, and regenerate everything if that's obsolete. To avoid + * memory leakage when we do have to regenerate things, all the data is kept + * in a sub-context of the FmgrInfo's fn_mcxt. + */ +typedef struct +{ + char *fname; /* function name (for error msgs) */ + char *src; /* function body text (for error msgs) */ + + SQLFunctionParseInfoPtr pinfo; /* data for parser callback hooks */ + + Oid rettype; /* actual return type */ + int16 typlen; /* length of the return type */ + bool typbyval; /* true if return type is pass by value */ + bool returnsSet; /* true if returning multiple rows */ + bool returnsTuple; /* true if returning whole tuple result */ + bool shutdown_reg; /* true if registered shutdown callback */ + bool readonly_func; /* true to run in "read only" mode */ + bool lazyEval; /* true if using lazyEval for result query */ + + ParamListInfo paramLI; /* Param list representing current args */ + + Tuplestorestate *tstore; /* where we accumulate result tuples */ + + JunkFilter *junkFilter; /* will be NULL if function returns VOID */ + + /* + * func_state is a List of execution_state records, each of which is the + * first for its original parsetree, with any additional records chained + * to it via the "next" fields. This sublist structure is needed to keep + * track of where the original query boundaries are. 
+ */ + List *func_state; + + MemoryContext fcontext; /* memory context holding this struct and all + * subsidiary data */ + + LocalTransactionId lxid; /* lxid in which cache was made */ + SubTransactionId subxid; /* subxid in which cache was made */ +} SQLFunctionCache; + +typedef SQLFunctionCache *SQLFunctionCachePtr; + + +/* non-export function prototypes */ +static Node *sql_fn_param_ref(ParseState *pstate, ParamRef *pref); +static Node *sql_fn_post_column_ref(ParseState *pstate, + ColumnRef *cref, Node *var); +static Node *sql_fn_make_param(SQLFunctionParseInfoPtr pinfo, + int paramno, int location); +static Node *sql_fn_resolve_param_name(SQLFunctionParseInfoPtr pinfo, + const char *paramname, int location); +static List *init_execution_state(List *queryTree_list, + SQLFunctionCachePtr fcache, + bool lazyEvalOK); +static void init_sql_fcache(FunctionCallInfo fcinfo, Oid collation, bool lazyEvalOK); +static void postquel_start(execution_state *es, SQLFunctionCachePtr fcache); +static bool postquel_getnext(execution_state *es, SQLFunctionCachePtr fcache); +static void postquel_end(execution_state *es); +static void postquel_sub_params(SQLFunctionCachePtr fcache, + FunctionCallInfo fcinfo); +static Datum postquel_get_single_result(TupleTableSlot *slot, + FunctionCallInfo fcinfo, + SQLFunctionCachePtr fcache, + MemoryContext resultcontext); +static void sql_exec_error_callback(void *arg); +static void ShutdownSQLFunction(Datum arg); +static bool coerce_fn_result_column(TargetEntry *src_tle, + Oid res_type, int32 res_typmod, + bool tlist_is_modifiable, + List **upper_tlist, + bool *upper_tlist_nontrivial); +static void sqlfunction_startup(DestReceiver *self, int operation, TupleDesc typeinfo); +static bool sqlfunction_receive(TupleTableSlot *slot, DestReceiver *self); +static void sqlfunction_shutdown(DestReceiver *self); +static void sqlfunction_destroy(DestReceiver *self); + + +/* + * Prepare the SQLFunctionParseInfo struct for parsing a SQL function body + * + * This includes resolving actual types of polymorphic arguments. + * + * call_expr can be passed as NULL, but then we will fail if there are any + * polymorphic arguments. + */ +SQLFunctionParseInfoPtr +prepare_sql_fn_parse_info(HeapTuple procedureTuple, + Node *call_expr, + Oid inputCollation) +{ + SQLFunctionParseInfoPtr pinfo; + Form_pg_proc procedureStruct = (Form_pg_proc) GETSTRUCT(procedureTuple); + int nargs; + + pinfo = (SQLFunctionParseInfoPtr) palloc0(sizeof(SQLFunctionParseInfo)); + + /* Function's name (only) can be used to qualify argument names */ + pinfo->fname = pstrdup(NameStr(procedureStruct->proname)); + + /* Save the function's input collation */ + pinfo->collation = inputCollation; + + /* + * Copy input argument types from the pg_proc entry, then resolve any + * polymorphic types. 
+ */ + pinfo->nargs = nargs = procedureStruct->pronargs; + if (nargs > 0) + { + Oid *argOidVect; + int argnum; + + argOidVect = (Oid *) palloc(nargs * sizeof(Oid)); + memcpy(argOidVect, + procedureStruct->proargtypes.values, + nargs * sizeof(Oid)); + + for (argnum = 0; argnum < nargs; argnum++) + { + Oid argtype = argOidVect[argnum]; + + if (IsPolymorphicType(argtype)) + { + argtype = get_call_expr_argtype(call_expr, argnum); + if (argtype == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("could not determine actual type of argument declared %s", + format_type_be(argOidVect[argnum])))); + argOidVect[argnum] = argtype; + } + } + + pinfo->argtypes = argOidVect; + } + + /* + * Collect names of arguments, too, if any + */ + if (nargs > 0) + { + Datum proargnames; + Datum proargmodes; + int n_arg_names; + bool isNull; + + proargnames = SysCacheGetAttr(PROCNAMEARGSNSP, procedureTuple, + Anum_pg_proc_proargnames, + &isNull); + if (isNull) + proargnames = PointerGetDatum(NULL); /* just to be sure */ + + proargmodes = SysCacheGetAttr(PROCNAMEARGSNSP, procedureTuple, + Anum_pg_proc_proargmodes, + &isNull); + if (isNull) + proargmodes = PointerGetDatum(NULL); /* just to be sure */ + + n_arg_names = get_func_input_arg_names(proargnames, proargmodes, + &pinfo->argnames); + + /* Paranoia: ignore the result if too few array entries */ + if (n_arg_names < nargs) + pinfo->argnames = NULL; + } + else + pinfo->argnames = NULL; + + return pinfo; +} + +/* + * Parser setup hook for parsing a SQL function body. + */ +void +sql_fn_parser_setup(struct ParseState *pstate, SQLFunctionParseInfoPtr pinfo) +{ + pstate->p_pre_columnref_hook = NULL; + pstate->p_post_columnref_hook = sql_fn_post_column_ref; + pstate->p_paramref_hook = sql_fn_param_ref; + /* no need to use p_coerce_param_hook */ + pstate->p_ref_hook_state = (void *) pinfo; +} + +/* + * sql_fn_post_column_ref parser callback for ColumnRefs + */ +static Node * +sql_fn_post_column_ref(ParseState *pstate, ColumnRef *cref, Node *var) +{ + SQLFunctionParseInfoPtr pinfo = (SQLFunctionParseInfoPtr) pstate->p_ref_hook_state; + int nnames; + Node *field1; + Node *subfield = NULL; + const char *name1; + const char *name2 = NULL; + Node *param; + + /* + * Never override a table-column reference. This corresponds to + * considering the parameter names to appear in a scope outside the + * individual SQL commands, which is what we want. + */ + if (var != NULL) + return NULL; + + /*---------- + * The allowed syntaxes are: + * + * A A = parameter name + * A.B A = function name, B = parameter name + * OR: A = record-typed parameter name, B = field name + * (the first possibility takes precedence) + * A.B.C A = function name, B = record-typed parameter name, + * C = field name + * A.* Whole-row reference to composite parameter A. + * A.B.* Same, with A = function name, B = parameter name + * + * Here, it's sufficient to ignore the "*" in the last two cases --- the + * main parser will take care of expanding the whole-row reference. + *---------- + */ + nnames = list_length(cref->fields); + + if (nnames > 3) + return NULL; + + if (IsA(llast(cref->fields), A_Star)) + nnames--; + + field1 = (Node *) linitial(cref->fields); + Assert(IsA(field1, String)); + name1 = strVal(field1); + if (nnames > 1) + { + subfield = (Node *) lsecond(cref->fields); + Assert(IsA(subfield, String)); + name2 = strVal(subfield); + } + + if (nnames == 3) + { + /* + * Three-part name: if the first part doesn't match the function name, + * we can fail immediately. 
Otherwise, look up the second part, and + * take the third part to be a field reference. + */ + if (strcmp(name1, pinfo->fname) != 0) + return NULL; + + param = sql_fn_resolve_param_name(pinfo, name2, cref->location); + + subfield = (Node *) lthird(cref->fields); + Assert(IsA(subfield, String)); + } + else if (nnames == 2 && strcmp(name1, pinfo->fname) == 0) + { + /* + * Two-part name with first part matching function name: first see if + * second part matches any parameter name. + */ + param = sql_fn_resolve_param_name(pinfo, name2, cref->location); + + if (param) + { + /* Yes, so this is a parameter reference, no subfield */ + subfield = NULL; + } + else + { + /* No, so try to match as parameter name and subfield */ + param = sql_fn_resolve_param_name(pinfo, name1, cref->location); + } + } + else + { + /* Single name, or parameter name followed by subfield */ + param = sql_fn_resolve_param_name(pinfo, name1, cref->location); + } + + if (!param) + return NULL; /* No match */ + + if (subfield) + { + /* + * Must be a reference to a field of a composite parameter; otherwise + * ParseFuncOrColumn will return NULL, and we'll fail back at the + * caller. + */ + param = ParseFuncOrColumn(pstate, + list_make1(subfield), + list_make1(param), + pstate->p_last_srf, + NULL, + false, + cref->location); + } + + return param; +} + +/* + * sql_fn_param_ref parser callback for ParamRefs ($n symbols) + */ +static Node * +sql_fn_param_ref(ParseState *pstate, ParamRef *pref) +{ + SQLFunctionParseInfoPtr pinfo = (SQLFunctionParseInfoPtr) pstate->p_ref_hook_state; + int paramno = pref->number; + + /* Check parameter number is valid */ + if (paramno <= 0 || paramno > pinfo->nargs) + return NULL; /* unknown parameter number */ + + return sql_fn_make_param(pinfo, paramno, pref->location); +} + +/* + * sql_fn_make_param construct a Param node for the given paramno + */ +static Node * +sql_fn_make_param(SQLFunctionParseInfoPtr pinfo, + int paramno, int location) +{ + Param *param; + + param = makeNode(Param); + param->paramkind = PARAM_EXTERN; + param->paramid = paramno; + param->paramtype = pinfo->argtypes[paramno - 1]; + param->paramtypmod = -1; + param->paramcollid = get_typcollation(param->paramtype); + param->location = location; + + /* + * If we have a function input collation, allow it to override the + * type-derived collation for parameter symbols. (XXX perhaps this should + * not happen if the type collation is not default?) + */ + if (OidIsValid(pinfo->collation) && OidIsValid(param->paramcollid)) + param->paramcollid = pinfo->collation; + + return (Node *) param; +} + +/* + * Search for a function parameter of the given name; if there is one, + * construct and return a Param node for it. If not, return NULL. + * Helper function for sql_fn_post_column_ref. + */ +static Node * +sql_fn_resolve_param_name(SQLFunctionParseInfoPtr pinfo, + const char *paramname, int location) +{ + int i; + + if (pinfo->argnames == NULL) + return NULL; + + for (i = 0; i < pinfo->nargs; i++) + { + if (pinfo->argnames[i] && strcmp(pinfo->argnames[i], paramname) == 0) + return sql_fn_make_param(pinfo, i + 1, location); + } + + return NULL; +} + +/* + * Set up the per-query execution_state records for a SQL function. + * + * The input is a List of Lists of parsed and rewritten, but not planned, + * querytrees. The sublist structure denotes the original query boundaries. 
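+ *
+ * For example (purely illustrative): a two-statement function body whose
+ * second statement is expanded by a rewrite rule into two queries arrives
+ * here as list_make2(list_make1(q1), list_make2(q2a, q2b)) and yields two
+ * execution_state chains, the second one two records long.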
+ */ +static List * +init_execution_state(List *queryTree_list, + SQLFunctionCachePtr fcache, + bool lazyEvalOK) +{ + List *eslist = NIL; + execution_state *lasttages = NULL; + ListCell *lc1; + + foreach(lc1, queryTree_list) + { + List *qtlist = lfirst_node(List, lc1); + execution_state *firstes = NULL; + execution_state *preves = NULL; + ListCell *lc2; + + foreach(lc2, qtlist) + { + Query *queryTree = lfirst_node(Query, lc2); + PlannedStmt *stmt; + execution_state *newes; + + /* Plan the query if needed */ + if (queryTree->commandType == CMD_UTILITY) + { + /* Utility commands require no planning. */ + stmt = makeNode(PlannedStmt); + stmt->commandType = CMD_UTILITY; + stmt->canSetTag = queryTree->canSetTag; + stmt->utilityStmt = queryTree->utilityStmt; + stmt->stmt_location = queryTree->stmt_location; + stmt->stmt_len = queryTree->stmt_len; + } + else + stmt = pg_plan_query(queryTree, + fcache->src, + CURSOR_OPT_PARALLEL_OK, + NULL); + + /* + * Precheck all commands for validity in a function. This should + * generally match the restrictions spi.c applies. + */ + if (stmt->commandType == CMD_UTILITY) + { + if (IsA(stmt->utilityStmt, CopyStmt) && + ((CopyStmt *) stmt->utilityStmt)->filename == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot COPY to/from client in an SQL function"))); + + if (IsA(stmt->utilityStmt, TransactionStmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in an SQL function", + CreateCommandName(stmt->utilityStmt)))); + } + + if (fcache->readonly_func && !CommandIsReadOnly(stmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in a non-volatile function", + CreateCommandName((Node *) stmt)))); + + /* OK, build the execution_state for this query */ + newes = (execution_state *) palloc(sizeof(execution_state)); + if (preves) + preves->next = newes; + else + firstes = newes; + + newes->next = NULL; + newes->status = F_EXEC_START; + newes->setsResult = false; /* might change below */ + newes->lazyEval = false; /* might change below */ + newes->stmt = stmt; + newes->qd = NULL; + + if (queryTree->canSetTag) + lasttages = newes; + + preves = newes; + } + + eslist = lappend(eslist, firstes); + } + + /* + * Mark the last canSetTag query as delivering the function result; then, + * if it is a plain SELECT, mark it for lazy evaluation. If it's not a + * SELECT we must always run it to completion. + * + * Note: at some point we might add additional criteria for whether to use + * lazy eval. However, we should prefer to use it whenever the function + * doesn't return set, since fetching more than one row is useless in that + * case. + * + * Note: don't set setsResult if the function returns VOID, as evidenced + * by not having made a junkfilter. This ensures we'll throw away any + * output from the last statement in such a function. 
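+ *
+ * For example, a function whose final statement is INSERT ... RETURNING
+ * is always run to completion, with its RETURNING rows collected in the
+ * tuplestore, whereas a final plain SELECT (without a modifying CTE) can
+ * be fetched one row at a time whenever the caller permits lazy eval.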
+ */ + if (lasttages && fcache->junkFilter) + { + lasttages->setsResult = true; + if (lazyEvalOK && + lasttages->stmt->commandType == CMD_SELECT && + !lasttages->stmt->hasModifyingCTE) + fcache->lazyEval = lasttages->lazyEval = true; + } + + return eslist; +} + +/* + * Initialize the SQLFunctionCache for a SQL function + */ +static void +init_sql_fcache(FunctionCallInfo fcinfo, Oid collation, bool lazyEvalOK) +{ + FmgrInfo *finfo = fcinfo->flinfo; + Oid foid = finfo->fn_oid; + MemoryContext fcontext; + MemoryContext oldcontext; + Oid rettype; + TupleDesc rettupdesc; + HeapTuple procedureTuple; + Form_pg_proc procedureStruct; + SQLFunctionCachePtr fcache; + List *queryTree_list; + List *resulttlist; + ListCell *lc; + Datum tmp; + bool isNull; + + /* + * Create memory context that holds all the SQLFunctionCache data. It + * must be a child of whatever context holds the FmgrInfo. + */ + fcontext = AllocSetContextCreate(finfo->fn_mcxt, + "SQL function", + ALLOCSET_DEFAULT_SIZES); + + oldcontext = MemoryContextSwitchTo(fcontext); + + /* + * Create the struct proper, link it to fcontext and fn_extra. Once this + * is done, we'll be able to recover the memory after failure, even if the + * FmgrInfo is long-lived. + */ + fcache = (SQLFunctionCachePtr) palloc0(sizeof(SQLFunctionCache)); + fcache->fcontext = fcontext; + finfo->fn_extra = (void *) fcache; + + /* + * get the procedure tuple corresponding to the given function Oid + */ + procedureTuple = SearchSysCache1(PROCOID, ObjectIdGetDatum(foid)); + if (!HeapTupleIsValid(procedureTuple)) + elog(ERROR, "cache lookup failed for function %u", foid); + procedureStruct = (Form_pg_proc) GETSTRUCT(procedureTuple); + + /* + * copy function name immediately for use by error reporting callback, and + * for use as memory context identifier + */ + fcache->fname = pstrdup(NameStr(procedureStruct->proname)); + MemoryContextSetIdentifier(fcontext, fcache->fname); + + /* + * Resolve any polymorphism, obtaining the actual result type, and the + * corresponding tupdesc if it's a rowtype. + */ + (void) get_call_result_type(fcinfo, &rettype, &rettupdesc); + + fcache->rettype = rettype; + + /* Fetch the typlen and byval info for the result type */ + get_typlenbyval(rettype, &fcache->typlen, &fcache->typbyval); + + /* Remember whether we're returning setof something */ + fcache->returnsSet = procedureStruct->proretset; + + /* Remember if function is STABLE/IMMUTABLE */ + fcache->readonly_func = + (procedureStruct->provolatile != PROVOLATILE_VOLATILE); + + /* + * We need the actual argument types to pass to the parser. Also make + * sure that parameter symbols are considered to have the function's + * resolved input collation. + */ + fcache->pinfo = prepare_sql_fn_parse_info(procedureTuple, + finfo->fn_expr, + collation); + + /* + * And of course we need the function body text. + */ + tmp = SysCacheGetAttr(PROCOID, + procedureTuple, + Anum_pg_proc_prosrc, + &isNull); + if (isNull) + elog(ERROR, "null prosrc for function %u", foid); + fcache->src = TextDatumGetCString(tmp); + + /* If we have prosqlbody, pay attention to that not prosrc. */ + tmp = SysCacheGetAttr(PROCOID, + procedureTuple, + Anum_pg_proc_prosqlbody, + &isNull); + + /* + * Parse and rewrite the queries in the function text. Use sublists to + * keep track of the original query boundaries. + * + * Note: since parsing and planning is done in fcontext, we will generate + * a lot of cruft that lives as long as the fcache does. 
This is annoying + * but we'll not worry about it until the module is rewritten to use + * plancache.c. + */ + queryTree_list = NIL; + if (!isNull) + { + Node *n; + List *stored_query_list; + + n = stringToNode(TextDatumGetCString(tmp)); + if (IsA(n, List)) + stored_query_list = linitial_node(List, castNode(List, n)); + else + stored_query_list = list_make1(n); + + foreach(lc, stored_query_list) + { + Query *parsetree = lfirst_node(Query, lc); + List *queryTree_sublist; + + AcquireRewriteLocks(parsetree, true, false); + queryTree_sublist = pg_rewrite_query(parsetree); + queryTree_list = lappend(queryTree_list, queryTree_sublist); + } + } + else + { + List *raw_parsetree_list; + + raw_parsetree_list = pg_parse_query(fcache->src); + + foreach(lc, raw_parsetree_list) + { + RawStmt *parsetree = lfirst_node(RawStmt, lc); + List *queryTree_sublist; + + queryTree_sublist = pg_analyze_and_rewrite_params(parsetree, + fcache->src, + (ParserSetupHook) sql_fn_parser_setup, + fcache->pinfo, + NULL); + queryTree_list = lappend(queryTree_list, queryTree_sublist); + } + } + + /* + * Check that there are no statements we don't want to allow. + */ + check_sql_fn_statements(queryTree_list); + + /* + * Check that the function returns the type it claims to. Although in + * simple cases this was already done when the function was defined, we + * have to recheck because database objects used in the function's queries + * might have changed type. We'd have to recheck anyway if the function + * had any polymorphic arguments. Moreover, check_sql_fn_retval takes + * care of injecting any required column type coercions. (But we don't + * ask it to insert nulls for dropped columns; the junkfilter handles + * that.) + * + * Note: we set fcache->returnsTuple according to whether we are returning + * the whole tuple result or just a single column. In the latter case we + * clear returnsTuple because we need not act different from the scalar + * result case, even if it's a rowtype column. (However, we have to force + * lazy eval mode in that case; otherwise we'd need extra code to expand + * the rowtype column into multiple columns, since we have no way to + * notify the caller that it should do that.) + */ + fcache->returnsTuple = check_sql_fn_retval(queryTree_list, + rettype, + rettupdesc, + false, + &resulttlist); + + /* + * Construct a JunkFilter we can use to coerce the returned rowtype to the + * desired form, unless the result type is VOID, in which case there's + * nothing to coerce to. (XXX Frequently, the JunkFilter isn't doing + * anything very interesting, but much of this module expects it to be + * there anyway.) + */ + if (rettype != VOIDOID) + { + TupleTableSlot *slot = MakeSingleTupleTableSlot(NULL, + &TTSOpsMinimalTuple); + + /* + * If the result is composite, *and* we are returning the whole tuple + * result, we need to insert nulls for any dropped columns. In the + * single-column-result case, there might be dropped columns within + * the composite column value, but it's not our problem here. There + * should be no resjunk entries in resulttlist, so in the second case + * the JunkFilter is certainly a no-op. 
+ */ + if (rettupdesc && fcache->returnsTuple) + fcache->junkFilter = ExecInitJunkFilterConversion(resulttlist, + rettupdesc, + slot); + else + fcache->junkFilter = ExecInitJunkFilter(resulttlist, slot); + } + + if (fcache->returnsTuple) + { + /* Make sure output rowtype is properly blessed */ + BlessTupleDesc(fcache->junkFilter->jf_resultSlot->tts_tupleDescriptor); + } + else if (fcache->returnsSet && type_is_rowtype(fcache->rettype)) + { + /* + * Returning rowtype as if it were scalar --- materialize won't work. + * Right now it's sufficient to override any caller preference for + * materialize mode, but to add more smarts in init_execution_state + * about this, we'd probably need a three-way flag instead of bool. + */ + lazyEvalOK = true; + } + + /* Finally, plan the queries */ + fcache->func_state = init_execution_state(queryTree_list, + fcache, + lazyEvalOK); + + /* Mark fcache with time of creation to show it's valid */ + fcache->lxid = MyProc->lxid; + fcache->subxid = GetCurrentSubTransactionId(); + + ReleaseSysCache(procedureTuple); + + MemoryContextSwitchTo(oldcontext); +} + +/* Start up execution of one execution_state node */ +static void +postquel_start(execution_state *es, SQLFunctionCachePtr fcache) +{ + DestReceiver *dest; + + Assert(es->qd == NULL); + + /* Caller should have ensured a suitable snapshot is active */ + Assert(ActiveSnapshotSet()); + + /* + * If this query produces the function result, send its output to the + * tuplestore; else discard any output. + */ + if (es->setsResult) + { + DR_sqlfunction *myState; + + dest = CreateDestReceiver(DestSQLFunction); + /* pass down the needed info to the dest receiver routines */ + myState = (DR_sqlfunction *) dest; + Assert(myState->pub.mydest == DestSQLFunction); + myState->tstore = fcache->tstore; + myState->cxt = CurrentMemoryContext; + myState->filter = fcache->junkFilter; + } + else + dest = None_Receiver; + + es->qd = CreateQueryDesc(es->stmt, + fcache->src, + GetActiveSnapshot(), + InvalidSnapshot, + dest, + fcache->paramLI, + es->qd ? es->qd->queryEnv : NULL, + 0); + + /* Utility commands don't need Executor. */ + if (es->qd->operation != CMD_UTILITY) + { + /* + * In lazyEval mode, do not let the executor set up an AfterTrigger + * context. This is necessary not just an optimization, because we + * mustn't exit from the function execution with a stacked + * AfterTrigger level still active. We are careful not to select + * lazyEval mode for any statement that could possibly queue triggers. + */ + int eflags; + + if (es->lazyEval) + eflags = EXEC_FLAG_SKIP_TRIGGERS; + else + eflags = 0; /* default run-to-completion flags */ + ExecutorStart(es->qd, eflags); + } + + es->status = F_EXEC_RUN; +} + +/* Run one execution_state; either to completion or to first result row */ +/* Returns true if we ran to completion */ +static bool +postquel_getnext(execution_state *es, SQLFunctionCachePtr fcache) +{ + bool result; + + if (es->qd->operation == CMD_UTILITY) + { + ProcessUtility(es->qd->plannedstmt, + fcache->src, + false, + PROCESS_UTILITY_QUERY, + es->qd->params, + es->qd->queryEnv, + es->qd->dest, + NULL); + result = true; /* never stops early */ + } + else + { + /* Run regular commands to completion unless lazyEval */ + uint64 count = (es->lazyEval) ? 1 : 0; + + ExecutorRun(es->qd, ForwardScanDirection, count, !fcache->returnsSet || !es->lazyEval); + + /* + * If we requested run to completion OR there was no tuple returned, + * command must be complete. 
+ */ + result = (count == 0 || es->qd->estate->es_processed == 0); + } + + return result; +} + +/* Shut down execution of one execution_state node */ +static void +postquel_end(execution_state *es) +{ + /* mark status done to ensure we don't do ExecutorEnd twice */ + es->status = F_EXEC_DONE; + + /* Utility commands don't need Executor. */ + if (es->qd->operation != CMD_UTILITY) + { + ExecutorFinish(es->qd); + ExecutorEnd(es->qd); + } + + es->qd->dest->rDestroy(es->qd->dest); + + FreeQueryDesc(es->qd); + es->qd = NULL; +} + +/* Build ParamListInfo array representing current arguments */ +static void +postquel_sub_params(SQLFunctionCachePtr fcache, + FunctionCallInfo fcinfo) +{ + int nargs = fcinfo->nargs; + + if (nargs > 0) + { + ParamListInfo paramLI; + + if (fcache->paramLI == NULL) + { + paramLI = makeParamList(nargs); + fcache->paramLI = paramLI; + } + else + { + paramLI = fcache->paramLI; + Assert(paramLI->numParams == nargs); + } + + for (int i = 0; i < nargs; i++) + { + ParamExternData *prm = ¶mLI->params[i]; + + prm->value = fcinfo->args[i].value; + prm->isnull = fcinfo->args[i].isnull; + prm->pflags = 0; + prm->ptype = fcache->pinfo->argtypes[i]; + } + } + else + fcache->paramLI = NULL; +} + +/* + * Extract the SQL function's value from a single result row. This is used + * both for scalar (non-set) functions and for each row of a lazy-eval set + * result. + */ +static Datum +postquel_get_single_result(TupleTableSlot *slot, + FunctionCallInfo fcinfo, + SQLFunctionCachePtr fcache, + MemoryContext resultcontext) +{ + Datum value; + MemoryContext oldcontext; + + /* + * Set up to return the function value. For pass-by-reference datatypes, + * be sure to allocate the result in resultcontext, not the current memory + * context (which has query lifespan). We can't leave the data in the + * TupleTableSlot because we intend to clear the slot before returning. + */ + oldcontext = MemoryContextSwitchTo(resultcontext); + + if (fcache->returnsTuple) + { + /* We must return the whole tuple as a Datum. */ + fcinfo->isnull = false; + value = ExecFetchSlotHeapTupleDatum(slot); + } + else + { + /* + * Returning a scalar, which we have to extract from the first column + * of the SELECT result, and then copy into result context if needed. + */ + value = slot_getattr(slot, 1, &(fcinfo->isnull)); + + if (!fcinfo->isnull) + value = datumCopy(value, fcache->typbyval, fcache->typlen); + } + + MemoryContextSwitchTo(oldcontext); + + return value; +} + +/* + * fmgr_sql: function call manager for SQL functions + */ +Datum +fmgr_sql(PG_FUNCTION_ARGS) +{ + SQLFunctionCachePtr fcache; + ErrorContextCallback sqlerrcontext; + MemoryContext oldcontext; + bool randomAccess; + bool lazyEvalOK; + bool is_first; + bool pushed_snapshot; + execution_state *es; + TupleTableSlot *slot; + Datum result; + List *eslist; + ListCell *eslc; + + /* + * Setup error traceback support for ereport() + */ + sqlerrcontext.callback = sql_exec_error_callback; + sqlerrcontext.arg = fcinfo->flinfo; + sqlerrcontext.previous = error_context_stack; + error_context_stack = &sqlerrcontext; + + /* Check call context */ + if (fcinfo->flinfo->fn_retset) + { + ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo; + + /* + * For simplicity, we require callers to support both set eval modes. + * There are cases where we must use one or must use the other, and + * it's not really worthwhile to postpone the check till we know. But + * note we do not require caller to provide an expectedDesc. 
+ */ + if (!rsi || !IsA(rsi, ReturnSetInfo) || + (rsi->allowedModes & SFRM_ValuePerCall) == 0 || + (rsi->allowedModes & SFRM_Materialize) == 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + randomAccess = rsi->allowedModes & SFRM_Materialize_Random; + lazyEvalOK = !(rsi->allowedModes & SFRM_Materialize_Preferred); + } + else + { + randomAccess = false; + lazyEvalOK = true; + } + + /* + * Initialize fcache (build plans) if first time through; or re-initialize + * if the cache is stale. + */ + fcache = (SQLFunctionCachePtr) fcinfo->flinfo->fn_extra; + + if (fcache != NULL) + { + if (fcache->lxid != MyProc->lxid || + !SubTransactionIsActive(fcache->subxid)) + { + /* It's stale; unlink and delete */ + fcinfo->flinfo->fn_extra = NULL; + MemoryContextDelete(fcache->fcontext); + fcache = NULL; + } + } + + if (fcache == NULL) + { + init_sql_fcache(fcinfo, PG_GET_COLLATION(), lazyEvalOK); + fcache = (SQLFunctionCachePtr) fcinfo->flinfo->fn_extra; + } + + /* + * Switch to context in which the fcache lives. This ensures that our + * tuplestore etc will have sufficient lifetime. The sub-executor is + * responsible for deleting per-tuple information. (XXX in the case of a + * long-lived FmgrInfo, this policy represents more memory leakage, but + * it's not entirely clear where to keep stuff instead.) + */ + oldcontext = MemoryContextSwitchTo(fcache->fcontext); + + /* + * Find first unfinished query in function, and note whether it's the + * first query. + */ + eslist = fcache->func_state; + es = NULL; + is_first = true; + foreach(eslc, eslist) + { + es = (execution_state *) lfirst(eslc); + + while (es && es->status == F_EXEC_DONE) + { + is_first = false; + es = es->next; + } + + if (es) + break; + } + + /* + * Convert params to appropriate format if starting a fresh execution. (If + * continuing execution, we can re-use prior params.) + */ + if (is_first && es && es->status == F_EXEC_START) + postquel_sub_params(fcache, fcinfo); + + /* + * Build tuplestore to hold results, if we don't have one already. Note + * it's in the query-lifespan context. + */ + if (!fcache->tstore) + fcache->tstore = tuplestore_begin_heap(randomAccess, false, work_mem); + + /* + * Execute each command in the function one after another until we either + * run out of commands or get a result row from a lazily-evaluated SELECT. + * + * Notes about snapshot management: + * + * In a read-only function, we just use the surrounding query's snapshot. + * + * In a non-read-only function, we rely on the fact that we'll never + * suspend execution between queries of the function: the only reason to + * suspend execution before completion is if we are returning a row from a + * lazily-evaluated SELECT. So, when first entering this loop, we'll + * either start a new query (and push a fresh snapshot) or re-establish + * the active snapshot from the existing query descriptor. If we need to + * start a new query in a subsequent execution of the loop, either we need + * a fresh snapshot (and pushed_snapshot is false) or the existing + * snapshot is on the active stack and we can just bump its command ID. + */ + pushed_snapshot = false; + while (es) + { + bool completed; + + if (es->status == F_EXEC_START) + { + /* + * If not read-only, be sure to advance the command counter for + * each command, so that all work to date in this transaction is + * visible. 
Take a new snapshot if we don't have one yet, + * otherwise just bump the command ID in the existing snapshot. + */ + if (!fcache->readonly_func) + { + CommandCounterIncrement(); + if (!pushed_snapshot) + { + PushActiveSnapshot(GetTransactionSnapshot()); + pushed_snapshot = true; + } + else + UpdateActiveSnapshotCommandId(); + } + + postquel_start(es, fcache); + } + else if (!fcache->readonly_func && !pushed_snapshot) + { + /* Re-establish active snapshot when re-entering function */ + PushActiveSnapshot(es->qd->snapshot); + pushed_snapshot = true; + } + + completed = postquel_getnext(es, fcache); + + /* + * If we ran the command to completion, we can shut it down now. Any + * row(s) we need to return are safely stashed in the tuplestore, and + * we want to be sure that, for example, AFTER triggers get fired + * before we return anything. Also, if the function doesn't return + * set, we can shut it down anyway because it must be a SELECT and we + * don't care about fetching any more result rows. + */ + if (completed || !fcache->returnsSet) + postquel_end(es); + + /* + * Break from loop if we didn't shut down (implying we got a + * lazily-evaluated row). Otherwise we'll press on till the whole + * function is done, relying on the tuplestore to keep hold of the + * data to eventually be returned. This is necessary since an + * INSERT/UPDATE/DELETE RETURNING that sets the result might be + * followed by additional rule-inserted commands, and we want to + * finish doing all those commands before we return anything. + */ + if (es->status != F_EXEC_DONE) + break; + + /* + * Advance to next execution_state, which might be in the next list. + */ + es = es->next; + while (!es) + { + eslc = lnext(eslist, eslc); + if (!eslc) + break; /* end of function */ + + es = (execution_state *) lfirst(eslc); + + /* + * Flush the current snapshot so that we will take a new one for + * the new query list. This ensures that new snaps are taken at + * original-query boundaries, matching the behavior of interactive + * execution. + */ + if (pushed_snapshot) + { + PopActiveSnapshot(); + pushed_snapshot = false; + } + } + } + + /* + * The tuplestore now contains whatever row(s) we are supposed to return. + */ + if (fcache->returnsSet) + { + ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo; + + if (es) + { + /* + * If we stopped short of being done, we must have a lazy-eval + * row. + */ + Assert(es->lazyEval); + /* Re-use the junkfilter's output slot to fetch back the tuple */ + Assert(fcache->junkFilter); + slot = fcache->junkFilter->jf_resultSlot; + if (!tuplestore_gettupleslot(fcache->tstore, true, false, slot)) + elog(ERROR, "failed to fetch lazy-eval tuple"); + /* Extract the result as a datum, and copy out from the slot */ + result = postquel_get_single_result(slot, fcinfo, + fcache, oldcontext); + /* Clear the tuplestore, but keep it for next time */ + /* NB: this might delete the slot's content, but we don't care */ + tuplestore_clear(fcache->tstore); + + /* + * Let caller know we're not finished. + */ + rsi->isDone = ExprMultipleResult; + + /* + * Ensure we will get shut down cleanly if the exprcontext is not + * run to completion. + */ + if (!fcache->shutdown_reg) + { + RegisterExprContextCallback(rsi->econtext, + ShutdownSQLFunction, + PointerGetDatum(fcache)); + fcache->shutdown_reg = true; + } + } + else if (fcache->lazyEval) + { + /* + * We are done with a lazy evaluation. Clean up. + */ + tuplestore_clear(fcache->tstore); + + /* + * Let caller know we're finished. 
+ */ + rsi->isDone = ExprEndResult; + + fcinfo->isnull = true; + result = (Datum) 0; + + /* Deregister shutdown callback, if we made one */ + if (fcache->shutdown_reg) + { + UnregisterExprContextCallback(rsi->econtext, + ShutdownSQLFunction, + PointerGetDatum(fcache)); + fcache->shutdown_reg = false; + } + } + else + { + /* + * We are done with a non-lazy evaluation. Return whatever is in + * the tuplestore. (It is now caller's responsibility to free the + * tuplestore when done.) + */ + rsi->returnMode = SFRM_Materialize; + rsi->setResult = fcache->tstore; + fcache->tstore = NULL; + /* must copy desc because execSRF.c will free it */ + if (fcache->junkFilter) + rsi->setDesc = CreateTupleDescCopy(fcache->junkFilter->jf_cleanTupType); + + fcinfo->isnull = true; + result = (Datum) 0; + + /* Deregister shutdown callback, if we made one */ + if (fcache->shutdown_reg) + { + UnregisterExprContextCallback(rsi->econtext, + ShutdownSQLFunction, + PointerGetDatum(fcache)); + fcache->shutdown_reg = false; + } + } + } + else + { + /* + * Non-set function. If we got a row, return it; else return NULL. + */ + if (fcache->junkFilter) + { + /* Re-use the junkfilter's output slot to fetch back the tuple */ + slot = fcache->junkFilter->jf_resultSlot; + if (tuplestore_gettupleslot(fcache->tstore, true, false, slot)) + result = postquel_get_single_result(slot, fcinfo, + fcache, oldcontext); + else + { + fcinfo->isnull = true; + result = (Datum) 0; + } + } + else + { + /* Should only get here for VOID functions and procedures */ + Assert(fcache->rettype == VOIDOID); + fcinfo->isnull = true; + result = (Datum) 0; + } + + /* Clear the tuplestore, but keep it for next time */ + tuplestore_clear(fcache->tstore); + } + + /* Pop snapshot if we have pushed one */ + if (pushed_snapshot) + PopActiveSnapshot(); + + /* + * If we've gone through every command in the function, we are done. Reset + * the execution states to start over again on next call. + */ + if (es == NULL) + { + foreach(eslc, fcache->func_state) + { + es = (execution_state *) lfirst(eslc); + while (es) + { + es->status = F_EXEC_START; + es = es->next; + } + } + } + + error_context_stack = sqlerrcontext.previous; + + MemoryContextSwitchTo(oldcontext); + + return result; +} + + +/* + * error context callback to let us supply a call-stack traceback + */ +static void +sql_exec_error_callback(void *arg) +{ + FmgrInfo *flinfo = (FmgrInfo *) arg; + SQLFunctionCachePtr fcache = (SQLFunctionCachePtr) flinfo->fn_extra; + int syntaxerrposition; + + /* + * We can do nothing useful if init_sql_fcache() didn't get as far as + * saving the function name + */ + if (fcache == NULL || fcache->fname == NULL) + return; + + /* + * If there is a syntax error position, convert to internal syntax error + */ + syntaxerrposition = geterrposition(); + if (syntaxerrposition > 0 && fcache->src != NULL) + { + errposition(0); + internalerrposition(syntaxerrposition); + internalerrquery(fcache->src); + } + + /* + * Try to determine where in the function we failed. If there is a query + * with non-null QueryDesc, finger it. (We check this rather than looking + * for F_EXEC_RUN state, so that errors during ExecutorStart or + * ExecutorEnd are blamed on the appropriate query; see postquel_start and + * postquel_end.) 
+ */ + if (fcache->func_state) + { + execution_state *es; + int query_num; + ListCell *lc; + + es = NULL; + query_num = 1; + foreach(lc, fcache->func_state) + { + es = (execution_state *) lfirst(lc); + while (es) + { + if (es->qd) + { + errcontext("SQL function \"%s\" statement %d", + fcache->fname, query_num); + break; + } + es = es->next; + } + if (es) + break; + query_num++; + } + if (es == NULL) + { + /* + * couldn't identify a running query; might be function entry, + * function exit, or between queries. + */ + errcontext("SQL function \"%s\"", fcache->fname); + } + } + else + { + /* + * Assume we failed during init_sql_fcache(). (It's possible that the + * function actually has an empty body, but in that case we may as + * well report all errors as being "during startup".) + */ + errcontext("SQL function \"%s\" during startup", fcache->fname); + } +} + + +/* + * callback function in case a function-returning-set needs to be shut down + * before it has been run to completion + */ +static void +ShutdownSQLFunction(Datum arg) +{ + SQLFunctionCachePtr fcache = (SQLFunctionCachePtr) DatumGetPointer(arg); + execution_state *es; + ListCell *lc; + + foreach(lc, fcache->func_state) + { + es = (execution_state *) lfirst(lc); + while (es) + { + /* Shut down anything still running */ + if (es->status == F_EXEC_RUN) + { + /* Re-establish active snapshot for any called functions */ + if (!fcache->readonly_func) + PushActiveSnapshot(es->qd->snapshot); + + postquel_end(es); + + if (!fcache->readonly_func) + PopActiveSnapshot(); + } + + /* Reset states to START in case we're called again */ + es->status = F_EXEC_START; + es = es->next; + } + } + + /* Release tuplestore if we have one */ + if (fcache->tstore) + tuplestore_end(fcache->tstore); + fcache->tstore = NULL; + + /* execUtils will deregister the callback... */ + fcache->shutdown_reg = false; +} + +/* + * check_sql_fn_statements + * + * Check statements in an SQL function. Error out if there is anything that + * is not acceptable. + */ +void +check_sql_fn_statements(List *queryTreeLists) +{ + ListCell *lc; + + /* We are given a list of sublists of Queries */ + foreach(lc, queryTreeLists) + { + List *sublist = lfirst_node(List, lc); + ListCell *lc2; + + foreach(lc2, sublist) + { + Query *query = lfirst_node(Query, lc2); + + /* + * Disallow calling procedures with output arguments. The current + * implementation would just throw the output values away, unless + * the statement is the last one. Per SQL standard, we should + * assign the output values by name. By disallowing this here, we + * preserve an opportunity for future improvement. + */ + if (query->commandType == CMD_UTILITY && + IsA(query->utilityStmt, CallStmt)) + { + CallStmt *stmt = (CallStmt *) query->utilityStmt; + + if (stmt->outargs != NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("calling procedures with output arguments is not supported in SQL functions"))); + } + } + } +} + +/* + * check_sql_fn_retval() + * Check return value of a list of lists of sql parse trees. + * + * The return value of a sql function is the value returned by the last + * canSetTag query in the function. We do some ad-hoc type checking and + * coercion here to ensure that the function returns what it's supposed to. + * Note that we may actually modify the last query to make it match! + * + * This function returns true if the sql function returns the entire tuple + * result of its final statement, or false if it returns just the first column + * result of that statement. 
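A hypothetical caller-side sketch of check_sql_fn_retval() (names other than the function itself are invented; this assumes the declaration exported through executor/functions.h):

#include "postgres.h"
#include "executor/functions.h"

static void
classify_sql_fn_result(List *queryTree_list, Oid rettype, TupleDesc rettupdesc)
{
    List       *resulttlist;
    bool        returnsTuple;

    /* false: let the caller's junkfilter pad out dropped columns instead */
    returnsTuple = check_sql_fn_retval(queryTree_list, rettype, rettupdesc,
                                       false, &resulttlist);
    (void) resulttlist;     /* would typically feed a JunkFilter */

    if (returnsTuple)
    {
        /* consume the final statement's whole row as the function result */
    }
    else
    {
        /* consume only the first output column, even if it is composite */
    }
}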
It throws an error if the final statement doesn't + * return the right type at all. + * + * Note that because we allow "SELECT rowtype_expression", the result can be + * false even when the declared function return type is a rowtype. + * + * For a polymorphic function the passed rettype must be the actual resolved + * output type of the function. (This means we can't check the type during + * function definition of a polymorphic function.) If we do see a polymorphic + * rettype we'll throw an error, saying it is not a supported rettype. + * + * If the function returns composite, the passed rettupdesc should describe + * the expected output. If rettupdesc is NULL, we can't verify that the + * output matches; that should only happen in fmgr_sql_validator(), or when + * the function returns RECORD and the caller doesn't actually care which + * composite type it is. + * + * (Typically, rettype and rettupdesc are computed by get_call_result_type + * or a sibling function.) + * + * In addition to coercing individual output columns, we can modify the + * output to include dummy NULL columns for any dropped columns appearing + * in rettupdesc. This is done only if the caller asks for it. + * + * If resultTargetList isn't NULL, then *resultTargetList is set to the + * targetlist that defines the final statement's result. Exception: if the + * function is defined to return VOID then *resultTargetList is set to NIL. + */ +bool +check_sql_fn_retval(List *queryTreeLists, + Oid rettype, TupleDesc rettupdesc, + bool insertDroppedCols, + List **resultTargetList) +{ + bool is_tuple_result = false; + Query *parse; + ListCell *parse_cell; + List *tlist; + int tlistlen; + bool tlist_is_modifiable; + char fn_typtype; + List *upper_tlist = NIL; + bool upper_tlist_nontrivial = false; + ListCell *lc; + + if (resultTargetList) + *resultTargetList = NIL; /* initialize in case of VOID result */ + + /* + * If it's declared to return VOID, we don't care what's in the function. + * (This takes care of the procedure case, as well.) + */ + if (rettype == VOIDOID) + return false; + + /* + * Find the last canSetTag query in the function body (which is presented + * to us as a list of sublists of Query nodes). This isn't necessarily + * the last parsetree, because rule rewriting can insert queries after + * what the user wrote. Note that it might not even be in the last + * sublist, for example if the last query rewrites to DO INSTEAD NOTHING. + * (It might not be unreasonable to throw an error in such a case, but + * this is the historical behavior and it doesn't seem worth changing.) + */ + parse = NULL; + parse_cell = NULL; + foreach(lc, queryTreeLists) + { + List *sublist = lfirst_node(List, lc); + ListCell *lc2; + + foreach(lc2, sublist) + { + Query *q = lfirst_node(Query, lc2); + + if (q->canSetTag) + { + parse = q; + parse_cell = lc2; + } + } + } + + /* + * If it's a plain SELECT, it returns whatever the targetlist says. + * Otherwise, if it's INSERT/UPDATE/DELETE with RETURNING, it returns + * that. Otherwise, the function return type must be VOID. + * + * Note: eventually replace this test with QueryReturnsTuples? We'd need + * a more general method of determining the output type, though. Also, it + * seems too dangerous to consider FETCH or EXECUTE as returning a + * determinable rowtype, since they depend on relatively short-lived + * entities. 
+ */ + if (parse && + parse->commandType == CMD_SELECT) + { + tlist = parse->targetList; + /* tlist is modifiable unless it's a dummy in a setop query */ + tlist_is_modifiable = (parse->setOperations == NULL); + } + else if (parse && + (parse->commandType == CMD_INSERT || + parse->commandType == CMD_UPDATE || + parse->commandType == CMD_DELETE) && + parse->returningList) + { + tlist = parse->returningList; + /* returningList can always be modified */ + tlist_is_modifiable = true; + } + else + { + /* Empty function body, or last statement is a utility command */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Function's final statement must be SELECT or INSERT/UPDATE/DELETE RETURNING."))); + return false; /* keep compiler quiet */ + } + + /* + * OK, check that the targetlist returns something matching the declared + * type, and modify it if necessary. If possible, we insert any coercion + * steps right into the final statement's targetlist. However, that might + * risk changes in the statement's semantics --- we can't safely change + * the output type of a grouping column, for instance. In such cases we + * handle coercions by inserting an extra level of Query that effectively + * just does a projection. + */ + + /* + * Count the non-junk entries in the result targetlist. + */ + tlistlen = ExecCleanTargetListLength(tlist); + + fn_typtype = get_typtype(rettype); + + if (fn_typtype == TYPTYPE_BASE || + fn_typtype == TYPTYPE_DOMAIN || + fn_typtype == TYPTYPE_ENUM || + fn_typtype == TYPTYPE_RANGE || + fn_typtype == TYPTYPE_MULTIRANGE) + { + /* + * For scalar-type returns, the target list must have exactly one + * non-junk entry, and its type must be coercible to rettype. + */ + TargetEntry *tle; + + if (tlistlen != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement must return exactly one column."))); + + /* We assume here that non-junk TLEs must come first in tlists */ + tle = (TargetEntry *) linitial(tlist); + Assert(!tle->resjunk); + + if (!coerce_fn_result_column(tle, rettype, -1, + tlist_is_modifiable, + &upper_tlist, + &upper_tlist_nontrivial)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Actual return type is %s.", + format_type_be(exprType((Node *) tle->expr))))); + } + else if (fn_typtype == TYPTYPE_COMPOSITE || rettype == RECORDOID) + { + /* + * Returns a rowtype. + * + * Note that we will not consider a domain over composite to be a + * "rowtype" return type; it goes through the scalar case above. This + * is because we only provide column-by-column implicit casting, and + * will not cast the complete record result. So the only way to + * produce a domain-over-composite result is to compute it as an + * explicit single-column result. The single-composite-column code + * path just below could handle such cases, but it won't be reached. + */ + int tupnatts; /* physical number of columns in tuple */ + int tuplogcols; /* # of nondeleted columns in tuple */ + int colindex; /* physical column index */ + + /* + * If the target list has one non-junk entry, and that expression has + * or can be coerced to the declared return type, take it as the + * result. 
This allows, for example, 'SELECT func2()', where func2 + * has the same composite return type as the function that's calling + * it. This provision creates some ambiguity --- maybe the expression + * was meant to be the lone field of the composite result --- but it + * works well enough as long as we don't get too enthusiastic about + * inventing coercions from scalar to composite types. + * + * XXX Note that if rettype is RECORD and the expression is of a named + * composite type, or vice versa, this coercion will succeed, whether + * or not the record type really matches. For the moment we rely on + * runtime type checking to catch any discrepancy, but it'd be nice to + * do better at parse time. + */ + if (tlistlen == 1) + { + TargetEntry *tle = (TargetEntry *) linitial(tlist); + + Assert(!tle->resjunk); + if (coerce_fn_result_column(tle, rettype, -1, + tlist_is_modifiable, + &upper_tlist, + &upper_tlist_nontrivial)) + { + /* Note that we're NOT setting is_tuple_result */ + goto tlist_coercion_finished; + } + } + + /* + * If the caller didn't provide an expected tupdesc, we can't do any + * further checking. Assume we're returning the whole tuple. + */ + if (rettupdesc == NULL) + { + /* Return tlist if requested */ + if (resultTargetList) + *resultTargetList = tlist; + return true; + } + + /* + * Verify that the targetlist matches the return tuple type. We scan + * the non-resjunk columns, and coerce them if necessary to match the + * datatypes of the non-deleted attributes. For deleted attributes, + * insert NULL result columns if the caller asked for that. + */ + tupnatts = rettupdesc->natts; + tuplogcols = 0; /* we'll count nondeleted cols as we go */ + colindex = 0; + + foreach(lc, tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + Form_pg_attribute attr; + + /* resjunk columns can simply be ignored */ + if (tle->resjunk) + continue; + + do + { + colindex++; + if (colindex > tupnatts) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement returns too many columns."))); + attr = TupleDescAttr(rettupdesc, colindex - 1); + if (attr->attisdropped && insertDroppedCols) + { + Expr *null_expr; + + /* The type of the null we insert isn't important */ + null_expr = (Expr *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + (Datum) 0, + true, /* isnull */ + true /* byval */ ); + upper_tlist = lappend(upper_tlist, + makeTargetEntry(null_expr, + list_length(upper_tlist) + 1, + NULL, + false)); + upper_tlist_nontrivial = true; + } + } while (attr->attisdropped); + tuplogcols++; + + if (!coerce_fn_result_column(tle, + attr->atttypid, attr->atttypmod, + tlist_is_modifiable, + &upper_tlist, + &upper_tlist_nontrivial)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement returns %s instead of %s at column %d.", + format_type_be(exprType((Node *) tle->expr)), + format_type_be(attr->atttypid), + tuplogcols))); + } + + /* remaining columns in rettupdesc had better all be dropped */ + for (colindex++; colindex <= tupnatts; colindex++) + { + if (!TupleDescAttr(rettupdesc, colindex - 1)->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement returns too few columns."))); + 
if (insertDroppedCols) + { + Expr *null_expr; + + /* The type of the null we insert isn't important */ + null_expr = (Expr *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + (Datum) 0, + true, /* isnull */ + true /* byval */ ); + upper_tlist = lappend(upper_tlist, + makeTargetEntry(null_expr, + list_length(upper_tlist) + 1, + NULL, + false)); + upper_tlist_nontrivial = true; + } + } + + /* Report that we are returning entire tuple result */ + is_tuple_result = true; + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type %s is not supported for SQL functions", + format_type_be(rettype)))); + +tlist_coercion_finished: + + /* + * If necessary, modify the final Query by injecting an extra Query level + * that just performs a projection. (It'd be dubious to do this to a + * non-SELECT query, but we never have to; RETURNING lists can always be + * modified in-place.) + */ + if (upper_tlist_nontrivial) + { + Query *newquery; + List *colnames; + RangeTblEntry *rte; + RangeTblRef *rtr; + + Assert(parse->commandType == CMD_SELECT); + + /* Most of the upper Query struct can be left as zeroes/nulls */ + newquery = makeNode(Query); + newquery->commandType = CMD_SELECT; + newquery->querySource = parse->querySource; + newquery->canSetTag = true; + newquery->targetList = upper_tlist; + + /* We need a moderately realistic colnames list for the subquery RTE */ + colnames = NIL; + foreach(lc, parse->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; + colnames = lappend(colnames, + makeString(tle->resname ? tle->resname : "")); + } + + /* Build a suitable RTE for the subquery */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_SUBQUERY; + rte->subquery = parse; + rte->eref = rte->alias = makeAlias("*SELECT*", colnames); + rte->lateral = false; + rte->inh = false; + rte->inFromCl = true; + newquery->rtable = list_make1(rte); + + rtr = makeNode(RangeTblRef); + rtr->rtindex = 1; + newquery->jointree = makeFromExpr(list_make1(rtr), NULL); + + /* Replace original query in the correct element of the query list */ + lfirst(parse_cell) = newquery; + } + + /* Return tlist (possibly modified) if requested */ + if (resultTargetList) + *resultTargetList = upper_tlist; + + return is_tuple_result; +} + +/* + * Process one function result column for check_sql_fn_retval + * + * Coerce the output value to the required type/typmod, and add a column + * to *upper_tlist for it. Set *upper_tlist_nontrivial to true if we + * add an upper tlist item that's not just a Var. + * + * Returns true if OK, false if could not coerce to required type + * (in which case, no changes have been made) + */ +static bool +coerce_fn_result_column(TargetEntry *src_tle, + Oid res_type, + int32 res_typmod, + bool tlist_is_modifiable, + List **upper_tlist, + bool *upper_tlist_nontrivial) +{ + TargetEntry *new_tle; + Expr *new_tle_expr; + Node *cast_result; + + /* + * If the TLE has a sortgroupref marking, don't change it, as it probably + * is referenced by ORDER BY, DISTINCT, etc, and changing its type would + * break query semantics. Otherwise, it's safe to modify in-place unless + * the query as a whole has issues with that. 
+ */ + if (tlist_is_modifiable && src_tle->ressortgroupref == 0) + { + /* OK to modify src_tle in place, if necessary */ + cast_result = coerce_to_target_type(NULL, + (Node *) src_tle->expr, + exprType((Node *) src_tle->expr), + res_type, res_typmod, + COERCION_ASSIGNMENT, + COERCE_IMPLICIT_CAST, + -1); + if (cast_result == NULL) + return false; + assign_expr_collations(NULL, cast_result); + src_tle->expr = (Expr *) cast_result; + /* Make a Var referencing the possibly-modified TLE */ + new_tle_expr = (Expr *) makeVarFromTargetEntry(1, src_tle); + } + else + { + /* Any casting must happen in the upper tlist */ + Var *var = makeVarFromTargetEntry(1, src_tle); + + cast_result = coerce_to_target_type(NULL, + (Node *) var, + var->vartype, + res_type, res_typmod, + COERCION_ASSIGNMENT, + COERCE_IMPLICIT_CAST, + -1); + if (cast_result == NULL) + return false; + assign_expr_collations(NULL, cast_result); + /* Did the coercion actually do anything? */ + if (cast_result != (Node *) var) + *upper_tlist_nontrivial = true; + new_tle_expr = (Expr *) cast_result; + } + new_tle = makeTargetEntry(new_tle_expr, + list_length(*upper_tlist) + 1, + src_tle->resname, false); + *upper_tlist = lappend(*upper_tlist, new_tle); + return true; +} + + +/* + * CreateSQLFunctionDestReceiver -- create a suitable DestReceiver object + */ +DestReceiver * +CreateSQLFunctionDestReceiver(void) +{ + DR_sqlfunction *self = (DR_sqlfunction *) palloc0(sizeof(DR_sqlfunction)); + + self->pub.receiveSlot = sqlfunction_receive; + self->pub.rStartup = sqlfunction_startup; + self->pub.rShutdown = sqlfunction_shutdown; + self->pub.rDestroy = sqlfunction_destroy; + self->pub.mydest = DestSQLFunction; + + /* private fields will be set by postquel_start */ + + return (DestReceiver *) self; +} + +/* + * sqlfunction_startup --- executor startup + */ +static void +sqlfunction_startup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + /* no-op */ +} + +/* + * sqlfunction_receive --- receive one tuple + */ +static bool +sqlfunction_receive(TupleTableSlot *slot, DestReceiver *self) +{ + DR_sqlfunction *myState = (DR_sqlfunction *) self; + + /* Filter tuple as needed */ + slot = ExecFilterJunk(myState->filter, slot); + + /* Store the filtered tuple into the tuplestore */ + tuplestore_puttupleslot(myState->tstore, slot); + + return true; +} + +/* + * sqlfunction_shutdown --- executor end + */ +static void +sqlfunction_shutdown(DestReceiver *self) +{ + /* no-op */ +} + +/* + * sqlfunction_destroy --- release DestReceiver object + */ +static void +sqlfunction_destroy(DestReceiver *self) +{ + pfree(self); +} diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c new file mode 100644 index 0000000..2b106d8 --- /dev/null +++ b/src/backend/executor/instrument.c @@ -0,0 +1,279 @@ +/*------------------------------------------------------------------------- + * + * instrument.c + * functions for instrumentation of plan execution + * + * + * Copyright (c) 2001-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/executor/instrument.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "executor/instrument.h" + +BufferUsage pgBufferUsage; +static BufferUsage save_pgBufferUsage; +WalUsage pgWalUsage; +static WalUsage save_pgWalUsage; + +static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); +static void WalUsageAdd(WalUsage *dst, WalUsage *add); + + +/* Allocate new instrumentation structure(s) */ 
+Instrumentation * +InstrAlloc(int n, int instrument_options, bool async_mode) +{ + Instrumentation *instr; + + /* initialize all fields to zeroes, then modify as needed */ + instr = palloc0(n * sizeof(Instrumentation)); + if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + { + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; + + for (i = 0; i < n; i++) + { + instr[i].need_bufusage = need_buffers; + instr[i].need_walusage = need_wal; + instr[i].need_timer = need_timer; + instr[i].async_mode = async_mode; + } + } + + return instr; +} + +/* Initialize a pre-allocated instrumentation structure. */ +void +InstrInit(Instrumentation *instr, int instrument_options) +{ + memset(instr, 0, sizeof(Instrumentation)); + instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; + instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; + instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; +} + +/* Entry to a plan node */ +void +InstrStartNode(Instrumentation *instr) +{ + if (instr->need_timer && + !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) + elog(ERROR, "InstrStartNode called twice in a row"); + + /* save buffer usage totals at node entry, if needed */ + if (instr->need_bufusage) + instr->bufusage_start = pgBufferUsage; + + if (instr->need_walusage) + instr->walusage_start = pgWalUsage; +} + +/* Exit from a plan node */ +void +InstrStopNode(Instrumentation *instr, double nTuples) +{ + double save_tuplecount = instr->tuplecount; + instr_time endtime; + + /* count the returned tuples */ + instr->tuplecount += nTuples; + + /* let's update the time only if the timer was requested */ + if (instr->need_timer) + { + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStopNode called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); + } + + /* Add delta of buffer usage since entry to node's totals */ + if (instr->need_bufusage) + BufferUsageAccumDiff(&instr->bufusage, + &pgBufferUsage, &instr->bufusage_start); + + if (instr->need_walusage) + WalUsageAccumDiff(&instr->walusage, + &pgWalUsage, &instr->walusage_start); + + /* Is this the first tuple of this cycle? 
*/ + if (!instr->running) + { + instr->running = true; + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + } + else + { + /* + * In async mode, if the plan node hadn't emitted any tuples before, + * this might be the first tuple + */ + if (instr->async_mode && save_tuplecount < 1.0) + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + } +} + +/* Update tuple count */ +void +InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +{ + /* count the returned tuples */ + instr->tuplecount += nTuples; +} + +/* Finish a run cycle for a plan node */ +void +InstrEndLoop(Instrumentation *instr) +{ + double totaltime; + + /* Skip if nothing has happened, or already shut down */ + if (!instr->running) + return; + + if (!INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrEndLoop called on running node"); + + /* Accumulate per-cycle statistics into totals */ + totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); + + instr->startup += instr->firsttuple; + instr->total += totaltime; + instr->ntuples += instr->tuplecount; + instr->nloops += 1; + + /* Reset for next cycle (if any) */ + instr->running = false; + INSTR_TIME_SET_ZERO(instr->starttime); + INSTR_TIME_SET_ZERO(instr->counter); + instr->firsttuple = 0; + instr->tuplecount = 0; +} + +/* aggregate instrumentation information */ +void +InstrAggNode(Instrumentation *dst, Instrumentation *add) +{ + if (!dst->running && add->running) + { + dst->running = true; + dst->firsttuple = add->firsttuple; + } + else if (dst->running && add->running && dst->firsttuple > add->firsttuple) + dst->firsttuple = add->firsttuple; + + INSTR_TIME_ADD(dst->counter, add->counter); + + dst->tuplecount += add->tuplecount; + dst->startup += add->startup; + dst->total += add->total; + dst->ntuples += add->ntuples; + dst->ntuples2 += add->ntuples2; + dst->nloops += add->nloops; + dst->nfiltered1 += add->nfiltered1; + dst->nfiltered2 += add->nfiltered2; + + /* Add delta of buffer usage since entry to node's totals */ + if (dst->need_bufusage) + BufferUsageAdd(&dst->bufusage, &add->bufusage); + + if (dst->need_walusage) + WalUsageAdd(&dst->walusage, &add->walusage); +} + +/* note current values during parallel executor startup */ +void +InstrStartParallelQuery(void) +{ + save_pgBufferUsage = pgBufferUsage; + save_pgWalUsage = pgWalUsage; +} + +/* report usage after parallel executor shutdown */ +void +InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + memset(bufusage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); + memset(walusage, 0, sizeof(WalUsage)); + WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); +} + +/* accumulate work done by workers in leader's stats */ +void +InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + BufferUsageAdd(&pgBufferUsage, bufusage); + WalUsageAdd(&pgWalUsage, walusage); +} + +/* dst += add */ +static void +BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) +{ + dst->shared_blks_hit += add->shared_blks_hit; + dst->shared_blks_read += add->shared_blks_read; + dst->shared_blks_dirtied += add->shared_blks_dirtied; + dst->shared_blks_written += add->shared_blks_written; + dst->local_blks_hit += add->local_blks_hit; + dst->local_blks_read += add->local_blks_read; + dst->local_blks_dirtied += add->local_blks_dirtied; + dst->local_blks_written += add->local_blks_written; + dst->temp_blks_read += add->temp_blks_read; + dst->temp_blks_written += add->temp_blks_written; + INSTR_TIME_ADD(dst->blk_read_time, 
add->blk_read_time); + INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); +} + +/* dst += add - sub */ +void +BufferUsageAccumDiff(BufferUsage *dst, + const BufferUsage *add, + const BufferUsage *sub) +{ + dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; + dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; + dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; + dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; + dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; + dst->local_blks_read += add->local_blks_read - sub->local_blks_read; + dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; + dst->local_blks_written += add->local_blks_written - sub->local_blks_written; + dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; + dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, + add->blk_read_time, sub->blk_read_time); + INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, + add->blk_write_time, sub->blk_write_time); +} + +/* helper functions for WAL usage accumulation */ +static void +WalUsageAdd(WalUsage *dst, WalUsage *add) +{ + dst->wal_bytes += add->wal_bytes; + dst->wal_records += add->wal_records; + dst->wal_fpi += add->wal_fpi; +} + +void +WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) +{ + dst->wal_bytes += add->wal_bytes - sub->wal_bytes; + dst->wal_records += add->wal_records - sub->wal_records; + dst->wal_fpi += add->wal_fpi - sub->wal_fpi; +} diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c new file mode 100644 index 0000000..31609c6 --- /dev/null +++ b/src/backend/executor/nodeAgg.c @@ -0,0 +1,4829 @@ +/*------------------------------------------------------------------------- + * + * nodeAgg.c + * Routines to handle aggregate nodes. + * + * ExecAgg normally evaluates each aggregate in the following steps: + * + * transvalue = initcond + * foreach input_tuple do + * transvalue = transfunc(transvalue, input_value(s)) + * result = finalfunc(transvalue, direct_argument(s)) + * + * If a finalfunc is not supplied then the result is just the ending + * value of transvalue. + * + * Other behaviors can be selected by the "aggsplit" mode, which exists + * to support partial aggregation. It is possible to: + * * Skip running the finalfunc, so that the output is always the + * final transvalue state. + * * Substitute the combinefunc for the transfunc, so that transvalue + * states (propagated up from a child partial-aggregation step) are merged + * rather than processing raw input rows. (The statements below about + * the transfunc apply equally to the combinefunc, when it's selected.) + * * Apply the serializefunc to the output values (this only makes sense + * when skipping the finalfunc, since the serializefunc works on the + * transvalue data type). + * * Apply the deserializefunc to the input values (this only makes sense + * when using the combinefunc, for similar reasons). + * It is the planner's responsibility to connect up Agg nodes using these + * alternate behaviors in a way that makes sense, with partial aggregation + * results being fed to nodes that expect them. + * + * If a normal aggregate call specifies DISTINCT or ORDER BY, we sort the + * input tuples and eliminate duplicates (if required) before performing + * the above-depicted process. 
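Read as ordinary C, the scheme depicted above is just a fold over the input followed by a projection. A minimal standalone sketch (illustrative toy code, not PostgreSQL source) using a toy "avg" aggregate:

#include <stdio.h>

/* Toy transition state for an "avg" aggregate: running sum and row count. */
typedef struct AvgTrans
{
    double  sum;
    long    count;
} AvgTrans;

/* transfunc: fold one input value into the transition state */
static AvgTrans
avg_transfunc(AvgTrans transvalue, double input)
{
    transvalue.sum += input;
    transvalue.count++;
    return transvalue;
}

/* finalfunc: turn the ending transition state into the aggregate result */
static double
avg_finalfunc(AvgTrans transvalue)
{
    return (transvalue.count == 0) ? 0.0 : transvalue.sum / transvalue.count;
}

int
main(void)
{
    double      inputs[] = {1.0, 2.0, 4.0};
    AvgTrans    transvalue = {0.0, 0};      /* initcond */

    for (int i = 0; i < 3; i++)             /* foreach input_tuple */
        transvalue = avg_transfunc(transvalue, inputs[i]);

    printf("avg = %g\n", avg_finalfunc(transvalue));    /* prints 2.33333 */
    return 0;
}

Under partial aggregation (the aggsplit modes described above), a combinefunc would merge two AvgTrans states, adding their sums and counts, in place of avg_transfunc.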
(However, we don't do that for ordered-set + * aggregates; their "ORDER BY" inputs are ordinary aggregate arguments + * so far as this module is concerned.) Note that partial aggregation + * is not supported in these cases, since we couldn't ensure global + * ordering or distinctness of the inputs. + * + * If transfunc is marked "strict" in pg_proc and initcond is NULL, + * then the first non-NULL input_value is assigned directly to transvalue, + * and transfunc isn't applied until the second non-NULL input_value. + * The agg's first input type and transtype must be the same in this case! + * + * If transfunc is marked "strict" then NULL input_values are skipped, + * keeping the previous transvalue. If transfunc is not strict then it + * is called for every input tuple and must deal with NULL initcond + * or NULL input_values for itself. + * + * If finalfunc is marked "strict" then it is not called when the + * ending transvalue is NULL, instead a NULL result is created + * automatically (this is just the usual handling of strict functions, + * of course). A non-strict finalfunc can make its own choice of + * what to return for a NULL ending transvalue. + * + * Ordered-set aggregates are treated specially in one other way: we + * evaluate any "direct" arguments and pass them to the finalfunc along + * with the transition value. + * + * A finalfunc can have additional arguments beyond the transvalue and + * any "direct" arguments, corresponding to the input arguments of the + * aggregate. These are always just passed as NULL. Such arguments may be + * needed to allow resolution of a polymorphic aggregate's result type. + * + * We compute aggregate input expressions and run the transition functions + * in a temporary econtext (aggstate->tmpcontext). This is reset at least + * once per input tuple, so when the transvalue datatype is + * pass-by-reference, we have to be careful to copy it into a longer-lived + * memory context, and free the prior value to avoid memory leakage. We + * store transvalues in another set of econtexts, aggstate->aggcontexts + * (one per grouping set, see below), which are also used for the hashtable + * structures in AGG_HASHED mode. These econtexts are rescanned, not just + * reset, at group boundaries so that aggregate transition functions can + * register shutdown callbacks via AggRegisterCallback. + * + * The node's regular econtext (aggstate->ss.ps.ps_ExprContext) is used to + * run finalize functions and compute the output tuple; this context can be + * reset once per output tuple. + * + * The executor's AggState node is passed as the fmgr "context" value in + * all transfunc and finalfunc calls. It is not recommended that the + * transition functions look at the AggState node directly, but they can + * use AggCheckCallContext() to verify that they are being called by + * nodeAgg.c (and not as ordinary SQL functions). The main reason a + * transition function might want to know this is so that it can avoid + * palloc'ing a fixed-size pass-by-ref transition value on every call: + * it can instead just scribble on and return its left input. Ordinarily + * it is completely forbidden for functions to modify pass-by-ref inputs, + * but in the aggregate case we know the left input is either the initial + * transition value or a previous function result, and in either case its + * value need not be preserved. See int8inc() for an example. 
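A minimal sketch of a transition function written in that style (illustrative only: the aggregate would have to be declared with an internal-typed transition state, and every name except the fmgr/nodeAgg APIs — AggCheckCallContext(), MemoryContextAllocZero(), the PG_* macros — is invented):

#include "postgres.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

/* Invented transition state: running sum and count for a float8 average. */
typedef struct MyAvgState
{
    int64   count;
    float8  sum;
} MyAvgState;

PG_FUNCTION_INFO_V1(my_avg_transfn);

Datum
my_avg_transfn(PG_FUNCTION_ARGS)
{
    MemoryContext aggcontext;
    MyAvgState *state;

    /* Only legal when called as an aggregate (or window) transition step */
    if (!AggCheckCallContext(fcinfo, &aggcontext))
        elog(ERROR, "my_avg_transfn called in non-aggregate context");

    if (PG_ARGISNULL(0))
    {
        /* First row of the group: allocate state in the aggregate context */
        state = (MyAvgState *) MemoryContextAllocZero(aggcontext,
                                                      sizeof(MyAvgState));
    }
    else
        state = (MyAvgState *) PG_GETARG_POINTER(0);

    if (!PG_ARGISNULL(1))
    {
        /* Scribbling on and returning the passed-in state is OK here */
        state->sum += PG_GETARG_FLOAT8(1);
        state->count++;
    }

    PG_RETURN_POINTER(state);
}

Because the state lives in the context reported by AggCheckCallContext(), it persists across calls for the current group and goes away when that context is reset at the group boundary, as described above.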
Notice that + * the EEOP_AGG_PLAIN_TRANS step is coded to avoid a data copy step when + * the previous transition value pointer is returned. It is also possible + * to avoid repeated data copying when the transition value is an expanded + * object: to do that, the transition function must take care to return + * an expanded object that is in a child context of the memory context + * returned by AggCheckCallContext(). Also, some transition functions want + * to store working state in addition to the nominal transition value; they + * can use the memory context returned by AggCheckCallContext() to do that. + * + * Note: AggCheckCallContext() is available as of PostgreSQL 9.0. The + * AggState is available as context in earlier releases (back to 8.1), + * but direct examination of the node is needed to use it before 9.0. + * + * As of 9.4, aggregate transition functions can also use AggGetAggref() + * to get hold of the Aggref expression node for their aggregate call. + * This is mainly intended for ordered-set aggregates, which are not + * supported as window functions. (A regular aggregate function would + * need some fallback logic to use this, since there's no Aggref node + * for a window function.) + * + * Grouping sets: + * + * A list of grouping sets which is structurally equivalent to a ROLLUP + * clause (e.g. (a,b,c), (a,b), (a)) can be processed in a single pass over + * ordered data. We do this by keeping a separate set of transition values + * for each grouping set being concurrently processed; for each input tuple + * we update them all, and on group boundaries we reset those states + * (starting at the front of the list) whose grouping values have changed + * (the list of grouping sets is ordered from most specific to least + * specific). + * + * Where more complex grouping sets are used, we break them down into + * "phases", where each phase has a different sort order (except phase 0 + * which is reserved for hashing). During each phase but the last, the + * input tuples are additionally stored in a tuplesort which is keyed to the + * next phase's sort order; during each phase but the first, the input + * tuples are drawn from the previously sorted data. (The sorting of the + * data for the first phase is handled by the planner, as it might be + * satisfied by underlying nodes.) + * + * Hashing can be mixed with sorted grouping. To do this, we have an + * AGG_MIXED strategy that populates the hashtables during the first sorted + * phase, and switches to reading them out after completing all sort phases. + * We can also support AGG_HASHED with multiple hash tables and no sorting + * at all. + * + * From the perspective of aggregate transition and final functions, the + * only issue regarding grouping sets is this: a single call site (flinfo) + * of an aggregate function may be used for updating several different + * transition values in turn. So the function must not cache in the flinfo + * anything which logically belongs as part of the transition value (most + * importantly, the memory context in which the transition value exists). + * The support API functions (AggCheckCallContext, AggRegisterCallback) are + * sensitive to the grouping set for which the aggregate function is + * currently being called. + * + * Plan structure: + * + * What we get from the planner is actually one "real" Agg node which is + * part of the plan tree proper, but which optionally has an additional list + * of Agg nodes hung off the side via the "chain" field. 
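To make the reset rule under "Grouping sets" above concrete, here is a standalone sketch (toy code, not PostgreSQL source) of which prefix grouping sets of a ROLLUP see a group boundary when a new row arrives in the sorted input:

#include <stdio.h>

/* Return the 0-based index of the first differing column, or ncols if none. */
static int
first_changed_column(const int *prev, const int *cur, int ncols)
{
    for (int i = 0; i < ncols; i++)
    {
        if (prev[i] != cur[i])
            return i;
    }
    return ncols;
}

int
main(void)
{
    int     prev[3] = {1, 7, 7};    /* previous row's (a,b,c) */
    int     cur[3] = {1, 8, 2};     /* current row's (a,b,c): b and c changed */
    int     changed = first_changed_column(prev, cur, 3);

    /*
     * Grouping sets are ordered most- to least-specific; set i groups by the
     * first (3 - i) columns.  A set is reset exactly when one of its own
     * grouping columns changed.
     */
    for (int i = 0; i < 3; i++)
    {
        int     setwidth = 3 - i;

        printf("set (%s): %s\n",
               setwidth == 3 ? "a,b,c" : setwidth == 2 ? "a,b" : "a",
               changed < setwidth ? "finalize group, reset state"
                                  : "keep accumulating");
    }
    return 0;
}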
This is because an + * Agg node happens to be a convenient representation of all the data we + * need for grouping sets. + * + * For many purposes, we treat the "real" node as if it were just the first + * node in the chain. The chain must be ordered such that hashed entries + * come before sorted/plain entries; the real node is marked AGG_MIXED if + * there are both types present (in which case the real node describes one + * of the hashed groupings, other AGG_HASHED nodes may optionally follow in + * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If + * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained + * nodes must be of the same type; if it is AGG_PLAIN, there can be no + * chained nodes. + * + * We collect all hashed nodes into a single "phase", numbered 0, and create + * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node. + * Phase 0 is allocated even if there are no hashes, but remains unused in + * that case. + * + * AGG_HASHED nodes actually refer to only a single grouping set each, + * because for each hashed grouping we need a separate grpColIdx and + * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of + * grouping sets that share a sort order. Each AGG_SORTED node other than + * the first one has an associated Sort node which describes the sort order + * to be used; the first sorted node takes its input from the outer subtree, + * which the planner has already arranged to provide ordered data. + * + * Memory and ExprContext usage: + * + * Because we're accumulating aggregate values across input rows, we need to + * use more memory contexts than just simple input/output tuple contexts. + * In fact, for a rollup, we need a separate context for each grouping set + * so that we can reset the inner (finer-grained) aggregates on their group + * boundaries while continuing to accumulate values for outer + * (coarser-grained) groupings. On top of this, we might be simultaneously + * populating hashtables; however, we only need one context for all the + * hashtables. + * + * So we create an array, aggcontexts, with an ExprContext for each grouping + * set in the largest rollup that we're going to process, and use the + * per-tuple memory context of those ExprContexts to store the aggregate + * transition values. hashcontext is the single context created to support + * all hash tables. + * + * Spilling To Disk + * + * When performing hash aggregation, if the hash table memory exceeds the + * limit (see hash_agg_check_limits()), we enter "spill mode". In spill + * mode, we advance the transition states only for groups already in the + * hash table. For tuples that would need to create a new hash table + * entries (and initialize new transition states), we instead spill them to + * disk to be processed later. The tuples are spilled in a partitioned + * manner, so that subsequent batches are smaller and less likely to exceed + * hash_mem (if a batch does exceed hash_mem, it must be spilled + * recursively). + * + * Spilled data is written to logical tapes. These provide better control + * over memory usage, disk space, and the number of files than if we were + * to use a BufFile for each spill. + * + * Note that it's possible for transition states to start small but then + * grow very large; for instance in the case of ARRAY_AGG. In such cases, + * it's still possible to significantly exceed hash_mem. 
We try to avoid + * this situation by estimating what will fit in the available memory, and + * imposing a limit on the number of groups separately from the amount of + * memory consumed. + * + * Transition / Combine function invocation: + * + * For performance reasons transition functions, including combine + * functions, aren't invoked one-by-one from nodeAgg.c after computing + * arguments using the expression evaluation engine. Instead + * ExecBuildAggTrans() builds one large expression that does both argument + * evaluation and transition function invocation. That avoids performance + * issues due to repeated uses of expression evaluation, complications due + * to filter expressions having to be evaluated early, and allows to JIT + * the entire expression into one native function. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeAgg.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/parallel.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "common/hashfn.h" +#include "executor/execExpr.h" +#include "executor/executor.h" +#include "executor/nodeAgg.h" +#include "lib/hyperloglog.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" +#include "parser/parse_agg.h" +#include "parser/parse_coerce.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/dynahash.h" +#include "utils/expandeddatum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" +#include "utils/tuplesort.h" + +/* + * Control how many partitions are created when spilling HashAgg to + * disk. + * + * HASHAGG_PARTITION_FACTOR is multiplied by the estimated number of + * partitions needed such that each partition will fit in memory. The factor + * is set higher than one because there's not a high cost to having a few too + * many partitions, and it makes it less likely that a partition will need to + * be spilled recursively. Another benefit of having more, smaller partitions + * is that small hash tables may perform better than large ones due to memory + * caching effects. + * + * We also specify a min and max number of partitions per spill. Too few might + * mean a lot of wasted I/O from repeated spilling of the same tuples. Too + * many will result in lots of memory wasted buffering the spill files (which + * could instead be spent on a larger hash table). + */ +#define HASHAGG_PARTITION_FACTOR 1.50 +#define HASHAGG_MIN_PARTITIONS 4 +#define HASHAGG_MAX_PARTITIONS 1024 + +/* + * For reading from tapes, the buffer size must be a multiple of + * BLCKSZ. Larger values help when reading from multiple tapes concurrently, + * but that doesn't happen in HashAgg, so we simply use BLCKSZ. Writing to a + * tape always uses a buffer of size BLCKSZ. + */ +#define HASHAGG_READ_BUFFER_SIZE BLCKSZ +#define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ + +/* + * HyperLogLog is used for estimating the cardinality of the spilled tuples in + * a given partition. 5 bits corresponds to a size of about 32 bytes and a + * worst-case error of around 18%. 
That's effective enough to choose a + * reasonable number of partitions when recursing. + */ +#define HASHAGG_HLL_BIT_WIDTH 5 + +/* + * Estimate chunk overhead as a constant 16 bytes. XXX: should this be + * improved? + */ +#define CHUNKHDRSZ 16 + +/* + * Track all tapes needed for a HashAgg that spills. We don't know the maximum + * number of tapes needed at the start of the algorithm (because it can + * recurse), so one tape set is allocated and extended as needed for new + * tapes. When a particular tape is already read, rewind it for write mode and + * put it in the free list. + * + * Tapes' buffers can take up substantial memory when many tapes are open at + * once. We only need one tape open at a time in read mode (using a buffer + * that's a multiple of BLCKSZ); but we need one tape open in write mode (each + * requiring a buffer of size BLCKSZ) for each partition. + */ +typedef struct HashTapeInfo +{ + LogicalTapeSet *tapeset; + int ntapes; + int *freetapes; + int nfreetapes; + int freetapes_alloc; +} HashTapeInfo; + +/* + * Represents partitioned spill data for a single hashtable. Contains the + * necessary information to route tuples to the correct partition, and to + * transform the spilled data into new batches. + * + * The high bits are used for partition selection (when recursing, we ignore + * the bits that have already been used for partition selection at an earlier + * level). + */ +typedef struct HashAggSpill +{ + LogicalTapeSet *tapeset; /* borrowed reference to tape set */ + int npartitions; /* number of partitions */ + int *partitions; /* spill partition tape numbers */ + int64 *ntuples; /* number of tuples in each partition */ + uint32 mask; /* mask to find partition from hash value */ + int shift; /* after masking, shift by this amount */ + hyperLogLogState *hll_card; /* cardinality estimate for contents */ +} HashAggSpill; + +/* + * Represents work to be done for one pass of hash aggregation (with only one + * grouping set). + * + * Also tracks the bits of the hash already used for partition selection by + * earlier iterations, so that this batch can use new bits. If all bits have + * already been used, no partitioning will be done (any spilled data will go + * to a single output tape). 
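+ *
+ * As a rough illustration (a sketch, not code quoted from this file): with
+ * used_bits hash bits consumed by earlier spill levels and
+ * npartitions = 1 << partition_bits chosen for this level, the partition
+ * for a 32-bit hash value can be picked from the next-highest bits along
+ * these lines:
+ *
+ *     shift = 32 - used_bits - partition_bits;
+ *     mask = (npartitions - 1) << shift;
+ *     partition = (hash & mask) >> shift;
+ *
+ * so each recursion level consumes a fresh slice of the hash.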
+ */ +typedef struct HashAggBatch +{ + int setno; /* grouping set */ + int used_bits; /* number of bits of hash already used */ + LogicalTapeSet *tapeset; /* borrowed reference to tape set */ + int input_tapenum; /* input partition tape */ + int64 input_tuples; /* number of tuples in this batch */ + double input_card; /* estimated group cardinality */ +} HashAggBatch; + +/* used to find referenced colnos */ +typedef struct FindColsContext +{ + bool is_aggref; /* is under an aggref */ + Bitmapset *aggregated; /* column references under an aggref */ + Bitmapset *unaggregated; /* other column references */ +} FindColsContext; + +static void select_current_set(AggState *aggstate, int setno, bool is_hash); +static void initialize_phase(AggState *aggstate, int newphase); +static TupleTableSlot *fetch_input_tuple(AggState *aggstate); +static void initialize_aggregates(AggState *aggstate, + AggStatePerGroup *pergroups, + int numReset); +static void advance_transition_function(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate); +static void advance_aggregates(AggState *aggstate); +static void process_ordered_aggregate_single(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate); +static void process_ordered_aggregate_multi(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate); +static void finalize_aggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull); +static void finalize_partialaggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull); +static inline void prepare_hash_slot(AggStatePerHash perhash, + TupleTableSlot *inputslot, + TupleTableSlot *hashslot); +static void prepare_projection_slot(AggState *aggstate, + TupleTableSlot *slot, + int currentSet); +static void finalize_aggregates(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroup); +static TupleTableSlot *project_aggregates(AggState *aggstate); +static void find_cols(AggState *aggstate, Bitmapset **aggregated, + Bitmapset **unaggregated); +static bool find_cols_walker(Node *node, FindColsContext *context); +static void build_hash_tables(AggState *aggstate); +static void build_hash_table(AggState *aggstate, int setno, long nbuckets); +static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, + bool nullcheck); +static long hash_choose_num_buckets(double hashentrysize, + long estimated_nbuckets, + Size memory); +static int hash_choose_num_partitions(double input_groups, + double hashentrysize, + int used_bits, + int *log2_npartittions); +static void initialize_hash_entry(AggState *aggstate, + TupleHashTable hashtable, + TupleHashEntry entry); +static void lookup_hash_entries(AggState *aggstate); +static TupleTableSlot *agg_retrieve_direct(AggState *aggstate); +static void agg_fill_hash_table(AggState *aggstate); +static bool agg_refill_hash_table(AggState *aggstate); +static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate); +static TupleTableSlot *agg_retrieve_hash_table_in_memory(AggState *aggstate); +static void hash_agg_check_limits(AggState *aggstate); +static void hash_agg_enter_spill_mode(AggState *aggstate); +static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, + int npartitions); +static void hashagg_finish_initial_spills(AggState *aggstate); +static void hashagg_reset_spill_state(AggState *aggstate); +static HashAggBatch 
*hashagg_batch_new(LogicalTapeSet *tapeset, + int input_tapenum, int setno, + int64 input_tuples, double input_card, + int used_bits); +static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp); +static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, + int used_bits, double input_groups, + double hashentrysize); +static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, + TupleTableSlot *slot, uint32 hash); +static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, + int setno); +static void hashagg_tapeinfo_init(AggState *aggstate); +static void hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *dest, + int ndest); +static void hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum); +static Datum GetAggInitVal(Datum textInitVal, Oid transtype); +static void build_pertrans_for_aggref(AggStatePerTrans pertrans, + AggState *aggstate, EState *estate, + Aggref *aggref, Oid aggtransfn, Oid aggtranstype, + Oid aggserialfn, Oid aggdeserialfn, + Datum initValue, bool initValueIsNull, + Oid *inputTypes, int numArguments); + + +/* + * Select the current grouping set; affects current_set and + * curaggcontext. + */ +static void +select_current_set(AggState *aggstate, int setno, bool is_hash) +{ + /* + * When changing this, also adapt ExecAggPlainTransByVal() and + * ExecAggPlainTransByRef(). + */ + if (is_hash) + aggstate->curaggcontext = aggstate->hashcontext; + else + aggstate->curaggcontext = aggstate->aggcontexts[setno]; + + aggstate->current_set = setno; +} + +/* + * Switch to phase "newphase", which must either be 0 or 1 (to reset) or + * current_phase + 1. Juggle the tuplesorts accordingly. + * + * Phase 0 is for hashing, which we currently handle last in the AGG_MIXED + * case, so when entering phase 0, all we need to do is drop open sorts. + */ +static void +initialize_phase(AggState *aggstate, int newphase) +{ + Assert(newphase <= 1 || newphase == aggstate->current_phase + 1); + + /* + * Whatever the previous state, we're now done with whatever input + * tuplesort was in use. + */ + if (aggstate->sort_in) + { + tuplesort_end(aggstate->sort_in); + aggstate->sort_in = NULL; + } + + if (newphase <= 1) + { + /* + * Discard any existing output tuplesort. + */ + if (aggstate->sort_out) + { + tuplesort_end(aggstate->sort_out); + aggstate->sort_out = NULL; + } + } + else + { + /* + * The old output tuplesort becomes the new input one, and this is the + * right time to actually sort it. + */ + aggstate->sort_in = aggstate->sort_out; + aggstate->sort_out = NULL; + Assert(aggstate->sort_in); + tuplesort_performsort(aggstate->sort_in); + } + + /* + * If this isn't the last phase, we need to sort appropriately for the + * next phase in sequence. + */ + if (newphase > 0 && newphase < aggstate->numphases - 1) + { + Sort *sortnode = aggstate->phases[newphase + 1].sortnode; + PlanState *outerNode = outerPlanState(aggstate); + TupleDesc tupDesc = ExecGetResultType(outerNode); + + aggstate->sort_out = tuplesort_begin_heap(tupDesc, + sortnode->numCols, + sortnode->sortColIdx, + sortnode->sortOperators, + sortnode->collations, + sortnode->nullsFirst, + work_mem, + NULL, false); + } + + aggstate->current_phase = newphase; + aggstate->phase = &aggstate->phases[newphase]; +} + +/* + * Fetch a tuple from either the outer plan (for phase 1) or from the sorter + * populated by the previous phase. Copy it to the sorter for the next phase + * if any. 
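+ *
+ * For illustration only (hypothetical query, and assuming the planner
+ * chooses sorting for both sets): GROUP BY GROUPING SETS ((a, b), (c))
+ * would give phase 1 sorted on (a, b), reading directly from the outer
+ * plan, and phase 2 sorted on (c), reading from the tuplesort that phase 1
+ * filled through this function; phase 0 would come into play only if some
+ * sets were hashed.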
+ * + * Callers cannot rely on memory for tuple in returned slot remaining valid + * past any subsequently fetched tuple. + */ +static TupleTableSlot * +fetch_input_tuple(AggState *aggstate) +{ + TupleTableSlot *slot; + + if (aggstate->sort_in) + { + /* make sure we check for interrupts in either path through here */ + CHECK_FOR_INTERRUPTS(); + if (!tuplesort_gettupleslot(aggstate->sort_in, true, false, + aggstate->sort_slot, NULL)) + return NULL; + slot = aggstate->sort_slot; + } + else + slot = ExecProcNode(outerPlanState(aggstate)); + + if (!TupIsNull(slot) && aggstate->sort_out) + tuplesort_puttupleslot(aggstate->sort_out, slot); + + return slot; +} + +/* + * (Re)Initialize an individual aggregate. + * + * This function handles only one grouping set, already set in + * aggstate->current_set. + * + * When called, CurrentMemoryContext should be the per-query context. + */ +static void +initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + /* + * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate. + */ + if (pertrans->numSortCols > 0) + { + /* + * In case of rescan, maybe there could be an uncompleted sort + * operation? Clean it up if so. + */ + if (pertrans->sortstates[aggstate->current_set]) + tuplesort_end(pertrans->sortstates[aggstate->current_set]); + + + /* + * We use a plain Datum sorter when there's a single input column; + * otherwise sort the full tuple. (See comments for + * process_ordered_aggregate_single.) + */ + if (pertrans->numInputs == 1) + { + Form_pg_attribute attr = TupleDescAttr(pertrans->sortdesc, 0); + + pertrans->sortstates[aggstate->current_set] = + tuplesort_begin_datum(attr->atttypid, + pertrans->sortOperators[0], + pertrans->sortCollations[0], + pertrans->sortNullsFirst[0], + work_mem, NULL, false); + } + else + pertrans->sortstates[aggstate->current_set] = + tuplesort_begin_heap(pertrans->sortdesc, + pertrans->numSortCols, + pertrans->sortColIdx, + pertrans->sortOperators, + pertrans->sortCollations, + pertrans->sortNullsFirst, + work_mem, NULL, false); + } + + /* + * (Re)set transValue to the initial value. + * + * Note that when the initial value is pass-by-ref, we must copy it (into + * the aggcontext) since we will pfree the transValue later. + */ + if (pertrans->initValueIsNull) + pergroupstate->transValue = pertrans->initValue; + else + { + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory); + pergroupstate->transValue = datumCopy(pertrans->initValue, + pertrans->transtypeByVal, + pertrans->transtypeLen); + MemoryContextSwitchTo(oldContext); + } + pergroupstate->transValueIsNull = pertrans->initValueIsNull; + + /* + * If the initial value for the transition state doesn't exist in the + * pg_aggregate table then we will let the first non-NULL value returned + * from the outer procNode become the initial value. (This is useful for + * aggregates like max() and min().) The noTransValue flag signals that we + * still need to do this. + */ + pergroupstate->noTransValue = pertrans->initValueIsNull; +} + +/* + * Initialize all aggregate transition states for a new group of input values. + * + * If there are multiple grouping sets, we initialize only the first numReset + * of them (the grouping sets are ordered so that the most specific one, which + * is reset most often, is first). As a convenience, if numReset is 0, we + * reinitialize all sets. 
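+ *
+ * Illustrative example (not tied to any particular caller): for
+ * ROLLUP (a, b) the sets are ordered ((a, b), (a), ()); at a group
+ * boundary where only column b changed, numReset would be 1, so only the
+ * (a, b) transition states are reinitialized while the (a) and grand-total
+ * states keep accumulating.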
+ * + * NB: This cannot be used for hash aggregates, as for those the grouping set + * number has to be specified from further up. + * + * When called, CurrentMemoryContext should be the per-query context. + */ +static void +initialize_aggregates(AggState *aggstate, + AggStatePerGroup *pergroups, + int numReset) +{ + int transno; + int numGroupingSets = Max(aggstate->phase->numsets, 1); + int setno = 0; + int numTrans = aggstate->numtrans; + AggStatePerTrans transstates = aggstate->pertrans; + + if (numReset == 0) + numReset = numGroupingSets; + + for (setno = 0; setno < numReset; setno++) + { + AggStatePerGroup pergroup = pergroups[setno]; + + select_current_set(aggstate, setno, false); + + for (transno = 0; transno < numTrans; transno++) + { + AggStatePerTrans pertrans = &transstates[transno]; + AggStatePerGroup pergroupstate = &pergroup[transno]; + + initialize_aggregate(aggstate, pertrans, pergroupstate); + } + } +} + +/* + * Given new input value(s), advance the transition function of one aggregate + * state within one grouping set only (already set in aggstate->current_set) + * + * The new values (and null flags) have been preloaded into argument positions + * 1 and up in pertrans->transfn_fcinfo, so that we needn't copy them again to + * pass to the transition function. We also expect that the static fields of + * the fcinfo are already initialized; that was done by ExecInitAgg(). + * + * It doesn't matter which memory context this is called in. + */ +static void +advance_transition_function(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + Datum newVal; + + if (pertrans->transfn.fn_strict) + { + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. + */ + int numTransInputs = pertrans->numTransInputs; + int i; + + for (i = 1; i <= numTransInputs; i++) + { + if (fcinfo->args[i].isnull) + return; + } + if (pergroupstate->noTransValue) + { + /* + * transValue has not been initialized. This is the first non-NULL + * input value. We use it as the initial value for transValue. (We + * already checked that the agg's input type is binary-compatible + * with its transtype, so straight copy here is OK.) + * + * We must copy the datum into aggcontext if it is pass-by-ref. We + * do not need to pfree the old transValue, since it's NULL. + */ + oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory); + pergroupstate->transValue = datumCopy(fcinfo->args[1].value, + pertrans->transtypeByVal, + pertrans->transtypeLen); + pergroupstate->transValueIsNull = false; + pergroupstate->noTransValue = false; + MemoryContextSwitchTo(oldContext); + return; + } + if (pergroupstate->transValueIsNull) + { + /* + * Don't call a strict function with NULL inputs. Note it is + * possible to get here despite the above tests, if the transfn is + * strict *and* returned a NULL on a prior cycle. If that happens + * we will propagate the NULL all the way to the end. 
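+ *
+ * (As an illustration: if a strict transfn ever returns NULL, say for a
+ * hypothetical aggregate that reports overflow that way, every later
+ * input value of the group ends up here and is ignored, so the group's
+ * final result is NULL.)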
+ */ + return; + } + } + + /* We run the transition functions in per-input-tuple memory context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + /* set up aggstate->curpertrans for AggGetAggref() */ + aggstate->curpertrans = pertrans; + + /* + * OK to call the transition function + */ + fcinfo->args[0].value = pergroupstate->transValue; + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + newVal = FunctionCallInvoke(fcinfo); + + aggstate->curpertrans = NULL; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. Also, if transfn returned a + * pointer to a R/W expanded object that is already a child of the + * aggcontext, assume we can adopt that value without copying it. + * + * It's safe to compare newVal with pergroup->transValue without regard + * for either being NULL, because ExecAggTransReparent() takes care to set + * transValue to 0 when NULL. Otherwise we could end up accidentally not + * reparenting, when the transValue has the same numerical value as + * newValue, despite being NULL. This is a somewhat hot path, making it + * undesirable to instead solve this with another branch for the common + * case of the transition function returning its (modified) input + * argument. + */ + if (!pertrans->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue)) + newVal = ExecAggTransReparent(aggstate, pertrans, + newVal, fcinfo->isnull, + pergroupstate->transValue, + pergroupstate->transValueIsNull); + + pergroupstate->transValue = newVal; + pergroupstate->transValueIsNull = fcinfo->isnull; + + MemoryContextSwitchTo(oldContext); +} + +/* + * Advance each aggregate transition state for one input tuple. The input + * tuple has been stored in tmpcontext->ecxt_outertuple, so that it is + * accessible to ExecEvalExpr. + * + * We have two sets of transition states to handle: one for sorted aggregation + * and one for hashed; we do them both here, to avoid multiple evaluation of + * the inputs. + * + * When called, CurrentMemoryContext should be the per-query context. + */ +static void +advance_aggregates(AggState *aggstate) +{ + bool dummynull; + + ExecEvalExprSwitchContext(aggstate->phase->evaltrans, + aggstate->tmpcontext, + &dummynull); +} + +/* + * Run the transition function for a DISTINCT or ORDER BY aggregate + * with only one input. This is called after we have completed + * entering all the input values into the sort object. We complete the + * sort, read out the values in sorted order, and run the transition + * function on each value (applying DISTINCT if appropriate). + * + * Note that the strictness of the transition function was checked when + * entering the values into the sort, so we don't check it again here; + * we just apply standard SQL DISTINCT logic. + * + * The one-input case is handled separately from the multi-input case + * for performance reasons: for single by-value inputs, such as the + * common case of count(distinct id), the tuplesort_getdatum code path + * is around 300% faster. (The speedup for by-reference types is less + * but still noticeable.) + * + * This function handles only one grouping set (already set in + * aggstate->current_set). + * + * When called, CurrentMemoryContext should be the per-query context. 
+ */ +static void +process_ordered_aggregate_single(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + Datum oldVal = (Datum) 0; + bool oldIsNull = true; + bool haveOldVal = false; + MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory; + MemoryContext oldContext; + bool isDistinct = (pertrans->numDistinctCols > 0); + Datum newAbbrevVal = (Datum) 0; + Datum oldAbbrevVal = (Datum) 0; + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + Datum *newVal; + bool *isNull; + + Assert(pertrans->numDistinctCols < 2); + + tuplesort_performsort(pertrans->sortstates[aggstate->current_set]); + + /* Load the column into argument 1 (arg 0 will be transition value) */ + newVal = &fcinfo->args[1].value; + isNull = &fcinfo->args[1].isnull; + + /* + * Note: if input type is pass-by-ref, the datums returned by the sort are + * freshly palloc'd in the per-query context, so we must be careful to + * pfree them when they are no longer needed. + */ + + while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set], + true, newVal, isNull, &newAbbrevVal)) + { + /* + * Clear and select the working context for evaluation of the equality + * function and transition function. + */ + MemoryContextReset(workcontext); + oldContext = MemoryContextSwitchTo(workcontext); + + /* + * If DISTINCT mode, and not distinct from prior, skip it. + */ + if (isDistinct && + haveOldVal && + ((oldIsNull && *isNull) || + (!oldIsNull && !*isNull && + oldAbbrevVal == newAbbrevVal && + DatumGetBool(FunctionCall2Coll(&pertrans->equalfnOne, + pertrans->aggCollation, + oldVal, *newVal))))) + { + /* equal to prior, so forget this one */ + if (!pertrans->inputtypeByVal && !*isNull) + pfree(DatumGetPointer(*newVal)); + } + else + { + advance_transition_function(aggstate, pertrans, pergroupstate); + /* forget the old value, if any */ + if (!oldIsNull && !pertrans->inputtypeByVal) + pfree(DatumGetPointer(oldVal)); + /* and remember the new one for subsequent equality checks */ + oldVal = *newVal; + oldAbbrevVal = newAbbrevVal; + oldIsNull = *isNull; + haveOldVal = true; + } + + MemoryContextSwitchTo(oldContext); + } + + if (!oldIsNull && !pertrans->inputtypeByVal) + pfree(DatumGetPointer(oldVal)); + + tuplesort_end(pertrans->sortstates[aggstate->current_set]); + pertrans->sortstates[aggstate->current_set] = NULL; +} + +/* + * Run the transition function for a DISTINCT or ORDER BY aggregate + * with more than one input. This is called after we have completed + * entering all the input values into the sort object. We complete the + * sort, read out the values in sorted order, and run the transition + * function on each value (applying DISTINCT if appropriate). + * + * This function handles only one grouping set (already set in + * aggstate->current_set). + * + * When called, CurrentMemoryContext should be the per-query context. 
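+ *
+ * For illustration (hypothetical usage): array_agg(x ORDER BY y) has two
+ * input columns (x and y) and therefore comes through here rather than
+ * through the single-datum path, even though only the first
+ * numTransInputs columns (just x in this case) are passed on to the
+ * transition function; the trailing sort column is used only for ordering.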
+ */ +static void +process_ordered_aggregate_multi(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + ExprContext *tmpcontext = aggstate->tmpcontext; + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + TupleTableSlot *slot1 = pertrans->sortslot; + TupleTableSlot *slot2 = pertrans->uniqslot; + int numTransInputs = pertrans->numTransInputs; + int numDistinctCols = pertrans->numDistinctCols; + Datum newAbbrevVal = (Datum) 0; + Datum oldAbbrevVal = (Datum) 0; + bool haveOldValue = false; + TupleTableSlot *save = aggstate->tmpcontext->ecxt_outertuple; + int i; + + tuplesort_performsort(pertrans->sortstates[aggstate->current_set]); + + ExecClearTuple(slot1); + if (slot2) + ExecClearTuple(slot2); + + while (tuplesort_gettupleslot(pertrans->sortstates[aggstate->current_set], + true, true, slot1, &newAbbrevVal)) + { + CHECK_FOR_INTERRUPTS(); + + tmpcontext->ecxt_outertuple = slot1; + tmpcontext->ecxt_innertuple = slot2; + + if (numDistinctCols == 0 || + !haveOldValue || + newAbbrevVal != oldAbbrevVal || + !ExecQual(pertrans->equalfnMulti, tmpcontext)) + { + /* + * Extract the first numTransInputs columns as datums to pass to + * the transfn. + */ + slot_getsomeattrs(slot1, numTransInputs); + + /* Load values into fcinfo */ + /* Start from 1, since the 0th arg will be the transition value */ + for (i = 0; i < numTransInputs; i++) + { + fcinfo->args[i + 1].value = slot1->tts_values[i]; + fcinfo->args[i + 1].isnull = slot1->tts_isnull[i]; + } + + advance_transition_function(aggstate, pertrans, pergroupstate); + + if (numDistinctCols > 0) + { + /* swap the slot pointers to retain the current tuple */ + TupleTableSlot *tmpslot = slot2; + + slot2 = slot1; + slot1 = tmpslot; + /* avoid ExecQual() calls by reusing abbreviated keys */ + oldAbbrevVal = newAbbrevVal; + haveOldValue = true; + } + } + + /* Reset context each time */ + ResetExprContext(tmpcontext); + + ExecClearTuple(slot1); + } + + if (slot2) + ExecClearTuple(slot2); + + tuplesort_end(pertrans->sortstates[aggstate->current_set]); + pertrans->sortstates[aggstate->current_set] = NULL; + + /* restore previous slot, potentially in use for grouping sets */ + tmpcontext->ecxt_outertuple = save; +} + +/* + * Compute the final value of one aggregate. + * + * This function handles only one grouping set (already set in + * aggstate->current_set). + * + * The finalfn will be run, and the result delivered, in the + * output-tuple context; caller's CurrentMemoryContext does not matter. + * + * The finalfn uses the state as set in the transno. This also might be + * being used by another aggregate function, so it's important that we do + * nothing destructive here. + */ +static void +finalize_aggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + bool anynull = false; + MemoryContext oldContext; + int i; + ListCell *lc; + AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno]; + + oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * Evaluate any direct arguments. We do this even if there's no finalfn + * (which is unlikely anyway), so that side-effects happen as expected. + * The direct arguments go into arg positions 1 and up, leaving position 0 + * for the transition state value. 
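+ *
+ * (Illustrative example: for percentile_cont(0.5) WITHIN GROUP (ORDER BY x)
+ * the constant 0.5 is a direct argument, so the finalfn ends up seeing the
+ * transition state in args[0] and 0.5 in args[1].)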
+ */ + i = 1; + foreach(lc, peragg->aggdirectargs) + { + ExprState *expr = (ExprState *) lfirst(lc); + + fcinfo->args[i].value = ExecEvalExpr(expr, + aggstate->ss.ps.ps_ExprContext, + &fcinfo->args[i].isnull); + anynull |= fcinfo->args[i].isnull; + i++; + } + + /* + * Apply the agg's finalfn if one is provided, else return transValue. + */ + if (OidIsValid(peragg->finalfn_oid)) + { + int numFinalArgs = peragg->numFinalArgs; + + /* set up aggstate->curperagg for AggGetAggref() */ + aggstate->curperagg = peragg; + + InitFunctionCallInfoData(*fcinfo, &peragg->finalfn, + numFinalArgs, + pertrans->aggCollation, + (void *) aggstate, NULL); + + /* Fill in the transition state value */ + fcinfo->args[0].value = + MakeExpandedObjectReadOnly(pergroupstate->transValue, + pergroupstate->transValueIsNull, + pertrans->transtypeLen); + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + anynull |= pergroupstate->transValueIsNull; + + /* Fill any remaining argument positions with nulls */ + for (; i < numFinalArgs; i++) + { + fcinfo->args[i].value = (Datum) 0; + fcinfo->args[i].isnull = true; + anynull = true; + } + + if (fcinfo->flinfo->fn_strict && anynull) + { + /* don't call a strict function with NULL inputs */ + *resultVal = (Datum) 0; + *resultIsNull = true; + } + else + { + *resultVal = FunctionCallInvoke(fcinfo); + *resultIsNull = fcinfo->isnull; + } + aggstate->curperagg = NULL; + } + else + { + /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */ + *resultVal = pergroupstate->transValue; + *resultIsNull = pergroupstate->transValueIsNull; + } + + /* + * If result is pass-by-ref, make sure it is in the right context. + */ + if (!peragg->resulttypeByVal && !*resultIsNull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*resultVal))) + *resultVal = datumCopy(*resultVal, + peragg->resulttypeByVal, + peragg->resulttypeLen); + + MemoryContextSwitchTo(oldContext); +} + +/* + * Compute the output value of one partial aggregate. + * + * The serialization function will be run, and the result delivered, in the + * output-tuple context; caller's CurrentMemoryContext does not matter. + */ +static void +finalize_partialaggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull) +{ + AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno]; + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * serialfn_oid will be set if we must serialize the transvalue before + * returning it + */ + if (OidIsValid(pertrans->serialfn_oid)) + { + /* Don't call a strict serialization function with NULL input. */ + if (pertrans->serialfn.fn_strict && pergroupstate->transValueIsNull) + { + *resultVal = (Datum) 0; + *resultIsNull = true; + } + else + { + FunctionCallInfo fcinfo = pertrans->serialfn_fcinfo; + + fcinfo->args[0].value = + MakeExpandedObjectReadOnly(pergroupstate->transValue, + pergroupstate->transValueIsNull, + pertrans->transtypeLen); + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + fcinfo->isnull = false; + + *resultVal = FunctionCallInvoke(fcinfo); + *resultIsNull = fcinfo->isnull; + } + } + else + { + /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */ + *resultVal = pergroupstate->transValue; + *resultIsNull = pergroupstate->transValueIsNull; + } + + /* If result is pass-by-ref, make sure it is in the right context. 
*/ + if (!peragg->resulttypeByVal && !*resultIsNull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*resultVal))) + *resultVal = datumCopy(*resultVal, + peragg->resulttypeByVal, + peragg->resulttypeLen); + + MemoryContextSwitchTo(oldContext); +} + +/* + * Extract the attributes that make up the grouping key into the + * hashslot. This is necessary to compute the hash or perform a lookup. + */ +static inline void +prepare_hash_slot(AggStatePerHash perhash, + TupleTableSlot *inputslot, + TupleTableSlot *hashslot) +{ + int i; + + /* transfer just the needed columns into hashslot */ + slot_getsomeattrs(inputslot, perhash->largestGrpColIdx); + ExecClearTuple(hashslot); + + for (i = 0; i < perhash->numhashGrpCols; i++) + { + int varNumber = perhash->hashGrpColIdxInput[i] - 1; + + hashslot->tts_values[i] = inputslot->tts_values[varNumber]; + hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber]; + } + ExecStoreVirtualTuple(hashslot); +} + +/* + * Prepare to finalize and project based on the specified representative tuple + * slot and grouping set. + * + * In the specified tuple slot, force to null all attributes that should be + * read as null in the context of the current grouping set. Also stash the + * current group bitmap where GroupingExpr can get at it. + * + * This relies on three conditions: + * + * 1) Nothing is ever going to try and extract the whole tuple from this slot, + * only reference it in evaluations, which will only access individual + * attributes. + * + * 2) No system columns are going to need to be nulled. (If a system column is + * referenced in a group clause, it is actually projected in the outer plan + * tlist.) + * + * 3) Within a given phase, we never need to recover the value of an attribute + * once it has been set to null. + * + * Poking into the slot this way is a bit ugly, but the consensus is that the + * alternative was worse. + */ +static void +prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet) +{ + if (aggstate->phase->grouped_cols) + { + Bitmapset *grouped_cols = aggstate->phase->grouped_cols[currentSet]; + + aggstate->grouped_cols = grouped_cols; + + if (TTS_EMPTY(slot)) + { + /* + * Force all values to be NULL if working on an empty input tuple + * (i.e. an empty grouping set for which no input rows were + * supplied). + */ + ExecStoreAllNullTuple(slot); + } + else if (aggstate->all_grouped_cols) + { + ListCell *lc; + + /* all_grouped_cols is arranged in desc order */ + slot_getsomeattrs(slot, linitial_int(aggstate->all_grouped_cols)); + + foreach(lc, aggstate->all_grouped_cols) + { + int attnum = lfirst_int(lc); + + if (!bms_is_member(attnum, grouped_cols)) + slot->tts_isnull[attnum - 1] = true; + } + } + } +} + +/* + * Compute the final value of all aggregates for one group. + * + * This function handles only one grouping set at a time, which the caller must + * have selected. It's also the caller's responsibility to adjust the supplied + * pergroup parameter to point to the current set's transvalues. + * + * Results are stored in the output econtext aggvalues/aggnulls. + */ +static void +finalize_aggregates(AggState *aggstate, + AggStatePerAgg peraggs, + AggStatePerGroup pergroup) +{ + ExprContext *econtext = aggstate->ss.ps.ps_ExprContext; + Datum *aggvalues = econtext->ecxt_aggvalues; + bool *aggnulls = econtext->ecxt_aggnulls; + int aggno; + int transno; + + /* + * If there were any DISTINCT and/or ORDER BY aggregates, sort their + * inputs and run the transition functions. 
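+ *
+ * (Note on the two loops below, offered as an illustration: several
+ * aggregates can share one transition state when they agree on the
+ * transition function and inputs, e.g. avg(x) and sum(x) on numeric
+ * typically share a state; that is why transition-level work is keyed by
+ * transno while the final functions run once per aggno.)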
+ */ + for (transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + AggStatePerGroup pergroupstate; + + pergroupstate = &pergroup[transno]; + + if (pertrans->numSortCols > 0) + { + Assert(aggstate->aggstrategy != AGG_HASHED && + aggstate->aggstrategy != AGG_MIXED); + + if (pertrans->numInputs == 1) + process_ordered_aggregate_single(aggstate, + pertrans, + pergroupstate); + else + process_ordered_aggregate_multi(aggstate, + pertrans, + pergroupstate); + } + } + + /* + * Run the final functions. + */ + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + AggStatePerAgg peragg = &peraggs[aggno]; + int transno = peragg->transno; + AggStatePerGroup pergroupstate; + + pergroupstate = &pergroup[transno]; + + if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)) + finalize_partialaggregate(aggstate, peragg, pergroupstate, + &aggvalues[aggno], &aggnulls[aggno]); + else + finalize_aggregate(aggstate, peragg, pergroupstate, + &aggvalues[aggno], &aggnulls[aggno]); + } +} + +/* + * Project the result of a group (whose aggs have already been calculated by + * finalize_aggregates). Returns the result slot, or NULL if no row is + * projected (suppressed by qual). + */ +static TupleTableSlot * +project_aggregates(AggState *aggstate) +{ + ExprContext *econtext = aggstate->ss.ps.ps_ExprContext; + + /* + * Check the qual (HAVING clause); if the group does not match, ignore it. + */ + if (ExecQual(aggstate->ss.ps.qual, econtext)) + { + /* + * Form and return projection tuple using the aggregate results and + * the representative input tuple. + */ + return ExecProject(aggstate->ss.ps.ps_ProjInfo); + } + else + InstrCountFiltered1(aggstate, 1); + + return NULL; +} + +/* + * Find input-tuple columns that are needed, dividing them into + * aggregated and unaggregated sets. + */ +static void +find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated) +{ + Agg *agg = (Agg *) aggstate->ss.ps.plan; + FindColsContext context; + + context.is_aggref = false; + context.aggregated = NULL; + context.unaggregated = NULL; + + /* Examine tlist and quals */ + (void) find_cols_walker((Node *) agg->plan.targetlist, &context); + (void) find_cols_walker((Node *) agg->plan.qual, &context); + + /* In some cases, grouping columns will not appear in the tlist */ + for (int i = 0; i < agg->numCols; i++) + context.unaggregated = bms_add_member(context.unaggregated, + agg->grpColIdx[i]); + + *aggregated = context.aggregated; + *unaggregated = context.unaggregated; +} + +static bool +find_cols_walker(Node *node, FindColsContext *context) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + /* setrefs.c should have set the varno to OUTER_VAR */ + Assert(var->varno == OUTER_VAR); + Assert(var->varlevelsup == 0); + if (context->is_aggref) + context->aggregated = bms_add_member(context->aggregated, + var->varattno); + else + context->unaggregated = bms_add_member(context->unaggregated, + var->varattno); + return false; + } + if (IsA(node, Aggref)) + { + Assert(!context->is_aggref); + context->is_aggref = true; + expression_tree_walker(node, find_cols_walker, (void *) context); + context->is_aggref = false; + return false; + } + return expression_tree_walker(node, find_cols_walker, + (void *) context); +} + +/* + * (Re-)initialize the hash table(s) to empty. 
+ * + * To implement hashed aggregation, we need a hashtable that stores a + * representative tuple and an array of AggStatePerGroup structs for each + * distinct set of GROUP BY column values. We compute the hash key from the + * GROUP BY columns. The per-group data is allocated in lookup_hash_entry(), + * for each entry. + * + * We have a separate hashtable and associated perhash data structure for each + * grouping set for which we're doing hashing. + * + * The contents of the hash tables always live in the hashcontext's per-tuple + * memory context (there is only one of these for all tables together, since + * they are all reset at the same time). + */ +static void +build_hash_tables(AggState *aggstate) +{ + int setno; + + for (setno = 0; setno < aggstate->num_hashes; ++setno) + { + AggStatePerHash perhash = &aggstate->perhash[setno]; + long nbuckets; + Size memory; + + if (perhash->hashtable != NULL) + { + ResetTupleHashTable(perhash->hashtable); + continue; + } + + Assert(perhash->aggnode->numGroups > 0); + + memory = aggstate->hash_mem_limit / aggstate->num_hashes; + + /* choose reasonable number of buckets per hashtable */ + nbuckets = hash_choose_num_buckets(aggstate->hashentrysize, + perhash->aggnode->numGroups, + memory); + + build_hash_table(aggstate, setno, nbuckets); + } + + aggstate->hash_ngroups_current = 0; +} + +/* + * Build a single hashtable for this grouping set. + */ +static void +build_hash_table(AggState *aggstate, int setno, long nbuckets) +{ + AggStatePerHash perhash = &aggstate->perhash[setno]; + MemoryContext metacxt = aggstate->hash_metacxt; + MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory; + MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory; + Size additionalsize; + + Assert(aggstate->aggstrategy == AGG_HASHED || + aggstate->aggstrategy == AGG_MIXED); + + /* + * Used to make sure initial hash table allocation does not exceed + * hash_mem. Note that the estimate does not include space for + * pass-by-reference transition data values, nor for the representative + * tuple of each group. + */ + additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData); + + perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps, + perhash->hashslot->tts_tupleDescriptor, + perhash->numCols, + perhash->hashGrpColIdxHash, + perhash->eqfuncoids, + perhash->hashfunctions, + perhash->aggnode->grpCollations, + nbuckets, + additionalsize, + metacxt, + hashcxt, + tmpcxt, + DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); +} + +/* + * Compute columns that actually need to be stored in hashtable entries. The + * incoming tuples from the child plan node will contain grouping columns, + * other columns referenced in our targetlist and qual, columns used to + * compute the aggregate functions, and perhaps just junk columns we don't use + * at all. Only columns of the first two types need to be stored in the + * hashtable, and getting rid of the others can make the table entries + * significantly smaller. The hashtable only contains the relevant columns, + * and is packed/unpacked in lookup_hash_entry() / agg_retrieve_hash_table() + * into the format of the normal input descriptor. + * + * Additional columns, in addition to the columns grouped by, come from two + * sources: Firstly functionally dependent columns that we don't need to group + * by themselves, and secondly ctids for row-marks. + * + * To eliminate duplicates, we build a bitmapset of the needed columns, and + * then build an array of the columns included in the hashtable. 
We might + * still have duplicates if the passed-in grpColIdx has them, which can happen + * in edge cases from semijoins/distinct; these can't always be removed, + * because it's not certain that the duplicate cols will be using the same + * hash function. + * + * Note that the array is preserved over ExecReScanAgg, so we allocate it in + * the per-query context (unlike the hash table itself). + */ +static void +find_hash_columns(AggState *aggstate) +{ + Bitmapset *base_colnos; + Bitmapset *aggregated_colnos; + TupleDesc scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + List *outerTlist = outerPlanState(aggstate)->plan->targetlist; + int numHashes = aggstate->num_hashes; + EState *estate = aggstate->ss.ps.state; + int j; + + /* Find Vars that will be needed in tlist and qual */ + find_cols(aggstate, &aggregated_colnos, &base_colnos); + aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos); + aggstate->max_colno_needed = 0; + aggstate->all_cols_needed = true; + + for (int i = 0; i < scanDesc->natts; i++) + { + int colno = i + 1; + + if (bms_is_member(colno, aggstate->colnos_needed)) + aggstate->max_colno_needed = colno; + else + aggstate->all_cols_needed = false; + } + + for (j = 0; j < numHashes; ++j) + { + AggStatePerHash perhash = &aggstate->perhash[j]; + Bitmapset *colnos = bms_copy(base_colnos); + AttrNumber *grpColIdx = perhash->aggnode->grpColIdx; + List *hashTlist = NIL; + TupleDesc hashDesc; + int maxCols; + int i; + + perhash->largestGrpColIdx = 0; + + /* + * If we're doing grouping sets, then some Vars might be referenced in + * tlist/qual for the benefit of other grouping sets, but not needed + * when hashing; i.e. prepare_projection_slot will null them out, so + * there'd be no point storing them. Use prepare_projection_slot's + * logic to determine which. + */ + if (aggstate->phases[0].grouped_cols) + { + Bitmapset *grouped_cols = aggstate->phases[0].grouped_cols[j]; + ListCell *lc; + + foreach(lc, aggstate->all_grouped_cols) + { + int attnum = lfirst_int(lc); + + if (!bms_is_member(attnum, grouped_cols)) + colnos = bms_del_member(colnos, attnum); + } + } + + /* + * Compute maximum number of input columns accounting for possible + * duplications in the grpColIdx array, which can happen in some edge + * cases where HashAggregate was generated as part of a semijoin or a + * DISTINCT. + */ + maxCols = bms_num_members(colnos) + perhash->numCols; + + perhash->hashGrpColIdxInput = + palloc(maxCols * sizeof(AttrNumber)); + perhash->hashGrpColIdxHash = + palloc(perhash->numCols * sizeof(AttrNumber)); + + /* Add all the grouping columns to colnos */ + for (i = 0; i < perhash->numCols; i++) + colnos = bms_add_member(colnos, grpColIdx[i]); + + /* + * First build mapping for columns directly hashed. These are the + * first, because they'll be accessed when computing hash values and + * comparing tuples for exact matches. We also build simple mapping + * for execGrouping, so it knows where to find the to-be-hashed / + * compared columns in the input. 
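+ *
+ * Worked example (illustrative numbers only): with grpColIdx = {3, 1} and
+ * one extra needed column 5 left in colnos, the loops below produce
+ * hashGrpColIdxInput = {3, 1, 5} (positions in the input tuple) and
+ * hashGrpColIdxHash = {1, 2} (positions of the grouping columns within the
+ * narrower hashtable tuple).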
+ */ + for (i = 0; i < perhash->numCols; i++) + { + perhash->hashGrpColIdxInput[i] = grpColIdx[i]; + perhash->hashGrpColIdxHash[i] = i + 1; + perhash->numhashGrpCols++; + /* delete already mapped columns */ + bms_del_member(colnos, grpColIdx[i]); + } + + /* and add the remaining columns */ + while ((i = bms_first_member(colnos)) >= 0) + { + perhash->hashGrpColIdxInput[perhash->numhashGrpCols] = i; + perhash->numhashGrpCols++; + } + + /* and build a tuple descriptor for the hashtable */ + for (i = 0; i < perhash->numhashGrpCols; i++) + { + int varNumber = perhash->hashGrpColIdxInput[i] - 1; + + hashTlist = lappend(hashTlist, list_nth(outerTlist, varNumber)); + perhash->largestGrpColIdx = + Max(varNumber + 1, perhash->largestGrpColIdx); + } + + hashDesc = ExecTypeFromTL(hashTlist); + + execTuplesHashPrepare(perhash->numCols, + perhash->aggnode->grpOperators, + &perhash->eqfuncoids, + &perhash->hashfunctions); + perhash->hashslot = + ExecAllocTableSlot(&estate->es_tupleTable, hashDesc, + &TTSOpsMinimalTuple); + + list_free(hashTlist); + bms_free(colnos); + } + + bms_free(base_colnos); +} + +/* + * Estimate per-hash-table-entry overhead. + */ +Size +hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) +{ + Size tupleChunkSize; + Size pergroupChunkSize; + Size transitionChunkSize; + Size tupleSize = (MAXALIGN(SizeofMinimalTupleHeader) + + tupleWidth); + Size pergroupSize = numTrans * sizeof(AggStatePerGroupData); + + tupleChunkSize = CHUNKHDRSZ + tupleSize; + + if (pergroupSize > 0) + pergroupChunkSize = CHUNKHDRSZ + pergroupSize; + else + pergroupChunkSize = 0; + + if (transitionSpace > 0) + transitionChunkSize = CHUNKHDRSZ + transitionSpace; + else + transitionChunkSize = 0; + + return + sizeof(TupleHashEntryData) + + tupleChunkSize + + pergroupChunkSize + + transitionChunkSize; +} + +/* + * hashagg_recompile_expressions() + * + * Identifies the right phase, compiles the right expression given the + * arguments, and then sets phase->evalfunc to that expression. + * + * Different versions of the compiled expression are needed depending on + * whether hash aggregation has spilled or not, and whether it's reading from + * the outer plan or a tape. Before spilling to disk, the expression reads + * from the outer plan and does not need to perform a NULL check. After + * HashAgg begins to spill, new groups will not be created in the hash table, + * and the AggStatePerGroup array may be NULL; therefore we need to add a null + * pointer check to the expression. Then, when reading spilled data from a + * tape, we change the outer slot type to be a fixed minimal tuple slot. + * + * It would be wasteful to recompile every time, so cache the compiled + * expressions in the AggStatePerPhase, and reuse when appropriate. + */ +static void +hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck) +{ + AggStatePerPhase phase; + int i = minslot ? 1 : 0; + int j = nullcheck ? 
1 : 0; + + Assert(aggstate->aggstrategy == AGG_HASHED || + aggstate->aggstrategy == AGG_MIXED); + + if (aggstate->aggstrategy == AGG_HASHED) + phase = &aggstate->phases[0]; + else /* AGG_MIXED */ + phase = &aggstate->phases[1]; + + if (phase->evaltrans_cache[i][j] == NULL) + { + const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops; + bool outerfixed = aggstate->ss.ps.outeropsfixed; + bool dohash = true; + bool dosort = false; + + /* + * If minslot is true, that means we are processing a spilled batch + * (inside agg_refill_hash_table()), and we must not advance the + * sorted grouping sets. + */ + if (aggstate->aggstrategy == AGG_MIXED && !minslot) + dosort = true; + + /* temporarily change the outerops while compiling the expression */ + if (minslot) + { + aggstate->ss.ps.outerops = &TTSOpsMinimalTuple; + aggstate->ss.ps.outeropsfixed = true; + } + + phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase, + dosort, dohash, + nullcheck); + + /* change back */ + aggstate->ss.ps.outerops = outerops; + aggstate->ss.ps.outeropsfixed = outerfixed; + } + + phase->evaltrans = phase->evaltrans_cache[i][j]; +} + +/* + * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the + * number of partitions we expect to create (if we do spill). + * + * There are two limits: a memory limit, and also an ngroups limit. The + * ngroups limit becomes important when we expect transition values to grow + * substantially larger than the initial value. + */ +void +hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits, + Size *mem_limit, uint64 *ngroups_limit, + int *num_partitions) +{ + int npartitions; + Size partition_mem; + Size hash_mem_limit = get_hash_memory_limit(); + + /* if not expected to spill, use all of hash_mem */ + if (input_groups * hashentrysize <= hash_mem_limit) + { + if (num_partitions != NULL) + *num_partitions = 0; + *mem_limit = hash_mem_limit; + *ngroups_limit = hash_mem_limit / hashentrysize; + return; + } + + /* + * Calculate expected memory requirements for spilling, which is the size + * of the buffers needed for all the tapes that need to be open at once. + * Then, subtract that from the memory available for holding hash tables. + */ + npartitions = hash_choose_num_partitions(input_groups, + hashentrysize, + used_bits, + NULL); + if (num_partitions != NULL) + *num_partitions = npartitions; + + partition_mem = + HASHAGG_READ_BUFFER_SIZE + + HASHAGG_WRITE_BUFFER_SIZE * npartitions; + + /* + * Don't set the limit below 3/4 of hash_mem. In that case, we are at the + * minimum number of partitions, so we aren't going to dramatically exceed + * work mem anyway. + */ + if (hash_mem_limit > 4 * partition_mem) + *mem_limit = hash_mem_limit - partition_mem; + else + *mem_limit = hash_mem_limit * 0.75; + + if (*mem_limit > hashentrysize) + *ngroups_limit = *mem_limit / hashentrysize; + else + *ngroups_limit = 1; +} + +/* + * hash_agg_check_limits + * + * After adding a new group to the hash table, check whether we need to enter + * spill mode. Allocations may happen without adding new groups (for instance, + * if the transition state size grows), so this check is imperfect. 
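+ *
+ * Rough illustration (made-up numbers, ignoring the partition-buffer
+ * adjustment): with about 4MB of hash_mem and a 200-byte hashentrysize
+ * estimate, hash_agg_set_limits() above yields an ngroups limit on the
+ * order of 20000, and we flip into spill mode as soon as either that group
+ * count or the memory limit is crossed.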
+ */ +static void +hash_agg_check_limits(AggState *aggstate) +{ + uint64 ngroups = aggstate->hash_ngroups_current; + Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, + true); + Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, + true); + + /* + * Don't spill unless there's at least one group in the hash table so we + * can be sure to make progress even in edge cases. + */ + if (aggstate->hash_ngroups_current > 0 && + (meta_mem + hashkey_mem > aggstate->hash_mem_limit || + ngroups > aggstate->hash_ngroups_limit)) + { + hash_agg_enter_spill_mode(aggstate); + } +} + +/* + * Enter "spill mode", meaning that no new groups are added to any of the hash + * tables. Tuples that would create a new group are instead spilled, and + * processed later. + */ +static void +hash_agg_enter_spill_mode(AggState *aggstate) +{ + aggstate->hash_spill_mode = true; + hashagg_recompile_expressions(aggstate, aggstate->table_filled, true); + + if (!aggstate->hash_ever_spilled) + { + Assert(aggstate->hash_tapeinfo == NULL); + Assert(aggstate->hash_spills == NULL); + + aggstate->hash_ever_spilled = true; + + hashagg_tapeinfo_init(aggstate); + + aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes); + + for (int setno = 0; setno < aggstate->num_hashes; setno++) + { + AggStatePerHash perhash = &aggstate->perhash[setno]; + HashAggSpill *spill = &aggstate->hash_spills[setno]; + + hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0, + perhash->aggnode->numGroups, + aggstate->hashentrysize); + } + } +} + +/* + * Update metrics after filling the hash table. + * + * If reading from the outer plan, from_tape should be false; if reading from + * another tape, from_tape should be true. + */ +static void +hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) +{ + Size meta_mem; + Size hashkey_mem; + Size buffer_mem; + Size total_mem; + + if (aggstate->aggstrategy != AGG_MIXED && + aggstate->aggstrategy != AGG_HASHED) + return; + + /* memory for the hash table itself */ + meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); + + /* memory for the group keys and transition states */ + hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); + + /* memory for read/write tape buffers, if spilled */ + buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE; + if (from_tape) + buffer_mem += HASHAGG_READ_BUFFER_SIZE; + + /* update peak mem */ + total_mem = meta_mem + hashkey_mem + buffer_mem; + if (total_mem > aggstate->hash_mem_peak) + aggstate->hash_mem_peak = total_mem; + + /* update disk usage */ + if (aggstate->hash_tapeinfo != NULL) + { + uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeinfo->tapeset) * (BLCKSZ / 1024); + + if (aggstate->hash_disk_used < disk_used) + aggstate->hash_disk_used = disk_used; + } + + /* update hashentrysize estimate based on contents */ + if (aggstate->hash_ngroups_current > 0) + { + aggstate->hashentrysize = + sizeof(TupleHashEntryData) + + (hashkey_mem / (double) aggstate->hash_ngroups_current); + } +} + +/* + * Choose a reasonable number of buckets for the initial hash table size. + */ +static long +hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory) +{ + long max_nbuckets; + long nbuckets = ngroups; + + max_nbuckets = memory / hashentrysize; + + /* + * Underestimating is better than overestimating. Too many buckets crowd + * out space for group keys and transition state values. 
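+ *
+ * (Example with made-up numbers: memory = 1MB and hashentrysize = 256
+ * gives max_nbuckets = 4096, halved just below to 2048, so an ngroups
+ * estimate larger than that is clamped to 2048.)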
+ */ + max_nbuckets >>= 1; + + if (nbuckets > max_nbuckets) + nbuckets = max_nbuckets; + + return Max(nbuckets, 1); +} + +/* + * Determine the number of partitions to create when spilling, which will + * always be a power of two. If log2_npartitions is non-NULL, set + * *log2_npartitions to the log2() of the number of partitions. + */ +static int +hash_choose_num_partitions(double input_groups, double hashentrysize, + int used_bits, int *log2_npartitions) +{ + Size hash_mem_limit = get_hash_memory_limit(); + double partition_limit; + double mem_wanted; + double dpartitions; + int npartitions; + int partition_bits; + + /* + * Avoid creating so many partitions that the memory requirements of the + * open partition files are greater than 1/4 of hash_mem. + */ + partition_limit = + (hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) / + HASHAGG_WRITE_BUFFER_SIZE; + + mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize; + + /* make enough partitions so that each one is likely to fit in memory */ + dpartitions = 1 + (mem_wanted / hash_mem_limit); + + if (dpartitions > partition_limit) + dpartitions = partition_limit; + + if (dpartitions < HASHAGG_MIN_PARTITIONS) + dpartitions = HASHAGG_MIN_PARTITIONS; + if (dpartitions > HASHAGG_MAX_PARTITIONS) + dpartitions = HASHAGG_MAX_PARTITIONS; + + /* HASHAGG_MAX_PARTITIONS limit makes this safe */ + npartitions = (int) dpartitions; + + /* ceil(log2(npartitions)) */ + partition_bits = my_log2(npartitions); + + /* make sure that we don't exhaust the hash bits */ + if (partition_bits + used_bits >= 32) + partition_bits = 32 - used_bits; + + if (log2_npartitions != NULL) + *log2_npartitions = partition_bits; + + /* number of partitions will be a power of two */ + npartitions = 1 << partition_bits; + + return npartitions; +} + +/* + * Initialize a freshly-created TupleHashEntry. + */ +static void +initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable, + TupleHashEntry entry) +{ + AggStatePerGroup pergroup; + int transno; + + aggstate->hash_ngroups_current++; + hash_agg_check_limits(aggstate); + + /* no need to allocate or initialize per-group state */ + if (aggstate->numtrans == 0) + return; + + pergroup = (AggStatePerGroup) + MemoryContextAlloc(hashtable->tablecxt, + sizeof(AggStatePerGroupData) * aggstate->numtrans); + + entry->additional = pergroup; + + /* + * Initialize aggregates for new tuple group, lookup_hash_entries() + * already has selected the relevant grouping set. + */ + for (transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + AggStatePerGroup pergroupstate = &pergroup[transno]; + + initialize_aggregate(aggstate, pertrans, pergroupstate); + } +} + +/* + * Look up hash entries for the current tuple in all hashed grouping sets. + * + * Be aware that lookup_hash_entry can reset the tmpcontext. + * + * Some entries may be left NULL if we are in "spill mode". The same tuple + * will belong to different groups for each grouping set, so may match a group + * already in memory for one set and match a group not in memory for another + * set. When in "spill mode", the tuple will be spilled for each grouping set + * where it doesn't match a group in memory. + * + * NB: It's possible to spill the same tuple for several different grouping + * sets. 
This may seem wasteful, but it's actually a trade-off: if we spill + * the tuple multiple times for multiple grouping sets, it can be partitioned + * for each grouping set, making the refilling of the hash table very + * efficient. + */ +static void +lookup_hash_entries(AggState *aggstate) +{ + AggStatePerGroup *pergroup = aggstate->hash_pergroup; + TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple; + int setno; + + for (setno = 0; setno < aggstate->num_hashes; setno++) + { + AggStatePerHash perhash = &aggstate->perhash[setno]; + TupleHashTable hashtable = perhash->hashtable; + TupleTableSlot *hashslot = perhash->hashslot; + TupleHashEntry entry; + uint32 hash; + bool isnew = false; + bool *p_isnew; + + /* if hash table already spilled, don't create new entries */ + p_isnew = aggstate->hash_spill_mode ? NULL : &isnew; + + select_current_set(aggstate, setno, true); + prepare_hash_slot(perhash, + outerslot, + hashslot); + + entry = LookupTupleHashEntry(hashtable, hashslot, + p_isnew, &hash); + + if (entry != NULL) + { + if (isnew) + initialize_hash_entry(aggstate, hashtable, entry); + pergroup[setno] = entry->additional; + } + else + { + HashAggSpill *spill = &aggstate->hash_spills[setno]; + TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple; + + if (spill->partitions == NULL) + hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0, + perhash->aggnode->numGroups, + aggstate->hashentrysize); + + hashagg_spill_tuple(aggstate, spill, slot, hash); + pergroup[setno] = NULL; + } + } +} + +/* + * ExecAgg - + * + * ExecAgg receives tuples from its outer subplan and aggregates over + * the appropriate attribute for each aggregate function use (Aggref + * node) appearing in the targetlist or qual of the node. The number + * of tuples to aggregate over depends on whether grouped or plain + * aggregation is selected. In grouped aggregation, we produce a result + * row for each group; in plain aggregation there's a single result row + * for the whole query. In either case, the value of each aggregate is + * stored in the expression context to be used when ExecProject evaluates + * the result tuple. 
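The p_isnew pointer is how spill mode is communicated to the hash table: passing NULL turns LookupTupleHashEntry() into a pure lookup, so a tuple whose group is not already in memory comes back as NULL and is written to a spill partition instead. A toy illustration of that calling convention, assuming a made-up toy_lookup() rather than the executor's real API:

    /* Sketch of the "p_isnew == NULL means lookup-only" convention. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define NSLOTS 8

    static int  table[NSLOTS];      /* toy "hash table" of group keys */
    static bool used[NSLOTS];

    static int *
    toy_lookup(int key, bool *p_isnew)
    {
        int slot = key % NSLOTS;    /* toy hash, no collision handling */

        if (used[slot] && table[slot] == key)
            return &table[slot];
        if (p_isnew == NULL)
            return NULL;            /* lookup-only: caller must spill the tuple */
        used[slot] = true;
        table[slot] = key;
        *p_isnew = true;
        return &table[slot];
    }

    int
    main(void)
    {
        bool isnew = false;
        bool spill_mode = false;

        toy_lookup(42, spill_mode ? NULL : &isnew);     /* creates group 42 */
        spill_mode = true;
        if (toy_lookup(7, spill_mode ? NULL : &isnew) == NULL)
            printf("group 7 not in memory: spill the tuple\n");
        return 0;
    }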
+ */ +static TupleTableSlot * +ExecAgg(PlanState *pstate) +{ + AggState *node = castNode(AggState, pstate); + TupleTableSlot *result = NULL; + + CHECK_FOR_INTERRUPTS(); + + if (!node->agg_done) + { + /* Dispatch based on strategy */ + switch (node->phase->aggstrategy) + { + case AGG_HASHED: + if (!node->table_filled) + agg_fill_hash_table(node); + /* FALLTHROUGH */ + case AGG_MIXED: + result = agg_retrieve_hash_table(node); + break; + case AGG_PLAIN: + case AGG_SORTED: + result = agg_retrieve_direct(node); + break; + } + + if (!TupIsNull(result)) + return result; + } + + return NULL; +} + +/* + * ExecAgg for non-hashed case + */ +static TupleTableSlot * +agg_retrieve_direct(AggState *aggstate) +{ + Agg *node = aggstate->phase->aggnode; + ExprContext *econtext; + ExprContext *tmpcontext; + AggStatePerAgg peragg; + AggStatePerGroup *pergroups; + TupleTableSlot *outerslot; + TupleTableSlot *firstSlot; + TupleTableSlot *result; + bool hasGroupingSets = aggstate->phase->numsets > 0; + int numGroupingSets = Max(aggstate->phase->numsets, 1); + int currentSet; + int nextSetSize; + int numReset; + int i; + + /* + * get state info from node + * + * econtext is the per-output-tuple expression context + * + * tmpcontext is the per-input-tuple expression context + */ + econtext = aggstate->ss.ps.ps_ExprContext; + tmpcontext = aggstate->tmpcontext; + + peragg = aggstate->peragg; + pergroups = aggstate->pergroups; + firstSlot = aggstate->ss.ss_ScanTupleSlot; + + /* + * We loop retrieving groups until we find one matching + * aggstate->ss.ps.qual + * + * For grouping sets, we have the invariant that aggstate->projected_set + * is either -1 (initial call) or the index (starting from 0) in + * gset_lengths for the group we just completed (either by projecting a + * row or by discarding it in the qual). + */ + while (!aggstate->agg_done) + { + /* + * Clear the per-output-tuple context for each group, as well as + * aggcontext (which contains any pass-by-ref transvalues of the old + * group). Some aggregate functions store working state in child + * contexts; those now get reset automatically without us needing to + * do anything special. + * + * We use ReScanExprContext not just ResetExprContext because we want + * any registered shutdown callbacks to be called. That allows + * aggregate functions to ensure they've cleaned up any non-memory + * resources. + */ + ReScanExprContext(econtext); + + /* + * Determine how many grouping sets need to be reset at this boundary. + */ + if (aggstate->projected_set >= 0 && + aggstate->projected_set < numGroupingSets) + numReset = aggstate->projected_set + 1; + else + numReset = numGroupingSets; + + /* + * numReset can change on a phase boundary, but that's OK; we want to + * reset the contexts used in _this_ phase, and later, after possibly + * changing phase, initialize the right number of aggregates for the + * _new_ phase. + */ + + for (i = 0; i < numReset; i++) + { + ReScanExprContext(aggstate->aggcontexts[i]); + } + + /* + * Check if input is complete and there are no more groups to project + * in this phase; move to next phase or mark as done. 
+ */ + if (aggstate->input_done == true && + aggstate->projected_set >= (numGroupingSets - 1)) + { + if (aggstate->current_phase < aggstate->numphases - 1) + { + initialize_phase(aggstate, aggstate->current_phase + 1); + aggstate->input_done = false; + aggstate->projected_set = -1; + numGroupingSets = Max(aggstate->phase->numsets, 1); + node = aggstate->phase->aggnode; + numReset = numGroupingSets; + } + else if (aggstate->aggstrategy == AGG_MIXED) + { + /* + * Mixed mode; we've output all the grouped stuff and have + * full hashtables, so switch to outputting those. + */ + initialize_phase(aggstate, 0); + aggstate->table_filled = true; + ResetTupleHashIterator(aggstate->perhash[0].hashtable, + &aggstate->perhash[0].hashiter); + select_current_set(aggstate, 0, true); + return agg_retrieve_hash_table(aggstate); + } + else + { + aggstate->agg_done = true; + break; + } + } + + /* + * Get the number of columns in the next grouping set after the last + * projected one (if any). This is the number of columns to compare to + * see if we reached the boundary of that set too. + */ + if (aggstate->projected_set >= 0 && + aggstate->projected_set < (numGroupingSets - 1)) + nextSetSize = aggstate->phase->gset_lengths[aggstate->projected_set + 1]; + else + nextSetSize = 0; + + /*---------- + * If a subgroup for the current grouping set is present, project it. + * + * We have a new group if: + * - we're out of input but haven't projected all grouping sets + * (checked above) + * OR + * - we already projected a row that wasn't from the last grouping + * set + * AND + * - the next grouping set has at least one grouping column (since + * empty grouping sets project only once input is exhausted) + * AND + * - the previous and pending rows differ on the grouping columns + * of the next grouping set + *---------- + */ + tmpcontext->ecxt_innertuple = econtext->ecxt_outertuple; + if (aggstate->input_done || + (node->aggstrategy != AGG_PLAIN && + aggstate->projected_set != -1 && + aggstate->projected_set < (numGroupingSets - 1) && + nextSetSize > 0 && + !ExecQualAndReset(aggstate->phase->eqfunctions[nextSetSize - 1], + tmpcontext))) + { + aggstate->projected_set += 1; + + Assert(aggstate->projected_set < numGroupingSets); + Assert(nextSetSize > 0 || aggstate->input_done); + } + else + { + /* + * We no longer care what group we just projected, the next + * projection will always be the first (or only) grouping set + * (unless the input proves to be empty). + */ + aggstate->projected_set = 0; + + /* + * If we don't already have the first tuple of the new group, + * fetch it from the outer plan. + */ + if (aggstate->grp_firstTuple == NULL) + { + outerslot = fetch_input_tuple(aggstate); + if (!TupIsNull(outerslot)) + { + /* + * Make a copy of the first input tuple; we will use this + * for comparisons (in group mode) and for projection. + */ + aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + } + else + { + /* outer plan produced no tuples at all */ + if (hasGroupingSets) + { + /* + * If there was no input at all, we need to project + * rows only if there are grouping sets of size 0. + * Note that this implies that there can't be any + * references to ungrouped Vars, which would otherwise + * cause issues with the empty output slot. + * + * XXX: This is no longer true, we currently deal with + * this in finalize_aggregates(). 
+ */ + aggstate->input_done = true; + + while (aggstate->phase->gset_lengths[aggstate->projected_set] > 0) + { + aggstate->projected_set += 1; + if (aggstate->projected_set >= numGroupingSets) + { + /* + * We can't set agg_done here because we might + * have more phases to do, even though the + * input is empty. So we need to restart the + * whole outer loop. + */ + break; + } + } + + if (aggstate->projected_set >= numGroupingSets) + continue; + } + else + { + aggstate->agg_done = true; + /* If we are grouping, we should produce no tuples too */ + if (node->aggstrategy != AGG_PLAIN) + return NULL; + } + } + } + + /* + * Initialize working state for a new input tuple group. + */ + initialize_aggregates(aggstate, pergroups, numReset); + + if (aggstate->grp_firstTuple != NULL) + { + /* + * Store the copied first input tuple in the tuple table slot + * reserved for it. The tuple will be deleted when it is + * cleared from the slot. + */ + ExecForceStoreHeapTuple(aggstate->grp_firstTuple, + firstSlot, true); + aggstate->grp_firstTuple = NULL; /* don't keep two pointers */ + + /* set up for first advance_aggregates call */ + tmpcontext->ecxt_outertuple = firstSlot; + + /* + * Process each outer-plan tuple, and then fetch the next one, + * until we exhaust the outer plan or cross a group boundary. + */ + for (;;) + { + /* + * During phase 1 only of a mixed agg, we need to update + * hashtables as well in advance_aggregates. + */ + if (aggstate->aggstrategy == AGG_MIXED && + aggstate->current_phase == 1) + { + lookup_hash_entries(aggstate); + } + + /* Advance the aggregates (or combine functions) */ + advance_aggregates(aggstate); + + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(tmpcontext); + + outerslot = fetch_input_tuple(aggstate); + if (TupIsNull(outerslot)) + { + /* no more outer-plan tuples available */ + + /* if we built hash tables, finalize any spills */ + if (aggstate->aggstrategy == AGG_MIXED && + aggstate->current_phase == 1) + hashagg_finish_initial_spills(aggstate); + + if (hasGroupingSets) + { + aggstate->input_done = true; + break; + } + else + { + aggstate->agg_done = true; + break; + } + } + /* set up for next advance_aggregates call */ + tmpcontext->ecxt_outertuple = outerslot; + + /* + * If we are grouping, check whether we've crossed a group + * boundary. + */ + if (node->aggstrategy != AGG_PLAIN) + { + tmpcontext->ecxt_innertuple = firstSlot; + if (!ExecQual(aggstate->phase->eqfunctions[node->numCols - 1], + tmpcontext)) + { + aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + break; + } + } + } + } + + /* + * Use the representative input tuple for any references to + * non-aggregated input columns in aggregate direct args, the node + * qual, and the tlist. (If we are not grouping, and there are no + * input rows at all, we will come here with an empty firstSlot + * ... but if not grouping, there can't be any references to + * non-aggregated input columns, so no problem.) + */ + econtext->ecxt_outertuple = firstSlot; + } + + Assert(aggstate->projected_set >= 0); + + currentSet = aggstate->projected_set; + + prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet); + + select_current_set(aggstate, currentSet, false); + + finalize_aggregates(aggstate, + peragg, + pergroups[currentSet]); + + /* + * If there's no row to project right now, we must continue rather + * than returning a null since there might be more groups. 
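The inner loop above is the classic sorted-aggregation pattern: pin the first tuple of a group, advance the transition state while rows still match on the grouping columns, and finalize when the key changes or the input ends. A self-contained sketch of the same control flow over a plain array, with integer keys and a sum() stand-in for the transition machinery:

    /* Sketch of grouped aggregation over a pre-sorted input stream. */
    #include <stdio.h>

    typedef struct { int key; int val; } Row;

    int
    main(void)
    {
        Row rows[] = {{1, 10}, {1, 5}, {2, 7}, {2, 1}, {2, 2}};
        int nrows = sizeof(rows) / sizeof(rows[0]);
        int i = 0;

        while (i < nrows)
        {
            int  group_key = rows[i].key;   /* the "first tuple" of the group */
            long state = 0;                 /* transition state, reset per group */

            /* advance the aggregate until the grouping key changes */
            while (i < nrows && rows[i].key == group_key)
                state += rows[i++].val;

            printf("key=%d sum=%ld\n", group_key, state);   /* project the group */
        }
        return 0;
    }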
+ */ + result = project_aggregates(aggstate); + if (result) + return result; + } + + /* No more groups */ + return NULL; +} + +/* + * ExecAgg for hashed case: read input and build hash table + */ +static void +agg_fill_hash_table(AggState *aggstate) +{ + TupleTableSlot *outerslot; + ExprContext *tmpcontext = aggstate->tmpcontext; + + /* + * Process each outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. + */ + for (;;) + { + outerslot = fetch_input_tuple(aggstate); + if (TupIsNull(outerslot)) + break; + + /* set up for lookup_hash_entries and advance_aggregates */ + tmpcontext->ecxt_outertuple = outerslot; + + /* Find or build hashtable entries */ + lookup_hash_entries(aggstate); + + /* Advance the aggregates (or combine functions) */ + advance_aggregates(aggstate); + + /* + * Reset per-input-tuple context after each tuple, but note that the + * hash lookups do this too + */ + ResetExprContext(aggstate->tmpcontext); + } + + /* finalize spills, if any */ + hashagg_finish_initial_spills(aggstate); + + aggstate->table_filled = true; + /* Initialize to walk the first hash table */ + select_current_set(aggstate, 0, true); + ResetTupleHashIterator(aggstate->perhash[0].hashtable, + &aggstate->perhash[0].hashiter); +} + +/* + * If any data was spilled during hash aggregation, reset the hash table and + * reprocess one batch of spilled data. After reprocessing a batch, the hash + * table will again contain data, ready to be consumed by + * agg_retrieve_hash_table_in_memory(). + * + * Should only be called after all in memory hash table entries have been + * finalized and emitted. + * + * Return false when input is exhausted and there's no more work to be done; + * otherwise return true. + */ +static bool +agg_refill_hash_table(AggState *aggstate) +{ + HashAggBatch *batch; + AggStatePerHash perhash; + HashAggSpill spill; + HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo; + bool spill_initialized = false; + + if (aggstate->hash_batches == NIL) + return false; + + /* hash_batches is a stack, with the top item at the end of the list */ + batch = llast(aggstate->hash_batches); + aggstate->hash_batches = list_delete_last(aggstate->hash_batches); + + hash_agg_set_limits(aggstate->hashentrysize, batch->input_card, + batch->used_bits, &aggstate->hash_mem_limit, + &aggstate->hash_ngroups_limit, NULL); + + /* + * Each batch only processes one grouping set; set the rest to NULL so + * that advance_aggregates() knows to ignore them. We don't touch + * pergroups for sorted grouping sets here, because they will be needed if + * we rescan later. The expressions for sorted grouping sets will not be + * evaluated after we recompile anyway. + */ + MemSet(aggstate->hash_pergroup, 0, + sizeof(AggStatePerGroup) * aggstate->num_hashes); + + /* free memory and reset hash tables */ + ReScanExprContext(aggstate->hashcontext); + for (int setno = 0; setno < aggstate->num_hashes; setno++) + ResetTupleHashTable(aggstate->perhash[setno].hashtable); + + aggstate->hash_ngroups_current = 0; + + /* + * In AGG_MIXED mode, hash aggregation happens in phase 1 and the output + * happens in phase 0. So, we switch to phase 1 when processing a batch, + * and back to phase 0 after the batch is done. 
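hash_batches is consumed from the tail (llast() plus list_delete_last()), so batches created while refilling, including re-spills of the batch just processed, are handled before older ones; the effect is a depth-first walk of the recursive partitioning. The sketch below only demonstrates that ordering; the fixed-size stack is an illustration, not the backend's List API.

    /* Sketch of the LIFO batch order produced by the list-as-stack usage. */
    #include <stdio.h>

    #define MAXBATCH 16

    static int batches[MAXBATCH];
    static int nbatches = 0;

    static void push_batch(int id) { batches[nbatches++] = id; }
    static int  pop_batch(void)    { return batches[--nbatches]; }

    int
    main(void)
    {
        /* the initial pass spilled two partitions */
        push_batch(1);
        push_batch(2);

        while (nbatches > 0)
        {
            int id = pop_batch();

            printf("processing batch %d\n", id);
            if (id == 2)            /* pretend batch 2 had to re-spill */
            {
                push_batch(21);
                push_batch(22);
            }
        }
        return 0;                   /* order printed: 2, 22, 21, 1 */
    }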
+ */ + Assert(aggstate->current_phase == 0); + if (aggstate->phase->aggstrategy == AGG_MIXED) + { + aggstate->current_phase = 1; + aggstate->phase = &aggstate->phases[aggstate->current_phase]; + } + + select_current_set(aggstate, batch->setno, true); + + perhash = &aggstate->perhash[aggstate->current_set]; + + /* + * Spilled tuples are always read back as MinimalTuples, which may be + * different from the outer plan, so recompile the aggregate expressions. + * + * We still need the NULL check, because we are only processing one + * grouping set at a time and the rest will be NULL. + */ + hashagg_recompile_expressions(aggstate, true, true); + + for (;;) + { + TupleTableSlot *spillslot = aggstate->hash_spill_rslot; + TupleTableSlot *hashslot = perhash->hashslot; + TupleHashEntry entry; + MinimalTuple tuple; + uint32 hash; + bool isnew = false; + bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew; + + CHECK_FOR_INTERRUPTS(); + + tuple = hashagg_batch_read(batch, &hash); + if (tuple == NULL) + break; + + ExecStoreMinimalTuple(tuple, spillslot, true); + aggstate->tmpcontext->ecxt_outertuple = spillslot; + + prepare_hash_slot(perhash, + aggstate->tmpcontext->ecxt_outertuple, + hashslot); + entry = LookupTupleHashEntryHash( + perhash->hashtable, hashslot, p_isnew, hash); + + if (entry != NULL) + { + if (isnew) + initialize_hash_entry(aggstate, perhash->hashtable, entry); + aggstate->hash_pergroup[batch->setno] = entry->additional; + advance_aggregates(aggstate); + } + else + { + if (!spill_initialized) + { + /* + * Avoid initializing the spill until we actually need it so + * that we don't assign tapes that will never be used. + */ + spill_initialized = true; + hashagg_spill_init(&spill, tapeinfo, batch->used_bits, + batch->input_card, aggstate->hashentrysize); + } + /* no memory for a new group, spill */ + hashagg_spill_tuple(aggstate, &spill, spillslot, hash); + + aggstate->hash_pergroup[batch->setno] = NULL; + } + + /* + * Reset per-input-tuple context after each tuple, but note that the + * hash lookups do this too + */ + ResetExprContext(aggstate->tmpcontext); + } + + hashagg_tapeinfo_release(tapeinfo, batch->input_tapenum); + + /* change back to phase 0 */ + aggstate->current_phase = 0; + aggstate->phase = &aggstate->phases[aggstate->current_phase]; + + if (spill_initialized) + { + hashagg_spill_finish(aggstate, &spill, batch->setno); + hash_agg_update_metrics(aggstate, true, spill.npartitions); + } + else + hash_agg_update_metrics(aggstate, true, 0); + + aggstate->hash_spill_mode = false; + + /* prepare to walk the first hash table */ + select_current_set(aggstate, batch->setno, true); + ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable, + &aggstate->perhash[batch->setno].hashiter); + + pfree(batch); + + return true; +} + +/* + * ExecAgg for hashed case: retrieving groups from hash table + * + * After exhausting in-memory tuples, also try refilling the hash table using + * previously-spilled tuples. Only returns NULL after all in-memory and + * spilled tuples are exhausted. + */ +static TupleTableSlot * +agg_retrieve_hash_table(AggState *aggstate) +{ + TupleTableSlot *result = NULL; + + while (result == NULL) + { + result = agg_retrieve_hash_table_in_memory(aggstate); + if (result == NULL) + { + if (!agg_refill_hash_table(aggstate)) + { + aggstate->agg_done = true; + break; + } + } + } + + return result; +} + +/* + * Retrieve the groups from the in-memory hash tables without considering any + * spilled tuples. 
+ */ +static TupleTableSlot * +agg_retrieve_hash_table_in_memory(AggState *aggstate) +{ + ExprContext *econtext; + AggStatePerAgg peragg; + AggStatePerGroup pergroup; + TupleHashEntryData *entry; + TupleTableSlot *firstSlot; + TupleTableSlot *result; + AggStatePerHash perhash; + + /* + * get state info from node. + * + * econtext is the per-output-tuple expression context. + */ + econtext = aggstate->ss.ps.ps_ExprContext; + peragg = aggstate->peragg; + firstSlot = aggstate->ss.ss_ScanTupleSlot; + + /* + * Note that perhash (and therefore anything accessed through it) can + * change inside the loop, as we change between grouping sets. + */ + perhash = &aggstate->perhash[aggstate->current_set]; + + /* + * We loop retrieving groups until we find one satisfying + * aggstate->ss.ps.qual + */ + for (;;) + { + TupleTableSlot *hashslot = perhash->hashslot; + int i; + + CHECK_FOR_INTERRUPTS(); + + /* + * Find the next entry in the hash table + */ + entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter); + if (entry == NULL) + { + int nextset = aggstate->current_set + 1; + + if (nextset < aggstate->num_hashes) + { + /* + * Switch to next grouping set, reinitialize, and restart the + * loop. + */ + select_current_set(aggstate, nextset, true); + + perhash = &aggstate->perhash[aggstate->current_set]; + + ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter); + + continue; + } + else + { + return NULL; + } + } + + /* + * Clear the per-output-tuple context for each group + * + * We intentionally don't use ReScanExprContext here; if any aggs have + * registered shutdown callbacks, they mustn't be called yet, since we + * might not be done with that agg. + */ + ResetExprContext(econtext); + + /* + * Transform representative tuple back into one with the right + * columns. + */ + ExecStoreMinimalTuple(entry->firstTuple, hashslot, false); + slot_getallattrs(hashslot); + + ExecClearTuple(firstSlot); + memset(firstSlot->tts_isnull, true, + firstSlot->tts_tupleDescriptor->natts * sizeof(bool)); + + for (i = 0; i < perhash->numhashGrpCols; i++) + { + int varNumber = perhash->hashGrpColIdxInput[i] - 1; + + firstSlot->tts_values[varNumber] = hashslot->tts_values[i]; + firstSlot->tts_isnull[varNumber] = hashslot->tts_isnull[i]; + } + ExecStoreVirtualTuple(firstSlot); + + pergroup = (AggStatePerGroup) entry->additional; + + /* + * Use the representative input tuple for any references to + * non-aggregated input columns in the qual and tlist. + */ + econtext->ecxt_outertuple = firstSlot; + + prepare_projection_slot(aggstate, + econtext->ecxt_outertuple, + aggstate->current_set); + + finalize_aggregates(aggstate, peragg, pergroup); + + result = project_aggregates(aggstate); + if (result) + return result; + } + + /* No more groups */ + return NULL; +} + +/* + * Initialize HashTapeInfo + */ +static void +hashagg_tapeinfo_init(AggState *aggstate) +{ + HashTapeInfo *tapeinfo = palloc(sizeof(HashTapeInfo)); + int init_tapes = 16; /* expanded dynamically */ + + tapeinfo->tapeset = LogicalTapeSetCreate(init_tapes, true, NULL, NULL, -1); + tapeinfo->ntapes = init_tapes; + tapeinfo->nfreetapes = init_tapes; + tapeinfo->freetapes_alloc = init_tapes; + tapeinfo->freetapes = palloc(init_tapes * sizeof(int)); + for (int i = 0; i < init_tapes; i++) + tapeinfo->freetapes[i] = i; + + aggstate->hash_tapeinfo = tapeinfo; +} + +/* + * Assign unused tapes to spill partitions, extending the tape set if + * necessary. 
+ */ +static void +hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *partitions, + int npartitions) +{ + int partidx = 0; + + /* use free tapes if available */ + while (partidx < npartitions && tapeinfo->nfreetapes > 0) + partitions[partidx++] = tapeinfo->freetapes[--tapeinfo->nfreetapes]; + + if (partidx < npartitions) + { + LogicalTapeSetExtend(tapeinfo->tapeset, npartitions - partidx); + + while (partidx < npartitions) + partitions[partidx++] = tapeinfo->ntapes++; + } +} + +/* + * After a tape has already been written to and then read, this function + * rewinds it for writing and adds it to the free list. + */ +static void +hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum) +{ + /* rewinding frees the buffer while not in use */ + LogicalTapeRewindForWrite(tapeinfo->tapeset, tapenum); + if (tapeinfo->freetapes_alloc == tapeinfo->nfreetapes) + { + tapeinfo->freetapes_alloc <<= 1; + tapeinfo->freetapes = repalloc(tapeinfo->freetapes, + tapeinfo->freetapes_alloc * sizeof(int)); + } + tapeinfo->freetapes[tapeinfo->nfreetapes++] = tapenum; +} + +/* + * hashagg_spill_init + * + * Called after we determined that spilling is necessary. Chooses the number + * of partitions to create, and initializes them. + */ +static void +hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits, + double input_groups, double hashentrysize) +{ + int npartitions; + int partition_bits; + + npartitions = hash_choose_num_partitions(input_groups, hashentrysize, + used_bits, &partition_bits); + + spill->partitions = palloc0(sizeof(int) * npartitions); + spill->ntuples = palloc0(sizeof(int64) * npartitions); + spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions); + + hashagg_tapeinfo_assign(tapeinfo, spill->partitions, npartitions); + + spill->tapeset = tapeinfo->tapeset; + spill->shift = 32 - used_bits - partition_bits; + spill->mask = (npartitions - 1) << spill->shift; + spill->npartitions = npartitions; + + for (int i = 0; i < npartitions; i++) + initHyperLogLog(&spill->hll_card[i], HASHAGG_HLL_BIT_WIDTH); +} + +/* + * hashagg_spill_tuple + * + * No room for new groups in the hash table. Save for later in the appropriate + * partition. + */ +static Size +hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, + TupleTableSlot *inputslot, uint32 hash) +{ + LogicalTapeSet *tapeset = spill->tapeset; + TupleTableSlot *spillslot; + int partition; + MinimalTuple tuple; + int tapenum; + int total_written = 0; + bool shouldFree; + + Assert(spill->partitions != NULL); + + /* spill only attributes that we actually need */ + if (!aggstate->all_cols_needed) + { + spillslot = aggstate->hash_spill_wslot; + slot_getsomeattrs(inputslot, aggstate->max_colno_needed); + ExecClearTuple(spillslot); + for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++) + { + if (bms_is_member(i + 1, aggstate->colnos_needed)) + { + spillslot->tts_values[i] = inputslot->tts_values[i]; + spillslot->tts_isnull[i] = inputslot->tts_isnull[i]; + } + else + spillslot->tts_isnull[i] = true; + } + ExecStoreVirtualTuple(spillslot); + } + else + spillslot = inputslot; + + tuple = ExecFetchSlotMinimalTuple(spillslot, &shouldFree); + + partition = (hash & spill->mask) >> spill->shift; + spill->ntuples[partition]++; + + /* + * All hash values destined for a given partition have some bits in + * common, which causes bad HLL cardinality estimates. Hash the hash to + * get a more uniform distribution. 
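The mask and shift computed in hashagg_spill_init() slice a partition selector out of the 32-bit hash just below the bits that parent batches have already consumed. A short worked example of that bit arithmetic, with arbitrary used_bits, partition_bits and hash values:

    /* Sketch of the hash-bit slicing used to route a spilled tuple to a partition. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        int      used_bits = 4;         /* high bits consumed by the parent batch */
        int      partition_bits = 3;    /* 8 partitions at this level */
        int      shift = 32 - used_bits - partition_bits;
        uint32_t mask = ((1u << partition_bits) - 1) << shift;
        uint32_t hash = 0xDEADBEEF;
        int      partition = (hash & mask) >> shift;

        printf("shift=%d mask=0x%08x partition=%d\n", shift, mask, partition);
        return 0;
    }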
+ */ + addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash)); + + tapenum = spill->partitions[partition]; + + LogicalTapeWrite(tapeset, tapenum, (void *) &hash, sizeof(uint32)); + total_written += sizeof(uint32); + + LogicalTapeWrite(tapeset, tapenum, (void *) tuple, tuple->t_len); + total_written += tuple->t_len; + + if (shouldFree) + pfree(tuple); + + return total_written; +} + +/* + * hashagg_batch_new + * + * Construct a HashAggBatch item, which represents one iteration of HashAgg to + * be done. + */ +static HashAggBatch * +hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno, + int64 input_tuples, double input_card, int used_bits) +{ + HashAggBatch *batch = palloc0(sizeof(HashAggBatch)); + + batch->setno = setno; + batch->used_bits = used_bits; + batch->tapeset = tapeset; + batch->input_tapenum = tapenum; + batch->input_tuples = input_tuples; + batch->input_card = input_card; + + return batch; +} + +/* + * read_spilled_tuple + * read the next tuple from a batch's tape. Return NULL if no more. + */ +static MinimalTuple +hashagg_batch_read(HashAggBatch *batch, uint32 *hashp) +{ + LogicalTapeSet *tapeset = batch->tapeset; + int tapenum = batch->input_tapenum; + MinimalTuple tuple; + uint32 t_len; + size_t nread; + uint32 hash; + + nread = LogicalTapeRead(tapeset, tapenum, &hash, sizeof(uint32)); + if (nread == 0) + return NULL; + if (nread != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes", + tapenum, sizeof(uint32), nread))); + if (hashp != NULL) + *hashp = hash; + + nread = LogicalTapeRead(tapeset, tapenum, &t_len, sizeof(t_len)); + if (nread != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes", + tapenum, sizeof(uint32), nread))); + + tuple = (MinimalTuple) palloc(t_len); + tuple->t_len = t_len; + + nread = LogicalTapeRead(tapeset, tapenum, + (void *) ((char *) tuple + sizeof(uint32)), + t_len - sizeof(uint32)); + if (nread != t_len - sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes", + tapenum, t_len - sizeof(uint32), nread))); + + return tuple; +} + +/* + * hashagg_finish_initial_spills + * + * After a HashAggBatch has been processed, it may have spilled tuples to + * disk. If so, turn the spilled partitions into new batches that must later + * be executed. + */ +static void +hashagg_finish_initial_spills(AggState *aggstate) +{ + int setno; + int total_npartitions = 0; + + if (aggstate->hash_spills != NULL) + { + for (setno = 0; setno < aggstate->num_hashes; setno++) + { + HashAggSpill *spill = &aggstate->hash_spills[setno]; + + total_npartitions += spill->npartitions; + hashagg_spill_finish(aggstate, spill, setno); + } + + /* + * We're not processing tuples from outer plan any more; only + * processing batches of spilled tuples. The initial spill structures + * are no longer needed. + */ + pfree(aggstate->hash_spills); + aggstate->hash_spills = NULL; + } + + hash_agg_update_metrics(aggstate, false, total_npartitions); + aggstate->hash_spill_mode = false; +} + +/* + * hashagg_spill_finish + * + * Transform spill partitions into new batches. 
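hashagg_spill_tuple() and hashagg_batch_read() agree on a simple record layout: a uint32 hash followed by the MinimalTuple image, whose own leading uint32 is its total length, which is why the reader pulls the length first and then the remaining t_len - sizeof(uint32) bytes. A standalone sketch of that symmetry, with a flat byte buffer standing in for the logical tape:

    /* Sketch of the spill record layout: [hash][t_len][rest of tuple image]. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
        const char    payload[] = "tuple body";
        uint32_t      t_len = sizeof(uint32_t) + sizeof(payload); /* length word + body */
        uint32_t      hash = 0xC0FFEEu;
        unsigned char tape[128];
        size_t        pos = 0;

        /* "write": hash, then the tuple image (length word followed by body) */
        memcpy(tape + pos, &hash, sizeof(hash));      pos += sizeof(hash);
        memcpy(tape + pos, &t_len, sizeof(t_len));    pos += sizeof(t_len);
        memcpy(tape + pos, payload, sizeof(payload)); pos += sizeof(payload);

        /* "read": mirror the same three steps */
        pos = 0;
        uint32_t rhash, rlen;
        memcpy(&rhash, tape + pos, sizeof(rhash));    pos += sizeof(rhash);
        memcpy(&rlen, tape + pos, sizeof(rlen));      pos += sizeof(rlen);

        char *body = malloc(rlen - sizeof(uint32_t));
        memcpy(body, tape + pos, rlen - sizeof(uint32_t));

        printf("hash=0x%x len=%u body=%s\n", rhash, rlen, body);
        free(body);
        return 0;
    }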
+ */ +static void +hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno) +{ + int i; + int used_bits = 32 - spill->shift; + + if (spill->npartitions == 0) + return; /* didn't spill */ + + for (i = 0; i < spill->npartitions; i++) + { + LogicalTapeSet *tapeset = aggstate->hash_tapeinfo->tapeset; + int tapenum = spill->partitions[i]; + HashAggBatch *new_batch; + double cardinality; + + /* if the partition is empty, don't create a new batch of work */ + if (spill->ntuples[i] == 0) + continue; + + cardinality = estimateHyperLogLog(&spill->hll_card[i]); + freeHyperLogLog(&spill->hll_card[i]); + + /* rewinding frees the buffer while not in use */ + LogicalTapeRewindForRead(tapeset, tapenum, + HASHAGG_READ_BUFFER_SIZE); + + new_batch = hashagg_batch_new(tapeset, tapenum, setno, + spill->ntuples[i], cardinality, + used_bits); + aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch); + aggstate->hash_batches_used++; + } + + pfree(spill->ntuples); + pfree(spill->hll_card); + pfree(spill->partitions); +} + +/* + * Free resources related to a spilled HashAgg. + */ +static void +hashagg_reset_spill_state(AggState *aggstate) +{ + /* free spills from initial pass */ + if (aggstate->hash_spills != NULL) + { + int setno; + + for (setno = 0; setno < aggstate->num_hashes; setno++) + { + HashAggSpill *spill = &aggstate->hash_spills[setno]; + + pfree(spill->ntuples); + pfree(spill->partitions); + } + pfree(aggstate->hash_spills); + aggstate->hash_spills = NULL; + } + + /* free batches */ + list_free_deep(aggstate->hash_batches); + aggstate->hash_batches = NIL; + + /* close tape set */ + if (aggstate->hash_tapeinfo != NULL) + { + HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo; + + LogicalTapeSetClose(tapeinfo->tapeset); + pfree(tapeinfo->freetapes); + pfree(tapeinfo); + aggstate->hash_tapeinfo = NULL; + } +} + + +/* ----------------- + * ExecInitAgg + * + * Creates the run-time information for the agg node produced by the + * planner and initializes its outer subtree. 
+ * + * ----------------- + */ +AggState * +ExecInitAgg(Agg *node, EState *estate, int eflags) +{ + AggState *aggstate; + AggStatePerAgg peraggs; + AggStatePerTrans pertransstates; + AggStatePerGroup *pergroups; + Plan *outerPlan; + ExprContext *econtext; + TupleDesc scanDesc; + int max_aggno; + int max_transno; + int numaggrefs; + int numaggs; + int numtrans; + int phase; + int phaseidx; + ListCell *l; + Bitmapset *all_grouped_cols = NULL; + int numGroupingSets = 1; + int numPhases; + int numHashes; + int i = 0; + int j = 0; + bool use_hashing = (node->aggstrategy == AGG_HASHED || + node->aggstrategy == AGG_MIXED); + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + aggstate = makeNode(AggState); + aggstate->ss.ps.plan = (Plan *) node; + aggstate->ss.ps.state = estate; + aggstate->ss.ps.ExecProcNode = ExecAgg; + + aggstate->aggs = NIL; + aggstate->numaggs = 0; + aggstate->numtrans = 0; + aggstate->aggstrategy = node->aggstrategy; + aggstate->aggsplit = node->aggsplit; + aggstate->maxsets = 0; + aggstate->projected_set = -1; + aggstate->current_set = 0; + aggstate->peragg = NULL; + aggstate->pertrans = NULL; + aggstate->curperagg = NULL; + aggstate->curpertrans = NULL; + aggstate->input_done = false; + aggstate->agg_done = false; + aggstate->pergroups = NULL; + aggstate->grp_firstTuple = NULL; + aggstate->sort_in = NULL; + aggstate->sort_out = NULL; + + /* + * phases[0] always exists, but is dummy in sorted/plain mode + */ + numPhases = (use_hashing ? 1 : 2); + numHashes = (use_hashing ? 1 : 0); + + /* + * Calculate the maximum number of grouping sets in any phase; this + * determines the size of some allocations. Also calculate the number of + * phases, since all hashed/mixed nodes contribute to only a single phase. + */ + if (node->groupingSets) + { + numGroupingSets = list_length(node->groupingSets); + + foreach(l, node->chain) + { + Agg *agg = lfirst(l); + + numGroupingSets = Max(numGroupingSets, + list_length(agg->groupingSets)); + + /* + * additional AGG_HASHED aggs become part of phase 0, but all + * others add an extra phase. + */ + if (agg->aggstrategy != AGG_HASHED) + ++numPhases; + else + ++numHashes; + } + } + + aggstate->maxsets = numGroupingSets; + aggstate->numphases = numPhases; + + aggstate->aggcontexts = (ExprContext **) + palloc0(sizeof(ExprContext *) * numGroupingSets); + + /* + * Create expression contexts. We need three or more, one for + * per-input-tuple processing, one for per-output-tuple processing, one + * for all the hashtables, and one for each grouping set. The per-tuple + * memory context of the per-grouping-set ExprContexts (aggcontexts) + * replaces the standalone memory context formerly used to hold transition + * values. We cheat a little by using ExecAssignExprContext() to build + * all of them. + * + * NOTE: the details of what is stored in aggcontexts and what is stored + * in the regular per-query memory context are driven by a simple + * decision: we want to reset the aggcontext at group boundaries (if not + * hashing) and in ExecReScanAgg to recover no-longer-wanted space. 
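The phase/hash bookkeeping reduces to one rule: every AGG_HASHED member of the chain joins phase 0, everything else adds a sort phase. A small sketch of that counting, using a local strategy enum purely for illustration:

    /* Sketch of the numPhases/numHashes counting rule in ExecInitAgg(). */
    #include <stdio.h>

    typedef enum { SK_PLAIN, SK_SORTED, SK_HASHED, SK_MIXED } SketchStrategy;

    int
    main(void)
    {
        /* head node is AGG_MIXED; the chain holds two hashed and one sorted Agg */
        SketchStrategy head = SK_MIXED;
        SketchStrategy chain[] = {SK_HASHED, SK_HASHED, SK_SORTED};
        int nchain = sizeof(chain) / sizeof(chain[0]);
        int use_hashing = (head == SK_HASHED || head == SK_MIXED);
        int numPhases = use_hashing ? 1 : 2;    /* phase 0 is dummy when not hashing */
        int numHashes = use_hashing ? 1 : 0;

        for (int i = 0; i < nchain; i++)
        {
            if (chain[i] != SK_HASHED)
                numPhases++;        /* each non-hashed chain member adds a phase */
            else
                numHashes++;        /* hashed members all fold into phase 0 */
        }
        printf("numPhases=%d numHashes=%d\n", numPhases, numHashes);   /* 2 and 3 */
        return 0;
    }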
+ */ + ExecAssignExprContext(estate, &aggstate->ss.ps); + aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext; + + for (i = 0; i < numGroupingSets; ++i) + { + ExecAssignExprContext(estate, &aggstate->ss.ps); + aggstate->aggcontexts[i] = aggstate->ss.ps.ps_ExprContext; + } + + if (use_hashing) + aggstate->hashcontext = CreateWorkExprContext(estate); + + ExecAssignExprContext(estate, &aggstate->ss.ps); + + /* + * Initialize child nodes. + * + * If we are doing a hashed aggregation then the child plan does not need + * to handle REWIND efficiently; see ExecReScanAgg. + */ + if (node->aggstrategy == AGG_HASHED) + eflags &= ~EXEC_FLAG_REWIND; + outerPlan = outerPlan(node); + outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * initialize source tuple type. + */ + aggstate->ss.ps.outerops = + ExecGetResultSlotOps(outerPlanState(&aggstate->ss), + &aggstate->ss.ps.outeropsfixed); + aggstate->ss.ps.outeropsset = true; + + ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss, + aggstate->ss.ps.outerops); + scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + + /* + * If there are more than two phases (including a potential dummy phase + * 0), input will be resorted using tuplesort. Need a slot for that. + */ + if (numPhases > 2) + { + aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + + /* + * The output of the tuplesort, and the output from the outer child + * might not use the same type of slot. In most cases the child will + * be a Sort, and thus return a TTSOpsMinimalTuple type slot - but the + * input can also be presorted due an index, in which case it could be + * a different type of slot. + * + * XXX: For efficiency it would be good to instead/additionally + * generate expressions with corresponding settings of outerops* for + * the individual phases - deforming is often a bottleneck for + * aggregations with lots of rows per group. If there's multiple + * sorts, we know that all but the first use TTSOpsMinimalTuple (via + * the nodeAgg.c internal tuplesort). + */ + if (aggstate->ss.ps.outeropsfixed && + aggstate->ss.ps.outerops != &TTSOpsMinimalTuple) + aggstate->ss.ps.outeropsfixed = false; + } + + /* + * Initialize result type, slot and projection. + */ + ExecInitResultTupleSlotTL(&aggstate->ss.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&aggstate->ss.ps, NULL); + + /* + * initialize child expressions + * + * We expect the parser to have checked that no aggs contain other agg + * calls in their arguments (and just to be sure, we verify it again while + * initializing the plan node). This would make no sense under SQL + * semantics, and it's forbidden by the spec. Because it is true, we + * don't need to worry about evaluating the aggs in any particular order. + * + * Note: execExpr.c finds Aggrefs for us, and adds them to aggstate->aggs. + * Aggrefs in the qual are found here; Aggrefs in the targetlist are found + * during ExecAssignProjectionInfo, above. + */ + aggstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) aggstate); + + /* + * We should now have found all Aggrefs in the targetlist and quals. 
+ */ + numaggrefs = list_length(aggstate->aggs); + max_aggno = -1; + max_transno = -1; + foreach(l, aggstate->aggs) + { + Aggref *aggref = (Aggref *) lfirst(l); + + max_aggno = Max(max_aggno, aggref->aggno); + max_transno = Max(max_transno, aggref->aggtransno); + } + numaggs = max_aggno + 1; + numtrans = max_transno + 1; + + /* + * For each phase, prepare grouping set data and fmgr lookup data for + * compare functions. Accumulate all_grouped_cols in passing. + */ + aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData)); + + aggstate->num_hashes = numHashes; + if (numHashes) + { + aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes); + aggstate->phases[0].numsets = 0; + aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int)); + aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *)); + } + + phase = 0; + for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx) + { + Agg *aggnode; + Sort *sortnode; + + if (phaseidx > 0) + { + aggnode = list_nth_node(Agg, node->chain, phaseidx - 1); + sortnode = castNode(Sort, aggnode->plan.lefttree); + } + else + { + aggnode = node; + sortnode = NULL; + } + + Assert(phase <= 1 || sortnode); + + if (aggnode->aggstrategy == AGG_HASHED + || aggnode->aggstrategy == AGG_MIXED) + { + AggStatePerPhase phasedata = &aggstate->phases[0]; + AggStatePerHash perhash; + Bitmapset *cols = NULL; + + Assert(phase == 0); + i = phasedata->numsets++; + perhash = &aggstate->perhash[i]; + + /* phase 0 always points to the "real" Agg in the hash case */ + phasedata->aggnode = node; + phasedata->aggstrategy = node->aggstrategy; + + /* but the actual Agg node representing this hash is saved here */ + perhash->aggnode = aggnode; + + phasedata->gset_lengths[i] = perhash->numCols = aggnode->numCols; + + for (j = 0; j < aggnode->numCols; ++j) + cols = bms_add_member(cols, aggnode->grpColIdx[j]); + + phasedata->grouped_cols[i] = cols; + + all_grouped_cols = bms_add_members(all_grouped_cols, cols); + continue; + } + else + { + AggStatePerPhase phasedata = &aggstate->phases[++phase]; + int num_sets; + + phasedata->numsets = num_sets = list_length(aggnode->groupingSets); + + if (num_sets) + { + phasedata->gset_lengths = palloc(num_sets * sizeof(int)); + phasedata->grouped_cols = palloc(num_sets * sizeof(Bitmapset *)); + + i = 0; + foreach(l, aggnode->groupingSets) + { + int current_length = list_length(lfirst(l)); + Bitmapset *cols = NULL; + + /* planner forces this to be correct */ + for (j = 0; j < current_length; ++j) + cols = bms_add_member(cols, aggnode->grpColIdx[j]); + + phasedata->grouped_cols[i] = cols; + phasedata->gset_lengths[i] = current_length; + + ++i; + } + + all_grouped_cols = bms_add_members(all_grouped_cols, + phasedata->grouped_cols[0]); + } + else + { + Assert(phaseidx == 0); + + phasedata->gset_lengths = NULL; + phasedata->grouped_cols = NULL; + } + + /* + * If we are grouping, precompute fmgr lookup data for inner loop. + */ + if (aggnode->aggstrategy == AGG_SORTED) + { + int i = 0; + + Assert(aggnode->numCols > 0); + + /* + * Build a separate function for each subset of columns that + * need to be compared. 
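For AGG_SORTED phases only one equality expression is built per distinct grouping-set prefix length, stored at eqfunctions[length - 1], so sets that group on the same number of leading columns share a comparator. A toy version of that bookkeeping, with plain function pointers standing in for ExprStates:

    /* Sketch: one comparator per distinct grouping-set prefix length. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define NUMCOLS 3

    typedef bool (*eqfn) (const int *a, const int *b);

    static bool eq_prefix1(const int *a, const int *b) { return a[0] == b[0]; }
    static bool eq_prefix2(const int *a, const int *b) { return a[0] == b[0] && a[1] == b[1]; }
    static bool eq_prefix3(const int *a, const int *b) { return a[0] == b[0] && a[1] == b[1] && a[2] == b[2]; }

    int
    main(void)
    {
        /* grouping sets of lengths 3, 2 and 2: only slots for lengths 2 and 3 get built */
        int  gset_lengths[] = {3, 2, 2};
        eqfn eqfunctions[NUMCOLS] = {NULL, NULL, NULL};
        eqfn builders[NUMCOLS] = {eq_prefix1, eq_prefix2, eq_prefix3};

        for (size_t i = 0; i < sizeof(gset_lengths) / sizeof(gset_lengths[0]); i++)
        {
            int len = gset_lengths[i];

            if (eqfunctions[len - 1] == NULL)
                eqfunctions[len - 1] = builders[len - 1];
        }
        /* the all-columns comparator is always wanted as well */
        if (eqfunctions[NUMCOLS - 1] == NULL)
            eqfunctions[NUMCOLS - 1] = builders[NUMCOLS - 1];

        for (int len = 1; len <= NUMCOLS; len++)
            printf("prefix length %d: %s\n", len,
                   eqfunctions[len - 1] ? "built" : "not needed");
        return 0;
    }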
+ */ + phasedata->eqfunctions = + (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *)); + + /* for each grouping set */ + for (i = 0; i < phasedata->numsets; i++) + { + int length = phasedata->gset_lengths[i]; + + if (phasedata->eqfunctions[length - 1] != NULL) + continue; + + phasedata->eqfunctions[length - 1] = + execTuplesMatchPrepare(scanDesc, + length, + aggnode->grpColIdx, + aggnode->grpOperators, + aggnode->grpCollations, + (PlanState *) aggstate); + } + + /* and for all grouped columns, unless already computed */ + if (phasedata->eqfunctions[aggnode->numCols - 1] == NULL) + { + phasedata->eqfunctions[aggnode->numCols - 1] = + execTuplesMatchPrepare(scanDesc, + aggnode->numCols, + aggnode->grpColIdx, + aggnode->grpOperators, + aggnode->grpCollations, + (PlanState *) aggstate); + } + } + + phasedata->aggnode = aggnode; + phasedata->aggstrategy = aggnode->aggstrategy; + phasedata->sortnode = sortnode; + } + } + + /* + * Convert all_grouped_cols to a descending-order list. + */ + i = -1; + while ((i = bms_next_member(all_grouped_cols, i)) >= 0) + aggstate->all_grouped_cols = lcons_int(i, aggstate->all_grouped_cols); + + /* + * Set up aggregate-result storage in the output expr context, and also + * allocate my private per-agg working storage + */ + econtext = aggstate->ss.ps.ps_ExprContext; + econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs); + econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs); + + peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs); + pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans); + + aggstate->peragg = peraggs; + aggstate->pertrans = pertransstates; + + + aggstate->all_pergroups = + (AggStatePerGroup *) palloc0(sizeof(AggStatePerGroup) + * (numGroupingSets + numHashes)); + pergroups = aggstate->all_pergroups; + + if (node->aggstrategy != AGG_HASHED) + { + for (i = 0; i < numGroupingSets; i++) + { + pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData) + * numaggs); + } + + aggstate->pergroups = pergroups; + pergroups += numGroupingSets; + } + + /* + * Hashing can only appear in the initial phase. + */ + if (use_hashing) + { + Plan *outerplan = outerPlan(node); + uint64 totalGroups = 0; + int i; + + aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg meta context", + ALLOCSET_DEFAULT_SIZES); + aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsVirtual); + + /* this is an array of pointers, not structures */ + aggstate->hash_pergroup = pergroups; + + aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans, + outerplan->plan_width, + node->transitionSpace); + + /* + * Consider all of the grouping sets together when setting the limits + * and estimating the number of partitions. This can be inaccurate + * when there is more than one grouping set, but should still be + * reasonable. 
+ */ + for (i = 0; i < aggstate->num_hashes; i++) + totalGroups += aggstate->perhash[i].aggnode->numGroups; + + hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0, + &aggstate->hash_mem_limit, + &aggstate->hash_ngroups_limit, + &aggstate->hash_planned_partitions); + find_hash_columns(aggstate); + + /* Skip massive memory allocation if we are just doing EXPLAIN */ + if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + build_hash_tables(aggstate); + + aggstate->table_filled = false; + + /* Initialize this to 1, meaning nothing spilled, yet */ + aggstate->hash_batches_used = 1; + } + + /* + * Initialize current phase-dependent values to initial phase. The initial + * phase is 1 (first sort pass) for all strategies that use sorting (if + * hashing is being done too, then phase 0 is processed last); but if only + * hashing is being done, then phase 0 is all there is. + */ + if (node->aggstrategy == AGG_HASHED) + { + aggstate->current_phase = 0; + initialize_phase(aggstate, 0); + select_current_set(aggstate, 0, true); + } + else + { + aggstate->current_phase = 1; + initialize_phase(aggstate, 1); + select_current_set(aggstate, 0, false); + } + + /* + * Perform lookups of aggregate function info, and initialize the + * unchanging fields of the per-agg and per-trans data. + */ + foreach(l, aggstate->aggs) + { + Aggref *aggref = lfirst(l); + AggStatePerAgg peragg; + AggStatePerTrans pertrans; + Oid inputTypes[FUNC_MAX_ARGS]; + int numArguments; + int numDirectArgs; + HeapTuple aggTuple; + Form_pg_aggregate aggform; + AclResult aclresult; + Oid finalfn_oid; + Oid serialfn_oid, + deserialfn_oid; + Oid aggOwner; + Expr *finalfnexpr; + Oid aggtranstype; + + /* Planner should have assigned aggregate to correct level */ + Assert(aggref->agglevelsup == 0); + /* ... and the split mode should match */ + Assert(aggref->aggsplit == aggstate->aggsplit); + + peragg = &peraggs[aggref->aggno]; + + /* Check if we initialized the state for this aggregate already. */ + if (peragg->aggref != NULL) + continue; + + peragg->aggref = aggref; + peragg->transno = aggref->aggtransno; + + /* Fetch the pg_aggregate row */ + aggTuple = SearchSysCache1(AGGFNOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", + aggref->aggfnoid); + aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); + + /* Check permission to call aggregate function */ + aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_AGGREGATE, + get_func_name(aggref->aggfnoid)); + InvokeFunctionExecuteHook(aggref->aggfnoid); + + /* planner recorded transition state type in the Aggref itself */ + aggtranstype = aggref->aggtranstype; + Assert(OidIsValid(aggtranstype)); + + /* Final function only required if we're finalizing the aggregates */ + if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)) + peragg->finalfn_oid = finalfn_oid = InvalidOid; + else + peragg->finalfn_oid = finalfn_oid = aggform->aggfinalfn; + + serialfn_oid = InvalidOid; + deserialfn_oid = InvalidOid; + + /* + * Check if serialization/deserialization is required. We only do it + * for aggregates that have transtype INTERNAL. + */ + if (aggtranstype == INTERNALOID) + { + /* + * The planner should only have generated a serialize agg node if + * every aggregate with an INTERNAL state has a serialization + * function. Verify that. 
+ */ + if (DO_AGGSPLIT_SERIALIZE(aggstate->aggsplit)) + { + /* serialization only valid when not running finalfn */ + Assert(DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); + + if (!OidIsValid(aggform->aggserialfn)) + elog(ERROR, "serialfunc not provided for serialization aggregation"); + serialfn_oid = aggform->aggserialfn; + } + + /* Likewise for deserialization functions */ + if (DO_AGGSPLIT_DESERIALIZE(aggstate->aggsplit)) + { + /* deserialization only valid when combining states */ + Assert(DO_AGGSPLIT_COMBINE(aggstate->aggsplit)); + + if (!OidIsValid(aggform->aggdeserialfn)) + elog(ERROR, "deserialfunc not provided for deserialization aggregation"); + deserialfn_oid = aggform->aggdeserialfn; + } + } + + /* Check that aggregate owner has permission to call component fns */ + { + HeapTuple procTuple; + + procTuple = SearchSysCache1(PROCOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(procTuple)) + elog(ERROR, "cache lookup failed for function %u", + aggref->aggfnoid); + aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner; + ReleaseSysCache(procTuple); + + if (OidIsValid(finalfn_oid)) + { + aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(finalfn_oid)); + InvokeFunctionExecuteHook(finalfn_oid); + } + if (OidIsValid(serialfn_oid)) + { + aclresult = pg_proc_aclcheck(serialfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(serialfn_oid)); + InvokeFunctionExecuteHook(serialfn_oid); + } + if (OidIsValid(deserialfn_oid)) + { + aclresult = pg_proc_aclcheck(deserialfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(deserialfn_oid)); + InvokeFunctionExecuteHook(deserialfn_oid); + } + } + + /* + * Get actual datatypes of the (nominal) aggregate inputs. These + * could be different from the agg's declared input types, when the + * agg accepts ANY or a polymorphic type. + */ + numArguments = get_aggregate_argtypes(aggref, inputTypes); + + /* Count the "direct" arguments, if any */ + numDirectArgs = list_length(aggref->aggdirectargs); + + /* Detect how many arguments to pass to the finalfn */ + if (aggform->aggfinalextra) + peragg->numFinalArgs = numArguments + 1; + else + peragg->numFinalArgs = numDirectArgs + 1; + + /* Initialize any direct-argument expressions */ + peragg->aggdirectargs = ExecInitExprList(aggref->aggdirectargs, + (PlanState *) aggstate); + + /* + * build expression trees using actual argument & result types for the + * finalfn, if it exists and is required. + */ + if (OidIsValid(finalfn_oid)) + { + build_aggregate_finalfn_expr(inputTypes, + peragg->numFinalArgs, + aggtranstype, + aggref->aggtype, + aggref->inputcollid, + finalfn_oid, + &finalfnexpr); + fmgr_info(finalfn_oid, &peragg->finalfn); + fmgr_info_set_expr((Node *) finalfnexpr, &peragg->finalfn); + } + + /* get info about the output value's datatype */ + get_typlenbyval(aggref->aggtype, + &peragg->resulttypeLen, + &peragg->resulttypeByVal); + + /* + * Build working state for invoking the transition function, if we + * haven't done it already. 
+ */ + pertrans = &pertransstates[aggref->aggtransno]; + if (pertrans->aggref == NULL) + { + Datum textInitVal; + Datum initValue; + bool initValueIsNull; + Oid transfn_oid; + + /* + * If this aggregation is performing state combines, then instead + * of using the transition function, we'll use the combine + * function + */ + if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit)) + { + transfn_oid = aggform->aggcombinefn; + + /* If not set then the planner messed up */ + if (!OidIsValid(transfn_oid)) + elog(ERROR, "combinefn not set for aggregate function"); + } + else + transfn_oid = aggform->aggtransfn; + + aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(transfn_oid)); + InvokeFunctionExecuteHook(transfn_oid); + + /* + * initval is potentially null, so don't try to access it as a + * struct field. Must do it the hard way with SysCacheGetAttr. + */ + textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, + Anum_pg_aggregate_agginitval, + &initValueIsNull); + if (initValueIsNull) + initValue = (Datum) 0; + else + initValue = GetAggInitVal(textInitVal, aggtranstype); + + build_pertrans_for_aggref(pertrans, aggstate, estate, + aggref, transfn_oid, aggtranstype, + serialfn_oid, deserialfn_oid, + initValue, initValueIsNull, + inputTypes, numArguments); + } + else + pertrans->aggshared = true; + ReleaseSysCache(aggTuple); + } + + /* + * Update aggstate->numaggs to be the number of unique aggregates found. + * Also set numstates to the number of unique transition states found. + */ + aggstate->numaggs = numaggs; + aggstate->numtrans = numtrans; + + /* + * Last, check whether any more aggregates got added onto the node while + * we processed the expressions for the aggregate arguments (including not + * only the regular arguments and FILTER expressions handled immediately + * above, but any direct arguments we might've handled earlier). If so, + * we have nested aggregate functions, which is semantically nonsensical, + * so complain. (This should have been caught by the parser, so we don't + * need to work hard on a helpful error message; but we defend against it + * here anyway, just to be sure.) + */ + if (numaggrefs != list_length(aggstate->aggs)) + ereport(ERROR, + (errcode(ERRCODE_GROUPING_ERROR), + errmsg("aggregate function calls cannot be nested"))); + + /* + * Build expressions doing all the transition work at once. We build a + * different one for each phase, as the number of transition function + * invocation can differ between phases. Note this'll work both for + * transition and combination functions (although there'll only be one + * phase in the latter case). + */ + for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++) + { + AggStatePerPhase phase = &aggstate->phases[phaseidx]; + bool dohash = false; + bool dosort = false; + + /* phase 0 doesn't necessarily exist */ + if (!phase->aggnode) + continue; + + if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 1) + { + /* + * Phase one, and only phase one, in a mixed agg performs both + * sorting and aggregation. + */ + dohash = true; + dosort = true; + } + else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0) + { + /* + * No need to compute a transition function for an AGG_MIXED phase + * 0 - the contents of the hashtables will have been computed + * during phase 1. 
+ */ + continue; + } + else if (phase->aggstrategy == AGG_PLAIN || + phase->aggstrategy == AGG_SORTED) + { + dohash = false; + dosort = true; + } + else if (phase->aggstrategy == AGG_HASHED) + { + dohash = true; + dosort = false; + } + else + Assert(false); + + phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash, + false); + + /* cache compiled expression for outer slot without NULL check */ + phase->evaltrans_cache[0][0] = phase->evaltrans; + } + + return aggstate; +} + +/* + * Build the state needed to calculate a state value for an aggregate. + * + * This initializes all the fields in 'pertrans'. 'aggref' is the aggregate + * to initialize the state for. 'aggtransfn', 'aggtranstype', and the rest + * of the arguments could be calculated from 'aggref', but the caller has + * calculated them already, so might as well pass them. + */ +static void +build_pertrans_for_aggref(AggStatePerTrans pertrans, + AggState *aggstate, EState *estate, + Aggref *aggref, + Oid aggtransfn, Oid aggtranstype, + Oid aggserialfn, Oid aggdeserialfn, + Datum initValue, bool initValueIsNull, + Oid *inputTypes, int numArguments) +{ + int numGroupingSets = Max(aggstate->maxsets, 1); + Expr *serialfnexpr = NULL; + Expr *deserialfnexpr = NULL; + ListCell *lc; + int numInputs; + int numDirectArgs; + List *sortlist; + int numSortCols; + int numDistinctCols; + int i; + + /* Begin filling in the pertrans data */ + pertrans->aggref = aggref; + pertrans->aggshared = false; + pertrans->aggCollation = aggref->inputcollid; + pertrans->transfn_oid = aggtransfn; + pertrans->serialfn_oid = aggserialfn; + pertrans->deserialfn_oid = aggdeserialfn; + pertrans->initValue = initValue; + pertrans->initValueIsNull = initValueIsNull; + + /* Count the "direct" arguments, if any */ + numDirectArgs = list_length(aggref->aggdirectargs); + + /* Count the number of aggregated input columns */ + pertrans->numInputs = numInputs = list_length(aggref->args); + + pertrans->aggtranstype = aggtranstype; + + /* + * When combining states, we have no use at all for the aggregate + * function's transfn. Instead we use the combinefn. In this case, the + * transfn and transfn_oid fields of pertrans refer to the combine + * function rather than the transition function. + */ + if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit)) + { + Expr *combinefnexpr; + size_t numTransArgs; + + /* + * When combining there's only one input, the to-be-combined added + * transition value from below (this node's transition value is + * counted separately). + */ + pertrans->numTransInputs = 1; + + /* account for the current transition state */ + numTransArgs = pertrans->numTransInputs + 1; + + build_aggregate_combinefn_expr(aggtranstype, + aggref->inputcollid, + aggtransfn, + &combinefnexpr); + fmgr_info(aggtransfn, &pertrans->transfn); + fmgr_info_set_expr((Node *) combinefnexpr, &pertrans->transfn); + + pertrans->transfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(2)); + InitFunctionCallInfoData(*pertrans->transfn_fcinfo, + &pertrans->transfn, + numTransArgs, + pertrans->aggCollation, + (void *) aggstate, NULL); + + /* + * Ensure that a combine function to combine INTERNAL states is not + * strict. This should have been checked during CREATE AGGREGATE, but + * the strict property could have been changed since then. 
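Whether combining or transitioning, the stored function is always called with the current transition value as argument 0 followed by numTransInputs inputs, so nargs = numTransInputs + 1 (2 in the combine case). A toy fold with a sum()-like transition function, standing in for the fmgr call sequence:

    /* Sketch of the transition-call shape: state comes first, then the inputs. */
    #include <stdio.h>

    /* transition function: arg0 = current state, arg1 = aggregated input */
    static long
    sum_transfn(long state, long input)
    {
        return state + input;
    }

    int
    main(void)
    {
        long inputs[] = {3, 5, 7};
        long state = 0;             /* initval for this toy aggregate */

        for (int i = 0; i < 3; i++)
            state = sum_transfn(state, inputs[i]);
        printf("final state: %ld\n", state);    /* 15 */
        return 0;
    }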
+ */ + if (pertrans->transfn.fn_strict && aggtranstype == INTERNALOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("combine function with transition type %s must not be declared STRICT", + format_type_be(aggtranstype)))); + } + else + { + Expr *transfnexpr; + size_t numTransArgs; + + /* Detect how many arguments to pass to the transfn */ + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + pertrans->numTransInputs = numInputs; + else + pertrans->numTransInputs = numArguments; + + /* account for the current transition state */ + numTransArgs = pertrans->numTransInputs + 1; + + /* + * Set up infrastructure for calling the transfn. Note that + * invtransfn is not needed here. + */ + build_aggregate_transfn_expr(inputTypes, + numArguments, + numDirectArgs, + aggref->aggvariadic, + aggtranstype, + aggref->inputcollid, + aggtransfn, + InvalidOid, + &transfnexpr, + NULL); + fmgr_info(aggtransfn, &pertrans->transfn); + fmgr_info_set_expr((Node *) transfnexpr, &pertrans->transfn); + + pertrans->transfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(numTransArgs)); + InitFunctionCallInfoData(*pertrans->transfn_fcinfo, + &pertrans->transfn, + numTransArgs, + pertrans->aggCollation, + (void *) aggstate, NULL); + + /* + * If the transfn is strict and the initval is NULL, make sure input + * type and transtype are the same (or at least binary-compatible), so + * that it's OK to use the first aggregated input value as the initial + * transValue. This should have been checked at agg definition time, + * but we must check again in case the transfn's strictness property + * has been changed. + */ + if (pertrans->transfn.fn_strict && pertrans->initValueIsNull) + { + if (numArguments <= numDirectArgs || + !IsBinaryCoercible(inputTypes[numDirectArgs], + aggtranstype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate %u needs to have compatible input type and transition type", + aggref->aggfnoid))); + } + } + + /* get info about the state value's datatype */ + get_typlenbyval(aggtranstype, + &pertrans->transtypeLen, + &pertrans->transtypeByVal); + + if (OidIsValid(aggserialfn)) + { + build_aggregate_serialfn_expr(aggserialfn, + &serialfnexpr); + fmgr_info(aggserialfn, &pertrans->serialfn); + fmgr_info_set_expr((Node *) serialfnexpr, &pertrans->serialfn); + + pertrans->serialfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(1)); + InitFunctionCallInfoData(*pertrans->serialfn_fcinfo, + &pertrans->serialfn, + 1, + InvalidOid, + (void *) aggstate, NULL); + } + + if (OidIsValid(aggdeserialfn)) + { + build_aggregate_deserialfn_expr(aggdeserialfn, + &deserialfnexpr); + fmgr_info(aggdeserialfn, &pertrans->deserialfn); + fmgr_info_set_expr((Node *) deserialfnexpr, &pertrans->deserialfn); + + pertrans->deserialfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(2)); + InitFunctionCallInfoData(*pertrans->deserialfn_fcinfo, + &pertrans->deserialfn, + 2, + InvalidOid, + (void *) aggstate, NULL); + + } + + /* + * If we're doing either DISTINCT or ORDER BY for a plain agg, then we + * have a list of SortGroupClause nodes; fish out the data in them and + * stick them into arrays. We ignore ORDER BY for an ordered-set agg, + * however; the agg's transfn and finalfn are responsible for that. + * + * Note that by construction, if there is a DISTINCT clause then the ORDER + * BY clause is a prefix of it (see transformDistinctClause). 
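+ *
+ * For illustration: count(DISTINCT x) has a one-entry aggdistinct and an
+ * empty aggorder, so sortlist is the DISTINCT list and
+ * numSortCols = numDistinctCols = 1; array_agg(x ORDER BY x) has only
+ * aggorder, giving numSortCols = 1 and numDistinctCols = 0; and
+ * array_agg(DISTINCT x ORDER BY x) names x in both lists, so the prefix
+ * property above holds trivially.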
+ */ + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + { + sortlist = NIL; + numSortCols = numDistinctCols = 0; + } + else if (aggref->aggdistinct) + { + sortlist = aggref->aggdistinct; + numSortCols = numDistinctCols = list_length(sortlist); + Assert(numSortCols >= list_length(aggref->aggorder)); + } + else + { + sortlist = aggref->aggorder; + numSortCols = list_length(sortlist); + numDistinctCols = 0; + } + + pertrans->numSortCols = numSortCols; + pertrans->numDistinctCols = numDistinctCols; + + /* + * If we have either sorting or filtering to do, create a tupledesc and + * slot corresponding to the aggregated inputs (including sort + * expressions) of the agg. + */ + if (numSortCols > 0 || aggref->aggfilter) + { + pertrans->sortdesc = ExecTypeFromTL(aggref->args); + pertrans->sortslot = + ExecInitExtraTupleSlot(estate, pertrans->sortdesc, + &TTSOpsMinimalTuple); + } + + if (numSortCols > 0) + { + /* + * We don't implement DISTINCT or ORDER BY aggs in the HASHED case + * (yet) + */ + Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED); + + /* If we have only one input, we need its len/byval info. */ + if (numInputs == 1) + { + get_typlenbyval(inputTypes[numDirectArgs], + &pertrans->inputtypeLen, + &pertrans->inputtypeByVal); + } + else if (numDistinctCols > 0) + { + /* we will need an extra slot to store prior values */ + pertrans->uniqslot = + ExecInitExtraTupleSlot(estate, pertrans->sortdesc, + &TTSOpsMinimalTuple); + } + + /* Extract the sort information for use later */ + pertrans->sortColIdx = + (AttrNumber *) palloc(numSortCols * sizeof(AttrNumber)); + pertrans->sortOperators = + (Oid *) palloc(numSortCols * sizeof(Oid)); + pertrans->sortCollations = + (Oid *) palloc(numSortCols * sizeof(Oid)); + pertrans->sortNullsFirst = + (bool *) palloc(numSortCols * sizeof(bool)); + + i = 0; + foreach(lc, sortlist) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc); + TargetEntry *tle = get_sortgroupclause_tle(sortcl, aggref->args); + + /* the parser should have made sure of this */ + Assert(OidIsValid(sortcl->sortop)); + + pertrans->sortColIdx[i] = tle->resno; + pertrans->sortOperators[i] = sortcl->sortop; + pertrans->sortCollations[i] = exprCollation((Node *) tle->expr); + pertrans->sortNullsFirst[i] = sortcl->nulls_first; + i++; + } + Assert(i == numSortCols); + } + + if (aggref->aggdistinct) + { + Oid *ops; + + Assert(numArguments > 0); + Assert(list_length(aggref->aggdistinct) == numDistinctCols); + + ops = palloc(numDistinctCols * sizeof(Oid)); + + i = 0; + foreach(lc, aggref->aggdistinct) + ops[i++] = ((SortGroupClause *) lfirst(lc))->eqop; + + /* lookup / build the necessary comparators */ + if (numDistinctCols == 1) + fmgr_info(get_opcode(ops[0]), &pertrans->equalfnOne); + else + pertrans->equalfnMulti = + execTuplesMatchPrepare(pertrans->sortdesc, + numDistinctCols, + pertrans->sortColIdx, + ops, + pertrans->sortCollations, + &aggstate->ss.ps); + pfree(ops); + } + + pertrans->sortstates = (Tuplesortstate **) + palloc0(sizeof(Tuplesortstate *) * numGroupingSets); +} + + +static Datum +GetAggInitVal(Datum textInitVal, Oid transtype) +{ + Oid typinput, + typioparam; + char *strInitVal; + Datum initVal; + + getTypeInputInfo(transtype, &typinput, &typioparam); + strInitVal = TextDatumGetCString(textInitVal); + initVal = OidInputFunctionCall(typinput, strInitVal, + typioparam, -1); + pfree(strInitVal); + return initVal; +} + +void +ExecEndAgg(AggState *node) +{ + PlanState *outerPlan; + int transno; + int numGroupingSets = Max(node->maxsets, 1); + 
int setno; + + /* + * When ending a parallel worker, copy the statistics gathered by the + * worker back into shared memory so that it can be picked up by the main + * process to report in EXPLAIN ANALYZE. + */ + if (node->shared_info && IsParallelWorker()) + { + AggregateInstrumentation *si; + + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + si->hash_batches_used = node->hash_batches_used; + si->hash_disk_used = node->hash_disk_used; + si->hash_mem_peak = node->hash_mem_peak; + } + + /* Make sure we have closed any open tuplesorts */ + + if (node->sort_in) + tuplesort_end(node->sort_in); + if (node->sort_out) + tuplesort_end(node->sort_out); + + hashagg_reset_spill_state(node); + + if (node->hash_metacxt != NULL) + { + MemoryContextDelete(node->hash_metacxt); + node->hash_metacxt = NULL; + } + + for (transno = 0; transno < node->numtrans; transno++) + { + AggStatePerTrans pertrans = &node->pertrans[transno]; + + for (setno = 0; setno < numGroupingSets; setno++) + { + if (pertrans->sortstates[setno]) + tuplesort_end(pertrans->sortstates[setno]); + } + } + + /* And ensure any agg shutdown callbacks have been called */ + for (setno = 0; setno < numGroupingSets; setno++) + ReScanExprContext(node->aggcontexts[setno]); + if (node->hashcontext) + ReScanExprContext(node->hashcontext); + + /* + * We don't actually free any ExprContexts here (see comment in + * ExecFreeExprContext), just unlinking the output one from the plan node + * suffices. + */ + ExecFreeExprContext(&node->ss.ps); + + /* clean up tuple table */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + +void +ExecReScanAgg(AggState *node) +{ + ExprContext *econtext = node->ss.ps.ps_ExprContext; + PlanState *outerPlan = outerPlanState(node); + Agg *aggnode = (Agg *) node->ss.ps.plan; + int transno; + int numGroupingSets = Max(node->maxsets, 1); + int setno; + + node->agg_done = false; + + if (node->aggstrategy == AGG_HASHED) + { + /* + * In the hashed case, if we haven't yet built the hash table then we + * can just return; nothing done yet, so nothing to undo. If subnode's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. + */ + if (!node->table_filled) + return; + + /* + * If we do have the hash table, and it never spilled, and the subplan + * does not have any parameter changes, and none of our own parameter + * changes affect input expressions of the aggregated functions, then + * we can just rescan the existing hash table; no need to build it + * again. + */ + if (outerPlan->chgParam == NULL && !node->hash_ever_spilled && + !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams)) + { + ResetTupleHashIterator(node->perhash[0].hashtable, + &node->perhash[0].hashiter); + select_current_set(node, 0, true); + return; + } + } + + /* Make sure we have closed any open tuplesorts */ + for (transno = 0; transno < node->numtrans; transno++) + { + for (setno = 0; setno < numGroupingSets; setno++) + { + AggStatePerTrans pertrans = &node->pertrans[transno]; + + if (pertrans->sortstates[setno]) + { + tuplesort_end(pertrans->sortstates[setno]); + pertrans->sortstates[setno] = NULL; + } + } + } + + /* + * We don't need to ReScanExprContext the output tuple context here; + * ExecReScan already did it. But we do need to reset our per-grouping-set + * contexts, which may have transvalues stored in them. 
(We use rescan + * rather than just reset because transfns may have registered callbacks + * that need to be run now.) For the AGG_HASHED case, see below. + */ + + for (setno = 0; setno < numGroupingSets; setno++) + { + ReScanExprContext(node->aggcontexts[setno]); + } + + /* Release first tuple of group, if we have made a copy */ + if (node->grp_firstTuple != NULL) + { + heap_freetuple(node->grp_firstTuple); + node->grp_firstTuple = NULL; + } + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* Forget current agg values */ + MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs); + MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs); + + /* + * With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of + * the hashcontext. This used to be an issue, but now, resetting a context + * automatically deletes sub-contexts too. + */ + if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED) + { + hashagg_reset_spill_state(node); + + node->hash_ever_spilled = false; + node->hash_spill_mode = false; + node->hash_ngroups_current = 0; + + ReScanExprContext(node->hashcontext); + /* Rebuild an empty hash table */ + build_hash_tables(node); + node->table_filled = false; + /* iterator will be reset when the table is filled */ + + hashagg_recompile_expressions(node, false, false); + } + + if (node->aggstrategy != AGG_HASHED) + { + /* + * Reset the per-group state (in particular, mark transvalues null) + */ + for (setno = 0; setno < numGroupingSets; setno++) + { + MemSet(node->pergroups[setno], 0, + sizeof(AggStatePerGroupData) * node->numaggs); + } + + /* reset to phase 1 */ + initialize_phase(node, 1); + + node->input_done = false; + node->projected_set = -1; + } + + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + + +/*********************************************************************** + * API exposed to aggregate functions + ***********************************************************************/ + + +/* + * AggCheckCallContext - test if a SQL function is being called as an aggregate + * + * The transition and/or final functions of an aggregate may want to verify + * that they are being called as aggregates, rather than as plain SQL + * functions. They should use this function to do so. The return value + * is nonzero if being called as an aggregate, or zero if not. (Specific + * nonzero values are AGG_CONTEXT_AGGREGATE or AGG_CONTEXT_WINDOW, but more + * values could conceivably appear in future.) + * + * If aggcontext isn't NULL, the function also stores at *aggcontext the + * identity of the memory context that aggregate transition values are being + * stored in. Note that the same aggregate call site (flinfo) may be called + * interleaved on different transition values in different contexts, so it's + * not kosher to cache aggcontext under fn_extra. It is, however, kosher to + * cache it in the transvalue itself (for internal-type transvalues). 
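+ *
+ * As a rough usage sketch (not taken from any particular aggregate), a
+ * transition function working with an internal-type transvalue would
+ * typically begin with
+ *
+ *     MemoryContext aggcontext;
+ *
+ *     if (!AggCheckCallContext(fcinfo, &aggcontext))
+ *         elog(ERROR, "transition function called in non-aggregate context");
+ *
+ * and then allocate any long-lived state in aggcontext rather than in
+ * the per-call memory context.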
+ */ +int +AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + if (aggcontext) + { + AggState *aggstate = ((AggState *) fcinfo->context); + ExprContext *cxt = aggstate->curaggcontext; + + *aggcontext = cxt->ecxt_per_tuple_memory; + } + return AGG_CONTEXT_AGGREGATE; + } + if (fcinfo->context && IsA(fcinfo->context, WindowAggState)) + { + if (aggcontext) + *aggcontext = ((WindowAggState *) fcinfo->context)->curaggcontext; + return AGG_CONTEXT_WINDOW; + } + + /* this is just to prevent "uninitialized variable" warnings */ + if (aggcontext) + *aggcontext = NULL; + return 0; +} + +/* + * AggGetAggref - allow an aggregate support function to get its Aggref + * + * If the function is being called as an aggregate support function, + * return the Aggref node for the aggregate call. Otherwise, return NULL. + * + * Aggregates sharing the same inputs and transition functions can get + * merged into a single transition calculation. If the transition function + * calls AggGetAggref, it will get some one of the Aggrefs for which it is + * executing. It must therefore not pay attention to the Aggref fields that + * relate to the final function, as those are indeterminate. But if a final + * function calls AggGetAggref, it will get a precise result. + * + * Note that if an aggregate is being used as a window function, this will + * return NULL. We could provide a similar function to return the relevant + * WindowFunc node in such cases, but it's not needed yet. + */ +Aggref * +AggGetAggref(FunctionCallInfo fcinfo) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + AggStatePerAgg curperagg; + AggStatePerTrans curpertrans; + + /* check curperagg (valid when in a final function) */ + curperagg = aggstate->curperagg; + + if (curperagg) + return curperagg->aggref; + + /* check curpertrans (valid when in a transition function) */ + curpertrans = aggstate->curpertrans; + + if (curpertrans) + return curpertrans->aggref; + } + return NULL; +} + +/* + * AggGetTempMemoryContext - fetch short-term memory context for aggregates + * + * This is useful in agg final functions; the context returned is one that + * the final function can safely reset as desired. This isn't useful for + * transition functions, since the context returned MAY (we don't promise) + * be the same as the context those are called in. + * + * As above, this is currently not useful for aggs called as window functions. + */ +MemoryContext +AggGetTempMemoryContext(FunctionCallInfo fcinfo) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + + return aggstate->tmpcontext->ecxt_per_tuple_memory; + } + return NULL; +} + +/* + * AggStateIsShared - find out whether transition state is shared + * + * If the function is being called as an aggregate support function, + * return true if the aggregate's transition state is shared across + * multiple aggregates, false if it is not. + * + * Returns true if not called as an aggregate support function. + * This is intended as a conservative answer, ie "no you'd better not + * scribble on your input". In particular, will return true if the + * aggregate is being used as a window function, which is a scenario + * in which changing the transition state is a bad idea. We might + * want to refine the behavior for the window case in future. 
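+ *
+ * So a final function that would like to modify the transition value in
+ * place might check, roughly (sketch only):
+ *
+ *     if (AggStateIsShared(fcinfo))
+ *         ... copy the state before modifying it ...
+ *     else
+ *         ... it is OK to scribble on the existing state ...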
+ */ +bool +AggStateIsShared(FunctionCallInfo fcinfo) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + AggStatePerAgg curperagg; + AggStatePerTrans curpertrans; + + /* check curperagg (valid when in a final function) */ + curperagg = aggstate->curperagg; + + if (curperagg) + return aggstate->pertrans[curperagg->transno].aggshared; + + /* check curpertrans (valid when in a transition function) */ + curpertrans = aggstate->curpertrans; + + if (curpertrans) + return curpertrans->aggshared; + } + return true; +} + +/* + * AggRegisterCallback - register a cleanup callback for an aggregate + * + * This is useful for aggs to register shutdown callbacks, which will ensure + * that non-memory resources are freed. The callback will occur just before + * the associated aggcontext (as returned by AggCheckCallContext) is reset, + * either between groups or as a result of rescanning the query. The callback + * will NOT be called on error paths. The typical use-case is for freeing of + * tuplestores or tuplesorts maintained in aggcontext, or pins held by slots + * created by the agg functions. (The callback will not be called until after + * the result of the finalfn is no longer needed, so it's safe for the finalfn + * to return data that will be freed by the callback.) + * + * As above, this is currently not useful for aggs called as window functions. + */ +void +AggRegisterCallback(FunctionCallInfo fcinfo, + ExprContextCallbackFunction func, + Datum arg) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + ExprContext *cxt = aggstate->curaggcontext; + + RegisterExprContextCallback(cxt, func, arg); + + return; + } + elog(ERROR, "aggregate function cannot register a callback in this context"); +} + + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + + /* ---------------------------------------------------------------- + * ExecAggEstimate + * + * Estimate space required to propagate aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggEstimate(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation)); + size = add_size(size, offsetof(SharedAggInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeDSM + * + * Initialize DSM space for aggregate statistics. 
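+ *
+ * The shared area is laid out as a SharedAggInfo header followed by one
+ * AggregateInstrumentation slot per worker, i.e. the same
+ * offsetof(SharedAggInfo, sinstrument) +
+ * nworkers * sizeof(AggregateInstrumentation) bytes that
+ * ExecAggEstimate sized above.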
+ * ---------------------------------------------------------------- + */ +void +ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + pcxt->nworkers * sizeof(AggregateInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeWorker + * + * Attach worker to DSM space for aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); +} + +/* ---------------------------------------------------------------- + * ExecAggRetrieveInstrumentation + * + * Transfer aggregate statistics from DSM to private memory. + * ---------------------------------------------------------------- + */ +void +ExecAggRetrieveInstrumentation(AggState *node) +{ + Size size; + SharedAggInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + node->shared_info->num_workers * sizeof(AggregateInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c new file mode 100644 index 0000000..6a2daa6 --- /dev/null +++ b/src/backend/executor/nodeAppend.c @@ -0,0 +1,1186 @@ +/*------------------------------------------------------------------------- + * + * nodeAppend.c + * routines to handle append nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeAppend.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitAppend - initialize the append node + * ExecAppend - retrieve the next tuple from the node + * ExecEndAppend - shut down the append node + * ExecReScanAppend - rescan the append node + * + * NOTES + * Each append node contains a list of one or more subplans which + * must be iteratively processed (forwards or backwards). + * Tuples are retrieved by executing the 'whichplan'th subplan + * until the subplan stops returning tuples, at which point that + * plan is shut down and the next started up. + * + * Append nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans so + * a typical append node looks like this in the plan tree: + * + * ... + * / + * Append -------+------+------+--- nil + * / \ | | | + * nil nil ... ... ... + * subplans + * + * Append nodes are currently used for unions, and to support + * inheritance queries, where several relations need to be scanned. 
+ * For example, in our standard person/student/employee/student-emp + * example, where student and employee inherit from person + * and student-emp inherits from student and employee, the + * query: + * + * select name from person + * + * generates the plan: + * + * | + * Append -------+-------+--------+--------+ + * / \ | | | | + * nil nil Scan Scan Scan Scan + * | | | | + * person employee student student-emp + */ + +#include "postgres.h" + +#include "executor/execAsync.h" +#include "executor/execdebug.h" +#include "executor/execPartition.h" +#include "executor/nodeAppend.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/latch.h" + +/* Shared state for parallel-aware Append. */ +struct ParallelAppendState +{ + LWLock pa_lock; /* mutual exclusion to choose next subplan */ + int pa_next_plan; /* next plan to choose by any worker */ + + /* + * pa_finished[i] should be true if no more workers should select subplan + * i. for a non-partial plan, this should be set to true as soon as a + * worker selects the plan; for a partial plan, it remains false until + * some worker executes the plan to completion. + */ + bool pa_finished[FLEXIBLE_ARRAY_MEMBER]; +}; + +#define INVALID_SUBPLAN_INDEX -1 +#define EVENT_BUFFER_SIZE 16 + +static TupleTableSlot *ExecAppend(PlanState *pstate); +static bool choose_next_subplan_locally(AppendState *node); +static bool choose_next_subplan_for_leader(AppendState *node); +static bool choose_next_subplan_for_worker(AppendState *node); +static void mark_invalid_subplans_as_finished(AppendState *node); +static void ExecAppendAsyncBegin(AppendState *node); +static bool ExecAppendAsyncGetNext(AppendState *node, TupleTableSlot **result); +static bool ExecAppendAsyncRequest(AppendState *node, TupleTableSlot **result); +static void ExecAppendAsyncEventWait(AppendState *node); +static void classify_matching_subplans(AppendState *node); + +/* ---------------------------------------------------------------- + * ExecInitAppend + * + * Begin all of the subscans of the append node. + * + * (This is potentially wasteful, since the entire result of the + * append node may not be scanned, but this way all of the + * structures get allocated in the executor's top level memory + * block instead of that of the call to ExecAppend.) + * ---------------------------------------------------------------- + */ +AppendState * +ExecInitAppend(Append *node, EState *estate, int eflags) +{ + AppendState *appendstate = makeNode(AppendState); + PlanState **appendplanstates; + Bitmapset *validsubplans; + Bitmapset *asyncplans; + int nplans; + int nasyncplans; + int firstvalid; + int i, + j; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * create new AppendState for our append node + */ + appendstate->ps.plan = (Plan *) node; + appendstate->ps.state = estate; + appendstate->ps.ExecProcNode = ExecAppend; + + /* Let choose_next_subplan_* function handle setting the first subplan */ + appendstate->as_whichplan = INVALID_SUBPLAN_INDEX; + appendstate->as_syncdone = false; + appendstate->as_begun = false; + + /* If run-time partition pruning is enabled, then set that up now */ + if (node->part_prune_info != NULL) + { + PartitionPruneState *prunestate; + + /* We may need an expression context to evaluate partition exprs */ + ExecAssignExprContext(estate, &appendstate->ps); + + /* Create the working data structure for pruning. 
*/ + prunestate = ExecCreatePartitionPruneState(&appendstate->ps, + node->part_prune_info); + appendstate->as_prune_state = prunestate; + + /* Perform an initial partition prune, if required. */ + if (prunestate->do_initial_prune) + { + /* Determine which subplans survive initial pruning */ + validsubplans = ExecFindInitialMatchingSubPlans(prunestate, + list_length(node->appendplans)); + + nplans = bms_num_members(validsubplans); + } + else + { + /* We'll need to initialize all subplans */ + nplans = list_length(node->appendplans); + Assert(nplans > 0); + validsubplans = bms_add_range(NULL, 0, nplans - 1); + } + + /* + * When no run-time pruning is required and there's at least one + * subplan, we can fill as_valid_subplans immediately, preventing + * later calls to ExecFindMatchingSubPlans. + */ + if (!prunestate->do_exec_prune && nplans > 0) + appendstate->as_valid_subplans = bms_add_range(NULL, 0, nplans - 1); + } + else + { + nplans = list_length(node->appendplans); + + /* + * When run-time partition pruning is not enabled we can just mark all + * subplans as valid; they must also all be initialized. + */ + Assert(nplans > 0); + appendstate->as_valid_subplans = validsubplans = + bms_add_range(NULL, 0, nplans - 1); + appendstate->as_prune_state = NULL; + } + + /* + * Initialize result tuple type and slot. + */ + ExecInitResultTupleSlotTL(&appendstate->ps, &TTSOpsVirtual); + + /* node returns slots from each of its subnodes, therefore not fixed */ + appendstate->ps.resultopsset = true; + appendstate->ps.resultopsfixed = false; + + appendplanstates = (PlanState **) palloc(nplans * + sizeof(PlanState *)); + + /* + * call ExecInitNode on each of the valid plans to be executed and save + * the results into the appendplanstates array. + * + * While at it, find out the first valid partial plan. + */ + j = 0; + asyncplans = NULL; + nasyncplans = 0; + firstvalid = nplans; + i = -1; + while ((i = bms_next_member(validsubplans, i)) >= 0) + { + Plan *initNode = (Plan *) list_nth(node->appendplans, i); + + /* + * Record async subplans. When executing EvalPlanQual, we treat them + * as sync ones; don't do this when initializing an EvalPlanQual plan + * tree. + */ + if (initNode->async_capable && estate->es_epq_active == NULL) + { + asyncplans = bms_add_member(asyncplans, j); + nasyncplans++; + } + + /* + * Record the lowest appendplans index which is a valid partial plan. 
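+ * (For example, with four appendplans, first_partial_plan = 2, and
+ * pruning leaving validsubplans = {1, 3}, appendplanstates[] receives
+ * only plans 1 and 3, and as_first_partial_plan ends up as 1, the
+ * compacted index of plan 3.)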
+ */ + if (i >= node->first_partial_plan && j < firstvalid) + firstvalid = j; + + appendplanstates[j++] = ExecInitNode(initNode, estate, eflags); + } + + appendstate->as_first_partial_plan = firstvalid; + appendstate->appendplans = appendplanstates; + appendstate->as_nplans = nplans; + + /* Initialize async state */ + appendstate->as_asyncplans = asyncplans; + appendstate->as_nasyncplans = nasyncplans; + appendstate->as_asyncrequests = NULL; + appendstate->as_asyncresults = NULL; + appendstate->as_nasyncresults = 0; + appendstate->as_nasyncremain = 0; + appendstate->as_needrequest = NULL; + appendstate->as_eventset = NULL; + appendstate->as_valid_asyncplans = NULL; + + if (nasyncplans > 0) + { + appendstate->as_asyncrequests = (AsyncRequest **) + palloc0(nplans * sizeof(AsyncRequest *)); + + i = -1; + while ((i = bms_next_member(asyncplans, i)) >= 0) + { + AsyncRequest *areq; + + areq = palloc(sizeof(AsyncRequest)); + areq->requestor = (PlanState *) appendstate; + areq->requestee = appendplanstates[i]; + areq->request_index = i; + areq->callback_pending = false; + areq->request_complete = false; + areq->result = NULL; + + appendstate->as_asyncrequests[i] = areq; + } + + appendstate->as_asyncresults = (TupleTableSlot **) + palloc0(nasyncplans * sizeof(TupleTableSlot *)); + + if (appendstate->as_valid_subplans != NULL) + classify_matching_subplans(appendstate); + } + + /* + * Miscellaneous initialization + */ + + appendstate->ps.ps_ProjInfo = NULL; + + /* For parallel query, this will be overridden later. */ + appendstate->choose_next_subplan = choose_next_subplan_locally; + + return appendstate; +} + +/* ---------------------------------------------------------------- + * ExecAppend + * + * Handles iteration over multiple subplans. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecAppend(PlanState *pstate) +{ + AppendState *node = castNode(AppendState, pstate); + TupleTableSlot *result; + + /* + * If this is the first call after Init or ReScan, we need to do the + * initialization work. + */ + if (!node->as_begun) + { + Assert(node->as_whichplan == INVALID_SUBPLAN_INDEX); + Assert(!node->as_syncdone); + + /* Nothing to do if there are no subplans */ + if (node->as_nplans == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* If there are any async subplans, begin executing them. */ + if (node->as_nasyncplans > 0) + ExecAppendAsyncBegin(node); + + /* + * If no sync subplan has been chosen, we must choose one before + * proceeding. + */ + if (!node->choose_next_subplan(node) && node->as_nasyncremain == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + + Assert(node->as_syncdone || + (node->as_whichplan >= 0 && + node->as_whichplan < node->as_nplans)); + + /* And we're initialized. */ + node->as_begun = true; + } + + for (;;) + { + PlanState *subnode; + + CHECK_FOR_INTERRUPTS(); + + /* + * try to get a tuple from an async subplan if any + */ + if (node->as_syncdone || !bms_is_empty(node->as_needrequest)) + { + if (ExecAppendAsyncGetNext(node, &result)) + return result; + Assert(!node->as_syncdone); + Assert(bms_is_empty(node->as_needrequest)); + } + + /* + * figure out which sync subplan we are currently processing + */ + Assert(node->as_whichplan >= 0 && node->as_whichplan < node->as_nplans); + subnode = node->appendplans[node->as_whichplan]; + + /* + * get a tuple from the subplan + */ + result = ExecProcNode(subnode); + + if (!TupIsNull(result)) + { + /* + * If the subplan gave us something then return it as-is. 
We do + * NOT make use of the result slot that was set up in + * ExecInitAppend; there's no need for it. + */ + return result; + } + + /* + * wait or poll for async events if any. We do this before checking + * for the end of iteration, because it might drain the remaining + * async subplans. + */ + if (node->as_nasyncremain > 0) + ExecAppendAsyncEventWait(node); + + /* choose new sync subplan; if no sync/async subplans, we're done */ + if (!node->choose_next_subplan(node) && node->as_nasyncremain == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + } +} + +/* ---------------------------------------------------------------- + * ExecEndAppend + * + * Shuts down the subscans of the append node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndAppend(AppendState *node) +{ + PlanState **appendplans; + int nplans; + int i; + + /* + * get information from the node + */ + appendplans = node->appendplans; + nplans = node->as_nplans; + + /* + * shut down each of the subscans + */ + for (i = 0; i < nplans; i++) + ExecEndNode(appendplans[i]); +} + +void +ExecReScanAppend(AppendState *node) +{ + int nasyncplans = node->as_nasyncplans; + int i; + + /* + * If any PARAM_EXEC Params used in pruning expressions have changed, then + * we'd better unset the valid subplans so that they are reselected for + * the new parameter values. + */ + if (node->as_prune_state && + bms_overlap(node->ps.chgParam, + node->as_prune_state->execparamids)) + { + bms_free(node->as_valid_subplans); + node->as_valid_subplans = NULL; + if (nasyncplans > 0) + { + bms_free(node->as_valid_asyncplans); + node->as_valid_asyncplans = NULL; + } + } + + for (i = 0; i < node->as_nplans; i++) + { + PlanState *subnode = node->appendplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode or by first ExecAsyncRequest. + */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } + + /* Reset async state */ + if (nasyncplans > 0) + { + i = -1; + while ((i = bms_next_member(node->as_asyncplans, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + areq->callback_pending = false; + areq->request_complete = false; + areq->result = NULL; + } + + node->as_nasyncresults = 0; + node->as_nasyncremain = 0; + bms_free(node->as_needrequest); + node->as_needrequest = NULL; + } + + /* Let choose_next_subplan_* function handle setting the first subplan */ + node->as_whichplan = INVALID_SUBPLAN_INDEX; + node->as_syncdone = false; + node->as_begun = false; +} + +/* ---------------------------------------------------------------- + * Parallel Append Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecAppendEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. 
+ * ---------------------------------------------------------------- + */ +void +ExecAppendEstimate(AppendState *node, + ParallelContext *pcxt) +{ + node->pstate_len = + add_size(offsetof(ParallelAppendState, pa_finished), + sizeof(bool) * node->as_nplans); + + shm_toc_estimate_chunk(&pcxt->estimator, node->pstate_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + + +/* ---------------------------------------------------------------- + * ExecAppendInitializeDSM + * + * Set up shared state for Parallel Append. + * ---------------------------------------------------------------- + */ +void +ExecAppendInitializeDSM(AppendState *node, + ParallelContext *pcxt) +{ + ParallelAppendState *pstate; + + pstate = shm_toc_allocate(pcxt->toc, node->pstate_len); + memset(pstate, 0, node->pstate_len); + LWLockInitialize(&pstate->pa_lock, LWTRANCHE_PARALLEL_APPEND); + shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, pstate); + + node->as_pstate = pstate; + node->choose_next_subplan = choose_next_subplan_for_leader; +} + +/* ---------------------------------------------------------------- + * ExecAppendReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecAppendReInitializeDSM(AppendState *node, ParallelContext *pcxt) +{ + ParallelAppendState *pstate = node->as_pstate; + + pstate->pa_next_plan = 0; + memset(pstate->pa_finished, 0, sizeof(bool) * node->as_nplans); +} + +/* ---------------------------------------------------------------- + * ExecAppendInitializeWorker + * + * Copy relevant information from TOC into planstate, and initialize + * whatever is required to choose and execute the optimal subplan. + * ---------------------------------------------------------------- + */ +void +ExecAppendInitializeWorker(AppendState *node, ParallelWorkerContext *pwcxt) +{ + node->as_pstate = shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false); + node->choose_next_subplan = choose_next_subplan_for_worker; +} + +/* ---------------------------------------------------------------- + * choose_next_subplan_locally + * + * Choose next sync subplan for a non-parallel-aware Append, + * returning false if there are no more. + * ---------------------------------------------------------------- + */ +static bool +choose_next_subplan_locally(AppendState *node) +{ + int whichplan = node->as_whichplan; + int nextplan; + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + /* Nothing to do if syncdone */ + if (node->as_syncdone) + return false; + + /* + * If first call then have the bms member function choose the first valid + * sync subplan by initializing whichplan to -1. If there happen to be no + * valid sync subplans then the bms member function will handle that by + * returning a negative number which will allow us to exit returning a + * false value. 
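+ *
+ * (For example, with as_valid_subplans = {0, 2}, a forward scan gets
+ * bms_next_member({0, 2}, -1) = 0 on the first call, then 2, and then a
+ * negative result, which ends the scan of the sync subplans.)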
+ */ + if (whichplan == INVALID_SUBPLAN_INDEX) + { + if (node->as_nasyncplans > 0) + { + /* We'd have filled as_valid_subplans already */ + Assert(node->as_valid_subplans); + } + else if (node->as_valid_subplans == NULL) + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + + whichplan = -1; + } + + /* Ensure whichplan is within the expected range */ + Assert(whichplan >= -1 && whichplan <= node->as_nplans); + + if (ScanDirectionIsForward(node->ps.state->es_direction)) + nextplan = bms_next_member(node->as_valid_subplans, whichplan); + else + nextplan = bms_prev_member(node->as_valid_subplans, whichplan); + + if (nextplan < 0) + { + /* Set as_syncdone if in async mode */ + if (node->as_nasyncplans > 0) + node->as_syncdone = true; + return false; + } + + node->as_whichplan = nextplan; + + return true; +} + +/* ---------------------------------------------------------------- + * choose_next_subplan_for_leader + * + * Try to pick a plan which doesn't commit us to doing much + * work locally, so that as much work as possible is done in + * the workers. Cheapest subplans are at the end. + * ---------------------------------------------------------------- + */ +static bool +choose_next_subplan_for_leader(AppendState *node) +{ + ParallelAppendState *pstate = node->as_pstate; + + /* Backward scan is not supported by parallel-aware plans */ + Assert(ScanDirectionIsForward(node->ps.state->es_direction)); + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE); + + if (node->as_whichplan != INVALID_SUBPLAN_INDEX) + { + /* Mark just-completed subplan as finished. */ + node->as_pstate->pa_finished[node->as_whichplan] = true; + } + else + { + /* Start with last subplan. */ + node->as_whichplan = node->as_nplans - 1; + + /* + * If we've yet to determine the valid subplans then do so now. If + * run-time pruning is disabled then the valid subplans will always be + * set to all subplans. + */ + if (node->as_valid_subplans == NULL) + { + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + + /* + * Mark each invalid plan as finished to allow the loop below to + * select the first valid subplan. + */ + mark_invalid_subplans_as_finished(node); + } + } + + /* Loop until we find a subplan to execute. */ + while (pstate->pa_finished[node->as_whichplan]) + { + if (node->as_whichplan == 0) + { + pstate->pa_next_plan = INVALID_SUBPLAN_INDEX; + node->as_whichplan = INVALID_SUBPLAN_INDEX; + LWLockRelease(&pstate->pa_lock); + return false; + } + + /* + * We needn't pay attention to as_valid_subplans here as all invalid + * plans have been marked as finished. + */ + node->as_whichplan--; + } + + /* If non-partial, immediately mark as finished. */ + if (node->as_whichplan < node->as_first_partial_plan) + node->as_pstate->pa_finished[node->as_whichplan] = true; + + LWLockRelease(&pstate->pa_lock); + + return true; +} + +/* ---------------------------------------------------------------- + * choose_next_subplan_for_worker + * + * Choose next subplan for a parallel-aware Append, returning + * false if there are no more. + * + * We start from the first plan and advance through the list; + * when we get back to the end, we loop back to the first + * partial plan. This assigns the non-partial plans first in + * order of descending cost and then spreads out the workers + * as evenly as possible across the remaining partial plans. 
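+ *
+ * For example, with non-partial subplans 0 and 1 and partial subplans
+ * 2 and 3 (first_partial_plan = 2), the first two calls here hand out
+ * plans 0 and 1, each marked finished as soon as it is chosen (and
+ * assuming the leader has not already claimed them); later calls then
+ * cycle over 2 and 3 until both have been executed to completion.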
+ * ---------------------------------------------------------------- + */ +static bool +choose_next_subplan_for_worker(AppendState *node) +{ + ParallelAppendState *pstate = node->as_pstate; + + /* Backward scan is not supported by parallel-aware plans */ + Assert(ScanDirectionIsForward(node->ps.state->es_direction)); + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE); + + /* Mark just-completed subplan as finished. */ + if (node->as_whichplan != INVALID_SUBPLAN_INDEX) + node->as_pstate->pa_finished[node->as_whichplan] = true; + + /* + * If we've yet to determine the valid subplans then do so now. If + * run-time pruning is disabled then the valid subplans will always be set + * to all subplans. + */ + else if (node->as_valid_subplans == NULL) + { + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + mark_invalid_subplans_as_finished(node); + } + + /* If all the plans are already done, we have nothing to do */ + if (pstate->pa_next_plan == INVALID_SUBPLAN_INDEX) + { + LWLockRelease(&pstate->pa_lock); + return false; + } + + /* Save the plan from which we are starting the search. */ + node->as_whichplan = pstate->pa_next_plan; + + /* Loop until we find a valid subplan to execute. */ + while (pstate->pa_finished[pstate->pa_next_plan]) + { + int nextplan; + + nextplan = bms_next_member(node->as_valid_subplans, + pstate->pa_next_plan); + if (nextplan >= 0) + { + /* Advance to the next valid plan. */ + pstate->pa_next_plan = nextplan; + } + else if (node->as_whichplan > node->as_first_partial_plan) + { + /* + * Try looping back to the first valid partial plan, if there is + * one. If there isn't, arrange to bail out below. + */ + nextplan = bms_next_member(node->as_valid_subplans, + node->as_first_partial_plan - 1); + pstate->pa_next_plan = + nextplan < 0 ? node->as_whichplan : nextplan; + } + else + { + /* + * At last plan, and either there are no partial plans or we've + * tried them all. Arrange to bail out. + */ + pstate->pa_next_plan = node->as_whichplan; + } + + if (pstate->pa_next_plan == node->as_whichplan) + { + /* We've tried everything! */ + pstate->pa_next_plan = INVALID_SUBPLAN_INDEX; + LWLockRelease(&pstate->pa_lock); + return false; + } + } + + /* Pick the plan we found, and advance pa_next_plan one more time. */ + node->as_whichplan = pstate->pa_next_plan; + pstate->pa_next_plan = bms_next_member(node->as_valid_subplans, + pstate->pa_next_plan); + + /* + * If there are no more valid plans then try setting the next plan to the + * first valid partial plan. + */ + if (pstate->pa_next_plan < 0) + { + int nextplan = bms_next_member(node->as_valid_subplans, + node->as_first_partial_plan - 1); + + if (nextplan >= 0) + pstate->pa_next_plan = nextplan; + else + { + /* + * There are no valid partial plans, and we already chose the last + * non-partial plan; so flag that there's nothing more for our + * fellow workers to do. + */ + pstate->pa_next_plan = INVALID_SUBPLAN_INDEX; + } + } + + /* If non-partial, immediately mark as finished. */ + if (node->as_whichplan < node->as_first_partial_plan) + node->as_pstate->pa_finished[node->as_whichplan] = true; + + LWLockRelease(&pstate->pa_lock); + + return true; +} + +/* + * mark_invalid_subplans_as_finished + * Marks the ParallelAppendState's pa_finished as true for each invalid + * subplan. + * + * This function should only be called for parallel Append with run-time + * pruning enabled. 
+ */ +static void +mark_invalid_subplans_as_finished(AppendState *node) +{ + int i; + + /* Only valid to call this while in parallel Append mode */ + Assert(node->as_pstate); + + /* Shouldn't have been called when run-time pruning is not enabled */ + Assert(node->as_prune_state); + + /* Nothing to do if all plans are valid */ + if (bms_num_members(node->as_valid_subplans) == node->as_nplans) + return; + + /* Mark all non-valid plans as finished */ + for (i = 0; i < node->as_nplans; i++) + { + if (!bms_is_member(i, node->as_valid_subplans)) + node->as_pstate->pa_finished[i] = true; + } +} + +/* ---------------------------------------------------------------- + * Asynchronous Append Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecAppendAsyncBegin + * + * Begin executing designed async-capable subplans. + * ---------------------------------------------------------------- + */ +static void +ExecAppendAsyncBegin(AppendState *node) +{ + int i; + + /* Backward scan is not supported by async-aware Appends. */ + Assert(ScanDirectionIsForward(node->ps.state->es_direction)); + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + /* We should never be called when there are no async subplans. */ + Assert(node->as_nasyncplans > 0); + + /* If we've yet to determine the valid subplans then do so now. */ + if (node->as_valid_subplans == NULL) + { + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + + classify_matching_subplans(node); + } + + /* Initialize state variables. */ + node->as_syncdone = bms_is_empty(node->as_valid_subplans); + node->as_nasyncremain = bms_num_members(node->as_valid_asyncplans); + + /* Nothing to do if there are no valid async subplans. */ + if (node->as_nasyncremain == 0) + return; + + /* Make a request for each of the valid async subplans. */ + i = -1; + while ((i = bms_next_member(node->as_valid_asyncplans, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + Assert(areq->request_index == i); + Assert(!areq->callback_pending); + + /* Do the actual work. */ + ExecAsyncRequest(areq); + } +} + +/* ---------------------------------------------------------------- + * ExecAppendAsyncGetNext + * + * Get the next tuple from any of the asynchronous subplans. + * ---------------------------------------------------------------- + */ +static bool +ExecAppendAsyncGetNext(AppendState *node, TupleTableSlot **result) +{ + *result = NULL; + + /* We should never be called when there are no valid async subplans. */ + Assert(node->as_nasyncremain > 0); + + /* Request a tuple asynchronously. */ + if (ExecAppendAsyncRequest(node, result)) + return true; + + while (node->as_nasyncremain > 0) + { + CHECK_FOR_INTERRUPTS(); + + /* Wait or poll for async events. */ + ExecAppendAsyncEventWait(node); + + /* Request a tuple asynchronously. */ + if (ExecAppendAsyncRequest(node, result)) + return true; + + /* Break from loop if there's any sync subplan that isn't complete. */ + if (!node->as_syncdone) + break; + } + + /* + * If all sync subplans are complete, we're totally done scanning the + * given node. Otherwise, we're done with the asynchronous stuff but must + * continue scanning the sync subplans. 
+ */ + if (node->as_syncdone) + { + Assert(node->as_nasyncremain == 0); + *result = ExecClearTuple(node->ps.ps_ResultTupleSlot); + return true; + } + + return false; +} + +/* ---------------------------------------------------------------- + * ExecAppendAsyncRequest + * + * Request a tuple asynchronously. + * ---------------------------------------------------------------- + */ +static bool +ExecAppendAsyncRequest(AppendState *node, TupleTableSlot **result) +{ + Bitmapset *needrequest; + int i; + + /* Nothing to do if there are no async subplans needing a new request. */ + if (bms_is_empty(node->as_needrequest)) + { + Assert(node->as_nasyncresults == 0); + return false; + } + + /* + * If there are any asynchronously-generated results that have not yet + * been returned, we have nothing to do; just return one of them. + */ + if (node->as_nasyncresults > 0) + { + --node->as_nasyncresults; + *result = node->as_asyncresults[node->as_nasyncresults]; + return true; + } + + /* Make a new request for each of the async subplans that need it. */ + needrequest = node->as_needrequest; + node->as_needrequest = NULL; + i = -1; + while ((i = bms_next_member(needrequest, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + /* Do the actual work. */ + ExecAsyncRequest(areq); + } + bms_free(needrequest); + + /* Return one of the asynchronously-generated results if any. */ + if (node->as_nasyncresults > 0) + { + --node->as_nasyncresults; + *result = node->as_asyncresults[node->as_nasyncresults]; + return true; + } + + return false; +} + +/* ---------------------------------------------------------------- + * ExecAppendAsyncEventWait + * + * Wait or poll for file descriptor events and fire callbacks. + * ---------------------------------------------------------------- + */ +static void +ExecAppendAsyncEventWait(AppendState *node) +{ + int nevents = node->as_nasyncplans + 1; + long timeout = node->as_syncdone ? -1 : 0; + WaitEvent occurred_event[EVENT_BUFFER_SIZE]; + int noccurred; + int i; + + /* We should never be called when there are no valid async subplans. */ + Assert(node->as_nasyncremain > 0); + + node->as_eventset = CreateWaitEventSet(CurrentMemoryContext, nevents); + AddWaitEventToSet(node->as_eventset, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + /* Give each waiting subplan a chance to add an event. */ + i = -1; + while ((i = bms_next_member(node->as_asyncplans, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + if (areq->callback_pending) + ExecAsyncConfigureWait(areq); + } + + /* + * No need for further processing if there are no configured events other + * than the postmaster death event. + */ + if (GetNumRegisteredWaitEvents(node->as_eventset) == 1) + { + FreeWaitEventSet(node->as_eventset); + node->as_eventset = NULL; + return; + } + + /* We wait on at most EVENT_BUFFER_SIZE events. */ + if (nevents > EVENT_BUFFER_SIZE) + nevents = EVENT_BUFFER_SIZE; + + /* + * If the timeout is -1, wait until at least one event occurs. If the + * timeout is 0, poll for events, but do not wait at all. + */ + noccurred = WaitEventSetWait(node->as_eventset, timeout, occurred_event, + nevents, WAIT_EVENT_APPEND_READY); + FreeWaitEventSet(node->as_eventset); + node->as_eventset = NULL; + if (noccurred == 0) + return; + + /* Deliver notifications. */ + for (i = 0; i < noccurred; i++) + { + WaitEvent *w = &occurred_event[i]; + + /* + * Each waiting subplan should have registered its wait event with + * user_data pointing back to its AsyncRequest. 
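+ * For example, an async-capable ForeignScan's configure-wait callback
+ * would typically have registered its remote connection's socket
+ * roughly as
+ *
+ *     AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, areq);
+ *
+ * (a sketch only; 'set' stands for this node's as_eventset and 'sock'
+ * for the connection's socket), which is how w->user_data gets us back
+ * to the AsyncRequest here.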
+ */ + if ((w->events & WL_SOCKET_READABLE) != 0) + { + AsyncRequest *areq = (AsyncRequest *) w->user_data; + + if (areq->callback_pending) + { + /* + * Mark it as no longer needing a callback. We must do this + * before dispatching the callback in case the callback resets + * the flag. + */ + areq->callback_pending = false; + + /* Do the actual work. */ + ExecAsyncNotify(areq); + } + } + } +} + +/* ---------------------------------------------------------------- + * ExecAsyncAppendResponse + * + * Receive a response from an asynchronous request we made. + * ---------------------------------------------------------------- + */ +void +ExecAsyncAppendResponse(AsyncRequest *areq) +{ + AppendState *node = (AppendState *) areq->requestor; + TupleTableSlot *slot = areq->result; + + /* The result should be a TupleTableSlot or NULL. */ + Assert(slot == NULL || IsA(slot, TupleTableSlot)); + + /* Nothing to do if the request is pending. */ + if (!areq->request_complete) + { + /* The request would have been pending for a callback. */ + Assert(areq->callback_pending); + return; + } + + /* If the result is NULL or an empty slot, there's nothing more to do. */ + if (TupIsNull(slot)) + { + /* The ending subplan wouldn't have been pending for a callback. */ + Assert(!areq->callback_pending); + --node->as_nasyncremain; + return; + } + + /* Save result so we can return it. */ + Assert(node->as_nasyncresults < node->as_nasyncplans); + node->as_asyncresults[node->as_nasyncresults++] = slot; + + /* + * Mark the subplan that returned a result as ready for a new request. We + * don't launch another one here immediately because it might complete. + */ + node->as_needrequest = bms_add_member(node->as_needrequest, + areq->request_index); +} + +/* ---------------------------------------------------------------- + * classify_matching_subplans + * + * Classify the node's as_valid_subplans into sync ones and + * async ones, adjust it to contain sync ones only, and save + * async ones in the node's as_valid_asyncplans. + * ---------------------------------------------------------------- + */ +static void +classify_matching_subplans(AppendState *node) +{ + Bitmapset *valid_asyncplans; + + Assert(node->as_valid_asyncplans == NULL); + + /* Nothing to do if there are no valid subplans. */ + if (bms_is_empty(node->as_valid_subplans)) + { + node->as_syncdone = true; + node->as_nasyncremain = 0; + return; + } + + /* Nothing to do if there are no valid async subplans. */ + if (!bms_overlap(node->as_valid_subplans, node->as_asyncplans)) + { + node->as_nasyncremain = 0; + return; + } + + /* Get valid async subplans. */ + valid_asyncplans = bms_copy(node->as_asyncplans); + valid_asyncplans = bms_int_members(valid_asyncplans, + node->as_valid_subplans); + + /* Adjust the valid subplans to contain sync subplans only. */ + node->as_valid_subplans = bms_del_members(node->as_valid_subplans, + valid_asyncplans); + + /* Save valid async subplans. */ + node->as_valid_asyncplans = valid_asyncplans; +} diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c new file mode 100644 index 0000000..a8d7b1e --- /dev/null +++ b/src/backend/executor/nodeBitmapAnd.c @@ -0,0 +1,223 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapAnd.c + * routines to handle BitmapAnd nodes. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapAnd.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitBitmapAnd - initialize the BitmapAnd node + * MultiExecBitmapAnd - retrieve the result bitmap from the node + * ExecEndBitmapAnd - shut down the BitmapAnd node + * ExecReScanBitmapAnd - rescan the BitmapAnd node + * + * NOTES + * BitmapAnd nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans, + * much like Append nodes. The logic is much simpler than + * Append, however, since we needn't cope with forward/backward + * execution. + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeBitmapAnd.h" + + +/* ---------------------------------------------------------------- + * ExecBitmapAnd + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapAnd(PlanState *pstate) +{ + elog(ERROR, "BitmapAnd node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapAnd + * + * Begin all of the subscans of the BitmapAnd node. + * ---------------------------------------------------------------- + */ +BitmapAndState * +ExecInitBitmapAnd(BitmapAnd *node, EState *estate, int eflags) +{ + BitmapAndState *bitmapandstate = makeNode(BitmapAndState); + PlanState **bitmapplanstates; + int nplans; + int i; + ListCell *l; + Plan *initNode; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * Set up empty vector of subplan states + */ + nplans = list_length(node->bitmapplans); + + bitmapplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *)); + + /* + * create new BitmapAndState for our BitmapAnd node + */ + bitmapandstate->ps.plan = (Plan *) node; + bitmapandstate->ps.state = estate; + bitmapandstate->ps.ExecProcNode = ExecBitmapAnd; + bitmapandstate->bitmapplans = bitmapplanstates; + bitmapandstate->nplans = nplans; + + /* + * call ExecInitNode on each of the plans to be executed and save the + * results into the array "bitmapplanstates". + */ + i = 0; + foreach(l, node->bitmapplans) + { + initNode = (Plan *) lfirst(l); + bitmapplanstates[i] = ExecInitNode(initNode, estate, eflags); + i++; + } + + /* + * Miscellaneous initialization + * + * BitmapAnd plans don't have expression contexts because they never call + * ExecQual or ExecProject. They don't need any tuple slots either. 
+ */ + + return bitmapandstate; +} + +/* ---------------------------------------------------------------- + * MultiExecBitmapAnd + * ---------------------------------------------------------------- + */ +Node * +MultiExecBitmapAnd(BitmapAndState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + TIDBitmap *result = NULL; + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStartNode(node->ps.instrument); + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * Scan all the subplans and AND their result bitmaps + */ + for (i = 0; i < nplans; i++) + { + PlanState *subnode = bitmapplans[i]; + TIDBitmap *subresult; + + subresult = (TIDBitmap *) MultiExecProcNode(subnode); + + if (!subresult || !IsA(subresult, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + if (result == NULL) + result = subresult; /* first subplan */ + else + { + tbm_intersect(result, subresult); + tbm_free(subresult); + } + + /* + * If at any stage we have a completely empty bitmap, we can fall out + * without evaluating the remaining subplans, since ANDing them can no + * longer change the result. (Note: the fact that indxpath.c orders + * the subplans by selectivity should make this case more likely to + * occur.) + */ + if (tbm_is_empty(result)) + break; + } + + if (result == NULL) + elog(ERROR, "BitmapAnd doesn't support zero inputs"); + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStopNode(node->ps.instrument, 0 /* XXX */ ); + + return (Node *) result; +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapAnd + * + * Shuts down the subscans of the BitmapAnd node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapAnd(BitmapAndState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * shut down each of the subscans (that we've initialized) + */ + for (i = 0; i < nplans; i++) + { + if (bitmapplans[i]) + ExecEndNode(bitmapplans[i]); + } +} + +void +ExecReScanBitmapAnd(BitmapAndState *node) +{ + int i; + + for (i = 0; i < node->nplans; i++) + { + PlanState *subnode = node->bitmapplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } +} diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c new file mode 100644 index 0000000..2db1914 --- /dev/null +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -0,0 +1,954 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapHeapscan.c + * Routines to support bitmapped scans of relations + * + * NOTE: it is critical that this plan type only be used with MVCC-compliant + * snapshots (ie, regular snapshots, not SnapshotAny or one of the other + * special snapshots). The reason is that since index and heap scans are + * decoupled, there can be no assurance that the index tuple prompting a + * visit to a particular heap TID still exists when the visit is made. 
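MultiExecBitmapAnd() above adopts the first child's bitmap, intersects each further child's bitmap into it, and stops as soon as the running intersection is empty, since ANDing more inputs cannot change an empty result. A standalone sketch of that shape, using ordinary 64-bit masks rather than TIDBitmap; function names and sample values are illustrative only.

#include <stdio.h>
#include <stdint.h>

/* AND together the "bitmaps" produced by nsubs children, stopping as soon
 * as the running intersection is empty, as MultiExecBitmapAnd does. */
static uint64_t
bitmap_and(const uint64_t *subresults, int nsubs)
{
    uint64_t result = 0;

    for (int i = 0; i < nsubs; i++)
    {
        if (i == 0)
            result = subresults[i];      /* first subplan: adopt its bitmap */
        else
            result &= subresults[i];     /* tbm_intersect() analogue */

        if (result == 0)
            break;                       /* ANDing further inputs can't help */
    }
    return result;
}

int
main(void)
{
    uint64_t subs[] = {0xF0F0, 0x00F8, 0x0018};

    printf("intersection: 0x%llX\n",
           (unsigned long long) bitmap_and(subs, 3));   /* prints 0x10 */
    return 0;
}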
+ * Therefore the tuple might not exist anymore either (which is OK because + * heap_fetch will cope) --- but worse, the tuple slot could have been + * re-used for a newer tuple. With an MVCC snapshot the newer tuple is + * certain to fail the time qual and so it will not be mistakenly returned, + * but with anything else we might return a tuple that doesn't meet the + * required index qual conditions. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapHeapscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecBitmapHeapScan scans a relation using bitmap info + * ExecBitmapHeapNext workhorse for above + * ExecInitBitmapHeapScan creates and initializes state info. + * ExecReScanBitmapHeapScan prepares to rescan the plan. + * ExecEndBitmapHeapScan releases all storage. + */ +#include "postgres.h" + +#include + +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/visibilitymap.h" +#include "executor/execdebug.h" +#include "executor/nodeBitmapHeapscan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/spccache.h" + +static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node); +static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate); +static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, + TBMIterateResult *tbmres); +static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node); +static inline void BitmapPrefetch(BitmapHeapScanState *node, + TableScanDesc scan); +static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); + + +/* ---------------------------------------------------------------- + * BitmapHeapNext + * + * Retrieve next tuple from the BitmapHeapScan node's currentRelation + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +BitmapHeapNext(BitmapHeapScanState *node) +{ + ExprContext *econtext; + TableScanDesc scan; + TIDBitmap *tbm; + TBMIterator *tbmiterator = NULL; + TBMSharedIterator *shared_tbmiterator = NULL; + TBMIterateResult *tbmres; + TupleTableSlot *slot; + ParallelBitmapHeapState *pstate = node->pstate; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + /* + * extract necessary information from index scan node + */ + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + scan = node->ss.ss_currentScanDesc; + tbm = node->tbm; + if (pstate == NULL) + tbmiterator = node->tbmiterator; + else + shared_tbmiterator = node->shared_tbmiterator; + tbmres = node->tbmres; + + /* + * If we haven't yet performed the underlying index scan, do it, and begin + * the iteration over the bitmap. + * + * For prefetching, we use *two* iterators, one for the pages we are + * actually scanning and another that runs ahead of the first for + * prefetching. node->prefetch_pages tracks exactly how many pages ahead + * the prefetch iterator is. Also, node->prefetch_target tracks the + * desired prefetch distance, which starts small and increases up to the + * node->prefetch_maximum. This is to avoid doing a lot of prefetching in + * a scan that stops after a few tuples because of a LIMIT. 
+ */ + if (!node->initialized) + { + if (!pstate) + { + tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); + + if (!tbm || !IsA(tbm, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + node->tbm = tbm; + node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); + node->tbmres = tbmres = NULL; + +#ifdef USE_PREFETCH + if (node->prefetch_maximum > 0) + { + node->prefetch_iterator = tbm_begin_iterate(tbm); + node->prefetch_pages = 0; + node->prefetch_target = -1; + } +#endif /* USE_PREFETCH */ + } + else + { + /* + * The leader will immediately come out of the function, but + * others will be blocked until leader populates the TBM and wakes + * them up. + */ + if (BitmapShouldInitializeSharedState(pstate)) + { + tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); + if (!tbm || !IsA(tbm, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + node->tbm = tbm; + + /* + * Prepare to iterate over the TBM. This will return the + * dsa_pointer of the iterator state which will be used by + * multiple processes to iterate jointly. + */ + pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); +#ifdef USE_PREFETCH + if (node->prefetch_maximum > 0) + { + pstate->prefetch_iterator = + tbm_prepare_shared_iterate(tbm); + + /* + * We don't need the mutex here as we haven't yet woke up + * others. + */ + pstate->prefetch_pages = 0; + pstate->prefetch_target = -1; + } +#endif + + /* We have initialized the shared state so wake up others. */ + BitmapDoneInitializingSharedState(pstate); + } + + /* Allocate a private iterator and attach the shared state to it */ + node->shared_tbmiterator = shared_tbmiterator = + tbm_attach_shared_iterate(dsa, pstate->tbmiterator); + node->tbmres = tbmres = NULL; + +#ifdef USE_PREFETCH + if (node->prefetch_maximum > 0) + { + node->shared_prefetch_iterator = + tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator); + } +#endif /* USE_PREFETCH */ + } + node->initialized = true; + } + + for (;;) + { + bool skip_fetch; + + CHECK_FOR_INTERRUPTS(); + + /* + * Get next page of results if needed + */ + if (tbmres == NULL) + { + if (!pstate) + node->tbmres = tbmres = tbm_iterate(tbmiterator); + else + node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + if (tbmres == NULL) + { + /* no more entries in the bitmap */ + break; + } + + BitmapAdjustPrefetchIterator(node, tbmres); + + /* + * We can skip fetching the heap page if we don't need any fields + * from the heap, and the bitmap entries don't need rechecking, + * and all tuples on the page are visible to our transaction. + * + * XXX: It's a layering violation that we do these checks above + * tableam, they should probably moved below it at some point. + */ + skip_fetch = (node->can_skip_fetch && + !tbmres->recheck && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmres->blockno, + &node->vmbuffer)); + + if (skip_fetch) + { + /* can't be lossy in the skip_fetch case */ + Assert(tbmres->ntuples >= 0); + + /* + * The number of tuples on this page is put into + * node->return_empty_tuples. + */ + node->return_empty_tuples = tbmres->ntuples; + } + else if (!table_scan_bitmap_next_block(scan, tbmres)) + { + /* AM doesn't think this block is valid, skip */ + continue; + } + + if (tbmres->ntuples >= 0) + node->exact_pages++; + else + node->lossy_pages++; + + /* Adjust the prefetch target */ + BitmapAdjustPrefetchTarget(node); + } + else + { + /* + * Continuing in previously obtained page. 
+ */ + +#ifdef USE_PREFETCH + + /* + * Try to prefetch at least a few pages even before we get to the + * second page if we don't stop reading after the first tuple. + */ + if (!pstate) + { + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; + } + else if (pstate->prefetch_target < node->prefetch_maximum) + { + /* take spinlock while updating shared state */ + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_target < node->prefetch_maximum) + pstate->prefetch_target++; + SpinLockRelease(&pstate->mutex); + } +#endif /* USE_PREFETCH */ + } + + /* + * We issue prefetch requests *after* fetching the current page to try + * to avoid having prefetching interfere with the main I/O. Also, this + * should happen only when we have determined there is still something + * to do on the current page, else we may uselessly prefetch the same + * page we are just about to request for real. + * + * XXX: It's a layering violation that we do these checks above + * tableam, they should probably moved below it at some point. + */ + BitmapPrefetch(node, scan); + + if (node->return_empty_tuples > 0) + { + /* + * If we don't have to fetch the tuple, just return nulls. + */ + ExecStoreAllNullTuple(slot); + + if (--node->return_empty_tuples == 0) + { + /* no more tuples to return in the next round */ + node->tbmres = tbmres = NULL; + } + } + else + { + /* + * Attempt to fetch tuple from AM. + */ + if (!table_scan_bitmap_next_tuple(scan, tbmres, slot)) + { + /* nothing more to look at on this page */ + node->tbmres = tbmres = NULL; + continue; + } + + /* + * If we are using lossy info, we have to recheck the qual + * conditions at every tuple. + */ + if (tbmres->recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->bitmapqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + ExecClearTuple(slot); + continue; + } + } + } + + /* OK to return this tuple */ + return slot; + } + + /* + * if we get here it means we are at the end of the scan.. + */ + return ExecClearTuple(slot); +} + +/* + * BitmapDoneInitializingSharedState - Shared state is initialized + * + * By this time the leader has already populated the TBM and initialized the + * shared state so wake up other processes. 
+ */ +static inline void +BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate) +{ + SpinLockAcquire(&pstate->mutex); + pstate->state = BM_FINISHED; + SpinLockRelease(&pstate->mutex); + ConditionVariableBroadcast(&pstate->cv); +} + +/* + * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator + */ +static inline void +BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, + TBMIterateResult *tbmres) +{ +#ifdef USE_PREFETCH + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) + { + TBMIterator *prefetch_iterator = node->prefetch_iterator; + + if (node->prefetch_pages > 0) + { + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; + } + else if (prefetch_iterator) + { + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); + } + return; + } + + if (node->prefetch_maximum > 0) + { + TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; + + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_pages > 0) + { + pstate->prefetch_pages--; + SpinLockRelease(&pstate->mutex); + } + else + { + /* Release the mutex before iterating */ + SpinLockRelease(&pstate->mutex); + + /* + * In case of shared mode, we can not ensure that the current + * blockno of the main iterator and that of the prefetch iterator + * are same. It's possible that whatever blockno we are + * prefetching will be processed by another process. Therefore, + * we don't validate the blockno here as we do in non-parallel + * case. + */ + if (prefetch_iterator) + tbm_shared_iterate(prefetch_iterator); + } + } +#endif /* USE_PREFETCH */ +} + +/* + * BitmapAdjustPrefetchTarget - Adjust the prefetch target + * + * Increase prefetch target if it's not yet at the max. Note that + * we will increase it to zero after fetching the very first + * page/tuple, then to one after the second tuple is fetched, then + * it doubles as later pages are fetched. + */ +static inline void +BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) +{ +#ifdef USE_PREFETCH + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) + { + if (node->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; + return; + } + + /* Do an unlocked check first to save spinlock acquisitions. 
*/ + if (pstate->prefetch_target < node->prefetch_maximum) + { + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (pstate->prefetch_target >= node->prefetch_maximum / 2) + pstate->prefetch_target = node->prefetch_maximum; + else if (pstate->prefetch_target > 0) + pstate->prefetch_target *= 2; + else + pstate->prefetch_target++; + SpinLockRelease(&pstate->mutex); + } +#endif /* USE_PREFETCH */ +} + +/* + * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target + */ +static inline void +BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) +{ +#ifdef USE_PREFETCH + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) + { + TBMIterator *prefetch_iterator = node->prefetch_iterator; + + if (prefetch_iterator) + { + while (node->prefetch_pages < node->prefetch_target) + { + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + bool skip_fetch; + + if (tbmpre == NULL) + { + /* No more pages to prefetch */ + tbm_end_iterate(prefetch_iterator); + node->prefetch_iterator = NULL; + break; + } + node->prefetch_pages++; + + /* + * If we expect not to have to actually read this heap page, + * skip this prefetch call, but continue to run the prefetch + * logic normally. (Would it be better not to increment + * prefetch_pages?) + * + * This depends on the assumption that the index AM will + * report the same recheck flag for this future heap page as + * it did for the current heap page; which is not a certainty + * but is true in many cases. + */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? !node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); + + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + } + } + + return; + } + + if (pstate->prefetch_pages < pstate->prefetch_target) + { + TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; + + if (prefetch_iterator) + { + while (1) + { + TBMIterateResult *tbmpre; + bool do_prefetch = false; + bool skip_fetch; + + /* + * Recheck under the mutex. If some other process has already + * done enough prefetching then we need not to do anything. + */ + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_pages < pstate->prefetch_target) + { + pstate->prefetch_pages++; + do_prefetch = true; + } + SpinLockRelease(&pstate->mutex); + + if (!do_prefetch) + return; + + tbmpre = tbm_shared_iterate(prefetch_iterator); + if (tbmpre == NULL) + { + /* No more pages to prefetch */ + tbm_end_shared_iterate(prefetch_iterator); + node->shared_prefetch_iterator = NULL; + break; + } + + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? !node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); + + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + } + } + } +#endif /* USE_PREFETCH */ +} + +/* + * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot) +{ + ExprContext *econtext; + + /* + * extract necessary information from index scan node + */ + econtext = node->ss.ps.ps_ExprContext; + + /* Does the tuple meet the original qual conditions? 
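The ramp-up performed by BitmapAdjustPrefetchTarget() above (increase to 0, then 1, then double, then jump straight to the maximum once past half of it) is easiest to see in isolation. A standalone sketch of just that arithmetic, with 64 standing in for prefetch_maximum; it is illustrative only and does not touch any executor state.

#include <stdio.h>

/* One step of the prefetch-target ramp used by BitmapAdjustPrefetchTarget. */
static int
adjust_prefetch_target(int target, int maximum)
{
    if (target >= maximum)
        return target;              /* don't increase any further */
    if (target >= maximum / 2)
        return maximum;             /* close enough: jump to the max */
    if (target > 0)
        return target * 2;          /* double while ramping up */
    return target + 1;              /* -1 -> 0 -> 1 to start gently */
}

int
main(void)
{
    int         target = -1;        /* initial value set in BitmapHeapNext */
    const int   maximum = 64;       /* illustrative prefetch_maximum */

    /* prints 0, 1, 2, 4, 8, 16, 32, 64, 64, ... */
    for (int page = 0; page < 12; page++)
    {
        target = adjust_prefetch_target(target, maximum);
        printf("after page %2d: target = %d\n", page, target);
    }
    return 0;
}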
*/ + econtext->ecxt_scantuple = slot; + return ExecQualAndReset(node->bitmapqualorig, econtext); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapScan(node) + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapHeapScan(PlanState *pstate) +{ + BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) BitmapHeapNext, + (ExecScanRecheckMtd) BitmapHeapRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanBitmapHeapScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanBitmapHeapScan(BitmapHeapScanState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* rescan to release any page pin */ + table_rescan(node->ss.ss_currentScanDesc, NULL); + + /* release bitmaps and buffers if any */ + if (node->tbmiterator) + tbm_end_iterate(node->tbmiterator); + if (node->prefetch_iterator) + tbm_end_iterate(node->prefetch_iterator); + if (node->shared_tbmiterator) + tbm_end_shared_iterate(node->shared_tbmiterator); + if (node->shared_prefetch_iterator) + tbm_end_shared_iterate(node->shared_prefetch_iterator); + if (node->tbm) + tbm_free(node->tbm); + if (node->vmbuffer != InvalidBuffer) + ReleaseBuffer(node->vmbuffer); + if (node->pvmbuffer != InvalidBuffer) + ReleaseBuffer(node->pvmbuffer); + node->tbm = NULL; + node->tbmiterator = NULL; + node->tbmres = NULL; + node->prefetch_iterator = NULL; + node->initialized = false; + node->shared_tbmiterator = NULL; + node->shared_prefetch_iterator = NULL; + node->vmbuffer = InvalidBuffer; + node->pvmbuffer = InvalidBuffer; + + ExecScanReScan(&node->ss); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapHeapScan + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapHeapScan(BitmapHeapScanState *node) +{ + TableScanDesc scanDesc; + + /* + * extract information from the node + */ + scanDesc = node->ss.ss_currentScanDesc; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close down subplans + */ + ExecEndNode(outerPlanState(node)); + + /* + * release bitmaps and buffers if any + */ + if (node->tbmiterator) + tbm_end_iterate(node->tbmiterator); + if (node->prefetch_iterator) + tbm_end_iterate(node->prefetch_iterator); + if (node->tbm) + tbm_free(node->tbm); + if (node->shared_tbmiterator) + tbm_end_shared_iterate(node->shared_tbmiterator); + if (node->shared_prefetch_iterator) + tbm_end_shared_iterate(node->shared_prefetch_iterator); + if (node->vmbuffer != InvalidBuffer) + ReleaseBuffer(node->vmbuffer); + if (node->pvmbuffer != InvalidBuffer) + ReleaseBuffer(node->pvmbuffer); + + /* + * close heap scan + */ + table_endscan(scanDesc); +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapHeapScan + * + * Initializes the scan's state information. 
+ * ---------------------------------------------------------------- + */ +BitmapHeapScanState * +ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) +{ + BitmapHeapScanState *scanstate; + Relation currentRelation; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * Assert caller didn't ask for an unsafe snapshot --- see comments at + * head of file. + */ + Assert(IsMVCCSnapshot(estate->es_snapshot)); + + /* + * create state structure + */ + scanstate = makeNode(BitmapHeapScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan; + + scanstate->tbm = NULL; + scanstate->tbmiterator = NULL; + scanstate->tbmres = NULL; + scanstate->return_empty_tuples = 0; + scanstate->vmbuffer = InvalidBuffer; + scanstate->pvmbuffer = InvalidBuffer; + scanstate->exact_pages = 0; + scanstate->lossy_pages = 0; + scanstate->prefetch_iterator = NULL; + scanstate->prefetch_pages = 0; + scanstate->prefetch_target = 0; + scanstate->pscan_len = 0; + scanstate->initialized = false; + scanstate->shared_tbmiterator = NULL; + scanstate->shared_prefetch_iterator = NULL; + scanstate->pstate = NULL; + + /* + * We can potentially skip fetching heap pages if we do not need any + * columns of the table, either for checking non-indexable quals or for + * returning data. This test is a bit simplistic, as it checks the + * stronger condition that there's no qual or return tlist at all. But in + * most cases it's probably not worth working harder than that. + */ + scanstate->can_skip_fetch = (node->scan.plan.qual == NIL && + node->scan.plan.targetlist == NIL); + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + /* + * initialize child nodes + */ + outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + scanstate->bitmapqualorig = + ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate); + + /* + * Maximum number of prefetches for the tablespace if configured, + * otherwise the current value of the effective_io_concurrency GUC. + */ + scanstate->prefetch_maximum = + get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + + scanstate->ss.ss_currentRelation = currentRelation; + + scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation, + estate->es_snapshot, + 0, + NULL); + + /* + * all done. + */ + return scanstate; +} + +/*---------------- + * BitmapShouldInitializeSharedState + * + * The first process to come here and see the state to the BM_INITIAL + * will become the leader for the parallel bitmap scan and will be + * responsible for populating the TIDBitmap. The other processes will + * be blocked by the condition variable until the leader wakes them up. 
+ * --------------- + */ +static bool +BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate) +{ + SharedBitmapState state; + + while (1) + { + SpinLockAcquire(&pstate->mutex); + state = pstate->state; + if (pstate->state == BM_INITIAL) + pstate->state = BM_INPROGRESS; + SpinLockRelease(&pstate->mutex); + + /* Exit if bitmap is done, or if we're the leader. */ + if (state != BM_INPROGRESS) + break; + + /* Wait for the leader to wake us up. */ + ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN); + } + + ConditionVariableCancelSleep(); + + return (state == BM_INITIAL); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapEstimate(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->pscan_len = add_size(offsetof(ParallelBitmapHeapState, + phs_snapshot_data), + EstimateSnapshotSpace(estate->es_snapshot)); + + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeDSM + * + * Set up a parallel bitmap heap scan descriptor. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + ParallelBitmapHeapState *pstate; + EState *estate = node->ss.ps.state; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + /* If there's no DSA, there are no workers; initialize nothing. */ + if (dsa == NULL) + return; + + pstate = shm_toc_allocate(pcxt->toc, node->pscan_len); + + pstate->tbmiterator = 0; + pstate->prefetch_iterator = 0; + + /* Initialize the mutex */ + SpinLockInit(&pstate->mutex); + pstate->prefetch_pages = 0; + pstate->prefetch_target = 0; + pstate->state = BM_INITIAL; + + ConditionVariableInit(&pstate->cv); + SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data); + + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate); + node->pstate = pstate; +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + ParallelBitmapHeapState *pstate = node->pstate; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + /* If there's no DSA, there are no workers; do nothing. */ + if (dsa == NULL) + return; + + pstate->state = BM_INITIAL; + + if (DsaPointerIsValid(pstate->tbmiterator)) + tbm_free_shared_area(dsa, pstate->tbmiterator); + + if (DsaPointerIsValid(pstate->prefetch_iterator)) + tbm_free_shared_area(dsa, pstate->prefetch_iterator); + + pstate->tbmiterator = InvalidDsaPointer; + pstate->prefetch_iterator = InvalidDsaPointer; +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeWorker + * + * Copy relevant information from TOC into planstate. 
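BitmapShouldInitializeSharedState() above is a small leader-election state machine: the first process to observe BM_INITIAL claims the build by flipping the state to BM_INPROGRESS, and everyone else waits until the leader marks it BM_FINISHED and broadcasts. A standalone sketch of the same handshake using POSIX threads in place of PostgreSQL's spinlocks and condition variables (compile with -pthread); all names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef enum { BM_INITIAL, BM_INPROGRESS, BM_FINISHED } bm_state;

/* Shared state, standing in for ParallelBitmapHeapState's mutex/state/cv. */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bm_state state = BM_INITIAL;

/* First caller to see BM_INITIAL becomes the leader; everyone else waits
 * until the leader flips the state to BM_FINISHED and broadcasts. */
static bool
should_initialize(void)
{
    bool leader;

    pthread_mutex_lock(&mutex);
    for (;;)
    {
        if (state == BM_INITIAL)
        {
            state = BM_INPROGRESS;
            leader = true;
            break;
        }
        if (state == BM_FINISHED)
        {
            leader = false;
            break;
        }
        pthread_cond_wait(&cv, &mutex);   /* leader is still building */
    }
    pthread_mutex_unlock(&mutex);
    return leader;
}

static void *
worker(void *arg)
{
    long id = (long) arg;

    if (should_initialize())
    {
        printf("worker %ld: building the shared bitmap\n", id);
        pthread_mutex_lock(&mutex);
        state = BM_FINISHED;
        pthread_mutex_unlock(&mutex);
        pthread_cond_broadcast(&cv);
    }
    else
        printf("worker %ld: attaching to the finished bitmap\n", id);
    return NULL;
}

int
main(void)
{
    pthread_t tids[4];

    for (long i = 0; i < 4; i++)
        pthread_create(&tids[i], NULL, worker, (void *) i);
    for (int i = 0; i < 4; i++)
        pthread_join(tids[i], NULL);
    return 0;
}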
+ * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelBitmapHeapState *pstate; + Snapshot snapshot; + + Assert(node->ss.ps.state->es_query_dsa != NULL); + + pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->pstate = pstate; + + snapshot = RestoreSnapshot(pstate->phs_snapshot_data); + table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot); +} diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c new file mode 100644 index 0000000..48c2036 --- /dev/null +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -0,0 +1,330 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapIndexscan.c + * Routines to support bitmapped index scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapIndexscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * MultiExecBitmapIndexScan scans a relation using index. + * ExecInitBitmapIndexScan creates and initializes state info. + * ExecReScanBitmapIndexScan prepares to rescan the plan. + * ExecEndBitmapIndexScan releases all storage. + */ +#include "postgres.h" + +#include "access/genam.h" +#include "executor/execdebug.h" +#include "executor/nodeBitmapIndexscan.h" +#include "executor/nodeIndexscan.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecBitmapIndexScan + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapIndexScan(PlanState *pstate) +{ + elog(ERROR, "BitmapIndexScan node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * MultiExecBitmapIndexScan(node) + * ---------------------------------------------------------------- + */ +Node * +MultiExecBitmapIndexScan(BitmapIndexScanState *node) +{ + TIDBitmap *tbm; + IndexScanDesc scandesc; + double nTuples = 0; + bool doscan; + + /* must provide our own instrumentation support */ + if (node->ss.ps.instrument) + InstrStartNode(node->ss.ps.instrument); + + /* + * extract necessary information from index scan node + */ + scandesc = node->biss_ScanDesc; + + /* + * If we have runtime keys and they've not already been set up, do it now. + * Array keys are also treated as runtime keys; note that if ExecReScan + * returns with biss_RuntimeKeysReady still false, then there is an empty + * array key so we should do nothing. + */ + if (!node->biss_RuntimeKeysReady && + (node->biss_NumRuntimeKeys != 0 || node->biss_NumArrayKeys != 0)) + { + ExecReScan((PlanState *) node); + doscan = node->biss_RuntimeKeysReady; + } + else + doscan = true; + + /* + * Prepare the result bitmap. Normally we just create a new one to pass + * back; however, our parent node is allowed to store a pre-made one into + * node->biss_result, in which case we just OR our tuple IDs into the + * existing bitmap. (This saves needing explicit UNION steps.) 
+ */ + if (node->biss_result) + { + tbm = node->biss_result; + node->biss_result = NULL; /* reset for next time */ + } + else + { + /* XXX should we use less than work_mem for this? */ + tbm = tbm_create(work_mem * 1024L, + ((BitmapIndexScan *) node->ss.ps.plan)->isshared ? + node->ss.ps.state->es_query_dsa : NULL); + } + + /* + * Get TIDs from index and insert into bitmap + */ + while (doscan) + { + nTuples += (double) index_getbitmap(scandesc, tbm); + + CHECK_FOR_INTERRUPTS(); + + doscan = ExecIndexAdvanceArrayKeys(node->biss_ArrayKeys, + node->biss_NumArrayKeys); + if (doscan) /* reset index scan */ + index_rescan(node->biss_ScanDesc, + node->biss_ScanKeys, node->biss_NumScanKeys, + NULL, 0); + } + + /* must provide our own instrumentation support */ + if (node->ss.ps.instrument) + InstrStopNode(node->ss.ps.instrument, nTuples); + + return (Node *) tbm; +} + +/* ---------------------------------------------------------------- + * ExecReScanBitmapIndexScan(node) + * + * Recalculates the values of any scan keys whose value depends on + * information known at runtime, then rescans the indexed relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanBitmapIndexScan(BitmapIndexScanState *node) +{ + ExprContext *econtext = node->biss_RuntimeContext; + + /* + * Reset the runtime-key context so we don't leak memory as each outer + * tuple is scanned. Note this assumes that we will recalculate *all* + * runtime keys on each call. + */ + if (econtext) + ResetExprContext(econtext); + + /* + * If we are doing runtime key calculations (ie, any of the index key + * values weren't simple Consts), compute the new key values. + * + * Array keys are also treated as runtime keys; note that if we return + * with biss_RuntimeKeysReady still false, then there is an empty array + * key so no index scan is needed. + */ + if (node->biss_NumRuntimeKeys != 0) + ExecIndexEvalRuntimeKeys(econtext, + node->biss_RuntimeKeys, + node->biss_NumRuntimeKeys); + if (node->biss_NumArrayKeys != 0) + node->biss_RuntimeKeysReady = + ExecIndexEvalArrayKeys(econtext, + node->biss_ArrayKeys, + node->biss_NumArrayKeys); + else + node->biss_RuntimeKeysReady = true; + + /* reset index scan */ + if (node->biss_RuntimeKeysReady) + index_rescan(node->biss_ScanDesc, + node->biss_ScanKeys, node->biss_NumScanKeys, + NULL, 0); +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapIndexScan + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapIndexScan(BitmapIndexScanState *node) +{ + Relation indexRelationDesc; + IndexScanDesc indexScanDesc; + + /* + * extract information from the node + */ + indexRelationDesc = node->biss_RelationDesc; + indexScanDesc = node->biss_ScanDesc; + + /* + * Free the exprcontext ... now dead code, see ExecFreeExprContext + */ +#ifdef NOT_USED + if (node->biss_RuntimeContext) + FreeExprContext(node->biss_RuntimeContext, true); +#endif + + /* + * close the index relation (no-op if we didn't open it) + */ + if (indexScanDesc) + index_endscan(indexScanDesc); + if (indexRelationDesc) + index_close(indexRelationDesc, NoLock); +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapIndexScan + * + * Initializes the index scan's state information. 
+ * ---------------------------------------------------------------- + */ +BitmapIndexScanState * +ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) +{ + BitmapIndexScanState *indexstate; + LOCKMODE lockmode; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + indexstate = makeNode(BitmapIndexScanState); + indexstate->ss.ps.plan = (Plan *) node; + indexstate->ss.ps.state = estate; + indexstate->ss.ps.ExecProcNode = ExecBitmapIndexScan; + + /* normally we don't make the result bitmap till runtime */ + indexstate->biss_result = NULL; + + /* + * We do not open or lock the base relation here. We assume that an + * ancestor BitmapHeapScan node is holding AccessShareLock (or better) on + * the heap relation throughout the execution of the plan tree. + */ + + indexstate->ss.ss_currentRelation = NULL; + indexstate->ss.ss_currentScanDesc = NULL; + + /* + * Miscellaneous initialization + * + * We do not need a standard exprcontext for this node, though we may + * decide below to create a runtime-key exprcontext + */ + + /* + * initialize child expressions + * + * We don't need to initialize targetlist or qual since neither are used. + * + * Note: we don't initialize all of the indexqual expression, only the + * sub-parts corresponding to runtime keys (see below). + */ + + /* + * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop + * here. This allows an index-advisor plugin to EXPLAIN a plan containing + * references to nonexistent indexes. + */ + if (eflags & EXEC_FLAG_EXPLAIN_ONLY) + return indexstate; + + /* Open the index relation. */ + lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; + indexstate->biss_RelationDesc = index_open(node->indexid, lockmode); + + /* + * Initialize index-specific scan state + */ + indexstate->biss_RuntimeKeysReady = false; + indexstate->biss_RuntimeKeys = NULL; + indexstate->biss_NumRuntimeKeys = 0; + + /* + * build the index scan keys from the index qualification + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->biss_RelationDesc, + node->indexqual, + false, + &indexstate->biss_ScanKeys, + &indexstate->biss_NumScanKeys, + &indexstate->biss_RuntimeKeys, + &indexstate->biss_NumRuntimeKeys, + &indexstate->biss_ArrayKeys, + &indexstate->biss_NumArrayKeys); + + /* + * If we have runtime keys or array keys, we need an ExprContext to + * evaluate them. We could just create a "standard" plan node exprcontext, + * but to keep the code looking similar to nodeIndexscan.c, it seems + * better to stick with the approach of using a separate ExprContext. + */ + if (indexstate->biss_NumRuntimeKeys != 0 || + indexstate->biss_NumArrayKeys != 0) + { + ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext; + + ExecAssignExprContext(estate, &indexstate->ss.ps); + indexstate->biss_RuntimeContext = indexstate->ss.ps.ps_ExprContext; + indexstate->ss.ps.ps_ExprContext = stdecontext; + } + else + { + indexstate->biss_RuntimeContext = NULL; + } + + /* + * Initialize scan descriptor. + */ + indexstate->biss_ScanDesc = + index_beginscan_bitmap(indexstate->biss_RelationDesc, + estate->es_snapshot, + indexstate->biss_NumScanKeys); + + /* + * If no run-time keys to calculate, go ahead and pass the scankeys to the + * index AM. 
+ */ + if (indexstate->biss_NumRuntimeKeys == 0 && + indexstate->biss_NumArrayKeys == 0) + index_rescan(indexstate->biss_ScanDesc, + indexstate->biss_ScanKeys, indexstate->biss_NumScanKeys, + NULL, 0); + + /* + * all done. + */ + return indexstate; +} diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c new file mode 100644 index 0000000..4a8c01d --- /dev/null +++ b/src/backend/executor/nodeBitmapOr.c @@ -0,0 +1,241 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapOr.c + * routines to handle BitmapOr nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapOr.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitBitmapOr - initialize the BitmapOr node + * MultiExecBitmapOr - retrieve the result bitmap from the node + * ExecEndBitmapOr - shut down the BitmapOr node + * ExecReScanBitmapOr - rescan the BitmapOr node + * + * NOTES + * BitmapOr nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans, + * much like Append nodes. The logic is much simpler than + * Append, however, since we needn't cope with forward/backward + * execution. + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeBitmapOr.h" +#include "miscadmin.h" + + +/* ---------------------------------------------------------------- + * ExecBitmapOr + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapOr(PlanState *pstate) +{ + elog(ERROR, "BitmapOr node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapOr + * + * Begin all of the subscans of the BitmapOr node. + * ---------------------------------------------------------------- + */ +BitmapOrState * +ExecInitBitmapOr(BitmapOr *node, EState *estate, int eflags) +{ + BitmapOrState *bitmaporstate = makeNode(BitmapOrState); + PlanState **bitmapplanstates; + int nplans; + int i; + ListCell *l; + Plan *initNode; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * Set up empty vector of subplan states + */ + nplans = list_length(node->bitmapplans); + + bitmapplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *)); + + /* + * create new BitmapOrState for our BitmapOr node + */ + bitmaporstate->ps.plan = (Plan *) node; + bitmaporstate->ps.state = estate; + bitmaporstate->ps.ExecProcNode = ExecBitmapOr; + bitmaporstate->bitmapplans = bitmapplanstates; + bitmaporstate->nplans = nplans; + + /* + * call ExecInitNode on each of the plans to be executed and save the + * results into the array "bitmapplanstates". + */ + i = 0; + foreach(l, node->bitmapplans) + { + initNode = (Plan *) lfirst(l); + bitmapplanstates[i] = ExecInitNode(initNode, estate, eflags); + i++; + } + + /* + * Miscellaneous initialization + * + * BitmapOr plans don't have expression contexts because they never call + * ExecQual or ExecProject. They don't need any tuple slots either. 
+ */ + + return bitmaporstate; +} + +/* ---------------------------------------------------------------- + * MultiExecBitmapOr + * ---------------------------------------------------------------- + */ +Node * +MultiExecBitmapOr(BitmapOrState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + TIDBitmap *result = NULL; + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStartNode(node->ps.instrument); + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * Scan all the subplans and OR their result bitmaps + */ + for (i = 0; i < nplans; i++) + { + PlanState *subnode = bitmapplans[i]; + TIDBitmap *subresult; + + /* + * We can special-case BitmapIndexScan children to avoid an explicit + * tbm_union step for each child: just pass down the current result + * bitmap and let the child OR directly into it. + */ + if (IsA(subnode, BitmapIndexScanState)) + { + if (result == NULL) /* first subplan */ + { + /* XXX should we use less than work_mem for this? */ + result = tbm_create(work_mem * 1024L, + ((BitmapOr *) node->ps.plan)->isshared ? + node->ps.state->es_query_dsa : NULL); + } + + ((BitmapIndexScanState *) subnode)->biss_result = result; + + subresult = (TIDBitmap *) MultiExecProcNode(subnode); + + if (subresult != result) + elog(ERROR, "unrecognized result from subplan"); + } + else + { + /* standard implementation */ + subresult = (TIDBitmap *) MultiExecProcNode(subnode); + + if (!subresult || !IsA(subresult, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + if (result == NULL) + result = subresult; /* first subplan */ + else + { + tbm_union(result, subresult); + tbm_free(subresult); + } + } + } + + /* We could return an empty result set here? */ + if (result == NULL) + elog(ERROR, "BitmapOr doesn't support zero inputs"); + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStopNode(node->ps.instrument, 0 /* XXX */ ); + + return (Node *) result; +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapOr + * + * Shuts down the subscans of the BitmapOr node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapOr(BitmapOrState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * shut down each of the subscans (that we've initialized) + */ + for (i = 0; i < nplans; i++) + { + if (bitmapplans[i]) + ExecEndNode(bitmapplans[i]); + } +} + +void +ExecReScanBitmapOr(BitmapOrState *node) +{ + int i; + + for (i = 0; i < node->nplans; i++) + { + PlanState *subnode = node->bitmapplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } +} diff --git a/src/backend/executor/nodeCtescan.c b/src/backend/executor/nodeCtescan.c new file mode 100644 index 0000000..9c2b08d --- /dev/null +++ b/src/backend/executor/nodeCtescan.c @@ -0,0 +1,351 @@ +/*------------------------------------------------------------------------- + * + * nodeCtescan.c + * routines to handle CteScan nodes. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeCtescan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeCtescan.h" +#include "miscadmin.h" + +static TupleTableSlot *CteScanNext(CteScanState *node); + +/* ---------------------------------------------------------------- + * CteScanNext + * + * This is a workhorse for ExecCteScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +CteScanNext(CteScanState *node) +{ + EState *estate; + ScanDirection dir; + bool forward; + Tuplestorestate *tuplestorestate; + bool eof_tuplestore; + TupleTableSlot *slot; + + /* + * get state info from node + */ + estate = node->ss.ps.state; + dir = estate->es_direction; + forward = ScanDirectionIsForward(dir); + tuplestorestate = node->leader->cte_table; + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + slot = node->ss.ss_ScanTupleSlot; + + /* + * If we are not at the end of the tuplestore, or are going backwards, try + * to fetch a tuple from tuplestore. + */ + eof_tuplestore = tuplestore_ateof(tuplestorestate); + + if (!forward && eof_tuplestore) + { + if (!node->leader->eof_cte) + { + /* + * When reversing direction at tuplestore EOF, the first + * gettupleslot call will fetch the last-added tuple; but we want + * to return the one before that, if possible. So do an extra + * fetch. + */ + if (!tuplestore_advance(tuplestorestate, forward)) + return NULL; /* the tuplestore must be empty */ + } + eof_tuplestore = false; + } + + /* + * If we can fetch another tuple from the tuplestore, return it. + * + * Note: we have to use copy=true in the tuplestore_gettupleslot call, + * because we are sharing the tuplestore with other nodes that might write + * into the tuplestore before we get called again. + */ + if (!eof_tuplestore) + { + if (tuplestore_gettupleslot(tuplestorestate, forward, true, slot)) + return slot; + if (forward) + eof_tuplestore = true; + } + + /* + * If necessary, try to fetch another row from the CTE query. + * + * Note: the eof_cte state variable exists to short-circuit further calls + * of the CTE plan. It's not optional, unfortunately, because some plan + * node types are not robust about being called again when they've already + * returned NULL. + */ + if (eof_tuplestore && !node->leader->eof_cte) + { + TupleTableSlot *cteslot; + + /* + * We can only get here with forward==true, so no need to worry about + * which direction the subplan will go. + */ + cteslot = ExecProcNode(node->cteplanstate); + if (TupIsNull(cteslot)) + { + node->leader->eof_cte = true; + return NULL; + } + + /* + * There are corner cases where the subplan could change which + * tuplestore read pointer is active, so be sure to reselect ours + * before storing the tuple we got. + */ + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + + /* + * Append a copy of the returned tuple to tuplestore. NOTE: because + * our read pointer is certainly in EOF state, its read position will + * move forward over the added tuple. This is what we want. Also, + * any other readers will *not* move past the new tuple, which is what + * they want. + */ + tuplestore_puttupleslot(tuplestorestate, cteslot); + + /* + * We MUST copy the CTE query's output tuple into our own slot. 
This + * is because other CteScan nodes might advance the CTE query before + * we are called again, and our output tuple must stay stable over + * that. + */ + return ExecCopySlot(slot, cteslot); + } + + /* + * Nothing left ... + */ + return ExecClearTuple(slot); +} + +/* + * CteScanRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +CteScanRecheck(CteScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecCteScan(node) + * + * Scans the CTE sequentially and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecCteScan(PlanState *pstate) +{ + CteScanState *node = castNode(CteScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) CteScanNext, + (ExecScanRecheckMtd) CteScanRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitCteScan + * ---------------------------------------------------------------- + */ +CteScanState * +ExecInitCteScan(CteScan *node, EState *estate, int eflags) +{ + CteScanState *scanstate; + ParamExecData *prmdata; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * For the moment we have to force the tuplestore to allow REWIND, because + * we might be asked to rescan the CTE even though upper levels didn't + * tell us to be prepared to do it efficiently. Annoying, since this + * prevents truncation of the tuplestore. XXX FIXME + * + * Note: if we are in an EPQ recheck plan tree, it's likely that no access + * to the tuplestore is needed at all, making this even more annoying. + * It's not worth improving that as long as all the read pointers would + * have REWIND anyway, but if we ever improve this logic then that aspect + * should be considered too. + */ + eflags |= EXEC_FLAG_REWIND; + + /* + * CteScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new CteScanState for node + */ + scanstate = makeNode(CteScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecCteScan; + scanstate->eflags = eflags; + scanstate->cte_table = NULL; + scanstate->eof_cte = false; + + /* + * Find the already-initialized plan for the CTE query. + */ + scanstate->cteplanstate = (PlanState *) list_nth(estate->es_subplanstates, + node->ctePlanId - 1); + + /* + * The Param slot associated with the CTE query is used to hold a pointer + * to the CteState of the first CteScan node that initializes for this + * CTE. This node will be the one that holds the shared state for all the + * CTEs, particularly the shared tuplestore. 
+ */ + prmdata = &(estate->es_param_exec_vals[node->cteParam]); + Assert(prmdata->execPlan == NULL); + Assert(!prmdata->isnull); + scanstate->leader = castNode(CteScanState, DatumGetPointer(prmdata->value)); + if (scanstate->leader == NULL) + { + /* I am the leader */ + prmdata->value = PointerGetDatum(scanstate); + scanstate->leader = scanstate; + scanstate->cte_table = tuplestore_begin_heap(true, false, work_mem); + tuplestore_set_eflags(scanstate->cte_table, scanstate->eflags); + scanstate->readptr = 0; + } + else + { + /* Not the leader */ + /* Create my own read pointer, and ensure it is at start */ + scanstate->readptr = + tuplestore_alloc_read_pointer(scanstate->leader->cte_table, + scanstate->eflags); + tuplestore_select_read_pointer(scanstate->leader->cte_table, + scanstate->readptr); + tuplestore_rescan(scanstate->leader->cte_table); + } + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * The scan tuple type (ie, the rowtype we expect to find in the work + * table) is the same as the result rowtype of the CTE query. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + ExecGetResultType(scanstate->cteplanstate), + &TTSOpsMinimalTuple); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndCteScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndCteScan(CteScanState *node) +{ + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * If I am the leader, free the tuplestore. + */ + if (node->leader == node) + { + tuplestore_end(node->cte_table); + node->cte_table = NULL; + } +} + +/* ---------------------------------------------------------------- + * ExecReScanCteScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanCteScan(CteScanState *node) +{ + Tuplestorestate *tuplestorestate = node->leader->cte_table; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + /* + * Clear the tuplestore if a new scan of the underlying CTE is required. + * This implicitly resets all the tuplestore's read pointers. Note that + * multiple CTE nodes might redundantly clear the tuplestore; that's OK, + * and not unduly expensive. We'll stop taking this path as soon as + * somebody has attempted to read something from the underlying CTE + * (thereby causing its chgParam to be cleared). + */ + if (node->leader->cteplanstate->chgParam != NULL) + { + tuplestore_clear(tuplestorestate); + node->leader->eof_cte = false; + } + else + { + /* + * Else, just rewind my own pointer. Either the underlying CTE + * doesn't need a rescan (and we can re-read what's in the tuplestore + * now), or somebody else already took care of it. 
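The leader/follower arrangement set up in ExecInitCteScan() and consumed by CteScanNext() boils down to one shared store of already-produced rows, a per-scan read position, and on-demand materialization of one more row whenever a reader runs past what has been stored. A standalone sketch of that pattern, with a plain array standing in for the shared tuplestore and a constant array standing in for the CTE query; all names and values are illustrative only.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ROWS 16

/* Shared state held by the "leader", standing in for the shared tuplestore
 * plus the eof_cte flag. */
typedef struct SharedCte
{
    int     rows[MAX_ROWS];   /* materialized CTE output so far */
    int     nrows;
    int     next_src;         /* how far the underlying "CTE query" has run */
    bool    eof_cte;
} SharedCte;

/* Each scan keeps only its own read position, like a tuplestore read pointer. */
typedef struct CteReader
{
    SharedCte  *leader;
    int         readpos;
} CteReader;

static const int cte_source[] = {10, 20, 30, 40};   /* the "CTE query" */

/* Return the next row for this reader, materializing more of the CTE on
 * demand; -1 means no more rows (cf. CteScanNext). */
static int
cte_next(CteReader *r)
{
    SharedCte  *s = r->leader;

    if (r->readpos < s->nrows)
        return s->rows[r->readpos++];        /* already materialized */

    if (s->eof_cte)
        return -1;                           /* nothing left anywhere */

    if (s->next_src >= (int) (sizeof(cte_source) / sizeof(cte_source[0])))
    {
        s->eof_cte = true;                   /* underlying query exhausted */
        return -1;
    }

    /* Run the underlying query one step and append to the shared store. */
    s->rows[s->nrows++] = cte_source[s->next_src++];
    return s->rows[r->readpos++];
}

int
main(void)
{
    SharedCte   shared = {{0}, 0, 0, false};
    CteReader   a = {&shared, 0};
    CteReader   b = {&shared, 0};

    printf("a: %d\n", cte_next(&a));   /* 10 (materializes row 1) */
    printf("a: %d\n", cte_next(&a));   /* 20 (materializes row 2) */
    printf("b: %d\n", cte_next(&b));   /* 10 (re-reads, no new work) */
    printf("a: %d\n", cte_next(&a));   /* 30 */
    printf("a: %d\n", cte_next(&a));   /* 40 */
    printf("a: %d\n", cte_next(&a));   /* -1 (CTE exhausted) */
    return 0;
}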
+ */ + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + tuplestore_rescan(tuplestorestate); + } +} diff --git a/src/backend/executor/nodeCustom.c b/src/backend/executor/nodeCustom.c new file mode 100644 index 0000000..c82060e --- /dev/null +++ b/src/backend/executor/nodeCustom.c @@ -0,0 +1,228 @@ +/* ------------------------------------------------------------------------ + * + * nodeCustom.c + * Routines to handle execution of custom scan node + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * ------------------------------------------------------------------------ + */ +#include "postgres.h" + +#include "access/parallel.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/extensible.h" +#include "nodes/plannodes.h" +#include "parser/parsetree.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static TupleTableSlot *ExecCustomScan(PlanState *pstate); + + +CustomScanState * +ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags) +{ + CustomScanState *css; + Relation scan_rel = NULL; + Index scanrelid = cscan->scan.scanrelid; + Index tlistvarno; + + /* + * Allocate the CustomScanState object. We let the custom scan provider + * do the palloc, in case it wants to make a larger object that embeds + * CustomScanState as the first field. It must set the node tag and the + * methods field correctly at this time. Other standard fields should be + * set to zero. + */ + css = castNode(CustomScanState, + cscan->methods->CreateCustomScanState(cscan)); + + /* ensure flags is filled correctly */ + css->flags = cscan->flags; + + /* fill up fields of ScanState */ + css->ss.ps.plan = &cscan->scan.plan; + css->ss.ps.state = estate; + css->ss.ps.ExecProcNode = ExecCustomScan; + + /* create expression context for node */ + ExecAssignExprContext(estate, &css->ss.ps); + + /* + * open the scan relation, if any + */ + if (scanrelid > 0) + { + scan_rel = ExecOpenScanRelation(estate, scanrelid, eflags); + css->ss.ss_currentRelation = scan_rel; + } + + /* + * Determine the scan tuple type. If the custom scan provider provided a + * targetlist describing the scan tuples, use that; else use base + * relation's rowtype. + */ + if (cscan->custom_scan_tlist != NIL || scan_rel == NULL) + { + TupleDesc scan_tupdesc; + + scan_tupdesc = ExecTypeFromTL(cscan->custom_scan_tlist); + ExecInitScanTupleSlot(estate, &css->ss, scan_tupdesc, &TTSOpsVirtual); + /* Node's targetlist will contain Vars with varno = INDEX_VAR */ + tlistvarno = INDEX_VAR; + } + else + { + ExecInitScanTupleSlot(estate, &css->ss, RelationGetDescr(scan_rel), + &TTSOpsVirtual); + /* Node's targetlist will contain Vars with varno = scanrelid */ + tlistvarno = scanrelid; + } + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&css->ss.ps, &TTSOpsVirtual); + ExecAssignScanProjectionInfoWithVarno(&css->ss, tlistvarno); + + /* initialize child expressions */ + css->ss.ps.qual = + ExecInitQual(cscan->scan.plan.qual, (PlanState *) css); + + /* + * The callback of custom-scan provider applies the final initialization + * of the custom-scan-state node according to its logic. 
+ */ + css->methods->BeginCustomScan(css, estate, eflags); + + return css; +} + +static TupleTableSlot * +ExecCustomScan(PlanState *pstate) +{ + CustomScanState *node = castNode(CustomScanState, pstate); + + CHECK_FOR_INTERRUPTS(); + + Assert(node->methods->ExecCustomScan != NULL); + return node->methods->ExecCustomScan(node); +} + +void +ExecEndCustomScan(CustomScanState *node) +{ + Assert(node->methods->EndCustomScan != NULL); + node->methods->EndCustomScan(node); + + /* Free the exprcontext */ + ExecFreeExprContext(&node->ss.ps); + + /* Clean out the tuple table */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +void +ExecReScanCustomScan(CustomScanState *node) +{ + Assert(node->methods->ReScanCustomScan != NULL); + node->methods->ReScanCustomScan(node); +} + +void +ExecCustomMarkPos(CustomScanState *node) +{ + if (!node->methods->MarkPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->MarkPosCustomScan(node); +} + +void +ExecCustomRestrPos(CustomScanState *node) +{ + if (!node->methods->RestrPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->RestrPosCustomScan(node); +} + +void +ExecCustomScanEstimate(CustomScanState *node, ParallelContext *pcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->EstimateDSMCustomScan) + { + node->pscan_len = methods->EstimateDSMCustomScan(node, pcxt); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } +} + +void +ExecCustomScanInitializeDSM(CustomScanState *node, ParallelContext *pcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->InitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); + methods->InitializeDSMCustomScan(node, pcxt, coordinate); + shm_toc_insert(pcxt->toc, plan_node_id, coordinate); + } +} + +void +ExecCustomScanReInitializeDSM(CustomScanState *node, ParallelContext *pcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->ReInitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + methods->ReInitializeDSMCustomScan(node, pcxt, coordinate); + } +} + +void +ExecCustomScanInitializeWorker(CustomScanState *node, + ParallelWorkerContext *pwcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->InitializeWorkerCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + methods->InitializeWorkerCustomScan(node, pwcxt->toc, coordinate); + } +} + +void +ExecShutdownCustomScan(CustomScanState *node) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->ShutdownCustomScan) + methods->ShutdownCustomScan(node); +} diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c new file mode 100644 index 0000000..d27849a --- /dev/null +++ b/src/backend/executor/nodeForeignscan.c @@ -0,0 +1,504 @@ +/*------------------------------------------------------------------------- + * + * nodeForeignscan.c + * Routines to support scans of foreign 
tables + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeForeignscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * + * ExecForeignScan scans a foreign table. + * ExecInitForeignScan creates and initializes state info. + * ExecReScanForeignScan rescans the foreign relation. + * ExecEndForeignScan releases any resources allocated. + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeForeignscan.h" +#include "foreign/fdwapi.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static TupleTableSlot *ForeignNext(ForeignScanState *node); +static bool ForeignRecheck(ForeignScanState *node, TupleTableSlot *slot); + + +/* ---------------------------------------------------------------- + * ForeignNext + * + * This is a workhorse for ExecForeignScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ForeignNext(ForeignScanState *node) +{ + TupleTableSlot *slot; + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + ExprContext *econtext = node->ss.ps.ps_ExprContext; + MemoryContext oldcontext; + + /* Call the Iterate function in short-lived context */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + if (plan->operation != CMD_SELECT) + { + /* + * direct modifications cannot be re-evaluated, so shouldn't get here + * during EvalPlanQual processing + */ + Assert(node->ss.ps.state->es_epq_active == NULL); + + slot = node->fdwroutine->IterateDirectModify(node); + } + else + slot = node->fdwroutine->IterateForeignScan(node); + MemoryContextSwitchTo(oldcontext); + + /* + * Insert valid value into tableoid, the only actually-useful system + * column. + */ + if (plan->fsSystemCol && !TupIsNull(slot)) + slot->tts_tableOid = RelationGetRelid(node->ss.ss_currentRelation); + + return slot; +} + +/* + * ForeignRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +ForeignRecheck(ForeignScanState *node, TupleTableSlot *slot) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + ExprContext *econtext; + + /* + * extract necessary information from foreign scan node + */ + econtext = node->ss.ps.ps_ExprContext; + + /* Does the tuple meet the remote qual condition? */ + econtext->ecxt_scantuple = slot; + + ResetExprContext(econtext); + + /* + * If an outer join is pushed down, RecheckForeignScan may need to store a + * different tuple in the slot, because a different set of columns may go + * to NULL upon recheck. Otherwise, it shouldn't need to change the slot + * contents, just return true or false to indicate whether the quals still + * pass. For simple cases, setting fdw_recheck_quals may be easier than + * providing this callback. + */ + if (fdwroutine->RecheckForeignScan && + !fdwroutine->RecheckForeignScan(node, slot)) + return false; + + return ExecQual(node->fdw_recheck_quals, econtext); +} + +/* ---------------------------------------------------------------- + * ExecForeignScan(node) + * + * Fetches the next tuple from the FDW, checks local quals, and + * returns it. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecForeignScan(PlanState *pstate) +{ + ForeignScanState *node = castNode(ForeignScanState, pstate); + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + + /* + * Ignore direct modifications when EvalPlanQual is active --- they are + * irrelevant for EvalPlanQual rechecking + */ + if (estate->es_epq_active != NULL && plan->operation != CMD_SELECT) + return NULL; + + return ExecScan(&node->ss, + (ExecScanAccessMtd) ForeignNext, + (ExecScanRecheckMtd) ForeignRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitForeignScan + * ---------------------------------------------------------------- + */ +ForeignScanState * +ExecInitForeignScan(ForeignScan *node, EState *estate, int eflags) +{ + ForeignScanState *scanstate; + Relation currentRelation = NULL; + Index scanrelid = node->scan.scanrelid; + Index tlistvarno; + FdwRoutine *fdwroutine; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + scanstate = makeNode(ForeignScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecForeignScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation, if any; also acquire function pointers from the + * FDW's handler + */ + if (scanrelid > 0) + { + currentRelation = ExecOpenScanRelation(estate, scanrelid, eflags); + scanstate->ss.ss_currentRelation = currentRelation; + fdwroutine = GetFdwRoutineForRelation(currentRelation, true); + } + else + { + /* We can't use the relcache, so get fdwroutine the hard way */ + fdwroutine = GetFdwRoutineByServerId(node->fs_server); + } + + /* + * Determine the scan tuple type. If the FDW provided a targetlist + * describing the scan tuples, use that; else use base relation's rowtype. + */ + if (node->fdw_scan_tlist != NIL || currentRelation == NULL) + { + TupleDesc scan_tupdesc; + + scan_tupdesc = ExecTypeFromTL(node->fdw_scan_tlist); + ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc, + &TTSOpsHeapTuple); + /* Node's targetlist will contain Vars with varno = INDEX_VAR */ + tlistvarno = INDEX_VAR; + } + else + { + TupleDesc scan_tupdesc; + + /* don't trust FDWs to return tuples fulfilling NOT NULL constraints */ + scan_tupdesc = CreateTupleDescCopy(RelationGetDescr(currentRelation)); + ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc, + &TTSOpsHeapTuple); + /* Node's targetlist will contain Vars with varno = scanrelid */ + tlistvarno = scanrelid; + } + + /* Don't know what an FDW might return */ + scanstate->ss.ps.scanopsfixed = false; + scanstate->ss.ps.scanopsset = true; + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfoWithVarno(&scanstate->ss, tlistvarno); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + scanstate->fdw_recheck_quals = + ExecInitQual(node->fdw_recheck_quals, (PlanState *) scanstate); + + /* + * Determine whether to scan the foreign relation asynchronously or not; + * this has to be kept in sync with the code in ExecInitAppend(). 
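+ *
+ * (The planner marks a ForeignScan async-capable only if the FDW supplies
+ * an IsForeignPathAsyncCapable callback that returned true for the chosen
+ * path; on top of that, we refuse async execution while EvalPlanQual is
+ * active.)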
+ */ + scanstate->ss.ps.async_capable = (((Plan *) node)->async_capable && + estate->es_epq_active == NULL); + + /* + * Initialize FDW-related state. + */ + scanstate->fdwroutine = fdwroutine; + scanstate->fdw_state = NULL; + + /* + * For the FDW's convenience, look up the modification target relation's + * ResultRelInfo. The ModifyTable node should have initialized it for us, + * see ExecInitModifyTable. + * + * Don't try to look up the ResultRelInfo when EvalPlanQual is active, + * though. Direct modifications cannot be re-evaluated as part of + * EvalPlanQual. The lookup wouldn't work anyway because during + * EvalPlanQual processing, EvalPlanQual only initializes the subtree + * under the ModifyTable, and doesn't run ExecInitModifyTable. + */ + if (node->resultRelation > 0 && estate->es_epq_active == NULL) + { + if (estate->es_result_relations == NULL || + estate->es_result_relations[node->resultRelation - 1] == NULL) + { + elog(ERROR, "result relation not initialized"); + } + scanstate->resultRelInfo = estate->es_result_relations[node->resultRelation - 1]; + } + + /* Initialize any outer plan. */ + if (outerPlan(node)) + outerPlanState(scanstate) = + ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Tell the FDW to initialize the scan. + */ + if (node->operation != CMD_SELECT) + { + /* + * Direct modifications cannot be re-evaluated by EvalPlanQual, so + * don't bother preparing the FDW. + * + * In case of an inherited UPDATE/DELETE with foreign targets there + * can be direct-modify ForeignScan nodes in the EvalPlanQual subtree, + * so we need to ignore such ForeignScan nodes during EvalPlanQual + * processing. See also ExecForeignScan/ExecReScanForeignScan. + */ + if (estate->es_epq_active == NULL) + fdwroutine->BeginDirectModify(scanstate, eflags); + } + else + fdwroutine->BeginForeignScan(scanstate, eflags); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndForeignScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndForeignScan(ForeignScanState *node) +{ + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + + /* Let the FDW shut down */ + if (plan->operation != CMD_SELECT) + { + if (estate->es_epq_active == NULL) + node->fdwroutine->EndDirectModify(node); + } + else + node->fdwroutine->EndForeignScan(node); + + /* Shut down any outer plan. */ + if (outerPlanState(node)) + ExecEndNode(outerPlanState(node)); + + /* Free the exprcontext */ + ExecFreeExprContext(&node->ss.ps); + + /* clean out the tuple table */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanForeignScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanForeignScan(ForeignScanState *node) +{ + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + PlanState *outerPlan = outerPlanState(node); + + /* + * Ignore direct modifications when EvalPlanQual is active --- they are + * irrelevant for EvalPlanQual rechecking + */ + if (estate->es_epq_active != NULL && plan->operation != CMD_SELECT) + return; + + node->fdwroutine->ReScanForeignScan(node); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
outerPlan may also be NULL, in which case there is + * nothing to rescan at all. + */ + if (outerPlan != NULL && outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecForeignScanEstimate + * + * Informs size of the parallel coordination information, if any + * ---------------------------------------------------------------- + */ +void +ExecForeignScanEstimate(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->EstimateDSMForeignScan) + { + node->pscan_len = fdwroutine->EstimateDSMForeignScan(node, pcxt); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanInitializeDSM + * + * Initialize the parallel coordination information + * ---------------------------------------------------------------- + */ +void +ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->InitializeDSMForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); + fdwroutine->InitializeDSMForeignScan(node, pcxt, coordinate); + shm_toc_insert(pcxt->toc, plan_node_id, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecForeignScanReInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->ReInitializeDSMForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + fdwroutine->ReInitializeDSMForeignScan(node, pcxt, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanInitializeWorker + * + * Initialization according to the parallel coordination information + * ---------------------------------------------------------------- + */ +void +ExecForeignScanInitializeWorker(ForeignScanState *node, + ParallelWorkerContext *pwcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->InitializeWorkerForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + fdwroutine->InitializeWorkerForeignScan(node, pwcxt->toc, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecShutdownForeignScan + * + * Gives FDW chance to stop asynchronous resource consumption + * and release any resources still held. 
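+ *
+ * This is called when the node is not expected to run to completion,
+ * e.g. when a LIMIT above it has been satisfied or before the parallel
+ * DSM segment is torn down; EndForeignScan may still be called later.
+ *
+ * Illustrative sketch only (MyFdwScanState, request_in_flight and
+ * cancel_pending_request are hypothetical): an async-capable FDW might
+ * cancel an outstanding remote request here:
+ *
+ *     static void
+ *     my_shutdown_foreign_scan(ForeignScanState *node)
+ *     {
+ *         MyFdwScanState *fsstate = (MyFdwScanState *) node->fdw_state;
+ *
+ *         if (fsstate != NULL && fsstate->request_in_flight)
+ *             cancel_pending_request(fsstate);
+ *     }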
+ * ---------------------------------------------------------------- + */ +void +ExecShutdownForeignScan(ForeignScanState *node) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->ShutdownForeignScan) + fdwroutine->ShutdownForeignScan(node); +} + +/* ---------------------------------------------------------------- + * ExecAsyncForeignScanRequest + * + * Asynchronously request a tuple from a designed async-capable node + * ---------------------------------------------------------------- + */ +void +ExecAsyncForeignScanRequest(AsyncRequest *areq) +{ + ForeignScanState *node = (ForeignScanState *) areq->requestee; + FdwRoutine *fdwroutine = node->fdwroutine; + + Assert(fdwroutine->ForeignAsyncRequest != NULL); + fdwroutine->ForeignAsyncRequest(areq); +} + +/* ---------------------------------------------------------------- + * ExecAsyncForeignScanConfigureWait + * + * In async mode, configure for a wait + * ---------------------------------------------------------------- + */ +void +ExecAsyncForeignScanConfigureWait(AsyncRequest *areq) +{ + ForeignScanState *node = (ForeignScanState *) areq->requestee; + FdwRoutine *fdwroutine = node->fdwroutine; + + Assert(fdwroutine->ForeignAsyncConfigureWait != NULL); + fdwroutine->ForeignAsyncConfigureWait(areq); +} + +/* ---------------------------------------------------------------- + * ExecAsyncForeignScanNotify + * + * Callback invoked when a relevant event has occurred + * ---------------------------------------------------------------- + */ +void +ExecAsyncForeignScanNotify(AsyncRequest *areq) +{ + ForeignScanState *node = (ForeignScanState *) areq->requestee; + FdwRoutine *fdwroutine = node->fdwroutine; + + Assert(fdwroutine->ForeignAsyncNotify != NULL); + fdwroutine->ForeignAsyncNotify(areq); +} diff --git a/src/backend/executor/nodeFunctionscan.c b/src/backend/executor/nodeFunctionscan.c new file mode 100644 index 0000000..b31b2b2 --- /dev/null +++ b/src/backend/executor/nodeFunctionscan.c @@ -0,0 +1,620 @@ +/*------------------------------------------------------------------------- + * + * nodeFunctionscan.c + * Support routines for scanning RangeFunctions (functions in rangetable). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeFunctionscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecFunctionScan scans a function. + * ExecFunctionNext retrieve next tuple in sequential order. + * ExecInitFunctionScan creates and initializes a functionscan node. + * ExecEndFunctionScan releases any storage allocated. + * ExecReScanFunctionScan rescans the function + */ +#include "postgres.h" + +#include "catalog/pg_type.h" +#include "executor/nodeFunctionscan.h" +#include "funcapi.h" +#include "nodes/nodeFuncs.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + + +/* + * Runtime data for each function being scanned. 
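+ *
+ * There is one entry per function in the FROM-clause item; more than one
+ * occurs only for the ROWS FROM (f1(...), f2(...)) syntax.  For example,
+ * SELECT * FROM ROWS FROM (generate_series(1, 3), unnest(ARRAY['a', 'b']))
+ * WITH ORDINALITY scans two functions, pads the shorter result with nulls,
+ * and appends an int8 ordinality column, so the "simple" fast path used
+ * for a single function without ordinality does not apply.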
+ */ +typedef struct FunctionScanPerFuncState +{ + SetExprState *setexpr; /* state of the expression being evaluated */ + TupleDesc tupdesc; /* desc of the function result type */ + int colcount; /* expected number of result columns */ + Tuplestorestate *tstore; /* holds the function result set */ + int64 rowcount; /* # of rows in result set, -1 if not known */ + TupleTableSlot *func_slot; /* function result slot (or NULL) */ +} FunctionScanPerFuncState; + +static TupleTableSlot *FunctionNext(FunctionScanState *node); + + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ +/* ---------------------------------------------------------------- + * FunctionNext + * + * This is a workhorse for ExecFunctionScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +FunctionNext(FunctionScanState *node) +{ + EState *estate; + ScanDirection direction; + TupleTableSlot *scanslot; + bool alldone; + int64 oldpos; + int funcno; + int att; + + /* + * get information from the estate and scan state + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + scanslot = node->ss.ss_ScanTupleSlot; + + if (node->simple) + { + /* + * Fast path for the trivial case: the function return type and scan + * result type are the same, so we fetch the function result straight + * into the scan result slot. No need to update ordinality or + * rowcounts either. + */ + Tuplestorestate *tstore = node->funcstates[0].tstore; + + /* + * If first time through, read all tuples from function and put them + * in a tuplestore. Subsequent calls just fetch tuples from + * tuplestore. + */ + if (tstore == NULL) + { + node->funcstates[0].tstore = tstore = + ExecMakeTableFunctionResult(node->funcstates[0].setexpr, + node->ss.ps.ps_ExprContext, + node->argcontext, + node->funcstates[0].tupdesc, + node->eflags & EXEC_FLAG_BACKWARD); + + /* + * paranoia - cope if the function, which may have constructed the + * tuplestore itself, didn't leave it pointing at the start. This + * call is fast, so the overhead shouldn't be an issue. + */ + tuplestore_rescan(tstore); + } + + /* + * Get the next tuple from tuplestore. + */ + (void) tuplestore_gettupleslot(tstore, + ScanDirectionIsForward(direction), + false, + scanslot); + return scanslot; + } + + /* + * Increment or decrement ordinal counter before checking for end-of-data, + * so that we can move off either end of the result by 1 (and no more than + * 1) without losing correct count. See PortalRunSelect for why we can + * assume that we won't be called repeatedly in the end-of-data state. + */ + oldpos = node->ordinal; + if (ScanDirectionIsForward(direction)) + node->ordinal++; + else + node->ordinal--; + + /* + * Main loop over functions. + * + * We fetch the function results into func_slots (which match the function + * return types), and then copy the values to scanslot (which matches the + * scan result type), setting the ordinal column (if any) as well. + */ + ExecClearTuple(scanslot); + att = 0; + alldone = true; + for (funcno = 0; funcno < node->nfuncs; funcno++) + { + FunctionScanPerFuncState *fs = &node->funcstates[funcno]; + int i; + + /* + * If first time through, read all tuples from function and put them + * in a tuplestore. Subsequent calls just fetch tuples from + * tuplestore. 
+ */ + if (fs->tstore == NULL) + { + fs->tstore = + ExecMakeTableFunctionResult(fs->setexpr, + node->ss.ps.ps_ExprContext, + node->argcontext, + fs->tupdesc, + node->eflags & EXEC_FLAG_BACKWARD); + + /* + * paranoia - cope if the function, which may have constructed the + * tuplestore itself, didn't leave it pointing at the start. This + * call is fast, so the overhead shouldn't be an issue. + */ + tuplestore_rescan(fs->tstore); + } + + /* + * Get the next tuple from tuplestore. + * + * If we have a rowcount for the function, and we know the previous + * read position was out of bounds, don't try the read. This allows + * backward scan to work when there are mixed row counts present. + */ + if (fs->rowcount != -1 && fs->rowcount < oldpos) + ExecClearTuple(fs->func_slot); + else + (void) tuplestore_gettupleslot(fs->tstore, + ScanDirectionIsForward(direction), + false, + fs->func_slot); + + if (TupIsNull(fs->func_slot)) + { + /* + * If we ran out of data for this function in the forward + * direction then we now know how many rows it returned. We need + * to know this in order to handle backwards scans. The row count + * we store is actually 1+ the actual number, because we have to + * position the tuplestore 1 off its end sometimes. + */ + if (ScanDirectionIsForward(direction) && fs->rowcount == -1) + fs->rowcount = node->ordinal; + + /* + * populate the result cols with nulls + */ + for (i = 0; i < fs->colcount; i++) + { + scanslot->tts_values[att] = (Datum) 0; + scanslot->tts_isnull[att] = true; + att++; + } + } + else + { + /* + * we have a result, so just copy it to the result cols. + */ + slot_getallattrs(fs->func_slot); + + for (i = 0; i < fs->colcount; i++) + { + scanslot->tts_values[att] = fs->func_slot->tts_values[i]; + scanslot->tts_isnull[att] = fs->func_slot->tts_isnull[i]; + att++; + } + + /* + * We're not done until every function result is exhausted; we pad + * the shorter results with nulls until then. + */ + alldone = false; + } + } + + /* + * ordinal col is always last, per spec. + */ + if (node->ordinality) + { + scanslot->tts_values[att] = Int64GetDatumFast(node->ordinal); + scanslot->tts_isnull[att] = false; + } + + /* + * If alldone, we just return the previously-cleared scanslot. Otherwise, + * finish creating the virtual tuple. + */ + if (!alldone) + ExecStoreVirtualTuple(scanslot); + + return scanslot; +} + +/* + * FunctionRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +FunctionRecheck(FunctionScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecFunctionScan(node) + * + * Scans the function sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecFunctionScan(PlanState *pstate) +{ + FunctionScanState *node = castNode(FunctionScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) FunctionNext, + (ExecScanRecheckMtd) FunctionRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitFunctionScan + * ---------------------------------------------------------------- + */ +FunctionScanState * +ExecInitFunctionScan(FunctionScan *node, EState *estate, int eflags) +{ + FunctionScanState *scanstate; + int nfuncs = list_length(node->functions); + TupleDesc scan_tupdesc; + int i, + natts; + ListCell *lc; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * FunctionScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new ScanState for node + */ + scanstate = makeNode(FunctionScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecFunctionScan; + scanstate->eflags = eflags; + + /* + * are we adding an ordinality column? + */ + scanstate->ordinality = node->funcordinality; + + scanstate->nfuncs = nfuncs; + if (nfuncs == 1 && !node->funcordinality) + scanstate->simple = true; + else + scanstate->simple = false; + + /* + * Ordinal 0 represents the "before the first row" position. + * + * We need to track ordinal position even when not adding an ordinality + * column to the result, in order to handle backwards scanning properly + * with multiple functions with different result sizes. (We can't position + * any individual function's tuplestore any more than 1 place beyond its + * end, so when scanning backwards, we need to know when to start + * including the function in the scan again.) + */ + scanstate->ordinal = 0; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + scanstate->funcstates = palloc(nfuncs * sizeof(FunctionScanPerFuncState)); + + natts = 0; + i = 0; + foreach(lc, node->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + Node *funcexpr = rtfunc->funcexpr; + int colcount = rtfunc->funccolcount; + FunctionScanPerFuncState *fs = &scanstate->funcstates[i]; + TypeFuncClass functypclass; + Oid funcrettype; + TupleDesc tupdesc; + + fs->setexpr = + ExecInitTableFunctionResult((Expr *) funcexpr, + scanstate->ss.ps.ps_ExprContext, + &scanstate->ss.ps); + + /* + * Don't allocate the tuplestores; the actual calls to the functions + * do that. NULL means that we have not called the function yet (or + * need to call it again after a rescan). + */ + fs->tstore = NULL; + fs->rowcount = -1; + + /* + * Now determine if the function returns a simple or composite type, + * and build an appropriate tupdesc. Note that in the composite case, + * the function may now return more columns than it did when the plan + * was made; we have to ignore any columns beyond "colcount". + */ + functypclass = get_expr_result_type(funcexpr, + &funcrettype, + &tupdesc); + + if (functypclass == TYPEFUNC_COMPOSITE || + functypclass == TYPEFUNC_COMPOSITE_DOMAIN) + { + /* Composite data type, e.g. 
a table's row type */ + Assert(tupdesc); + Assert(tupdesc->natts >= colcount); + /* Must copy it out of typcache for safety */ + tupdesc = CreateTupleDescCopy(tupdesc); + } + else if (functypclass == TYPEFUNC_SCALAR) + { + /* Base data type, i.e. scalar */ + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, + (AttrNumber) 1, + NULL, /* don't care about the name here */ + funcrettype, + -1, + 0); + TupleDescInitEntryCollation(tupdesc, + (AttrNumber) 1, + exprCollation(funcexpr)); + } + else if (functypclass == TYPEFUNC_RECORD) + { + tupdesc = BuildDescFromLists(rtfunc->funccolnames, + rtfunc->funccoltypes, + rtfunc->funccoltypmods, + rtfunc->funccolcollations); + + /* + * For RECORD results, make sure a typmod has been assigned. (The + * function should do this for itself, but let's cover things in + * case it doesn't.) + */ + BlessTupleDesc(tupdesc); + } + else + { + /* crummy error message, but parser should have caught this */ + elog(ERROR, "function in FROM has unsupported return type"); + } + + fs->tupdesc = tupdesc; + fs->colcount = colcount; + + /* + * We only need separate slots for the function results if we are + * doing ordinality or multiple functions; otherwise, we'll fetch + * function results directly into the scan slot. + */ + if (!scanstate->simple) + { + fs->func_slot = ExecInitExtraTupleSlot(estate, fs->tupdesc, + &TTSOpsMinimalTuple); + } + else + fs->func_slot = NULL; + + natts += colcount; + i++; + } + + /* + * Create the combined TupleDesc + * + * If there is just one function without ordinality, the scan result + * tupdesc is the same as the function result tupdesc --- except that we + * may stuff new names into it below, so drop any rowtype label. + */ + if (scanstate->simple) + { + scan_tupdesc = CreateTupleDescCopy(scanstate->funcstates[0].tupdesc); + scan_tupdesc->tdtypeid = RECORDOID; + scan_tupdesc->tdtypmod = -1; + } + else + { + AttrNumber attno = 0; + + if (node->funcordinality) + natts++; + + scan_tupdesc = CreateTemplateTupleDesc(natts); + + for (i = 0; i < nfuncs; i++) + { + TupleDesc tupdesc = scanstate->funcstates[i].tupdesc; + int colcount = scanstate->funcstates[i].colcount; + int j; + + for (j = 1; j <= colcount; j++) + TupleDescCopyEntry(scan_tupdesc, ++attno, tupdesc, j); + } + + /* If doing ordinality, add a column of type "bigint" at the end */ + if (node->funcordinality) + { + TupleDescInitEntry(scan_tupdesc, + ++attno, + NULL, /* don't care about the name here */ + INT8OID, + -1, + 0); + } + + Assert(attno == natts); + } + + /* + * Initialize scan slot and type. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc, + &TTSOpsMinimalTuple); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + /* + * Create a memory context that ExecMakeTableFunctionResult can use to + * evaluate function arguments in. We can't use the per-tuple context for + * this because it gets reset too often; but we don't want to leak + * evaluation results into the query-lifespan context either. We just + * need one context, because we evaluate each function separately. 
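+ *
+ * The context is created as a child of the per-query context (that's what
+ * CurrentMemoryContext is here), so it is released automatically at
+ * executor shutdown and needs no explicit cleanup in ExecEndFunctionScan.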
+ */ + scanstate->argcontext = AllocSetContextCreate(CurrentMemoryContext, + "Table function arguments", + ALLOCSET_DEFAULT_SIZES); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndFunctionScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndFunctionScan(FunctionScanState *node) +{ + int i; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * Release slots and tuplestore resources + */ + for (i = 0; i < node->nfuncs; i++) + { + FunctionScanPerFuncState *fs = &node->funcstates[i]; + + if (fs->func_slot) + ExecClearTuple(fs->func_slot); + + if (fs->tstore != NULL) + { + tuplestore_end(node->funcstates[i].tstore); + fs->tstore = NULL; + } + } +} + +/* ---------------------------------------------------------------- + * ExecReScanFunctionScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanFunctionScan(FunctionScanState *node) +{ + FunctionScan *scan = (FunctionScan *) node->ss.ps.plan; + int i; + Bitmapset *chgparam = node->ss.ps.chgParam; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + for (i = 0; i < node->nfuncs; i++) + { + FunctionScanPerFuncState *fs = &node->funcstates[i]; + + if (fs->func_slot) + ExecClearTuple(fs->func_slot); + } + + ExecScanReScan(&node->ss); + + /* + * Here we have a choice whether to drop the tuplestores (and recompute + * the function outputs) or just rescan them. We must recompute if an + * expression contains changed parameters, else we rescan. + * + * XXX maybe we should recompute if the function is volatile? But in + * general the executor doesn't conditionalize its actions on that. + */ + if (chgparam) + { + ListCell *lc; + + i = 0; + foreach(lc, scan->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + + if (bms_overlap(chgparam, rtfunc->funcparams)) + { + if (node->funcstates[i].tstore != NULL) + { + tuplestore_end(node->funcstates[i].tstore); + node->funcstates[i].tstore = NULL; + } + node->funcstates[i].rowcount = -1; + } + i++; + } + } + + /* Reset ordinality counter */ + node->ordinal = 0; + + /* Make sure we rewind any remaining tuplestores */ + for (i = 0; i < node->nfuncs; i++) + { + if (node->funcstates[i].tstore != NULL) + tuplestore_rescan(node->funcstates[i].tstore); + } +} diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c new file mode 100644 index 0000000..734142b --- /dev/null +++ b/src/backend/executor/nodeGather.c @@ -0,0 +1,477 @@ +/*------------------------------------------------------------------------- + * + * nodeGather.c + * Support routines for scanning a plan via multiple workers. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * A Gather executor launches parallel workers to run multiple copies of a + * plan. It can also run the plan itself, if the workers are not available + * or have not started up yet. It then merges all of the results it produces + * and the results from the workers into a single output stream. 
Therefore, + * it will normally be used with a plan where running multiple copies of the + * same plan does not produce duplicate output, such as parallel-aware + * SeqScan. + * + * Alternatively, a Gather node can be configured to use just one worker + * and the single-copy flag can be set. In this case, the Gather node will + * run the plan in one worker and will not execute the plan itself. In + * this case, it simply returns whatever tuples were returned by the worker. + * If a worker cannot be obtained, then it will run the plan itself and + * return the results. Therefore, a plan used with a single-copy Gather + * node need not be parallel-aware. + * + * IDENTIFICATION + * src/backend/executor/nodeGather.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relscan.h" +#include "access/xact.h" +#include "executor/execdebug.h" +#include "executor/execParallel.h" +#include "executor/nodeGather.h" +#include "executor/nodeSubplan.h" +#include "executor/tqueue.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +static TupleTableSlot *ExecGather(PlanState *pstate); +static TupleTableSlot *gather_getnext(GatherState *gatherstate); +static MinimalTuple gather_readnext(GatherState *gatherstate); +static void ExecShutdownGatherWorkers(GatherState *node); + + +/* ---------------------------------------------------------------- + * ExecInitGather + * ---------------------------------------------------------------- + */ +GatherState * +ExecInitGather(Gather *node, EState *estate, int eflags) +{ + GatherState *gatherstate; + Plan *outerNode; + TupleDesc tupDesc; + + /* Gather node doesn't have innerPlan node. */ + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + gatherstate = makeNode(GatherState); + gatherstate->ps.plan = (Plan *) node; + gatherstate->ps.state = estate; + gatherstate->ps.ExecProcNode = ExecGather; + + gatherstate->initialized = false; + gatherstate->need_to_scan_locally = + !node->single_copy && parallel_leader_participation; + gatherstate->tuples_needed = -1; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &gatherstate->ps); + + /* + * now initialize outer plan + */ + outerNode = outerPlan(node); + outerPlanState(gatherstate) = ExecInitNode(outerNode, estate, eflags); + tupDesc = ExecGetResultType(outerPlanState(gatherstate)); + + /* + * Leader may access ExecProcNode result directly (if + * need_to_scan_locally), or from workers via tuple queue. So we can't + * trivially rely on the slot type being fixed for expressions evaluated + * within this node. + */ + gatherstate->ps.outeropsset = true; + gatherstate->ps.outeropsfixed = false; + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&gatherstate->ps); + ExecConditionalAssignProjectionInfo(&gatherstate->ps, tupDesc, OUTER_VAR); + + /* + * Without projections result slot type is not trivially known, see + * comment above. + */ + if (gatherstate->ps.ps_ProjInfo == NULL) + { + gatherstate->ps.resultopsset = true; + gatherstate->ps.resultopsfixed = false; + } + + /* + * Initialize funnel slot to same tuple descriptor as outer plan. + */ + gatherstate->funnel_slot = ExecInitExtraTupleSlot(estate, tupDesc, + &TTSOpsMinimalTuple); + + /* + * Gather doesn't support checking a qual (it's always more efficient to + * do it in the child node). 
+ */ + Assert(!node->plan.qual); + + return gatherstate; +} + +/* ---------------------------------------------------------------- + * ExecGather(node) + * + * Scans the relation via multiple workers and returns + * the next qualifying tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecGather(PlanState *pstate) +{ + GatherState *node = castNode(GatherState, pstate); + TupleTableSlot *slot; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + /* + * Initialize the parallel context and workers on first execution. We do + * this on first execution rather than during node initialization, as it + * needs to allocate a large dynamic segment, so it is better to do it + * only if it is really needed. + */ + if (!node->initialized) + { + EState *estate = node->ps.state; + Gather *gather = (Gather *) node->ps.plan; + + /* + * Sometimes we might have to run without parallelism; but if parallel + * mode is active then we can try to fire up some workers. + */ + if (gather->num_workers > 0 && estate->es_use_parallel_mode) + { + ParallelContext *pcxt; + + /* Initialize, or re-initialize, shared state needed by workers. */ + if (!node->pei) + node->pei = ExecInitParallelPlan(node->ps.lefttree, + estate, + gather->initParam, + gather->num_workers, + node->tuples_needed); + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei, + gather->initParam); + + /* + * Register backend workers. We might not get as many as we + * requested, or indeed any at all. + */ + pcxt = node->pei->pcxt; + LaunchParallelWorkers(pcxt); + /* We save # workers launched for the benefit of EXPLAIN */ + node->nworkers_launched = pcxt->nworkers_launched; + + /* Set up tuple queue readers to read the results. */ + if (pcxt->nworkers_launched > 0) + { + ExecParallelCreateReaders(node->pei); + /* Make a working array showing the active readers */ + node->nreaders = pcxt->nworkers_launched; + node->reader = (TupleQueueReader **) + palloc(node->nreaders * sizeof(TupleQueueReader *)); + memcpy(node->reader, node->pei->reader, + node->nreaders * sizeof(TupleQueueReader *)); + } + else + { + /* No workers? Then never mind. */ + node->nreaders = 0; + node->reader = NULL; + } + node->nextreader = 0; + } + + /* Run plan locally if no workers or enabled and not single-copy. */ + node->need_to_scan_locally = (node->nreaders == 0) + || (!gather->single_copy && parallel_leader_participation); + node->initialized = true; + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); + + /* + * Get next tuple, either from one of our workers, or by running the plan + * ourselves. + */ + slot = gather_getnext(node); + if (TupIsNull(slot)) + return NULL; + + /* If no projection is required, we're done. */ + if (node->ps.ps_ProjInfo == NULL) + return slot; + + /* + * Form the result tuple using ExecProject(), and return it. + */ + econtext->ecxt_outertuple = slot; + return ExecProject(node->ps.ps_ProjInfo); +} + +/* ---------------------------------------------------------------- + * ExecEndGather + * + * frees any storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndGather(GatherState *node) +{ + ExecEndNode(outerPlanState(node)); /* let children clean up first */ + ExecShutdownGather(node); + ExecFreeExprContext(&node->ps); + if (node->ps.ps_ResultTupleSlot) + ExecClearTuple(node->ps.ps_ResultTupleSlot); +} + +/* + * Read the next tuple. We might fetch a tuple from one of the tuple queues + * using gather_readnext, or if no tuple queue contains a tuple and the + * single_copy flag is not set, we might generate one locally instead. + */ +static TupleTableSlot * +gather_getnext(GatherState *gatherstate) +{ + PlanState *outerPlan = outerPlanState(gatherstate); + TupleTableSlot *outerTupleSlot; + TupleTableSlot *fslot = gatherstate->funnel_slot; + MinimalTuple tup; + + while (gatherstate->nreaders > 0 || gatherstate->need_to_scan_locally) + { + CHECK_FOR_INTERRUPTS(); + + if (gatherstate->nreaders > 0) + { + tup = gather_readnext(gatherstate); + + if (HeapTupleIsValid(tup)) + { + ExecStoreMinimalTuple(tup, /* tuple to store */ + fslot, /* slot to store the tuple */ + false); /* don't pfree tuple */ + return fslot; + } + } + + if (gatherstate->need_to_scan_locally) + { + EState *estate = gatherstate->ps.state; + + /* Install our DSA area while executing the plan. */ + estate->es_query_dsa = + gatherstate->pei ? gatherstate->pei->area : NULL; + outerTupleSlot = ExecProcNode(outerPlan); + estate->es_query_dsa = NULL; + + if (!TupIsNull(outerTupleSlot)) + return outerTupleSlot; + + gatherstate->need_to_scan_locally = false; + } + } + + return ExecClearTuple(fslot); +} + +/* + * Attempt to read a tuple from one of our parallel workers. + */ +static MinimalTuple +gather_readnext(GatherState *gatherstate) +{ + int nvisited = 0; + + for (;;) + { + TupleQueueReader *reader; + MinimalTuple tup; + bool readerdone; + + /* Check for async events, particularly messages from workers. */ + CHECK_FOR_INTERRUPTS(); + + /* + * Attempt to read a tuple, but don't block if none is available. + * + * Note that TupleQueueReaderNext will just return NULL for a worker + * which fails to initialize. We'll treat that worker as having + * produced no tuples; WaitForParallelWorkersToFinish will error out + * when we get there. + */ + Assert(gatherstate->nextreader < gatherstate->nreaders); + reader = gatherstate->reader[gatherstate->nextreader]; + tup = TupleQueueReaderNext(reader, true, &readerdone); + + /* + * If this reader is done, remove it from our working array of active + * readers. If all readers are done, we're outta here. + */ + if (readerdone) + { + Assert(!tup); + --gatherstate->nreaders; + if (gatherstate->nreaders == 0) + { + ExecShutdownGatherWorkers(gatherstate); + return NULL; + } + memmove(&gatherstate->reader[gatherstate->nextreader], + &gatherstate->reader[gatherstate->nextreader + 1], + sizeof(TupleQueueReader *) + * (gatherstate->nreaders - gatherstate->nextreader)); + if (gatherstate->nextreader >= gatherstate->nreaders) + gatherstate->nextreader = 0; + continue; + } + + /* If we got a tuple, return it. */ + if (tup) + return tup; + + /* + * Advance nextreader pointer in round-robin fashion. Note that we + * only reach this code if we weren't able to get a tuple from the + * current worker. We used to advance the nextreader pointer after + * every tuple, but it turns out to be much more efficient to keep + * reading from the same queue until that would require blocking. 
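+ *
+ * For example, with three active readers and nextreader == 2, a read that
+ * would block advances us to reader 0.  Once nvisited reaches nreaders
+ * without obtaining a tuple, we either return NULL so the leader can
+ * produce a tuple from its local copy of the plan, or sleep on the process
+ * latch until some worker signals that its queue has data again.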
+ */ + gatherstate->nextreader++; + if (gatherstate->nextreader >= gatherstate->nreaders) + gatherstate->nextreader = 0; + + /* Have we visited every (surviving) TupleQueueReader? */ + nvisited++; + if (nvisited >= gatherstate->nreaders) + { + /* + * If (still) running plan locally, return NULL so caller can + * generate another tuple from the local copy of the plan. + */ + if (gatherstate->need_to_scan_locally) + return NULL; + + /* Nothing to do except wait for developments. */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_EXECUTE_GATHER); + ResetLatch(MyLatch); + nvisited = 0; + } + } +} + +/* ---------------------------------------------------------------- + * ExecShutdownGatherWorkers + * + * Stop all the parallel workers. + * ---------------------------------------------------------------- + */ +static void +ExecShutdownGatherWorkers(GatherState *node) +{ + if (node->pei != NULL) + ExecParallelFinish(node->pei); + + /* Flush local copy of reader array */ + if (node->reader) + pfree(node->reader); + node->reader = NULL; +} + +/* ---------------------------------------------------------------- + * ExecShutdownGather + * + * Destroy the setup for parallel workers including parallel context. + * ---------------------------------------------------------------- + */ +void +ExecShutdownGather(GatherState *node) +{ + ExecShutdownGatherWorkers(node); + + /* Now destroy the parallel context. */ + if (node->pei != NULL) + { + ExecParallelCleanup(node->pei); + node->pei = NULL; + } +} + +/* ---------------------------------------------------------------- + * Join Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecReScanGather + * + * Prepare to re-scan the result of a Gather. + * ---------------------------------------------------------------- + */ +void +ExecReScanGather(GatherState *node) +{ + Gather *gather = (Gather *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ + ExecShutdownGatherWorkers(node); + + /* Mark node so that shared state will be rebuilt at next call */ + node->initialized = false; + + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gather->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gather->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. 
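+ *
+ * As a concrete illustration, a parallel-aware SeqScan keeps its block
+ * allocation state in the DSM segment: its ReInitializeDSM callback resets
+ * that shared state (via table_parallelscan_reinitialize), whereas its
+ * ReScan callback resets only backend-local scan state.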
+ */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c new file mode 100644 index 0000000..03f02a1 --- /dev/null +++ b/src/backend/executor/nodeGatherMerge.c @@ -0,0 +1,789 @@ +/*------------------------------------------------------------------------- + * + * nodeGatherMerge.c + * Scan a plan in multiple workers, and do order-preserving merge. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeGatherMerge.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relscan.h" +#include "access/xact.h" +#include "executor/execdebug.h" +#include "executor/execParallel.h" +#include "executor/nodeGatherMerge.h" +#include "executor/nodeSubplan.h" +#include "executor/tqueue.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * When we read tuples from workers, it's a good idea to read several at once + * for efficiency when possible: this minimizes context-switching overhead. + * But reading too many at a time wastes memory without improving performance. + * We'll read up to MAX_TUPLE_STORE tuples (in addition to the first one). + */ +#define MAX_TUPLE_STORE 10 + +/* + * Pending-tuple array for each worker. This holds additional tuples that + * we were able to fetch from the worker, but can't process yet. In addition, + * this struct holds the "done" flag indicating the worker is known to have + * no more tuples. (We do not use this struct for the leader; we don't keep + * any pending tuples for the leader, and the need_to_scan_locally flag serves + * as its "done" indicator.) + */ +typedef struct GMReaderTupleBuffer +{ + MinimalTuple *tuple; /* array of length MAX_TUPLE_STORE */ + int nTuples; /* number of tuples currently stored */ + int readCounter; /* index of next tuple to extract */ + bool done; /* true if reader is known exhausted */ +} GMReaderTupleBuffer; + +static TupleTableSlot *ExecGatherMerge(PlanState *pstate); +static int32 heap_compare_slots(Datum a, Datum b, void *arg); +static TupleTableSlot *gather_merge_getnext(GatherMergeState *gm_state); +static MinimalTuple gm_readnext_tuple(GatherMergeState *gm_state, int nreader, + bool nowait, bool *done); +static void ExecShutdownGatherMergeWorkers(GatherMergeState *node); +static void gather_merge_setup(GatherMergeState *gm_state); +static void gather_merge_init(GatherMergeState *gm_state); +static void gather_merge_clear_tuples(GatherMergeState *gm_state); +static bool gather_merge_readnext(GatherMergeState *gm_state, int reader, + bool nowait); +static void load_tuple_array(GatherMergeState *gm_state, int reader); + +/* ---------------------------------------------------------------- + * ExecInitGather + * ---------------------------------------------------------------- + */ +GatherMergeState * +ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags) +{ + GatherMergeState *gm_state; + Plan *outerNode; + TupleDesc tupDesc; + + /* Gather merge node doesn't have innerPlan node. 
*/ + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + gm_state = makeNode(GatherMergeState); + gm_state->ps.plan = (Plan *) node; + gm_state->ps.state = estate; + gm_state->ps.ExecProcNode = ExecGatherMerge; + + gm_state->initialized = false; + gm_state->gm_initialized = false; + gm_state->tuples_needed = -1; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &gm_state->ps); + + /* + * GatherMerge doesn't support checking a qual (it's always more efficient + * to do it in the child node). + */ + Assert(!node->plan.qual); + + /* + * now initialize outer plan + */ + outerNode = outerPlan(node); + outerPlanState(gm_state) = ExecInitNode(outerNode, estate, eflags); + + /* + * Leader may access ExecProcNode result directly (if + * need_to_scan_locally), or from workers via tuple queue. So we can't + * trivially rely on the slot type being fixed for expressions evaluated + * within this node. + */ + gm_state->ps.outeropsset = true; + gm_state->ps.outeropsfixed = false; + + /* + * Store the tuple descriptor into gather merge state, so we can use it + * while initializing the gather merge slots. + */ + tupDesc = ExecGetResultType(outerPlanState(gm_state)); + gm_state->tupDesc = tupDesc; + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&gm_state->ps); + ExecConditionalAssignProjectionInfo(&gm_state->ps, tupDesc, OUTER_VAR); + + /* + * Without projections result slot type is not trivially known, see + * comment above. + */ + if (gm_state->ps.ps_ProjInfo == NULL) + { + gm_state->ps.resultopsset = true; + gm_state->ps.resultopsfixed = false; + } + + /* + * initialize sort-key information + */ + if (node->numCols) + { + int i; + + gm_state->gm_nkeys = node->numCols; + gm_state->gm_sortkeys = + palloc0(sizeof(SortSupportData) * node->numCols); + + for (i = 0; i < node->numCols; i++) + { + SortSupport sortKey = gm_state->gm_sortkeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = node->collations[i]; + sortKey->ssup_nulls_first = node->nullsFirst[i]; + sortKey->ssup_attno = node->sortColIdx[i]; + + /* + * We don't perform abbreviated key conversion here, for the same + * reasons that it isn't used in MergeAppend + */ + sortKey->abbreviate = false; + + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); + } + } + + /* Now allocate the workspace for gather merge */ + gather_merge_setup(gm_state); + + return gm_state; +} + +/* ---------------------------------------------------------------- + * ExecGatherMerge(node) + * + * Scans the relation via multiple workers and returns + * the next qualifying tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecGatherMerge(PlanState *pstate) +{ + GatherMergeState *node = castNode(GatherMergeState, pstate); + TupleTableSlot *slot; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + /* + * As with Gather, we don't launch workers until this node is actually + * executed. + */ + if (!node->initialized) + { + EState *estate = node->ps.state; + GatherMerge *gm = castNode(GatherMerge, node->ps.plan); + + /* + * Sometimes we might have to run without parallelism; but if parallel + * mode is active then we can try to fire up some workers. + */ + if (gm->num_workers > 0 && estate->es_use_parallel_mode) + { + ParallelContext *pcxt; + + /* Initialize, or re-initialize, shared state needed by workers. 
*/ + if (!node->pei) + node->pei = ExecInitParallelPlan(node->ps.lefttree, + estate, + gm->initParam, + gm->num_workers, + node->tuples_needed); + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei, + gm->initParam); + + /* Try to launch workers. */ + pcxt = node->pei->pcxt; + LaunchParallelWorkers(pcxt); + /* We save # workers launched for the benefit of EXPLAIN */ + node->nworkers_launched = pcxt->nworkers_launched; + + /* Set up tuple queue readers to read the results. */ + if (pcxt->nworkers_launched > 0) + { + ExecParallelCreateReaders(node->pei); + /* Make a working array showing the active readers */ + node->nreaders = pcxt->nworkers_launched; + node->reader = (TupleQueueReader **) + palloc(node->nreaders * sizeof(TupleQueueReader *)); + memcpy(node->reader, node->pei->reader, + node->nreaders * sizeof(TupleQueueReader *)); + } + else + { + /* No workers? Then never mind. */ + node->nreaders = 0; + node->reader = NULL; + } + } + + /* allow leader to participate if enabled or no choice */ + if (parallel_leader_participation || node->nreaders == 0) + node->need_to_scan_locally = true; + node->initialized = true; + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); + + /* + * Get next tuple, either from one of our workers, or by running the plan + * ourselves. + */ + slot = gather_merge_getnext(node); + if (TupIsNull(slot)) + return NULL; + + /* If no projection is required, we're done. */ + if (node->ps.ps_ProjInfo == NULL) + return slot; + + /* + * Form the result tuple using ExecProject(), and return it. + */ + econtext->ecxt_outertuple = slot; + return ExecProject(node->ps.ps_ProjInfo); +} + +/* ---------------------------------------------------------------- + * ExecEndGatherMerge + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndGatherMerge(GatherMergeState *node) +{ + ExecEndNode(outerPlanState(node)); /* let children clean up first */ + ExecShutdownGatherMerge(node); + ExecFreeExprContext(&node->ps); + if (node->ps.ps_ResultTupleSlot) + ExecClearTuple(node->ps.ps_ResultTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecShutdownGatherMerge + * + * Destroy the setup for parallel workers including parallel context. + * ---------------------------------------------------------------- + */ +void +ExecShutdownGatherMerge(GatherMergeState *node) +{ + ExecShutdownGatherMergeWorkers(node); + + /* Now destroy the parallel context. */ + if (node->pei != NULL) + { + ExecParallelCleanup(node->pei); + node->pei = NULL; + } +} + +/* ---------------------------------------------------------------- + * ExecShutdownGatherMergeWorkers + * + * Stop all the parallel workers. + * ---------------------------------------------------------------- + */ +static void +ExecShutdownGatherMergeWorkers(GatherMergeState *node) +{ + if (node->pei != NULL) + ExecParallelFinish(node->pei); + + /* Flush local copy of reader array */ + if (node->reader) + pfree(node->reader); + node->reader = NULL; +} + +/* ---------------------------------------------------------------- + * ExecReScanGatherMerge + * + * Prepare to re-scan the result of a GatherMerge. 
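+ *
+ * Besides the steps shared with ExecReScanGather, this also frees any
+ * tuples still buffered from the previous cycle (gather_merge_clear_tuples)
+ * so that their memory is not leaked across rescans, and clears
+ * gm_initialized so that gather_merge_init runs again on the next fetch.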
+ * ---------------------------------------------------------------- + */ +void +ExecReScanGatherMerge(GatherMergeState *node) +{ + GatherMerge *gm = (GatherMerge *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ + ExecShutdownGatherMergeWorkers(node); + + /* Free any unused tuples, so we don't leak memory across rescans */ + gather_merge_clear_tuples(node); + + /* Mark node so that shared state will be rebuilt at next call */ + node->initialized = false; + node->gm_initialized = false; + + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gm->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gm->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* + * Set up the data structures that we'll need for Gather Merge. + * + * We allocate these once on the basis of gm->num_workers, which is an + * upper bound for the number of workers we'll actually have. During + * a rescan, we reset the structures to empty. This approach simplifies + * not leaking memory across rescans. + * + * In the gm_slots[] array, index 0 is for the leader, and indexes 1 to n + * are for workers. The values placed into gm_heap correspond to indexes + * in gm_slots[]. The gm_tuple_buffers[] array, however, is indexed from + * 0 to n-1; it has no entry for the leader. + */ +static void +gather_merge_setup(GatherMergeState *gm_state) +{ + GatherMerge *gm = castNode(GatherMerge, gm_state->ps.plan); + int nreaders = gm->num_workers; + int i; + + /* + * Allocate gm_slots for the number of workers + one more slot for leader. + * Slot 0 is always for the leader. Leader always calls ExecProcNode() to + * read the tuple, and then stores it directly into its gm_slots entry. + * For other slots, code below will call ExecInitExtraTupleSlot() to + * create a slot for the worker's results. Note that during any single + * scan, we might have fewer than num_workers available workers, in which + * case the extra array entries go unused. 
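As a standalone illustration of the layout just described (leader at slot index 0, worker i at slot index i, while the per-worker read-ahead buffers are shifted down by one), here is a minimal sketch with hypothetical names; it mirrors only the indexing convention, not the executor's actual TupleTableSlot machinery.

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_PENDING 10          /* stand-in for MAX_TUPLE_STORE */

    /* Hypothetical stand-ins for the executor's per-source state. */
    typedef struct PendingTuples
    {
        int         values[MAX_PENDING];
        int         ntuples;        /* how many are buffered */
        int         readpos;        /* next one to hand out */
        int         done;           /* source has no more tuples */
    } PendingTuples;

    typedef struct MergeSources
    {
        int         nworkers;
        int        *current;        /* current item per source; [0] is the leader */
        PendingTuples *pending;     /* workers only: worker i uses pending[i - 1] */
    } MergeSources;

    static MergeSources *
    merge_setup(int nworkers)
    {
        MergeSources *ms = malloc(sizeof(MergeSources));

        ms->nworkers = nworkers;
        /* one entry per worker plus one extra, index 0, for the leader */
        ms->current = calloc(nworkers + 1, sizeof(int));
        /* the leader has no read-ahead buffer, so only nworkers entries here */
        ms->pending = calloc(nworkers, sizeof(PendingTuples));
        return ms;
    }

    int
    main(void)
    {
        MergeSources *ms = merge_setup(3);

        printf("%d sources: leader at index 0 plus %d workers\n",
               ms->nworkers + 1, ms->nworkers);
        printf("worker 2 buffers its tuples in pending[%d]\n", 2 - 1);

        free(ms->current);
        free(ms->pending);
        free(ms);
        return 0;
    }

Keeping the leader out of the buffer array reflects the fact that the leader produces tuples by running the plan directly rather than by draining a tuple queue.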
+ */ + gm_state->gm_slots = (TupleTableSlot **) + palloc0((nreaders + 1) * sizeof(TupleTableSlot *)); + + /* Allocate the tuple slot and tuple array for each worker */ + gm_state->gm_tuple_buffers = (GMReaderTupleBuffer *) + palloc0(nreaders * sizeof(GMReaderTupleBuffer)); + + for (i = 0; i < nreaders; i++) + { + /* Allocate the tuple array with length MAX_TUPLE_STORE */ + gm_state->gm_tuple_buffers[i].tuple = + (MinimalTuple *) palloc0(sizeof(MinimalTuple) * MAX_TUPLE_STORE); + + /* Initialize tuple slot for worker */ + gm_state->gm_slots[i + 1] = + ExecInitExtraTupleSlot(gm_state->ps.state, gm_state->tupDesc, + &TTSOpsMinimalTuple); + } + + /* Allocate the resources for the merge */ + gm_state->gm_heap = binaryheap_allocate(nreaders + 1, + heap_compare_slots, + gm_state); +} + +/* + * Initialize the Gather Merge. + * + * Reset data structures to ensure they're empty. Then pull at least one + * tuple from leader + each worker (or set its "done" indicator), and set up + * the heap. + */ +static void +gather_merge_init(GatherMergeState *gm_state) +{ + int nreaders = gm_state->nreaders; + bool nowait = true; + int i; + + /* Assert that gather_merge_setup made enough space */ + Assert(nreaders <= castNode(GatherMerge, gm_state->ps.plan)->num_workers); + + /* Reset leader's tuple slot to empty */ + gm_state->gm_slots[0] = NULL; + + /* Reset the tuple slot and tuple array for each worker */ + for (i = 0; i < nreaders; i++) + { + /* Reset tuple array to empty */ + gm_state->gm_tuple_buffers[i].nTuples = 0; + gm_state->gm_tuple_buffers[i].readCounter = 0; + /* Reset done flag to not-done */ + gm_state->gm_tuple_buffers[i].done = false; + /* Ensure output slot is empty */ + ExecClearTuple(gm_state->gm_slots[i + 1]); + } + + /* Reset binary heap to empty */ + binaryheap_reset(gm_state->gm_heap); + + /* + * First, try to read a tuple from each worker (including leader) in + * nowait mode. After this, if not all workers were able to produce a + * tuple (or a "done" indication), then re-read from remaining workers, + * this time using wait mode. Add all live readers (those producing at + * least one tuple) to the heap. + */ +reread: + for (i = 0; i <= nreaders; i++) + { + CHECK_FOR_INTERRUPTS(); + + /* skip this source if already known done */ + if ((i == 0) ? gm_state->need_to_scan_locally : + !gm_state->gm_tuple_buffers[i - 1].done) + { + if (TupIsNull(gm_state->gm_slots[i])) + { + /* Don't have a tuple yet, try to get one */ + if (gather_merge_readnext(gm_state, i, nowait)) + binaryheap_add_unordered(gm_state->gm_heap, + Int32GetDatum(i)); + } + else + { + /* + * We already got at least one tuple from this worker, but + * might as well see if it has any more ready by now. + */ + load_tuple_array(gm_state, i); + } + } + } + + /* need not recheck leader, since nowait doesn't matter for it */ + for (i = 1; i <= nreaders; i++) + { + if (!gm_state->gm_tuple_buffers[i - 1].done && + TupIsNull(gm_state->gm_slots[i])) + { + nowait = false; + goto reread; + } + } + + /* Now heapify the heap. */ + binaryheap_build(gm_state->gm_heap); + + gm_state->gm_initialized = true; +} + +/* + * Clear out the tuple table slot, and any unused pending tuples, + * for each gather merge input. 
+ */ +static void +gather_merge_clear_tuples(GatherMergeState *gm_state) +{ + int i; + + for (i = 0; i < gm_state->nreaders; i++) + { + GMReaderTupleBuffer *tuple_buffer = &gm_state->gm_tuple_buffers[i]; + + while (tuple_buffer->readCounter < tuple_buffer->nTuples) + pfree(tuple_buffer->tuple[tuple_buffer->readCounter++]); + + ExecClearTuple(gm_state->gm_slots[i + 1]); + } +} + +/* + * Read the next tuple for gather merge. + * + * Fetch the sorted tuple out of the heap. + */ +static TupleTableSlot * +gather_merge_getnext(GatherMergeState *gm_state) +{ + int i; + + if (!gm_state->gm_initialized) + { + /* + * First time through: pull the first tuple from each participant, and + * set up the heap. + */ + gather_merge_init(gm_state); + } + else + { + /* + * Otherwise, pull the next tuple from whichever participant we + * returned from last time, and reinsert that participant's index into + * the heap, because it might now compare differently against the + * other elements of the heap. + */ + i = DatumGetInt32(binaryheap_first(gm_state->gm_heap)); + + if (gather_merge_readnext(gm_state, i, false)) + binaryheap_replace_first(gm_state->gm_heap, Int32GetDatum(i)); + else + { + /* reader exhausted, remove it from heap */ + (void) binaryheap_remove_first(gm_state->gm_heap); + } + } + + if (binaryheap_empty(gm_state->gm_heap)) + { + /* All the queues are exhausted, and so is the heap */ + gather_merge_clear_tuples(gm_state); + return NULL; + } + else + { + /* Return next tuple from whichever participant has the leading one */ + i = DatumGetInt32(binaryheap_first(gm_state->gm_heap)); + return gm_state->gm_slots[i]; + } +} + +/* + * Read tuple(s) for given reader in nowait mode, and load into its tuple + * array, until we have MAX_TUPLE_STORE of them or would have to block. + */ +static void +load_tuple_array(GatherMergeState *gm_state, int reader) +{ + GMReaderTupleBuffer *tuple_buffer; + int i; + + /* Don't do anything if this is the leader. */ + if (reader == 0) + return; + + tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1]; + + /* If there's nothing in the array, reset the counters to zero. */ + if (tuple_buffer->nTuples == tuple_buffer->readCounter) + tuple_buffer->nTuples = tuple_buffer->readCounter = 0; + + /* Try to fill additional slots in the array. */ + for (i = tuple_buffer->nTuples; i < MAX_TUPLE_STORE; i++) + { + MinimalTuple tuple; + + tuple = gm_readnext_tuple(gm_state, + reader, + true, + &tuple_buffer->done); + if (!tuple) + break; + tuple_buffer->tuple[i] = tuple; + tuple_buffer->nTuples++; + } +} + +/* + * Store the next tuple for a given reader into the appropriate slot. + * + * Returns true if successful, false if not (either reader is exhausted, + * or we didn't want to wait for a tuple). Sets done flag if reader + * is found to be exhausted. + */ +static bool +gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait) +{ + GMReaderTupleBuffer *tuple_buffer; + MinimalTuple tup; + + /* + * If we're being asked to generate a tuple from the leader, then we just + * call ExecProcNode as normal to produce one. + */ + if (reader == 0) + { + if (gm_state->need_to_scan_locally) + { + PlanState *outerPlan = outerPlanState(gm_state); + TupleTableSlot *outerTupleSlot; + EState *estate = gm_state->ps.state; + + /* Install our DSA area while executing the plan. */ + estate->es_query_dsa = gm_state->pei ? 
gm_state->pei->area : NULL; + outerTupleSlot = ExecProcNode(outerPlan); + estate->es_query_dsa = NULL; + + if (!TupIsNull(outerTupleSlot)) + { + gm_state->gm_slots[0] = outerTupleSlot; + return true; + } + /* need_to_scan_locally serves as "done" flag for leader */ + gm_state->need_to_scan_locally = false; + } + return false; + } + + /* Otherwise, check the state of the relevant tuple buffer. */ + tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1]; + + if (tuple_buffer->nTuples > tuple_buffer->readCounter) + { + /* Return any tuple previously read that is still buffered. */ + tup = tuple_buffer->tuple[tuple_buffer->readCounter++]; + } + else if (tuple_buffer->done) + { + /* Reader is known to be exhausted. */ + return false; + } + else + { + /* Read and buffer next tuple. */ + tup = gm_readnext_tuple(gm_state, + reader, + nowait, + &tuple_buffer->done); + if (!tup) + return false; + + /* + * Attempt to read more tuples in nowait mode and store them in the + * pending-tuple array for the reader. + */ + load_tuple_array(gm_state, reader); + } + + Assert(tup); + + /* Build the TupleTableSlot for the given tuple */ + ExecStoreMinimalTuple(tup, /* tuple to store */ + gm_state->gm_slots[reader], /* slot in which to + * store the tuple */ + true); /* pfree tuple when done with it */ + + return true; +} + +/* + * Attempt to read a tuple from given worker. + */ +static MinimalTuple +gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait, + bool *done) +{ + TupleQueueReader *reader; + MinimalTuple tup; + + /* Check for async events, particularly messages from workers. */ + CHECK_FOR_INTERRUPTS(); + + /* + * Attempt to read a tuple. + * + * Note that TupleQueueReaderNext will just return NULL for a worker which + * fails to initialize. We'll treat that worker as having produced no + * tuples; WaitForParallelWorkersToFinish will error out when we get + * there. + */ + reader = gm_state->reader[nreader - 1]; + tup = TupleQueueReaderNext(reader, nowait, done); + + /* + * Since we'll be buffering these across multiple calls, we need to make a + * copy. + */ + return tup ? heap_copy_minimal_tuple(tup) : NULL; +} + +/* + * We have one slot for each item in the heap array. We use SlotNumber + * to store slot indexes. This doesn't actually provide any formal + * type-safety, but it makes the code more self-documenting. + */ +typedef int32 SlotNumber; + +/* + * Compare the tuples in the two given slots. 
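A note on the comparator that follows: the generic binary heap (src/backend/lib/binaryheap.c) keeps the element its comparator ranks highest at the top, so merge-style consumers invert the result to surface the smallest sort key first. The self-contained sketch below shows the underlying k-way merge idea with plain integer sources and a linear scan for the minimum head; the executor performs the same selection through the heap, applying this comparator to the slots named by the heap entries.

    #include <stdio.h>

    #define NSOURCES 3

    /*
     * Three already-sorted inputs standing in for the leader and two workers.
     * The merge repeatedly emits the smallest current head, which is exactly
     * the decision heap_compare_slots drives (via the binary heap) for tuples.
     */
    static const int inputs[NSOURCES][4] = {
        {1, 4, 7, 9},
        {2, 3, 8, 10},
        {5, 6, 11, 12},
    };

    int
    main(void)
    {
        int         pos[NSOURCES] = {0, 0, 0};  /* per-source read position */

        for (;;)
        {
            int         best = -1;

            /* Find the source whose current head compares smallest. */
            for (int i = 0; i < NSOURCES; i++)
            {
                if (pos[i] >= 4)
                    continue;       /* source exhausted */
                if (best < 0 || inputs[i][pos[i]] < inputs[best][pos[best]])
                    best = i;
            }
            if (best < 0)
                break;              /* all sources exhausted */

            printf("%d ", inputs[best][pos[best]]);
            pos[best]++;            /* advance only the source we consumed */
        }
        printf("\n");
        return 0;
    }

With k sources, the heap brings the per-tuple selection cost down from the O(k) scan in this sketch to O(log k).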
+ */ +static int32 +heap_compare_slots(Datum a, Datum b, void *arg) +{ + GatherMergeState *node = (GatherMergeState *) arg; + SlotNumber slot1 = DatumGetInt32(a); + SlotNumber slot2 = DatumGetInt32(b); + + TupleTableSlot *s1 = node->gm_slots[slot1]; + TupleTableSlot *s2 = node->gm_slots[slot2]; + int nkey; + + Assert(!TupIsNull(s1)); + Assert(!TupIsNull(s2)); + + for (nkey = 0; nkey < node->gm_nkeys; nkey++) + { + SortSupport sortKey = node->gm_sortkeys + nkey; + AttrNumber attno = sortKey->ssup_attno; + Datum datum1, + datum2; + bool isNull1, + isNull2; + int compare; + + datum1 = slot_getattr(s1, attno, &isNull1); + datum2 = slot_getattr(s2, attno, &isNull2); + + compare = ApplySortComparator(datum1, isNull1, + datum2, isNull2, + sortKey); + if (compare != 0) + { + INVERT_COMPARE_RESULT(compare); + return compare; + } + } + return 0; +} diff --git a/src/backend/executor/nodeGroup.c b/src/backend/executor/nodeGroup.c new file mode 100644 index 0000000..1721b2a --- /dev/null +++ b/src/backend/executor/nodeGroup.c @@ -0,0 +1,255 @@ +/*------------------------------------------------------------------------- + * + * nodeGroup.c + * Routines to handle group nodes (used for queries with GROUP BY clause). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * DESCRIPTION + * The Group node is designed for handling queries with a GROUP BY clause. + * Its outer plan must deliver tuples that are sorted in the order + * specified by the grouping columns (ie. tuples from the same group are + * consecutive). That way, we just have to compare adjacent tuples to + * locate group boundaries. + * + * IDENTIFICATION + * src/backend/executor/nodeGroup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeGroup.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* + * ExecGroup - + * + * Return one tuple for each group of matching input tuples. + */ +static TupleTableSlot * +ExecGroup(PlanState *pstate) +{ + GroupState *node = castNode(GroupState, pstate); + ExprContext *econtext; + TupleTableSlot *firsttupleslot; + TupleTableSlot *outerslot; + + CHECK_FOR_INTERRUPTS(); + + /* + * get state info from node + */ + if (node->grp_done) + return NULL; + econtext = node->ss.ps.ps_ExprContext; + + /* + * The ScanTupleSlot holds the (copied) first tuple of each group. + */ + firsttupleslot = node->ss.ss_ScanTupleSlot; + + /* + * We need not call ResetExprContext here because ExecQualAndReset() will + * reset the per-tuple memory context once per input tuple. + */ + + /* + * If first time through, acquire first input tuple and determine whether + * to return it or not. + */ + if (TupIsNull(firsttupleslot)) + { + outerslot = ExecProcNode(outerPlanState(node)); + if (TupIsNull(outerslot)) + { + /* empty input, so return nothing */ + node->grp_done = true; + return NULL; + } + /* Copy tuple into firsttupleslot */ + ExecCopySlot(firsttupleslot, outerslot); + + /* + * Set it up as input for qual test and projection. The expressions + * will access the input tuple as varno OUTER. + */ + econtext->ecxt_outertuple = firsttupleslot; + + /* + * Check the qual (HAVING clause); if the group does not match, ignore + * it and fall into scan loop. + */ + if (ExecQual(node->ss.ps.qual, econtext)) + { + /* + * Form and return a projection tuple using the first input tuple. 
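Setting the executor machinery aside for a moment, the Group node's strategy is easiest to see on a toy input: because rows arrive sorted on the grouping key, a group boundary is simply the first row that compares different from the current group's first row. The sketch below applies that idea to plain integers; it is only an illustration of the adjacent-comparison approach, not the slot-and-qual code in this file.

    #include <stdio.h>

    int
    main(void)
    {
        /* Sorted on the grouping key, so equal keys are adjacent. */
        int         keys[] = {1, 1, 2, 2, 2, 5, 7, 7};
        int         nkeys = sizeof(keys) / sizeof(keys[0]);
        int         i = 0;

        while (i < nkeys)
        {
            int         first = keys[i];    /* first row of the group */

            printf("group key %d\n", first);

            /* Skip the remaining members of the same group. */
            while (i < nkeys && keys[i] == first)
                i++;
        }
        return 0;
    }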
+ */ + return ExecProject(node->ss.ps.ps_ProjInfo); + } + else + InstrCountFiltered1(node, 1); + } + + /* + * This loop iterates once per input tuple group. At the head of the + * loop, we have finished processing the first tuple of the group and now + * need to scan over all the other group members. + */ + for (;;) + { + /* + * Scan over all remaining tuples that belong to this group + */ + for (;;) + { + outerslot = ExecProcNode(outerPlanState(node)); + if (TupIsNull(outerslot)) + { + /* no more groups, so we're done */ + node->grp_done = true; + return NULL; + } + + /* + * Compare with first tuple and see if this tuple is of the same + * group. If so, ignore it and keep scanning. + */ + econtext->ecxt_innertuple = firsttupleslot; + econtext->ecxt_outertuple = outerslot; + if (!ExecQualAndReset(node->eqfunction, econtext)) + break; + } + + /* + * We have the first tuple of the next input group. See if we want to + * return it. + */ + /* Copy tuple, set up as input for qual test and projection */ + ExecCopySlot(firsttupleslot, outerslot); + econtext->ecxt_outertuple = firsttupleslot; + + /* + * Check the qual (HAVING clause); if the group does not match, ignore + * it and loop back to scan the rest of the group. + */ + if (ExecQual(node->ss.ps.qual, econtext)) + { + /* + * Form and return a projection tuple using the first input tuple. + */ + return ExecProject(node->ss.ps.ps_ProjInfo); + } + else + InstrCountFiltered1(node, 1); + } +} + +/* ----------------- + * ExecInitGroup + * + * Creates the run-time information for the group node produced by the + * planner and initializes its outer subtree + * ----------------- + */ +GroupState * +ExecInitGroup(Group *node, EState *estate, int eflags) +{ + GroupState *grpstate; + const TupleTableSlotOps *tts_ops; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + grpstate = makeNode(GroupState); + grpstate->ss.ps.plan = (Plan *) node; + grpstate->ss.ps.state = estate; + grpstate->ss.ps.ExecProcNode = ExecGroup; + grpstate->grp_done = false; + + /* + * create expression context + */ + ExecAssignExprContext(estate, &grpstate->ss.ps); + + /* + * initialize child nodes + */ + outerPlanState(grpstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + tts_ops = ExecGetResultSlotOps(outerPlanState(&grpstate->ss), NULL); + ExecCreateScanSlotFromOuterPlan(estate, &grpstate->ss, tts_ops); + + /* + * Initialize result slot, type and projection. 
+ */ + ExecInitResultTupleSlotTL(&grpstate->ss.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&grpstate->ss.ps, NULL); + + /* + * initialize child expressions + */ + grpstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) grpstate); + + /* + * Precompute fmgr lookup data for inner loop + */ + grpstate->eqfunction = + execTuplesMatchPrepare(ExecGetResultType(outerPlanState(grpstate)), + node->numCols, + node->grpColIdx, + node->grpOperators, + node->grpCollations, + &grpstate->ss.ps); + + return grpstate; +} + +/* ------------------------ + * ExecEndGroup(node) + * + * ----------------------- + */ +void +ExecEndGroup(GroupState *node) +{ + PlanState *outerPlan; + + ExecFreeExprContext(&node->ss.ps); + + /* clean up tuple table */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + +void +ExecReScanGroup(GroupState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + node->grp_done = false; + /* must clear first tuple */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c new file mode 100644 index 0000000..15d8bbe --- /dev/null +++ b/src/backend/executor/nodeHash.c @@ -0,0 +1,3434 @@ +/*------------------------------------------------------------------------- + * + * nodeHash.c + * Routines to hash relations for hashjoin + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeHash.c + * + * See note on parallelism in nodeHashjoin.c. 
+ * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * MultiExecHash - generate an in-memory hash table of the relation + * ExecInitHash - initialize node and subnodes + * ExecEndHash - shutdown node and subnodes + */ + +#include "postgres.h" + +#include <math.h> +#include <limits.h> + +#include "access/htup_details.h" +#include "access/parallel.h" +#include "catalog/pg_statistic.h" +#include "commands/tablespace.h" +#include "executor/execdebug.h" +#include "executor/hashjoin.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "port/pg_bitutils.h" +#include "utils/dynahash.h" +#include "utils/guc.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" + +static void ExecHashIncreaseNumBatches(HashJoinTable hashtable); +static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable); +static void ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable); +static void ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable); +static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, + int mcvsToUse); +static void ExecHashSkewTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue, + int bucketNumber); +static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable); + +static void *dense_alloc(HashJoinTable hashtable, Size size); +static HashJoinTuple ExecParallelHashTupleAlloc(HashJoinTable hashtable, + size_t size, + dsa_pointer *shared); +static void MultiExecPrivateHash(HashState *node); +static void MultiExecParallelHash(HashState *node); +static inline HashJoinTuple ExecParallelHashFirstTuple(HashJoinTable table, + int bucketno); +static inline HashJoinTuple ExecParallelHashNextTuple(HashJoinTable table, + HashJoinTuple tuple); +static inline void ExecParallelHashPushTuple(dsa_pointer_atomic *head, + HashJoinTuple tuple, + dsa_pointer tuple_shared); +static void ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch); +static void ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable); +static void ExecParallelHashRepartitionFirst(HashJoinTable hashtable); +static void ExecParallelHashRepartitionRest(HashJoinTable hashtable); +static HashMemoryChunk ExecParallelHashPopChunkQueue(HashJoinTable table, + dsa_pointer *shared); +static bool ExecParallelHashTuplePrealloc(HashJoinTable hashtable, + int batchno, + size_t size); +static void ExecParallelHashMergeCounters(HashJoinTable hashtable); +static void ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable); + + +/* ---------------------------------------------------------------- + * ExecHash + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecHash(PlanState *pstate) +{ + elog(ERROR, "Hash node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * MultiExecHash + * + * build hash table for hashjoin, doing partitioning if more + * than one batch is required. 
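For orientation before the driver routine itself: the core of the build side is the textbook hash-join build phase, inserting every inner tuple into a chained hash table with a power-of-two number of buckets that the join will later probe. The standalone sketch below shows that shape with integers and a made-up hash function; the real code computes hash values with the join operators' hash support functions, enforces a memory budget, spills overflow tuples to batch files, and maintains the skew table.

    #include <stdio.h>
    #include <stdlib.h>

    #define NBUCKETS 8              /* power of two, as in the real code */

    typedef struct HashEntry
    {
        int         key;
        struct HashEntry *next;     /* chain within a bucket */
    } HashEntry;

    static HashEntry *buckets[NBUCKETS];

    /* Toy hash function; the executor uses the type's hash operator instead. */
    static unsigned int
    hash_int(int key)
    {
        return ((unsigned int) key * 2654435761u) & (NBUCKETS - 1);
    }

    /* Build phase: insert every inner-side key into its bucket's chain. */
    static void
    build(const int *keys, int n)
    {
        for (int i = 0; i < n; i++)
        {
            HashEntry  *e = malloc(sizeof(HashEntry));
            unsigned int b = hash_int(keys[i]);

            e->key = keys[i];
            e->next = buckets[b];
            buckets[b] = e;
        }
    }

    /* Probe phase: look each outer-side key up in the finished table. */
    static int
    probe(int key)
    {
        for (HashEntry *e = buckets[hash_int(key)]; e != NULL; e = e->next)
            if (e->key == key)
                return 1;
        return 0;
    }

    int
    main(void)
    {
        int         inner[] = {1, 5, 7, 12};
        int         outer[] = {5, 6, 12};

        build(inner, 4);
        for (int i = 0; i < 3; i++)
            printf("outer key %d %s\n", outer[i],
                   probe(outer[i]) ? "matches" : "has no match");
        return 0;
    }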
+ * ---------------------------------------------------------------- + */ +Node * +MultiExecHash(HashState *node) +{ + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStartNode(node->ps.instrument); + + if (node->parallel_state != NULL) + MultiExecParallelHash(node); + else + MultiExecPrivateHash(node); + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStopNode(node->ps.instrument, node->hashtable->partialTuples); + + /* + * We do not return the hash table directly because it's not a subtype of + * Node, and so would violate the MultiExecProcNode API. Instead, our + * parent Hashjoin node is expected to know how to fish it out of our node + * state. Ugly but not really worth cleaning up, since Hashjoin knows + * quite a bit more about Hash besides that. + */ + return NULL; +} + +/* ---------------------------------------------------------------- + * MultiExecPrivateHash + * + * parallel-oblivious version, building a backend-private + * hash table and (if necessary) batch files. + * ---------------------------------------------------------------- + */ +static void +MultiExecPrivateHash(HashState *node) +{ + PlanState *outerNode; + List *hashkeys; + HashJoinTable hashtable; + TupleTableSlot *slot; + ExprContext *econtext; + uint32 hashvalue; + + /* + * get state info from node + */ + outerNode = outerPlanState(node); + hashtable = node->hashtable; + + /* + * set expression context + */ + hashkeys = node->hashkeys; + econtext = node->ps.ps_ExprContext; + + /* + * Get all tuples from the node below the Hash node and insert into the + * hash table (or temp files). + */ + for (;;) + { + slot = ExecProcNode(outerNode); + if (TupIsNull(slot)) + break; + /* We have to compute the hash value */ + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, hashkeys, + false, hashtable->keepNulls, + &hashvalue)) + { + int bucketNumber; + + bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue); + if (bucketNumber != INVALID_SKEW_BUCKET_NO) + { + /* It's a skew tuple, so put it into that hash table */ + ExecHashSkewTableInsert(hashtable, slot, hashvalue, + bucketNumber); + hashtable->skewTuples += 1; + } + else + { + /* Not subject to skew optimization, so insert normally */ + ExecHashTableInsert(hashtable, slot, hashvalue); + } + hashtable->totalTuples += 1; + } + } + + /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */ + if (hashtable->nbuckets != hashtable->nbuckets_optimal) + ExecHashIncreaseNumBuckets(hashtable); + + /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */ + hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple); + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + + hashtable->partialTuples = hashtable->totalTuples; +} + +/* ---------------------------------------------------------------- + * MultiExecParallelHash + * + * parallel-aware version, building a shared hash table and + * (if necessary) batch files using the combined effort of + * a set of co-operating backends. 
+ * ---------------------------------------------------------------- + */ +static void +MultiExecParallelHash(HashState *node) +{ + ParallelHashJoinState *pstate; + PlanState *outerNode; + List *hashkeys; + HashJoinTable hashtable; + TupleTableSlot *slot; + ExprContext *econtext; + uint32 hashvalue; + Barrier *build_barrier; + int i; + + /* + * get state info from node + */ + outerNode = outerPlanState(node); + hashtable = node->hashtable; + + /* + * set expression context + */ + hashkeys = node->hashkeys; + econtext = node->ps.ps_ExprContext; + + /* + * Synchronize the parallel hash table build. At this stage we know that + * the shared hash table has been or is being set up by + * ExecHashTableCreate(), but we don't know if our peers have returned + * from there or are here in MultiExecParallelHash(), and if so how far + * through they are. To find out, we check the build_barrier phase then + * and jump to the right step in the build algorithm. + */ + pstate = hashtable->parallel_state; + build_barrier = &pstate->build_barrier; + Assert(BarrierPhase(build_barrier) >= PHJ_BUILD_ALLOCATING); + switch (BarrierPhase(build_barrier)) + { + case PHJ_BUILD_ALLOCATING: + + /* + * Either I just allocated the initial hash table in + * ExecHashTableCreate(), or someone else is doing that. Either + * way, wait for everyone to arrive here so we can proceed. + */ + BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ALLOCATE); + /* Fall through. */ + + case PHJ_BUILD_HASHING_INNER: + + /* + * It's time to begin hashing, or if we just arrived here then + * hashing is already underway, so join in that effort. While + * hashing we have to be prepared to help increase the number of + * batches or buckets at any time, and if we arrived here when + * that was already underway we'll have to help complete that work + * immediately so that it's safe to access batches and buckets + * below. + */ + if (PHJ_GROW_BATCHES_PHASE(BarrierAttach(&pstate->grow_batches_barrier)) != + PHJ_GROW_BATCHES_ELECTING) + ExecParallelHashIncreaseNumBatches(hashtable); + if (PHJ_GROW_BUCKETS_PHASE(BarrierAttach(&pstate->grow_buckets_barrier)) != + PHJ_GROW_BUCKETS_ELECTING) + ExecParallelHashIncreaseNumBuckets(hashtable); + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + for (;;) + { + slot = ExecProcNode(outerNode); + if (TupIsNull(slot)) + break; + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, hashkeys, + false, hashtable->keepNulls, + &hashvalue)) + ExecParallelHashTableInsert(hashtable, slot, hashvalue); + hashtable->partialTuples++; + } + + /* + * Make sure that any tuples we wrote to disk are visible to + * others before anyone tries to load them. + */ + for (i = 0; i < hashtable->nbatch; ++i) + sts_end_write(hashtable->batches[i].inner_tuples); + + /* + * Update shared counters. We need an accurate total tuple count + * to control the empty table optimization. + */ + ExecParallelHashMergeCounters(hashtable); + + BarrierDetach(&pstate->grow_buckets_barrier); + BarrierDetach(&pstate->grow_batches_barrier); + + /* + * Wait for everyone to finish building and flushing files and + * counters. + */ + if (BarrierArriveAndWait(build_barrier, + WAIT_EVENT_HASH_BUILD_HASH_INNER)) + { + /* + * Elect one backend to disable any further growth. Batches + * are now fixed. 
While building them we made sure they'd fit + * in our memory budget when we load them back in later (or we + * tried to do that and gave up because we detected extreme + * skew). + */ + pstate->growth = PHJ_GROWTH_DISABLED; + } + } + + /* + * We're not yet attached to a batch. We all agree on the dimensions and + * number of inner tuples (for the empty table optimization). + */ + hashtable->curbatch = -1; + hashtable->nbuckets = pstate->nbuckets; + hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->totalTuples = pstate->total_tuples; + ExecParallelHashEnsureBatchAccessors(hashtable); + + /* + * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE + * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't + * there already). + */ + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || + BarrierPhase(build_barrier) == PHJ_BUILD_DONE); +} + +/* ---------------------------------------------------------------- + * ExecInitHash + * + * Init routine for Hash node + * ---------------------------------------------------------------- + */ +HashState * +ExecInitHash(Hash *node, EState *estate, int eflags) +{ + HashState *hashstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + hashstate = makeNode(HashState); + hashstate->ps.plan = (Plan *) node; + hashstate->ps.state = estate; + hashstate->ps.ExecProcNode = ExecHash; + hashstate->hashtable = NULL; + hashstate->hashkeys = NIL; /* will be set by parent HashJoin */ + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &hashstate->ps); + + /* + * initialize child nodes + */ + outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * initialize our result slot and type. No need to build projection + * because this node doesn't do projections. + */ + ExecInitResultTupleSlotTL(&hashstate->ps, &TTSOpsMinimalTuple); + hashstate->ps.ps_ProjInfo = NULL; + + /* + * initialize child expressions + */ + Assert(node->plan.qual == NIL); + hashstate->hashkeys = + ExecInitExprList(node->hashkeys, (PlanState *) hashstate); + + return hashstate; +} + +/* --------------------------------------------------------------- + * ExecEndHash + * + * clean up routine for Hash node + * ---------------------------------------------------------------- + */ +void +ExecEndHash(HashState *node) +{ + PlanState *outerPlan; + + /* + * free exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * shut down the subplan + */ + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + + +/* ---------------------------------------------------------------- + * ExecHashTableCreate + * + * create an empty hashtable data structure for hashjoin. + * ---------------------------------------------------------------- + */ +HashJoinTable +ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, bool keepNulls) +{ + Hash *node; + HashJoinTable hashtable; + Plan *outerNode; + size_t space_allowed; + int nbuckets; + int nbatch; + double rows; + int num_skew_mcvs; + int log2_nbuckets; + int nkeys; + int i; + ListCell *ho; + ListCell *hc; + MemoryContext oldcxt; + + /* + * Get information about the size of the relation to be hashed (it's the + * "outer" subtree of this node, but the inner relation of the hashjoin). + * Compute the appropriate size of the hash table. 
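As a rough, self-contained illustration of the sizing arithmetic performed here by ExecChooseHashTableSize (defined further down): estimate a per-tuple footprint, pick a power-of-two bucket count aiming at about one tuple per bucket, and if the whole build side plus the bucket array will not fit the memory budget, size the buckets for one full in-memory batch and round the required batch count up to a power of two. The overhead constant and the 4MB budget below are made up for the example; the real function also accounts for skew buckets, parallel workers, and allocator limits.

    #include <stdio.h>
    #include <math.h>

    /* Round a positive value up to the next power of two. */
    static size_t
    next_pow2(size_t v)
    {
        size_t      p = 1;

        while (p < v)
            p <<= 1;
        return p;
    }

    int
    main(void)
    {
        double      ntuples = 1e6;  /* planner's row estimate (hypothetical) */
        int         tupwidth = 40;  /* average payload width in bytes */
        size_t      mem_budget = 4 * 1024 * 1024;   /* stand-in for hash_mem */
        size_t      tup_overhead = 24;  /* made-up per-tuple header overhead */

        size_t      tupsize = tup_overhead + tupwidth;
        double      inner_bytes = ntuples * tupsize;

        /* Start with ~1 tuple per bucket, capped by how many pointers fit. */
        size_t      max_pointers = mem_budget / sizeof(void *);
        size_t      nbuckets = next_pow2((size_t) ntuples);

        if (nbuckets > max_pointers)
            nbuckets = max_pointers;    /* already a power of 2 for these numbers */

        size_t      bucket_bytes = nbuckets * sizeof(void *);
        size_t      nbatch = 1;

        if (inner_bytes + bucket_bytes > mem_budget)
        {
            /* Multiple batches: re-aim buckets at one "full" in-memory batch... */
            size_t      bucket_size = tupsize + sizeof(void *);

            nbuckets = next_pow2(mem_budget / bucket_size);
            bucket_bytes = nbuckets * sizeof(void *);

            /* ...and round the required batch count up to a power of two. */
            nbatch = next_pow2((size_t) ceil(inner_bytes /
                                             (double) (mem_budget - bucket_bytes)));
        }

        printf("nbuckets = %zu, nbatch = %zu\n", nbuckets, nbatch);
        return 0;
    }

With these inputs the sketch prints nbuckets = 65536, nbatch = 32.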
+ */ + node = (Hash *) state->ps.plan; + outerNode = outerPlan(node); + + /* + * If this is shared hash table with a partial plan, then we can't use + * outerNode->plan_rows to estimate its size. We need an estimate of the + * total number of rows across all copies of the partial plan. + */ + rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows; + + ExecChooseHashTableSize(rows, outerNode->plan_width, + OidIsValid(node->skewTable), + state->parallel_state != NULL, + state->parallel_state != NULL ? + state->parallel_state->nparticipants - 1 : 0, + &space_allowed, + &nbuckets, &nbatch, &num_skew_mcvs); + + /* nbuckets must be a power of 2 */ + log2_nbuckets = my_log2(nbuckets); + Assert(nbuckets == (1 << log2_nbuckets)); + + /* + * Initialize the hash table control block. + * + * The hashtable control block is just palloc'd from the executor's + * per-query memory context. Everything else should be kept inside the + * subsidiary hashCxt or batchCxt. + */ + hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData)); + hashtable->nbuckets = nbuckets; + hashtable->nbuckets_original = nbuckets; + hashtable->nbuckets_optimal = nbuckets; + hashtable->log2_nbuckets = log2_nbuckets; + hashtable->log2_nbuckets_optimal = log2_nbuckets; + hashtable->buckets.unshared = NULL; + hashtable->keepNulls = keepNulls; + hashtable->skewEnabled = false; + hashtable->skewBucket = NULL; + hashtable->skewBucketLen = 0; + hashtable->nSkewBuckets = 0; + hashtable->skewBucketNums = NULL; + hashtable->nbatch = nbatch; + hashtable->curbatch = 0; + hashtable->nbatch_original = nbatch; + hashtable->nbatch_outstart = nbatch; + hashtable->growEnabled = true; + hashtable->totalTuples = 0; + hashtable->partialTuples = 0; + hashtable->skewTuples = 0; + hashtable->innerBatchFile = NULL; + hashtable->outerBatchFile = NULL; + hashtable->spaceUsed = 0; + hashtable->spacePeak = 0; + hashtable->spaceAllowed = space_allowed; + hashtable->spaceUsedSkew = 0; + hashtable->spaceAllowedSkew = + hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100; + hashtable->chunks = NULL; + hashtable->current_chunk = NULL; + hashtable->parallel_state = state->parallel_state; + hashtable->area = state->ps.state->es_query_dsa; + hashtable->batches = NULL; + +#ifdef HJDEBUG + printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n", + hashtable, nbatch, nbuckets); +#endif + + /* + * Create temporary memory contexts in which to keep the hashtable working + * storage. See notes in executor/hashjoin.h. + */ + hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext, + "HashTableContext", + ALLOCSET_DEFAULT_SIZES); + + hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt, + "HashBatchContext", + ALLOCSET_DEFAULT_SIZES); + + /* Allocate data that will live for the life of the hashjoin */ + + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + /* + * Get info about the hash functions to be used for each hash key. Also + * remember whether the join operators are strict. 
+ */ + nkeys = list_length(hashOperators); + hashtable->outer_hashfunctions = + (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); + hashtable->inner_hashfunctions = + (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); + hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool)); + hashtable->collations = (Oid *) palloc(nkeys * sizeof(Oid)); + i = 0; + forboth(ho, hashOperators, hc, hashCollations) + { + Oid hashop = lfirst_oid(ho); + Oid left_hashfn; + Oid right_hashfn; + + if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn)) + elog(ERROR, "could not find hash function for hash operator %u", + hashop); + fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]); + fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]); + hashtable->hashStrict[i] = op_strict(hashop); + hashtable->collations[i] = lfirst_oid(hc); + i++; + } + + if (nbatch > 1 && hashtable->parallel_state == NULL) + { + /* + * allocate and initialize the file arrays in hashCxt (not needed for + * parallel case which uses shared tuplestores instead of raw files) + */ + hashtable->innerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + /* The files will not be opened until needed... */ + /* ... but make sure we have temp tablespaces established for them */ + PrepareTempTablespaces(); + } + + MemoryContextSwitchTo(oldcxt); + + if (hashtable->parallel_state) + { + ParallelHashJoinState *pstate = hashtable->parallel_state; + Barrier *build_barrier; + + /* + * Attach to the build barrier. The corresponding detach operation is + * in ExecHashTableDetach. Note that we won't attach to the + * batch_barrier for batch 0 yet. We'll attach later and start it out + * in PHJ_BATCH_PROBING phase, because batch 0 is allocated up front + * and then loaded while hashing (the standard hybrid hash join + * algorithm), and we'll coordinate that using build_barrier. + */ + build_barrier = &pstate->build_barrier; + BarrierAttach(build_barrier); + + /* + * So far we have no idea whether there are any other participants, + * and if so, what phase they are working on. The only thing we care + * about at this point is whether someone has already created the + * SharedHashJoinBatch objects and the hash table for batch 0. One + * backend will be elected to do that now if necessary. + */ + if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECTING && + BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECT)) + { + pstate->nbatch = nbatch; + pstate->space_allowed = space_allowed; + pstate->growth = PHJ_GROWTH_OK; + + /* Set up the shared state for coordinating batches. */ + ExecParallelHashJoinSetUpBatches(hashtable, nbatch); + + /* + * Allocate batch 0's hash table up front so we can load it + * directly while hashing. + */ + pstate->nbuckets = nbuckets; + ExecParallelHashTableAlloc(hashtable, 0); + } + + /* + * The next Parallel Hash synchronization point is in + * MultiExecParallelHash(), which will progress it all the way to + * PHJ_BUILD_DONE. The caller must not return control from this + * executor node between now and then. + */ + } + else + { + /* + * Prepare context for the first-scan space allocations; allocate the + * hashbucket array therein, and set each bucket "empty". + */ + MemoryContextSwitchTo(hashtable->batchCxt); + + hashtable->buckets.unshared = (HashJoinTuple *) + palloc0(nbuckets * sizeof(HashJoinTuple)); + + /* + * Set up for skew optimization, if possible and there's a need for + * more than one batch. 
(In a one-batch join, there's no point in + * it.) + */ + if (nbatch > 1) + ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs); + + MemoryContextSwitchTo(oldcxt); + } + + return hashtable; +} + + +/* + * Compute appropriate size for hashtable given the estimated size of the + * relation to be hashed (number of rows and average row width). + * + * This is exported so that the planner's costsize.c can use it. + */ + +/* Target bucket loading (tuples per bucket) */ +#define NTUP_PER_BUCKET 1 + +void +ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, + bool try_combined_hash_mem, + int parallel_workers, + size_t *space_allowed, + int *numbuckets, + int *numbatches, + int *num_skew_mcvs) +{ + int tupsize; + double inner_rel_bytes; + size_t hash_table_bytes; + size_t bucket_bytes; + size_t max_pointers; + int nbatch = 1; + int nbuckets; + double dbuckets; + + /* Force a plausible relation size if no info */ + if (ntuples <= 0.0) + ntuples = 1000.0; + + /* + * Estimate tupsize based on footprint of tuple in hashtable... note this + * does not allow for any palloc overhead. The manipulations of spaceUsed + * don't count palloc overhead either. + */ + tupsize = HJTUPLE_OVERHEAD + + MAXALIGN(SizeofMinimalTupleHeader) + + MAXALIGN(tupwidth); + inner_rel_bytes = ntuples * tupsize; + + /* + * Compute in-memory hashtable size limit from GUCs. + */ + hash_table_bytes = get_hash_memory_limit(); + + /* + * Parallel Hash tries to use the combined hash_mem of all workers to + * avoid the need to batch. If that won't work, it falls back to hash_mem + * per worker and tries to process batches in parallel. + */ + if (try_combined_hash_mem) + { + /* Careful, this could overflow size_t */ + double newlimit; + + newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1); + newlimit = Min(newlimit, (double) SIZE_MAX); + hash_table_bytes = (size_t) newlimit; + } + + *space_allowed = hash_table_bytes; + + /* + * If skew optimization is possible, estimate the number of skew buckets + * that will fit in the memory allowed, and decrement the assumed space + * available for the main hash table accordingly. + * + * We make the optimistic assumption that each skew bucket will contain + * one inner-relation tuple. If that turns out to be low, we will recover + * at runtime by reducing the number of skew buckets. + * + * hashtable->skewBucket will have up to 8 times as many HashSkewBucket + * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash + * will round up to the next power of 2 and then multiply by 4 to reduce + * collisions. 
+ */ + if (useskew) + { + size_t bytes_per_mcv; + size_t skew_mcvs; + + /*---------- + * Compute number of MCVs we could hold in hash_table_bytes + * + * Divisor is: + * size of a hash tuple + + * worst-case size of skewBucket[] per MCV + + * size of skewBucketNums[] entry + + * size of skew bucket struct itself + *---------- + */ + bytes_per_mcv = tupsize + + (8 * sizeof(HashSkewBucket *)) + + sizeof(int) + + SKEW_BUCKET_OVERHEAD; + skew_mcvs = hash_table_bytes / bytes_per_mcv; + + /* + * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as + * not to worry about size_t overflow in the multiplication) + */ + skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100; + + /* Now clamp to integer range */ + skew_mcvs = Min(skew_mcvs, INT_MAX); + + *num_skew_mcvs = (int) skew_mcvs; + + /* Reduce hash_table_bytes by the amount needed for the skew table */ + if (skew_mcvs > 0) + hash_table_bytes -= skew_mcvs * bytes_per_mcv; + } + else + *num_skew_mcvs = 0; + + /* + * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when + * memory is filled, assuming a single batch; but limit the value so that + * the pointer arrays we'll try to allocate do not exceed hash_table_bytes + * nor MaxAllocSize. + * + * Note that both nbuckets and nbatch must be powers of 2 to make + * ExecHashGetBucketAndBatch fast. + */ + max_pointers = hash_table_bytes / sizeof(HashJoinTuple); + max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple)); + /* If max_pointers isn't a power of 2, must round it down to one */ + max_pointers = pg_prevpower2_size_t(max_pointers); + + /* Also ensure we avoid integer overflow in nbatch and nbuckets */ + /* (this step is redundant given the current value of MaxAllocSize) */ + max_pointers = Min(max_pointers, INT_MAX / 2 + 1); + + dbuckets = ceil(ntuples / NTUP_PER_BUCKET); + dbuckets = Min(dbuckets, max_pointers); + nbuckets = (int) dbuckets; + /* don't let nbuckets be really small, though ... */ + nbuckets = Max(nbuckets, 1024); + /* ... and force it to be a power of 2. */ + nbuckets = pg_nextpower2_32(nbuckets); + + /* + * If there's not enough space to store the projected number of tuples and + * the required bucket headers, we will need multiple batches. + */ + bucket_bytes = sizeof(HashJoinTuple) * nbuckets; + if (inner_rel_bytes + bucket_bytes > hash_table_bytes) + { + /* We'll need multiple batches */ + size_t sbuckets; + double dbatch; + int minbatch; + size_t bucket_size; + + /* + * If Parallel Hash with combined hash_mem would still need multiple + * batches, we'll have to fall back to regular hash_mem budget. + */ + if (try_combined_hash_mem) + { + ExecChooseHashTableSize(ntuples, tupwidth, useskew, + false, parallel_workers, + space_allowed, + numbuckets, + numbatches, + num_skew_mcvs); + return; + } + + /* + * Estimate the number of buckets we'll want to have when hash_mem is + * entirely full. Each bucket will contain a bucket pointer plus + * NTUP_PER_BUCKET tuples, whose projected size already includes + * overhead for the hash code, pointer to the next tuple, etc. + */ + bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple)); + sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size); + sbuckets = Min(sbuckets, max_pointers); + nbuckets = (int) sbuckets; + nbuckets = pg_nextpower2_32(nbuckets); + bucket_bytes = nbuckets * sizeof(HashJoinTuple); + + /* + * Buckets are simple pointers to hashjoin tuples, while tupsize + * includes the pointer, hash code, and MinimalTupleData. 
So buckets + * should never really exceed 25% of hash_mem (even for + * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not + * 2^N bytes, where we might get more because of doubling. So let's + * look for 50% here. + */ + Assert(bucket_bytes <= hash_table_bytes / 2); + + /* Calculate required number of batches. */ + dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes)); + dbatch = Min(dbatch, max_pointers); + minbatch = (int) dbatch; + nbatch = pg_nextpower2_32(Max(2, minbatch)); + } + + Assert(nbuckets > 0); + Assert(nbatch > 0); + + *numbuckets = nbuckets; + *numbatches = nbatch; +} + + +/* ---------------------------------------------------------------- + * ExecHashTableDestroy + * + * destroy a hash table + * ---------------------------------------------------------------- + */ +void +ExecHashTableDestroy(HashJoinTable hashtable) +{ + int i; + + /* + * Make sure all the temp files are closed. We skip batch 0, since it + * can't have any temp files (and the arrays might not even exist if + * nbatch is only 1). Parallel hash joins don't use these files. + */ + if (hashtable->innerBatchFile != NULL) + { + for (i = 1; i < hashtable->nbatch; i++) + { + if (hashtable->innerBatchFile[i]) + BufFileClose(hashtable->innerBatchFile[i]); + if (hashtable->outerBatchFile[i]) + BufFileClose(hashtable->outerBatchFile[i]); + } + } + + /* Release working memory (batchCxt is a child, so it goes away too) */ + MemoryContextDelete(hashtable->hashCxt); + + /* And drop the control block */ + pfree(hashtable); +} + +/* + * ExecHashIncreaseNumBatches + * increase the original number of batches in order to reduce + * current memory consumption + */ +static void +ExecHashIncreaseNumBatches(HashJoinTable hashtable) +{ + int oldnbatch = hashtable->nbatch; + int curbatch = hashtable->curbatch; + int nbatch; + MemoryContext oldcxt; + long ninmemory; + long nfreed; + HashMemoryChunk oldchunks; + + /* do nothing if we've decided to shut off growth */ + if (!hashtable->growEnabled) + return; + + /* safety check to avoid overflow */ + if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2))) + return; + + nbatch = oldnbatch * 2; + Assert(nbatch > 1); + +#ifdef HJDEBUG + printf("Hashjoin %p: increasing nbatch to %d because space = %zu\n", + hashtable, nbatch, hashtable->spaceUsed); +#endif + + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + if (hashtable->innerBatchFile == NULL) + { + /* we had no file arrays before */ + hashtable->innerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + /* time to establish the temp tablespaces, too */ + PrepareTempTablespaces(); + } + else + { + /* enlarge arrays and zero out added entries */ + hashtable->innerBatchFile = (BufFile **) + repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *)); + MemSet(hashtable->innerBatchFile + oldnbatch, 0, + (nbatch - oldnbatch) * sizeof(BufFile *)); + MemSet(hashtable->outerBatchFile + oldnbatch, 0, + (nbatch - oldnbatch) * sizeof(BufFile *)); + } + + MemoryContextSwitchTo(oldcxt); + + hashtable->nbatch = nbatch; + + /* + * Scan through the existing hash table entries and dump out any that are + * no longer of the current batch. + */ + ninmemory = nfreed = 0; + + /* If know we need to resize nbuckets, we can do it while rebatching. 
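For orientation, the chunk rescan below relies on how a tuple's bucket and batch are carved out of its 32-bit hash value: with power-of-two counts, the bucket comes from the low bits and the batch from bits above them, so doubling nbatch merely adds one more batch bit, and a tuple's batch number can stay the same or move later, never earlier (hence the Assert(batchno > curbatch) on the dump-out path). The sketch uses a plain shift-and-mask split to show the effect; ExecHashGetBucketAndBatch is the authoritative implementation and mixes the bits slightly differently.

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Split a hash value into (bucketno, batchno) for power-of-two bucket and
     * batch counts: low bits select the bucket, the bits just above them
     * select the batch.  Illustrative only.
     */
    static void
    get_bucket_and_batch(uint32_t hashvalue, int log2_nbuckets,
                         uint32_t nbuckets, uint32_t nbatch,
                         uint32_t *bucketno, uint32_t *batchno)
    {
        *bucketno = hashvalue & (nbuckets - 1);
        *batchno = (hashvalue >> log2_nbuckets) & (nbatch - 1);
    }

    int
    main(void)
    {
        uint32_t    hash = 0xDEADBEEF;
        uint32_t    bucket,
                    batch_before,
                    batch_after;

        /* 1024 buckets; 4 batches before growth, 8 after doubling */
        get_bucket_and_batch(hash, 10, 1024, 4, &bucket, &batch_before);
        get_bucket_and_batch(hash, 10, 1024, 8, &bucket, &batch_after);

        /* Doubling nbatch adds one bit: the batch stays put or moves later. */
        printf("bucket %u: batch %u -> %u\n", bucket, batch_before, batch_after);
        return 0;
    }

With the values in main(), the batch number moves from 3 to 7 when nbatch doubles from 4 to 8, while the bucket stays 751.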
*/ + if (hashtable->nbuckets_optimal != hashtable->nbuckets) + { + /* we never decrease the number of buckets */ + Assert(hashtable->nbuckets_optimal > hashtable->nbuckets); + + hashtable->nbuckets = hashtable->nbuckets_optimal; + hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal; + + hashtable->buckets.unshared = + repalloc(hashtable->buckets.unshared, + sizeof(HashJoinTuple) * hashtable->nbuckets); + } + + /* + * We will scan through the chunks directly, so that we can reset the + * buckets now and not have to keep track which tuples in the buckets have + * already been processed. We will free the old chunks as we go. + */ + memset(hashtable->buckets.unshared, 0, + sizeof(HashJoinTuple) * hashtable->nbuckets); + oldchunks = hashtable->chunks; + hashtable->chunks = NULL; + + /* so, let's scan through the old chunks, and all tuples in each chunk */ + while (oldchunks != NULL) + { + HashMemoryChunk nextchunk = oldchunks->next.unshared; + + /* position within the buffer (up to oldchunks->used) */ + size_t idx = 0; + + /* process all tuples stored in this chunk (and then free it) */ + while (idx < oldchunks->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(oldchunks) + idx); + MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple); + int hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len); + int bucketno; + int batchno; + + ninmemory++; + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + + if (batchno == curbatch) + { + /* keep tuple in memory - copy it into the new chunk */ + HashJoinTuple copyTuple; + + copyTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize); + memcpy(copyTuple, hashTuple, hashTupleSize); + + /* and add it back to the appropriate bucket */ + copyTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = copyTuple; + } + else + { + /* dump it out */ + Assert(batchno > curbatch); + ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple), + hashTuple->hashvalue, + &hashtable->innerBatchFile[batchno]); + + hashtable->spaceUsed -= hashTupleSize; + nfreed++; + } + + /* next tuple in this chunk */ + idx += MAXALIGN(hashTupleSize); + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + + /* we're done with this chunk - free it and proceed to the next one */ + pfree(oldchunks); + oldchunks = nextchunk; + } + +#ifdef HJDEBUG + printf("Hashjoin %p: freed %ld of %ld tuples, space now %zu\n", + hashtable, nfreed, ninmemory, hashtable->spaceUsed); +#endif + + /* + * If we dumped out either all or none of the tuples in the table, disable + * further expansion of nbatch. This situation implies that we have + * enough tuples of identical hashvalues to overflow spaceAllowed. + * Increasing nbatch will not fix it since there's no way to subdivide the + * group any more finely. We have to just gut it out and hope the server + * has enough RAM. + */ + if (nfreed == 0 || nfreed == ninmemory) + { + hashtable->growEnabled = false; +#ifdef HJDEBUG + printf("Hashjoin %p: disabling further increase of nbatch\n", + hashtable); +#endif + } +} + +/* + * ExecParallelHashIncreaseNumBatches + * Every participant attached to grow_batches_barrier must run this + * function when it observes growth == PHJ_GROWTH_NEED_MORE_BATCHES. 
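The phase machine below repeatedly uses an "everyone arrives, exactly one participant is elected to do the serial step" idiom, with BarrierArriveAndWait returning true in just one backend. For readers who know POSIX threads, the hypothetical sketch below shows the same idiom with pthread_barrier_wait, which hands PTHREAD_BARRIER_SERIAL_THREAD to a single thread; the executor's Barrier additionally tracks a phase number so that participants attaching late can tell which step is in progress and join in correctly.

    #include <pthread.h>
    #include <stdio.h>

    #define NPARTICIPANTS 4

    static pthread_barrier_t barrier;
    static int  shared_setup_done = 0;

    static void *
    participant(void *arg)
    {
        int         id = *(int *) arg;

        /* Everyone arrives; exactly one thread is "elected" to do the setup. */
        if (pthread_barrier_wait(&barrier) == PTHREAD_BARRIER_SERIAL_THREAD)
        {
            shared_setup_done = 1;  /* the one-time, serial step */
            printf("participant %d was elected to do the setup\n", id);
        }

        /* A second wait ensures no one proceeds before the setup is visible. */
        pthread_barrier_wait(&barrier);

        printf("participant %d continues, setup_done=%d\n", id, shared_setup_done);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t   threads[NPARTICIPANTS];
        int         ids[NPARTICIPANTS];

        pthread_barrier_init(&barrier, NULL, NPARTICIPANTS);
        for (int i = 0; i < NPARTICIPANTS; i++)
        {
            ids[i] = i;
            pthread_create(&threads[i], NULL, participant, &ids[i]);
        }
        for (int i = 0; i < NPARTICIPANTS; i++)
            pthread_join(threads[i], NULL);
        pthread_barrier_destroy(&barrier);
        return 0;
    }

Build with cc -pthread; the second wait guarantees no participant reads the shared state before the elected one has finished the setup.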
+ */ +static void +ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + + /* + * It's unlikely, but we need to be prepared for new participants to show + * up while we're in the middle of this operation so we need to switch on + * barrier phase here. + */ + switch (PHJ_GROW_BATCHES_PHASE(BarrierPhase(&pstate->grow_batches_barrier))) + { + case PHJ_GROW_BATCHES_ELECTING: + + /* + * Elect one participant to prepare to grow the number of batches. + * This involves reallocating or resetting the buckets of batch 0 + * in preparation for all participants to begin repartitioning the + * tuples. + */ + if (BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_ELECT)) + { + dsa_pointer_atomic *buckets; + ParallelHashJoinBatch *old_batch0; + int new_nbatch; + int i; + + /* Move the old batch out of the way. */ + old_batch0 = hashtable->batches[0].shared; + pstate->old_batches = pstate->batches; + pstate->old_nbatch = hashtable->nbatch; + pstate->batches = InvalidDsaPointer; + + /* Free this backend's old accessors. */ + ExecParallelHashCloseBatchAccessors(hashtable); + + /* Figure out how many batches to use. */ + if (hashtable->nbatch == 1) + { + /* + * We are going from single-batch to multi-batch. We need + * to switch from one large combined memory budget to the + * regular hash_mem budget. + */ + pstate->space_allowed = get_hash_memory_limit(); + + /* + * The combined hash_mem of all participants wasn't + * enough. Therefore one batch per participant would be + * approximately equivalent and would probably also be + * insufficient. So try two batches per participant, + * rounded up to a power of two. + */ + new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2); + } + else + { + /* + * We were already multi-batched. Try doubling the number + * of batches. + */ + new_nbatch = hashtable->nbatch * 2; + } + + /* Allocate new larger generation of batches. */ + Assert(hashtable->nbatch == pstate->nbatch); + ExecParallelHashJoinSetUpBatches(hashtable, new_nbatch); + Assert(hashtable->nbatch == pstate->nbatch); + + /* Replace or recycle batch 0's bucket array. */ + if (pstate->old_nbatch == 1) + { + double dtuples; + double dbuckets; + int new_nbuckets; + + /* + * We probably also need a smaller bucket array. How many + * tuples do we expect per batch, assuming we have only + * half of them so far? Normally we don't need to change + * the bucket array's size, because the size of each batch + * stays the same as we add more batches, but in this + * special case we move from a large batch to many smaller + * batches and it would be wasteful to keep the large + * array. 
+ */ + dtuples = (old_batch0->ntuples * 2.0) / new_nbatch; + dbuckets = ceil(dtuples / NTUP_PER_BUCKET); + dbuckets = Min(dbuckets, + MaxAllocSize / sizeof(dsa_pointer_atomic)); + new_nbuckets = (int) dbuckets; + new_nbuckets = Max(new_nbuckets, 1024); + new_nbuckets = pg_nextpower2_32(new_nbuckets); + dsa_free(hashtable->area, old_batch0->buckets); + hashtable->batches[0].shared->buckets = + dsa_allocate(hashtable->area, + sizeof(dsa_pointer_atomic) * new_nbuckets); + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, + hashtable->batches[0].shared->buckets); + for (i = 0; i < new_nbuckets; ++i) + dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer); + pstate->nbuckets = new_nbuckets; + } + else + { + /* Recycle the existing bucket array. */ + hashtable->batches[0].shared->buckets = old_batch0->buckets; + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, old_batch0->buckets); + for (i = 0; i < hashtable->nbuckets; ++i) + dsa_pointer_atomic_write(&buckets[i], InvalidDsaPointer); + } + + /* Move all chunks to the work queue for parallel processing. */ + pstate->chunk_work_queue = old_batch0->chunks; + + /* Disable further growth temporarily while we're growing. */ + pstate->growth = PHJ_GROWTH_DISABLED; + } + else + { + /* All other participants just flush their tuples to disk. */ + ExecParallelHashCloseBatchAccessors(hashtable); + } + /* Fall through. */ + + case PHJ_GROW_BATCHES_ALLOCATING: + /* Wait for the above to be finished. */ + BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE); + /* Fall through. */ + + case PHJ_GROW_BATCHES_REPARTITIONING: + /* Make sure that we have the current dimensions and buckets. */ + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + /* Then partition, flush counters. */ + ExecParallelHashRepartitionFirst(hashtable); + ExecParallelHashRepartitionRest(hashtable); + ExecParallelHashMergeCounters(hashtable); + /* Wait for the above to be finished. */ + BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_REPARTITION); + /* Fall through. */ + + case PHJ_GROW_BATCHES_DECIDING: + + /* + * Elect one participant to clean up and decide whether further + * repartitioning is needed, or should be disabled because it's + * not helping. + */ + if (BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_DECIDE)) + { + bool space_exhausted = false; + bool extreme_skew_detected = false; + + /* Make sure that we have the current dimensions and buckets. */ + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + + /* Are any of the new generation of batches exhausted? */ + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatch *batch = hashtable->batches[i].shared; + + if (batch->space_exhausted || + batch->estimated_size > pstate->space_allowed) + { + int parent; + + space_exhausted = true; + + /* + * Did this batch receive ALL of the tuples from its + * parent batch? That would indicate that further + * repartitioning isn't going to help (the hash values + * are probably all the same). + */ + parent = i % pstate->old_nbatch; + if (batch->ntuples == hashtable->batches[parent].shared->old_ntuples) + extreme_skew_detected = true; + } + } + + /* Don't keep growing if it's not helping or we'd overflow. 
*/ + if (extreme_skew_detected || hashtable->nbatch >= INT_MAX / 2) + pstate->growth = PHJ_GROWTH_DISABLED; + else if (space_exhausted) + pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; + else + pstate->growth = PHJ_GROWTH_OK; + + /* Free the old batches in shared memory. */ + dsa_free(hashtable->area, pstate->old_batches); + pstate->old_batches = InvalidDsaPointer; + } + /* Fall through. */ + + case PHJ_GROW_BATCHES_FINISHING: + /* Wait for the above to complete. */ + BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_FINISH); + } +} + +/* + * Repartition the tuples currently loaded into memory for inner batch 0 + * because the number of batches has been increased. Some tuples are retained + * in memory and some are written out to a later batch. + */ +static void +ExecParallelHashRepartitionFirst(HashJoinTable hashtable) +{ + dsa_pointer chunk_shared; + HashMemoryChunk chunk; + + Assert(hashtable->nbatch == hashtable->parallel_state->nbatch); + + while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared))) + { + size_t idx = 0; + + /* Repartition all tuples in this chunk. */ + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); + MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple); + HashJoinTuple copyTuple; + dsa_pointer shared; + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + + Assert(batchno < hashtable->nbatch); + if (batchno == 0) + { + /* It still belongs in batch 0. Copy to a new chunk. */ + copyTuple = + ExecParallelHashTupleAlloc(hashtable, + HJTUPLE_OVERHEAD + tuple->t_len, + &shared); + copyTuple->hashvalue = hashTuple->hashvalue; + memcpy(HJTUPLE_MINTUPLE(copyTuple), tuple, tuple->t_len); + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + copyTuple, shared); + } + else + { + size_t tuple_size = + MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + + /* It belongs in a later batch. */ + hashtable->batches[batchno].estimated_size += tuple_size; + sts_puttuple(hashtable->batches[batchno].inner_tuples, + &hashTuple->hashvalue, tuple); + } + + /* Count this tuple. */ + ++hashtable->batches[0].old_ntuples; + ++hashtable->batches[batchno].ntuples; + + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + + /* Free this chunk. */ + dsa_free(hashtable->area, chunk_shared); + + CHECK_FOR_INTERRUPTS(); + } +} + +/* + * Help repartition inner batches 1..n. + */ +static void +ExecParallelHashRepartitionRest(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int old_nbatch = pstate->old_nbatch; + SharedTuplestoreAccessor **old_inner_tuples; + ParallelHashJoinBatch *old_batches; + int i; + + /* Get our hands on the previous generation of batches. */ + old_batches = (ParallelHashJoinBatch *) + dsa_get_address(hashtable->area, pstate->old_batches); + old_inner_tuples = palloc0(sizeof(SharedTuplestoreAccessor *) * old_nbatch); + for (i = 1; i < old_nbatch; ++i) + { + ParallelHashJoinBatch *shared = + NthParallelHashJoinBatch(old_batches, i); + + old_inner_tuples[i] = sts_attach(ParallelHashJoinBatchInner(shared), + ParallelWorkerNumber + 1, + &pstate->fileset); + } + + /* Join in the effort to repartition them. */ + for (i = 1; i < old_nbatch; ++i) + { + MinimalTuple tuple; + uint32 hashvalue; + + /* Scan one partition from the previous generation. 
*/ + sts_begin_parallel_scan(old_inner_tuples[i]); + while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], &hashvalue))) + { + size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + int bucketno; + int batchno; + + /* Decide which partition it goes to in the new generation. */ + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, + &batchno); + + hashtable->batches[batchno].estimated_size += tuple_size; + ++hashtable->batches[batchno].ntuples; + ++hashtable->batches[i].old_ntuples; + + /* Store the tuple its new batch. */ + sts_puttuple(hashtable->batches[batchno].inner_tuples, + &hashvalue, tuple); + + CHECK_FOR_INTERRUPTS(); + } + sts_end_parallel_scan(old_inner_tuples[i]); + } + + pfree(old_inner_tuples); +} + +/* + * Transfer the backend-local per-batch counters to the shared totals. + */ +static void +ExecParallelHashMergeCounters(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + pstate->total_tuples = 0; + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatchAccessor *batch = &hashtable->batches[i]; + + batch->shared->size += batch->size; + batch->shared->estimated_size += batch->estimated_size; + batch->shared->ntuples += batch->ntuples; + batch->shared->old_ntuples += batch->old_ntuples; + batch->size = 0; + batch->estimated_size = 0; + batch->ntuples = 0; + batch->old_ntuples = 0; + pstate->total_tuples += batch->shared->ntuples; + } + LWLockRelease(&pstate->lock); +} + +/* + * ExecHashIncreaseNumBuckets + * increase the original number of buckets in order to reduce + * number of tuples per bucket + */ +static void +ExecHashIncreaseNumBuckets(HashJoinTable hashtable) +{ + HashMemoryChunk chunk; + + /* do nothing if not an increase (it's called increase for a reason) */ + if (hashtable->nbuckets >= hashtable->nbuckets_optimal) + return; + +#ifdef HJDEBUG + printf("Hashjoin %p: increasing nbuckets %d => %d\n", + hashtable, hashtable->nbuckets, hashtable->nbuckets_optimal); +#endif + + hashtable->nbuckets = hashtable->nbuckets_optimal; + hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal; + + Assert(hashtable->nbuckets > 1); + Assert(hashtable->nbuckets <= (INT_MAX / 2)); + Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets)); + + /* + * Just reallocate the proper number of buckets - we don't need to walk + * through them - we can walk the dense-allocated chunks (just like in + * ExecHashIncreaseNumBatches, but without all the copying into new + * chunks) + */ + hashtable->buckets.unshared = + (HashJoinTuple *) repalloc(hashtable->buckets.unshared, + hashtable->nbuckets * sizeof(HashJoinTuple)); + + memset(hashtable->buckets.unshared, 0, + hashtable->nbuckets * sizeof(HashJoinTuple)); + + /* scan through all tuples in all chunks to rebuild the hash table */ + for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next.unshared) + { + /* process all tuples stored in this chunk */ + size_t idx = 0; + + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + + /* add the tuple to the proper bucket */ + hashTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = hashTuple; + + /* advance index past the tuple */ + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + + /* allow this loop to 
be cancellable */ + CHECK_FOR_INTERRUPTS(); + } +} + +static void +ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + HashMemoryChunk chunk; + dsa_pointer chunk_s; + + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + + /* + * It's unlikely, but we need to be prepared for new participants to show + * up while we're in the middle of this operation so we need to switch on + * barrier phase here. + */ + switch (PHJ_GROW_BUCKETS_PHASE(BarrierPhase(&pstate->grow_buckets_barrier))) + { + case PHJ_GROW_BUCKETS_ELECTING: + /* Elect one participant to prepare to increase nbuckets. */ + if (BarrierArriveAndWait(&pstate->grow_buckets_barrier, + WAIT_EVENT_HASH_GROW_BUCKETS_ELECT)) + { + size_t size; + dsa_pointer_atomic *buckets; + + /* Double the size of the bucket array. */ + pstate->nbuckets *= 2; + size = pstate->nbuckets * sizeof(dsa_pointer_atomic); + hashtable->batches[0].shared->size += size / 2; + dsa_free(hashtable->area, hashtable->batches[0].shared->buckets); + hashtable->batches[0].shared->buckets = + dsa_allocate(hashtable->area, size); + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, + hashtable->batches[0].shared->buckets); + for (i = 0; i < pstate->nbuckets; ++i) + dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer); + + /* Put the chunk list onto the work queue. */ + pstate->chunk_work_queue = hashtable->batches[0].shared->chunks; + + /* Clear the flag. */ + pstate->growth = PHJ_GROWTH_OK; + } + /* Fall through. */ + + case PHJ_GROW_BUCKETS_ALLOCATING: + /* Wait for the above to complete. */ + BarrierArriveAndWait(&pstate->grow_buckets_barrier, + WAIT_EVENT_HASH_GROW_BUCKETS_ALLOCATE); + /* Fall through. */ + + case PHJ_GROW_BUCKETS_REINSERTING: + /* Reinsert all tuples into the hash table. */ + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_s))) + { + size_t idx = 0; + + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); + dsa_pointer shared = chunk_s + HASH_CHUNK_HEADER_SIZE + idx; + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + Assert(batchno == 0); + + /* add the tuple to the proper bucket */ + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + hashTuple, shared); + + /* advance index past the tuple */ + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + BarrierArriveAndWait(&pstate->grow_buckets_barrier, + WAIT_EVENT_HASH_GROW_BUCKETS_REINSERT); + } +} + +/* + * ExecHashTableInsert + * insert a tuple into the hash table depending on the hash value + * it may just go to a temp file for later batches + * + * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual + * tuple; the minimal case in particular is certain to happen while reloading + * tuples from batch files. We could save some cycles in the regular-tuple + * case by not forcing the slot contents into minimal form; not clear if it's + * worth the messiness required. 
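+ *
+ * For illustration, with nbatch = 4 and curbatch = 0 during the initial
+ * build, a tuple whose hash value maps to batchno 0 goes straight into the
+ * in-memory hash table, while one that maps to batchno 3 is written out
+ * via ExecHashJoinSaveTuple() to innerBatchFile[3] and is only reloaded
+ * when batch 3 is processed.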
+ */ +void +ExecHashTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &bucketno, &batchno); + + /* + * decide whether to put the tuple in the hash table or a temp file + */ + if (batchno == hashtable->curbatch) + { + /* + * put the tuple in hash table + */ + HashJoinTuple hashTuple; + int hashTupleSize; + double ntuples = (hashtable->totalTuples - hashtable->skewTuples); + + /* Create the HashJoinTuple */ + hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; + hashTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize); + + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + + /* + * We always reset the tuple-matched flag on insertion. This is okay + * even when reloading a tuple from a batch file, since the tuple + * could not possibly have been matched to an outer tuple before it + * went into the batch file. + */ + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); + + /* Push it onto the front of the bucket's list */ + hashTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = hashTuple; + + /* + * Increase the (optimal) number of buckets if we just exceeded the + * NTUP_PER_BUCKET threshold, but only when there's still a single + * batch. + */ + if (hashtable->nbatch == 1 && + ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)) + { + /* Guard against integer overflow and alloc size overflow */ + if (hashtable->nbuckets_optimal <= INT_MAX / 2 && + hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple)) + { + hashtable->nbuckets_optimal *= 2; + hashtable->log2_nbuckets_optimal += 1; + } + } + + /* Account for space used, and back off if we've used too much */ + hashtable->spaceUsed += hashTupleSize; + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + if (hashtable->spaceUsed + + hashtable->nbuckets_optimal * sizeof(HashJoinTuple) + > hashtable->spaceAllowed) + ExecHashIncreaseNumBatches(hashtable); + } + else + { + /* + * put the tuple into a temp file for later batches + */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(tuple, + hashvalue, + &hashtable->innerBatchFile[batchno]); + } + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * ExecParallelHashTableInsert + * insert a tuple into a shared hash table or shared batch tuplestore + */ +void +ExecParallelHashTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + dsa_pointer shared; + int bucketno; + int batchno; + +retry: + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); + + if (batchno == 0) + { + HashJoinTuple hashTuple; + + /* Try to load it into memory. */ + Assert(BarrierPhase(&hashtable->parallel_state->build_barrier) == + PHJ_BUILD_HASHING_INNER); + hashTuple = ExecParallelHashTupleAlloc(hashtable, + HJTUPLE_OVERHEAD + tuple->t_len, + &shared); + if (hashTuple == NULL) + goto retry; + + /* Store the hash value in the HashJoinTuple header. 
*/ + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + + /* Push it onto the front of the bucket's list */ + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + hashTuple, shared); + } + else + { + size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + + Assert(batchno > 0); + + /* Try to preallocate space in the batch if necessary. */ + if (hashtable->batches[batchno].preallocated < tuple_size) + { + if (!ExecParallelHashTuplePrealloc(hashtable, batchno, tuple_size)) + goto retry; + } + + Assert(hashtable->batches[batchno].preallocated >= tuple_size); + hashtable->batches[batchno].preallocated -= tuple_size; + sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue, + tuple); + } + ++hashtable->batches[batchno].ntuples; + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * Insert a tuple into the current hash table. Unlike + * ExecParallelHashTableInsert, this version is not prepared to send the tuple + * to other batches or to run out of memory, and should only be called with + * tuples that belong in the current batch once growth has been disabled. + */ +void +ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + HashJoinTuple hashTuple; + dsa_pointer shared; + int batchno; + int bucketno; + + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); + Assert(batchno == hashtable->curbatch); + hashTuple = ExecParallelHashTupleAlloc(hashtable, + HJTUPLE_OVERHEAD + tuple->t_len, + &shared); + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + hashTuple, shared); + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * ExecHashGetHashValue + * Compute the hash value for a tuple + * + * The tuple to be tested must be in econtext->ecxt_outertuple (thus Vars in + * the hashkeys expressions need to have OUTER_VAR as varno). If outer_tuple + * is false (meaning it's the HashJoin's inner node, Hash), econtext, + * hashkeys, and slot need to be from Hash, with hashkeys/slot referencing and + * being suitable for tuples from the node below the Hash. Conversely, if + * outer_tuple is true, econtext is from HashJoin, and hashkeys/slot need to + * be appropriate for tuples from HashJoin's outer node. + * + * A true result means the tuple's hash value has been successfully computed + * and stored at *hashvalue. A false result means the tuple cannot match + * because it contains a null attribute, and hence it should be discarded + * immediately. (If keep_nulls is true then false is never returned.) + */ +bool +ExecHashGetHashValue(HashJoinTable hashtable, + ExprContext *econtext, + List *hashkeys, + bool outer_tuple, + bool keep_nulls, + uint32 *hashvalue) +{ + uint32 hashkey = 0; + FmgrInfo *hashfunctions; + ListCell *hk; + int i = 0; + MemoryContext oldContext; + + /* + * We reset the eval context each time to reclaim any memory leaked in the + * hashkey expressions. 
+ */ + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + if (outer_tuple) + hashfunctions = hashtable->outer_hashfunctions; + else + hashfunctions = hashtable->inner_hashfunctions; + + foreach(hk, hashkeys) + { + ExprState *keyexpr = (ExprState *) lfirst(hk); + Datum keyval; + bool isNull; + + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + /* + * Get the join attribute value of the tuple + */ + keyval = ExecEvalExpr(keyexpr, econtext, &isNull); + + /* + * If the attribute is NULL, and the join operator is strict, then + * this tuple cannot pass the join qual so we can reject it + * immediately (unless we're scanning the outside of an outer join, in + * which case we must not reject it). Otherwise we act like the + * hashcode of NULL is zero (this will support operators that act like + * IS NOT DISTINCT, though not any more-random behavior). We treat + * the hash support function as strict even if the operator is not. + * + * Note: currently, all hashjoinable operators must be strict since + * the hash index AM assumes that. However, it takes so little extra + * code here to allow non-strict that we may as well do it. + */ + if (isNull) + { + if (hashtable->hashStrict[i] && !keep_nulls) + { + MemoryContextSwitchTo(oldContext); + return false; /* cannot match */ + } + /* else, leave hashkey unmodified, equivalent to hashcode 0 */ + } + else + { + /* Compute the hash function */ + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], hashtable->collations[i], keyval)); + hashkey ^= hkey; + } + + i++; + } + + MemoryContextSwitchTo(oldContext); + + *hashvalue = hashkey; + return true; +} + +/* + * ExecHashGetBucketAndBatch + * Determine the bucket number and batch number for a hash value + * + * Note: on-the-fly increases of nbatch must not change the bucket number + * for a given hash code (since we don't move tuples to different hash + * chains), and must only cause the batch number to remain the same or + * increase. Our algorithm is + * bucketno = hashvalue MOD nbuckets + * batchno = ROR(hashvalue, log2_nbuckets) MOD nbatch + * where nbuckets and nbatch are both expected to be powers of 2, so we can + * do the computations by shifting and masking. (This assumes that all hash + * functions are good about randomizing all their output bits, else we are + * likely to have very skewed bucket or batch occupancy.) + * + * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic + * bucket count growth. Once we start batching, the value is fixed and does + * not change over the course of the join (making it possible to compute batch + * number the way we do here). + * + * nbatch is always a power of 2; we increase it only by doubling it. This + * effectively adds one more bit to the top of the batchno. In very large + * joins, we might run out of bits to add, so we do this by rotating the hash + * value. This causes batchno to steal bits from bucketno when the number of + * virtual buckets exceeds 2^32. It's better to have longer bucket chains + * than to lose the ability to divide batches. 
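+ *
+ * For illustration, suppose nbuckets = 1024 (log2_nbuckets = 10) and
+ * nbatch = 4.  Then for hashvalue 0xDEADBEEF:
+ *		bucketno = 0xDEADBEEF & (1024 - 1)       = 0x2EF  (bits 0..9)
+ *		batchno  = ROR(0xDEADBEEF, 10) & (4 - 1) = 0x3    (bits 10..11)
+ * so batchno is taken from the bits immediately above the bucket bits;
+ * doubling nbatch merely exposes one more higher-order bit to batchno and
+ * leaves bucketno untouched, as required above.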
+ */ +void +ExecHashGetBucketAndBatch(HashJoinTable hashtable, + uint32 hashvalue, + int *bucketno, + int *batchno) +{ + uint32 nbuckets = (uint32) hashtable->nbuckets; + uint32 nbatch = (uint32) hashtable->nbatch; + + if (nbatch > 1) + { + *bucketno = hashvalue & (nbuckets - 1); + *batchno = pg_rotate_right32(hashvalue, + hashtable->log2_nbuckets) & (nbatch - 1); + } + else + { + *bucketno = hashvalue & (nbuckets - 1); + *batchno = 0; + } +} + +/* + * ExecScanHashBucket + * scan a hash bucket for matches to the current outer tuple + * + * The current outer tuple must be stored in econtext->ecxt_outertuple. + * + * On success, the inner tuple is stored into hjstate->hj_CurTuple and + * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot + * for the latter. + */ +bool +ExecScanHashBucket(HashJoinState *hjstate, + ExprContext *econtext) +{ + ExprState *hjclauses = hjstate->hashclauses; + HashJoinTable hashtable = hjstate->hj_HashTable; + HashJoinTuple hashTuple = hjstate->hj_CurTuple; + uint32 hashvalue = hjstate->hj_CurHashValue; + + /* + * hj_CurTuple is the address of the tuple last returned from the current + * bucket, or NULL if it's time to start scanning a new bucket. + * + * If the tuple hashed to a skew bucket then scan the skew bucket + * otherwise scan the standard hashtable bucket. + */ + if (hashTuple != NULL) + hashTuple = hashTuple->next.unshared; + else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO) + hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples; + else + hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo]; + + while (hashTuple != NULL) + { + if (hashTuple->hashvalue == hashvalue) + { + TupleTableSlot *inntuple; + + /* insert hashtable's tuple into exec slot so ExecQual sees it */ + inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple), + hjstate->hj_HashTupleSlot, + false); /* do not pfree */ + econtext->ecxt_innertuple = inntuple; + + if (ExecQualAndReset(hjclauses, econtext)) + { + hjstate->hj_CurTuple = hashTuple; + return true; + } + } + + hashTuple = hashTuple->next.unshared; + } + + /* + * no match + */ + return false; +} + +/* + * ExecParallelScanHashBucket + * scan a hash bucket for matches to the current outer tuple + * + * The current outer tuple must be stored in econtext->ecxt_outertuple. + * + * On success, the inner tuple is stored into hjstate->hj_CurTuple and + * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot + * for the latter. + */ +bool +ExecParallelScanHashBucket(HashJoinState *hjstate, + ExprContext *econtext) +{ + ExprState *hjclauses = hjstate->hashclauses; + HashJoinTable hashtable = hjstate->hj_HashTable; + HashJoinTuple hashTuple = hjstate->hj_CurTuple; + uint32 hashvalue = hjstate->hj_CurHashValue; + + /* + * hj_CurTuple is the address of the tuple last returned from the current + * bucket, or NULL if it's time to start scanning a new bucket. 
+ */ + if (hashTuple != NULL) + hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple); + else + hashTuple = ExecParallelHashFirstTuple(hashtable, + hjstate->hj_CurBucketNo); + + while (hashTuple != NULL) + { + if (hashTuple->hashvalue == hashvalue) + { + TupleTableSlot *inntuple; + + /* insert hashtable's tuple into exec slot so ExecQual sees it */ + inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple), + hjstate->hj_HashTupleSlot, + false); /* do not pfree */ + econtext->ecxt_innertuple = inntuple; + + if (ExecQualAndReset(hjclauses, econtext)) + { + hjstate->hj_CurTuple = hashTuple; + return true; + } + } + + hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple); + } + + /* + * no match + */ + return false; +} + +/* + * ExecPrepHashTableForUnmatched + * set up for a series of ExecScanHashTableForUnmatched calls + */ +void +ExecPrepHashTableForUnmatched(HashJoinState *hjstate) +{ + /*---------- + * During this scan we use the HashJoinState fields as follows: + * + * hj_CurBucketNo: next regular bucket to scan + * hj_CurSkewBucketNo: next skew bucket (an index into skewBucketNums) + * hj_CurTuple: last tuple returned, or NULL to start next bucket + *---------- + */ + hjstate->hj_CurBucketNo = 0; + hjstate->hj_CurSkewBucketNo = 0; + hjstate->hj_CurTuple = NULL; +} + +/* + * ExecScanHashTableForUnmatched + * scan the hash table for unmatched inner tuples + * + * On success, the inner tuple is stored into hjstate->hj_CurTuple and + * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot + * for the latter. + */ +bool +ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + HashJoinTuple hashTuple = hjstate->hj_CurTuple; + + for (;;) + { + /* + * hj_CurTuple is the address of the tuple last returned from the + * current bucket, or NULL if it's time to start scanning a new + * bucket. + */ + if (hashTuple != NULL) + hashTuple = hashTuple->next.unshared; + else if (hjstate->hj_CurBucketNo < hashtable->nbuckets) + { + hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo]; + hjstate->hj_CurBucketNo++; + } + else if (hjstate->hj_CurSkewBucketNo < hashtable->nSkewBuckets) + { + int j = hashtable->skewBucketNums[hjstate->hj_CurSkewBucketNo]; + + hashTuple = hashtable->skewBucket[j]->tuples; + hjstate->hj_CurSkewBucketNo++; + } + else + break; /* finished all buckets */ + + while (hashTuple != NULL) + { + if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple))) + { + TupleTableSlot *inntuple; + + /* insert hashtable's tuple into exec slot */ + inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple), + hjstate->hj_HashTupleSlot, + false); /* do not pfree */ + econtext->ecxt_innertuple = inntuple; + + /* + * Reset temp memory each time; although this function doesn't + * do any qual eval, the caller will, so let's keep it + * parallel to ExecScanHashBucket. + */ + ResetExprContext(econtext); + + hjstate->hj_CurTuple = hashTuple; + return true; + } + + hashTuple = hashTuple->next.unshared; + } + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + + /* + * no more unmatched tuples + */ + return false; +} + +/* + * ExecHashTableReset + * + * reset hash table header for new batch + */ +void +ExecHashTableReset(HashJoinTable hashtable) +{ + MemoryContext oldcxt; + int nbuckets = hashtable->nbuckets; + + /* + * Release all the hash buckets and tuples acquired in the prior pass, and + * reinitialize the context for a new pass. 
+ */ + MemoryContextReset(hashtable->batchCxt); + oldcxt = MemoryContextSwitchTo(hashtable->batchCxt); + + /* Reallocate and reinitialize the hash bucket headers. */ + hashtable->buckets.unshared = (HashJoinTuple *) + palloc0(nbuckets * sizeof(HashJoinTuple)); + + hashtable->spaceUsed = 0; + + MemoryContextSwitchTo(oldcxt); + + /* Forget the chunks (the memory was freed by the context reset above). */ + hashtable->chunks = NULL; +} + +/* + * ExecHashTableResetMatchFlags + * Clear all the HeapTupleHeaderHasMatch flags in the table + */ +void +ExecHashTableResetMatchFlags(HashJoinTable hashtable) +{ + HashJoinTuple tuple; + int i; + + /* Reset all flags in the main table ... */ + for (i = 0; i < hashtable->nbuckets; i++) + { + for (tuple = hashtable->buckets.unshared[i]; tuple != NULL; + tuple = tuple->next.unshared) + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple)); + } + + /* ... and the same for the skew buckets, if any */ + for (i = 0; i < hashtable->nSkewBuckets; i++) + { + int j = hashtable->skewBucketNums[i]; + HashSkewBucket *skewBucket = hashtable->skewBucket[j]; + + for (tuple = skewBucket->tuples; tuple != NULL; tuple = tuple->next.unshared) + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple)); + } +} + + +void +ExecReScanHash(HashState *node) +{ + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} + + +/* + * ExecHashBuildSkewHash + * + * Set up for skew optimization if we can identify the most common values + * (MCVs) of the outer relation's join key. We make a skew hash bucket + * for the hash value of each MCV, up to the number of slots allowed + * based on available memory. + */ +static void +ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, int mcvsToUse) +{ + HeapTupleData *statsTuple; + AttStatsSlot sslot; + + /* Do nothing if planner didn't identify the outer relation's join key */ + if (!OidIsValid(node->skewTable)) + return; + /* Also, do nothing if we don't have room for at least one skew bucket */ + if (mcvsToUse <= 0) + return; + + /* + * Try to find the MCV statistics for the outer relation's join key. + */ + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(node->skewTable), + Int16GetDatum(node->skewColumn), + BoolGetDatum(node->skewInherit)); + if (!HeapTupleIsValid(statsTuple)) + return; + + if (get_attstatsslot(&sslot, statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)) + { + double frac; + int nbuckets; + FmgrInfo *hashfunctions; + int i; + + if (mcvsToUse > sslot.nvalues) + mcvsToUse = sslot.nvalues; + + /* + * Calculate the expected fraction of outer relation that will + * participate in the skew optimization. If this isn't at least + * SKEW_MIN_OUTER_FRACTION, don't use skew optimization. + */ + frac = 0; + for (i = 0; i < mcvsToUse; i++) + frac += sslot.numbers[i]; + if (frac < SKEW_MIN_OUTER_FRACTION) + { + free_attstatsslot(&sslot); + ReleaseSysCache(statsTuple); + return; + } + + /* + * Okay, set up the skew hashtable. + * + * skewBucket[] is an open addressing hashtable with a power of 2 size + * that is greater than the number of MCV values. (This ensures there + * will be at least one null entry, so searches will always + * terminate.) + * + * Note: this code could fail if mcvsToUse exceeds INT_MAX/8 or + * MaxAllocSize/sizeof(void *)/8, but that is not currently possible + * since we limit pg_statistic entries to much less than that. 
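+		 *
+		 * For illustration, if mcvsToUse = 100 then pg_nextpower2_32(101)
+		 * gives 128, and the two extra bits added below raise that to 512
+		 * skew buckets, so at most about 20% of the entries are occupied
+		 * and a probe quickly reaches a NULL slot.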
+ */ + nbuckets = pg_nextpower2_32(mcvsToUse + 1); + /* use two more bits just to help avoid collisions */ + nbuckets <<= 2; + + hashtable->skewEnabled = true; + hashtable->skewBucketLen = nbuckets; + + /* + * We allocate the bucket memory in the hashtable's batch context. It + * is only needed during the first batch, and this ensures it will be + * automatically removed once the first batch is done. + */ + hashtable->skewBucket = (HashSkewBucket **) + MemoryContextAllocZero(hashtable->batchCxt, + nbuckets * sizeof(HashSkewBucket *)); + hashtable->skewBucketNums = (int *) + MemoryContextAllocZero(hashtable->batchCxt, + mcvsToUse * sizeof(int)); + + hashtable->spaceUsed += nbuckets * sizeof(HashSkewBucket *) + + mcvsToUse * sizeof(int); + hashtable->spaceUsedSkew += nbuckets * sizeof(HashSkewBucket *) + + mcvsToUse * sizeof(int); + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + + /* + * Create a skew bucket for each MCV hash value. + * + * Note: it is very important that we create the buckets in order of + * decreasing MCV frequency. If we have to remove some buckets, they + * must be removed in reverse order of creation (see notes in + * ExecHashRemoveNextSkewBucket) and we want the least common MCVs to + * be removed first. + */ + hashfunctions = hashtable->outer_hashfunctions; + + for (i = 0; i < mcvsToUse; i++) + { + uint32 hashvalue; + int bucket; + + hashvalue = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[0], + hashtable->collations[0], + sslot.values[i])); + + /* + * While we have not hit a hole in the hashtable and have not hit + * the desired bucket, we have collided with some previous hash + * value, so try the next bucket location. NB: this code must + * match ExecHashGetSkewBucket. + */ + bucket = hashvalue & (nbuckets - 1); + while (hashtable->skewBucket[bucket] != NULL && + hashtable->skewBucket[bucket]->hashvalue != hashvalue) + bucket = (bucket + 1) & (nbuckets - 1); + + /* + * If we found an existing bucket with the same hashvalue, leave + * it alone. It's okay for two MCVs to share a hashvalue. + */ + if (hashtable->skewBucket[bucket] != NULL) + continue; + + /* Okay, create a new skew bucket for this hashvalue. */ + hashtable->skewBucket[bucket] = (HashSkewBucket *) + MemoryContextAlloc(hashtable->batchCxt, + sizeof(HashSkewBucket)); + hashtable->skewBucket[bucket]->hashvalue = hashvalue; + hashtable->skewBucket[bucket]->tuples = NULL; + hashtable->skewBucketNums[hashtable->nSkewBuckets] = bucket; + hashtable->nSkewBuckets++; + hashtable->spaceUsed += SKEW_BUCKET_OVERHEAD; + hashtable->spaceUsedSkew += SKEW_BUCKET_OVERHEAD; + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + } + + free_attstatsslot(&sslot); + } + + ReleaseSysCache(statsTuple); +} + +/* + * ExecHashGetSkewBucket + * + * Returns the index of the skew bucket for this hashvalue, + * or INVALID_SKEW_BUCKET_NO if the hashvalue is not + * associated with any active skew bucket. + */ +int +ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue) +{ + int bucket; + + /* + * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in + * particular, this happens after the initial batch is done). + */ + if (!hashtable->skewEnabled) + return INVALID_SKEW_BUCKET_NO; + + /* + * Since skewBucketLen is a power of 2, we can do a modulo by ANDing. 
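+	 * For example, with skewBucketLen = 512 this is hashvalue & 511, which
+	 * keeps just the low 9 bits of the hash value, i.e. hashvalue % 512.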
+ */ + bucket = hashvalue & (hashtable->skewBucketLen - 1); + + /* + * While we have not hit a hole in the hashtable and have not hit the + * desired bucket, we have collided with some other hash value, so try the + * next bucket location. + */ + while (hashtable->skewBucket[bucket] != NULL && + hashtable->skewBucket[bucket]->hashvalue != hashvalue) + bucket = (bucket + 1) & (hashtable->skewBucketLen - 1); + + /* + * Found the desired bucket? + */ + if (hashtable->skewBucket[bucket] != NULL) + return bucket; + + /* + * There must not be any hashtable entry for this hash value. + */ + return INVALID_SKEW_BUCKET_NO; +} + +/* + * ExecHashSkewTableInsert + * + * Insert a tuple into the skew hashtable. + * + * This should generally match up with the current-batch case in + * ExecHashTableInsert. + */ +static void +ExecHashSkewTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue, + int bucketNumber) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + HashJoinTuple hashTuple; + int hashTupleSize; + + /* Create the HashJoinTuple */ + hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; + hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt, + hashTupleSize); + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); + + /* Push it onto the front of the skew bucket's list */ + hashTuple->next.unshared = hashtable->skewBucket[bucketNumber]->tuples; + hashtable->skewBucket[bucketNumber]->tuples = hashTuple; + Assert(hashTuple != hashTuple->next.unshared); + + /* Account for space used, and back off if we've used too much */ + hashtable->spaceUsed += hashTupleSize; + hashtable->spaceUsedSkew += hashTupleSize; + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + while (hashtable->spaceUsedSkew > hashtable->spaceAllowedSkew) + ExecHashRemoveNextSkewBucket(hashtable); + + /* Check we are not over the total spaceAllowed, either */ + if (hashtable->spaceUsed > hashtable->spaceAllowed) + ExecHashIncreaseNumBatches(hashtable); + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * ExecHashRemoveNextSkewBucket + * + * Remove the least valuable skew bucket by pushing its tuples into + * the main hash table. + */ +static void +ExecHashRemoveNextSkewBucket(HashJoinTable hashtable) +{ + int bucketToRemove; + HashSkewBucket *bucket; + uint32 hashvalue; + int bucketno; + int batchno; + HashJoinTuple hashTuple; + + /* Locate the bucket to remove */ + bucketToRemove = hashtable->skewBucketNums[hashtable->nSkewBuckets - 1]; + bucket = hashtable->skewBucket[bucketToRemove]; + + /* + * Calculate which bucket and batch the tuples belong to in the main + * hashtable. They all have the same hash value, so it's the same for all + * of them. Also note that it's not possible for nbatch to increase while + * we are processing the tuples. + */ + hashvalue = bucket->hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); + + /* Process all tuples in the bucket */ + hashTuple = bucket->tuples; + while (hashTuple != NULL) + { + HashJoinTuple nextHashTuple = hashTuple->next.unshared; + MinimalTuple tuple; + Size tupleSize; + + /* + * This code must agree with ExecHashTableInsert. We do not use + * ExecHashTableInsert directly as ExecHashTableInsert expects a + * TupleTableSlot while we already have HashJoinTuples. 
+ */ + tuple = HJTUPLE_MINTUPLE(hashTuple); + tupleSize = HJTUPLE_OVERHEAD + tuple->t_len; + + /* Decide whether to put the tuple in the hash table or a temp file */ + if (batchno == hashtable->curbatch) + { + /* Move the tuple to the main hash table */ + HashJoinTuple copyTuple; + + /* + * We must copy the tuple into the dense storage, else it will not + * be found by, eg, ExecHashIncreaseNumBatches. + */ + copyTuple = (HashJoinTuple) dense_alloc(hashtable, tupleSize); + memcpy(copyTuple, hashTuple, tupleSize); + pfree(hashTuple); + + copyTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = copyTuple; + + /* We have reduced skew space, but overall space doesn't change */ + hashtable->spaceUsedSkew -= tupleSize; + } + else + { + /* Put the tuple into a temp file for later batches */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(tuple, hashvalue, + &hashtable->innerBatchFile[batchno]); + pfree(hashTuple); + hashtable->spaceUsed -= tupleSize; + hashtable->spaceUsedSkew -= tupleSize; + } + + hashTuple = nextHashTuple; + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + + /* + * Free the bucket struct itself and reset the hashtable entry to NULL. + * + * NOTE: this is not nearly as simple as it looks on the surface, because + * of the possibility of collisions in the hashtable. Suppose that hash + * values A and B collide at a particular hashtable entry, and that A was + * entered first so B gets shifted to a different table entry. If we were + * to remove A first then ExecHashGetSkewBucket would mistakenly start + * reporting that B is not in the hashtable, because it would hit the NULL + * before finding B. However, we always remove entries in the reverse + * order of creation, so this failure cannot happen. + */ + hashtable->skewBucket[bucketToRemove] = NULL; + hashtable->nSkewBuckets--; + pfree(bucket); + hashtable->spaceUsed -= SKEW_BUCKET_OVERHEAD; + hashtable->spaceUsedSkew -= SKEW_BUCKET_OVERHEAD; + + /* + * If we have removed all skew buckets then give up on skew optimization. + * Release the arrays since they aren't useful any more. + */ + if (hashtable->nSkewBuckets == 0) + { + hashtable->skewEnabled = false; + pfree(hashtable->skewBucket); + pfree(hashtable->skewBucketNums); + hashtable->skewBucket = NULL; + hashtable->skewBucketNums = NULL; + hashtable->spaceUsed -= hashtable->spaceUsedSkew; + hashtable->spaceUsedSkew = 0; + } +} + +/* + * Reserve space in the DSM segment for instrumentation data. + */ +void +ExecHashEstimate(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation)); + size = add_size(size, offsetof(SharedHashInfo, hinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* + * Set up a space in the DSM for all workers to record instrumentation data + * about their hash table. + */ +void +ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedHashInfo, hinstrument) + + pcxt->nworkers * sizeof(HashInstrumentation); + node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size); + + /* Each per-worker area must start out as zeroes. 
*/ + memset(node->shared_info, 0, size); + + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, + node->shared_info); +} + +/* + * Locate the DSM space for hash table instrumentation data that we'll write + * to at shutdown time. + */ +void +ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) +{ + SharedHashInfo *shared_info; + + /* don't need this if not instrumenting */ + if (!node->ps.instrument) + return; + + /* + * Find our entry in the shared area, and set up a pointer to it so that + * we'll accumulate stats there when shutting down or rebuilding the hash + * table. + */ + shared_info = (SharedHashInfo *) + shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false); + node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber]; +} + +/* + * Collect EXPLAIN stats if needed, saving them into DSM memory if + * ExecHashInitializeWorker was called, or local storage if not. In the + * parallel case, this must be done in ExecShutdownHash() rather than + * ExecEndHash() because the latter runs after we've detached from the DSM + * segment. + */ +void +ExecShutdownHash(HashState *node) +{ + /* Allocate save space if EXPLAIN'ing and we didn't do so already */ + if (node->ps.instrument && !node->hinstrument) + node->hinstrument = (HashInstrumentation *) + palloc0(sizeof(HashInstrumentation)); + /* Now accumulate data for the current (final) hash table */ + if (node->hinstrument && node->hashtable) + ExecHashAccumInstrumentation(node->hinstrument, node->hashtable); +} + +/* + * Retrieve instrumentation data from workers before the DSM segment is + * detached, so that EXPLAIN can access it. + */ +void +ExecHashRetrieveInstrumentation(HashState *node) +{ + SharedHashInfo *shared_info = node->shared_info; + size_t size; + + if (shared_info == NULL) + return; + + /* Replace node->shared_info with a copy in backend-local memory. */ + size = offsetof(SharedHashInfo, hinstrument) + + shared_info->num_workers * sizeof(HashInstrumentation); + node->shared_info = palloc(size); + memcpy(node->shared_info, shared_info, size); +} + +/* + * Accumulate instrumentation data from 'hashtable' into an + * initially-zeroed HashInstrumentation struct. + * + * This is used to merge information across successive hash table instances + * within a single plan node. We take the maximum values of each interesting + * number. The largest nbuckets and largest nbatch values might have occurred + * in different instances, so there's some risk of confusion from reporting + * unrelated numbers; but there's a bigger risk of misdiagnosing a performance + * issue if we don't report the largest values. Similarly, we want to report + * the largest spacePeak regardless of whether it happened in the same + * instance as the largest nbuckets or nbatch. All the instances should have + * the same nbuckets_original and nbatch_original; but there's little value + * in depending on that here, so handle them the same way. 
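+ *
+ * For illustration, if one hash table instance under this node peaked at
+ * nbatch = 2 with nbuckets = 1048576 and a later instance peaked at
+ * nbatch = 8 with nbuckets = 262144, the accumulated result reports
+ * nbatch = 8 and nbuckets = 1048576, even though those maxima came from
+ * different instances.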
+ */ +void +ExecHashAccumInstrumentation(HashInstrumentation *instrument, + HashJoinTable hashtable) +{ + instrument->nbuckets = Max(instrument->nbuckets, + hashtable->nbuckets); + instrument->nbuckets_original = Max(instrument->nbuckets_original, + hashtable->nbuckets_original); + instrument->nbatch = Max(instrument->nbatch, + hashtable->nbatch); + instrument->nbatch_original = Max(instrument->nbatch_original, + hashtable->nbatch_original); + instrument->space_peak = Max(instrument->space_peak, + hashtable->spacePeak); +} + +/* + * Allocate 'size' bytes from the currently active HashMemoryChunk + */ +static void * +dense_alloc(HashJoinTable hashtable, Size size) +{ + HashMemoryChunk newChunk; + char *ptr; + + /* just in case the size is not already aligned properly */ + size = MAXALIGN(size); + + /* + * If tuple size is larger than threshold, allocate a separate chunk. + */ + if (size > HASH_CHUNK_THRESHOLD) + { + /* allocate new chunk and put it at the beginning of the list */ + newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt, + HASH_CHUNK_HEADER_SIZE + size); + newChunk->maxlen = size; + newChunk->used = size; + newChunk->ntuples = 1; + + /* + * Add this chunk to the list after the first existing chunk, so that + * we don't lose the remaining space in the "current" chunk. + */ + if (hashtable->chunks != NULL) + { + newChunk->next = hashtable->chunks->next; + hashtable->chunks->next.unshared = newChunk; + } + else + { + newChunk->next.unshared = hashtable->chunks; + hashtable->chunks = newChunk; + } + + return HASH_CHUNK_DATA(newChunk); + } + + /* + * See if we have enough space for it in the current chunk (if any). If + * not, allocate a fresh chunk. + */ + if ((hashtable->chunks == NULL) || + (hashtable->chunks->maxlen - hashtable->chunks->used) < size) + { + /* allocate new chunk and put it at the beginning of the list */ + newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt, + HASH_CHUNK_HEADER_SIZE + HASH_CHUNK_SIZE); + + newChunk->maxlen = HASH_CHUNK_SIZE; + newChunk->used = size; + newChunk->ntuples = 1; + + newChunk->next.unshared = hashtable->chunks; + hashtable->chunks = newChunk; + + return HASH_CHUNK_DATA(newChunk); + } + + /* There is enough space in the current chunk, let's add the tuple */ + ptr = HASH_CHUNK_DATA(hashtable->chunks) + hashtable->chunks->used; + hashtable->chunks->used += size; + hashtable->chunks->ntuples += 1; + + /* return pointer to the start of the tuple memory */ + return ptr; +} + +/* + * Allocate space for a tuple in shared dense storage. This is equivalent to + * dense_alloc but for Parallel Hash using shared memory. + * + * While loading a tuple into shared memory, we might run out of memory and + * decide to repartition, or determine that the load factor is too high and + * decide to expand the bucket array, or discover that another participant has + * commanded us to help do that. Return NULL if number of buckets or batches + * has changed, indicating that the caller must retry (considering the + * possibility that the tuple no longer belongs in the same batch). 
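+ *
+ * For illustration, ExecParallelHashTableInsert() above handles this with
+ * its "retry:" loop: it recomputes bucketno and batchno from the tuple's
+ * hash value, calls this function, and starts over if NULL is returned,
+ * since the tuple may now belong to a different batch.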
+ */ +static HashJoinTuple +ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, + dsa_pointer *shared) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + dsa_pointer chunk_shared; + HashMemoryChunk chunk; + Size chunk_size; + HashJoinTuple result; + int curbatch = hashtable->curbatch; + + size = MAXALIGN(size); + + /* + * Fast path: if there is enough space in this backend's current chunk, + * then we can allocate without any locking. + */ + chunk = hashtable->current_chunk; + if (chunk != NULL && + size <= HASH_CHUNK_THRESHOLD && + chunk->maxlen - chunk->used >= size) + { + + chunk_shared = hashtable->current_chunk_shared; + Assert(chunk == dsa_get_address(hashtable->area, chunk_shared)); + *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE + chunk->used; + result = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + chunk->used); + chunk->used += size; + + Assert(chunk->used <= chunk->maxlen); + Assert(result == dsa_get_address(hashtable->area, *shared)); + + return result; + } + + /* Slow path: try to allocate a new chunk. */ + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + + /* + * Check if we need to help increase the number of buckets or batches. + */ + if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES || + pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + { + ParallelHashGrowth growth = pstate->growth; + + hashtable->current_chunk = NULL; + LWLockRelease(&pstate->lock); + + /* Another participant has commanded us to help grow. */ + if (growth == PHJ_GROWTH_NEED_MORE_BATCHES) + ExecParallelHashIncreaseNumBatches(hashtable); + else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + ExecParallelHashIncreaseNumBuckets(hashtable); + + /* The caller must retry. */ + return NULL; + } + + /* Oversized tuples get their own chunk. */ + if (size > HASH_CHUNK_THRESHOLD) + chunk_size = size + HASH_CHUNK_HEADER_SIZE; + else + chunk_size = HASH_CHUNK_SIZE; + + /* Check if it's time to grow batches or buckets. */ + if (pstate->growth != PHJ_GROWTH_DISABLED) + { + Assert(curbatch == 0); + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + + /* + * Check if our space limit would be exceeded. To avoid choking on + * very large tuples or very low hash_mem setting, we'll always allow + * each backend to allocate at least one chunk. + */ + if (hashtable->batches[0].at_least_one_chunk && + hashtable->batches[0].shared->size + + chunk_size > pstate->space_allowed) + { + pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; + hashtable->batches[0].shared->space_exhausted = true; + LWLockRelease(&pstate->lock); + + return NULL; + } + + /* Check if our load factor limit would be exceeded. */ + if (hashtable->nbatch == 1) + { + hashtable->batches[0].shared->ntuples += hashtable->batches[0].ntuples; + hashtable->batches[0].ntuples = 0; + /* Guard against integer overflow and alloc size overflow */ + if (hashtable->batches[0].shared->ntuples + 1 > + hashtable->nbuckets * NTUP_PER_BUCKET && + hashtable->nbuckets < (INT_MAX / 2) && + hashtable->nbuckets * 2 <= + MaxAllocSize / sizeof(dsa_pointer_atomic)) + { + pstate->growth = PHJ_GROWTH_NEED_MORE_BUCKETS; + LWLockRelease(&pstate->lock); + + return NULL; + } + } + } + + /* We are cleared to allocate a new chunk. */ + chunk_shared = dsa_allocate(hashtable->area, chunk_size); + hashtable->batches[curbatch].shared->size += chunk_size; + hashtable->batches[curbatch].at_least_one_chunk = true; + + /* Set up the chunk. 
*/ + chunk = (HashMemoryChunk) dsa_get_address(hashtable->area, chunk_shared); + *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE; + chunk->maxlen = chunk_size - HASH_CHUNK_HEADER_SIZE; + chunk->used = size; + + /* + * Push it onto the list of chunks, so that it can be found if we need to + * increase the number of buckets or batches (batch 0 only) and later for + * freeing the memory (all batches). + */ + chunk->next.shared = hashtable->batches[curbatch].shared->chunks; + hashtable->batches[curbatch].shared->chunks = chunk_shared; + + if (size <= HASH_CHUNK_THRESHOLD) + { + /* + * Make this the current chunk so that we can use the fast path to + * fill the rest of it up in future calls. + */ + hashtable->current_chunk = chunk; + hashtable->current_chunk_shared = chunk_shared; + } + LWLockRelease(&pstate->lock); + + Assert(HASH_CHUNK_DATA(chunk) == dsa_get_address(hashtable->area, *shared)); + result = (HashJoinTuple) HASH_CHUNK_DATA(chunk); + + return result; +} + +/* + * One backend needs to set up the shared batch state including tuplestores. + * Other backends will ensure they have correctly configured accessors by + * called ExecParallelHashEnsureBatchAccessors(). + */ +static void +ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + ParallelHashJoinBatch *batches; + MemoryContext oldcxt; + int i; + + Assert(hashtable->batches == NULL); + + /* Allocate space. */ + pstate->batches = + dsa_allocate0(hashtable->area, + EstimateParallelHashJoinBatch(hashtable) * nbatch); + pstate->nbatch = nbatch; + batches = dsa_get_address(hashtable->area, pstate->batches); + + /* Use hash join memory context. */ + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + /* Allocate this backend's accessor array. */ + hashtable->nbatch = nbatch; + hashtable->batches = (ParallelHashJoinBatchAccessor *) + palloc0(sizeof(ParallelHashJoinBatchAccessor) * hashtable->nbatch); + + /* Set up the shared state, tuplestores and backend-local accessors. */ + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i]; + ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i); + char name[MAXPGPATH]; + + /* + * All members of shared were zero-initialized. We just need to set + * up the Barrier. + */ + BarrierInit(&shared->batch_barrier, 0); + if (i == 0) + { + /* Batch 0 doesn't need to be loaded. */ + BarrierAttach(&shared->batch_barrier); + while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_PROBING) + BarrierArriveAndWait(&shared->batch_barrier, 0); + BarrierDetach(&shared->batch_barrier); + } + + /* Initialize accessor state. All members were zero-initialized. */ + accessor->shared = shared; + + /* Initialize the shared tuplestores. */ + snprintf(name, sizeof(name), "i%dof%d", i, hashtable->nbatch); + accessor->inner_tuples = + sts_initialize(ParallelHashJoinBatchInner(shared), + pstate->nparticipants, + ParallelWorkerNumber + 1, + sizeof(uint32), + SHARED_TUPLESTORE_SINGLE_PASS, + &pstate->fileset, + name); + snprintf(name, sizeof(name), "o%dof%d", i, hashtable->nbatch); + accessor->outer_tuples = + sts_initialize(ParallelHashJoinBatchOuter(shared, + pstate->nparticipants), + pstate->nparticipants, + ParallelWorkerNumber + 1, + sizeof(uint32), + SHARED_TUPLESTORE_SINGLE_PASS, + &pstate->fileset, + name); + } + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Free the current set of ParallelHashJoinBatchAccessor objects. 
+ */ +static void +ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable) +{ + int i; + + for (i = 0; i < hashtable->nbatch; ++i) + { + /* Make sure no files are left open. */ + sts_end_write(hashtable->batches[i].inner_tuples); + sts_end_write(hashtable->batches[i].outer_tuples); + sts_end_parallel_scan(hashtable->batches[i].inner_tuples); + sts_end_parallel_scan(hashtable->batches[i].outer_tuples); + } + pfree(hashtable->batches); + hashtable->batches = NULL; +} + +/* + * Make sure this backend has up-to-date accessors for the current set of + * batches. + */ +static void +ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + ParallelHashJoinBatch *batches; + MemoryContext oldcxt; + int i; + + if (hashtable->batches != NULL) + { + if (hashtable->nbatch == pstate->nbatch) + return; + ExecParallelHashCloseBatchAccessors(hashtable); + } + + /* + * It's possible for a backend to start up very late so that the whole + * join is finished and the shm state for tracking batches has already + * been freed by ExecHashTableDetach(). In that case we'll just leave + * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives + * up early. + */ + if (!DsaPointerIsValid(pstate->batches)) + return; + + /* Use hash join memory context. */ + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + /* Allocate this backend's accessor array. */ + hashtable->nbatch = pstate->nbatch; + hashtable->batches = (ParallelHashJoinBatchAccessor *) + palloc0(sizeof(ParallelHashJoinBatchAccessor) * hashtable->nbatch); + + /* Find the base of the pseudo-array of ParallelHashJoinBatch objects. */ + batches = (ParallelHashJoinBatch *) + dsa_get_address(hashtable->area, pstate->batches); + + /* Set up the accessor array and attach to the tuplestores. */ + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i]; + ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i); + + accessor->shared = shared; + accessor->preallocated = 0; + accessor->done = false; + accessor->inner_tuples = + sts_attach(ParallelHashJoinBatchInner(shared), + ParallelWorkerNumber + 1, + &pstate->fileset); + accessor->outer_tuples = + sts_attach(ParallelHashJoinBatchOuter(shared, + pstate->nparticipants), + ParallelWorkerNumber + 1, + &pstate->fileset); + } + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Allocate an empty shared memory hash table for a given batch. + */ +void +ExecParallelHashTableAlloc(HashJoinTable hashtable, int batchno) +{ + ParallelHashJoinBatch *batch = hashtable->batches[batchno].shared; + dsa_pointer_atomic *buckets; + int nbuckets = hashtable->parallel_state->nbuckets; + int i; + + batch->buckets = + dsa_allocate(hashtable->area, sizeof(dsa_pointer_atomic) * nbuckets); + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, batch->buckets); + for (i = 0; i < nbuckets; ++i) + dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer); +} + +/* + * If we are currently attached to a shared hash join batch, detach. If we + * are last to detach, clean up. + */ +void +ExecHashTableDetachBatch(HashJoinTable hashtable) +{ + if (hashtable->parallel_state != NULL && + hashtable->curbatch >= 0) + { + int curbatch = hashtable->curbatch; + ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared; + + /* Make sure any temporary files are closed. 
*/ + sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples); + sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples); + + /* Detach from the batch we were last working on. */ + if (BarrierArriveAndDetach(&batch->batch_barrier)) + { + /* + * Technically we shouldn't access the barrier because we're no + * longer attached, but since there is no way it's moving after + * this point it seems safe to make the following assertion. + */ + Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_DONE); + + /* Free shared chunks and buckets. */ + while (DsaPointerIsValid(batch->chunks)) + { + HashMemoryChunk chunk = + dsa_get_address(hashtable->area, batch->chunks); + dsa_pointer next = chunk->next.shared; + + dsa_free(hashtable->area, batch->chunks); + batch->chunks = next; + } + if (DsaPointerIsValid(batch->buckets)) + { + dsa_free(hashtable->area, batch->buckets); + batch->buckets = InvalidDsaPointer; + } + } + + /* + * Track the largest batch we've been attached to. Though each + * backend might see a different subset of batches, explain.c will + * scan the results from all backends to find the largest value. + */ + hashtable->spacePeak = + Max(hashtable->spacePeak, + batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets); + + /* Remember that we are not attached to a batch. */ + hashtable->curbatch = -1; + } +} + +/* + * Detach from all shared resources. If we are last to detach, clean up. + */ +void +ExecHashTableDetach(HashJoinTable hashtable) +{ + if (hashtable->parallel_state) + { + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + + /* Make sure any temporary files are closed. */ + if (hashtable->batches) + { + for (i = 0; i < hashtable->nbatch; ++i) + { + sts_end_write(hashtable->batches[i].inner_tuples); + sts_end_write(hashtable->batches[i].outer_tuples); + sts_end_parallel_scan(hashtable->batches[i].inner_tuples); + sts_end_parallel_scan(hashtable->batches[i].outer_tuples); + } + } + + /* If we're last to detach, clean up shared memory. */ + if (BarrierDetach(&pstate->build_barrier)) + { + if (DsaPointerIsValid(pstate->batches)) + { + dsa_free(hashtable->area, pstate->batches); + pstate->batches = InvalidDsaPointer; + } + } + + hashtable->parallel_state = NULL; + } +} + +/* + * Get the first tuple in a given bucket identified by number. + */ +static inline HashJoinTuple +ExecParallelHashFirstTuple(HashJoinTable hashtable, int bucketno) +{ + HashJoinTuple tuple; + dsa_pointer p; + + Assert(hashtable->parallel_state); + p = dsa_pointer_atomic_read(&hashtable->buckets.shared[bucketno]); + tuple = (HashJoinTuple) dsa_get_address(hashtable->area, p); + + return tuple; +} + +/* + * Get the next tuple in the same bucket as 'tuple'. + */ +static inline HashJoinTuple +ExecParallelHashNextTuple(HashJoinTable hashtable, HashJoinTuple tuple) +{ + HashJoinTuple next; + + Assert(hashtable->parallel_state); + next = (HashJoinTuple) dsa_get_address(hashtable->area, tuple->next.shared); + + return next; +} + +/* + * Insert a tuple at the front of a chain of tuples in DSA memory atomically. + */ +static inline void +ExecParallelHashPushTuple(dsa_pointer_atomic *head, + HashJoinTuple tuple, + dsa_pointer tuple_shared) +{ + for (;;) + { + tuple->next.shared = dsa_pointer_atomic_read(head); + if (dsa_pointer_atomic_compare_exchange(head, + &tuple->next.shared, + tuple_shared)) + break; + } +} + +/* + * Prepare to work on a given batch. 
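/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * retry-until-compare-exchange-succeeds insertion that
 * ExecParallelHashPushTuple() above performs on a bucket head, shown here
 * with C11 <stdatomic.h> and ordinary pointers instead of
 * dsa_pointer_atomic_* operations and DSA offsets.  The node type and the
 * values are made up for the example.
 */
#include <stdatomic.h>
#include <stdio.h>

struct node
{
	int			value;
	struct node *next;
};

/*
 * Push a node onto the front of a lock-free list.  If another thread changes
 * the head between our read and our compare-and-swap, the CAS fails, 'old'
 * is updated to the new head, and we retry with a fixed-up next pointer.
 */
static void
push(_Atomic(struct node *) *head, struct node *n)
{
	struct node *old = atomic_load(head);

	do
		n->next = old;
	while (!atomic_compare_exchange_weak(head, &old, n));
}

int
main(void)
{
	static _Atomic(struct node *) head;
	static struct node a = {1, NULL};
	static struct node b = {2, NULL};

	push(&head, &a);
	push(&head, &b);
	for (struct node *p = atomic_load(&head); p != NULL; p = p->next)
		printf("%d\n", p->value);	/* prints 2 then 1 */
	return 0;
}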
+ */ +void +ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno) +{ + Assert(hashtable->batches[batchno].shared->buckets != InvalidDsaPointer); + + hashtable->curbatch = batchno; + hashtable->buckets.shared = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, + hashtable->batches[batchno].shared->buckets); + hashtable->nbuckets = hashtable->parallel_state->nbuckets; + hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->current_chunk = NULL; + hashtable->current_chunk_shared = InvalidDsaPointer; + hashtable->batches[batchno].at_least_one_chunk = false; +} + +/* + * Take the next available chunk from the queue of chunks being worked on in + * parallel. Return NULL if there are none left. Otherwise return a pointer + * to the chunk, and set *shared to the DSA pointer to the chunk. + */ +static HashMemoryChunk +ExecParallelHashPopChunkQueue(HashJoinTable hashtable, dsa_pointer *shared) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + HashMemoryChunk chunk; + + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + if (DsaPointerIsValid(pstate->chunk_work_queue)) + { + *shared = pstate->chunk_work_queue; + chunk = (HashMemoryChunk) + dsa_get_address(hashtable->area, *shared); + pstate->chunk_work_queue = chunk->next.shared; + } + else + chunk = NULL; + LWLockRelease(&pstate->lock); + + return chunk; +} + +/* + * Increase the space preallocated in this backend for a given inner batch by + * at least a given amount. This allows us to track whether a given batch + * would fit in memory when loaded back in. Also increase the number of + * batches or buckets if required. + * + * This maintains a running estimation of how much space will be taken when we + * load the batch back into memory by simulating the way chunks will be handed + * out to workers. It's not perfectly accurate because the tuples will be + * packed into memory chunks differently by ExecParallelHashTupleAlloc(), but + * it should be pretty close. It tends to overestimate by a fraction of a + * chunk per worker since all workers gang up to preallocate during hashing, + * but workers tend to reload batches alone if there are enough to go around, + * leaving fewer partially filled chunks. This effect is bounded by + * nparticipants. + * + * Return false if the number of batches or buckets has changed, and the + * caller should reconsider which batch a given tuple now belongs in and call + * again. + */ +static bool +ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + ParallelHashJoinBatchAccessor *batch = &hashtable->batches[batchno]; + size_t want = Max(size, HASH_CHUNK_SIZE - HASH_CHUNK_HEADER_SIZE); + + Assert(batchno > 0); + Assert(batchno < hashtable->nbatch); + Assert(size == MAXALIGN(size)); + + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + + /* Has another participant commanded us to help grow? 
*/ + if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES || + pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + { + ParallelHashGrowth growth = pstate->growth; + + LWLockRelease(&pstate->lock); + if (growth == PHJ_GROWTH_NEED_MORE_BATCHES) + ExecParallelHashIncreaseNumBatches(hashtable); + else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + ExecParallelHashIncreaseNumBuckets(hashtable); + + return false; + } + + if (pstate->growth != PHJ_GROWTH_DISABLED && + batch->at_least_one_chunk && + (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE + > pstate->space_allowed)) + { + /* + * We have determined that this batch would exceed the space budget if + * loaded into memory. Command all participants to help repartition. + */ + batch->shared->space_exhausted = true; + pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; + LWLockRelease(&pstate->lock); + + return false; + } + + batch->at_least_one_chunk = true; + batch->shared->estimated_size += want + HASH_CHUNK_HEADER_SIZE; + batch->preallocated = want; + LWLockRelease(&pstate->lock); + + return true; +} + +/* + * Calculate the limit on how much memory can be used by Hash and similar + * plan types. This is work_mem times hash_mem_multiplier, and is + * expressed in bytes. + * + * Exported for use by the planner, as well as other hash-like executor + * nodes. This is a rather random place for this, but there is no better + * place. + */ +size_t +get_hash_memory_limit(void) +{ + double mem_limit; + + /* Do initial calculation in double arithmetic */ + mem_limit = (double) work_mem * hash_mem_multiplier * 1024.0; + + /* Clamp in case it doesn't fit in size_t */ + mem_limit = Min(mem_limit, (double) SIZE_MAX); + + return (size_t) mem_limit; +} + +/* + * Convert the hash memory limit to an integer number of kilobytes, + * that is something comparable to work_mem. Like work_mem, we clamp + * the result to ensure that multiplying it by 1024 fits in a long int. + * + * This is deprecated since it may understate the actual memory limit. + * It is unused in core and will eventually be removed. + */ +int +get_hash_mem(void) +{ + size_t mem_limit = get_hash_memory_limit(); + + /* Remove the kilobyte factor */ + mem_limit /= 1024; + + /* Clamp to MAX_KILOBYTES, like work_mem */ + mem_limit = Min(mem_limit, (size_t) MAX_KILOBYTES); + + return (int) mem_limit; +} diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c new file mode 100644 index 0000000..510bdd3 --- /dev/null +++ b/src/backend/executor/nodeHashjoin.c @@ -0,0 +1,1551 @@ +/*------------------------------------------------------------------------- + * + * nodeHashjoin.c + * Routines to handle hash join nodes + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeHashjoin.c + * + * PARALLELISM + * + * Hash joins can participate in parallel query execution in several ways. A + * parallel-oblivious hash join is one where the node is unaware that it is + * part of a parallel plan. In this case, a copy of the inner plan is used to + * build a copy of the hash table in every backend, and the outer plan could + * either be built from a partial or complete path, so that the results of the + * hash join are correspondingly either partial or complete. A parallel-aware + * hash join is one that behaves differently, coordinating work between + * backends, and appears as Parallel Hash Join in EXPLAIN output. 
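/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * arithmetic performed by get_hash_memory_limit() and get_hash_mem() above.
 * work_mem is a kilobyte setting, so the byte limit is
 * work_mem * hash_mem_multiplier * 1024, computed in double arithmetic and
 * clamped to SIZE_MAX; converting back to kilobytes clamps again so the
 * result stays comparable to work_mem (MAX_KILOBYTES is approximated here
 * as INT_MAX / 1024).  The sample settings below are arbitrary example
 * values, not server defaults.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static size_t
hash_memory_limit_bytes(int work_mem_kb, double hash_mem_multiplier)
{
	double		mem_limit = (double) work_mem_kb * hash_mem_multiplier * 1024.0;

	/* Clamp in case it doesn't fit in size_t */
	if (mem_limit > (double) SIZE_MAX)
		mem_limit = (double) SIZE_MAX;
	return (size_t) mem_limit;
}

int
main(void)
{
	size_t		bytes = hash_memory_limit_bytes(4096, 2.0);	/* 4MB x 2.0 */
	size_t		kilobytes = bytes / 1024;

	/* Clamp the kilobyte figure, as the deprecated get_hash_mem() does */
	if (kilobytes > (size_t) (INT_MAX / 1024))
		kilobytes = (size_t) (INT_MAX / 1024);

	printf("%zu bytes (%zu kB)\n", bytes, kilobytes);	/* 8388608 bytes, 8192 kB */
	return 0;
}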
A Parallel + * Hash Join always appears with a Parallel Hash node. + * + * Parallel-aware hash joins use the same per-backend state machine to track + * progress through the hash join algorithm as parallel-oblivious hash joins. + * In a parallel-aware hash join, there is also a shared state machine that + * co-operating backends use to synchronize their local state machines and + * program counters. The shared state machine is managed with a Barrier IPC + * primitive. When all attached participants arrive at a barrier, the phase + * advances and all waiting participants are released. + * + * When a participant begins working on a parallel hash join, it must first + * figure out how much progress has already been made, because participants + * don't wait for each other to begin. For this reason there are switch + * statements at key points in the code where we have to synchronize our local + * state machine with the phase, and then jump to the correct part of the + * algorithm so that we can get started. + * + * One barrier called build_barrier is used to coordinate the hashing phases. + * The phase is represented by an integer which begins at zero and increments + * one by one, but in the code it is referred to by symbolic names as follows: + * + * PHJ_BUILD_ELECTING -- initial state + * PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0 + * PHJ_BUILD_HASHING_INNER -- all hash the inner rel + * PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer + * PHJ_BUILD_DONE -- building done, probing can begin + * + * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may + * be used repeatedly as required to coordinate expansions in the number of + * batches or buckets. Their phases are as follows: + * + * PHJ_GROW_BATCHES_ELECTING -- initial state + * PHJ_GROW_BATCHES_ALLOCATING -- one allocates new batches + * PHJ_GROW_BATCHES_REPARTITIONING -- all repartition + * PHJ_GROW_BATCHES_FINISHING -- one cleans up, detects skew + * + * PHJ_GROW_BUCKETS_ELECTING -- initial state + * PHJ_GROW_BUCKETS_ALLOCATING -- one allocates new buckets + * PHJ_GROW_BUCKETS_REINSERTING -- all insert tuples + * + * If the planner got the number of batches and buckets right, those won't be + * necessary, but on the other hand we might finish up needing to expand the + * buckets or batches multiple times while hashing the inner relation to stay + * within our memory budget and load factor target. For that reason it's a + * separate pair of barriers using circular phases. + * + * The PHJ_BUILD_HASHING_OUTER phase is required only for multi-batch joins, + * because we need to divide the outer relation into batches up front in order + * to be able to process batches entirely independently. In contrast, the + * parallel-oblivious algorithm simply throws tuples 'forward' to 'later' + * batches whenever it encounters them while scanning and probing, which it + * can do because it processes batches in serial order. + * + * Once PHJ_BUILD_DONE is reached, backends then split up and process + * different batches, or gang up and work together on probing batches if there + * aren't enough to go around. 
For each batch there is a separate barrier + * with the following phases: + * + * PHJ_BATCH_ELECTING -- initial state + * PHJ_BATCH_ALLOCATING -- one allocates buckets + * PHJ_BATCH_LOADING -- all load the hash table from disk + * PHJ_BATCH_PROBING -- all probe + * PHJ_BATCH_DONE -- end + * + * Batch 0 is a special case, because it starts out in phase + * PHJ_BATCH_PROBING; populating batch 0's hash table is done during + * PHJ_BUILD_HASHING_INNER so we can skip loading. + * + * Initially we try to plan for a single-batch hash join using the combined + * hash_mem of all participants to create a large shared hash table. If that + * turns out either at planning or execution time to be impossible then we + * fall back to regular hash_mem sized hash tables. + * + * To avoid deadlocks, we never wait for any barrier unless it is known that + * all other backends attached to it are actively executing the node or have + * already arrived. Practically, that means that we never return a tuple + * while attached to a barrier, unless the barrier has reached its final + * state. In the slightly special case of the per-batch barrier, we return + * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use + * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/parallel.h" +#include "executor/executor.h" +#include "executor/hashjoin.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/sharedtuplestore.h" + + +/* + * States of the ExecHashJoin state machine + */ +#define HJ_BUILD_HASHTABLE 1 +#define HJ_NEED_NEW_OUTER 2 +#define HJ_SCAN_BUCKET 3 +#define HJ_FILL_OUTER_TUPLE 4 +#define HJ_FILL_INNER_TUPLES 5 +#define HJ_NEED_NEW_BATCH 6 + +/* Returns true if doing null-fill on outer relation */ +#define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL) +/* Returns true if doing null-fill on inner relation */ +#define HJ_FILL_INNER(hjstate) ((hjstate)->hj_NullOuterTupleSlot != NULL) + +static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue); +static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue); +static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, + BufFile *file, + uint32 *hashvalue, + TupleTableSlot *tupleSlot); +static bool ExecHashJoinNewBatch(HashJoinState *hjstate); +static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate); +static void ExecParallelHashJoinPartitionOuter(HashJoinState *node); + + +/* ---------------------------------------------------------------- + * ExecHashJoinImpl + * + * This function implements the Hybrid Hashjoin algorithm. It is marked + * with an always-inline attribute so that ExecHashJoin() and + * ExecParallelHashJoin() can inline it. Compilers that respect the + * attribute should create versions specialized for parallel == true and + * parallel == false with unnecessary branches removed. + * + * Note: the relation we build hash table on is the "inner" + * the other one is "outer". 
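/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * "synchronize your local state machine with the shared phase, then jump in"
 * pattern described above.  A participant looks at whatever phase has
 * already been reached and falls through the remaining steps of a switch,
 * as the real code does with BarrierAttach() in
 * ExecParallelHashJoinNewBatch().  The phase names, the work functions and
 * the absence of any real barrier or second process are all simplifications
 * for the example.
 */
#include <stdio.h>

enum batch_phase
{
	PHASE_ELECT = 0,			/* nothing set up yet */
	PHASE_LOAD,					/* buckets exist, tuples still being loaded */
	PHASE_PROBE					/* hash table complete, probing under way */
};

static void allocate_buckets(void) { puts("allocate buckets"); }
static void load_hash_table(void)  { puts("load hash table"); }
static void probe(void)            { puts("probe"); }

/* Perform only the work that is still outstanding for the observed phase. */
static void
join_batch_at(enum batch_phase phase)
{
	switch (phase)
	{
		case PHASE_ELECT:
			/* in the real code one elected backend does this while the
			 * others wait at the barrier */
			allocate_buckets();
			/* FALL THRU */
		case PHASE_LOAD:
			load_hash_table();
			/* FALL THRU */
		case PHASE_PROBE:
			probe();
			break;
	}
}

int
main(void)
{
	join_batch_at(PHASE_LOAD);	/* a late arrival skips election/allocation */
	return 0;
}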
+ * ---------------------------------------------------------------- + */ +static pg_attribute_always_inline TupleTableSlot * +ExecHashJoinImpl(PlanState *pstate, bool parallel) +{ + HashJoinState *node = castNode(HashJoinState, pstate); + PlanState *outerNode; + HashState *hashNode; + ExprState *joinqual; + ExprState *otherqual; + ExprContext *econtext; + HashJoinTable hashtable; + TupleTableSlot *outerTupleSlot; + uint32 hashvalue; + int batchno; + ParallelHashJoinState *parallel_state; + + /* + * get information from HashJoin node + */ + joinqual = node->js.joinqual; + otherqual = node->js.ps.qual; + hashNode = (HashState *) innerPlanState(node); + outerNode = outerPlanState(node); + hashtable = node->hj_HashTable; + econtext = node->js.ps.ps_ExprContext; + parallel_state = hashNode->parallel_state; + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * run the hash join state machine + */ + for (;;) + { + /* + * It's possible to iterate this loop many times before returning a + * tuple, in some pathological cases such as needing to move much of + * the current batch to a later batch. So let's check for interrupts + * each time through. + */ + CHECK_FOR_INTERRUPTS(); + + switch (node->hj_JoinState) + { + case HJ_BUILD_HASHTABLE: + + /* + * First time through: build hash table for inner relation. + */ + Assert(hashtable == NULL); + + /* + * If the outer relation is completely empty, and it's not + * right/full join, we can quit without building the hash + * table. However, for an inner join it is only a win to + * check this when the outer relation's startup cost is less + * than the projected cost of building the hash table. + * Otherwise it's best to build the hash table first and see + * if the inner relation is empty. (When it's a left join, we + * should always make this check, since we aren't going to be + * able to skip the join on the strength of an empty inner + * relation anyway.) + * + * If we are rescanning the join, we make use of information + * gained on the previous scan: don't bother to try the + * prefetch if the previous scan found the outer relation + * nonempty. This is not 100% reliable since with new + * parameters the outer relation might yield different + * results, but it's a good heuristic. + * + * The only way to make the check is to try to fetch a tuple + * from the outer plan node. If we succeed, we have to stash + * it away for later consumption by ExecHashJoinOuterGetTuple. + */ + if (HJ_FILL_INNER(node)) + { + /* no chance to not build the hash table */ + node->hj_FirstOuterTupleSlot = NULL; + } + else if (parallel) + { + /* + * The empty-outer optimization is not implemented for + * shared hash tables, because no one participant can + * determine that there are no outer tuples, and it's not + * yet clear that it's worth the synchronization overhead + * of reaching consensus to figure that out. So we have + * to build the hash table. + */ + node->hj_FirstOuterTupleSlot = NULL; + } + else if (HJ_FILL_OUTER(node) || + (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost && + !node->hj_OuterNotEmpty)) + { + node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode); + if (TupIsNull(node->hj_FirstOuterTupleSlot)) + { + node->hj_OuterNotEmpty = false; + return NULL; + } + else + node->hj_OuterNotEmpty = true; + } + else + node->hj_FirstOuterTupleSlot = NULL; + + /* + * Create the hash table. 
If using Parallel Hash, then + * whoever gets here first will create the hash table and any + * later arrivals will merely attach to it. + */ + hashtable = ExecHashTableCreate(hashNode, + node->hj_HashOperators, + node->hj_Collations, + HJ_FILL_INNER(node)); + node->hj_HashTable = hashtable; + + /* + * Execute the Hash node, to build the hash table. If using + * Parallel Hash, then we'll try to help hashing unless we + * arrived too late. + */ + hashNode->hashtable = hashtable; + (void) MultiExecProcNode((PlanState *) hashNode); + + /* + * If the inner relation is completely empty, and we're not + * doing a left outer join, we can quit without scanning the + * outer relation. + */ + if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node)) + return NULL; + + /* + * need to remember whether nbatch has increased since we + * began scanning the outer relation + */ + hashtable->nbatch_outstart = hashtable->nbatch; + + /* + * Reset OuterNotEmpty for scan. (It's OK if we fetched a + * tuple above, because ExecHashJoinOuterGetTuple will + * immediately set it again.) + */ + node->hj_OuterNotEmpty = false; + + if (parallel) + { + Barrier *build_barrier; + + build_barrier = ¶llel_state->build_barrier; + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || + BarrierPhase(build_barrier) == PHJ_BUILD_DONE); + if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER) + { + /* + * If multi-batch, we need to hash the outer relation + * up front. + */ + if (hashtable->nbatch > 1) + ExecParallelHashJoinPartitionOuter(node); + BarrierArriveAndWait(build_barrier, + WAIT_EVENT_HASH_BUILD_HASH_OUTER); + } + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE); + + /* Each backend should now select a batch to work on. */ + hashtable->curbatch = -1; + node->hj_JoinState = HJ_NEED_NEW_BATCH; + + continue; + } + else + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + /* FALL THRU */ + + case HJ_NEED_NEW_OUTER: + + /* + * We don't have an outer tuple, try to get the next one + */ + if (parallel) + outerTupleSlot = + ExecParallelHashJoinOuterGetTuple(outerNode, node, + &hashvalue); + else + outerTupleSlot = + ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue); + + if (TupIsNull(outerTupleSlot)) + { + /* end of batch, or maybe whole join */ + if (HJ_FILL_INNER(node)) + { + /* set up to scan for unmatched inner tuples */ + ExecPrepHashTableForUnmatched(node); + node->hj_JoinState = HJ_FILL_INNER_TUPLES; + } + else + node->hj_JoinState = HJ_NEED_NEW_BATCH; + continue; + } + + econtext->ecxt_outertuple = outerTupleSlot; + node->hj_MatchedOuter = false; + + /* + * Find the corresponding bucket for this tuple in the main + * hash table or skew hash table. + */ + node->hj_CurHashValue = hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &node->hj_CurBucketNo, &batchno); + node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, + hashvalue); + node->hj_CurTuple = NULL; + + /* + * The tuple might not belong to the current batch (where + * "current batch" includes the skew buckets if any). + */ + if (batchno != hashtable->curbatch && + node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) + { + bool shouldFree; + MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, + &shouldFree); + + /* + * Need to postpone this outer tuple to a later batch. + * Save it in the corresponding outer-batch file. 
+ */ + Assert(parallel_state == NULL); + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(mintuple, hashvalue, + &hashtable->outerBatchFile[batchno]); + + if (shouldFree) + heap_free_minimal_tuple(mintuple); + + /* Loop around, staying in HJ_NEED_NEW_OUTER state */ + continue; + } + + /* OK, let's scan the bucket for matches */ + node->hj_JoinState = HJ_SCAN_BUCKET; + + /* FALL THRU */ + + case HJ_SCAN_BUCKET: + + /* + * Scan the selected hash bucket for matches to current outer + */ + if (parallel) + { + if (!ExecParallelScanHashBucket(node, econtext)) + { + /* out of matches; check for possible outer-join fill */ + node->hj_JoinState = HJ_FILL_OUTER_TUPLE; + continue; + } + } + else + { + if (!ExecScanHashBucket(node, econtext)) + { + /* out of matches; check for possible outer-join fill */ + node->hj_JoinState = HJ_FILL_OUTER_TUPLE; + continue; + } + } + + /* + * We've got a match, but still need to test non-hashed quals. + * ExecScanHashBucket already set up all the state needed to + * call ExecQual. + * + * If we pass the qual, then save state for next call and have + * ExecProject form the projection, store it in the tuple + * table, and return the slot. + * + * Only the joinquals determine tuple match status, but all + * quals must pass to actually return the tuple. + */ + if (joinqual == NULL || ExecQual(joinqual, econtext)) + { + node->hj_MatchedOuter = true; + + if (parallel) + { + /* + * Full/right outer joins are currently not supported + * for parallel joins, so we don't need to set the + * match bit. Experiments show that it's worth + * avoiding the shared memory traffic on large + * systems. + */ + Assert(!HJ_FILL_INNER(node)); + } + else + { + /* + * This is really only needed if HJ_FILL_INNER(node), + * but we'll avoid the branch and just set it always. + */ + HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); + } + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->hj_JoinState = HJ_NEED_NEW_OUTER; + continue; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + break; + + case HJ_FILL_OUTER_TUPLE: + + /* + * The current outer tuple has run out of matches, so check + * whether to emit a dummy outer-join tuple. Whether we emit + * one or not, the next state is NEED_NEW_OUTER. + */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + if (!node->hj_MatchedOuter && + HJ_FILL_OUTER(node)) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + else + InstrCountFiltered2(node, 1); + } + break; + + case HJ_FILL_INNER_TUPLES: + + /* + * We have finished a batch, but we are doing right/full join, + * so any unmatched inner tuples in the hashtable have to be + * emitted before we continue to the next batch. 
+ */ + if (!ExecScanHashTableForUnmatched(node, econtext)) + { + /* no more unmatched tuples */ + node->hj_JoinState = HJ_NEED_NEW_BATCH; + continue; + } + + /* + * Generate a fake join tuple with nulls for the outer tuple, + * and return it if it passes the non-join quals. + */ + econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + else + InstrCountFiltered2(node, 1); + break; + + case HJ_NEED_NEW_BATCH: + + /* + * Try to advance to next batch. Done if there are no more. + */ + if (parallel) + { + if (!ExecParallelHashJoinNewBatch(node)) + return NULL; /* end of parallel-aware join */ + } + else + { + if (!ExecHashJoinNewBatch(node)) + return NULL; /* end of parallel-oblivious join */ + } + node->hj_JoinState = HJ_NEED_NEW_OUTER; + break; + + default: + elog(ERROR, "unrecognized hashjoin state: %d", + (int) node->hj_JoinState); + } + } +} + +/* ---------------------------------------------------------------- + * ExecHashJoin + * + * Parallel-oblivious version. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecHashJoin(PlanState *pstate) +{ + /* + * On sufficiently smart compilers this should be inlined with the + * parallel-aware branches removed. + */ + return ExecHashJoinImpl(pstate, false); +} + +/* ---------------------------------------------------------------- + * ExecParallelHashJoin + * + * Parallel-aware version. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecParallelHashJoin(PlanState *pstate) +{ + /* + * On sufficiently smart compilers this should be inlined with the + * parallel-oblivious branches removed. + */ + return ExecHashJoinImpl(pstate, true); +} + +/* ---------------------------------------------------------------- + * ExecInitHashJoin + * + * Init routine for HashJoin node. + * ---------------------------------------------------------------- + */ +HashJoinState * +ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) +{ + HashJoinState *hjstate; + Plan *outerNode; + Hash *hashNode; + TupleDesc outerDesc, + innerDesc; + const TupleTableSlotOps *ops; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + hjstate = makeNode(HashJoinState); + hjstate->js.ps.plan = (Plan *) node; + hjstate->js.ps.state = estate; + + /* + * See ExecHashJoinInitializeDSM() and ExecHashJoinInitializeWorker() + * where this function may be replaced with a parallel version, if we + * managed to launch a parallel query. + */ + hjstate->js.ps.ExecProcNode = ExecHashJoin; + hjstate->js.jointype = node->join.jointype; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &hjstate->js.ps); + + /* + * initialize child nodes + * + * Note: we could suppress the REWIND flag for the inner input, which + * would amount to betting that the hash will be a single batch. Not + * clear if this would be a win or not. 
+ */ + outerNode = outerPlan(node); + hashNode = (Hash *) innerPlan(node); + + outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); + outerDesc = ExecGetResultType(outerPlanState(hjstate)); + innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags); + innerDesc = ExecGetResultType(innerPlanState(hjstate)); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&hjstate->js.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&hjstate->js.ps, NULL); + + /* + * tuple table initialization + */ + ops = ExecGetResultSlotOps(outerPlanState(hjstate), NULL); + hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate, outerDesc, + ops); + + /* + * detect whether we need only consider the first matching inner tuple + */ + hjstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + break; + case JOIN_LEFT: + case JOIN_ANTI: + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + break; + case JOIN_RIGHT: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + break; + case JOIN_FULL: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * now for some voodoo. our temporary tuple slot is actually the result + * tuple slot of the Hash node (which is our inner plan). we can do this + * because Hash nodes don't return tuples via ExecProcNode() -- instead + * the hash join node uses ExecScanHashBucket() to get at the contents of + * the hash table. 
-cim 6/9/91 + */ + { + HashState *hashstate = (HashState *) innerPlanState(hjstate); + TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; + + hjstate->hj_HashTupleSlot = slot; + } + + /* + * initialize child expressions + */ + hjstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) hjstate); + hjstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) hjstate); + hjstate->hashclauses = + ExecInitQual(node->hashclauses, (PlanState *) hjstate); + + /* + * initialize hash-specific info + */ + hjstate->hj_HashTable = NULL; + hjstate->hj_FirstOuterTupleSlot = NULL; + + hjstate->hj_CurHashValue = 0; + hjstate->hj_CurBucketNo = 0; + hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; + hjstate->hj_CurTuple = NULL; + + hjstate->hj_OuterHashKeys = ExecInitExprList(node->hashkeys, + (PlanState *) hjstate); + hjstate->hj_HashOperators = node->hashoperators; + hjstate->hj_Collations = node->hashcollations; + + hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; + hjstate->hj_MatchedOuter = false; + hjstate->hj_OuterNotEmpty = false; + + return hjstate; +} + +/* ---------------------------------------------------------------- + * ExecEndHashJoin + * + * clean up routine for HashJoin node + * ---------------------------------------------------------------- + */ +void +ExecEndHashJoin(HashJoinState *node) +{ + /* + * Free hash table + */ + if (node->hj_HashTable) + { + ExecHashTableDestroy(node->hj_HashTable); + node->hj_HashTable = NULL; + } + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->js.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->js.ps.ps_ResultTupleSlot); + ExecClearTuple(node->hj_OuterTupleSlot); + ExecClearTuple(node->hj_HashTupleSlot); + + /* + * clean up subtrees + */ + ExecEndNode(outerPlanState(node)); + ExecEndNode(innerPlanState(node)); +} + +/* + * ExecHashJoinOuterGetTuple + * + * get the next outer tuple for a parallel oblivious hashjoin: either by + * executing the outer plan node in the first pass, or from the temp + * files for the hashjoin batches. + * + * Returns a null slot if no more outer tuples (within the current batch). + * + * On success, the tuple's hash value is stored at *hashvalue --- this is + * either originally computed, or re-read from the temp file. + */ +static TupleTableSlot * +ExecHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int curbatch = hashtable->curbatch; + TupleTableSlot *slot; + + if (curbatch == 0) /* if it is the first pass */ + { + /* + * Check to see if first outer tuple was already fetched by + * ExecHashJoin() and not used yet. + */ + slot = hjstate->hj_FirstOuterTupleSlot; + if (!TupIsNull(slot)) + hjstate->hj_FirstOuterTupleSlot = NULL; + else + slot = ExecProcNode(outerNode); + + while (!TupIsNull(slot)) + { + /* + * We have to compute the tuple's hash value. + */ + ExprContext *econtext = hjstate->js.ps.ps_ExprContext; + + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, + hjstate->hj_OuterHashKeys, + true, /* outer tuple */ + HJ_FILL_OUTER(hjstate), + hashvalue)) + { + /* remember outer relation is not empty for possible rescan */ + hjstate->hj_OuterNotEmpty = true; + + return slot; + } + + /* + * That tuple couldn't match because of a NULL, so discard it and + * continue with the next one. 
+ */ + slot = ExecProcNode(outerNode); + } + } + else if (curbatch < hashtable->nbatch) + { + BufFile *file = hashtable->outerBatchFile[curbatch]; + + /* + * In outer-join cases, we could get here even though the batch file + * is empty. + */ + if (file == NULL) + return NULL; + + slot = ExecHashJoinGetSavedTuple(hjstate, + file, + hashvalue, + hjstate->hj_OuterTupleSlot); + if (!TupIsNull(slot)) + return slot; + } + + /* End of this batch */ + return NULL; +} + +/* + * ExecHashJoinOuterGetTuple variant for the parallel case. + */ +static TupleTableSlot * +ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int curbatch = hashtable->curbatch; + TupleTableSlot *slot; + + /* + * In the Parallel Hash case we only run the outer plan directly for + * single-batch hash joins. Otherwise we have to go to batch files, even + * for batch 0. + */ + if (curbatch == 0 && hashtable->nbatch == 1) + { + slot = ExecProcNode(outerNode); + + while (!TupIsNull(slot)) + { + ExprContext *econtext = hjstate->js.ps.ps_ExprContext; + + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, + hjstate->hj_OuterHashKeys, + true, /* outer tuple */ + HJ_FILL_OUTER(hjstate), + hashvalue)) + return slot; + + /* + * That tuple couldn't match because of a NULL, so discard it and + * continue with the next one. + */ + slot = ExecProcNode(outerNode); + } + } + else if (curbatch < hashtable->nbatch) + { + MinimalTuple tuple; + + tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples, + hashvalue); + if (tuple != NULL) + { + ExecForceStoreMinimalTuple(tuple, + hjstate->hj_OuterTupleSlot, + false); + slot = hjstate->hj_OuterTupleSlot; + return slot; + } + else + ExecClearTuple(hjstate->hj_OuterTupleSlot); + } + + /* End of this batch */ + return NULL; +} + +/* + * ExecHashJoinNewBatch + * switch to a new hashjoin batch + * + * Returns true if successful, false if there are no more batches. + */ +static bool +ExecHashJoinNewBatch(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int nbatch; + int curbatch; + BufFile *innerFile; + TupleTableSlot *slot; + uint32 hashvalue; + + nbatch = hashtable->nbatch; + curbatch = hashtable->curbatch; + + if (curbatch > 0) + { + /* + * We no longer need the previous outer batch file; close it right + * away to free disk space. + */ + if (hashtable->outerBatchFile[curbatch]) + BufFileClose(hashtable->outerBatchFile[curbatch]); + hashtable->outerBatchFile[curbatch] = NULL; + } + else /* we just finished the first batch */ + { + /* + * Reset some of the skew optimization state variables, since we no + * longer need to consider skew tuples after the first batch. The + * memory context reset we are about to do will release the skew + * hashtable itself. + */ + hashtable->skewEnabled = false; + hashtable->skewBucket = NULL; + hashtable->skewBucketNums = NULL; + hashtable->nSkewBuckets = 0; + hashtable->spaceUsedSkew = 0; + } + + /* + * We can always skip over any batches that are completely empty on both + * sides. We can sometimes skip over batches that are empty on only one + * side, but there are exceptions: + * + * 1. In a left/full outer join, we have to process outer batches even if + * the inner batch is empty. Similarly, in a right/full outer join, we + * have to process inner batches even if the outer batch is empty. + * + * 2. 
If we have increased nbatch since the initial estimate, we have to + * scan inner batches since they might contain tuples that need to be + * reassigned to later inner batches. + * + * 3. Similarly, if we have increased nbatch since starting the outer + * scan, we have to rescan outer batches in case they contain tuples that + * need to be reassigned. + */ + curbatch++; + while (curbatch < nbatch && + (hashtable->outerBatchFile[curbatch] == NULL || + hashtable->innerBatchFile[curbatch] == NULL)) + { + if (hashtable->outerBatchFile[curbatch] && + HJ_FILL_OUTER(hjstate)) + break; /* must process due to rule 1 */ + if (hashtable->innerBatchFile[curbatch] && + HJ_FILL_INNER(hjstate)) + break; /* must process due to rule 1 */ + if (hashtable->innerBatchFile[curbatch] && + nbatch != hashtable->nbatch_original) + break; /* must process due to rule 2 */ + if (hashtable->outerBatchFile[curbatch] && + nbatch != hashtable->nbatch_outstart) + break; /* must process due to rule 3 */ + /* We can ignore this batch. */ + /* Release associated temp files right away. */ + if (hashtable->innerBatchFile[curbatch]) + BufFileClose(hashtable->innerBatchFile[curbatch]); + hashtable->innerBatchFile[curbatch] = NULL; + if (hashtable->outerBatchFile[curbatch]) + BufFileClose(hashtable->outerBatchFile[curbatch]); + hashtable->outerBatchFile[curbatch] = NULL; + curbatch++; + } + + if (curbatch >= nbatch) + return false; /* no more batches */ + + hashtable->curbatch = curbatch; + + /* + * Reload the hash table with the new inner batch (which could be empty) + */ + ExecHashTableReset(hashtable); + + innerFile = hashtable->innerBatchFile[curbatch]; + + if (innerFile != NULL) + { + if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file"))); + + while ((slot = ExecHashJoinGetSavedTuple(hjstate, + innerFile, + &hashvalue, + hjstate->hj_HashTupleSlot))) + { + /* + * NOTE: some tuples may be sent to future batches. Also, it is + * possible for hashtable->nbatch to be increased here! + */ + ExecHashTableInsert(hashtable, slot, hashvalue); + } + + /* + * after we build the hash table, the inner batch file is no longer + * needed + */ + BufFileClose(innerFile); + hashtable->innerBatchFile[curbatch] = NULL; + } + + /* + * Rewind outer batch file (if present), so that we can start reading it. + */ + if (hashtable->outerBatchFile[curbatch] != NULL) + { + if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file"))); + } + + return true; +} + +/* + * Choose a batch to work on, and attach to it. Returns true if successful, + * false if there are no more batches. + */ +static bool +ExecParallelHashJoinNewBatch(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int start_batchno; + int batchno; + + /* + * If we started up so late that the batch tracking array has been freed + * already by ExecHashTableDetach(), then we are finished. See also + * ExecParallelHashEnsureBatchAccessors(). + */ + if (hashtable->batches == NULL) + return false; + + /* + * If we were already attached to a batch, remember not to bother checking + * it again, and detach from it (possibly freeing the hash table if we are + * last to detach). + */ + if (hashtable->curbatch >= 0) + { + hashtable->batches[hashtable->curbatch].done = true; + ExecHashTableDetachBatch(hashtable); + } + + /* + * Search for a batch that isn't done. 
We use an atomic counter to start + * our search at a different batch in every participant when there are + * more batches than participants. + */ + batchno = start_batchno = + pg_atomic_fetch_add_u32(&hashtable->parallel_state->distributor, 1) % + hashtable->nbatch; + do + { + uint32 hashvalue; + MinimalTuple tuple; + TupleTableSlot *slot; + + if (!hashtable->batches[batchno].done) + { + SharedTuplestoreAccessor *inner_tuples; + Barrier *batch_barrier = + &hashtable->batches[batchno].shared->batch_barrier; + + switch (BarrierAttach(batch_barrier)) + { + case PHJ_BATCH_ELECTING: + + /* One backend allocates the hash table. */ + if (BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_ELECT)) + ExecParallelHashTableAlloc(hashtable, batchno); + /* Fall through. */ + + case PHJ_BATCH_ALLOCATING: + /* Wait for allocation to complete. */ + BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_ALLOCATE); + /* Fall through. */ + + case PHJ_BATCH_LOADING: + /* Start (or join in) loading tuples. */ + ExecParallelHashTableSetCurrentBatch(hashtable, batchno); + inner_tuples = hashtable->batches[batchno].inner_tuples; + sts_begin_parallel_scan(inner_tuples); + while ((tuple = sts_parallel_scan_next(inner_tuples, + &hashvalue))) + { + ExecForceStoreMinimalTuple(tuple, + hjstate->hj_HashTupleSlot, + false); + slot = hjstate->hj_HashTupleSlot; + ExecParallelHashTableInsertCurrentBatch(hashtable, slot, + hashvalue); + } + sts_end_parallel_scan(inner_tuples); + BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_LOAD); + /* Fall through. */ + + case PHJ_BATCH_PROBING: + + /* + * This batch is ready to probe. Return control to + * caller. We stay attached to batch_barrier so that the + * hash table stays alive until everyone's finished + * probing it, but no participant is allowed to wait at + * this barrier again (or else a deadlock could occur). + * All attached participants must eventually call + * BarrierArriveAndDetach() so that the final phase + * PHJ_BATCH_DONE can be reached. + */ + ExecParallelHashTableSetCurrentBatch(hashtable, batchno); + sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples); + return true; + + case PHJ_BATCH_DONE: + + /* + * Already done. Detach and go around again (if any + * remain). + */ + BarrierDetach(batch_barrier); + hashtable->batches[batchno].done = true; + hashtable->curbatch = -1; + break; + + default: + elog(ERROR, "unexpected batch phase %d", + BarrierPhase(batch_barrier)); + } + } + batchno = (batchno + 1) % hashtable->nbatch; + } while (batchno != start_batchno); + + return false; +} + +/* + * ExecHashJoinSaveTuple + * save a tuple to a batch file. + * + * The data recorded in the file for each tuple is its hash value, + * then the tuple in MinimalTuple format. + * + * Note: it is important always to call this in the regular executor + * context, not in a shorter-lived context; else the temp file buffers + * will get messed up. + */ +void +ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, + BufFile **fileptr) +{ + BufFile *file = *fileptr; + + if (file == NULL) + { + /* First write to this batch file, so open it. */ + file = BufFileCreateTemp(false); + *fileptr = file; + } + + BufFileWrite(file, (void *) &hashvalue, sizeof(uint32)); + BufFileWrite(file, (void *) tuple, tuple->t_len); +} + +/* + * ExecHashJoinGetSavedTuple + * read the next tuple from a batch file. Return NULL if no more. + * + * On success, *hashvalue is set to the tuple's hash value, and the tuple + * itself is stored in the given slot. 
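/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * batch-file record layout used by ExecHashJoinSaveTuple() and
 * ExecHashJoinGetSavedTuple(), demonstrated with stdio instead of BufFile.
 * Each record is a uint32 hash value followed by the tuple image, whose own
 * first uint32 is its total length, so the reader can pick up the hash and
 * the length in a single read and then fetch the remaining
 * length - sizeof(uint32) bytes.  The "tuple" here is just a length word
 * plus a string payload.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
save_record(FILE *f, uint32_t hash, const void *tuple, uint32_t len)
{
	fwrite(&hash, sizeof(hash), 1, f);
	fwrite(tuple, len, 1, f);
}

static void *
load_record(FILE *f, uint32_t *hash)
{
	uint32_t	header[2];		/* hash value, then tuple length word */
	char	   *tuple;

	if (fread(header, sizeof(header), 1, f) != 1)
		return NULL;			/* end of file */
	*hash = header[0];
	tuple = malloc(header[1]);
	memcpy(tuple, &header[1], sizeof(uint32_t));	/* re-install length word */
	if (fread(tuple + sizeof(uint32_t), header[1] - sizeof(uint32_t), 1, f) != 1)
	{
		free(tuple);
		return NULL;			/* truncated record */
	}
	return tuple;
}

int
main(void)
{
	char		buf[32];
	uint32_t	len = sizeof(uint32_t) + 6;		/* length word + "hello\0" */
	uint32_t	hash;
	char	   *tuple;
	FILE	   *f = tmpfile();

	memcpy(buf, &len, sizeof(len));
	memcpy(buf + sizeof(len), "hello", 6);
	save_record(f, 0xdeadbeef, buf, len);
	rewind(f);

	tuple = load_record(f, &hash);
	printf("hash=%08x payload=%s\n", (unsigned) hash, tuple + sizeof(uint32_t));
	free(tuple);
	fclose(f);
	return 0;
}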
+ */ +static TupleTableSlot * +ExecHashJoinGetSavedTuple(HashJoinState *hjstate, + BufFile *file, + uint32 *hashvalue, + TupleTableSlot *tupleSlot) +{ + uint32 header[2]; + size_t nread; + MinimalTuple tuple; + + /* + * We check for interrupts here because this is typically taken as an + * alternative code path to an ExecProcNode() call, which would include + * such a check. + */ + CHECK_FOR_INTERRUPTS(); + + /* + * Since both the hash value and the MinimalTuple length word are uint32, + * we can read them both in one BufFileRead() call without any type + * cheating. + */ + nread = BufFileRead(file, (void *) header, sizeof(header)); + if (nread == 0) /* end of file */ + { + ExecClearTuple(tupleSlot); + return NULL; + } + if (nread != sizeof(header)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes", + nread, sizeof(header)))); + *hashvalue = header[0]; + tuple = (MinimalTuple) palloc(header[1]); + tuple->t_len = header[1]; + nread = BufFileRead(file, + (void *) ((char *) tuple + sizeof(uint32)), + header[1] - sizeof(uint32)); + if (nread != header[1] - sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes", + nread, header[1] - sizeof(uint32)))); + ExecForceStoreMinimalTuple(tuple, tupleSlot, true); + return tupleSlot; +} + + +void +ExecReScanHashJoin(HashJoinState *node) +{ + /* + * In a multi-batch join, we currently have to do rescans the hard way, + * primarily because batch temp files may have already been released. But + * if it's a single-batch join, and there is no parameter change for the + * inner subnode, then we can just re-use the existing hash table without + * rebuilding it. + */ + if (node->hj_HashTable != NULL) + { + if (node->hj_HashTable->nbatch == 1 && + node->js.ps.righttree->chgParam == NULL) + { + /* + * Okay to reuse the hash table; needn't rescan inner, either. + * + * However, if it's a right/full join, we'd better reset the + * inner-tuple match flags contained in the table. + */ + if (HJ_FILL_INNER(node)) + ExecHashTableResetMatchFlags(node->hj_HashTable); + + /* + * Also, we need to reset our state about the emptiness of the + * outer relation, so that the new scan of the outer will update + * it correctly if it turns out to be empty this time. (There's no + * harm in clearing it now because ExecHashJoin won't need the + * info. In the other cases, where the hash table doesn't exist + * or we are destroying it, we leave this state alone because + * ExecHashJoin will need it the first time through.) 
+ */ + node->hj_OuterNotEmpty = false; + + /* ExecHashJoin can skip the BUILD_HASHTABLE step */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + } + else + { + /* must destroy and rebuild hash table */ + HashState *hashNode = castNode(HashState, innerPlanState(node)); + + Assert(hashNode->hashtable == node->hj_HashTable); + /* accumulate stats from old hash table, if wanted */ + /* (this should match ExecShutdownHash) */ + if (hashNode->ps.instrument && !hashNode->hinstrument) + hashNode->hinstrument = (HashInstrumentation *) + palloc0(sizeof(HashInstrumentation)); + if (hashNode->hinstrument) + ExecHashAccumInstrumentation(hashNode->hinstrument, + hashNode->hashtable); + /* for safety, be sure to clear child plan node's pointer too */ + hashNode->hashtable = NULL; + + ExecHashTableDestroy(node->hj_HashTable); + node->hj_HashTable = NULL; + node->hj_JoinState = HJ_BUILD_HASHTABLE; + + /* + * if chgParam of subnode is not null then plan will be re-scanned + * by first ExecProcNode. + */ + if (node->js.ps.righttree->chgParam == NULL) + ExecReScan(node->js.ps.righttree); + } + } + + /* Always reset intra-tuple state */ + node->hj_CurHashValue = 0; + node->hj_CurBucketNo = 0; + node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; + node->hj_CurTuple = NULL; + + node->hj_MatchedOuter = false; + node->hj_FirstOuterTupleSlot = NULL; + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->js.ps.lefttree->chgParam == NULL) + ExecReScan(node->js.ps.lefttree); +} + +void +ExecShutdownHashJoin(HashJoinState *node) +{ + if (node->hj_HashTable) + { + /* + * Detach from shared state before DSM memory goes away. This makes + * sure that we don't have any pointers into DSM memory by the time + * ExecEndHashJoin runs. + */ + ExecHashTableDetachBatch(node->hj_HashTable); + ExecHashTableDetach(node->hj_HashTable); + } +} + +static void +ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) +{ + PlanState *outerState = outerPlanState(hjstate); + ExprContext *econtext = hjstate->js.ps.ps_ExprContext; + HashJoinTable hashtable = hjstate->hj_HashTable; + TupleTableSlot *slot; + uint32 hashvalue; + int i; + + Assert(hjstate->hj_FirstOuterTupleSlot == NULL); + + /* Execute outer plan, writing all tuples to shared tuplestores. */ + for (;;) + { + slot = ExecProcNode(outerState); + if (TupIsNull(slot)) + break; + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, + hjstate->hj_OuterHashKeys, + true, /* outer tuple */ + HJ_FILL_OUTER(hjstate), + &hashvalue)) + { + int batchno; + int bucketno; + bool shouldFree; + MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &shouldFree); + + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, + &batchno); + sts_puttuple(hashtable->batches[batchno].outer_tuples, + &hashvalue, mintup); + + if (shouldFree) + heap_free_minimal_tuple(mintup); + } + CHECK_FOR_INTERRUPTS(); + } + + /* Make sure all outer partitions are readable by any backend. 
*/ + for (i = 0; i < hashtable->nbatch; ++i) + sts_end_write(hashtable->batches[i].outer_tuples); +} + +void +ExecHashJoinEstimate(HashJoinState *state, ParallelContext *pcxt) +{ + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(ParallelHashJoinState)); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +void +ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) +{ + int plan_node_id = state->js.ps.plan->plan_node_id; + HashState *hashNode; + ParallelHashJoinState *pstate; + + /* + * Disable shared hash table mode if we failed to create a real DSM + * segment, because that means that we don't have a DSA area to work with. + */ + if (pcxt->seg == NULL) + return; + + ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); + + /* + * Set up the state needed to coordinate access to the shared hash + * table(s), using the plan node ID as the toc key. + */ + pstate = shm_toc_allocate(pcxt->toc, sizeof(ParallelHashJoinState)); + shm_toc_insert(pcxt->toc, plan_node_id, pstate); + + /* + * Set up the shared hash join state with no batches initially. + * ExecHashTableCreate() will prepare at least one later and set nbatch + * and space_allowed. + */ + pstate->nbatch = 0; + pstate->space_allowed = 0; + pstate->batches = InvalidDsaPointer; + pstate->old_batches = InvalidDsaPointer; + pstate->nbuckets = 0; + pstate->growth = PHJ_GROWTH_OK; + pstate->chunk_work_queue = InvalidDsaPointer; + pg_atomic_init_u32(&pstate->distributor, 0); + pstate->nparticipants = pcxt->nworkers + 1; + pstate->total_tuples = 0; + LWLockInitialize(&pstate->lock, + LWTRANCHE_PARALLEL_HASH_JOIN); + BarrierInit(&pstate->build_barrier, 0); + BarrierInit(&pstate->grow_batches_barrier, 0); + BarrierInit(&pstate->grow_buckets_barrier, 0); + + /* Set up the space we'll use for shared temporary files. */ + SharedFileSetInit(&pstate->fileset, pcxt->seg); + + /* Initialize the shared state in the hash node. */ + hashNode = (HashState *) innerPlanState(state); + hashNode->parallel_state = pstate; +} + +/* ---------------------------------------------------------------- + * ExecHashJoinReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecHashJoinReInitializeDSM(HashJoinState *state, ParallelContext *cxt) +{ + int plan_node_id = state->js.ps.plan->plan_node_id; + ParallelHashJoinState *pstate = + shm_toc_lookup(cxt->toc, plan_node_id, false); + + /* + * It would be possible to reuse the shared hash table in single-batch + * cases by resetting and then fast-forwarding build_barrier to + * PHJ_BUILD_DONE and batch 0's batch_barrier to PHJ_BATCH_PROBING, but + * currently shared hash tables are already freed by now (by the last + * participant to detach from the batch). We could consider keeping it + * around for single-batch joins. We'd also need to adjust + * finalize_plan() so that it doesn't record a dummy dependency for + * Parallel Hash nodes, preventing the rescan optimization. For now we + * don't try. + */ + + /* Detach, freeing any remaining shared memory. */ + if (state->hj_HashTable != NULL) + { + ExecHashTableDetachBatch(state->hj_HashTable); + ExecHashTableDetach(state->hj_HashTable); + } + + /* Clear any shared batch files. */ + SharedFileSetDeleteAll(&pstate->fileset); + + /* Reset build_barrier to PHJ_BUILD_ELECTING so we can go around again. 
*/ + BarrierInit(&pstate->build_barrier, 0); +} + +void +ExecHashJoinInitializeWorker(HashJoinState *state, + ParallelWorkerContext *pwcxt) +{ + HashState *hashNode; + int plan_node_id = state->js.ps.plan->plan_node_id; + ParallelHashJoinState *pstate = + shm_toc_lookup(pwcxt->toc, plan_node_id, false); + + /* Attach to the space for shared temporary files. */ + SharedFileSetAttach(&pstate->fileset, pwcxt->seg); + + /* Attach to the shared state in the hash node. */ + hashNode = (HashState *) innerPlanState(state); + hashNode->parallel_state = pstate; + + ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); +} diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c new file mode 100644 index 0000000..934426a --- /dev/null +++ b/src/backend/executor/nodeIncrementalSort.c @@ -0,0 +1,1257 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.c + * Routines to handle incremental sorting of relations. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeIncrementalSort.c + * + * DESCRIPTION + * + * Incremental sort is an optimized variant of multikey sort for cases + * when the input is already sorted by a prefix of the sort keys. For + * example when a sort by (key1, key2 ... keyN) is requested, and the + * input is already sorted by (key1, key2 ... keyM), M < N, we can + * divide the input into groups where keys (key1, ... keyM) are equal, + * and only sort on the remaining columns. + * + * Consider the following example. We have input tuples consisting of + * two integers (X, Y) already presorted by X, while it's required to + * sort them by both X and Y. Let input tuples be following. + * + * (1, 5) + * (1, 2) + * (2, 9) + * (2, 1) + * (2, 5) + * (3, 3) + * (3, 7) + * + * An incremental sort algorithm would split the input into the following + * groups, which have equal X, and then sort them by Y individually: + * + * (1, 5) (1, 2) + * (2, 9) (2, 1) (2, 5) + * (3, 3) (3, 7) + * + * After sorting these groups and putting them altogether, we would get + * the following result which is sorted by X and Y, as requested: + * + * (1, 2) + * (1, 5) + * (2, 1) + * (2, 5) + * (2, 9) + * (3, 3) + * (3, 7) + * + * Incremental sort may be more efficient than plain sort, particularly + * on large datasets, as it reduces the amount of data to sort at once, + * making it more likely it fits into work_mem (eliminating the need to + * spill to disk). But the main advantage of incremental sort is that + * it can start producing rows early, before sorting the whole dataset, + * which is a significant benefit especially for queries with LIMIT. + * + * The algorithm we've implemented here is modified from the theoretical + * base described above by operating in two different modes: + * - Fetching a minimum number of tuples without checking prefix key + * group membership and sorting on all columns when safe. + * - Fetching all tuples for a single prefix key group and sorting on + * solely the unsorted columns. + * We always begin in the first mode, and employ a heuristic to switch + * into the second mode if we believe it's beneficial. + * + * Sorting incrementally can potentially use less memory, avoid fetching + * and sorting all tuples in the dataset, and begin returning tuples before + * the entire result set is available. 
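/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * group-at-a-time strategy described above, applied to the (X, Y) example
 * data from the comment.  Runs of equal X are located in the presorted
 * input and each run is sorted by Y on its own, so rows can be emitted as
 * soon as their group is complete; qsort() stands in for the tuplesort
 * machinery.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	int			x;
	int			y;
} pair;

static int
cmp_y(const void *a, const void *b)
{
	return ((const pair *) a)->y - ((const pair *) b)->y;
}

int
main(void)
{
	pair		input[] = {{1, 5}, {1, 2}, {2, 9}, {2, 1}, {2, 5}, {3, 3}, {3, 7}};
	int			n = sizeof(input) / sizeof(input[0]);

	for (int start = 0; start < n;)
	{
		int			end = start;

		/* find the end of the current prefix-key (X) group */
		while (end < n && input[end].x == input[start].x)
			end++;

		/* sort just this group by the remaining key (Y), then emit it */
		qsort(input + start, end - start, sizeof(pair), cmp_y);
		for (int i = start; i < end; i++)
			printf("(%d, %d)\n", input[i].x, input[i].y);

		start = end;
	}
	return 0;					/* output is sorted by (X, Y) */
}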
+ * + * The hybrid mode approach allows us to optimize for both very small + * groups (where the overhead of a new tuplesort is high) and very large + * groups (where we can lower cost by not having to sort on already sorted + * columns), albeit at some extra cost while switching between modes. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/execdebug.h" +#include "executor/nodeIncrementalSort.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/tuplesort.h" + +/* + * We need to store the instrumentation information in either local node's sort + * info or, for a parallel worker process, in the shared info (this avoids + * having to additionally memcpy the info from local memory to shared memory + * at each instrumentation call). This macro expands to choose the proper sort + * state and group info. + * + * Arguments: + * - node: type IncrementalSortState * + * - groupName: the token fullsort or prefixsort + */ +#define INSTRUMENT_SORT_GROUP(node, groupName) \ + do { \ + if ((node)->ss.ps.instrument != NULL) \ + { \ + if ((node)->shared_info && (node)->am_worker) \ + { \ + Assert(IsParallelWorker()); \ + Assert(ParallelWorkerNumber <= (node)->shared_info->num_workers); \ + instrumentSortedGroup(&(node)->shared_info->sinfo[ParallelWorkerNumber].groupName##GroupInfo, \ + (node)->groupName##_state); \ + } \ + else \ + { \ + instrumentSortedGroup(&(node)->incsort_info.groupName##GroupInfo, \ + (node)->groupName##_state); \ + } \ + } \ + } while (0) + + +/* ---------------------------------------------------------------- + * instrumentSortedGroup + * + * Because incremental sort processes (potentially many) sort batches, we need + * to capture tuplesort stats each time we finalize a sort state. This summary + * data is later used for EXPLAIN ANALYZE output. + * ---------------------------------------------------------------- + */ +static void +instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo, + Tuplesortstate *sortState) +{ + TuplesortInstrumentation sort_instr; + + groupInfo->groupCount++; + + tuplesort_get_stats(sortState, &sort_instr); + + /* Calculate total and maximum memory and disk space used. */ + switch (sort_instr.spaceType) + { + case SORT_SPACE_TYPE_DISK: + groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed) + groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed; + + break; + case SORT_SPACE_TYPE_MEMORY: + groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed) + groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed; + + break; + } + + /* Track each sort method we've used. */ + groupInfo->sortMethods |= sort_instr.sortMethod; +} + +/* ---------------------------------------------------------------- + * preparePresortedCols + * + * Prepare information for presorted_keys comparisons. + * ---------------------------------------------------------------- + */ +static void +preparePresortedCols(IncrementalSortState *node) +{ + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + + node->presorted_keys = + (PresortedKeyData *) palloc(plannode->nPresortedCols * + sizeof(PresortedKeyData)); + + /* Pre-cache comparison functions for each pre-sorted key. 
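instrumentSortedGroup() above folds each finished tuplesort's statistics into running totals and maxima, split by whether the sort stayed in memory or spilled to disk. A hedged standalone analogue of that accumulation follows; the struct, field names, and kilobyte units are invented for illustration and are not the real IncrementalSortGroupInfo.

#include <stdio.h>
#include <stdbool.h>

/* Invented analogue of the per-group sort statistics. */
typedef struct
{
    long groupCount;
    long totalMemKB, maxMemKB;
    long totalDiskKB, maxDiskKB;
} GroupStats;

static void
record_group(GroupStats *s, long spaceKB, bool spilledToDisk)
{
    s->groupCount++;
    if (spilledToDisk)
    {
        s->totalDiskKB += spaceKB;
        if (spaceKB > s->maxDiskKB)
            s->maxDiskKB = spaceKB;
    }
    else
    {
        s->totalMemKB += spaceKB;
        if (spaceKB > s->maxMemKB)
            s->maxMemKB = spaceKB;
    }
}

int
main(void)
{
    GroupStats s = {0};

    record_group(&s, 40, false);     /* small in-memory group */
    record_group(&s, 96, false);
    record_group(&s, 2048, true);    /* one group spilled to disk */
    printf("groups=%ld memKB total=%ld max=%ld diskKB total=%ld max=%ld\n",
           s.groupCount, s.totalMemKB, s.maxMemKB, s.totalDiskKB, s.maxDiskKB);
    return 0;
}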
*/ + for (int i = 0; i < plannode->nPresortedCols; i++) + { + Oid equalityOp, + equalityFunc; + PresortedKeyData *key; + + key = &node->presorted_keys[i]; + key->attno = plannode->sort.sortColIdx[i]; + + equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i], + NULL); + if (!OidIsValid(equalityOp)) + elog(ERROR, "missing equality operator for ordering operator %u", + plannode->sort.sortOperators[i]); + + equalityFunc = get_opcode(equalityOp); + if (!OidIsValid(equalityFunc)) + elog(ERROR, "missing function for operator %u", equalityOp); + + /* Lookup the comparison function */ + fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext); + + /* We can initialize the callinfo just once and re-use it */ + key->fcinfo = palloc0(SizeForFunctionCallInfo(2)); + InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2, + plannode->sort.collations[i], NULL, NULL); + key->fcinfo->args[0].isnull = false; + key->fcinfo->args[1].isnull = false; + } +} + +/* ---------------------------------------------------------------- + * isCurrentGroup + * + * Check whether a given tuple belongs to the current sort group by comparing + * the presorted column values to the pivot tuple of the current group. + * ---------------------------------------------------------------- + */ +static bool +isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple) +{ + int nPresortedCols; + + nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols; + + /* + * That the input is sorted by keys * (0, ... n) implies that the tail + * keys are more likely to change. Therefore we do our comparison starting + * from the last pre-sorted column to optimize for early detection of + * inequality and minimizing the number of function calls.. + */ + for (int i = nPresortedCols - 1; i >= 0; i--) + { + Datum datumA, + datumB, + result; + bool isnullA, + isnullB; + AttrNumber attno = node->presorted_keys[i].attno; + PresortedKeyData *key; + + datumA = slot_getattr(pivot, attno, &isnullA); + datumB = slot_getattr(tuple, attno, &isnullB); + + /* Special case for NULL-vs-NULL, else use standard comparison */ + if (isnullA || isnullB) + { + if (isnullA == isnullB) + continue; + else + return false; + } + + key = &node->presorted_keys[i]; + + key->fcinfo->args[0].value = datumA; + key->fcinfo->args[1].value = datumB; + + /* just for paranoia's sake, we reset isnull each time */ + key->fcinfo->isnull = false; + + result = FunctionCallInvoke(key->fcinfo); + + /* Check for null result, since caller is clearly not expecting one */ + if (key->fcinfo->isnull) + elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid); + + if (!DatumGetBool(result)) + return false; + } + return true; +} + +/* ---------------------------------------------------------------- + * switchToPresortedPrefixMode + * + * When we determine that we've likely encountered a large batch of tuples all + * having the same presorted prefix values, we want to optimize tuplesort by + * only sorting on unsorted suffix keys. + * + * The problem is that we've already accumulated several tuples in another + * tuplesort configured to sort by all columns (assuming that there may be + * more than one prefix key group). So to switch to presorted prefix mode we + * have to go back and look at all the tuples we've already accumulated to + * verify they're all part of the same prefix key group before sorting them + * solely by unsorted suffix keys. 
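isCurrentGroup() above compares the candidate tuple to the group pivot on the presorted columns only, starting from the last of those columns (the one most likely to differ) and treating a NULL pair as equal. Here is a self-contained sketch of that same decision, using invented nullable-int columns in place of Datums and equality-operator lookups.

#include <stdio.h>
#include <stdbool.h>

#define NPREFIX 2

typedef struct
{
    bool isnull[NPREFIX];
    int  val[NPREFIX];
} prefix_cols;                      /* invented stand-in for the presorted columns */

static bool
same_prefix_group(const prefix_cols *pivot, const prefix_cols *tuple)
{
    /* Walk the prefix keys backwards: the tail key changes most often. */
    for (int i = NPREFIX - 1; i >= 0; i--)
    {
        if (pivot->isnull[i] || tuple->isnull[i])
        {
            if (pivot->isnull[i] == tuple->isnull[i])
                continue;           /* NULL vs NULL counts as equal here */
            return false;           /* NULL vs non-NULL: different group */
        }
        if (pivot->val[i] != tuple->val[i])
            return false;
    }
    return true;
}

int
main(void)
{
    prefix_cols pivot = {{false, false}, {1, 7}};
    prefix_cols same  = {{false, false}, {1, 7}};
    prefix_cols other = {{false, false}, {1, 8}};

    printf("%d %d\n", same_prefix_group(&pivot, &same),
           same_prefix_group(&pivot, &other));   /* prints 1 0 */
    return 0;
}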
+ * + * While it's likely that all tuples already fetched are all part of a single + * prefix group, we also have to handle the possibility that there is at least + * one different prefix key group before the large prefix key group. + * ---------------------------------------------------------------- + */ +static void +switchToPresortedPrefixMode(PlanState *pstate) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + ScanDirection dir; + int64 nTuples; + TupleDesc tupDesc; + PlanState *outerNode; + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + + dir = node->ss.ps.state->es_direction; + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Configure the prefix sort state the first time around. */ + if (node->prefixsort_state == NULL) + { + Tuplesortstate *prefixsort_state; + int nPresortedCols = plannode->nPresortedCols; + + /* + * Optimize the sort by assuming the prefix columns are all equal and + * thus we only need to sort by any remaining columns. + */ + prefixsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols - nPresortedCols, + &(plannode->sort.sortColIdx[nPresortedCols]), + &(plannode->sort.sortOperators[nPresortedCols]), + &(plannode->sort.collations[nPresortedCols]), + &(plannode->sort.nullsFirst[nPresortedCols]), + work_mem, + NULL, + false); + node->prefixsort_state = prefixsort_state; + } + else + { + /* Next group of presorted data */ + tuplesort_reset(node->prefixsort_state); + } + + /* + * If the current node has a bound, then it's reasonably likely that a + * large prefix key group will benefit from bounded sort, so configure the + * tuplesort to allow for that optimization. + */ + if (node->bounded) + { + SO1_printf("Setting bound on presorted prefix tuplesort to: " INT64_FORMAT "\n", + node->bound - node->bound_Done); + tuplesort_set_bound(node->prefixsort_state, + node->bound - node->bound_Done); + } + + /* + * Copy as many tuples as we can (i.e., in the same prefix key group) from + * the full sort state to the prefix sort state. + */ + for (nTuples = 0; nTuples < node->n_fullsort_remaining; nTuples++) + { + /* + * When we encounter multiple prefix key groups inside the full sort + * tuplesort we have to carry over the last read tuple into the next + * batch. + */ + if (nTuples == 0 && !TupIsNull(node->transfer_tuple)) + { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + /* The carried over tuple is our new group pivot tuple. */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + } + else + { + tuplesort_gettupleslot(node->fullsort_state, + ScanDirectionIsForward(dir), + false, node->transfer_tuple, NULL); + + /* + * If this is our first time through the loop, then we need to + * save the first tuple we get as our new group pivot. + */ + if (TupIsNull(node->group_pivot)) + ExecCopySlot(node->group_pivot, node->transfer_tuple); + + if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple)) + { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + } + else + { + /* + * The tuple isn't part of the current batch so we need to + * carry it over into the next batch of tuples we transfer out + * of the full sort tuplesort into the presorted prefix + * tuplesort. 
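The transfer loop described here moves tuples already buffered in the full sort into the prefix sort until it reads one whose prefix keys differ; that first non-matching tuple is held aside to start the next group. The standalone sketch below shows the same control flow with an invented row type; pushing the array index back stands in for the carried-over transfer_tuple slot, and same_prefix() stands in for the real pivot comparison.

#include <stdio.h>
#include <stdbool.h>

typedef struct { int x; int y; } row;   /* invented: x is the presorted prefix */

static bool
same_prefix(const row *a, const row *b)
{
    return a->x == b->x;
}

/*
 * Move rows from buf[*next..n) into the current group while the prefix matches
 * the pivot; stop at the first row that does not and leave it for the next
 * group.  Returns the number of rows moved.
 */
static int
drain_current_group(row *buf, int n, int *next, row *pivot, bool *have_pivot)
{
    int moved = 0;

    while (*next < n)
    {
        row r = buf[(*next)++];

        if (!*have_pivot)
        {
            *pivot = r;             /* first row seen becomes the pivot */
            *have_pivot = true;
        }
        else if (!same_prefix(pivot, &r))
        {
            (*next)--;              /* carry it over to seed the next group */
            break;
        }
        moved++;                    /* the real code calls tuplesort_puttupleslot() here */
    }
    return moved;
}

int
main(void)
{
    row buf[] = {{1, 9}, {1, 3}, {2, 4}, {2, 8}};
    int next = 0;
    row pivot;
    bool have_pivot = false;

    printf("first group: %d rows\n",
           drain_current_group(buf, 4, &next, &pivot, &have_pivot));  /* 2 */
    return 0;
}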
We don't actually have to do anything special to + * save the tuple since we've already loaded it into the + * node->transfer_tuple slot, and, even though that slot + * points to memory inside the full sort tuplesort, we can't + * reset that tuplesort anyway until we've fully transferred + * out its tuples, so this reference is safe. We do need to + * reset the group pivot tuple though since we've finished the + * current prefix key group. + */ + ExecClearTuple(node->group_pivot); + + /* Break out of for-loop early */ + break; + } + } + } + + /* + * Track how many tuples remain in the full sort batch so that we know if + * we need to sort multiple prefix key groups before processing tuples + * remaining in the large single prefix key group we think we've + * encountered. + */ + SO1_printf("Moving " INT64_FORMAT " tuples to presorted prefix tuplesort\n", nTuples); + node->n_fullsort_remaining -= nTuples; + SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT "\n", node->n_fullsort_remaining); + + if (node->n_fullsort_remaining == 0) + { + /* + * We've found that all tuples remaining in the full sort batch are in + * the same prefix key group and moved all of those tuples into the + * presorted prefix tuplesort. We don't know that we've yet found the + * last tuple in the current prefix key group, so save our pivot + * comparison tuple and continue fetching tuples from the outer + * execution node to load into the presorted prefix tuplesort. + */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_LOADPREFIXSORT; + + /* + * Make sure we clear the transfer tuple slot so that next time we + * encounter a large prefix key group we don't incorrectly assume we + * have a tuple carried over from the previous group. + */ + ExecClearTuple(node->transfer_tuple); + } + else + { + /* + * We finished a group but didn't consume all of the tuples from the + * full sort state, so we'll sort this batch, let the outer node read + * out all of those tuples, and then come back around to find another + * batch. + */ + SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort); + + if (node->bounded) + { + /* + * If the current node has a bound and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + Min(node->bound, node->bound_Done + nTuples), node->bound_Done); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + } +} + +/* + * Sorting many small groups with tuplesort is inefficient. In order to + * cope with this problem we don't start a new group until the current one + * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also + * means we can't assume small groups of tuples all have the same prefix keys.) + * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start looking + * for the new group as soon as we've met our bound to avoid fetching more + * tuples than we absolutely have to fetch. 
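The two cutoffs discussed here and defined just below work together: prefix boundaries are not even checked until DEFAULT_MIN_GROUP_SIZE tuples (or the remaining bound, if smaller) have been buffered, and once DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples have gone by without a boundary we assume one very large group and switch modes. A small standalone sketch of that decision follows, using the same constant values (32 and 64); the enum and helper names are invented and the real loop interleaves this with reading tuples.

#include <stdio.h>
#include <stdbool.h>

#define MIN_GROUP_SIZE       32   /* mirrors DEFAULT_MIN_GROUP_SIZE */
#define MAX_FULL_SORT_GROUP  64   /* mirrors DEFAULT_MAX_FULL_SORT_GROUP_SIZE */

typedef enum { KEEP_BUFFERING, CHECK_PREFIX, SWITCH_TO_PREFIX_MODE } batch_action;

static batch_action
choose_action(long nTuples, long bound_remaining, bool bounded)
{
    long minGroupSize = (bounded && bound_remaining < MIN_GROUP_SIZE)
        ? bound_remaining : MIN_GROUP_SIZE;

    if (nTuples < minGroupSize)
        return KEEP_BUFFERING;            /* too few tuples to bother comparing */
    if (nTuples > MAX_FULL_SORT_GROUP)
        return SWITCH_TO_PREFIX_MODE;     /* looks like one very large group */
    return CHECK_PREFIX;                  /* compare new tuples against the pivot */
}

int
main(void)
{
    printf("%d %d %d\n",
           choose_action(10, 0, false),    /* 0: KEEP_BUFFERING */
           choose_action(40, 0, false),    /* 1: CHECK_PREFIX */
           choose_action(70, 0, false));   /* 2: SWITCH_TO_PREFIX_MODE */
    return 0;
}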
+ */ +#define DEFAULT_MIN_GROUP_SIZE 32 + +/* + * While we've optimized for small prefix key groups by not starting our prefix + * key comparisons until we've reached a minimum number of tuples, we don't want + * that optimization to cause us to lose out on the benefits of being able to + * assume a large group of tuples is fully presorted by its prefix keys. + * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic + * for determining when we believe we've encountered a large group, and, if we + * get to that point without finding a new prefix key group we transition to + * presorted prefix key mode. + */ +#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE) + +/* ---------------------------------------------------------------- + * ExecIncrementalSort + * + * Assuming that outer subtree returns tuple presorted by some prefix + * of target sort columns, performs incremental sort. + * + * Conditions: + * -- none. + * + * Initial States: + * -- the outer child is prepared to return the first tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecIncrementalSort(PlanState *pstate) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + EState *estate; + ScanDirection dir; + Tuplesortstate *read_sortstate; + Tuplesortstate *fullsort_state; + TupleTableSlot *slot; + IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan; + PlanState *outerNode; + TupleDesc tupDesc; + int64 nTuples = 0; + int64 minGroupSize; + + CHECK_FOR_INTERRUPTS(); + + estate = node->ss.ps.state; + dir = estate->es_direction; + fullsort_state = node->fullsort_state; + + /* + * If a previous iteration has sorted a batch, then we need to check to + * see if there are any remaining tuples in that batch that we can return + * before moving on to other execution states. + */ + if (node->execution_status == INCSORT_READFULLSORT + || node->execution_status == INCSORT_READPREFIXSORT) + { + /* + * Return next tuple from the current sorted group set if available. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + + /* + * We have to populate the slot from the tuplesort before checking + * outerNodeDone because it will set the slot to NULL if no more + * tuples remain. If the tuplesort is empty, but we don't have any + * more tuples available for sort from the outer node, then + * outerNodeDone will have been set so we'll return that now-empty + * slot to the caller. + */ + if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL) || node->outerNodeDone) + + /* + * Note: there isn't a good test case for the node->outerNodeDone + * check directly, but we need it for any plan where the outer + * node will fail when trying to fetch too many tuples. + */ + return slot; + else if (node->n_fullsort_remaining > 0) + { + /* + * When we transition to presorted prefix mode, we might have + * accumulated at least one additional prefix key group in the + * full sort tuplesort. The first call to + * switchToPresortedPrefixMode() will have pulled the first one of + * those groups out, and we've returned those tuples to the parent + * node, but if at this point we still have tuples remaining in + * the full sort state (i.e., n_fullsort_remaining > 0), then we + * need to re-execute the prefix mode transition function to pull + * out the next prefix key group. 
+ */ + SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (" INT64_FORMAT ")\n", + node->n_fullsort_remaining); + switchToPresortedPrefixMode(pstate); + } + else + { + /* + * If we don't have any sorted tuples to read and we're not + * currently transitioning into presorted prefix sort mode, then + * it's time to start the process all over again by building a new + * group in the full sort state. + */ + SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining > 0)\n"); + node->execution_status = INCSORT_LOADFULLSORT; + } + } + + /* + * Scan the subplan in the forward direction while creating the sorted + * data. + */ + estate->es_direction = ForwardScanDirection; + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Load tuples into the full sort state. */ + if (node->execution_status == INCSORT_LOADFULLSORT) + { + /* + * Initialize sorting structures. + */ + if (fullsort_state == NULL) + { + /* + * Initialize presorted column support structures for + * isCurrentGroup(). It's correct to do this along with the + * initial initialization for the full sort state (and not for the + * prefix sort state) since we always load the full sort state + * first. + */ + preparePresortedCols(node); + + /* + * Since we optimize small prefix key groups by accumulating a + * minimum number of tuples before sorting, we can't assume that a + * group of tuples all have the same prefix key values. Hence we + * setup the full sort tuplesort to sort by all requested sort + * keys. + */ + fullsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols, + plannode->sort.sortColIdx, + plannode->sort.sortOperators, + plannode->sort.collations, + plannode->sort.nullsFirst, + work_mem, + NULL, + false); + node->fullsort_state = fullsort_state; + } + else + { + /* Reset sort for the next batch. */ + tuplesort_reset(fullsort_state); + } + + /* + * Calculate the remaining tuples left if bounded and configure both + * bounded sort and the minimum group size accordingly. + */ + if (node->bounded) + { + int64 currentBound = node->bound - node->bound_Done; + + /* + * Bounded sort isn't likely to be a useful optimization for full + * sort mode since we limit full sort mode to a relatively small + * number of tuples and tuplesort doesn't switch over to top-n + * heap sort anyway unless it hits (2 * bound) tuples. + */ + if (currentBound < DEFAULT_MIN_GROUP_SIZE) + tuplesort_set_bound(fullsort_state, currentBound); + + minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound); + } + else + minGroupSize = DEFAULT_MIN_GROUP_SIZE; + + /* + * Because we have to read the next tuple to find out that we've + * encountered a new prefix key group, on subsequent groups we have to + * carry over that extra tuple and add it to the new group's sort here + * before we read any new tuples from the outer node. + */ + if (!TupIsNull(node->group_pivot)) + { + tuplesort_puttupleslot(fullsort_state, node->group_pivot); + nTuples++; + + /* + * We're in full sort mode accumulating a minimum number of tuples + * and not checking for prefix key equality yet, so we can't + * assume the group pivot tuple will remain the same -- unless + * we're using a minimum group size of 1, in which case the pivot + * is obviously still the pivot. + */ + if (nTuples != minGroupSize) + ExecClearTuple(node->group_pivot); + } + + + /* + * Pull as many tuples from the outer node as possible given our + * current operating mode. 
+ */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If the outer node can't provide us any more tuples, then we can + * sort the current group and return those tuples. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + + SO1_printf("Sorting fullsort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + + /* Accumulate the next group of presorted tuples. */ + if (nTuples < minGroupSize) + { + /* + * If we haven't yet hit our target minimum group size, then + * we don't need to bother checking for inclusion in the + * current prefix group since at this point we'll assume that + * we'll full sort this batch to avoid a large number of very + * tiny (and thus inefficient) sorts. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + + /* + * If we've reached our minimum group size, then we need to + * store the most recent tuple as a pivot. + */ + if (nTuples == minGroupSize) + ExecCopySlot(node->group_pivot, slot); + } + else + { + /* + * If we've already accumulated enough tuples to reach our + * minimum group size, then we need to compare any additional + * tuples to our pivot tuple to see if we reach the end of + * that prefix key group. Only after we find changed prefix + * keys can we guarantee sort stability of the tuples we've + * already accumulated. + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + /* + * As long as the prefix keys match the pivot tuple then + * load the tuple into the tuplesort. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + } + else + { + /* + * Since the tuple we fetched isn't part of the current + * prefix key group we don't want to sort it as part of + * the current batch. Instead we use the group_pivot slot + * to carry it over to the next batch (even though we + * won't actually treat it as a group pivot). + */ + ExecCopySlot(node->group_pivot, slot); + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already + * sorted n tuples, then the functional bound + * remaining is (original bound - n), so store the + * current number of processed tuples for later use + * configuring the sort state's bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + /* + * Once we find changed prefix keys we can complete the + * sort and transition modes to reading out the sorted + * tuples. 
+ */ + SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", + nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + } + + /* + * Unless we've already transitioned modes to reading from the + * full sort state, then we assume that having read at least + * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're + * processing a large group of tuples all having equal prefix keys + * (but haven't yet found the final tuple in that prefix key + * group), so we need to transition into presorted prefix mode. + */ + if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE && + node->execution_status != INCSORT_READFULLSORT) + { + /* + * The group pivot we have stored has already been put into + * the tuplesort; we don't want to carry it over. Since we + * haven't yet found the end of the prefix key group, it might + * seem like we should keep this, but we don't actually know + * how many prefix key groups might be represented in the full + * sort state, so we'll let the mode transition function + * manage this state for us. + */ + ExecClearTuple(node->group_pivot); + + /* + * Unfortunately the tuplesort API doesn't include a way to + * retrieve tuples unless a sort has been performed, so we + * perform the sort even though we could just as easily rely + * on FIFO retrieval semantics when transferring them to the + * presorted prefix tuplesort. + */ + SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + /* + * If the full sort tuplesort happened to switch into top-n + * heapsort mode then we will only be able to retrieve + * currentBound tuples (since the tuplesort will have only + * retained the top-n tuples). This is safe even though we + * haven't yet completed fetching the current prefix key group + * because the tuples we've "lost" already sorted "below" the + * retained ones, and we're already contractually guaranteed + * to not need any more than the currentBound tuples. + */ + if (tuplesort_used_bound(node->fullsort_state)) + { + int64 currentBound = node->bound - node->bound_Done; + + SO2_printf("Read " INT64_FORMAT " tuples, but setting to " INT64_FORMAT " because we used bounded sort\n", + nTuples, Min(currentBound, nTuples)); + nTuples = Min(currentBound, nTuples); + } + + SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT " and calling switchToPresortedPrefixMode()\n", + nTuples); + + /* + * We might have multiple prefix key groups in the full sort + * state, so the mode transition function needs to know that + * it needs to move from the fullsort to presorted prefix + * sort. + */ + node->n_fullsort_remaining = nTuples; + + /* Transition the tuples to the presorted prefix tuplesort. */ + switchToPresortedPrefixMode(pstate); + + /* + * Since we know we had tuples to move to the presorted prefix + * tuplesort, we know that unless that transition has verified + * that all tuples belonged to the same prefix key group (in + * which case we can go straight to continuing to load tuples + * into that tuplesort), we should have a tuple to return + * here. + * + * Either way, the appropriate execution status should have + * been set by switchToPresortedPrefixMode(), so we can drop + * out of the loop here and let the appropriate path kick in. 
+ */ + break; + } + } + } + + if (node->execution_status == INCSORT_LOADPREFIXSORT) + { + /* + * We only enter this state after the mode transition function has + * confirmed all remaining tuples from the full sort state have the + * same prefix and moved those tuples to the prefix sort state. That + * function has also set a group pivot tuple (which doesn't need to be + * carried over; it's already been put into the prefix sort state). + */ + Assert(!TupIsNull(node->group_pivot)); + + /* + * Read tuples from the outer node and load them into the prefix sort + * state until we encounter a tuple whose prefix keys don't match the + * current group_pivot tuple, since we can't guarantee sort stability + * until we have all tuples matching those prefix keys. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If we've exhausted tuples from the outer node we're done + * loading the prefix sort state. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + break; + } + + /* + * If the tuple's prefix keys match our pivot tuple, we're not + * done yet and can load it into the prefix sort state. If not, we + * don't want to sort it as part of the current batch. Instead we + * use the group_pivot slot to carry it over to the next batch + * (even though we won't actually treat it as a group pivot). + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + tuplesort_puttupleslot(node->prefixsort_state, slot); + nTuples++; + } + else + { + ExecCopySlot(node->group_pivot, slot); + break; + } + } + + /* + * Perform the sort and begin returning the tuples to the parent plan + * node. + */ + SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort); + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + } + + /* Restore to user specified direction. */ + estate->es_direction = dir; + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitIncrementalSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. 
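Stepping back, ExecIncrementalSort() above is essentially a small state machine over the four INCSORT_* execution states. The sketch below is a simplified standalone summary of those transitions; the enum and helper are invented, and the real transitions also depend on n_fullsort_remaining and on what switchToPresortedPrefixMode() finds, so treat this only as a map of the branches above.

#include <stdio.h>
#include <stdbool.h>

/* Invented mirror of the INCSORT_* execution states. */
typedef enum
{
    LOADFULLSORT,      /* accumulate a small batch, sorting on all keys */
    READFULLSORT,      /* return tuples from the finished full sort */
    LOADPREFIXSORT,    /* large group detected: accumulate, sorting suffix keys only */
    READPREFIXSORT     /* return tuples from the finished prefix sort */
} incsort_status;

static incsort_status
next_status(incsort_status cur, bool group_boundary_found,
            bool large_group_detected, bool sorted_batch_drained)
{
    switch (cur)
    {
        case LOADFULLSORT:
            if (group_boundary_found)
                return READFULLSORT;      /* sort the batch and read it out */
            if (large_group_detected)
                return READPREFIXSORT;    /* via switchToPresortedPrefixMode(); the real
                                           * code may instead keep loading (LOADPREFIXSORT)
                                           * if the whole buffered batch shared one prefix */
            return LOADFULLSORT;
        case READFULLSORT:
        case READPREFIXSORT:
            return sorted_batch_drained ? LOADFULLSORT : cur;
        case LOADPREFIXSORT:
            return group_boundary_found ? READPREFIXSORT : LOADPREFIXSORT;
    }
    return cur;
}

int
main(void)
{
    incsort_status s = LOADFULLSORT;

    s = next_status(s, true, false, false);
    printf("%d\n", s);          /* 1: READFULLSORT */
    return 0;
}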
+ * ---------------------------------------------------------------- + */ +IncrementalSortState * +ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags) +{ + IncrementalSortState *incrsortstate; + + SO_printf("ExecInitIncrementalSort: initializing sort node\n"); + + /* + * Incremental sort can't be used with EXEC_FLAG_BACKWARD or + * EXEC_FLAG_MARK, because the current sort state contains only one sort + * batch rather than the full result set. + */ + Assert((eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) == 0); + + /* Initialize state structure. */ + incrsortstate = makeNode(IncrementalSortState); + incrsortstate->ss.ps.plan = (Plan *) node; + incrsortstate->ss.ps.state = estate; + incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort; + + incrsortstate->execution_status = INCSORT_LOADFULLSORT; + incrsortstate->bounded = false; + incrsortstate->outerNodeDone = false; + incrsortstate->bound_Done = 0; + incrsortstate->fullsort_state = NULL; + incrsortstate->prefixsort_state = NULL; + incrsortstate->group_pivot = NULL; + incrsortstate->transfer_tuple = NULL; + incrsortstate->n_fullsort_remaining = 0; + incrsortstate->presorted_keys = NULL; + + if (incrsortstate->ss.ps.instrument != NULL) + { + IncrementalSortGroupInfo *fullsortGroupInfo = + &incrsortstate->incsort_info.fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo = + &incrsortstate->incsort_info.prefixsortGroupInfo; + + fullsortGroupInfo->groupCount = 0; + fullsortGroupInfo->maxDiskSpaceUsed = 0; + fullsortGroupInfo->totalDiskSpaceUsed = 0; + fullsortGroupInfo->maxMemorySpaceUsed = 0; + fullsortGroupInfo->totalMemorySpaceUsed = 0; + fullsortGroupInfo->sortMethods = 0; + prefixsortGroupInfo->groupCount = 0; + prefixsortGroupInfo->maxDiskSpaceUsed = 0; + prefixsortGroupInfo->totalDiskSpaceUsed = 0; + prefixsortGroupInfo->maxMemorySpaceUsed = 0; + prefixsortGroupInfo->totalMemorySpaceUsed = 0; + prefixsortGroupInfo->sortMethods = 0; + } + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * Initialize child nodes. + * + * Incremental sort does not support backwards scans and mark/restore, so + * we don't bother removing the flags from eflags here. We allow passing a + * REWIND flag, because although incremental sort can't use it, the child + * nodes may be able to do something more useful. + */ + outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple); + + /* + * Initialize return slot and type. No need to initialize projection info + * because we don't do any projections. + */ + ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple); + incrsortstate->ss.ps.ps_ProjInfo = NULL; + + /* + * Initialize standalone slots to store a tuple for pivot prefix keys and + * for carrying over a tuple from one batch to the next. 
+ */ + incrsortstate->group_pivot = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + incrsortstate->transfer_tuple = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + + SO_printf("ExecInitIncrementalSort: sort node initialized\n"); + + return incrsortstate; +} + +/* ---------------------------------------------------------------- + * ExecEndIncrementalSort(node) + * ---------------------------------------------------------------- + */ +void +ExecEndIncrementalSort(IncrementalSortState *node) +{ + SO_printf("ExecEndIncrementalSort: shutting down sort node\n"); + + /* clean out the scan tuple */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + /* must drop standalone tuple slots from outer node */ + ExecDropSingleTupleTableSlot(node->group_pivot); + ExecDropSingleTupleTableSlot(node->transfer_tuple); + + /* + * Release tuplesort resources. + */ + if (node->fullsort_state != NULL) + { + tuplesort_end(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_end(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * Shut down the subplan. + */ + ExecEndNode(outerPlanState(node)); + + SO_printf("ExecEndIncrementalSort: sort node shutdown\n"); +} + +void +ExecReScanIncrementalSort(IncrementalSortState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * Incremental sort doesn't support efficient rescan even when parameters + * haven't changed (e.g., rewind) because unlike regular sort we don't + * store all tuples at once for the full sort. + * + * So even if EXEC_FLAG_REWIND is set we just reset all of our state and + * re-execute the sort along with the child node. Incremental sort itself + * can't do anything smarter, but maybe the child nodes can. + * + * In theory if we've only filled the full sort with one batch (and + * haven't reset it for a new batch yet) then we could efficiently rewind, + * but that seems a narrow enough case that it's not worth handling + * specially at this time. + */ + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->group_pivot != NULL) + ExecClearTuple(node->group_pivot); + if (node->transfer_tuple != NULL) + ExecClearTuple(node->transfer_tuple); + + node->outerNodeDone = false; + node->n_fullsort_remaining = 0; + node->bound_Done = 0; + node->presorted_keys = NULL; + + node->execution_status = INCSORT_LOADFULLSORT; + + /* + * If we've set up either of the sort states yet, we need to reset them. + * We could end them and null out the pointers, but there's no reason to + * repay the setup cost, and because ExecIncrementalSort guards presorted + * column functions by checking to see if the full sort state has been + * initialized yet, setting the sort states to null here might actually + * cause a leak. + */ + if (node->fullsort_state != NULL) + { + tuplesort_reset(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_reset(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * If chgParam of subnode is not null, then the plan will be re-scanned by + * the first ExecProcNode. 
*/ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecIncrementalSortEstimate + * + * Estimate space required to propagate sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo)); + size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecIncrementalSortInitializeDSM + * + * Initialize DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedIncrementalSortInfo, sinfo) + + pcxt->nworkers * sizeof(IncrementalSortInfo); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecIncrementalSortInitializeWorker + * + * Attach worker to DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); + node->am_worker = true; +} + +/* ---------------------------------------------------------------- + * ExecIncrementalSortRetrieveInstrumentation + * + * Transfer sort statistics from DSM to private memory.
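ExecIncrementalSortInitializeDSM() above sizes the shared area as the struct header plus one per-worker slot, using offsetof() so the flexible array member at the end is counted correctly, and then zeroes the whole thing so unfilled slots read back as zero. The same sizing pattern in a standalone sketch; the WorkerInfo/SharedInfo types are invented and malloc() stands in for shm_toc_allocate().

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>

typedef struct { long groups; long space_used; } WorkerInfo;   /* invented slot type */

typedef struct
{
    int        num_workers;
    WorkerInfo sinfo[];            /* flexible array member, one slot per worker */
} SharedInfo;

int
main(void)
{
    int nworkers = 4;
    size_t size = offsetof(SharedInfo, sinfo) + nworkers * sizeof(WorkerInfo);
    SharedInfo *shared = malloc(size);     /* the real code allocates from the DSM toc */

    memset(shared, 0, size);               /* unfilled slots read back as zeroes */
    shared->num_workers = nworkers;

    printf("header=%zu bytes, total=%zu bytes\n",
           offsetof(SharedInfo, sinfo), size);
    free(shared);
    return 0;
}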
+ * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node) +{ + Size size; + SharedIncrementalSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedIncrementalSortInfo, sinfo) + + node->shared_info->num_workers * sizeof(IncrementalSortInfo); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c new file mode 100644 index 0000000..8fee958 --- /dev/null +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -0,0 +1,735 @@ +/*------------------------------------------------------------------------- + * + * nodeIndexonlyscan.c + * Routines to support index-only scans + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeIndexonlyscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecIndexOnlyScan scans an index + * IndexOnlyNext retrieve next tuple + * ExecInitIndexOnlyScan creates and initializes state info. + * ExecReScanIndexOnlyScan rescans the indexed relation. + * ExecEndIndexOnlyScan releases all storage. + * ExecIndexOnlyMarkPos marks scan position. + * ExecIndexOnlyRestrPos restores scan position. + * ExecIndexOnlyScanEstimate estimates DSM space needed for + * parallel index-only scan + * ExecIndexOnlyScanInitializeDSM initialize DSM for parallel + * index-only scan + * ExecIndexOnlyScanReInitializeDSM reinitialize DSM for fresh scan + * ExecIndexOnlyScanInitializeWorker attach to DSM info in parallel worker + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/tupdesc.h" +#include "access/visibilitymap.h" +#include "executor/execdebug.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); +static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, + TupleDesc itupdesc); + + +/* ---------------------------------------------------------------- + * IndexOnlyNext + * + * Retrieve a tuple from the IndexOnlyScan node's index. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +IndexOnlyNext(IndexOnlyScanState *node) +{ + EState *estate; + ExprContext *econtext; + ScanDirection direction; + IndexScanDesc scandesc; + TupleTableSlot *slot; + ItemPointer tid; + + /* + * extract necessary information from index scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + /* flip direction if this is an overall backward scan */ + if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) + { + if (ScanDirectionIsForward(direction)) + direction = BackwardScanDirection; + else if (ScanDirectionIsBackward(direction)) + direction = ForwardScanDirection; + } + scandesc = node->ioss_ScanDesc; + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the index only scan is not parallel, or if we're + * serially executing an index only scan that was planned to be + * parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + estate->es_snapshot, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys); + + node->ioss_ScanDesc = scandesc; + + + /* Set it up for index-only scan */ + node->ioss_ScanDesc->xs_want_itup = true; + node->ioss_VMBuffer = InvalidBuffer; + + /* + * If no run-time keys to calculate or they are ready, go ahead and + * pass the scankeys to the index AM. + */ + if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady) + index_rescan(scandesc, + node->ioss_ScanKeys, + node->ioss_NumScanKeys, + node->ioss_OrderByKeys, + node->ioss_NumOrderByKeys); + } + + /* + * OK, now that we have what we need, fetch the next tuple. + */ + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + { + bool tuple_from_heap = false; + + CHECK_FOR_INTERRUPTS(); + + /* + * We can skip the heap fetch if the TID references a heap page on + * which all tuples are known visible to everybody. In any case, + * we'll use the index tuple not the heap tuple as the data source. + * + * Note on Memory Ordering Effects: visibilitymap_get_status does not + * lock the visibility map buffer, and therefore the result we read + * here could be slightly stale. However, it can't be stale enough to + * matter. + * + * We need to detect clearing a VM bit due to an insert right away, + * because the tuple is present in the index page but not visible. The + * reading of the TID by this scan (using a shared lock on the index + * buffer) is serialized with the insert of the TID into the index + * (using an exclusive lock on the index buffer). Because the VM bit + * is cleared before updating the index, and locking/unlocking of the + * index page acts as a full memory barrier, we are sure to see the + * cleared bit if we see a recently-inserted TID. + * + * Deletes do not update the index page (only VACUUM will clear out + * the TID), so the clearing of the VM bit by a delete is not + * serialized with this test below, and we may see a value that is + * significantly stale. However, we don't care about the delete right + * away, because the tuple is still visible until the deleting + * transaction commits or the statement ends (if it's our + * transaction). In either case, the lock on the VM buffer will have + * been released (acting as a write barrier) after clearing the bit. 
+ * And for us to have a snapshot that includes the deleting + * transaction (making the tuple invisible), we must have acquired + * ProcArrayLock after that time, acting as a read barrier. + * + * It's worth going through this complexity to avoid needing to lock + * the VM buffer, which could cause significant contention. + */ + if (!VM_ALL_VISIBLE(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer)) + { + /* + * Rats, we have to visit the heap to check visibility. + */ + InstrCountTuples2(node, 1); + if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) + continue; /* no visible tuple, try next index entry */ + + ExecClearTuple(node->ioss_TableSlot); + + /* + * Only MVCC snapshots are supported here, so there should be no + * need to keep following the HOT chain once a visible entry has + * been found. If we did want to allow that, we'd need to keep + * more state to remember not to call index_getnext_tid next time. + */ + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + + /* + * Note: at this point we are holding a pin on the heap page, as + * recorded in scandesc->xs_cbuf. We could release that pin now, + * but it's not clear whether it's a win to do so. The next index + * entry might require a visit to the same heap page. + */ + + tuple_from_heap = true; + } + + /* + * Fill the scan tuple slot with data from the index. This might be + * provided in either HeapTuple or IndexTuple format. Conceivably an + * index AM might fill both fields, in which case we prefer the heap + * format, since it's probably a bit cheaper to fill a slot from. + */ + if (scandesc->xs_hitup) + { + /* + * We don't take the trouble to verify that the provided tuple has + * exactly the slot's format, but it seems worth doing a quick + * check on the number of fields. + */ + Assert(slot->tts_tupleDescriptor->natts == + scandesc->xs_hitupdesc->natts); + ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false); + } + else if (scandesc->xs_itup) + StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc); + else + elog(ERROR, "no data returned for index-only scan"); + + /* + * If the index was lossy, we have to recheck the index quals. + */ + if (scandesc->xs_recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->recheckqual, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + continue; + } + } + + /* + * We don't currently support rechecking ORDER BY distances. (In + * principle, if the index can support retrieval of the originally + * indexed value, it should be able to produce an exact distance + * calculation too. So it's not clear that adding code here for + * recheck/re-sort would be worth the trouble. But we should at least + * throw an error if someone tries it.) + */ + if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("lossy distance functions are not supported in index-only scans"))); + + /* + * If we didn't access the heap, then we'll need to take a predicate + * lock explicitly, as if we had. For now we do that at page level. + */ + if (!tuple_from_heap) + PredicateLockPage(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + estate->es_snapshot); + + return slot; + } + + /* + * if we get here it means the index scan failed so we are at the end of + * the scan.. 
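The loop above boils down to one decision per index entry: if the visibility map says every tuple on the TID's heap page is visible to everyone, answer straight from the index tuple; otherwise pay for a heap fetch (and skip the entry if that fetch finds nothing visible). Below is a hedged standalone sketch of that control flow; the stub functions are invented for illustration, whereas the executor itself uses VM_ALL_VISIBLE and index_fetch_heap.

#include <stdio.h>
#include <stdbool.h>

/* Stubs for illustration only; they fake the visibility map and heap checks. */
static bool page_all_visible(int blkno)  { return blkno % 2 == 0; }
static bool heap_tuple_visible(int tid)  { return tid % 3 != 0; }

/* Returns true if the index entry should produce a row; *from_heap says how. */
static bool
index_only_visible(int blkno, int tid, bool *from_heap, long *heap_fetches)
{
    *from_heap = false;
    if (page_all_visible(blkno))
        return true;               /* answer from the index tuple, no heap visit */

    (*heap_fetches)++;             /* counted toward EXPLAIN ANALYZE's "Heap Fetches" */
    if (!heap_tuple_visible(tid))
        return false;              /* not visible to us: try the next index entry */

    *from_heap = true;
    return true;
}

int
main(void)
{
    long heap_fetches = 0;
    bool from_heap;

    for (int tid = 1; tid <= 6; tid++)
        if (index_only_visible(tid % 2, tid, &from_heap, &heap_fetches))
            printf("tid %d returned (%s)\n", tid,
                   from_heap ? "heap checked" : "index only");
    printf("heap fetches: %ld\n", heap_fetches);
    return 0;
}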
+ */ + return ExecClearTuple(slot); +} + +/* + * StoreIndexTuple + * Fill the slot with data from the index tuple. + * + * At some point this might be generally-useful functionality, but + * right now we don't need it elsewhere. + */ +static void +StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc) +{ + /* + * Note: we must use the tupdesc supplied by the AM in index_deform_tuple, + * not the slot's tupdesc, in case the latter has different datatypes + * (this happens for btree name_ops in particular). They'd better have + * the same number of columns though, as well as being datatype-compatible + * which is something we can't so easily check. + */ + Assert(slot->tts_tupleDescriptor->natts == itupdesc->natts); + + ExecClearTuple(slot); + index_deform_tuple(itup, itupdesc, slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); +} + +/* + * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual + * + * This can't really happen, since an index can't supply CTID which would + * be necessary data for any potential EvalPlanQual target relation. If it + * did happen, the EPQ code would pass us the wrong data, namely a heap + * tuple not an index tuple. So throw an error. + */ +static bool +IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot) +{ + elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans"); + return false; /* keep compiler quiet */ +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScan(node) + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecIndexOnlyScan(PlanState *pstate) +{ + IndexOnlyScanState *node = castNode(IndexOnlyScanState, pstate); + + /* + * If we have runtime keys and they've not already been set up, do it now. + */ + if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady) + ExecReScan((PlanState *) node); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) IndexOnlyNext, + (ExecScanRecheckMtd) IndexOnlyRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanIndexOnlyScan(node) + * + * Recalculates the values of any scan keys whose value depends on + * information known at runtime, then rescans the indexed relation. + * + * Updating the scan key was formerly done separately in + * ExecUpdateIndexScanKeys. Integrating it into ReScan makes + * rescans of indices and relations/general streams more uniform. + * ---------------------------------------------------------------- + */ +void +ExecReScanIndexOnlyScan(IndexOnlyScanState *node) +{ + /* + * If we are doing runtime key calculations (ie, any of the index key + * values weren't simple Consts), compute the new key values. But first, + * reset the context so we don't leak memory as each outer tuple is + * scanned. Note this assumes that we will recalculate *all* runtime keys + * on each call. 
+ */ + if (node->ioss_NumRuntimeKeys != 0) + { + ExprContext *econtext = node->ioss_RuntimeContext; + + ResetExprContext(econtext); + ExecIndexEvalRuntimeKeys(econtext, + node->ioss_RuntimeKeys, + node->ioss_NumRuntimeKeys); + } + node->ioss_RuntimeKeysReady = true; + + /* reset index scan */ + if (node->ioss_ScanDesc) + index_rescan(node->ioss_ScanDesc, + node->ioss_ScanKeys, node->ioss_NumScanKeys, + node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); + + ExecScanReScan(&node->ss); +} + + +/* ---------------------------------------------------------------- + * ExecEndIndexOnlyScan + * ---------------------------------------------------------------- + */ +void +ExecEndIndexOnlyScan(IndexOnlyScanState *node) +{ + Relation indexRelationDesc; + IndexScanDesc indexScanDesc; + + /* + * extract information from the node + */ + indexRelationDesc = node->ioss_RelationDesc; + indexScanDesc = node->ioss_ScanDesc; + + /* Release VM buffer pin, if any. */ + if (node->ioss_VMBuffer != InvalidBuffer) + { + ReleaseBuffer(node->ioss_VMBuffer); + node->ioss_VMBuffer = InvalidBuffer; + } + + /* + * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext + */ +#ifdef NOT_USED + ExecFreeExprContext(&node->ss.ps); + if (node->ioss_RuntimeContext) + FreeExprContext(node->ioss_RuntimeContext, true); +#endif + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close the index relation (no-op if we didn't open it) + */ + if (indexScanDesc) + index_endscan(indexScanDesc); + if (indexRelationDesc) + index_close(indexRelationDesc, NoLock); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyMarkPos + * + * Note: we assume that no caller attempts to set a mark before having read + * at least one tuple. Otherwise, ioss_ScanDesc might still be NULL. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyMarkPos(IndexOnlyScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (epqstate != NULL) + { + /* + * We are inside an EvalPlanQual recheck. If a test tuple exists for + * this relation, then we shouldn't access the index at all. We would + * instead need to save, and later restore, the state of the + * relsubs_done flag, so that re-fetching the test tuple is possible. + * However, given the assumption that no caller sets a mark at the + * start of the scan, we can only get here with relsubs_done[i] + * already set, and so no state need be saved. 
+ */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexOnlyMarkPos call in EPQ recheck"); + return; + } + } + + index_markpos(node->ioss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyRestrPos + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyRestrPos(IndexOnlyScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (estate->es_epq_active != NULL) + { + /* See comments in ExecIndexMarkPos */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexOnlyRestrPos call in EPQ recheck"); + return; + } + } + + index_restrpos(node->ioss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecInitIndexOnlyScan + * + * Initializes the index scan's state information, creates + * scan keys, and opens the base and index relations. + * + * Note: index scans have 2 sets of state information because + * we have to keep track of the base relation and the + * index relation. + * ---------------------------------------------------------------- + */ +IndexOnlyScanState * +ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) +{ + IndexOnlyScanState *indexstate; + Relation currentRelation; + LOCKMODE lockmode; + TupleDesc tupDesc; + + /* + * create state structure + */ + indexstate = makeNode(IndexOnlyScanState); + indexstate->ss.ps.plan = (Plan *) node; + indexstate->ss.ps.state = estate; + indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &indexstate->ss.ps); + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + indexstate->ss.ss_currentRelation = currentRelation; + indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + + /* + * Build the scan tuple type using the indextlist generated by the + * planner. We use this, rather than the index's physical tuple + * descriptor, because the latter contains storage column types not the + * types of the original datums. (It's the AM's responsibility to return + * suitable data anyway.) + */ + tupDesc = ExecTypeFromTL(node->indextlist); + ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc, + &TTSOpsVirtual); + + /* + * We need another slot, in a format that's suitable for the table AM, for + * when we need to fetch a tuple from the table for rechecking visibility. + */ + indexstate->ioss_TableSlot = + ExecAllocTableSlot(&estate->es_tupleTable, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection info. The node's targetlist will + * contain Vars with varno = INDEX_VAR, referencing the scan tuple. 
+ */ + ExecInitResultTypeTL(&indexstate->ss.ps); + ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR); + + /* + * initialize child expressions + * + * Note: we don't initialize all of the indexorderby expression, only the + * sub-parts corresponding to runtime keys (see below). + */ + indexstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate); + indexstate->recheckqual = + ExecInitQual(node->recheckqual, (PlanState *) indexstate); + + /* + * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop + * here. This allows an index-advisor plugin to EXPLAIN a plan containing + * references to nonexistent indexes. + */ + if (eflags & EXEC_FLAG_EXPLAIN_ONLY) + return indexstate; + + /* Open the index relation. */ + lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; + indexstate->ioss_RelationDesc = index_open(node->indexid, lockmode); + + /* + * Initialize index-specific scan state + */ + indexstate->ioss_RuntimeKeysReady = false; + indexstate->ioss_RuntimeKeys = NULL; + indexstate->ioss_NumRuntimeKeys = 0; + + /* + * build the index scan keys from the index qualification + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->ioss_RelationDesc, + node->indexqual, + false, + &indexstate->ioss_ScanKeys, + &indexstate->ioss_NumScanKeys, + &indexstate->ioss_RuntimeKeys, + &indexstate->ioss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* + * any ORDER BY exprs have to be turned into scankeys in the same way + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->ioss_RelationDesc, + node->indexorderby, + true, + &indexstate->ioss_OrderByKeys, + &indexstate->ioss_NumOrderByKeys, + &indexstate->ioss_RuntimeKeys, + &indexstate->ioss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* + * If we have runtime keys, we need an ExprContext to evaluate them. The + * node's standard context won't do because we want to reset that context + * for every tuple. So, build another context just like the other one... + * -tgl 7/11/00 + */ + if (indexstate->ioss_NumRuntimeKeys != 0) + { + ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext; + + ExecAssignExprContext(estate, &indexstate->ss.ps); + indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext; + indexstate->ss.ps.ps_ExprContext = stdecontext; + } + else + { + indexstate->ioss_RuntimeContext = NULL; + } + + /* + * all done. + */ + return indexstate; +} + +/* ---------------------------------------------------------------- + * Parallel Index-only Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc, + estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanInitializeDSM + * + * Set up a parallel index-only scan descriptor. 
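+ *
+ * The leader allocates the descriptor in dynamic shared memory through
+ * the pcxt->toc table of contents and begins its own scan on it; each
+ * worker later attaches to the same descriptor in
+ * ExecIndexOnlyScanInitializeWorker.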
+ * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + ParallelIndexScanDesc piscan; + + piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen); + index_parallelscan_initialize(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + estate->es_snapshot, + piscan); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan); + node->ioss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, + piscan); + node->ioss_ScanDesc->xs_want_itup = true; + node->ioss_VMBuffer = InvalidBuffer; + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. + */ + if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady) + index_rescan(node->ioss_ScanDesc, + node->ioss_ScanKeys, node->ioss_NumScanKeys, + node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->ioss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanInitializeWorker + * + * Copy relevant information from TOC into planstate. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelIndexScanDesc piscan; + + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->ioss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, + piscan); + node->ioss_ScanDesc->xs_want_itup = true; + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. + */ + if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady) + index_rescan(node->ioss_ScanDesc, + node->ioss_ScanKeys, node->ioss_NumScanKeys, + node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); +} diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c new file mode 100644 index 0000000..add29b3 --- /dev/null +++ b/src/backend/executor/nodeIndexscan.c @@ -0,0 +1,1747 @@ +/*------------------------------------------------------------------------- + * + * nodeIndexscan.c + * Routines to support indexed scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeIndexscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecIndexScan scans a relation using an index + * IndexNext retrieve next tuple using index + * IndexNextWithReorder same, but recheck ORDER BY expressions + * ExecInitIndexScan creates and initializes state info. + * ExecReScanIndexScan rescans the indexed relation. + * ExecEndIndexScan releases all storage. + * ExecIndexMarkPos marks scan position. + * ExecIndexRestrPos restores scan position. 
+ * ExecIndexScanEstimate estimates DSM space needed for parallel index scan + * ExecIndexScanInitializeDSM initialize DSM for parallel indexscan + * ExecIndexScanReInitializeDSM reinitialize DSM for fresh scan + * ExecIndexScanInitializeWorker attach to DSM info in parallel worker + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "catalog/pg_am.h" +#include "executor/execdebug.h" +#include "executor/nodeIndexscan.h" +#include "lib/pairingheap.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "utils/array.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * When an ordering operator is used, tuples fetched from the index that + * need to be reordered are queued in a pairing heap, as ReorderTuples. + */ +typedef struct +{ + pairingheap_node ph_node; + HeapTuple htup; + Datum *orderbyvals; + bool *orderbynulls; +} ReorderTuple; + +static TupleTableSlot *IndexNext(IndexScanState *node); +static TupleTableSlot *IndexNextWithReorder(IndexScanState *node); +static void EvalOrderByExpressions(IndexScanState *node, ExprContext *econtext); +static bool IndexRecheck(IndexScanState *node, TupleTableSlot *slot); +static int cmp_orderbyvals(const Datum *adist, const bool *anulls, + const Datum *bdist, const bool *bnulls, + IndexScanState *node); +static int reorderqueue_cmp(const pairingheap_node *a, + const pairingheap_node *b, void *arg); +static void reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, + Datum *orderbyvals, bool *orderbynulls); +static HeapTuple reorderqueue_pop(IndexScanState *node); + + +/* ---------------------------------------------------------------- + * IndexNext + * + * Retrieve a tuple from the IndexScan node's currentRelation + * using the index specified in the IndexScanState information. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +IndexNext(IndexScanState *node) +{ + EState *estate; + ExprContext *econtext; + ScanDirection direction; + IndexScanDesc scandesc; + TupleTableSlot *slot; + + /* + * extract necessary information from index scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + /* flip direction if this is an overall backward scan */ + if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)) + { + if (ScanDirectionIsForward(direction)) + direction = BackwardScanDirection; + else if (ScanDirectionIsBackward(direction)) + direction = ForwardScanDirection; + } + scandesc = node->iss_ScanDesc; + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the index scan is not parallel, or if we're + * serially executing an index scan that was planned to be parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys); + + node->iss_ScanDesc = scandesc; + + /* + * If no run-time keys to calculate or they are ready, go ahead and + * pass the scankeys to the index AM. + */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(scandesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); + } + + /* + * ok, now that we have what we need, fetch the next tuple. 
+ */ + while (index_getnext_slot(scandesc, direction, slot)) + { + CHECK_FOR_INTERRUPTS(); + + /* + * If the index was lossy, we have to recheck the index quals using + * the fetched tuple. + */ + if (scandesc->xs_recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + continue; + } + } + + return slot; + } + + /* + * if we get here it means the index scan failed so we are at the end of + * the scan.. + */ + node->iss_ReachedEnd = true; + return ExecClearTuple(slot); +} + +/* ---------------------------------------------------------------- + * IndexNextWithReorder + * + * Like IndexNext, but this version can also re-check ORDER BY + * expressions, and reorder the tuples as necessary. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +IndexNextWithReorder(IndexScanState *node) +{ + EState *estate; + ExprContext *econtext; + IndexScanDesc scandesc; + TupleTableSlot *slot; + ReorderTuple *topmost = NULL; + bool was_exact; + Datum *lastfetched_vals; + bool *lastfetched_nulls; + int cmp; + + estate = node->ss.ps.state; + + /* + * Only forward scan is supported with reordering. Note: we can get away + * with just Asserting here because the system will not try to run the + * plan backwards if ExecSupportsBackwardScan() says it won't work. + * Currently, that is guaranteed because no index AMs support both + * amcanorderbyop and amcanbackward; if any ever do, + * ExecSupportsBackwardScan() will need to consider indexorderbys + * explicitly. + */ + Assert(!ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)); + Assert(ScanDirectionIsForward(estate->es_direction)); + + scandesc = node->iss_ScanDesc; + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the index scan is not parallel, or if we're + * serially executing an index scan that was planned to be parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys); + + node->iss_ScanDesc = scandesc; + + /* + * If no run-time keys to calculate or they are ready, go ahead and + * pass the scankeys to the index AM. + */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(scandesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); + } + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + /* + * Check the reorder queue first. If the topmost tuple in the queue + * has an ORDER BY value smaller than (or equal to) the value last + * returned by the index, we can return it now. + */ + if (!pairingheap_is_empty(node->iss_ReorderQueue)) + { + topmost = (ReorderTuple *) pairingheap_first(node->iss_ReorderQueue); + + if (node->iss_ReachedEnd || + cmp_orderbyvals(topmost->orderbyvals, + topmost->orderbynulls, + scandesc->xs_orderbyvals, + scandesc->xs_orderbynulls, + node) <= 0) + { + HeapTuple tuple; + + tuple = reorderqueue_pop(node); + + /* Pass 'true', as the tuple in the queue is a palloc'd copy */ + ExecForceStoreHeapTuple(tuple, slot, true); + return slot; + } + } + else if (node->iss_ReachedEnd) + { + /* Queue is empty, and no more tuples from index. We're done. */ + return ExecClearTuple(slot); + } + + /* + * Fetch next tuple from the index. 
+ */ +next_indextuple: + if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + { + /* + * No more tuples from the index. But we still need to drain any + * remaining tuples from the queue before we're done. + */ + node->iss_ReachedEnd = true; + continue; + } + + /* + * If the index was lossy, we have to recheck the index quals and + * ORDER BY expressions using the fetched tuple. + */ + if (scandesc->xs_recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + goto next_indextuple; + } + } + + if (scandesc->xs_recheckorderby) + { + econtext->ecxt_scantuple = slot; + ResetExprContext(econtext); + EvalOrderByExpressions(node, econtext); + + /* + * Was the ORDER BY value returned by the index accurate? The + * recheck flag means that the index can return inaccurate values, + * but then again, the value returned for any particular tuple + * could also be exactly correct. Compare the value returned by + * the index with the recalculated value. (If the value returned + * by the index happened to be exact right, we can often avoid + * pushing the tuple to the queue, just to pop it back out again.) + */ + cmp = cmp_orderbyvals(node->iss_OrderByValues, + node->iss_OrderByNulls, + scandesc->xs_orderbyvals, + scandesc->xs_orderbynulls, + node); + if (cmp < 0) + elog(ERROR, "index returned tuples in wrong order"); + else if (cmp == 0) + was_exact = true; + else + was_exact = false; + lastfetched_vals = node->iss_OrderByValues; + lastfetched_nulls = node->iss_OrderByNulls; + } + else + { + was_exact = true; + lastfetched_vals = scandesc->xs_orderbyvals; + lastfetched_nulls = scandesc->xs_orderbynulls; + } + + /* + * Can we return this tuple immediately, or does it need to be pushed + * to the reorder queue? If the ORDER BY expression values returned + * by the index were inaccurate, we can't return it yet, because the + * next tuple from the index might need to come before this one. Also, + * we can't return it yet if there are any smaller tuples in the queue + * already. + */ + if (!was_exact || (topmost && cmp_orderbyvals(lastfetched_vals, + lastfetched_nulls, + topmost->orderbyvals, + topmost->orderbynulls, + node) > 0)) + { + /* Put this tuple to the queue */ + reorderqueue_push(node, slot, lastfetched_vals, lastfetched_nulls); + continue; + } + else + { + /* Can return this tuple immediately. */ + return slot; + } + } + + /* + * if we get here it means the index scan failed so we are at the end of + * the scan.. + */ + return ExecClearTuple(slot); +} + +/* + * Calculate the expressions in the ORDER BY clause, based on the heap tuple. 
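+ *
+ * The recomputed values are stored in iss_OrderByValues / iss_OrderByNulls,
+ * where the caller compares them against the (possibly inexact) ORDER BY
+ * values that the index AM reported for the same tuple.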
+ */ +static void +EvalOrderByExpressions(IndexScanState *node, ExprContext *econtext) +{ + int i; + ListCell *l; + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + i = 0; + foreach(l, node->indexorderbyorig) + { + ExprState *orderby = (ExprState *) lfirst(l); + + node->iss_OrderByValues[i] = ExecEvalExpr(orderby, + econtext, + &node->iss_OrderByNulls[i]); + i++; + } + + MemoryContextSwitchTo(oldContext); +} + +/* + * IndexRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +IndexRecheck(IndexScanState *node, TupleTableSlot *slot) +{ + ExprContext *econtext; + + /* + * extract necessary information from index scan node + */ + econtext = node->ss.ps.ps_ExprContext; + + /* Does the tuple meet the indexqual condition? */ + econtext->ecxt_scantuple = slot; + return ExecQualAndReset(node->indexqualorig, econtext); +} + + +/* + * Compare ORDER BY expression values. + */ +static int +cmp_orderbyvals(const Datum *adist, const bool *anulls, + const Datum *bdist, const bool *bnulls, + IndexScanState *node) +{ + int i; + int result; + + for (i = 0; i < node->iss_NumOrderByKeys; i++) + { + SortSupport ssup = &node->iss_SortSupport[i]; + + /* + * Handle nulls. We only need to support NULLS LAST ordering, because + * match_pathkeys_to_index() doesn't consider indexorderby + * implementation otherwise. + */ + if (anulls[i] && !bnulls[i]) + return 1; + else if (!anulls[i] && bnulls[i]) + return -1; + else if (anulls[i] && bnulls[i]) + return 0; + + result = ssup->comparator(adist[i], bdist[i], ssup); + if (result != 0) + return result; + } + + return 0; +} + +/* + * Pairing heap provides getting topmost (greatest) element while KNN provides + * ascending sort. That's why we invert the sort order. + */ +static int +reorderqueue_cmp(const pairingheap_node *a, const pairingheap_node *b, + void *arg) +{ + ReorderTuple *rta = (ReorderTuple *) a; + ReorderTuple *rtb = (ReorderTuple *) b; + IndexScanState *node = (IndexScanState *) arg; + + /* exchange argument order to invert the sort order */ + return cmp_orderbyvals(rtb->orderbyvals, rtb->orderbynulls, + rta->orderbyvals, rta->orderbynulls, + node); +} + +/* + * Helper function to push a tuple to the reorder queue. + */ +static void +reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, + Datum *orderbyvals, bool *orderbynulls) +{ + IndexScanDesc scandesc = node->iss_ScanDesc; + EState *estate = node->ss.ps.state; + MemoryContext oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + ReorderTuple *rt; + int i; + + rt = (ReorderTuple *) palloc(sizeof(ReorderTuple)); + rt->htup = ExecCopySlotHeapTuple(slot); + rt->orderbyvals = + (Datum *) palloc(sizeof(Datum) * scandesc->numberOfOrderBys); + rt->orderbynulls = + (bool *) palloc(sizeof(bool) * scandesc->numberOfOrderBys); + for (i = 0; i < node->iss_NumOrderByKeys; i++) + { + if (!orderbynulls[i]) + rt->orderbyvals[i] = datumCopy(orderbyvals[i], + node->iss_OrderByTypByVals[i], + node->iss_OrderByTypLens[i]); + else + rt->orderbyvals[i] = (Datum) 0; + rt->orderbynulls[i] = orderbynulls[i]; + } + pairingheap_add(node->iss_ReorderQueue, &rt->ph_node); + + MemoryContextSwitchTo(oldContext); +} + +/* + * Helper function to pop the next tuple from the reorder queue. 
+ */ +static HeapTuple +reorderqueue_pop(IndexScanState *node) +{ + HeapTuple result; + ReorderTuple *topmost; + int i; + + topmost = (ReorderTuple *) pairingheap_remove_first(node->iss_ReorderQueue); + + result = topmost->htup; + for (i = 0; i < node->iss_NumOrderByKeys; i++) + { + if (!node->iss_OrderByTypByVals[i] && !topmost->orderbynulls[i]) + pfree(DatumGetPointer(topmost->orderbyvals[i])); + } + pfree(topmost->orderbyvals); + pfree(topmost->orderbynulls); + pfree(topmost); + + return result; +} + + +/* ---------------------------------------------------------------- + * ExecIndexScan(node) + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecIndexScan(PlanState *pstate) +{ + IndexScanState *node = castNode(IndexScanState, pstate); + + /* + * If we have runtime keys and they've not already been set up, do it now. + */ + if (node->iss_NumRuntimeKeys != 0 && !node->iss_RuntimeKeysReady) + ExecReScan((PlanState *) node); + + if (node->iss_NumOrderByKeys > 0) + return ExecScan(&node->ss, + (ExecScanAccessMtd) IndexNextWithReorder, + (ExecScanRecheckMtd) IndexRecheck); + else + return ExecScan(&node->ss, + (ExecScanAccessMtd) IndexNext, + (ExecScanRecheckMtd) IndexRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanIndexScan(node) + * + * Recalculates the values of any scan keys whose value depends on + * information known at runtime, then rescans the indexed relation. + * + * Updating the scan key was formerly done separately in + * ExecUpdateIndexScanKeys. Integrating it into ReScan makes + * rescans of indices and relations/general streams more uniform. + * ---------------------------------------------------------------- + */ +void +ExecReScanIndexScan(IndexScanState *node) +{ + /* + * If we are doing runtime key calculations (ie, any of the index key + * values weren't simple Consts), compute the new key values. But first, + * reset the context so we don't leak memory as each outer tuple is + * scanned. Note this assumes that we will recalculate *all* runtime keys + * on each call. + */ + if (node->iss_NumRuntimeKeys != 0) + { + ExprContext *econtext = node->iss_RuntimeContext; + + ResetExprContext(econtext); + ExecIndexEvalRuntimeKeys(econtext, + node->iss_RuntimeKeys, + node->iss_NumRuntimeKeys); + } + node->iss_RuntimeKeysReady = true; + + /* flush the reorder queue */ + if (node->iss_ReorderQueue) + { + HeapTuple tuple; + while (!pairingheap_is_empty(node->iss_ReorderQueue)) + { + tuple = reorderqueue_pop(node); + heap_freetuple(tuple); + } + } + + /* reset index scan */ + if (node->iss_ScanDesc) + index_rescan(node->iss_ScanDesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); + node->iss_ReachedEnd = false; + + ExecScanReScan(&node->ss); +} + + +/* + * ExecIndexEvalRuntimeKeys + * Evaluate any runtime key values, and update the scankeys. + */ +void +ExecIndexEvalRuntimeKeys(ExprContext *econtext, + IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys) +{ + int j; + MemoryContext oldContext; + + /* We want to keep the key values in per-tuple memory */ + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + for (j = 0; j < numRuntimeKeys; j++) + { + ScanKey scan_key = runtimeKeys[j].scan_key; + ExprState *key_expr = runtimeKeys[j].key_expr; + Datum scanvalue; + bool isNull; + + /* + * For each run-time key, extract the run-time expression and evaluate + * it with respect to the current context. 
We then stick the result + * into the proper scan key. + * + * Note: the result of the eval could be a pass-by-ref value that's + * stored in some outer scan's tuple, not in + * econtext->ecxt_per_tuple_memory. We assume that the outer tuple + * will stay put throughout our scan. If this is wrong, we could copy + * the result into our context explicitly, but I think that's not + * necessary. + * + * It's also entirely possible that the result of the eval is a + * toasted value. In this case we should forcibly detoast it, to + * avoid repeat detoastings each time the value is examined by an + * index support function. + */ + scanvalue = ExecEvalExpr(key_expr, + econtext, + &isNull); + if (isNull) + { + scan_key->sk_argument = scanvalue; + scan_key->sk_flags |= SK_ISNULL; + } + else + { + if (runtimeKeys[j].key_toastable) + scanvalue = PointerGetDatum(PG_DETOAST_DATUM(scanvalue)); + scan_key->sk_argument = scanvalue; + scan_key->sk_flags &= ~SK_ISNULL; + } + } + + MemoryContextSwitchTo(oldContext); +} + +/* + * ExecIndexEvalArrayKeys + * Evaluate any array key values, and set up to iterate through arrays. + * + * Returns true if there are array elements to consider; false means there + * is at least one null or empty array, so no match is possible. On true + * result, the scankeys are initialized with the first elements of the arrays. + */ +bool +ExecIndexEvalArrayKeys(ExprContext *econtext, + IndexArrayKeyInfo *arrayKeys, int numArrayKeys) +{ + bool result = true; + int j; + MemoryContext oldContext; + + /* We want to keep the arrays in per-tuple memory */ + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + for (j = 0; j < numArrayKeys; j++) + { + ScanKey scan_key = arrayKeys[j].scan_key; + ExprState *array_expr = arrayKeys[j].array_expr; + Datum arraydatum; + bool isNull; + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + + /* + * Compute and deconstruct the array expression. (Notes in + * ExecIndexEvalRuntimeKeys() apply here too.) + */ + arraydatum = ExecEvalExpr(array_expr, + econtext, + &isNull); + if (isNull) + { + result = false; + break; /* no point in evaluating more */ + } + arrayval = DatumGetArrayTypeP(arraydatum); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + if (num_elems <= 0) + { + result = false; + break; /* no point in evaluating more */ + } + + /* + * Note: we expect the previous array data, if any, to be + * automatically freed by resetting the per-tuple context; hence no + * pfree's here. + */ + arrayKeys[j].elem_values = elem_values; + arrayKeys[j].elem_nulls = elem_nulls; + arrayKeys[j].num_elems = num_elems; + scan_key->sk_argument = elem_values[0]; + if (elem_nulls[0]) + scan_key->sk_flags |= SK_ISNULL; + else + scan_key->sk_flags &= ~SK_ISNULL; + arrayKeys[j].next_elem = 1; + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * ExecIndexAdvanceArrayKeys + * Advance to the next set of array key values, if any. + * + * Returns true if there is another set of values to consider, false if not. + * On true result, the scankeys are initialized with the next set of values. 
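+ *
+ * As an illustration, if ExecIndexEvalArrayKeys set up two array keys with
+ * element lists {1,2} and {10,20}, the scankeys start out at (1,10) and
+ * successive calls here advance them to (1,20), (2,10) and (2,20) before
+ * returning false; the rightmost key changes fastest (see below).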
+ */ +bool +ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys) +{ + bool found = false; + int j; + + /* + * Note we advance the rightmost array key most quickly, since it will + * correspond to the lowest-order index column among the available + * qualifications. This is hypothesized to result in better locality of + * access in the index. + */ + for (j = numArrayKeys - 1; j >= 0; j--) + { + ScanKey scan_key = arrayKeys[j].scan_key; + int next_elem = arrayKeys[j].next_elem; + int num_elems = arrayKeys[j].num_elems; + Datum *elem_values = arrayKeys[j].elem_values; + bool *elem_nulls = arrayKeys[j].elem_nulls; + + if (next_elem >= num_elems) + { + next_elem = 0; + found = false; /* need to advance next array key */ + } + else + found = true; + scan_key->sk_argument = elem_values[next_elem]; + if (elem_nulls[next_elem]) + scan_key->sk_flags |= SK_ISNULL; + else + scan_key->sk_flags &= ~SK_ISNULL; + arrayKeys[j].next_elem = next_elem + 1; + if (found) + break; + } + + return found; +} + + +/* ---------------------------------------------------------------- + * ExecEndIndexScan + * ---------------------------------------------------------------- + */ +void +ExecEndIndexScan(IndexScanState *node) +{ + Relation indexRelationDesc; + IndexScanDesc indexScanDesc; + + /* + * extract information from the node + */ + indexRelationDesc = node->iss_RelationDesc; + indexScanDesc = node->iss_ScanDesc; + + /* + * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext + */ +#ifdef NOT_USED + ExecFreeExprContext(&node->ss.ps); + if (node->iss_RuntimeContext) + FreeExprContext(node->iss_RuntimeContext, true); +#endif + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close the index relation (no-op if we didn't open it) + */ + if (indexScanDesc) + index_endscan(indexScanDesc); + if (indexRelationDesc) + index_close(indexRelationDesc, NoLock); +} + +/* ---------------------------------------------------------------- + * ExecIndexMarkPos + * + * Note: we assume that no caller attempts to set a mark before having read + * at least one tuple. Otherwise, iss_ScanDesc might still be NULL. + * ---------------------------------------------------------------- + */ +void +ExecIndexMarkPos(IndexScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (epqstate != NULL) + { + /* + * We are inside an EvalPlanQual recheck. If a test tuple exists for + * this relation, then we shouldn't access the index at all. We would + * instead need to save, and later restore, the state of the + * relsubs_done flag, so that re-fetching the test tuple is possible. + * However, given the assumption that no caller sets a mark at the + * start of the scan, we can only get here with relsubs_done[i] + * already set, and so no state need be saved. 
+ */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexMarkPos call in EPQ recheck"); + return; + } + } + + index_markpos(node->iss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexRestrPos + * ---------------------------------------------------------------- + */ +void +ExecIndexRestrPos(IndexScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (estate->es_epq_active != NULL) + { + /* See comments in ExecIndexMarkPos */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexRestrPos call in EPQ recheck"); + return; + } + } + + index_restrpos(node->iss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecInitIndexScan + * + * Initializes the index scan's state information, creates + * scan keys, and opens the base and index relations. + * + * Note: index scans have 2 sets of state information because + * we have to keep track of the base relation and the + * index relation. + * ---------------------------------------------------------------- + */ +IndexScanState * +ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) +{ + IndexScanState *indexstate; + Relation currentRelation; + LOCKMODE lockmode; + + /* + * create state structure + */ + indexstate = makeNode(IndexScanState); + indexstate->ss.ps.plan = (Plan *) node; + indexstate->ss.ps.state = estate; + indexstate->ss.ps.ExecProcNode = ExecIndexScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &indexstate->ss.ps); + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + indexstate->ss.ss_currentRelation = currentRelation; + indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &indexstate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&indexstate->ss.ps); + ExecAssignScanProjectionInfo(&indexstate->ss); + + /* + * initialize child expressions + * + * Note: we don't initialize all of the indexqual expression, only the + * sub-parts corresponding to runtime keys (see below). Likewise for + * indexorderby, if any. But the indexqualorig expression is always + * initialized even though it will only be used in some uncommon cases --- + * would be nice to improve that. (Problem is that any SubPlans present + * in the expression must be found now...) + */ + indexstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate); + indexstate->indexqualorig = + ExecInitQual(node->indexqualorig, (PlanState *) indexstate); + indexstate->indexorderbyorig = + ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate); + + /* + * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop + * here. 
This allows an index-advisor plugin to EXPLAIN a plan containing + * references to nonexistent indexes. + */ + if (eflags & EXEC_FLAG_EXPLAIN_ONLY) + return indexstate; + + /* Open the index relation. */ + lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; + indexstate->iss_RelationDesc = index_open(node->indexid, lockmode); + + /* + * Initialize index-specific scan state + */ + indexstate->iss_RuntimeKeysReady = false; + indexstate->iss_RuntimeKeys = NULL; + indexstate->iss_NumRuntimeKeys = 0; + + /* + * build the index scan keys from the index qualification + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->iss_RelationDesc, + node->indexqual, + false, + &indexstate->iss_ScanKeys, + &indexstate->iss_NumScanKeys, + &indexstate->iss_RuntimeKeys, + &indexstate->iss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* + * any ORDER BY exprs have to be turned into scankeys in the same way + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->iss_RelationDesc, + node->indexorderby, + true, + &indexstate->iss_OrderByKeys, + &indexstate->iss_NumOrderByKeys, + &indexstate->iss_RuntimeKeys, + &indexstate->iss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* Initialize sort support, if we need to re-check ORDER BY exprs */ + if (indexstate->iss_NumOrderByKeys > 0) + { + int numOrderByKeys = indexstate->iss_NumOrderByKeys; + int i; + ListCell *lco; + ListCell *lcx; + + /* + * Prepare sort support, and look up the data type for each ORDER BY + * expression. + */ + Assert(numOrderByKeys == list_length(node->indexorderbyops)); + Assert(numOrderByKeys == list_length(node->indexorderbyorig)); + indexstate->iss_SortSupport = (SortSupportData *) + palloc0(numOrderByKeys * sizeof(SortSupportData)); + indexstate->iss_OrderByTypByVals = (bool *) + palloc(numOrderByKeys * sizeof(bool)); + indexstate->iss_OrderByTypLens = (int16 *) + palloc(numOrderByKeys * sizeof(int16)); + i = 0; + forboth(lco, node->indexorderbyops, lcx, node->indexorderbyorig) + { + Oid orderbyop = lfirst_oid(lco); + Node *orderbyexpr = (Node *) lfirst(lcx); + Oid orderbyType = exprType(orderbyexpr); + Oid orderbyColl = exprCollation(orderbyexpr); + SortSupport orderbysort = &indexstate->iss_SortSupport[i]; + + /* Initialize sort support */ + orderbysort->ssup_cxt = CurrentMemoryContext; + orderbysort->ssup_collation = orderbyColl; + /* See cmp_orderbyvals() comments on NULLS LAST */ + orderbysort->ssup_nulls_first = false; + /* ssup_attno is unused here and elsewhere */ + orderbysort->ssup_attno = 0; + /* No abbreviation */ + orderbysort->abbreviate = false; + PrepareSortSupportFromOrderingOp(orderbyop, orderbysort); + + get_typlenbyval(orderbyType, + &indexstate->iss_OrderByTypLens[i], + &indexstate->iss_OrderByTypByVals[i]); + i++; + } + + /* allocate arrays to hold the re-calculated distances */ + indexstate->iss_OrderByValues = (Datum *) + palloc(numOrderByKeys * sizeof(Datum)); + indexstate->iss_OrderByNulls = (bool *) + palloc(numOrderByKeys * sizeof(bool)); + + /* and initialize the reorder queue */ + indexstate->iss_ReorderQueue = pairingheap_allocate(reorderqueue_cmp, + indexstate); + } + + /* + * If we have runtime keys, we need an ExprContext to evaluate them. The + * node's standard context won't do because we want to reset that context + * for every tuple. So, build another context just like the other one... 
+ * -tgl 7/11/00 + */ + if (indexstate->iss_NumRuntimeKeys != 0) + { + ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext; + + ExecAssignExprContext(estate, &indexstate->ss.ps); + indexstate->iss_RuntimeContext = indexstate->ss.ps.ps_ExprContext; + indexstate->ss.ps.ps_ExprContext = stdecontext; + } + else + { + indexstate->iss_RuntimeContext = NULL; + } + + /* + * all done. + */ + return indexstate; +} + + +/* + * ExecIndexBuildScanKeys + * Build the index scan keys from the index qualification expressions + * + * The index quals are passed to the index AM in the form of a ScanKey array. + * This routine sets up the ScanKeys, fills in all constant fields of the + * ScanKeys, and prepares information about the keys that have non-constant + * comparison values. We divide index qual expressions into five types: + * + * 1. Simple operator with constant comparison value ("indexkey op constant"). + * For these, we just fill in a ScanKey containing the constant value. + * + * 2. Simple operator with non-constant value ("indexkey op expression"). + * For these, we create a ScanKey with everything filled in except the + * expression value, and set up an IndexRuntimeKeyInfo struct to drive + * evaluation of the expression at the right times. + * + * 3. RowCompareExpr ("(indexkey, indexkey, ...) op (expr, expr, ...)"). + * For these, we create a header ScanKey plus a subsidiary ScanKey array, + * as specified in access/skey.h. The elements of the row comparison + * can have either constant or non-constant comparison values. + * + * 4. ScalarArrayOpExpr ("indexkey op ANY (array-expression)"). If the index + * supports amsearcharray, we handle these the same as simple operators, + * setting the SK_SEARCHARRAY flag to tell the AM to handle them. Otherwise, + * we create a ScanKey with everything filled in except the comparison value, + * and set up an IndexArrayKeyInfo struct to drive processing of the qual. + * (Note that if we use an IndexArrayKeyInfo struct, the array expression is + * always treated as requiring runtime evaluation, even if it's a constant.) + * + * 5. NullTest ("indexkey IS NULL/IS NOT NULL"). We just fill in the + * ScanKey properly. + * + * This code is also used to prepare ORDER BY expressions for amcanorderbyop + * indexes. The behavior is exactly the same, except that we have to look up + * the operator differently. Note that only cases 1 and 2 are currently + * possible for ORDER BY. + * + * Input params are: + * + * planstate: executor state node we are working for + * index: the index we are building scan keys for + * quals: indexquals (or indexorderbys) expressions + * isorderby: true if processing ORDER BY exprs, false if processing quals + * *runtimeKeys: ptr to pre-existing IndexRuntimeKeyInfos, or NULL if none + * *numRuntimeKeys: number of pre-existing runtime keys + * + * Output params are: + * + * *scanKeys: receives ptr to array of ScanKeys + * *numScanKeys: receives number of scankeys + * *runtimeKeys: receives ptr to array of IndexRuntimeKeyInfos, or NULL if none + * *numRuntimeKeys: receives number of runtime keys + * *arrayKeys: receives ptr to array of IndexArrayKeyInfos, or NULL if none + * *numArrayKeys: receives number of array keys + * + * Caller may pass NULL for arrayKeys and numArrayKeys to indicate that + * IndexArrayKeyInfos are not supported. 
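+ *
+ * For illustration, the five cases correspond to quals such as "a = 42" (1),
+ * "a = $1" or a nestloop parameter (2), "(a, b) >= (1, 2)" (3),
+ * "a = ANY ('{1,2,3}')" (4), and "a IS NULL" (5), where "a" and "b" are
+ * index columns.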
+ */ +void +ExecIndexBuildScanKeys(PlanState *planstate, Relation index, + List *quals, bool isorderby, + ScanKey *scanKeys, int *numScanKeys, + IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, + IndexArrayKeyInfo **arrayKeys, int *numArrayKeys) +{ + ListCell *qual_cell; + ScanKey scan_keys; + IndexRuntimeKeyInfo *runtime_keys; + IndexArrayKeyInfo *array_keys; + int n_scan_keys; + int n_runtime_keys; + int max_runtime_keys; + int n_array_keys; + int j; + + /* Allocate array for ScanKey structs: one per qual */ + n_scan_keys = list_length(quals); + scan_keys = (ScanKey) palloc(n_scan_keys * sizeof(ScanKeyData)); + + /* + * runtime_keys array is dynamically resized as needed. We handle it this + * way so that the same runtime keys array can be shared between + * indexquals and indexorderbys, which will be processed in separate calls + * of this function. Caller must be sure to pass in NULL/0 for first + * call. + */ + runtime_keys = *runtimeKeys; + n_runtime_keys = max_runtime_keys = *numRuntimeKeys; + + /* Allocate array_keys as large as it could possibly need to be */ + array_keys = (IndexArrayKeyInfo *) + palloc0(n_scan_keys * sizeof(IndexArrayKeyInfo)); + n_array_keys = 0; + + /* + * for each opclause in the given qual, convert the opclause into a single + * scan key + */ + j = 0; + foreach(qual_cell, quals) + { + Expr *clause = (Expr *) lfirst(qual_cell); + ScanKey this_scan_key = &scan_keys[j++]; + Oid opno; /* operator's OID */ + RegProcedure opfuncid; /* operator proc id used in scan */ + Oid opfamily; /* opfamily of index column */ + int op_strategy; /* operator's strategy number */ + Oid op_lefttype; /* operator's declared input types */ + Oid op_righttype; + Expr *leftop; /* expr on lhs of operator */ + Expr *rightop; /* expr on rhs ... */ + AttrNumber varattno; /* att number used in scan */ + int indnkeyatts; + + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index); + if (IsA(clause, OpExpr)) + { + /* indexkey op const or indexkey op expression */ + int flags = 0; + Datum scanvalue; + + opno = ((OpExpr *) clause)->opno; + opfuncid = ((OpExpr *) clause)->opfuncid; + + /* + * leftop should be the index key Var, possibly relabeled + */ + leftop = (Expr *) get_leftop(clause); + + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "indexqual doesn't have key on left side"); + + varattno = ((Var *) leftop)->varattno; + if (varattno < 1 || varattno > indnkeyatts) + elog(ERROR, "bogus index qualification"); + + /* + * We have to look up the operator's strategy number. This + * provides a cross-check that the operator does match the index. 
+ */ + opfamily = index->rd_opfamily[varattno - 1]; + + get_op_opfamily_properties(opno, opfamily, isorderby, + &op_strategy, + &op_lefttype, + &op_righttype); + + if (isorderby) + flags |= SK_ORDER_BY; + + /* + * rightop is the constant or variable comparison value + */ + rightop = (Expr *) get_rightop(clause); + + if (rightop && IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + Assert(rightop != NULL); + + if (IsA(rightop, Const)) + { + /* OK, simple constant comparison value */ + scanvalue = ((Const *) rightop)->constvalue; + if (((Const *) rightop)->constisnull) + flags |= SK_ISNULL; + } + else + { + /* Need to treat this one as a runtime key */ + if (n_runtime_keys >= max_runtime_keys) + { + if (max_runtime_keys == 0) + { + max_runtime_keys = 8; + runtime_keys = (IndexRuntimeKeyInfo *) + palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + else + { + max_runtime_keys *= 2; + runtime_keys = (IndexRuntimeKeyInfo *) + repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + } + runtime_keys[n_runtime_keys].scan_key = this_scan_key; + runtime_keys[n_runtime_keys].key_expr = + ExecInitExpr(rightop, planstate); + runtime_keys[n_runtime_keys].key_toastable = + TypeIsToastable(op_righttype); + n_runtime_keys++; + scanvalue = (Datum) 0; + } + + /* + * initialize the scan key's fields appropriately + */ + ScanKeyEntryInitialize(this_scan_key, + flags, + varattno, /* attribute number to scan */ + op_strategy, /* op's strategy */ + op_righttype, /* strategy subtype */ + ((OpExpr *) clause)->inputcollid, /* collation */ + opfuncid, /* reg proc to use */ + scanvalue); /* constant */ + } + else if (IsA(clause, RowCompareExpr)) + { + /* (indexkey, indexkey, ...) op (expression, expression, ...) */ + RowCompareExpr *rc = (RowCompareExpr *) clause; + ScanKey first_sub_key; + int n_sub_key; + ListCell *largs_cell; + ListCell *rargs_cell; + ListCell *opnos_cell; + ListCell *collids_cell; + + Assert(!isorderby); + + first_sub_key = (ScanKey) + palloc(list_length(rc->opnos) * sizeof(ScanKeyData)); + n_sub_key = 0; + + /* Scan RowCompare columns and generate subsidiary ScanKey items */ + forfour(largs_cell, rc->largs, rargs_cell, rc->rargs, + opnos_cell, rc->opnos, collids_cell, rc->inputcollids) + { + ScanKey this_sub_key = &first_sub_key[n_sub_key]; + int flags = SK_ROW_MEMBER; + Datum scanvalue; + Oid inputcollation; + + leftop = (Expr *) lfirst(largs_cell); + rightop = (Expr *) lfirst(rargs_cell); + opno = lfirst_oid(opnos_cell); + inputcollation = lfirst_oid(collids_cell); + + /* + * leftop should be the index key Var, possibly relabeled + */ + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "indexqual doesn't have key on left side"); + + varattno = ((Var *) leftop)->varattno; + + /* + * We have to look up the operator's associated btree support + * function + */ + if (index->rd_rel->relam != BTREE_AM_OID || + varattno < 1 || varattno > indnkeyatts) + elog(ERROR, "bogus RowCompare index qualification"); + opfamily = index->rd_opfamily[varattno - 1]; + + get_op_opfamily_properties(opno, opfamily, isorderby, + &op_strategy, + &op_lefttype, + &op_righttype); + + if (op_strategy != rc->rctype) + elog(ERROR, "RowCompare index qualification contains wrong operator"); + + opfuncid = get_opfamily_proc(opfamily, + op_lefttype, + op_righttype, + BTORDER_PROC); + if (!RegProcedureIsValid(opfuncid)) + elog(ERROR, 
"missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, op_lefttype, op_righttype, opfamily); + + /* + * rightop is the constant or variable comparison value + */ + if (rightop && IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + Assert(rightop != NULL); + + if (IsA(rightop, Const)) + { + /* OK, simple constant comparison value */ + scanvalue = ((Const *) rightop)->constvalue; + if (((Const *) rightop)->constisnull) + flags |= SK_ISNULL; + } + else + { + /* Need to treat this one as a runtime key */ + if (n_runtime_keys >= max_runtime_keys) + { + if (max_runtime_keys == 0) + { + max_runtime_keys = 8; + runtime_keys = (IndexRuntimeKeyInfo *) + palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + else + { + max_runtime_keys *= 2; + runtime_keys = (IndexRuntimeKeyInfo *) + repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + } + runtime_keys[n_runtime_keys].scan_key = this_sub_key; + runtime_keys[n_runtime_keys].key_expr = + ExecInitExpr(rightop, planstate); + runtime_keys[n_runtime_keys].key_toastable = + TypeIsToastable(op_righttype); + n_runtime_keys++; + scanvalue = (Datum) 0; + } + + /* + * initialize the subsidiary scan key's fields appropriately + */ + ScanKeyEntryInitialize(this_sub_key, + flags, + varattno, /* attribute number */ + op_strategy, /* op's strategy */ + op_righttype, /* strategy subtype */ + inputcollation, /* collation */ + opfuncid, /* reg proc to use */ + scanvalue); /* constant */ + n_sub_key++; + } + + /* Mark the last subsidiary scankey correctly */ + first_sub_key[n_sub_key - 1].sk_flags |= SK_ROW_END; + + /* + * We don't use ScanKeyEntryInitialize for the header because it + * isn't going to contain a valid sk_func pointer. + */ + MemSet(this_scan_key, 0, sizeof(ScanKeyData)); + this_scan_key->sk_flags = SK_ROW_HEADER; + this_scan_key->sk_attno = first_sub_key->sk_attno; + this_scan_key->sk_strategy = rc->rctype; + /* sk_subtype, sk_collation, sk_func not used in a header */ + this_scan_key->sk_argument = PointerGetDatum(first_sub_key); + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + /* indexkey op ANY (array-expression) */ + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + int flags = 0; + Datum scanvalue; + + Assert(!isorderby); + + Assert(saop->useOr); + opno = saop->opno; + opfuncid = saop->opfuncid; + + /* + * leftop should be the index key Var, possibly relabeled + */ + leftop = (Expr *) linitial(saop->args); + + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "indexqual doesn't have key on left side"); + + varattno = ((Var *) leftop)->varattno; + if (varattno < 1 || varattno > indnkeyatts) + elog(ERROR, "bogus index qualification"); + + /* + * We have to look up the operator's strategy number. This + * provides a cross-check that the operator does match the index. 
+ */ + opfamily = index->rd_opfamily[varattno - 1]; + + get_op_opfamily_properties(opno, opfamily, isorderby, + &op_strategy, + &op_lefttype, + &op_righttype); + + /* + * rightop is the constant or variable array value + */ + rightop = (Expr *) lsecond(saop->args); + + if (rightop && IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + Assert(rightop != NULL); + + if (index->rd_indam->amsearcharray) + { + /* Index AM will handle this like a simple operator */ + flags |= SK_SEARCHARRAY; + if (IsA(rightop, Const)) + { + /* OK, simple constant comparison value */ + scanvalue = ((Const *) rightop)->constvalue; + if (((Const *) rightop)->constisnull) + flags |= SK_ISNULL; + } + else + { + /* Need to treat this one as a runtime key */ + if (n_runtime_keys >= max_runtime_keys) + { + if (max_runtime_keys == 0) + { + max_runtime_keys = 8; + runtime_keys = (IndexRuntimeKeyInfo *) + palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + else + { + max_runtime_keys *= 2; + runtime_keys = (IndexRuntimeKeyInfo *) + repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + } + runtime_keys[n_runtime_keys].scan_key = this_scan_key; + runtime_keys[n_runtime_keys].key_expr = + ExecInitExpr(rightop, planstate); + + /* + * Careful here: the runtime expression is not of + * op_righttype, but rather is an array of same; so + * TypeIsToastable() isn't helpful. However, we can + * assume that all array types are toastable. + */ + runtime_keys[n_runtime_keys].key_toastable = true; + n_runtime_keys++; + scanvalue = (Datum) 0; + } + } + else + { + /* Executor has to expand the array value */ + array_keys[n_array_keys].scan_key = this_scan_key; + array_keys[n_array_keys].array_expr = + ExecInitExpr(rightop, planstate); + /* the remaining fields were zeroed by palloc0 */ + n_array_keys++; + scanvalue = (Datum) 0; + } + + /* + * initialize the scan key's fields appropriately + */ + ScanKeyEntryInitialize(this_scan_key, + flags, + varattno, /* attribute number to scan */ + op_strategy, /* op's strategy */ + op_righttype, /* strategy subtype */ + saop->inputcollid, /* collation */ + opfuncid, /* reg proc to use */ + scanvalue); /* constant */ + } + else if (IsA(clause, NullTest)) + { + /* indexkey IS NULL or indexkey IS NOT NULL */ + NullTest *ntest = (NullTest *) clause; + int flags; + + Assert(!isorderby); + + /* + * argument should be the index key Var, possibly relabeled + */ + leftop = ntest->arg; + + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "NullTest indexqual has wrong key"); + + varattno = ((Var *) leftop)->varattno; + + /* + * initialize the scan key's fields appropriately + */ + switch (ntest->nulltesttype) + { + case IS_NULL: + flags = SK_ISNULL | SK_SEARCHNULL; + break; + case IS_NOT_NULL: + flags = SK_ISNULL | SK_SEARCHNOTNULL; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + flags = 0; /* keep compiler quiet */ + break; + } + + ScanKeyEntryInitialize(this_scan_key, + flags, + varattno, /* attribute number to scan */ + InvalidStrategy, /* no strategy */ + InvalidOid, /* no strategy subtype */ + InvalidOid, /* no collation */ + InvalidOid, /* no reg proc for this */ + (Datum) 0); /* constant */ + } + else + elog(ERROR, "unsupported indexqual type: %d", + (int) nodeTag(clause)); + } + + Assert(n_runtime_keys <= max_runtime_keys); + + /* Get rid of any unused 
arrays */ + if (n_array_keys == 0) + { + pfree(array_keys); + array_keys = NULL; + } + + /* + * Return info to our caller. + */ + *scanKeys = scan_keys; + *numScanKeys = n_scan_keys; + *runtimeKeys = runtime_keys; + *numRuntimeKeys = n_runtime_keys; + if (arrayKeys) + { + *arrayKeys = array_keys; + *numArrayKeys = n_array_keys; + } + else if (n_array_keys != 0) + elog(ERROR, "ScalarArrayOpExpr index qual found where not allowed"); +} + +/* ---------------------------------------------------------------- + * Parallel Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecIndexScanEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanEstimate(IndexScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc, + estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecIndexScanInitializeDSM + * + * Set up a parallel index scan descriptor. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanInitializeDSM(IndexScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + ParallelIndexScanDesc piscan; + + piscan = shm_toc_allocate(pcxt->toc, node->iss_PscanLen); + index_parallelscan_initialize(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + piscan); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan); + node->iss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->iss_RelationDesc, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + piscan); + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. + */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(node->iss_ScanDesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); +} + +/* ---------------------------------------------------------------- + * ExecIndexScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanReInitializeDSM(IndexScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->iss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexScanInitializeWorker + * + * Copy relevant information from TOC into planstate. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanInitializeWorker(IndexScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelIndexScanDesc piscan; + + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->iss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->iss_RelationDesc, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + piscan); + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. 
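+ *
+ * (If runtime keys exist but are not yet ready, the worker's first call to
+ * ExecIndexScan notices that and goes through ExecReScanIndexScan, which
+ * evaluates them and then performs the index_rescan.)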
+ */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(node->iss_ScanDesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); +} diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c new file mode 100644 index 0000000..128eb3e --- /dev/null +++ b/src/backend/executor/nodeLimit.c @@ -0,0 +1,558 @@ +/*------------------------------------------------------------------------- + * + * nodeLimit.c + * Routines to handle limiting of query results where appropriate + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeLimit.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecLimit - extract a limited range of tuples + * ExecInitLimit - initialize node and subnodes.. + * ExecEndLimit - shutdown node and subnodes + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeLimit.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" + +static void recompute_limits(LimitState *node); +static int64 compute_tuples_needed(LimitState *node); + + +/* ---------------------------------------------------------------- + * ExecLimit + * + * This is a very simple node which just performs LIMIT/OFFSET + * filtering on the stream of tuples returned by a subplan. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecLimit(PlanState *pstate) +{ + LimitState *node = castNode(LimitState, pstate); + ExprContext *econtext = node->ps.ps_ExprContext; + ScanDirection direction; + TupleTableSlot *slot; + PlanState *outerPlan; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + direction = node->ps.state->es_direction; + outerPlan = outerPlanState(node); + + /* + * The main logic is a simple state machine. + */ + switch (node->lstate) + { + case LIMIT_INITIAL: + + /* + * First call for this node, so compute limit/offset. (We can't do + * this any earlier, because parameters from upper nodes will not + * be set during ExecInitLimit.) This also sets position = 0 and + * changes the state to LIMIT_RESCAN. + */ + recompute_limits(node); + + /* FALL THRU */ + + case LIMIT_RESCAN: + + /* + * If backwards scan, just return NULL without changing state. + */ + if (!ScanDirectionIsForward(direction)) + return NULL; + + /* + * Check for empty window; if so, treat like empty subplan. + */ + if (node->count <= 0 && !node->noCount) + { + node->lstate = LIMIT_EMPTY; + return NULL; + } + + /* + * Fetch rows from subplan until we reach position > offset. + */ + for (;;) + { + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + /* + * The subplan returns too few tuples for us to produce + * any output at all. + */ + node->lstate = LIMIT_EMPTY; + return NULL; + } + + /* + * Tuple at limit is needed for comparison in subsequent + * execution to detect ties. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES && + node->position - node->offset == node->count - 1) + { + ExecCopySlot(node->last_slot, slot); + } + node->subSlot = slot; + if (++node->position > node->offset) + break; + } + + /* + * Okay, we have the first tuple of the window. 
+ */ + node->lstate = LIMIT_INWINDOW; + break; + + case LIMIT_EMPTY: + + /* + * The subplan is known to return no tuples (or not more than + * OFFSET tuples, in general). So we return no tuples. + */ + return NULL; + + case LIMIT_INWINDOW: + if (ScanDirectionIsForward(direction)) + { + /* + * Forwards scan, so check for stepping off end of window. At + * the end of the window, the behavior depends on whether WITH + * TIES was specified: if so, we need to change the state + * machine to WINDOWEND_TIES, and fall through to the code for + * that case. If not (nothing was specified, or ONLY was) + * return NULL without advancing the subplan or the position + * variable, but change the state machine to record having + * done so. + * + * Once at the end, ideally, we would shut down parallel + * resources; but that would destroy the parallel context + * which might be required for rescans. To do that, we'll + * need to find a way to pass down more information about + * whether rescans are possible. + */ + if (!node->noCount && + node->position - node->offset >= node->count) + { + if (node->limitOption == LIMIT_OPTION_COUNT) + { + node->lstate = LIMIT_WINDOWEND; + return NULL; + } + else + { + node->lstate = LIMIT_WINDOWEND_TIES; + /* we'll fall through to the next case */ + } + } + else + { + /* + * Get next tuple from subplan, if any. + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + node->lstate = LIMIT_SUBPLANEOF; + return NULL; + } + + /* + * If WITH TIES is active, and this is the last in-window + * tuple, save it to be used in subsequent WINDOWEND_TIES + * processing. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES && + node->position - node->offset == node->count - 1) + { + ExecCopySlot(node->last_slot, slot); + } + node->subSlot = slot; + node->position++; + break; + } + } + else + { + /* + * Backwards scan, so check for stepping off start of window. + * As above, only change state-machine status if so. + */ + if (node->position <= node->offset + 1) + { + node->lstate = LIMIT_WINDOWSTART; + return NULL; + } + + /* + * Get previous tuple from subplan; there should be one! + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->position--; + break; + } + + Assert(node->lstate == LIMIT_WINDOWEND_TIES); + /* FALL THRU */ + + case LIMIT_WINDOWEND_TIES: + if (ScanDirectionIsForward(direction)) + { + /* + * Advance the subplan until we find the first row with + * different ORDER BY pathkeys. + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + node->lstate = LIMIT_SUBPLANEOF; + return NULL; + } + + /* + * Test if the new tuple and the last tuple match. If so we + * return the tuple. + */ + econtext->ecxt_innertuple = slot; + econtext->ecxt_outertuple = node->last_slot; + if (ExecQualAndReset(node->eqfunction, econtext)) + { + node->subSlot = slot; + node->position++; + } + else + { + node->lstate = LIMIT_WINDOWEND; + return NULL; + } + } + else + { + /* + * Backwards scan, so check for stepping off start of window. + * Change only state-machine status if so. + */ + if (node->position <= node->offset + 1) + { + node->lstate = LIMIT_WINDOWSTART; + return NULL; + } + + /* + * Get previous tuple from subplan; there should be one! And + * change state-machine status. 
+ */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->position--; + node->lstate = LIMIT_INWINDOW; + } + break; + + case LIMIT_SUBPLANEOF: + if (ScanDirectionIsForward(direction)) + return NULL; + + /* + * Backing up from subplan EOF, so re-fetch previous tuple; there + * should be one! Note previous tuple must be in window. + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->lstate = LIMIT_INWINDOW; + /* position does not change 'cause we didn't advance it before */ + break; + + case LIMIT_WINDOWEND: + if (ScanDirectionIsForward(direction)) + return NULL; + + /* + * We already past one position to detect ties so re-fetch + * previous tuple; there should be one! Note previous tuple must + * be in window. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES) + { + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->lstate = LIMIT_INWINDOW; + } + else + { + /* + * Backing up from window end: simply re-return the last tuple + * fetched from the subplan. + */ + slot = node->subSlot; + node->lstate = LIMIT_INWINDOW; + /* position does not change 'cause we didn't advance it before */ + } + break; + + case LIMIT_WINDOWSTART: + if (!ScanDirectionIsForward(direction)) + return NULL; + + /* + * Advancing after having backed off window start: simply + * re-return the last tuple fetched from the subplan. + */ + slot = node->subSlot; + node->lstate = LIMIT_INWINDOW; + /* position does not change 'cause we didn't change it before */ + break; + + default: + elog(ERROR, "impossible LIMIT state: %d", + (int) node->lstate); + slot = NULL; /* keep compiler quiet */ + break; + } + + /* Return the current tuple */ + Assert(!TupIsNull(slot)); + + return slot; +} + +/* + * Evaluate the limit/offset expressions --- done at startup or rescan. + * + * This is also a handy place to reset the current-position state info. + */ +static void +recompute_limits(LimitState *node) +{ + ExprContext *econtext = node->ps.ps_ExprContext; + Datum val; + bool isNull; + + if (node->limitOffset) + { + val = ExecEvalExprSwitchContext(node->limitOffset, + econtext, + &isNull); + /* Interpret NULL offset as no offset */ + if (isNull) + node->offset = 0; + else + { + node->offset = DatumGetInt64(val); + if (node->offset < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE), + errmsg("OFFSET must not be negative"))); + } + } + else + { + /* No OFFSET supplied */ + node->offset = 0; + } + + if (node->limitCount) + { + val = ExecEvalExprSwitchContext(node->limitCount, + econtext, + &isNull); + /* Interpret NULL count as no count (LIMIT ALL) */ + if (isNull) + { + node->count = 0; + node->noCount = true; + } + else + { + node->count = DatumGetInt64(val); + if (node->count < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_ROW_COUNT_IN_LIMIT_CLAUSE), + errmsg("LIMIT must not be negative"))); + node->noCount = false; + } + } + else + { + /* No COUNT supplied */ + node->count = 0; + node->noCount = true; + } + + /* Reset position to start-of-scan */ + node->position = 0; + node->subSlot = NULL; + + /* Set state-machine state */ + node->lstate = LIMIT_RESCAN; + + /* + * Notify child node about limit. Note: think not to "optimize" by + * skipping ExecSetTupleBound if compute_tuples_needed returns < 0. 
We + * must update the child node anyway, in case this is a rescan and the + * previous time we got a different result. + */ + ExecSetTupleBound(compute_tuples_needed(node), outerPlanState(node)); +} + +/* + * Compute the maximum number of tuples needed to satisfy this Limit node. + * Return a negative value if there is not a determinable limit. + */ +static int64 +compute_tuples_needed(LimitState *node) +{ + if ((node->noCount) || (node->limitOption == LIMIT_OPTION_WITH_TIES)) + return -1; + /* Note: if this overflows, we'll return a negative value, which is OK */ + return node->count + node->offset; +} + +/* ---------------------------------------------------------------- + * ExecInitLimit + * + * This initializes the limit node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +LimitState * +ExecInitLimit(Limit *node, EState *estate, int eflags) +{ + LimitState *limitstate; + Plan *outerPlan; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * create state structure + */ + limitstate = makeNode(LimitState); + limitstate->ps.plan = (Plan *) node; + limitstate->ps.state = estate; + limitstate->ps.ExecProcNode = ExecLimit; + + limitstate->lstate = LIMIT_INITIAL; + + /* + * Miscellaneous initialization + * + * Limit nodes never call ExecQual or ExecProject, but they need an + * exprcontext anyway to evaluate the limit/offset parameters in. + */ + ExecAssignExprContext(estate, &limitstate->ps); + + /* + * initialize outer plan + */ + outerPlan = outerPlan(node); + outerPlanState(limitstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * initialize child expressions + */ + limitstate->limitOffset = ExecInitExpr((Expr *) node->limitOffset, + (PlanState *) limitstate); + limitstate->limitCount = ExecInitExpr((Expr *) node->limitCount, + (PlanState *) limitstate); + limitstate->limitOption = node->limitOption; + + /* + * Initialize result type. + */ + ExecInitResultTypeTL(&limitstate->ps); + + limitstate->ps.resultopsset = true; + limitstate->ps.resultops = ExecGetResultSlotOps(outerPlanState(limitstate), + &limitstate->ps.resultopsfixed); + + /* + * limit nodes do no projections, so initialize projection info for this + * node appropriately + */ + limitstate->ps.ps_ProjInfo = NULL; + + /* + * Initialize the equality evaluation, to detect ties. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES) + { + TupleDesc desc; + const TupleTableSlotOps *ops; + + desc = ExecGetResultType(outerPlanState(limitstate)); + ops = ExecGetResultSlotOps(outerPlanState(limitstate), NULL); + + limitstate->last_slot = ExecInitExtraTupleSlot(estate, desc, ops); + limitstate->eqfunction = execTuplesMatchPrepare(desc, + node->uniqNumCols, + node->uniqColIdx, + node->uniqOperators, + node->uniqCollations, + &limitstate->ps); + } + + return limitstate; +} + +/* ---------------------------------------------------------------- + * ExecEndLimit + * + * This shuts down the subplan and frees resources allocated + * to this node. + * ---------------------------------------------------------------- + */ +void +ExecEndLimit(LimitState *node) +{ + ExecFreeExprContext(&node->ps); + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanLimit(LimitState *node) +{ + /* + * Recompute limit/offset in case parameters changed, and reset the state + * machine. We must do this before rescanning our child node, in case + * it's a Sort that we are passing the parameters down to. 
+ */ + recompute_limits(node); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c new file mode 100644 index 0000000..7583973 --- /dev/null +++ b/src/backend/executor/nodeLockRows.c @@ -0,0 +1,403 @@ +/*------------------------------------------------------------------------- + * + * nodeLockRows.c + * Routines to handle FOR UPDATE/FOR SHARE row locking + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeLockRows.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecLockRows - fetch locked rows + * ExecInitLockRows - initialize node and subnodes.. + * ExecEndLockRows - shutdown node and subnodes + */ + +#include "postgres.h" + +#include "access/tableam.h" +#include "access/xact.h" +#include "executor/executor.h" +#include "executor/nodeLockRows.h" +#include "foreign/fdwapi.h" +#include "miscadmin.h" +#include "utils/rel.h" + + +/* ---------------------------------------------------------------- + * ExecLockRows + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecLockRows(PlanState *pstate) +{ + LockRowsState *node = castNode(LockRowsState, pstate); + TupleTableSlot *slot; + EState *estate; + PlanState *outerPlan; + bool epq_needed; + ListCell *lc; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + estate = node->ps.state; + outerPlan = outerPlanState(node); + + /* + * Get next tuple from subplan, if any. + */ +lnext: + slot = ExecProcNode(outerPlan); + + if (TupIsNull(slot)) + { + /* Release any resources held by EPQ mechanism before exiting */ + EvalPlanQualEnd(&node->lr_epqstate); + return NULL; + } + + /* We don't need EvalPlanQual unless we get updated tuple version(s) */ + epq_needed = false; + + /* + * Attempt to lock the source tuple(s). (Note we only have locking + * rowmarks in lr_arowMarks.) + */ + foreach(lc, node->lr_arowMarks) + { + ExecAuxRowMark *aerm = (ExecAuxRowMark *) lfirst(lc); + ExecRowMark *erm = aerm->rowmark; + Datum datum; + bool isNull; + ItemPointerData tid; + TM_FailureData tmfd; + LockTupleMode lockmode; + int lockflags = 0; + TM_Result test; + TupleTableSlot *markSlot; + + /* clear any leftover test tuple for this rel */ + markSlot = EvalPlanQualSlot(&node->lr_epqstate, erm->relation, erm->rti); + ExecClearTuple(markSlot); + + /* if child rel, must check whether it produced this row */ + if (erm->rti != erm->prti) + { + Oid tableoid; + + datum = ExecGetJunkAttribute(slot, + aerm->toidAttNo, + &isNull); + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "tableoid is NULL"); + tableoid = DatumGetObjectId(datum); + + Assert(OidIsValid(erm->relid)); + if (tableoid != erm->relid) + { + /* this child is inactive right now */ + erm->ermActive = false; + ItemPointerSetInvalid(&(erm->curCtid)); + ExecClearTuple(markSlot); + continue; + } + } + erm->ermActive = true; + + /* fetch the tuple's ctid */ + datum = ExecGetJunkAttribute(slot, + aerm->ctidAttNo, + &isNull); + /* shouldn't ever get a null result... 
*/ + if (isNull) + elog(ERROR, "ctid is NULL"); + + /* requests for foreign tables must be passed to their FDW */ + if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + { + FdwRoutine *fdwroutine; + bool updated = false; + + fdwroutine = GetFdwRoutineForRelation(erm->relation, false); + /* this should have been checked already, but let's be safe */ + if (fdwroutine->RefetchForeignRow == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot lock rows in foreign table \"%s\"", + RelationGetRelationName(erm->relation)))); + + fdwroutine->RefetchForeignRow(estate, + erm, + datum, + markSlot, + &updated); + if (TupIsNull(markSlot)) + { + /* couldn't get the lock, so skip this row */ + goto lnext; + } + + /* + * if FDW says tuple was updated before getting locked, we need to + * perform EPQ testing to see if quals are still satisfied + */ + if (updated) + epq_needed = true; + + continue; + } + + /* okay, try to lock (and fetch) the tuple */ + tid = *((ItemPointer) DatumGetPointer(datum)); + switch (erm->markType) + { + case ROW_MARK_EXCLUSIVE: + lockmode = LockTupleExclusive; + break; + case ROW_MARK_NOKEYEXCLUSIVE: + lockmode = LockTupleNoKeyExclusive; + break; + case ROW_MARK_SHARE: + lockmode = LockTupleShare; + break; + case ROW_MARK_KEYSHARE: + lockmode = LockTupleKeyShare; + break; + default: + elog(ERROR, "unsupported rowmark type"); + lockmode = LockTupleNoKeyExclusive; /* keep compiler quiet */ + break; + } + + lockflags = TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS; + if (!IsolationUsesXactSnapshot()) + lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; + + test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot, + markSlot, estate->es_output_cid, + lockmode, erm->waitPolicy, + lockflags, + &tmfd); + + switch (test) + { + case TM_WouldBlock: + /* couldn't lock tuple in SKIP LOCKED mode */ + goto lnext; + + case TM_SelfModified: + + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. We *must* ignore the tuple in the former + * case, so as to avoid the "Halloween problem" of repeated + * update attempts. In the latter case it might be sensible + * to fetch the updated tuple instead, but doing so would + * require changing heap_update and heap_delete to not + * complain about updating "invisible" tuples, which seems + * pretty scary (table_tuple_lock will not complain, but few + * callers expect TM_Invisible, and we're not one of them). So + * for now, treat the tuple as deleted and do not process. + */ + goto lnext; + + case TM_Ok: + + /* + * Got the lock successfully, the locked tuple saved in + * markSlot for, if needed, EvalPlanQual testing below. 
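Each lock attempt above ends one of three ways: the row is kept (TM_Ok, possibly after locking a newer version, which flags an EvalPlanQual recheck), the row is silently skipped (TM_WouldBlock under SKIP LOCKED, TM_SelfModified, TM_Deleted), or an error is raised. A standalone sketch of that skip-or-recheck control flow, simplified to one lock per output row; the LockOutcome enum and try_lock_row are stand-ins for TM_Result and table_tuple_lock, not real API:

#include <stdbool.h>
#include <stdio.h>

typedef enum { LOCK_OK, LOCK_OK_NEWER_VERSION, LOCK_SKIP } LockOutcome;

/* Stand-in for table_tuple_lock(); classify the outcome for row "id". */
static LockOutcome
try_lock_row(int id)
{
    if (id % 3 == 0)
        return LOCK_SKIP;               /* e.g. deleted, or SKIP LOCKED conflict */
    if (id % 5 == 0)
        return LOCK_OK_NEWER_VERSION;   /* locked an updated version: recheck needed */
    return LOCK_OK;
}

int
main(void)
{
    for (int id = 1; id <= 10; id++)
    {
        bool epq_needed = false;

        switch (try_lock_row(id))
        {
            case LOCK_SKIP:
                continue;               /* like "goto lnext": fetch the next row */
            case LOCK_OK_NEWER_VERSION:
                epq_needed = true;
                /* fall through */
            case LOCK_OK:
                break;
        }

        /* the real node would re-evaluate quals via EvalPlanQual when flagged */
        printf("returning row %d%s\n", id, epq_needed ? " (after recheck)" : "");
    }
    return 0;
}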
+ */ + if (tmfd.traversed) + epq_needed = true; + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + elog(ERROR, "unexpected table_tuple_lock status: %u", + test); + break; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + /* tuple was deleted so don't return it */ + goto lnext; + + case TM_Invisible: + elog(ERROR, "attempted to lock invisible tuple"); + break; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", + test); + } + + /* Remember locked tuple's TID for EPQ testing and WHERE CURRENT OF */ + erm->curCtid = tid; + } + + /* + * If we need to do EvalPlanQual testing, do so. + */ + if (epq_needed) + { + /* Initialize EPQ machinery */ + EvalPlanQualBegin(&node->lr_epqstate); + + /* + * To fetch non-locked source rows the EPQ logic needs to access junk + * columns from the tuple being tested. + */ + EvalPlanQualSetSlot(&node->lr_epqstate, slot); + + /* + * And finally we can re-evaluate the tuple. + */ + slot = EvalPlanQualNext(&node->lr_epqstate); + if (TupIsNull(slot)) + { + /* Updated tuple fails qual, so ignore it and go on */ + goto lnext; + } + } + + /* Got all locks, so return the current tuple */ + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitLockRows + * + * This initializes the LockRows node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +LockRowsState * +ExecInitLockRows(LockRows *node, EState *estate, int eflags) +{ + LockRowsState *lrstate; + Plan *outerPlan = outerPlan(node); + List *epq_arowmarks; + ListCell *lc; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * create state structure + */ + lrstate = makeNode(LockRowsState); + lrstate->ps.plan = (Plan *) node; + lrstate->ps.state = estate; + lrstate->ps.ExecProcNode = ExecLockRows; + + /* + * Miscellaneous initialization + * + * LockRows nodes never call ExecQual or ExecProject, therefore no + * ExprContext is needed. + */ + + /* + * Initialize result type. + */ + ExecInitResultTypeTL(&lrstate->ps); + + /* + * then initialize outer plan + */ + outerPlanState(lrstate) = ExecInitNode(outerPlan, estate, eflags); + + /* node returns unmodified slots from the outer plan */ + lrstate->ps.resultopsset = true; + lrstate->ps.resultops = ExecGetResultSlotOps(outerPlanState(lrstate), + &lrstate->ps.resultopsfixed); + + /* + * LockRows nodes do no projections, so initialize projection info for + * this node appropriately + */ + lrstate->ps.ps_ProjInfo = NULL; + + /* + * Locate the ExecRowMark(s) that this node is responsible for, and + * construct ExecAuxRowMarks for them. (InitPlan should already have + * built the global list of ExecRowMarks.) + */ + lrstate->lr_arowMarks = NIL; + epq_arowmarks = NIL; + foreach(lc, node->rowMarks) + { + PlanRowMark *rc = lfirst_node(PlanRowMark, lc); + ExecRowMark *erm; + ExecAuxRowMark *aerm; + + /* ignore "parent" rowmarks; they are irrelevant at runtime */ + if (rc->isParent) + continue; + + /* find ExecRowMark and build ExecAuxRowMark */ + erm = ExecFindRowMark(estate, rc->rti, false); + aerm = ExecBuildAuxRowMark(erm, outerPlan->targetlist); + + /* + * Only locking rowmarks go into our own list. 
Non-locking marks are + * passed off to the EvalPlanQual machinery. This is because we don't + * want to bother fetching non-locked rows unless we actually have to + * do an EPQ recheck. + */ + if (RowMarkRequiresRowShareLock(erm->markType)) + lrstate->lr_arowMarks = lappend(lrstate->lr_arowMarks, aerm); + else + epq_arowmarks = lappend(epq_arowmarks, aerm); + } + + /* Now we have the info needed to set up EPQ state */ + EvalPlanQualInit(&lrstate->lr_epqstate, estate, + outerPlan, epq_arowmarks, node->epqParam); + + return lrstate; +} + +/* ---------------------------------------------------------------- + * ExecEndLockRows + * + * This shuts down the subplan and frees resources allocated + * to this node. + * ---------------------------------------------------------------- + */ +void +ExecEndLockRows(LockRowsState *node) +{ + /* We may have shut down EPQ already, but no harm in another call */ + EvalPlanQualEnd(&node->lr_epqstate); + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanLockRows(LockRowsState *node) +{ + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c new file mode 100644 index 0000000..7c53f8e --- /dev/null +++ b/src/backend/executor/nodeMaterial.c @@ -0,0 +1,368 @@ +/*------------------------------------------------------------------------- + * + * nodeMaterial.c + * Routines to handle materialization nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMaterial.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecMaterial - materialize the result of a subplan + * ExecInitMaterial - initialize node and subnodes + * ExecEndMaterial - shutdown node and subnodes + * + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeMaterial.h" +#include "miscadmin.h" + +/* ---------------------------------------------------------------- + * ExecMaterial + * + * As long as we are at the end of the data collected in the tuplestore, + * we collect one new row from the subplan on each call, and stash it + * aside in the tuplestore before returning it. The tuplestore is + * only read if we are asked to scan backwards, rescan, or mark/restore. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* result tuple from subplan */ +ExecMaterial(PlanState *pstate) +{ + MaterialState *node = castNode(MaterialState, pstate); + EState *estate; + ScanDirection dir; + bool forward; + Tuplestorestate *tuplestorestate; + bool eof_tuplestore; + TupleTableSlot *slot; + + CHECK_FOR_INTERRUPTS(); + + /* + * get state info from node + */ + estate = node->ss.ps.state; + dir = estate->es_direction; + forward = ScanDirectionIsForward(dir); + tuplestorestate = node->tuplestorestate; + + /* + * If first time through, and we need a tuplestore, initialize it. + */ + if (tuplestorestate == NULL && node->eflags != 0) + { + tuplestorestate = tuplestore_begin_heap(true, false, work_mem); + tuplestore_set_eflags(tuplestorestate, node->eflags); + if (node->eflags & EXEC_FLAG_MARK) + { + /* + * Allocate a second read pointer to serve as the mark. 
We know it + * must have index 1, so needn't store that. + */ + int ptrno PG_USED_FOR_ASSERTS_ONLY; + + ptrno = tuplestore_alloc_read_pointer(tuplestorestate, + node->eflags); + Assert(ptrno == 1); + } + node->tuplestorestate = tuplestorestate; + } + + /* + * If we are not at the end of the tuplestore, or are going backwards, try + * to fetch a tuple from tuplestore. + */ + eof_tuplestore = (tuplestorestate == NULL) || + tuplestore_ateof(tuplestorestate); + + if (!forward && eof_tuplestore) + { + if (!node->eof_underlying) + { + /* + * When reversing direction at tuplestore EOF, the first + * gettupleslot call will fetch the last-added tuple; but we want + * to return the one before that, if possible. So do an extra + * fetch. + */ + if (!tuplestore_advance(tuplestorestate, forward)) + return NULL; /* the tuplestore must be empty */ + } + eof_tuplestore = false; + } + + /* + * If we can fetch another tuple from the tuplestore, return it. + */ + slot = node->ss.ps.ps_ResultTupleSlot; + if (!eof_tuplestore) + { + if (tuplestore_gettupleslot(tuplestorestate, forward, false, slot)) + return slot; + if (forward) + eof_tuplestore = true; + } + + /* + * If necessary, try to fetch another row from the subplan. + * + * Note: the eof_underlying state variable exists to short-circuit further + * subplan calls. It's not optional, unfortunately, because some plan + * node types are not robust about being called again when they've already + * returned NULL. + */ + if (eof_tuplestore && !node->eof_underlying) + { + PlanState *outerNode; + TupleTableSlot *outerslot; + + /* + * We can only get here with forward==true, so no need to worry about + * which direction the subplan will go. + */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + node->eof_underlying = true; + return NULL; + } + + /* + * Append a copy of the returned tuple to tuplestore. NOTE: because + * the tuplestore is certainly in EOF state, its read position will + * move forward over the added tuple. This is what we want. + */ + if (tuplestorestate) + tuplestore_puttupleslot(tuplestorestate, outerslot); + + ExecCopySlot(slot, outerslot); + return slot; + } + + /* + * Nothing left ... + */ + return ExecClearTuple(slot); +} + +/* ---------------------------------------------------------------- + * ExecInitMaterial + * ---------------------------------------------------------------- + */ +MaterialState * +ExecInitMaterial(Material *node, EState *estate, int eflags) +{ + MaterialState *matstate; + Plan *outerPlan; + + /* + * create state structure + */ + matstate = makeNode(MaterialState); + matstate->ss.ps.plan = (Plan *) node; + matstate->ss.ps.state = estate; + matstate->ss.ps.ExecProcNode = ExecMaterial; + + /* + * We must have a tuplestore buffering the subplan output to do backward + * scan or mark/restore. We also prefer to materialize the subplan output + * if we might be called on to rewind and replay it many times. However, + * if none of these cases apply, we can skip storing the data. + */ + matstate->eflags = (eflags & (EXEC_FLAG_REWIND | + EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)); + + /* + * Tuplestore's interpretation of the flag bits is subtly different from + * the general executor meaning: it doesn't think BACKWARD necessarily + * means "backwards all the way to start". If told to support BACKWARD we + * must include REWIND in the tuplestore eflags, else tuplestore_trim + * might throw away too much. 
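ExecMaterial above stashes each subplan row in the tuplestore the first time it is read, so that backward scans, rescans, and mark/restore can later be served from the store instead of the subplan; the mark is simply a second read pointer. A standalone sketch of that idea, with a plain array standing in for the tuplestore and saved indexes standing in for read pointers:

#include <stdio.h>

#define NVALS 5

typedef struct Store
{
    int  vals[NVALS];     /* stand-in for the tuplestore contents */
    int  nstored;
    int  readpos;         /* active read pointer */
    int  markpos;         /* second read pointer used as the mark */
} Store;

static int
fetch(Store *st, const int *source)
{
    if (st->readpos == st->nstored)
    {
        if (st->nstored == NVALS)
            return -1;                                   /* subplan exhausted */
        st->vals[st->nstored++] = source[st->readpos];   /* stash as we go */
    }
    return st->vals[st->readpos++];
}

int
main(void)
{
    int   source[NVALS] = {1, 2, 3, 4, 5};
    Store st = {0};

    fetch(&st, source);                 /* 1 */
    fetch(&st, source);                 /* 2 */
    st.markpos = st.readpos;            /* mark, like copying read pointer 0 to 1 */
    fetch(&st, source);                 /* 3 */
    st.readpos = st.markpos;            /* restore */
    printf("%d\n", fetch(&st, source)); /* 3 again, replayed from the store */
    return 0;
}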
+ */ + if (eflags & EXEC_FLAG_BACKWARD) + matstate->eflags |= EXEC_FLAG_REWIND; + + matstate->eof_underlying = false; + matstate->tuplestorestate = NULL; + + /* + * Miscellaneous initialization + * + * Materialization nodes don't need ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * initialize child nodes + * + * We shield the child node from the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlan = outerPlan(node); + outerPlanState(matstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * Initialize result type and slot. No need to initialize projection info + * because this node doesn't do projections. + * + * material nodes only return tuples from their materialized relation. + */ + ExecInitResultTupleSlotTL(&matstate->ss.ps, &TTSOpsMinimalTuple); + matstate->ss.ps.ps_ProjInfo = NULL; + + /* + * initialize tuple type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &matstate->ss, &TTSOpsMinimalTuple); + + return matstate; +} + +/* ---------------------------------------------------------------- + * ExecEndMaterial + * ---------------------------------------------------------------- + */ +void +ExecEndMaterial(MaterialState *node) +{ + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * Release tuplestore resources + */ + if (node->tuplestorestate != NULL) + tuplestore_end(node->tuplestorestate); + node->tuplestorestate = NULL; + + /* + * shut down the subplan + */ + ExecEndNode(outerPlanState(node)); +} + +/* ---------------------------------------------------------------- + * ExecMaterialMarkPos + * + * Calls tuplestore to save the current position in the stored file. + * ---------------------------------------------------------------- + */ +void +ExecMaterialMarkPos(MaterialState *node) +{ + Assert(node->eflags & EXEC_FLAG_MARK); + + /* + * if we haven't materialized yet, just return. + */ + if (!node->tuplestorestate) + return; + + /* + * copy the active read pointer to the mark. + */ + tuplestore_copy_read_pointer(node->tuplestorestate, 0, 1); + + /* + * since we may have advanced the mark, try to truncate the tuplestore. + */ + tuplestore_trim(node->tuplestorestate); +} + +/* ---------------------------------------------------------------- + * ExecMaterialRestrPos + * + * Calls tuplestore to restore the last saved file position. + * ---------------------------------------------------------------- + */ +void +ExecMaterialRestrPos(MaterialState *node) +{ + Assert(node->eflags & EXEC_FLAG_MARK); + + /* + * if we haven't materialized yet, just return. + */ + if (!node->tuplestorestate) + return; + + /* + * copy the mark to the active read pointer. + */ + tuplestore_copy_read_pointer(node->tuplestorestate, 1, 0); +} + +/* ---------------------------------------------------------------- + * ExecReScanMaterial + * + * Rescans the materialized relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanMaterial(MaterialState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->eflags != 0) + { + /* + * If we haven't materialized yet, just return. If outerplan's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. 
+ */ + if (!node->tuplestorestate) + return; + + /* + * If subnode is to be rescanned then we forget previous stored + * results; we have to re-read the subplan and re-store. Also, if we + * told tuplestore it needn't support rescan, we lose and must + * re-read. (This last should not happen in common cases; else our + * caller lied by not passing EXEC_FLAG_REWIND to us.) + * + * Otherwise we can just rewind and rescan the stored output. The + * state of the subnode does not change. + */ + if (outerPlan->chgParam != NULL || + (node->eflags & EXEC_FLAG_REWIND) == 0) + { + tuplestore_end(node->tuplestorestate); + node->tuplestorestate = NULL; + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + node->eof_underlying = false; + } + else + tuplestore_rescan(node->tuplestorestate); + } + else + { + /* In this case we are just passing on the subquery's output */ + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + node->eof_underlying = false; + } +} diff --git a/src/backend/executor/nodeMemoize.c b/src/backend/executor/nodeMemoize.c new file mode 100644 index 0000000..f82f41f --- /dev/null +++ b/src/backend/executor/nodeMemoize.c @@ -0,0 +1,1225 @@ +/*------------------------------------------------------------------------- + * + * nodeMemoize.c + * Routines to handle caching of results from parameterized nodes + * + * Portions Copyright (c) 2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMemoize.c + * + * Memoize nodes are intended to sit above parameterized nodes in the plan + * tree in order to cache results from them. The intention here is that a + * repeat scan with a parameter value that has already been seen by the node + * can fetch tuples from the cache rather than having to re-scan the outer + * node all over again. The query planner may choose to make use of one of + * these when it thinks rescans for previously seen values are likely enough + * to warrant adding the additional node. + * + * The method of cache we use is a hash table. When the cache fills, we never + * spill tuples to disk, instead, we choose to evict the least recently used + * cache entry from the cache. We remember the least recently used entry by + * always pushing new entries and entries we look for onto the tail of a + * doubly linked list. This means that older items always bubble to the top + * of this LRU list. + * + * Sometimes our callers won't run their scans to completion. For example a + * semi-join only needs to run until it finds a matching tuple, and once it + * does, the join operator skips to the next outer tuple and does not execute + * the inner side again on that scan. Because of this, we must keep track of + * when a cache entry is complete, and by default, we know it is when we run + * out of tuples to read during the scan. However, there are cases where we + * can mark the cache entry as complete without exhausting the scan of all + * tuples. One case is unique joins, where the join operator knows that there + * will only be at most one match for any given outer tuple. In order to + * support such cases we allow the "singlerow" option to be set for the cache. + * This option marks the cache entry as complete after we read the first tuple + * from the subnode. 
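The LRU policy described above can be pictured with a tiny standalone sketch: recently used keys are pushed onto the tail, and when space runs out the head, the least recently used key, is evicted. The real node orders entries with an intrusive dlist and evicts on a memory budget (mem_used versus mem_limit) rather than an entry count; the array below is only an illustration:

#include <stdio.h>

#define CAP 3                   /* toy capacity standing in for the memory limit */

static int lru[CAP];
static int nused = 0;

static void
touch(int key)
{
    /* remove the key if it is already present */
    for (int i = 0; i < nused; i++)
    {
        if (lru[i] == key)
        {
            for (int j = i; j < nused - 1; j++)
                lru[j] = lru[j + 1];
            nused--;
            break;
        }
    }

    /* evict the least recently used entry (the head) if we're full */
    if (nused == CAP)
    {
        printf("evict %d\n", lru[0]);
        for (int j = 0; j < nused - 1; j++)
            lru[j] = lru[j + 1];
        nused--;
    }

    lru[nused++] = key;         /* push onto the tail: now the most recently used */
}

int
main(void)
{
    touch(1);
    touch(2);
    touch(3);
    touch(1);                   /* 1 becomes most recently used */
    touch(4);                   /* cache full: evicts 2, the least recently used */
    return 0;
}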
+ * + * It's possible when we're filling the cache for a given set of parameters + * that we're unable to free enough memory to store any more tuples. If this + * happens then we'll have already evicted all other cache entries. When + * caching another tuple would cause us to exceed our memory budget, we must + * free the entry that we're currently populating and move the state machine + * into MEMO_CACHE_BYPASS_MODE. This means that we'll not attempt to cache + * any further tuples for this particular scan. We don't have the memory for + * it. The state machine will be reset again on the next rescan. If the + * memory requirements to cache the next parameter's tuples are less + * demanding, then that may allow us to start putting useful entries back into + * the cache again. + * + * + * INTERFACE ROUTINES + * ExecMemoize - lookup cache, exec subplan when not found + * ExecInitMemoize - initialize node and subnodes + * ExecEndMemoize - shutdown node and subnodes + * ExecReScanMemoize - rescan the memoize node + * + * ExecMemoizeEstimate estimates DSM space needed for parallel plan + * ExecMemoizeInitializeDSM initialize DSM for parallel plan + * ExecMemoizeInitializeWorker attach to DSM info in parallel worker + * ExecMemoizeRetrieveInstrumentation get instrumentation from worker + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/hashfn.h" +#include "executor/executor.h" +#include "executor/nodeMemoize.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" + +/* States of the ExecMemoize state machine */ +#define MEMO_CACHE_LOOKUP 1 /* Attempt to perform a cache lookup */ +#define MEMO_CACHE_FETCH_NEXT_TUPLE 2 /* Get another tuple from the cache */ +#define MEMO_FILLING_CACHE 3 /* Read outer node to fill cache */ +#define MEMO_CACHE_BYPASS_MODE 4 /* Bypass mode. Just read from our + * subplan without caching anything */ +#define MEMO_END_OF_SCAN 5 /* Ready for rescan */ + + +/* Helper macros for memory accounting */ +#define EMPTY_ENTRY_MEMORY_BYTES(e) (sizeof(MemoizeEntry) + \ + sizeof(MemoizeKey) + \ + (e)->key->params->t_len); +#define CACHE_TUPLE_BYTES(t) (sizeof(MemoizeTuple) + \ + (t)->mintuple->t_len) + + /* MemoizeTuple Stores an individually cached tuple */ +typedef struct MemoizeTuple +{ + MinimalTuple mintuple; /* Cached tuple */ + struct MemoizeTuple *next; /* The next tuple with the same parameter + * values or NULL if it's the last one */ +} MemoizeTuple; + +/* + * MemoizeKey + * The hash table key for cached entries plus the LRU list link + */ +typedef struct MemoizeKey +{ + MinimalTuple params; + dlist_node lru_node; /* Pointer to next/prev key in LRU list */ +} MemoizeKey; + +/* + * MemoizeEntry + * The data struct that the cache hash table stores + */ +typedef struct MemoizeEntry +{ + MemoizeKey *key; /* Hash key for hash table lookups */ + MemoizeTuple *tuplehead; /* Pointer to the first tuple or NULL if + * no tuples are cached for this entry */ + uint32 hash; /* Hash value (cached) */ + char status; /* Hash status */ + bool complete; /* Did we read the outer plan to completion? 
*/ +} MemoizeEntry; + + +#define SH_PREFIX memoize +#define SH_ELEMENT_TYPE MemoizeEntry +#define SH_KEY_TYPE MemoizeKey * +#define SH_SCOPE static inline +#define SH_DECLARE +#include "lib/simplehash.h" + +static uint32 MemoizeHash_hash(struct memoize_hash *tb, + const MemoizeKey *key); +static bool MemoizeHash_equal(struct memoize_hash *tb, + const MemoizeKey *params1, + const MemoizeKey *params2); + +#define SH_PREFIX memoize +#define SH_ELEMENT_TYPE MemoizeEntry +#define SH_KEY_TYPE MemoizeKey * +#define SH_KEY key +#define SH_HASH_KEY(tb, key) MemoizeHash_hash(tb, key) +#define SH_EQUAL(tb, a, b) MemoizeHash_equal(tb, a, b) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + +/* + * MemoizeHash_hash + * Hash function for simplehash hashtable. 'key' is unused here as we + * require that all table lookups first populate the MemoizeState's + * probeslot with the key values to be looked up. + */ +static uint32 +MemoizeHash_hash(struct memoize_hash *tb, const MemoizeKey *key) +{ + MemoizeState *mstate = (MemoizeState *) tb->private_data; + TupleTableSlot *pslot = mstate->probeslot; + uint32 hashkey = 0; + int numkeys = mstate->nkeys; + + if (mstate->binary_mode) + { + for (int i = 0; i < numkeys; i++) + { + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + if (!pslot->tts_isnull[i]) /* treat nulls as having hash key 0 */ + { + FormData_pg_attribute *attr; + uint32 hkey; + + attr = &pslot->tts_tupleDescriptor->attrs[i]; + + hkey = datum_image_hash(pslot->tts_values[i], attr->attbyval, attr->attlen); + + hashkey ^= hkey; + } + } + } + else + { + FmgrInfo *hashfunctions = mstate->hashfunctions; + Oid *collations = mstate->collations; + + for (int i = 0; i < numkeys; i++) + { + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + if (!pslot->tts_isnull[i]) /* treat nulls as having hash key 0 */ + { + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], + collations[i], pslot->tts_values[i])); + hashkey ^= hkey; + } + } + } + + return murmurhash32(hashkey); +} + +/* + * MemoizeHash_equal + * Equality function for confirming hash value matches during a hash + * table lookup. 'key2' is never used. Instead the MemoizeState's + * probeslot is always populated with details of what's being looked up. + */ +static bool +MemoizeHash_equal(struct memoize_hash *tb, const MemoizeKey *key1, + const MemoizeKey *key2) +{ + MemoizeState *mstate = (MemoizeState *) tb->private_data; + ExprContext *econtext = mstate->ss.ps.ps_ExprContext; + TupleTableSlot *tslot = mstate->tableslot; + TupleTableSlot *pslot = mstate->probeslot; + + /* probeslot should have already been prepared by prepare_probe_slot() */ + ExecStoreMinimalTuple(key1->params, tslot, false); + + if (mstate->binary_mode) + { + int numkeys = mstate->nkeys; + + slot_getallattrs(tslot); + slot_getallattrs(pslot); + + for (int i = 0; i < numkeys; i++) + { + FormData_pg_attribute *attr; + + if (tslot->tts_isnull[i] != pslot->tts_isnull[i]) + return false; + + /* both NULL? 
they're equal */ + if (tslot->tts_isnull[i]) + continue; + + /* perform binary comparison on the two datums */ + attr = &tslot->tts_tupleDescriptor->attrs[i]; + if (!datum_image_eq(tslot->tts_values[i], pslot->tts_values[i], + attr->attbyval, attr->attlen)) + return false; + } + return true; + } + else + { + econtext->ecxt_innertuple = tslot; + econtext->ecxt_outertuple = pslot; + return ExecQualAndReset(mstate->cache_eq_expr, econtext); + } +} + +/* + * Initialize the hash table to empty. + */ +static void +build_hash_table(MemoizeState *mstate, uint32 size) +{ + /* Make a guess at a good size when we're not given a valid size. */ + if (size == 0) + size = 1024; + + /* memoize_create will convert the size to a power of 2 */ + mstate->hashtable = memoize_create(mstate->tableContext, size, mstate); +} + +/* + * prepare_probe_slot + * Populate mstate's probeslot with the values from the tuple stored + * in 'key'. If 'key' is NULL, then perform the population by evaluating + * mstate's param_exprs. + */ +static inline void +prepare_probe_slot(MemoizeState *mstate, MemoizeKey *key) +{ + TupleTableSlot *pslot = mstate->probeslot; + TupleTableSlot *tslot = mstate->tableslot; + int numKeys = mstate->nkeys; + + ExecClearTuple(pslot); + + if (key == NULL) + { + /* Set the probeslot's values based on the current parameter values */ + for (int i = 0; i < numKeys; i++) + pslot->tts_values[i] = ExecEvalExpr(mstate->param_exprs[i], + mstate->ss.ps.ps_ExprContext, + &pslot->tts_isnull[i]); + } + else + { + /* Process the key's MinimalTuple and store the values in probeslot */ + ExecStoreMinimalTuple(key->params, tslot, false); + slot_getallattrs(tslot); + memcpy(pslot->tts_values, tslot->tts_values, sizeof(Datum) * numKeys); + memcpy(pslot->tts_isnull, tslot->tts_isnull, sizeof(bool) * numKeys); + } + + ExecStoreVirtualTuple(pslot); +} + +/* + * entry_purge_tuples + * Remove all tuples from the cache entry pointed to by 'entry'. This + * leaves an empty cache entry. Also, update the memory accounting to + * reflect the removal of the tuples. + */ +static inline void +entry_purge_tuples(MemoizeState *mstate, MemoizeEntry *entry) +{ + MemoizeTuple *tuple = entry->tuplehead; + uint64 freed_mem = 0; + + while (tuple != NULL) + { + MemoizeTuple *next = tuple->next; + + freed_mem += CACHE_TUPLE_BYTES(tuple); + + /* Free memory used for this tuple */ + pfree(tuple->mintuple); + pfree(tuple); + + tuple = next; + } + + entry->complete = false; + entry->tuplehead = NULL; + + /* Update the memory accounting */ + mstate->mem_used -= freed_mem; +} + +/* + * remove_cache_entry + * Remove 'entry' from the cache and free memory used by it. + */ +static void +remove_cache_entry(MemoizeState *mstate, MemoizeEntry *entry) +{ + MemoizeKey *key = entry->key; + + dlist_delete(&entry->key->lru_node); + + /* Remove all of the tuples from this entry */ + entry_purge_tuples(mstate, entry); + + /* + * Update memory accounting. entry_purge_tuples should have already + * subtracted the memory used for each cached tuple. Here we just update + * the amount used by the entry itself. 
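MemoizeHash_hash above combines one hash per cache key column by rotating the accumulator left one bit, XORing in that column's hash (NULLs contribute zero), and finally mixing the result with murmurhash32. A standalone sketch of that combination step; fmix32 here is the murmur3 32-bit finalizer, which is the mixing that murmurhash32 applies:

#include <stdint.h>
#include <stdio.h>

/* Final mixing step (murmur3's fmix32). */
static uint32_t
fmix32(uint32_t h)
{
    h ^= h >> 16;
    h *= 0x85ebca6b;
    h ^= h >> 13;
    h *= 0xc2b2ae35;
    h ^= h >> 16;
    return h;
}

/* Combine one hash per key column in the same style as MemoizeHash_hash. */
static uint32_t
combine_key_hashes(const uint32_t *colhash, const int *isnull, int nkeys)
{
    uint32_t hashkey = 0;

    for (int i = 0; i < nkeys; i++)
    {
        /* rotate the accumulator left one bit per column */
        hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
        if (!isnull[i])         /* NULL columns contribute hash value 0 */
            hashkey ^= colhash[i];
    }
    return fmix32(hashkey);
}

int
main(void)
{
    uint32_t colhash[] = {0xdeadbeef, 0x12345678};
    int      isnull[]  = {0, 0};

    printf("0x%08x\n", combine_key_hashes(colhash, isnull, 2));
    return 0;
}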
+ */ + mstate->mem_used -= EMPTY_ENTRY_MEMORY_BYTES(entry); + + /* Remove the entry from the cache */ + memoize_delete_item(mstate->hashtable, entry); + + pfree(key->params); + pfree(key); +} + +/* + * cache_purge_all + * Remove all items from the cache + */ +static void +cache_purge_all(MemoizeState *mstate) +{ + uint64 evictions = mstate->hashtable->members; + PlanState *pstate = (PlanState *) mstate; + + /* + * Likely the most efficient way to remove all items is to just reset the + * memory context for the cache and then rebuild a fresh hash table. This + * saves having to remove each item one by one and pfree each cached tuple + */ + MemoryContextReset(mstate->tableContext); + + /* Make the hash table the same size as the original size */ + build_hash_table(mstate, ((Memoize *) pstate->plan)->est_entries); + + /* reset the LRU list */ + dlist_init(&mstate->lru_list); + mstate->last_tuple = NULL; + mstate->entry = NULL; + + mstate->mem_used = 0; + + /* XXX should we add something new to track these purges? */ + mstate->stats.cache_evictions += evictions; /* Update Stats */ +} + +/* + * cache_reduce_memory + * Evict older and less recently used items from the cache in order to + * reduce the memory consumption back to something below the + * MemoizeState's mem_limit. + * + * 'specialkey', if not NULL, causes the function to return false if the entry + * which the key belongs to is removed from the cache. + */ +static bool +cache_reduce_memory(MemoizeState *mstate, MemoizeKey *specialkey) +{ + bool specialkey_intact = true; /* for now */ + dlist_mutable_iter iter; + uint64 evictions = 0; + + /* Update peak memory usage */ + if (mstate->mem_used > mstate->stats.mem_peak) + mstate->stats.mem_peak = mstate->mem_used; + + /* We expect only to be called when we've gone over budget on memory */ + Assert(mstate->mem_used > mstate->mem_limit); + + /* Start the eviction process starting at the head of the LRU list. */ + dlist_foreach_modify(iter, &mstate->lru_list) + { + MemoizeKey *key = dlist_container(MemoizeKey, lru_node, iter.cur); + MemoizeEntry *entry; + + /* + * Populate the hash probe slot in preparation for looking up this LRU + * entry. + */ + prepare_probe_slot(mstate, key); + + /* + * Ideally the LRU list pointers would be stored in the entry itself + * rather than in the key. Unfortunately, we can't do that as the + * simplehash.h code may resize the table and allocate new memory for + * entries which would result in those pointers pointing to the old + * buckets. However, it's fine to use the key to store this as that's + * only referenced by a pointer in the entry, which of course follows + * the entry whenever the hash table is resized. Since we only have a + * pointer to the key here, we must perform a hash table lookup to + * find the entry that the key belongs to. + */ + entry = memoize_lookup(mstate->hashtable, NULL); + + /* + * Sanity check that we found the entry belonging to the LRU list + * item. A misbehaving hash or equality function could cause the + * entry not to be found or the wrong entry to be found. + */ + if (unlikely(entry == NULL || entry->key != key)) + elog(ERROR, "could not find memoization table entry"); + + /* + * If we're being called to free memory while the cache is being + * populated with new tuples, then we'd better take some care as we + * could end up freeing the entry which 'specialkey' belongs to. 
+ * Generally callers will pass 'specialkey' as the key for the cache + * entry which is currently being populated, so we must set + * 'specialkey_intact' to false to inform the caller the specialkey + * entry has been removed. + */ + if (key == specialkey) + specialkey_intact = false; + + /* + * Finally remove the entry. This will remove from the LRU list too. + */ + remove_cache_entry(mstate, entry); + + evictions++; + + /* Exit if we've freed enough memory */ + if (mstate->mem_used <= mstate->mem_limit) + break; + } + + mstate->stats.cache_evictions += evictions; /* Update Stats */ + + return specialkey_intact; +} + +/* + * cache_lookup + * Perform a lookup to see if we've already cached tuples based on the + * scan's current parameters. If we find an existing entry we move it to + * the end of the LRU list, set *found to true then return it. If we + * don't find an entry then we create a new one and add it to the end of + * the LRU list. We also update cache memory accounting and remove older + * entries if we go over the memory budget. If we managed to free enough + * memory we return the new entry, else we return NULL. + * + * Callers can assume we'll never return NULL when *found is true. + */ +static MemoizeEntry * +cache_lookup(MemoizeState *mstate, bool *found) +{ + MemoizeKey *key; + MemoizeEntry *entry; + MemoryContext oldcontext; + + /* prepare the probe slot with the current scan parameters */ + prepare_probe_slot(mstate, NULL); + + /* + * Add the new entry to the cache. No need to pass a valid key since the + * hash function uses mstate's probeslot, which we populated above. + */ + entry = memoize_insert(mstate->hashtable, NULL, found); + + if (*found) + { + /* + * Move existing entry to the tail of the LRU list to mark it as the + * most recently used item. + */ + dlist_move_tail(&mstate->lru_list, &entry->key->lru_node); + + return entry; + } + + oldcontext = MemoryContextSwitchTo(mstate->tableContext); + + /* Allocate a new key */ + entry->key = key = (MemoizeKey *) palloc(sizeof(MemoizeKey)); + key->params = ExecCopySlotMinimalTuple(mstate->probeslot); + + /* Update the total cache memory utilization */ + mstate->mem_used += EMPTY_ENTRY_MEMORY_BYTES(entry); + + /* Initialize this entry */ + entry->complete = false; + entry->tuplehead = NULL; + + /* + * Since this is the most recently used entry, push this entry onto the + * end of the LRU list. + */ + dlist_push_tail(&mstate->lru_list, &entry->key->lru_node); + + mstate->last_tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + /* + * If we've gone over our memory budget, then we'll free up some space in + * the cache. + */ + if (mstate->mem_used > mstate->mem_limit) + { + /* + * Try to free up some memory. It's highly unlikely that we'll fail + * to do so here since the entry we've just added is yet to contain + * any tuples and we're able to remove any other entry to reduce the + * memory consumption. + */ + if (unlikely(!cache_reduce_memory(mstate, key))) + return NULL; + + /* + * The process of removing entries from the cache may have caused the + * code in simplehash.h to shuffle elements to earlier buckets in the + * hash table. If it has, we'll need to find the entry again by + * performing a lookup. Fortunately, we can detect if this has + * happened by seeing if the entry is still in use and that the key + * pointer matches our expected key. 
+ */ + if (entry->status != memoize_SH_IN_USE || entry->key != key) + { + /* + * We need to repopulate the probeslot as lookups performed during + * the cache evictions above will have stored some other key. + */ + prepare_probe_slot(mstate, key); + + /* Re-find the newly added entry */ + entry = memoize_lookup(mstate->hashtable, NULL); + Assert(entry != NULL); + } + } + + return entry; +} + +/* + * cache_store_tuple + * Add the tuple stored in 'slot' to the mstate's current cache entry. + * The cache entry must have already been made with cache_lookup(). + * mstate's last_tuple field must point to the tail of mstate->entry's + * list of tuples. + */ +static bool +cache_store_tuple(MemoizeState *mstate, TupleTableSlot *slot) +{ + MemoizeTuple *tuple; + MemoizeEntry *entry = mstate->entry; + MemoryContext oldcontext; + + Assert(slot != NULL); + Assert(entry != NULL); + + oldcontext = MemoryContextSwitchTo(mstate->tableContext); + + tuple = (MemoizeTuple *) palloc(sizeof(MemoizeTuple)); + tuple->mintuple = ExecCopySlotMinimalTuple(slot); + tuple->next = NULL; + + /* Account for the memory we just consumed */ + mstate->mem_used += CACHE_TUPLE_BYTES(tuple); + + if (entry->tuplehead == NULL) + { + /* + * This is the first tuple for this entry, so just point the list head + * to it. + */ + entry->tuplehead = tuple; + } + else + { + /* push this tuple onto the tail of the list */ + mstate->last_tuple->next = tuple; + } + + mstate->last_tuple = tuple; + MemoryContextSwitchTo(oldcontext); + + /* + * If we've gone over our memory budget then free up some space in the + * cache. + */ + if (mstate->mem_used > mstate->mem_limit) + { + MemoizeKey *key = entry->key; + + if (!cache_reduce_memory(mstate, key)) + return false; + + /* + * The process of removing entries from the cache may have caused the + * code in simplehash.h to shuffle elements to earlier buckets in the + * hash table. If it has, we'll need to find the entry again by + * performing a lookup. Fortunately, we can detect if this has + * happened by seeing if the entry is still in use and that the key + * pointer matches our expected key. + */ + if (entry->status != memoize_SH_IN_USE || entry->key != key) + { + /* + * We need to repopulate the probeslot as lookups performed during + * the cache evictions above will have stored some other key. + */ + prepare_probe_slot(mstate, key); + + /* Re-find the entry */ + mstate->entry = entry = memoize_lookup(mstate->hashtable, NULL); + Assert(entry != NULL); + } + } + + return true; +} + +static TupleTableSlot * +ExecMemoize(PlanState *pstate) +{ + MemoizeState *node = castNode(MemoizeState, pstate); + PlanState *outerNode; + TupleTableSlot *slot; + + switch (node->mstatus) + { + case MEMO_CACHE_LOOKUP: + { + MemoizeEntry *entry; + TupleTableSlot *outerslot; + bool found; + + Assert(node->entry == NULL); + + /* + * We're only ever in this state for the first call of the + * scan. Here we have a look to see if we've already seen the + * current parameters before and if we have already cached a + * complete set of records that the outer plan will return for + * these parameters. + * + * When we find a valid cache entry, we'll return the first + * tuple from it. If not found, we'll create a cache entry and + * then try to fetch a tuple from the outer scan. If we find + * one there, we'll try to cache it. 
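Stripped of the executor details, the lookup state above amounts to: probe the cache with the current parameter values, replay the cached tuples on a hit with a complete entry, and otherwise run the subplan and cache what it returns. A deliberately tiny standalone sketch of that hit/miss behaviour (one integer parameter, one cached value per slot, no eviction); none of these names are the node's own structures:

#include <stdbool.h>
#include <stdio.h>

#define CACHE_SLOTS 8

typedef struct CacheEntry
{
    bool valid;
    int  param;
    int  result;
} CacheEntry;

static CacheEntry cache[CACHE_SLOTS];
static int hits, misses;

static int
expensive_scan(int param)
{
    return param * param;       /* stands in for rescanning the inner side */
}

static int
memoized_scan(int param)
{
    CacheEntry *e = &cache[(unsigned) param % CACHE_SLOTS];

    if (e->valid && e->param == param)
    {
        hits++;
        return e->result;       /* cache hit: no rescan of the subplan */
    }
    misses++;
    e->valid = true;
    e->param = param;
    e->result = expensive_scan(param);
    return e->result;
}

int
main(void)
{
    int params[] = {4, 7, 4, 4, 7};

    for (int i = 0; i < 5; i++)
        printf("%d\n", memoized_scan(params[i]));
    printf("hits=%d misses=%d\n", hits, misses);   /* hits=3 misses=2 */
    return 0;
}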
+ */ + + /* see if we've got anything cached for the current parameters */ + entry = cache_lookup(node, &found); + + if (found && entry->complete) + { + node->stats.cache_hits += 1; /* stats update */ + + /* + * Set last_tuple and entry so that the state + * MEMO_CACHE_FETCH_NEXT_TUPLE can easily find the next + * tuple for these parameters. + */ + node->last_tuple = entry->tuplehead; + node->entry = entry; + + /* Fetch the first cached tuple, if there is one */ + if (entry->tuplehead) + { + node->mstatus = MEMO_CACHE_FETCH_NEXT_TUPLE; + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecStoreMinimalTuple(entry->tuplehead->mintuple, + slot, false); + + return slot; + } + + /* The cache entry is void of any tuples. */ + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + /* Handle cache miss */ + node->stats.cache_misses += 1; /* stats update */ + + if (found) + { + /* + * A cache entry was found, but the scan for that entry + * did not run to completion. We'll just remove all + * tuples and start again. It might be tempting to + * continue where we left off, but there's no guarantee + * the outer node will produce the tuples in the same + * order as it did last time. + */ + entry_purge_tuples(node, entry); + } + + /* Scan the outer node for a tuple to cache */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + /* + * cache_lookup may have returned NULL due to failure to + * free enough cache space, so ensure we don't do anything + * here that assumes it worked. There's no need to go into + * bypass mode here as we're setting mstatus to end of + * scan. + */ + if (likely(entry)) + entry->complete = true; + + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + node->entry = entry; + + /* + * If we failed to create the entry or failed to store the + * tuple in the entry, then go into bypass mode. + */ + if (unlikely(entry == NULL || + !cache_store_tuple(node, outerslot))) + { + node->stats.cache_overflows += 1; /* stats update */ + + node->mstatus = MEMO_CACHE_BYPASS_MODE; + + /* + * No need to clear out last_tuple as we'll stay in bypass + * mode until the end of the scan. + */ + } + else + { + /* + * If we only expect a single row from this scan then we + * can mark that we're not expecting more. This allows + * cache lookups to work even when the scan has not been + * executed to completion. + */ + entry->complete = node->singlerow; + node->mstatus = MEMO_FILLING_CACHE; + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecCopySlot(slot, outerslot); + return slot; + } + + case MEMO_CACHE_FETCH_NEXT_TUPLE: + { + /* We shouldn't be in this state if these are not set */ + Assert(node->entry != NULL); + Assert(node->last_tuple != NULL); + + /* Skip to the next tuple to output */ + node->last_tuple = node->last_tuple->next; + + /* No more tuples in the cache */ + if (node->last_tuple == NULL) + { + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecStoreMinimalTuple(node->last_tuple->mintuple, slot, + false); + + return slot; + } + + case MEMO_FILLING_CACHE: + { + TupleTableSlot *outerslot; + MemoizeEntry *entry = node->entry; + + /* entry should already have been set by MEMO_CACHE_LOOKUP */ + Assert(entry != NULL); + + /* + * When in the MEMO_FILLING_CACHE state, we've just had a + * cache miss and are populating the cache with the current + * scan tuples. 
+ */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + /* No more tuples. Mark it as complete */ + entry->complete = true; + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + /* + * Validate if the planner properly set the singlerow flag. It + * should only set that if each cache entry can, at most, + * return 1 row. + */ + if (unlikely(entry->complete)) + elog(ERROR, "cache entry already complete"); + + /* Record the tuple in the current cache entry */ + if (unlikely(!cache_store_tuple(node, outerslot))) + { + /* Couldn't store it? Handle overflow */ + node->stats.cache_overflows += 1; /* stats update */ + + node->mstatus = MEMO_CACHE_BYPASS_MODE; + + /* + * No need to clear out entry or last_tuple as we'll stay + * in bypass mode until the end of the scan. + */ + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecCopySlot(slot, outerslot); + return slot; + } + + case MEMO_CACHE_BYPASS_MODE: + { + TupleTableSlot *outerslot; + + /* + * When in bypass mode we just continue to read tuples without + * caching. We need to wait until the next rescan before we + * can come out of this mode. + */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecCopySlot(slot, outerslot); + return slot; + } + + case MEMO_END_OF_SCAN: + + /* + * We've already returned NULL for this scan, but just in case + * something calls us again by mistake. + */ + return NULL; + + default: + elog(ERROR, "unrecognized memoize state: %d", + (int) node->mstatus); + return NULL; + } /* switch */ +} + +MemoizeState * +ExecInitMemoize(Memoize *node, EState *estate, int eflags) +{ + MemoizeState *mstate = makeNode(MemoizeState); + Plan *outerNode; + int i; + int nkeys; + Oid *eqfuncoids; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + mstate->ss.ps.plan = (Plan *) node; + mstate->ss.ps.state = estate; + mstate->ss.ps.ExecProcNode = ExecMemoize; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &mstate->ss.ps); + + outerNode = outerPlan(node); + outerPlanState(mstate) = ExecInitNode(outerNode, estate, eflags); + + /* + * Initialize return slot and type. No need to initialize projection info + * because this node doesn't do projections. + */ + ExecInitResultTupleSlotTL(&mstate->ss.ps, &TTSOpsMinimalTuple); + mstate->ss.ps.ps_ProjInfo = NULL; + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &mstate->ss, &TTSOpsMinimalTuple); + + /* + * Set the state machine to lookup the cache. We won't find anything + * until we cache something, but this saves a special case to create the + * first entry. 
+ */ + mstate->mstatus = MEMO_CACHE_LOOKUP; + + mstate->nkeys = nkeys = node->numKeys; + mstate->hashkeydesc = ExecTypeFromExprList(node->param_exprs); + mstate->tableslot = MakeSingleTupleTableSlot(mstate->hashkeydesc, + &TTSOpsMinimalTuple); + mstate->probeslot = MakeSingleTupleTableSlot(mstate->hashkeydesc, + &TTSOpsVirtual); + + mstate->param_exprs = (ExprState **) palloc(nkeys * sizeof(ExprState *)); + mstate->collations = node->collations; /* Just point directly to the plan + * data */ + mstate->hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); + + eqfuncoids = palloc(nkeys * sizeof(Oid)); + + for (i = 0; i < nkeys; i++) + { + Oid hashop = node->hashOperators[i]; + Oid left_hashfn; + Oid right_hashfn; + Expr *param_expr = (Expr *) list_nth(node->param_exprs, i); + + if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn)) + elog(ERROR, "could not find hash function for hash operator %u", + hashop); + + fmgr_info(left_hashfn, &mstate->hashfunctions[i]); + + mstate->param_exprs[i] = ExecInitExpr(param_expr, (PlanState *) mstate); + eqfuncoids[i] = get_opcode(hashop); + } + + mstate->cache_eq_expr = ExecBuildParamSetEqual(mstate->hashkeydesc, + &TTSOpsMinimalTuple, + &TTSOpsVirtual, + eqfuncoids, + node->collations, + node->param_exprs, + (PlanState *) mstate); + + pfree(eqfuncoids); + mstate->mem_used = 0; + + /* Limit the total memory consumed by the cache to this */ + mstate->mem_limit = get_hash_memory_limit(); + + /* A memory context dedicated for the cache */ + mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext, + "MemoizeHashTable", + ALLOCSET_DEFAULT_SIZES); + + dlist_init(&mstate->lru_list); + mstate->last_tuple = NULL; + mstate->entry = NULL; + + /* + * Mark if we can assume the cache entry is completed after we get the + * first record for it. Some callers might not call us again after + * getting the first match. e.g. A join operator performing a unique join + * is able to skip to the next outer tuple after getting the first + * matching inner tuple. In this case, the cache entry is complete after + * getting the first tuple. This allows us to mark it as so. + */ + mstate->singlerow = node->singlerow; + mstate->keyparamids = node->keyparamids; + + /* + * Record if the cache keys should be compared bit by bit, or logically + * using the type's hash equality operator + */ + mstate->binary_mode = node->binary_mode; + + /* Zero the statistics counters */ + memset(&mstate->stats, 0, sizeof(MemoizeInstrumentation)); + + /* Allocate and set up the actual cache */ + build_hash_table(mstate, node->est_entries); + + return mstate; +} + +void +ExecEndMemoize(MemoizeState *node) +{ +#ifdef USE_ASSERT_CHECKING + /* Validate the memory accounting code is correct in assert builds. */ + { + int count; + uint64 mem = 0; + memoize_iterator i; + MemoizeEntry *entry; + + memoize_start_iterate(node->hashtable, &i); + + count = 0; + while ((entry = memoize_iterate(node->hashtable, &i)) != NULL) + { + MemoizeTuple *tuple = entry->tuplehead; + + mem += EMPTY_ENTRY_MEMORY_BYTES(entry); + while (tuple != NULL) + { + mem += CACHE_TUPLE_BYTES(tuple); + tuple = tuple->next; + } + count++; + } + + Assert(count == node->hashtable->members); + Assert(mem == node->mem_used); + } +#endif + + /* + * When ending a parallel worker, copy the statistics gathered by the + * worker back into shared memory so that it can be picked up by the main + * process to report in EXPLAIN ANALYZE. 
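+ * (If mem_peak was never recorded, it is filled in from the
+ * current mem_used first, so that EXPLAIN can still report this
+ * worker's cache memory consumption.)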
+ */ + if (node->shared_info != NULL && IsParallelWorker()) + { + MemoizeInstrumentation *si; + + /* Make mem_peak available for EXPLAIN */ + if (node->stats.mem_peak == 0) + node->stats.mem_peak = node->mem_used; + + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + memcpy(si, &node->stats, sizeof(MemoizeInstrumentation)); + } + + /* Remove the cache context */ + MemoryContextDelete(node->tableContext); + + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to cache result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + /* + * free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * shut down the subplan + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanMemoize(MemoizeState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* Mark that we must lookup the cache for a new set of parameters */ + node->mstatus = MEMO_CACHE_LOOKUP; + + /* nullify pointers used for the last scan */ + node->entry = NULL; + node->last_tuple = NULL; + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + /* + * Purge the entire cache if a parameter changed that is not part of the + * cache key. + */ + if (bms_nonempty_difference(outerPlan->chgParam, node->keyparamids)) + cache_purge_all(node); +} + +/* + * ExecEstimateCacheEntryOverheadBytes + * For use in the query planner to help it estimate the amount of memory + * required to store a single entry in the cache. + */ +double +ExecEstimateCacheEntryOverheadBytes(double ntuples) +{ + return sizeof(MemoizeEntry) + sizeof(MemoizeKey) + sizeof(MemoizeTuple) * + ntuples; +} + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + + /* ---------------------------------------------------------------- + * ExecMemoizeEstimate + * + * Estimate space required to propagate memoize statistics. + * ---------------------------------------------------------------- + */ +void +ExecMemoizeEstimate(MemoizeState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(MemoizeInstrumentation)); + size = add_size(size, offsetof(SharedMemoizeInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecMemoizeInitializeDSM + * + * Initialize DSM space for memoize statistics. 
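+ * The shared area holds one MemoizeInstrumentation slot per worker;
+ * each worker copies its counters into its own slot at shutdown and
+ * the leader collects them via ExecMemoizeRetrieveInstrumentation.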
+ * ---------------------------------------------------------------- + */ +void +ExecMemoizeInitializeDSM(MemoizeState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedMemoizeInfo, sinstrument) + + pcxt->nworkers * sizeof(MemoizeInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecMemoizeInitializeWorker + * + * Attach worker to DSM space for memoize statistics. + * ---------------------------------------------------------------- + */ +void +ExecMemoizeInitializeWorker(MemoizeState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); +} + +/* ---------------------------------------------------------------- + * ExecMemoizeRetrieveInstrumentation + * + * Transfer memoize statistics from DSM to private memory. + * ---------------------------------------------------------------- + */ +void +ExecMemoizeRetrieveInstrumentation(MemoizeState *node) +{ + Size size; + SharedMemoizeInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedMemoizeInfo, sinstrument) + + node->shared_info->num_workers * sizeof(MemoizeInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c new file mode 100644 index 0000000..617bffb --- /dev/null +++ b/src/backend/executor/nodeMergeAppend.c @@ -0,0 +1,389 @@ +/*------------------------------------------------------------------------- + * + * nodeMergeAppend.c + * routines to handle MergeAppend nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMergeAppend.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitMergeAppend - initialize the MergeAppend node + * ExecMergeAppend - retrieve the next tuple from the node + * ExecEndMergeAppend - shut down the MergeAppend node + * ExecReScanMergeAppend - rescan the MergeAppend node + * + * NOTES + * A MergeAppend node contains a list of one or more subplans. + * These are each expected to deliver tuples that are sorted according + * to a common sort key. The MergeAppend node merges these streams + * to produce output sorted the same way. + * + * MergeAppend nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans so + * a typical MergeAppend node looks like this in the plan tree: + * + * ... + * / + * MergeAppend---+------+------+--- nil + * / \ | | | + * nil nil ... ... ... + * subplans + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/execPartition.h" +#include "executor/nodeMergeAppend.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" + +/* + * We have one slot for each item in the heap array. We use SlotNumber + * to store slot indexes. 
This doesn't actually provide any formal + * type-safety, but it makes the code more self-documenting. + */ +typedef int32 SlotNumber; + +static TupleTableSlot *ExecMergeAppend(PlanState *pstate); +static int heap_compare_slots(Datum a, Datum b, void *arg); + + +/* ---------------------------------------------------------------- + * ExecInitMergeAppend + * + * Begin all of the subscans of the MergeAppend node. + * ---------------------------------------------------------------- + */ +MergeAppendState * +ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) +{ + MergeAppendState *mergestate = makeNode(MergeAppendState); + PlanState **mergeplanstates; + Bitmapset *validsubplans; + int nplans; + int i, + j; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create new MergeAppendState for our node + */ + mergestate->ps.plan = (Plan *) node; + mergestate->ps.state = estate; + mergestate->ps.ExecProcNode = ExecMergeAppend; + + /* If run-time partition pruning is enabled, then set that up now */ + if (node->part_prune_info != NULL) + { + PartitionPruneState *prunestate; + + /* We may need an expression context to evaluate partition exprs */ + ExecAssignExprContext(estate, &mergestate->ps); + + prunestate = ExecCreatePartitionPruneState(&mergestate->ps, + node->part_prune_info); + mergestate->ms_prune_state = prunestate; + + /* Perform an initial partition prune, if required. */ + if (prunestate->do_initial_prune) + { + /* Determine which subplans survive initial pruning */ + validsubplans = ExecFindInitialMatchingSubPlans(prunestate, + list_length(node->mergeplans)); + + nplans = bms_num_members(validsubplans); + } + else + { + /* We'll need to initialize all subplans */ + nplans = list_length(node->mergeplans); + Assert(nplans > 0); + validsubplans = bms_add_range(NULL, 0, nplans - 1); + } + + /* + * When no run-time pruning is required and there's at least one + * subplan, we can fill as_valid_subplans immediately, preventing + * later calls to ExecFindMatchingSubPlans. + */ + if (!prunestate->do_exec_prune && nplans > 0) + mergestate->ms_valid_subplans = bms_add_range(NULL, 0, nplans - 1); + } + else + { + nplans = list_length(node->mergeplans); + + /* + * When run-time partition pruning is not enabled we can just mark all + * subplans as valid; they must also all be initialized. + */ + Assert(nplans > 0); + mergestate->ms_valid_subplans = validsubplans = + bms_add_range(NULL, 0, nplans - 1); + mergestate->ms_prune_state = NULL; + } + + mergeplanstates = (PlanState **) palloc(nplans * sizeof(PlanState *)); + mergestate->mergeplans = mergeplanstates; + mergestate->ms_nplans = nplans; + + mergestate->ms_slots = (TupleTableSlot **) palloc0(sizeof(TupleTableSlot *) * nplans); + mergestate->ms_heap = binaryheap_allocate(nplans, heap_compare_slots, + mergestate); + + /* + * Miscellaneous initialization + * + * MergeAppend nodes do have Result slots, which hold pointers to tuples, + * so we have to initialize them. FIXME + */ + ExecInitResultTupleSlotTL(&mergestate->ps, &TTSOpsVirtual); + + /* node returns slots from each of its subnodes, therefore not fixed */ + mergestate->ps.resultopsset = true; + mergestate->ps.resultopsfixed = false; + + /* + * call ExecInitNode on each of the valid plans to be executed and save + * the results into the mergeplanstates array. 
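+ * For example, if initial pruning left validsubplans = {0, 2, 5},
+ * the loop below visits those members in order and fills
+ * mergeplanstates[0], [1] and [2] with the states for mergeplans
+ * 0, 2 and 5.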
+ */ + j = 0; + i = -1; + while ((i = bms_next_member(validsubplans, i)) >= 0) + { + Plan *initNode = (Plan *) list_nth(node->mergeplans, i); + + mergeplanstates[j++] = ExecInitNode(initNode, estate, eflags); + } + + mergestate->ps.ps_ProjInfo = NULL; + + /* + * initialize sort-key information + */ + mergestate->ms_nkeys = node->numCols; + mergestate->ms_sortkeys = palloc0(sizeof(SortSupportData) * node->numCols); + + for (i = 0; i < node->numCols; i++) + { + SortSupport sortKey = mergestate->ms_sortkeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = node->collations[i]; + sortKey->ssup_nulls_first = node->nullsFirst[i]; + sortKey->ssup_attno = node->sortColIdx[i]; + + /* + * It isn't feasible to perform abbreviated key conversion, since + * tuples are pulled into mergestate's binary heap as needed. It + * would likely be counter-productive to convert tuples into an + * abbreviated representation as they're pulled up, so opt out of that + * additional optimization entirely. + */ + sortKey->abbreviate = false; + + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); + } + + /* + * initialize to show we have not run the subplans yet + */ + mergestate->ms_initialized = false; + + return mergestate; +} + +/* ---------------------------------------------------------------- + * ExecMergeAppend + * + * Handles iteration over multiple subplans. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecMergeAppend(PlanState *pstate) +{ + MergeAppendState *node = castNode(MergeAppendState, pstate); + TupleTableSlot *result; + SlotNumber i; + + CHECK_FOR_INTERRUPTS(); + + if (!node->ms_initialized) + { + /* Nothing to do if all subplans were pruned */ + if (node->ms_nplans == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * If we've yet to determine the valid subplans then do so now. If + * run-time pruning is disabled then the valid subplans will always be + * set to all subplans. + */ + if (node->ms_valid_subplans == NULL) + node->ms_valid_subplans = + ExecFindMatchingSubPlans(node->ms_prune_state); + + /* + * First time through: pull the first tuple from each valid subplan, + * and set up the heap. + */ + i = -1; + while ((i = bms_next_member(node->ms_valid_subplans, i)) >= 0) + { + node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); + if (!TupIsNull(node->ms_slots[i])) + binaryheap_add_unordered(node->ms_heap, Int32GetDatum(i)); + } + binaryheap_build(node->ms_heap); + node->ms_initialized = true; + } + else + { + /* + * Otherwise, pull the next tuple from whichever subplan we returned + * from last time, and reinsert the subplan index into the heap, + * because it might now compare differently against the existing + * elements of the heap. (We could perhaps simplify the logic a bit + * by doing this before returning from the prior call, but it's better + * to not pull tuples until necessary.) 
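+ * As a small illustration: with two inputs A = (1,4) and B = (2,3),
+ * the first call loads 1 and 2 into the heap and returns A's 1; the
+ * next call pulls A's 4, re-inserts A, and returns B's 2; then come
+ * B's 3 and A's 4, so the merged output stays sorted.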
+ */ + i = DatumGetInt32(binaryheap_first(node->ms_heap)); + node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); + if (!TupIsNull(node->ms_slots[i])) + binaryheap_replace_first(node->ms_heap, Int32GetDatum(i)); + else + (void) binaryheap_remove_first(node->ms_heap); + } + + if (binaryheap_empty(node->ms_heap)) + { + /* All the subplans are exhausted, and so is the heap */ + result = ExecClearTuple(node->ps.ps_ResultTupleSlot); + } + else + { + i = DatumGetInt32(binaryheap_first(node->ms_heap)); + result = node->ms_slots[i]; + } + + return result; +} + +/* + * Compare the tuples in the two given slots. + */ +static int32 +heap_compare_slots(Datum a, Datum b, void *arg) +{ + MergeAppendState *node = (MergeAppendState *) arg; + SlotNumber slot1 = DatumGetInt32(a); + SlotNumber slot2 = DatumGetInt32(b); + + TupleTableSlot *s1 = node->ms_slots[slot1]; + TupleTableSlot *s2 = node->ms_slots[slot2]; + int nkey; + + Assert(!TupIsNull(s1)); + Assert(!TupIsNull(s2)); + + for (nkey = 0; nkey < node->ms_nkeys; nkey++) + { + SortSupport sortKey = node->ms_sortkeys + nkey; + AttrNumber attno = sortKey->ssup_attno; + Datum datum1, + datum2; + bool isNull1, + isNull2; + int compare; + + datum1 = slot_getattr(s1, attno, &isNull1); + datum2 = slot_getattr(s2, attno, &isNull2); + + compare = ApplySortComparator(datum1, isNull1, + datum2, isNull2, + sortKey); + if (compare != 0) + { + INVERT_COMPARE_RESULT(compare); + return compare; + } + } + return 0; +} + +/* ---------------------------------------------------------------- + * ExecEndMergeAppend + * + * Shuts down the subscans of the MergeAppend node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndMergeAppend(MergeAppendState *node) +{ + PlanState **mergeplans; + int nplans; + int i; + + /* + * get information from the node + */ + mergeplans = node->mergeplans; + nplans = node->ms_nplans; + + /* + * shut down each of the subscans + */ + for (i = 0; i < nplans; i++) + ExecEndNode(mergeplans[i]); +} + +void +ExecReScanMergeAppend(MergeAppendState *node) +{ + int i; + + /* + * If any PARAM_EXEC Params used in pruning expressions have changed, then + * we'd better unset the valid subplans so that they are reselected for + * the new parameter values. + */ + if (node->ms_prune_state && + bms_overlap(node->ps.chgParam, + node->ms_prune_state->execparamids)) + { + bms_free(node->ms_valid_subplans); + node->ms_valid_subplans = NULL; + } + + for (i = 0; i < node->ms_nplans; i++) + { + PlanState *subnode = node->mergeplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
+ */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } + binaryheap_reset(node->ms_heap); + node->ms_initialized = false; +} diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c new file mode 100644 index 0000000..5ff3f4c --- /dev/null +++ b/src/backend/executor/nodeMergejoin.c @@ -0,0 +1,1678 @@ +/*------------------------------------------------------------------------- + * + * nodeMergejoin.c + * routines supporting merge joins + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMergejoin.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecMergeJoin mergejoin outer and inner relations. + * ExecInitMergeJoin creates and initializes run time states + * ExecEndMergeJoin cleans up the node. + * + * NOTES + * + * Merge-join is done by joining the inner and outer tuples satisfying + * join clauses of the form ((= outerKey innerKey) ...). + * The join clause list is provided by the query planner and may contain + * more than one (= outerKey innerKey) clause (for composite sort key). + * + * However, the query executor needs to know whether an outer + * tuple is "greater/smaller" than an inner tuple so that it can + * "synchronize" the two relations. For example, consider the following + * relations: + * + * outer: (0 ^1 1 2 5 5 5 6 6 7) current tuple: 1 + * inner: (1 ^3 5 5 5 5 6) current tuple: 3 + * + * To continue the merge-join, the executor needs to scan both inner + * and outer relations till the matching tuples 5. It needs to know + * that currently inner tuple 3 is "greater" than outer tuple 1 and + * therefore it should scan the outer relation first to find a + * matching tuple and so on. + * + * Therefore, rather than directly executing the merge join clauses, + * we evaluate the left and right key expressions separately and then + * compare the columns one at a time (see MJCompare). The planner + * passes us enough information about the sort ordering of the inputs + * to allow us to determine how to make the comparison. We may use the + * appropriate btree comparison function, since Postgres' only notion + * of ordering is specified by btree opfamilies. + * + * + * Consider the above relations and suppose that the executor has + * just joined the first outer "5" with the last inner "5". The + * next step is of course to join the second outer "5" with all + * the inner "5's". This requires repositioning the inner "cursor" + * to point at the first inner "5". This is done by "marking" the + * first inner 5 so we can restore the "cursor" to it before joining + * with the second outer 5. The access method interface provides + * routines to mark and restore to a tuple. 
+ * + * + * Essential operation of the merge join algorithm is as follows: + * + * Join { + * get initial outer and inner tuples INITIALIZE + * do forever { + * while (outer != inner) { SKIP_TEST + * if (outer < inner) + * advance outer SKIPOUTER_ADVANCE + * else + * advance inner SKIPINNER_ADVANCE + * } + * mark inner position SKIP_TEST + * do forever { + * while (outer == inner) { + * join tuples JOINTUPLES + * advance inner position NEXTINNER + * } + * advance outer position NEXTOUTER + * if (outer == mark) TESTOUTER + * restore inner position to mark TESTOUTER + * else + * break // return to top of outer loop + * } + * } + * } + * + * The merge join operation is coded in the fashion + * of a state machine. At each state, we do something and then + * proceed to another state. This state is stored in the node's + * execution state information and is preserved across calls to + * ExecMergeJoin. -cim 10/31/89 + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "executor/execdebug.h" +#include "executor/nodeMergejoin.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + + +/* + * States of the ExecMergeJoin state machine + */ +#define EXEC_MJ_INITIALIZE_OUTER 1 +#define EXEC_MJ_INITIALIZE_INNER 2 +#define EXEC_MJ_JOINTUPLES 3 +#define EXEC_MJ_NEXTOUTER 4 +#define EXEC_MJ_TESTOUTER 5 +#define EXEC_MJ_NEXTINNER 6 +#define EXEC_MJ_SKIP_TEST 7 +#define EXEC_MJ_SKIPOUTER_ADVANCE 8 +#define EXEC_MJ_SKIPINNER_ADVANCE 9 +#define EXEC_MJ_ENDOUTER 10 +#define EXEC_MJ_ENDINNER 11 + +/* + * Runtime data for each mergejoin clause + */ +typedef struct MergeJoinClauseData +{ + /* Executable expression trees */ + ExprState *lexpr; /* left-hand (outer) input expression */ + ExprState *rexpr; /* right-hand (inner) input expression */ + + /* + * If we have a current left or right input tuple, the values of the + * expressions are loaded into these fields: + */ + Datum ldatum; /* current left-hand value */ + Datum rdatum; /* current right-hand value */ + bool lisnull; /* and their isnull flags */ + bool risnull; + + /* + * Everything we need to know to compare the left and right values is + * stored here. + */ + SortSupportData ssup; +} MergeJoinClauseData; + +/* Result type for MJEvalOuterValues and MJEvalInnerValues */ +typedef enum +{ + MJEVAL_MATCHABLE, /* normal, potentially matchable tuple */ + MJEVAL_NONMATCHABLE, /* tuple cannot join because it has a null */ + MJEVAL_ENDOFJOIN /* end of input (physical or effective) */ +} MJEvalResult; + + +#define MarkInnerTuple(innerTupleSlot, mergestate) \ + ExecCopySlot((mergestate)->mj_MarkedTupleSlot, (innerTupleSlot)) + + +/* + * MJExamineQuals + * + * This deconstructs the list of mergejoinable expressions, which is given + * to us by the planner in the form of a list of "leftexpr = rightexpr" + * expression trees in the order matching the sort columns of the inputs. + * We build an array of MergeJoinClause structs containing the information + * we will need at runtime. Each struct essentially tells us how to compare + * the two expressions from the original clause. + * + * In addition to the expressions themselves, the planner passes the btree + * opfamily OID, collation OID, btree strategy number (BTLessStrategyNumber or + * BTGreaterStrategyNumber), and nulls-first flag that identify the intended + * sort ordering for each merge key. 
The mergejoinable operator is an + * equality operator in the opfamily, and the two inputs are guaranteed to be + * ordered in either increasing or decreasing (respectively) order according + * to the opfamily and collation, with nulls at the indicated end of the range. + * This allows us to obtain the needed comparison function from the opfamily. + */ +static MergeJoinClause +MJExamineQuals(List *mergeclauses, + Oid *mergefamilies, + Oid *mergecollations, + int *mergestrategies, + bool *mergenullsfirst, + PlanState *parent) +{ + MergeJoinClause clauses; + int nClauses = list_length(mergeclauses); + int iClause; + ListCell *cl; + + clauses = (MergeJoinClause) palloc0(nClauses * sizeof(MergeJoinClauseData)); + + iClause = 0; + foreach(cl, mergeclauses) + { + OpExpr *qual = (OpExpr *) lfirst(cl); + MergeJoinClause clause = &clauses[iClause]; + Oid opfamily = mergefamilies[iClause]; + Oid collation = mergecollations[iClause]; + StrategyNumber opstrategy = mergestrategies[iClause]; + bool nulls_first = mergenullsfirst[iClause]; + int op_strategy; + Oid op_lefttype; + Oid op_righttype; + Oid sortfunc; + + if (!IsA(qual, OpExpr)) + elog(ERROR, "mergejoin clause is not an OpExpr"); + + /* + * Prepare the input expressions for execution. + */ + clause->lexpr = ExecInitExpr((Expr *) linitial(qual->args), parent); + clause->rexpr = ExecInitExpr((Expr *) lsecond(qual->args), parent); + + /* Set up sort support data */ + clause->ssup.ssup_cxt = CurrentMemoryContext; + clause->ssup.ssup_collation = collation; + if (opstrategy == BTLessStrategyNumber) + clause->ssup.ssup_reverse = false; + else if (opstrategy == BTGreaterStrategyNumber) + clause->ssup.ssup_reverse = true; + else /* planner screwed up */ + elog(ERROR, "unsupported mergejoin strategy %d", opstrategy); + clause->ssup.ssup_nulls_first = nulls_first; + + /* Extract the operator's declared left/right datatypes */ + get_op_opfamily_properties(qual->opno, opfamily, false, + &op_strategy, + &op_lefttype, + &op_righttype); + if (op_strategy != BTEqualStrategyNumber) /* should not happen */ + elog(ERROR, "cannot merge using non-equality operator %u", + qual->opno); + + /* + * sortsupport routine must know if abbreviation optimization is + * applicable in principle. It is never applicable for merge joins + * because there is no convenient opportunity to convert to + * alternative representation. + */ + clause->ssup.abbreviate = false; + + /* And get the matching support or comparison function */ + Assert(clause->ssup.comparator == NULL); + sortfunc = get_opfamily_proc(opfamily, + op_lefttype, + op_righttype, + BTSORTSUPPORT_PROC); + if (OidIsValid(sortfunc)) + { + /* The sort support function can provide a comparator */ + OidFunctionCall1(sortfunc, PointerGetDatum(&clause->ssup)); + } + if (clause->ssup.comparator == NULL) + { + /* support not available, get comparison func */ + sortfunc = get_opfamily_proc(opfamily, + op_lefttype, + op_righttype, + BTORDER_PROC); + if (!OidIsValid(sortfunc)) /* should not happen */ + elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, op_lefttype, op_righttype, opfamily); + /* We'll use a shim to call the old-style btree comparator */ + PrepareSortSupportComparisonShim(sortfunc, &clause->ssup); + } + + iClause++; + } + + return clauses; +} + +/* + * MJEvalOuterValues + * + * Compute the values of the mergejoined expressions for the current + * outer tuple. 
We also detect whether it's impossible for the current + * outer tuple to match anything --- this is true if it yields a NULL + * input, since we assume mergejoin operators are strict. If the NULL + * is in the first join column, and that column sorts nulls last, then + * we can further conclude that no following tuple can match anything + * either, since they must all have nulls in the first column. However, + * that case is only interesting if we're not in FillOuter mode, else + * we have to visit all the tuples anyway. + * + * For the convenience of callers, we also make this routine responsible + * for testing for end-of-input (null outer tuple), and returning + * MJEVAL_ENDOFJOIN when that's seen. This allows the same code to be used + * for both real end-of-input and the effective end-of-input represented by + * a first-column NULL. + * + * We evaluate the values in OuterEContext, which can be reset each + * time we move to a new tuple. + */ +static MJEvalResult +MJEvalOuterValues(MergeJoinState *mergestate) +{ + ExprContext *econtext = mergestate->mj_OuterEContext; + MJEvalResult result = MJEVAL_MATCHABLE; + int i; + MemoryContext oldContext; + + /* Check for end of outer subplan */ + if (TupIsNull(mergestate->mj_OuterTupleSlot)) + return MJEVAL_ENDOFJOIN; + + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + econtext->ecxt_outertuple = mergestate->mj_OuterTupleSlot; + + for (i = 0; i < mergestate->mj_NumClauses; i++) + { + MergeJoinClause clause = &mergestate->mj_Clauses[i]; + + clause->ldatum = ExecEvalExpr(clause->lexpr, econtext, + &clause->lisnull); + if (clause->lisnull) + { + /* match is impossible; can we end the join early? */ + if (i == 0 && !clause->ssup.ssup_nulls_first && + !mergestate->mj_FillOuter) + result = MJEVAL_ENDOFJOIN; + else if (result == MJEVAL_MATCHABLE) + result = MJEVAL_NONMATCHABLE; + } + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * MJEvalInnerValues + * + * Same as above, but for the inner tuple. Here, we have to be prepared + * to load data from either the true current inner, or the marked inner, + * so caller must tell us which slot to load from. + */ +static MJEvalResult +MJEvalInnerValues(MergeJoinState *mergestate, TupleTableSlot *innerslot) +{ + ExprContext *econtext = mergestate->mj_InnerEContext; + MJEvalResult result = MJEVAL_MATCHABLE; + int i; + MemoryContext oldContext; + + /* Check for end of inner subplan */ + if (TupIsNull(innerslot)) + return MJEVAL_ENDOFJOIN; + + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + econtext->ecxt_innertuple = innerslot; + + for (i = 0; i < mergestate->mj_NumClauses; i++) + { + MergeJoinClause clause = &mergestate->mj_Clauses[i]; + + clause->rdatum = ExecEvalExpr(clause->rexpr, econtext, + &clause->risnull); + if (clause->risnull) + { + /* match is impossible; can we end the join early? */ + if (i == 0 && !clause->ssup.ssup_nulls_first && + !mergestate->mj_FillInner) + result = MJEVAL_ENDOFJOIN; + else if (result == MJEVAL_MATCHABLE) + result = MJEVAL_NONMATCHABLE; + } + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * MJCompare + * + * Compare the mergejoinable values of the current two input tuples + * and return 0 if they are equal (ie, the mergejoin equalities all + * succeed), >0 if outer > inner, <0 if outer < inner. 
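+ * For instance, with an ascending first sort key, outer = 5 versus
+ * inner = 3 compares greater than zero, which the state machine
+ * takes as its cue to advance the inner side (a result less than
+ * zero advances the outer side instead).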
+ * + * MJEvalOuterValues and MJEvalInnerValues must already have been called + * for the current outer and inner tuples, respectively. + */ +static int +MJCompare(MergeJoinState *mergestate) +{ + int result = 0; + bool nulleqnull = false; + ExprContext *econtext = mergestate->js.ps.ps_ExprContext; + int i; + MemoryContext oldContext; + + /* + * Call the comparison functions in short-lived context, in case they leak + * memory. + */ + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + for (i = 0; i < mergestate->mj_NumClauses; i++) + { + MergeJoinClause clause = &mergestate->mj_Clauses[i]; + + /* + * Special case for NULL-vs-NULL, else use standard comparison. + */ + if (clause->lisnull && clause->risnull) + { + nulleqnull = true; /* NULL "=" NULL */ + continue; + } + + result = ApplySortComparator(clause->ldatum, clause->lisnull, + clause->rdatum, clause->risnull, + &clause->ssup); + + if (result != 0) + break; + } + + /* + * If we had any NULL-vs-NULL inputs, we do not want to report that the + * tuples are equal. Instead, if result is still 0, change it to +1. This + * will result in advancing the inner side of the join. + * + * Likewise, if there was a constant-false joinqual, do not report + * equality. We have to check this as part of the mergequals, else the + * rescan logic will do the wrong thing. + */ + if (result == 0 && + (nulleqnull || mergestate->mj_ConstFalseJoin)) + result = 1; + + MemoryContextSwitchTo(oldContext); + + return result; +} + + +/* + * Generate a fake join tuple with nulls for the inner tuple, + * and return it if it passes the non-join quals. + */ +static TupleTableSlot * +MJFillOuter(MergeJoinState *node) +{ + ExprContext *econtext = node->js.ps.ps_ExprContext; + ExprState *otherqual = node->js.ps.qual; + + ResetExprContext(econtext); + + econtext->ecxt_outertuple = node->mj_OuterTupleSlot; + econtext->ecxt_innertuple = node->mj_NullInnerTupleSlot; + + if (ExecQual(otherqual, econtext)) + { + /* + * qualification succeeded. now form the desired projection tuple and + * return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning outer fill tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + + return NULL; +} + +/* + * Generate a fake join tuple with nulls for the outer tuple, + * and return it if it passes the non-join quals. + */ +static TupleTableSlot * +MJFillInner(MergeJoinState *node) +{ + ExprContext *econtext = node->js.ps.ps_ExprContext; + ExprState *otherqual = node->js.ps.qual; + + ResetExprContext(econtext); + + econtext->ecxt_outertuple = node->mj_NullOuterTupleSlot; + econtext->ecxt_innertuple = node->mj_InnerTupleSlot; + + if (ExecQual(otherqual, econtext)) + { + /* + * qualification succeeded. now form the desired projection tuple and + * return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning inner fill tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + + return NULL; +} + + +/* + * Check that a qual condition is constant true or constant false. + * If it is constant false (or null), set *is_const_false to true. + * + * Constant true would normally be represented by a NIL list, but we allow an + * actual bool Const as well. We do expect that the planner will have thrown + * away any non-constant terms that have been ANDed with a constant false. 
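+ * For example, a list holding a single bool Const with value false
+ * (or a null Const) sets *is_const_false; a list containing any
+ * non-Const node makes us return false, meaning the qual is not
+ * constant after all.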
+ */ +static bool +check_constant_qual(List *qual, bool *is_const_false) +{ + ListCell *lc; + + foreach(lc, qual) + { + Const *con = (Const *) lfirst(lc); + + if (!con || !IsA(con, Const)) + return false; + if (con->constisnull || !DatumGetBool(con->constvalue)) + *is_const_false = true; + } + return true; +} + + +/* ---------------------------------------------------------------- + * ExecMergeTupleDump + * + * This function is called through the MJ_dump() macro + * when EXEC_MERGEJOINDEBUG is defined + * ---------------------------------------------------------------- + */ +#ifdef EXEC_MERGEJOINDEBUG + +static void +ExecMergeTupleDumpOuter(MergeJoinState *mergestate) +{ + TupleTableSlot *outerSlot = mergestate->mj_OuterTupleSlot; + + printf("==== outer tuple ====\n"); + if (TupIsNull(outerSlot)) + printf("(nil)\n"); + else + MJ_debugtup(outerSlot); +} + +static void +ExecMergeTupleDumpInner(MergeJoinState *mergestate) +{ + TupleTableSlot *innerSlot = mergestate->mj_InnerTupleSlot; + + printf("==== inner tuple ====\n"); + if (TupIsNull(innerSlot)) + printf("(nil)\n"); + else + MJ_debugtup(innerSlot); +} + +static void +ExecMergeTupleDumpMarked(MergeJoinState *mergestate) +{ + TupleTableSlot *markedSlot = mergestate->mj_MarkedTupleSlot; + + printf("==== marked tuple ====\n"); + if (TupIsNull(markedSlot)) + printf("(nil)\n"); + else + MJ_debugtup(markedSlot); +} + +static void +ExecMergeTupleDump(MergeJoinState *mergestate) +{ + printf("******** ExecMergeTupleDump ********\n"); + + ExecMergeTupleDumpOuter(mergestate); + ExecMergeTupleDumpInner(mergestate); + ExecMergeTupleDumpMarked(mergestate); + + printf("********\n"); +} +#endif + +/* ---------------------------------------------------------------- + * ExecMergeJoin + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecMergeJoin(PlanState *pstate) +{ + MergeJoinState *node = castNode(MergeJoinState, pstate); + ExprState *joinqual; + ExprState *otherqual; + bool qualResult; + int compareResult; + PlanState *innerPlan; + TupleTableSlot *innerTupleSlot; + PlanState *outerPlan; + TupleTableSlot *outerTupleSlot; + ExprContext *econtext; + bool doFillOuter; + bool doFillInner; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from node + */ + innerPlan = innerPlanState(node); + outerPlan = outerPlanState(node); + econtext = node->js.ps.ps_ExprContext; + joinqual = node->js.joinqual; + otherqual = node->js.ps.qual; + doFillOuter = node->mj_FillOuter; + doFillInner = node->mj_FillInner; + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * ok, everything is setup.. let's go to work + */ + for (;;) + { + MJ_dump(node); + + /* + * get the current state of the join and do things accordingly. + */ + switch (node->mj_JoinState) + { + /* + * EXEC_MJ_INITIALIZE_OUTER means that this is the first time + * ExecMergeJoin() has been called and so we have to fetch the + * first matchable tuple for both outer and inner subplans. We + * do the outer side in INITIALIZE_OUTER state, then advance + * to INITIALIZE_INNER state for the inner subplan. 
+ */ + case EXEC_MJ_INITIALIZE_OUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_INITIALIZE_OUTER\n"); + + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* OK to go get the first inner tuple */ + node->mj_JoinState = EXEC_MJ_INITIALIZE_INNER; + break; + case MJEVAL_NONMATCHABLE: + /* Stay in same state to fetch next outer tuple */ + if (doFillOuter) + { + /* + * Generate a fake join tuple with nulls for the + * inner tuple, and return it if it passes the + * non-join quals. + */ + TupleTableSlot *result; + + result = MJFillOuter(node); + if (result) + return result; + } + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: nothing in outer subplan\n"); + if (doFillInner) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. We set MatchedInner = true to + * force the ENDOUTER state to advance inner. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + node->mj_MatchedInner = true; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + case EXEC_MJ_INITIALIZE_INNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_INITIALIZE_INNER\n"); + + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * OK, we have the initial tuples. Begin by skipping + * non-matching tuples. + */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + /* Stay in same state to fetch next inner tuple */ + if (doFillInner) + { + /* + * Generate a fake join tuple with nulls for the + * outer tuple, and return it if it passes the + * non-join quals. + */ + TupleTableSlot *result; + + result = MJFillInner(node); + if (result) + return result; + } + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: nothing in inner subplan\n"); + if (doFillOuter) + { + /* + * Need to emit left-join tuples for all outer + * tuples, including the one we just fetched. We + * set MatchedOuter = false to force the ENDINNER + * state to emit first tuple before advancing + * outer. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + node->mj_MatchedOuter = false; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_JOINTUPLES means we have two tuples which satisfied + * the merge clause so we join them and then proceed to get + * the next inner tuple (EXEC_MJ_NEXTINNER). + */ + case EXEC_MJ_JOINTUPLES: + MJ_printf("ExecMergeJoin: EXEC_MJ_JOINTUPLES\n"); + + /* + * Set the next state machine state. The right things will + * happen whether we return this join tuple or just fall + * through to continue the state machine execution. + */ + node->mj_JoinState = EXEC_MJ_NEXTINNER; + + /* + * Check the extra qual conditions to see if we actually want + * to return this join tuple. If not, can proceed with merge. + * We must distinguish the additional joinquals (which must + * pass to consider the tuples "matched" for outer-join logic) + * from the otherquals (which must pass before we actually + * return the tuple). + * + * We don't bother with a ResetExprContext here, on the + * assumption that we just did one while checking the merge + * qual. 
One per tuple should be sufficient. We do have to + * set up the econtext links to the tuples for ExecQual to + * use. + */ + outerTupleSlot = node->mj_OuterTupleSlot; + econtext->ecxt_outertuple = outerTupleSlot; + innerTupleSlot = node->mj_InnerTupleSlot; + econtext->ecxt_innertuple = innerTupleSlot; + + qualResult = (joinqual == NULL || + ExecQual(joinqual, econtext)); + MJ_DEBUG_QUAL(joinqual, qualResult); + + if (qualResult) + { + node->mj_MatchedOuter = true; + node->mj_MatchedInner = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + + qualResult = (otherqual == NULL || + ExecQual(otherqual, econtext)); + MJ_DEBUG_QUAL(otherqual, qualResult); + + if (qualResult) + { + /* + * qualification succeeded. now form the desired + * projection tuple and return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + break; + + /* + * EXEC_MJ_NEXTINNER means advance the inner scan to the next + * tuple. If the tuple is not nil, we then proceed to test it + * against the join qualification. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. + */ + case EXEC_MJ_NEXTINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTINNER\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* + * now we get the next inner tuple, if any. If there's none, + * advance to next outer tuple (which may be able to join to + * previously marked tuples). + * + * NB: must NOT do "extraMarks" here, since we may need to + * return to previously marked tuples. + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * Test the new inner tuple to see if it matches + * outer. + * + * If they do match, then we join them and move on to + * the next inner tuple (EXEC_MJ_JOINTUPLES). + * + * If they do not match then advance to next outer + * tuple. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + else if (compareResult < 0) + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + else /* compareResult > 0 should not happen */ + elog(ERROR, "mergejoin input data is out of order"); + break; + case MJEVAL_NONMATCHABLE: + + /* + * It contains a NULL and hence can't match any outer + * tuple, so we can skip the comparison and assume the + * new tuple is greater than current outer. + */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + + /* + * No more inner tuples. 
However, this might be only + * effective and not physical end of inner plan, so + * force mj_InnerTupleSlot to null to make sure we + * don't fetch more inner tuples. (We need this hack + * because we are not transiting to a state where the + * inner plan is assumed to be exhausted.) + */ + node->mj_InnerTupleSlot = NULL; + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + break; + + /*------------------------------------------- + * EXEC_MJ_NEXTOUTER means + * + * outer inner + * outer tuple - 5 5 - marked tuple + * 5 5 + * 6 6 - inner tuple + * 7 7 + * + * we know we just bumped into the + * first inner tuple > current outer tuple (or possibly + * the end of the inner stream) + * so get a new outer tuple and then + * proceed to test it against the marked tuple + * (EXEC_MJ_TESTOUTER) + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + *------------------------------------------------ + */ + case EXEC_MJ_NEXTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTOUTER\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the marked tuple */ + node->mj_JoinState = EXEC_MJ_TESTOUTER; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /*-------------------------------------------------------- + * EXEC_MJ_TESTOUTER If the new outer tuple and the marked + * tuple satisfy the merge clause then we know we have + * duplicates in the outer scan so we have to restore the + * inner scan to the marked tuple and proceed to join the + * new outer tuple with the inner tuples. + * + * This is the case when + * outer inner + * 4 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 5 5 + * 6 8 - inner tuple + * 7 12 + * + * new outer tuple == marked tuple + * + * If the outer tuple fails the test, then we are done + * with the marked tuples, and we have to look for a + * match to the current inner tuple. So we will + * proceed to skip outer tuples until outer >= inner + * (EXEC_MJ_SKIP_TEST). + * + * This is the case when + * + * outer inner + * 5 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 6 8 - inner tuple + * 7 12 + * + * new outer tuple > marked tuple + * + *--------------------------------------------------------- + */ + case EXEC_MJ_TESTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_TESTOUTER\n"); + + /* + * Here we must compare the outer tuple with the marked inner + * tuple. 
(We can ignore the result of MJEvalInnerValues, + * since the marked inner tuple is certainly matchable.) + */ + innerTupleSlot = node->mj_MarkedTupleSlot; + (void) MJEvalInnerValues(node, innerTupleSlot); + + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + /* + * the merge clause matched so now we restore the inner + * scan position to the first mark, and go join that tuple + * (and any following ones) to the new outer. + * + * If we were able to determine mark and restore are not + * needed, then we don't have to back up; the current + * inner is already the first possible match. + * + * NOTE: we do not need to worry about the MatchedInner + * state for the rescanned inner tuples. We know all of + * them will match this new outer tuple and therefore + * won't be emitted as fill tuples. This works *only* + * because we require the extra joinquals to be constant + * when doing a right or full join --- otherwise some of + * the rescanned tuples might fail the extra joinquals. + * This obviously won't happen for a constant-true extra + * joinqual, while the constant-false case is handled by + * forcing the merge clause to never match, so we never + * get here. + */ + if (!node->mj_SkipMarkRestore) + { + ExecRestrPos(innerPlan); + + /* + * ExecRestrPos probably should give us back a new + * Slot, but since it doesn't, use the marked slot. + * (The previously returned mj_InnerTupleSlot cannot + * be assumed to hold the required tuple.) + */ + node->mj_InnerTupleSlot = innerTupleSlot; + /* we need not do MJEvalInnerValues again */ + } + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else if (compareResult > 0) + { + /* ---------------- + * if the new outer tuple didn't match the marked inner + * tuple then we have a case like: + * + * outer inner + * 4 4 - marked tuple + * new outer - 5 4 + * 6 5 - inner tuple + * 7 + * + * which means that all subsequent outer tuples will be + * larger than our marked inner tuples. So we need not + * revisit any of the marked tuples but can proceed to + * look for a match to the current inner. If there's + * no more inners, no more matches are possible. + * ---------------- + */ + innerTupleSlot = node->mj_InnerTupleSlot; + + /* reload comparison data for current inner */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the + * outer. + */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + if (doFillOuter) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + } + else /* compareResult < 0 should not happen */ + elog(ERROR, "mergejoin input data is out of order"); + break; + + /*---------------------------------------------------------- + * EXEC_MJ_SKIP means compare tuples and if they do not + * match, skip whichever is lesser. + * + * For example: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 6 8 - inner tuple + * 7 12 + * 8 14 + * + * we have to advance the outer scan + * until we find the outer 8. 
+ * + * On the other hand: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 12 8 - inner tuple + * 14 10 + * 17 12 + * + * we have to advance the inner scan + * until we find the inner 12. + *---------------------------------------------------------- + */ + case EXEC_MJ_SKIP_TEST: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIP_TEST\n"); + + /* + * before we advance, make sure the current tuples do not + * satisfy the mergeclauses. If they do, then we update the + * marked tuple position and go join them. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + if (!node->mj_SkipMarkRestore) + ExecMarkPos(innerPlan); + + MarkInnerTuple(node->mj_InnerTupleSlot, node); + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else if (compareResult < 0) + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + else + /* compareResult > 0 */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + + /* + * SKIPOUTER_ADVANCE: advance over an outer tuple that is + * known not to join to any inner tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + */ + case EXEC_MJ_SKIPOUTER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPOUTER_ADVANCE\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the current inner */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * SKIPINNER_ADVANCE: advance over an inner tuple that is + * known not to join to any outer tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. + */ + case EXEC_MJ_SKIPINNER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPINNER_ADVANCE\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. 
+ */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the outer. + */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + outerTupleSlot = node->mj_OuterTupleSlot; + if (doFillOuter && !TupIsNull(outerTupleSlot)) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_ENDOUTER means we have run out of outer tuples, but + * are doing a right/full join and therefore must null-fill + * any remaining unmatched inner tuples. + */ + case EXEC_MJ_ENDOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDOUTER\n"); + + Assert(doFillInner); + + if (!node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + if (TupIsNull(innerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + return NULL; + } + + /* Else remain in ENDOUTER state and process next tuple. */ + break; + + /* + * EXEC_MJ_ENDINNER means we have run out of inner tuples, but + * are doing a left/full join and therefore must null- fill + * any remaining unmatched outer tuples. + */ + case EXEC_MJ_ENDINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDINNER\n"); + + Assert(doFillOuter); + + if (!node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + if (TupIsNull(outerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + return NULL; + } + + /* Else remain in ENDINNER state and process next tuple. */ + break; + + /* + * broken state value? 
+ */ + default: + elog(ERROR, "unrecognized mergejoin state: %d", + (int) node->mj_JoinState); + } + } +} + +/* ---------------------------------------------------------------- + * ExecInitMergeJoin + * ---------------------------------------------------------------- + */ +MergeJoinState * +ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) +{ + MergeJoinState *mergestate; + TupleDesc outerDesc, + innerDesc; + const TupleTableSlotOps *innerOps; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + MJ1_printf("ExecInitMergeJoin: %s\n", + "initializing node"); + + /* + * create state structure + */ + mergestate = makeNode(MergeJoinState); + mergestate->js.ps.plan = (Plan *) node; + mergestate->js.ps.state = estate; + mergestate->js.ps.ExecProcNode = ExecMergeJoin; + mergestate->js.jointype = node->join.jointype; + mergestate->mj_ConstFalseJoin = false; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &mergestate->js.ps); + + /* + * we need two additional econtexts in which we can compute the join + * expressions from the left and right input tuples. The node's regular + * econtext won't do because it gets reset too often. + */ + mergestate->mj_OuterEContext = CreateExprContext(estate); + mergestate->mj_InnerEContext = CreateExprContext(estate); + + /* + * initialize child nodes + * + * inner child must support MARK/RESTORE, unless we have detected that we + * don't need that. Note that skip_mark_restore must never be set if + * there are non-mergeclause joinquals, since the logic wouldn't work. + */ + Assert(node->join.joinqual == NIL || !node->skip_mark_restore); + mergestate->mj_SkipMarkRestore = node->skip_mark_restore; + + outerPlanState(mergestate) = ExecInitNode(outerPlan(node), estate, eflags); + outerDesc = ExecGetResultType(outerPlanState(mergestate)); + innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate, + mergestate->mj_SkipMarkRestore ? + eflags : + (eflags | EXEC_FLAG_MARK)); + innerDesc = ExecGetResultType(innerPlanState(mergestate)); + + /* + * For certain types of inner child nodes, it is advantageous to issue + * MARK every time we advance past an inner tuple we will never return to. + * For other types, MARK on a tuple we cannot return to is a waste of + * cycles. Detect which case applies and set mj_ExtraMarks if we want to + * issue "unnecessary" MARK calls. + * + * Currently, only Material wants the extra MARKs, and it will be helpful + * only if eflags doesn't specify REWIND. + * + * Note that for IndexScan and IndexOnlyScan, it is *necessary* that we + * not set mj_ExtraMarks; otherwise we might attempt to set a mark before + * the first inner tuple, which they do not support. + */ + if (IsA(innerPlan(node), Material) && + (eflags & EXEC_FLAG_REWIND) == 0 && + !mergestate->mj_SkipMarkRestore) + mergestate->mj_ExtraMarks = true; + else + mergestate->mj_ExtraMarks = false; + + /* + * Initialize result slot, type and projection. 
+ */ + ExecInitResultTupleSlotTL(&mergestate->js.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&mergestate->js.ps, NULL); + + /* + * tuple table initialization + */ + innerOps = ExecGetResultSlotOps(innerPlanState(mergestate), NULL); + mergestate->mj_MarkedTupleSlot = ExecInitExtraTupleSlot(estate, innerDesc, + innerOps); + + /* + * initialize child expressions + */ + mergestate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) mergestate); + mergestate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) mergestate); + /* mergeclauses are handled below */ + + /* + * detect whether we need only consider the first matching inner tuple + */ + mergestate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = false; + break; + case JOIN_LEFT: + case JOIN_ANTI: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = false; + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + break; + case JOIN_RIGHT: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. + */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("RIGHT JOIN is only supported with merge-joinable join conditions"))); + break; + case JOIN_FULL: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. + */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("FULL JOIN is only supported with merge-joinable join conditions"))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * preprocess the merge clauses + */ + mergestate->mj_NumClauses = list_length(node->mergeclauses); + mergestate->mj_Clauses = MJExamineQuals(node->mergeclauses, + node->mergeFamilies, + node->mergeCollations, + node->mergeStrategies, + node->mergeNullsFirst, + (PlanState *) mergestate); + + /* + * initialize join state + */ + mergestate->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; + mergestate->mj_MatchedOuter = false; + mergestate->mj_MatchedInner = false; + mergestate->mj_OuterTupleSlot = NULL; + mergestate->mj_InnerTupleSlot = NULL; + + /* + * initialization successful + */ + MJ1_printf("ExecInitMergeJoin: %s\n", + "node initialized"); + + return mergestate; +} + +/* ---------------------------------------------------------------- + * ExecEndMergeJoin + * + * old comments + * frees storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndMergeJoin(MergeJoinState *node) +{ + MJ1_printf("ExecEndMergeJoin: %s\n", + "ending node processing"); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->js.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->js.ps.ps_ResultTupleSlot); + ExecClearTuple(node->mj_MarkedTupleSlot); + + /* + * shut down the subplans + */ + ExecEndNode(innerPlanState(node)); + ExecEndNode(outerPlanState(node)); + + MJ1_printf("ExecEndMergeJoin: %s\n", + "node processing ended"); +} + +void +ExecReScanMergeJoin(MergeJoinState *node) +{ + ExecClearTuple(node->mj_MarkedTupleSlot); + + node->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; + node->mj_MatchedOuter = false; + node->mj_MatchedInner = false; + node->mj_OuterTupleSlot = NULL; + node->mj_InnerTupleSlot = NULL; + + /* + * if chgParam of subnodes is not null then plans will be re-scanned by + * first ExecProcNode. + */ + if (node->js.ps.lefttree->chgParam == NULL) + ExecReScan(node->js.ps.lefttree); + if (node->js.ps.righttree->chgParam == NULL) + ExecReScan(node->js.ps.righttree); + +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c new file mode 100644 index 0000000..1e79d18 --- /dev/null +++ b/src/backend/executor/nodeModifyTable.c @@ -0,0 +1,3243 @@ +/*------------------------------------------------------------------------- + * + * nodeModifyTable.c + * routines to handle ModifyTable nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeModifyTable.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitModifyTable - initialize the ModifyTable node + * ExecModifyTable - retrieve the next tuple from the node + * ExecEndModifyTable - shut down the ModifyTable node + * ExecReScanModifyTable - rescan the ModifyTable node + * + * NOTES + * The ModifyTable node receives input from its outerPlan, which is + * the data to insert for INSERT cases, or the changed columns' new + * values plus row-locating info for UPDATE cases, or just the + * row-locating info for DELETE cases. + * + * If the query specifies RETURNING, then the ModifyTable returns a + * RETURNING tuple after completing each row insert, update, or delete. + * It must be called again to continue the operation. Without RETURNING, + * we just loop within the node until all the work is done, then + * return NULL. This avoids useless call/return overhead. 
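+ *
+ * (Illustration, with hypothetical names: "UPDATE t SET v = v + 1
+ * RETURNING *" causes this node to return one projected tuple per
+ * modified row, one per call; the same UPDATE without RETURNING is
+ * driven to completion within a single call and returns NULL.)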
+ */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "commands/trigger.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "executor/nodeModifyTable.h" +#include "foreign/fdwapi.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "rewrite/rewriteHandler.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +typedef struct MTTargetRelLookup +{ + Oid relationOid; /* hash key, must be first */ + int relationIndex; /* rel's index in resultRelInfo[] array */ +} MTTargetRelLookup; + +static void ExecBatchInsert(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + TupleTableSlot **slots, + TupleTableSlot **planSlots, + int numSlots, + EState *estate, + bool canSetTag); +static bool ExecOnConflictUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer conflictTid, + TupleTableSlot *planSlot, + TupleTableSlot *excludedSlot, + EState *estate, + bool canSetTag, + TupleTableSlot **returning); +static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot, + ResultRelInfo **partRelInfo); + +/* + * Verify that the tuples to be produced by INSERT match the + * target relation's rowtype + * + * We do this to guard against stale plans. If plan invalidation is + * functioning properly then we should never get a failure here, but better + * safe than sorry. Note that this is called after we have obtained lock + * on the target rel, so the rowtype can't change underneath us. + * + * The plan output is represented by its targetlist, because that makes + * handling the dropped-column case easier. + * + * We used to use this for UPDATE as well, but now the equivalent checks + * are done in ExecBuildUpdateProjection. + */ +static void +ExecCheckPlanOutput(Relation resultRel, List *targetList) +{ + TupleDesc resultDesc = RelationGetDescr(resultRel); + int attno = 0; + ListCell *lc; + + foreach(lc, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + Form_pg_attribute attr; + + Assert(!tle->resjunk); /* caller removed junk items already */ + + if (attno >= resultDesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query has too many columns."))); + attr = TupleDescAttr(resultDesc, attno); + attno++; + + if (!attr->attisdropped) + { + /* Normal case: demand type match */ + if (exprType((Node *) tle->expr) != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Table has type %s at ordinal position %d, but query expects %s.", + format_type_be(attr->atttypid), + attno, + format_type_be(exprType((Node *) tle->expr))))); + } + else + { + /* + * For a dropped column, we can't check atttypid (it's likely 0). + * In any case the planner has most likely inserted an INT4 null. + * What we insist on is just *some* NULL constant. 
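+ * (Such entries arise when the target table has dropped columns,
+ * e.g. after an illustrative "ALTER TABLE t DROP COLUMN c"; the
+ * plan's targetlist still carries a placeholder at that position,
+ * and all we require is that it be some NULL constant.)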
+ */ + if (!IsA(tle->expr, Const) || + !((Const *) tle->expr)->constisnull) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query provides a value for a dropped column at ordinal position %d.", + attno))); + } + } + if (attno != resultDesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query has too few columns."))); +} + +/* + * ExecProcessReturning --- evaluate a RETURNING list + * + * resultRelInfo: current result rel + * tupleSlot: slot holding tuple actually inserted/updated/deleted + * planSlot: slot holding tuple returned by top subplan node + * + * Note: If tupleSlot is NULL, the FDW should have already provided econtext's + * scan tuple. + * + * Returns a slot holding the result tuple + */ +static TupleTableSlot * +ExecProcessReturning(ResultRelInfo *resultRelInfo, + TupleTableSlot *tupleSlot, + TupleTableSlot *planSlot) +{ + ProjectionInfo *projectReturning = resultRelInfo->ri_projectReturning; + ExprContext *econtext = projectReturning->pi_exprContext; + + /* Make tuple and any needed join variables available to ExecProject */ + if (tupleSlot) + econtext->ecxt_scantuple = tupleSlot; + econtext->ecxt_outertuple = planSlot; + + /* + * RETURNING expressions might reference the tableoid column, so + * reinitialize tts_tableOid before evaluating them. + */ + econtext->ecxt_scantuple->tts_tableOid = + RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* Compute the RETURNING expressions */ + return ExecProject(projectReturning); +} + +/* + * ExecCheckTupleVisible -- verify tuple is visible + * + * It would not be consistent with guarantees of the higher isolation levels to + * proceed with avoiding insertion (taking speculative insertion's alternative + * path) on the basis of another tuple that is not visible to MVCC snapshot. + * Check for the need to raise a serialization failure, and do so as necessary. + */ +static void +ExecCheckTupleVisible(EState *estate, + Relation rel, + TupleTableSlot *slot) +{ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) + { + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + /* + * We should not raise a serialization failure if the conflict is + * against a tuple inserted by our own transaction, even if it's not + * visible to our snapshot. (This would happen, for example, if + * conflicting keys are proposed for insertion in a single command.) 
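+ *
+ * Given a unique constraint on "k" (names illustrative), a single
+ * command such as
+ *     INSERT INTO t (k) VALUES (1), (1) ON CONFLICT (k) DO NOTHING;
+ * is one way to hit that case: the second row's conflict is against
+ * a tuple our own transaction just inserted, and must not be turned
+ * into a serialization failure.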
+ */ + if (!TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } +} + +/* + * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() + */ +static void +ExecCheckTIDVisible(EState *estate, + ResultRelInfo *relinfo, + ItemPointer tid, + TupleTableSlot *tempSlot) +{ + Relation rel = relinfo->ri_RelationDesc; + + /* Redundantly check isolation level */ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot)) + elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); + ExecCheckTupleVisible(estate, rel, tempSlot); + ExecClearTuple(tempSlot); +} + +/* + * Compute stored generated columns for a tuple + */ +void +ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo, + EState *estate, TupleTableSlot *slot, + CmdType cmdtype) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + TupleDesc tupdesc = RelationGetDescr(rel); + int natts = tupdesc->natts; + MemoryContext oldContext; + Datum *values; + bool *nulls; + + Assert(tupdesc->constr && tupdesc->constr->has_generated_stored); + + /* + * If first time through for this result relation, build expression + * nodetrees for rel's stored generation expressions. Keep them in the + * per-query memory context so they'll survive throughout the query. + */ + if (resultRelInfo->ri_GeneratedExprs == NULL) + { + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + + resultRelInfo->ri_GeneratedExprs = + (ExprState **) palloc(natts * sizeof(ExprState *)); + resultRelInfo->ri_NumGeneratedNeeded = 0; + + for (int i = 0; i < natts; i++) + { + if (TupleDescAttr(tupdesc, i)->attgenerated == ATTRIBUTE_GENERATED_STORED) + { + Expr *expr; + + /* + * If it's an update and the current column was not marked as + * being updated, then we can skip the computation. But if + * there is a BEFORE ROW UPDATE trigger, we cannot skip + * because the trigger might affect additional columns. + */ + if (cmdtype == CMD_UPDATE && + !(rel->trigdesc && rel->trigdesc->trig_update_before_row) && + !bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, + ExecGetExtraUpdatedCols(resultRelInfo, estate))) + { + resultRelInfo->ri_GeneratedExprs[i] = NULL; + continue; + } + + expr = (Expr *) build_column_default(rel, i + 1); + if (expr == NULL) + elog(ERROR, "no generation expression found for column number %d of table \"%s\"", + i + 1, RelationGetRelationName(rel)); + + resultRelInfo->ri_GeneratedExprs[i] = ExecPrepareExpr(expr, estate); + resultRelInfo->ri_NumGeneratedNeeded++; + } + } + + MemoryContextSwitchTo(oldContext); + } + + /* + * If no generated columns have been affected by this change, then skip + * the rest. 
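+ *
+ * (For context, a stored generated column is declared along the lines
+ * of "b int GENERATED ALWAYS AS (a * 2) STORED"; names are
+ * illustrative. Per the loop above, on UPDATE such a column needs
+ * recomputation only if a column it depends on was updated or a
+ * BEFORE ROW UPDATE trigger could have changed the row.)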
+ */ + if (resultRelInfo->ri_NumGeneratedNeeded == 0) + return; + + oldContext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + values = palloc(sizeof(*values) * natts); + nulls = palloc(sizeof(*nulls) * natts); + + slot_getallattrs(slot); + memcpy(nulls, slot->tts_isnull, sizeof(*nulls) * natts); + + for (int i = 0; i < natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + + if (attr->attgenerated == ATTRIBUTE_GENERATED_STORED && + resultRelInfo->ri_GeneratedExprs[i]) + { + ExprContext *econtext; + Datum val; + bool isnull; + + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + + val = ExecEvalExpr(resultRelInfo->ri_GeneratedExprs[i], econtext, &isnull); + + /* + * We must make a copy of val as we have no guarantees about where + * memory for a pass-by-reference Datum is located. + */ + if (!isnull) + val = datumCopy(val, attr->attbyval, attr->attlen); + + values[i] = val; + nulls[i] = isnull; + } + else + { + if (!nulls[i]) + values[i] = datumCopy(slot->tts_values[i], attr->attbyval, attr->attlen); + } + } + + ExecClearTuple(slot); + memcpy(slot->tts_values, values, sizeof(*values) * natts); + memcpy(slot->tts_isnull, nulls, sizeof(*nulls) * natts); + ExecStoreVirtualTuple(slot); + ExecMaterializeSlot(slot); + + MemoryContextSwitchTo(oldContext); +} + +/* + * ExecInitInsertProjection + * Do one-time initialization of projection data for INSERT tuples. + * + * INSERT queries may need a projection to filter out junk attrs in the tlist. + * + * This is also a convenient place to verify that the + * output of an INSERT matches the target table. + */ +static void +ExecInitInsertProjection(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Plan *subplan = outerPlan(node); + EState *estate = mtstate->ps.state; + List *insertTargetList = NIL; + bool need_projection = false; + ListCell *l; + + /* Extract non-junk columns of the subplan's result tlist. */ + foreach(l, subplan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (!tle->resjunk) + insertTargetList = lappend(insertTargetList, tle); + else + need_projection = true; + } + + /* + * The junk-free list must produce a tuple suitable for the result + * relation. + */ + ExecCheckPlanOutput(resultRelInfo->ri_RelationDesc, insertTargetList); + + /* We'll need a slot matching the table's format. */ + resultRelInfo->ri_newTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + + /* Build ProjectionInfo if needed (it probably isn't). */ + if (need_projection) + { + TupleDesc relDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + + /* need an expression context to do the projection */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + + resultRelInfo->ri_projectNew = + ExecBuildProjectionInfo(insertTargetList, + mtstate->ps.ps_ExprContext, + resultRelInfo->ri_newTupleSlot, + &mtstate->ps, + relDesc); + } + + resultRelInfo->ri_projectNewInfoValid = true; +} + +/* + * ExecInitUpdateProjection + * Do one-time initialization of projection data for UPDATE tuples. + * + * UPDATE always needs a projection, because (1) there's always some junk + * attrs, and (2) we may need to merge values of not-updated columns from + * the old tuple into the final tuple. In UPDATE, the tuple arriving from + * the subplan contains only new values for the changed columns, plus row + * identity info in the junk attrs. 
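+ *
+ * (Illustration, names hypothetical: for "UPDATE t SET b = b + 1", the
+ * subplan emits only the new value of "b" plus row-identity junk such
+ * as "ctid"; the projection built here merges that with the unchanged
+ * columns of the old tuple to form the complete new tuple.)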
+ * + * This is "one-time" for any given result rel, but we might touch more than + * one result rel in the course of an inherited UPDATE, and each one needs + * its own projection due to possible column order variation. + * + * This is also a convenient place to verify that the output of an UPDATE + * matches the target table (ExecBuildUpdateProjection does that). + */ +static void +ExecInitUpdateProjection(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Plan *subplan = outerPlan(node); + EState *estate = mtstate->ps.state; + TupleDesc relDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + int whichrel; + List *updateColnos; + + /* + * Usually, mt_lastResultIndex matches the target rel. If it happens not + * to, we can get the index the hard way with an integer division. + */ + whichrel = mtstate->mt_lastResultIndex; + if (resultRelInfo != mtstate->resultRelInfo + whichrel) + { + whichrel = resultRelInfo - mtstate->resultRelInfo; + Assert(whichrel >= 0 && whichrel < mtstate->mt_nrels); + } + + updateColnos = (List *) list_nth(node->updateColnosLists, whichrel); + + /* + * For UPDATE, we use the old tuple to fill up missing values in the tuple + * produced by the subplan to get the new tuple. We need two slots, both + * matching the table's desired format. + */ + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + resultRelInfo->ri_newTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + + /* need an expression context to do the projection */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + + resultRelInfo->ri_projectNew = + ExecBuildUpdateProjection(subplan->targetlist, + false, /* subplan did the evaluation */ + updateColnos, + relDesc, + mtstate->ps.ps_ExprContext, + resultRelInfo->ri_newTupleSlot, + &mtstate->ps); + + resultRelInfo->ri_projectNewInfoValid = true; +} + +/* + * ExecGetInsertNewTuple + * This prepares a "new" tuple ready to be inserted into given result + * relation, by removing any junk columns of the plan's output tuple + * and (if necessary) coercing the tuple to the right tuple format. + */ +static TupleTableSlot * +ExecGetInsertNewTuple(ResultRelInfo *relinfo, + TupleTableSlot *planSlot) +{ + ProjectionInfo *newProj = relinfo->ri_projectNew; + ExprContext *econtext; + + /* + * If there's no projection to be done, just make sure the slot is of the + * right type for the target rel. If the planSlot is the right type we + * can use it as-is, else copy the data into ri_newTupleSlot. + */ + if (newProj == NULL) + { + if (relinfo->ri_newTupleSlot->tts_ops != planSlot->tts_ops) + { + ExecCopySlot(relinfo->ri_newTupleSlot, planSlot); + return relinfo->ri_newTupleSlot; + } + else + return planSlot; + } + + /* + * Else project; since the projection output slot is ri_newTupleSlot, this + * will also fix any slot-type problem. + * + * Note: currently, this is dead code, because INSERT cases don't receive + * any junk columns so there's never a projection to be done. + */ + econtext = newProj->pi_exprContext; + econtext->ecxt_outertuple = planSlot; + return ExecProject(newProj); +} + +/* + * ExecGetUpdateNewTuple + * This prepares a "new" tuple by combining an UPDATE subplan's output + * tuple (which contains values of changed columns) with unchanged + * columns taken from the old tuple. 
+ * + * The subplan tuple might also contain junk columns, which are ignored. + * Note that the projection also ensures we have a slot of the right type. + */ +TupleTableSlot * +ExecGetUpdateNewTuple(ResultRelInfo *relinfo, + TupleTableSlot *planSlot, + TupleTableSlot *oldSlot) +{ + ProjectionInfo *newProj = relinfo->ri_projectNew; + ExprContext *econtext; + + /* Use a few extra Asserts to protect against outside callers */ + Assert(relinfo->ri_projectNewInfoValid); + Assert(planSlot != NULL && !TTS_EMPTY(planSlot)); + Assert(oldSlot != NULL && !TTS_EMPTY(oldSlot)); + + econtext = newProj->pi_exprContext; + econtext->ecxt_outertuple = planSlot; + econtext->ecxt_scantuple = oldSlot; + return ExecProject(newProj); +} + + +/* ---------------------------------------------------------------- + * ExecInsert + * + * For INSERT, we have to insert the tuple into the target relation + * (or partition thereof) and insert appropriate tuples into the index + * relations. + * + * slot contains the new tuple value to be stored. + * planSlot is the output of the ModifyTable's subplan; we use it + * to access "junk" columns that are not going to be stored. + * + * Returns RETURNING result if any, otherwise NULL. + * + * This may change the currently active tuple conversion map in + * mtstate->mt_transition_capture, so the callers must take care to + * save the previous value to avoid losing track of it. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecInsert(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot, + EState *estate, + bool canSetTag) +{ + Relation resultRelationDesc; + List *recheckIndexes = NIL; + TupleTableSlot *result = NULL; + TransitionCaptureState *ar_insert_trig_tcs; + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + OnConflictAction onconflict = node->onConflictAction; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + MemoryContext oldContext; + + /* + * If the input result relation is a partitioned table, find the leaf + * partition to insert the tuple into. + */ + if (proute) + { + ResultRelInfo *partRelInfo; + + slot = ExecPrepareTupleRouting(mtstate, estate, proute, + resultRelInfo, slot, + &partRelInfo); + resultRelInfo = partRelInfo; + } + + ExecMaterializeSlot(slot); + + resultRelationDesc = resultRelInfo->ri_RelationDesc; + + /* + * Open the table's indexes, if we have not done so already, so that we + * can add new index entries for the inserted tuple. + */ + if (resultRelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, onconflict != ONCONFLICT_NONE); + + /* + * BEFORE ROW INSERT Triggers. + * + * Note: We fire BEFORE ROW TRIGGERS for every attempted insertion in an + * INSERT ... ON CONFLICT statement. We cannot check for constraint + * violations before firing these triggers, because they can change the + * values to insert. Also, they can run arbitrary user-defined code with + * side-effects that we can't cancel by just not inserting the tuple. 
+ */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row) + { + if (!ExecBRInsertTriggers(estate, resultRelInfo, slot)) + return NULL; /* "do nothing" */ + } + + /* INSTEAD OF ROW INSERT Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_instead_row) + { + if (!ExecIRInsertTriggers(estate, resultRelInfo, slot)) + return NULL; /* "do nothing" */ + } + else if (resultRelInfo->ri_FdwRoutine) + { + /* + * GENERATED expressions might reference the tableoid column, so + * (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_INSERT); + + /* + * If the FDW supports batching, and batching is requested, accumulate + * rows and insert them in batches. Otherwise use the per-row inserts. + */ + if (resultRelInfo->ri_BatchSize > 1) + { + /* + * If a certain number of tuples have already been accumulated, or + * a tuple has come for a different relation than that for the + * accumulated tuples, perform the batch insert + */ + if (resultRelInfo->ri_NumSlots == resultRelInfo->ri_BatchSize) + { + ExecBatchInsert(mtstate, resultRelInfo, + resultRelInfo->ri_Slots, + resultRelInfo->ri_PlanSlots, + resultRelInfo->ri_NumSlots, + estate, canSetTag); + resultRelInfo->ri_NumSlots = 0; + } + + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + + if (resultRelInfo->ri_Slots == NULL) + { + resultRelInfo->ri_Slots = palloc(sizeof(TupleTableSlot *) * + resultRelInfo->ri_BatchSize); + resultRelInfo->ri_PlanSlots = palloc(sizeof(TupleTableSlot *) * + resultRelInfo->ri_BatchSize); + } + + /* + * Initialize the batch slots. We don't know how many slots will + * be needed, so we initialize them as the batch grows, and we + * keep them across batches. To mitigate an inefficiency in how + * resource owner handles objects with many references (as with + * many slots all referencing the same tuple descriptor) we copy + * the appropriate tuple descriptor for each slot. + */ + if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized) + { + TupleDesc tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor); + TupleDesc plan_tdesc = + CreateTupleDescCopy(planSlot->tts_tupleDescriptor); + + resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] = + MakeSingleTupleTableSlot(tdesc, slot->tts_ops); + + resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] = + MakeSingleTupleTableSlot(plan_tdesc, planSlot->tts_ops); + + /* remember how many batch slots we initialized */ + resultRelInfo->ri_NumSlotsInitialized++; + } + + ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots], + slot); + + ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots], + planSlot); + + resultRelInfo->ri_NumSlots++; + + MemoryContextSwitchTo(oldContext); + + return NULL; + } + + /* + * insert into foreign table: let the FDW do it + */ + slot = resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate, + resultRelInfo, + slot, + planSlot); + + if (slot == NULL) /* "do nothing" */ + return NULL; + + /* + * AFTER ROW Triggers or RETURNING expressions might reference the + * tableoid column, so (re-)initialize tts_tableOid before evaluating + * them. (This covers the case where the FDW replaced the slot.) 
+ */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + } + else + { + WCOKind wco_kind; + + /* + * Constraints and GENERATED expressions might reference the tableoid + * column, so (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_INSERT); + + /* + * Check any RLS WITH CHECK policies. + * + * Normally we should check INSERT policies. But if the insert is the + * result of a partition key update that moved the tuple to a new + * partition, we should instead check UPDATE policies, because we are + * executing policies defined on the target table, and not those + * defined on the child partitions. + */ + wco_kind = (mtstate->operation == CMD_UPDATE) ? + WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK; + + /* + * ExecWithCheckOptions() will skip any WCOs which are not of the kind + * we are looking for at this point. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate); + + /* + * Check the constraints of the tuple. + */ + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + + /* + * Also check the tuple against the partition constraint, if there is + * one; except that if we got here via tuple-routing, we don't need to + * if there's no BR trigger defined on the partition. + */ + if (resultRelationDesc->rd_rel->relispartition && + (resultRelInfo->ri_RootResultRelInfo == NULL || + (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row))) + ExecPartitionCheck(resultRelInfo, slot, estate, true); + + if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) + { + /* Perform a speculative insertion. */ + uint32 specToken; + ItemPointerData conflictTid; + bool specConflict; + List *arbiterIndexes; + + arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes; + + /* + * Do a non-conclusive check for conflicts first. + * + * We're not holding any locks yet, so this doesn't guarantee that + * the later insert won't conflict. But it avoids leaving behind + * a lot of canceled speculative insertions, if you run a lot of + * INSERT ON CONFLICT statements that do conflict. + * + * We loop back here if we find a conflict below, either during + * the pre-check, or when we re-check after inserting the tuple + * speculatively. Better allow interrupts in case some bug makes + * this an infinite loop. + */ + vlock: + CHECK_FOR_INTERRUPTS(); + specConflict = false; + if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, + &conflictTid, arbiterIndexes)) + { + /* committed conflict tuple found */ + if (onconflict == ONCONFLICT_UPDATE) + { + /* + * In case of ON CONFLICT DO UPDATE, execute the UPDATE + * part. Be prepared to retry if the UPDATE fails because + * of another concurrent UPDATE/DELETE to the conflict + * tuple. + */ + TupleTableSlot *returning = NULL; + + if (ExecOnConflictUpdate(mtstate, resultRelInfo, + &conflictTid, planSlot, slot, + estate, canSetTag, &returning)) + { + InstrCountTuples2(&mtstate->ps, 1); + return returning; + } + else + goto vlock; + } + else + { + /* + * In case of ON CONFLICT DO NOTHING, do nothing. However, + * verify that the tuple is visible to the executor's MVCC + * snapshot at higher isolation levels. 
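+ * For instance, under REPEATABLE READ, if the conflicting row was
+ * committed by another transaction after our snapshot was taken, we
+ * must raise a serialization failure rather than silently doing
+ * nothing.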
+ * + * Using ExecGetReturningSlot() to store the tuple for the + * recheck isn't that pretty, but we can't trivially use + * the input slot, because it might not be of a compatible + * type. As there's no conflicting usage of + * ExecGetReturningSlot() in the DO NOTHING case... + */ + Assert(onconflict == ONCONFLICT_NOTHING); + ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid, + ExecGetReturningSlot(estate, resultRelInfo)); + InstrCountTuples2(&mtstate->ps, 1); + return NULL; + } + } + + /* + * Before we start insertion proper, acquire our "speculative + * insertion lock". Others can use that to wait for us to decide + * if we're going to go ahead with the insertion, instead of + * waiting for the whole transaction to complete. + */ + specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); + + /* insert the tuple, with the speculative token */ + table_tuple_insert_speculative(resultRelationDesc, slot, + estate->es_output_cid, + 0, + NULL, + specToken); + + /* insert index entries for tuple */ + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, true, + &specConflict, + arbiterIndexes); + + /* adjust the tuple's state accordingly */ + table_tuple_complete_speculative(resultRelationDesc, slot, + specToken, !specConflict); + + /* + * Wake up anyone waiting for our decision. They will re-check + * the tuple, see that it's no longer speculative, and wait on our + * XID as if this was a regularly inserted tuple all along. Or if + * we killed the tuple, they will see it's dead, and proceed as if + * the tuple never existed. + */ + SpeculativeInsertionLockRelease(GetCurrentTransactionId()); + + /* + * If there was a conflict, start from the beginning. We'll do + * the pre-check again, which will now find the conflicting tuple + * (unless it aborts before we get there). + */ + if (specConflict) + { + list_free(recheckIndexes); + goto vlock; + } + + /* Since there was no insertion conflict, we're done */ + } + else + { + /* insert the tuple normally */ + table_tuple_insert(resultRelationDesc, slot, + estate->es_output_cid, + 0, NULL); + + /* insert index entries for tuple */ + if (resultRelInfo->ri_NumIndices > 0) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, + false, NULL, NIL); + } + } + + if (canSetTag) + (estate->es_processed)++; + + /* + * If this insert is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition NEW TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_insert_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_new_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, NULL, + NULL, + slot, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * INSERT trigger fired below doesn't capture it again. + */ + ar_insert_trig_tcs = NULL; + } + + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes, + ar_insert_trig_tcs); + + list_free(recheckIndexes); + + /* + * Check any WITH CHECK OPTION constraints from parent views. We are + * required to do this after testing all constraints and uniqueness + * violations per the SQL spec, so we do it after actually inserting the + * record into the heap and all indexes. 
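+ *
+ * (Illustration with hypothetical names: given
+ *     CREATE VIEW pos_v AS SELECT * FROM t WHERE a > 0
+ *         WITH CHECK OPTION;
+ * an INSERT through pos_v that produces a row with a <= 0 reaches
+ * this point and only then fails the view's check.)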
+ * + * ExecWithCheckOptions will elog(ERROR) if a violation is found, so the + * tuple will never be seen, if it violates the WITH CHECK OPTION. + * + * ExecWithCheckOptions() will skip any WCOs which are not of the kind we + * are looking for at this point. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate); + + /* Process RETURNING if present */ + if (resultRelInfo->ri_projectReturning) + result = ExecProcessReturning(resultRelInfo, slot, planSlot); + + return result; +} + +/* ---------------------------------------------------------------- + * ExecBatchInsert + * + * Insert multiple tuples in an efficient way. + * Currently, this handles inserting into a foreign table without + * RETURNING clause. + * ---------------------------------------------------------------- + */ +static void +ExecBatchInsert(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + TupleTableSlot **slots, + TupleTableSlot **planSlots, + int numSlots, + EState *estate, + bool canSetTag) +{ + int i; + int numInserted = numSlots; + TupleTableSlot *slot = NULL; + TupleTableSlot **rslots; + + /* + * insert into foreign table: let the FDW do it + */ + rslots = resultRelInfo->ri_FdwRoutine->ExecForeignBatchInsert(estate, + resultRelInfo, + slots, + planSlots, + &numInserted); + + for (i = 0; i < numInserted; i++) + { + slot = rslots[i]; + + /* + * AFTER ROW Triggers or RETURNING expressions might reference the + * tableoid column, so (re-)initialize tts_tableOid before evaluating + * them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, slot, NIL, + mtstate->mt_transition_capture); + + /* + * Check any WITH CHECK OPTION constraints from parent views. See the + * comment in ExecInsert. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate); + } + + if (canSetTag && numInserted > 0) + estate->es_processed += numInserted; +} + +/* ---------------------------------------------------------------- + * ExecDelete + * + * DELETE is like UPDATE, except that we delete the tuple and no + * index modifications are needed. + * + * When deleting from a table, tupleid identifies the tuple to + * delete and oldtuple is NULL. When deleting from a view, + * oldtuple is passed to the INSTEAD OF triggers and identifies + * what to delete, and tupleid is invalid. When deleting from a + * foreign table, tupleid is invalid; the FDW has to figure out + * which row to delete using data from the planSlot. oldtuple is + * passed to foreign table triggers; it is NULL when the foreign + * table has no relevant triggers. We use tupleDeleted to indicate + * whether the tuple is actually deleted, callers can use it to + * decide whether to continue the operation. When this DELETE is a + * part of an UPDATE of partition-key, then the slot returned by + * EvalPlanQual() is passed back using output parameter epqslot. + * + * Returns RETURNING result if any, otherwise NULL. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecDelete(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer tupleid, + HeapTuple oldtuple, + TupleTableSlot *planSlot, + EPQState *epqstate, + EState *estate, + bool processReturning, + bool canSetTag, + bool changingPart, + bool *tupleDeleted, + TupleTableSlot **epqreturnslot) +{ + Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; + TM_Result result; + TM_FailureData tmfd; + TupleTableSlot *slot = NULL; + TransitionCaptureState *ar_delete_trig_tcs; + + if (tupleDeleted) + *tupleDeleted = false; + + /* BEFORE ROW DELETE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_before_row) + { + bool dodelete; + + dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, + tupleid, oldtuple, epqreturnslot); + + if (!dodelete) /* "do nothing" */ + return NULL; + } + + /* INSTEAD OF ROW DELETE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_instead_row) + { + bool dodelete; + + Assert(oldtuple != NULL); + dodelete = ExecIRDeleteTriggers(estate, resultRelInfo, oldtuple); + + if (!dodelete) /* "do nothing" */ + return NULL; + } + else if (resultRelInfo->ri_FdwRoutine) + { + /* + * delete from foreign table: let the FDW do it + * + * We offer the returning slot as a place to store RETURNING data, + * although the FDW can return some other slot if it wants. + */ + slot = ExecGetReturningSlot(estate, resultRelInfo); + slot = resultRelInfo->ri_FdwRoutine->ExecForeignDelete(estate, + resultRelInfo, + slot, + planSlot); + + if (slot == NULL) /* "do nothing" */ + return NULL; + + /* + * RETURNING expressions might reference the tableoid column, so + * (re)initialize tts_tableOid before evaluating them. + */ + if (TTS_EMPTY(slot)) + ExecStoreAllNullTuple(slot); + + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + } + else + { + /* + * delete the tuple + * + * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check + * that the row to be deleted is visible to that snapshot, and throw a + * can't-serialize error if not. This is a special-case behavior + * needed for referential integrity updates in transaction-snapshot + * mode transactions. + */ +ldelete:; + result = table_tuple_delete(resultRelationDesc, tupleid, + estate->es_output_cid, + estate->es_snapshot, + estate->es_crosscheck_snapshot, + true /* wait for commit */ , + &tmfd, + changingPart); + + switch (result) + { + case TM_SelfModified: + + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. The former case is possible in a join DELETE + * where multiple tuples join to the same target tuple. This + * is somewhat questionable, but Postgres has always allowed + * it: we just ignore additional deletion attempts. + * + * The latter case arises if the tuple is modified by a + * command in a BEFORE trigger, or perhaps by a command in a + * volatile function used in the query. In such situations we + * should not ignore the deletion, but it is equally unsafe to + * proceed. We don't want to discard the original DELETE + * while keeping the triggered actions based on its deletion; + * and it would be no better to allow the original DELETE + * while discarding updates that it triggered. The row update + * carries some information that might be important according + * to business rules; so throwing an error is the only safe + * course. 
+ * + * If a trigger actually intends this type of interaction, it + * can re-execute the DELETE and then return NULL to cancel + * the outer delete. + */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + + /* Else, already deleted by self; nothing to do */ + return NULL; + + case TM_Ok: + break; + + case TM_Updated: + { + TupleTableSlot *inputslot; + TupleTableSlot *epqslot; + + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * Already know that we're going to need to do EPQ, so + * fetch tuple directly into the right slot. + */ + EvalPlanQualBegin(epqstate); + inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc, + resultRelInfo->ri_RangeTableIndex); + + result = table_tuple_lock(resultRelationDesc, tupleid, + estate->es_snapshot, + inputslot, estate->es_output_cid, + LockTupleExclusive, LockWaitBlock, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + &tmfd); + + switch (result) + { + case TM_Ok: + Assert(tmfd.traversed); + epqslot = EvalPlanQual(epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + inputslot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; + + /* + * If requested, skip delete and pass back the + * updated row. + */ + if (epqreturnslot) + { + *epqreturnslot = epqslot; + return NULL; + } + else + goto ldelete; + + case TM_SelfModified: + + /* + * This can be reached when following an update + * chain from a tuple updated by another session, + * reaching a tuple that was already updated in + * this transaction. If previously updated by this + * command, ignore the delete, otherwise error + * out. + * + * See also TM_SelfModified response to + * table_tuple_delete() above. + */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + return NULL; + + case TM_Deleted: + /* tuple already deleted; nothing to do */ + return NULL; + + default: + + /* + * TM_Invisible should be impossible because we're + * waiting for updated row versions, and would + * already have errored out if the first version + * is invisible. + * + * TM_Updated should be impossible, because we're + * locking the latest version via + * TUPLE_LOCK_FLAG_FIND_LAST_VERSION. + */ + elog(ERROR, "unexpected table_tuple_lock status: %u", + result); + return NULL; + } + + Assert(false); + break; + } + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + /* tuple already deleted; nothing to do */ + return NULL; + + default: + elog(ERROR, "unrecognized table_tuple_delete status: %u", + result); + return NULL; + } + + /* + * Note: Normally one would think that we have to delete index tuples + * associated with the heap tuple now... + * + * ... but in POSTGRES, we have no need to do this because VACUUM will + * take care of it later. 
We can't delete index tuples immediately + * anyway, since the tuple is still visible to other transactions. + */ + } + + if (canSetTag) + (estate->es_processed)++; + + /* Tell caller that the delete actually happened. */ + if (tupleDeleted) + *tupleDeleted = true; + + /* + * If this delete is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition OLD TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_delete_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_old_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, + tupleid, + oldtuple, + NULL, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * DELETE trigger fired below doesn't capture it again. + */ + ar_delete_trig_tcs = NULL; + } + + /* AFTER ROW DELETE Triggers */ + ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, + ar_delete_trig_tcs); + + /* Process RETURNING if present and if requested */ + if (processReturning && resultRelInfo->ri_projectReturning) + { + /* + * We have to put the target tuple into a slot, which means first we + * gotta fetch it. We can use the trigger tuple slot. + */ + TupleTableSlot *rslot; + + if (resultRelInfo->ri_FdwRoutine) + { + /* FDW must have provided a slot containing the deleted row */ + Assert(!TupIsNull(slot)); + } + else + { + slot = ExecGetReturningSlot(estate, resultRelInfo); + if (oldtuple != NULL) + { + ExecForceStoreHeapTuple(oldtuple, slot, false); + } + else + { + if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid, + SnapshotAny, slot)) + elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING"); + } + } + + rslot = ExecProcessReturning(resultRelInfo, slot, planSlot); + + /* + * Before releasing the target tuple again, make sure rslot has a + * local copy of any pass-by-reference values. + */ + ExecMaterializeSlot(rslot); + + ExecClearTuple(slot); + + return rslot; + } + + return NULL; +} + +/* + * ExecCrossPartitionUpdate --- Move an updated tuple to another partition. + * + * This works by first deleting the old tuple from the current partition, + * followed by inserting the new tuple into the root parent table, that is, + * mtstate->rootResultRelInfo. It will be re-routed from there to the + * correct partition. + * + * Returns true if the tuple has been successfully moved, or if it's found + * that the tuple was concurrently deleted so there's nothing more to do + * for the caller. + * + * False is returned if the tuple we're trying to move is found to have been + * concurrently updated. In that case, the caller must to check if the + * updated tuple that's returned in *retry_slot still needs to be re-routed, + * and call this function again or perform a regular update accordingly. 
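+ *
+ * (Illustration, names hypothetical: for a table partitioned by range
+ * on "logdate", an UPDATE like
+ *     UPDATE measurements SET logdate = logdate + interval '1 month';
+ * can move rows into a different partition; each such row is handled
+ * here as a DELETE from its old partition plus an INSERT that is
+ * re-routed from the root, as described above.)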
+ */ +static bool +ExecCrossPartitionUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *slot, TupleTableSlot *planSlot, + EPQState *epqstate, bool canSetTag, + TupleTableSlot **retry_slot, + TupleTableSlot **inserted_tuple) +{ + EState *estate = mtstate->ps.state; + TupleConversionMap *tupconv_map; + bool tuple_deleted; + TupleTableSlot *epqslot = NULL; + + *inserted_tuple = NULL; + *retry_slot = NULL; + + /* + * Disallow an INSERT ON CONFLICT DO UPDATE that causes the original row + * to migrate to a different partition. Maybe this can be implemented + * some day, but it seems a fringe feature with little redeeming value. + */ + if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("invalid ON UPDATE specification"), + errdetail("The result tuple would appear in a different partition than the original tuple."))); + + /* + * When an UPDATE is run directly on a leaf partition, simply fail with a + * partition constraint violation error. + */ + if (resultRelInfo == mtstate->rootResultRelInfo) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + /* Initialize tuple routing info if not already done. */ + if (mtstate->mt_partition_tuple_routing == NULL) + { + Relation rootRel = mtstate->rootResultRelInfo->ri_RelationDesc; + MemoryContext oldcxt; + + /* Things built here have to last for the query duration. */ + oldcxt = MemoryContextSwitchTo(estate->es_query_cxt); + + mtstate->mt_partition_tuple_routing = + ExecSetupPartitionTupleRouting(estate, rootRel); + + /* + * Before a partition's tuple can be re-routed, it must first be + * converted to the root's format, so we'll need a slot for storing + * such tuples. + */ + Assert(mtstate->mt_root_tuple_slot == NULL); + mtstate->mt_root_tuple_slot = table_slot_create(rootRel, NULL); + + MemoryContextSwitchTo(oldcxt); + } + + /* + * Row movement, part 1. Delete the tuple, but skip RETURNING processing. + * We want to return rows from INSERT. + */ + ExecDelete(mtstate, resultRelInfo, tupleid, oldtuple, planSlot, + epqstate, estate, + false, /* processReturning */ + false, /* canSetTag */ + true, /* changingPart */ + &tuple_deleted, &epqslot); + + /* + * For some reason if DELETE didn't happen (e.g. trigger prevented it, or + * it was already deleted by self, or it was concurrently deleted by + * another transaction), then we should skip the insert as well; + * otherwise, an UPDATE could cause an increase in the total number of + * rows across all partitions, which is clearly wrong. + * + * For a normal UPDATE, the case where the tuple has been the subject of a + * concurrent UPDATE or DELETE would be handled by the EvalPlanQual + * machinery, but for an UPDATE that we've translated into a DELETE from + * this partition and an INSERT into some other partition, that's not + * available, because CTID chains can't span relation boundaries. We + * mimic the semantics to a limited extent by skipping the INSERT if the + * DELETE fails to find a tuple. This ensures that two concurrent + * attempts to UPDATE the same tuple at the same time can't turn one tuple + * into two, and that an UPDATE of a just-deleted tuple can't resurrect + * it. + */ + if (!tuple_deleted) + { + /* + * epqslot will be typically NULL. 
But when ExecDelete() finds that + * another transaction has concurrently updated the same row, it + * re-fetches the row, skips the delete, and epqslot is set to the + * re-fetched tuple slot. In that case, we need to do all the checks + * again. + */ + if (TupIsNull(epqslot)) + return true; + else + { + /* Fetch the most recent version of old tuple. */ + TupleTableSlot *oldSlot; + + /* ... but first, make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + oldSlot = resultRelInfo->ri_oldTupleSlot; + if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc, + tupleid, + SnapshotAny, + oldSlot)) + elog(ERROR, "failed to fetch tuple being updated"); + *retry_slot = ExecGetUpdateNewTuple(resultRelInfo, epqslot, + oldSlot); + return false; + } + } + + /* + * resultRelInfo is one of the per-relation resultRelInfos. So we should + * convert the tuple into root's tuple descriptor if needed, since + * ExecInsert() starts the search from root. + */ + tupconv_map = ExecGetChildToRootMap(resultRelInfo); + if (tupconv_map != NULL) + slot = execute_attr_map_slot(tupconv_map->attrMap, + slot, + mtstate->mt_root_tuple_slot); + + /* Tuple routing starts from the root table. */ + *inserted_tuple = ExecInsert(mtstate, mtstate->rootResultRelInfo, slot, + planSlot, estate, canSetTag); + + /* + * Reset the transition state that may possibly have been written by + * INSERT. + */ + if (mtstate->mt_transition_capture) + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + + /* We're done moving. */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecUpdate + * + * note: we can't run UPDATE queries with transactions + * off because UPDATEs are actually INSERTs and our + * scan will mistakenly loop forever, updating the tuple + * it just inserted.. This should be fixed but until it + * is, we don't want to get stuck in an infinite loop + * which corrupts your database.. + * + * When updating a table, tupleid identifies the tuple to + * update and oldtuple is NULL. When updating a view, oldtuple + * is passed to the INSTEAD OF triggers and identifies what to + * update, and tupleid is invalid. When updating a foreign table, + * tupleid is invalid; the FDW has to figure out which row to + * update using data from the planSlot. oldtuple is passed to + * foreign table triggers; it is NULL when the foreign table has + * no relevant triggers. + * + * slot contains the new tuple value to be stored. + * planSlot is the output of the ModifyTable's subplan; we use it + * to access values from other input tables (for RETURNING), + * row-ID junk columns, etc. + * + * Returns RETURNING result if any, otherwise NULL. 
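+ *
+ * As a quick recap of the cases described above:
+ *		plain table:	tupleid valid,   oldtuple NULL
+ *		view:			tupleid invalid, oldtuple the row for INSTEAD OF triggers
+ *		foreign table:	tupleid invalid, oldtuple NULL unless row triggers use it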
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer tupleid, + HeapTuple oldtuple, + TupleTableSlot *slot, + TupleTableSlot *planSlot, + EPQState *epqstate, + EState *estate, + bool canSetTag) +{ + Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; + TM_Result result; + TM_FailureData tmfd; + List *recheckIndexes = NIL; + + /* + * abort the operation if not running transactions + */ + if (IsBootstrapProcessingMode()) + elog(ERROR, "cannot UPDATE during bootstrap"); + + ExecMaterializeSlot(slot); + + /* + * Open the table's indexes, if we have not done so already, so that we + * can add new index entries for the updated tuple. + */ + if (resultRelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, false); + + /* BEFORE ROW UPDATE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row) + { + if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, + tupleid, oldtuple, slot)) + return NULL; /* "do nothing" */ + } + + /* INSTEAD OF ROW UPDATE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_instead_row) + { + if (!ExecIRUpdateTriggers(estate, resultRelInfo, + oldtuple, slot)) + return NULL; /* "do nothing" */ + } + else if (resultRelInfo->ri_FdwRoutine) + { + /* + * GENERATED expressions might reference the tableoid column, so + * (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_UPDATE); + + /* + * update in foreign table: let the FDW do it + */ + slot = resultRelInfo->ri_FdwRoutine->ExecForeignUpdate(estate, + resultRelInfo, + slot, + planSlot); + + if (slot == NULL) /* "do nothing" */ + return NULL; + + /* + * AFTER ROW Triggers or RETURNING expressions might reference the + * tableoid column, so (re-)initialize tts_tableOid before evaluating + * them. (This covers the case where the FDW replaced the slot.) + */ + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + } + else + { + LockTupleMode lockmode; + bool partition_constraint_failed; + bool update_indexes; + + /* + * Constraints and GENERATED expressions might reference the tableoid + * column, so (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_UPDATE); + + /* + * Check any RLS UPDATE WITH CHECK policies + * + * If we generate a new candidate tuple after EvalPlanQual testing, we + * must loop back here and recheck any RLS policies and constraints. + * (We don't need to redo triggers, however. If there are any BEFORE + * triggers then trigger.c will have done table_tuple_lock to lock the + * correct tuple, so there's no need to do them again.) + */ +lreplace:; + + /* ensure slot is independent, consider e.g. 
EPQ */ + ExecMaterializeSlot(slot); + + /* + * If partition constraint fails, this row might get moved to another + * partition, in which case we should check the RLS CHECK policy just + * before inserting into the new partition, rather than doing it here. + * This is because a trigger on that partition might again change the + * row. So skip the WCO checks if the partition constraint fails. + */ + partition_constraint_failed = + resultRelationDesc->rd_rel->relispartition && + !ExecPartitionCheck(resultRelInfo, slot, estate, false); + + if (!partition_constraint_failed && + resultRelInfo->ri_WithCheckOptions != NIL) + { + /* + * ExecWithCheckOptions() will skip any WCOs which are not of the + * kind we are looking for at this point. + */ + ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK, + resultRelInfo, slot, estate); + } + + /* + * If a partition check failed, try to move the row into the right + * partition. + */ + if (partition_constraint_failed) + { + TupleTableSlot *inserted_tuple, + *retry_slot; + bool retry; + + /* + * ExecCrossPartitionUpdate will first DELETE the row from the + * partition it's currently in and then insert it back into the + * root table, which will re-route it to the correct partition. + * The first part may have to be repeated if it is detected that + * the tuple we're trying to move has been concurrently updated. + */ + retry = !ExecCrossPartitionUpdate(mtstate, resultRelInfo, tupleid, + oldtuple, slot, planSlot, + epqstate, canSetTag, + &retry_slot, &inserted_tuple); + if (retry) + { + slot = retry_slot; + goto lreplace; + } + + return inserted_tuple; + } + + /* + * Check the constraints of the tuple. We've already checked the + * partition constraint above; however, we must still ensure the tuple + * passes all other constraints, so we will call ExecConstraints() and + * have it validate all remaining checks. + */ + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + + /* + * replace the heap tuple + * + * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check + * that the row to be updated is visible to that snapshot, and throw a + * can't-serialize error if not. This is a special-case behavior + * needed for referential integrity updates in transaction-snapshot + * mode transactions. + */ + result = table_tuple_update(resultRelationDesc, tupleid, slot, + estate->es_output_cid, + estate->es_snapshot, + estate->es_crosscheck_snapshot, + true /* wait for commit */ , + &tmfd, &lockmode, &update_indexes); + + switch (result) + { + case TM_SelfModified: + + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. The former case is possible in a join UPDATE + * where multiple tuples join to the same target tuple. This + * is pretty questionable, but Postgres has always allowed it: + * we just execute the first update action and ignore + * additional update attempts. + * + * The latter case arises if the tuple is modified by a + * command in a BEFORE trigger, or perhaps by a command in a + * volatile function used in the query. In such situations we + * should not ignore the update, but it is equally unsafe to + * proceed. We don't want to discard the original UPDATE + * while keeping the triggered actions based on it; and we + * have no principled way to merge this update with the + * previous ones. So throwing an error is the only safe + * course. 
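+ *
+ * (The first, multiple-match case can be produced by a join UPDATE
+ * such as, schematically,
+ *		UPDATE target t SET val = s.val FROM src s WHERE s.tid = t.id;
+ * when more than one src row carries the same tid value.)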
+ * + * If a trigger actually intends this type of interaction, it + * can re-execute the UPDATE (assuming it can figure out how) + * and then return NULL to cancel the outer update. + */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be updated was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + + /* Else, already updated by self; nothing to do */ + return NULL; + + case TM_Ok: + break; + + case TM_Updated: + { + TupleTableSlot *inputslot; + TupleTableSlot *epqslot; + TupleTableSlot *oldSlot; + + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * Already know that we're going to need to do EPQ, so + * fetch tuple directly into the right slot. + */ + inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc, + resultRelInfo->ri_RangeTableIndex); + + result = table_tuple_lock(resultRelationDesc, tupleid, + estate->es_snapshot, + inputslot, estate->es_output_cid, + lockmode, LockWaitBlock, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + &tmfd); + + switch (result) + { + case TM_Ok: + Assert(tmfd.traversed); + + epqslot = EvalPlanQual(epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + inputslot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; + + /* Make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + + /* Fetch the most recent version of old tuple. */ + oldSlot = resultRelInfo->ri_oldTupleSlot; + if (!table_tuple_fetch_row_version(resultRelationDesc, + tupleid, + SnapshotAny, + oldSlot)) + elog(ERROR, "failed to fetch tuple being updated"); + slot = ExecGetUpdateNewTuple(resultRelInfo, + epqslot, oldSlot); + goto lreplace; + + case TM_Deleted: + /* tuple already deleted; nothing to do */ + return NULL; + + case TM_SelfModified: + + /* + * This can be reached when following an update + * chain from a tuple updated by another session, + * reaching a tuple that was already updated in + * this transaction. If previously modified by + * this command, ignore the redundant update, + * otherwise error out. + * + * See also TM_SelfModified response to + * table_tuple_update() above. 
+ */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be updated was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + return NULL; + + default: + /* see table_tuple_lock call in ExecDelete() */ + elog(ERROR, "unexpected table_tuple_lock status: %u", + result); + return NULL; + } + } + + break; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + /* tuple already deleted; nothing to do */ + return NULL; + + default: + elog(ERROR, "unrecognized table_tuple_update status: %u", + result); + return NULL; + } + + /* insert index entries for tuple if necessary */ + if (resultRelInfo->ri_NumIndices > 0 && update_indexes) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, true, false, + NULL, NIL); + } + + if (canSetTag) + (estate->es_processed)++; + + /* AFTER ROW UPDATE Triggers */ + ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, slot, + recheckIndexes, + mtstate->operation == CMD_INSERT ? + mtstate->mt_oc_transition_capture : + mtstate->mt_transition_capture); + + list_free(recheckIndexes); + + /* + * Check any WITH CHECK OPTION constraints from parent views. We are + * required to do this after testing all constraints and uniqueness + * violations per the SQL spec, so we do it after actually updating the + * record in the heap and all indexes. + * + * ExecWithCheckOptions() will skip any WCOs which are not of the kind we + * are looking for at this point. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate); + + /* Process RETURNING if present */ + if (resultRelInfo->ri_projectReturning) + return ExecProcessReturning(resultRelInfo, slot, planSlot); + + return NULL; +} + +/* + * ExecOnConflictUpdate --- execute UPDATE of INSERT ON CONFLICT DO UPDATE + * + * Try to lock tuple for update as part of speculative insertion. If + * a qual originating from ON CONFLICT DO UPDATE is satisfied, update + * (but still lock row, even though it may not satisfy estate's + * snapshot). + * + * Returns true if we're done (with or without an update), or false if + * the caller must retry the INSERT from scratch. + */ +static bool +ExecOnConflictUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer conflictTid, + TupleTableSlot *planSlot, + TupleTableSlot *excludedSlot, + EState *estate, + bool canSetTag, + TupleTableSlot **returning) +{ + ExprContext *econtext = mtstate->ps.ps_ExprContext; + Relation relation = resultRelInfo->ri_RelationDesc; + ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; + TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; + TM_FailureData tmfd; + LockTupleMode lockmode; + TM_Result test; + Datum xminDatum; + TransactionId xmin; + bool isnull; + + /* Determine lock mode to use */ + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + + /* + * Lock tuple for update. Don't follow updates when tuple cannot be + * locked without doing so. A row locking conflict here means our + * previous conclusion that the tuple is conclusively committed is not + * true anymore. 
+ */ + test = table_tuple_lock(relation, conflictTid, + estate->es_snapshot, + existing, estate->es_output_cid, + lockmode, LockWaitBlock, 0, + &tmfd); + switch (test) + { + case TM_Ok: + /* success! */ + break; + + case TM_Invisible: + + /* + * This can occur when a just inserted tuple is updated again in + * the same command. E.g. because multiple rows with the same + * conflicting key values are inserted. + * + * This is somewhat similar to the ExecUpdate() TM_SelfModified + * case. We do not want to proceed because it would lead to the + * same row being updated a second time in some unspecified order, + * and in contrast to plain UPDATEs there's no historical behavior + * to break. + * + * It is the user's responsibility to prevent this situation from + * occurring. These problems are why SQL-2003 similarly specifies + * that for SQL MERGE, an exception must be raised in the event of + * an attempt to update the same row twice. + */ + xminDatum = slot_getsysattr(existing, + MinTransactionIdAttributeNumber, + &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + if (TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("ON CONFLICT DO UPDATE command cannot affect row a second time"), + errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); + + /* This shouldn't happen */ + elog(ERROR, "attempted to lock invisible tuple"); + break; + + case TM_SelfModified: + + /* + * This state should never be reached. As a dirty snapshot is used + * to find conflicting tuples, speculative insertion wouldn't have + * seen this row to conflict with. + */ + elog(ERROR, "unexpected self-updated tuple"); + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * As long as we don't support an UPDATE of INSERT ON CONFLICT for + * a partitioned table we shouldn't reach to a case where tuple to + * be lock is moved to another partition due to concurrent update + * of the partition key. + */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + + /* + * Tell caller to try again from the very start. + * + * It does not make sense to use the usual EvalPlanQual() style + * loop here, as the new version of the row might not conflict + * anymore, or the conflicting tuple has actually been deleted. + */ + ExecClearTuple(existing); + return false; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + + /* see TM_Updated case */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + ExecClearTuple(existing); + return false; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + } + + /* Success, the tuple is locked. */ + + /* + * Verify that the tuple is visible to our MVCC snapshot if the current + * isolation level mandates that. + * + * It's not sufficient to rely on the check within ExecUpdate() as e.g. + * CONFLICT ... WHERE clause may prevent us from reaching that. + * + * This means we only ever continue when a new command in the current + * transaction could see the row, even though in READ COMMITTED mode the + * tuple will not be visible according to the current statement's + * snapshot. 
This is in line with the way UPDATE deals with newer tuple + * versions. + */ + ExecCheckTupleVisible(estate, relation, existing); + + /* + * Make tuple and any needed join variables available to ExecQual and + * ExecProject. The EXCLUDED tuple is installed in ecxt_innertuple, while + * the target's existing tuple is installed in the scantuple. EXCLUDED + * has been made to reference INNER_VAR in setrefs.c, but there is no + * other redirection. + */ + econtext->ecxt_scantuple = existing; + econtext->ecxt_innertuple = excludedSlot; + econtext->ecxt_outertuple = NULL; + + if (!ExecQual(onConflictSetWhere, econtext)) + { + ExecClearTuple(existing); /* see return below */ + InstrCountFiltered1(&mtstate->ps, 1); + return true; /* done with the tuple */ + } + + if (resultRelInfo->ri_WithCheckOptions != NIL) + { + /* + * Check target's existing tuple against UPDATE-applicable USING + * security barrier quals (if any), enforced here as RLS checks/WCOs. + * + * The rewriter creates UPDATE RLS checks/WCOs for UPDATE security + * quals, and stores them as WCOs of "kind" WCO_RLS_CONFLICT_CHECK, + * but that's almost the extent of its special handling for ON + * CONFLICT DO UPDATE. + * + * The rewriter will also have associated UPDATE applicable straight + * RLS checks/WCOs for the benefit of the ExecUpdate() call that + * follows. INSERTs and UPDATEs naturally have mutually exclusive WCO + * kinds, so there is no danger of spurious over-enforcement in the + * INSERT or UPDATE path. + */ + ExecWithCheckOptions(WCO_RLS_CONFLICT_CHECK, resultRelInfo, + existing, + mtstate->ps.state); + } + + /* Project the new tuple version */ + ExecProject(resultRelInfo->ri_onConflict->oc_ProjInfo); + + /* + * Note that it is possible that the target tuple has been modified in + * this session, after the above table_tuple_lock. We choose to not error + * out in that case, in line with ExecUpdate's treatment of similar cases. + * This can happen if an UPDATE is triggered from within ExecQual(), + * ExecWithCheckOptions() or ExecProject() above, e.g. by selecting from a + * wCTE in the ON CONFLICT's SET. + */ + + /* Execute UPDATE with projection */ + *returning = ExecUpdate(mtstate, resultRelInfo, conflictTid, NULL, + resultRelInfo->ri_onConflict->oc_ProjSlot, + planSlot, + &mtstate->mt_epqstate, mtstate->ps.state, + canSetTag); + + /* + * Clear out existing tuple, as there might not be another conflict among + * the next input rows. Don't want to hold resources till the end of the + * query. 
+ */ + ExecClearTuple(existing); + return true; +} + + +/* + * Process BEFORE EACH STATEMENT triggers + */ +static void +fireBSTriggers(ModifyTableState *node) +{ + ModifyTable *plan = (ModifyTable *) node->ps.plan; + ResultRelInfo *resultRelInfo = node->rootResultRelInfo; + + switch (node->operation) + { + case CMD_INSERT: + ExecBSInsertTriggers(node->ps.state, resultRelInfo); + if (plan->onConflictAction == ONCONFLICT_UPDATE) + ExecBSUpdateTriggers(node->ps.state, + resultRelInfo); + break; + case CMD_UPDATE: + ExecBSUpdateTriggers(node->ps.state, resultRelInfo); + break; + case CMD_DELETE: + ExecBSDeleteTriggers(node->ps.state, resultRelInfo); + break; + default: + elog(ERROR, "unknown operation"); + break; + } +} + +/* + * Process AFTER EACH STATEMENT triggers + */ +static void +fireASTriggers(ModifyTableState *node) +{ + ModifyTable *plan = (ModifyTable *) node->ps.plan; + ResultRelInfo *resultRelInfo = node->rootResultRelInfo; + + switch (node->operation) + { + case CMD_INSERT: + if (plan->onConflictAction == ONCONFLICT_UPDATE) + ExecASUpdateTriggers(node->ps.state, + resultRelInfo, + node->mt_oc_transition_capture); + ExecASInsertTriggers(node->ps.state, resultRelInfo, + node->mt_transition_capture); + break; + case CMD_UPDATE: + ExecASUpdateTriggers(node->ps.state, resultRelInfo, + node->mt_transition_capture); + break; + case CMD_DELETE: + ExecASDeleteTriggers(node->ps.state, resultRelInfo, + node->mt_transition_capture); + break; + default: + elog(ERROR, "unknown operation"); + break; + } +} + +/* + * Set up the state needed for collecting transition tuples for AFTER + * triggers. + */ +static void +ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) +{ + ModifyTable *plan = (ModifyTable *) mtstate->ps.plan; + ResultRelInfo *targetRelInfo = mtstate->rootResultRelInfo; + + /* Check for transition tables on the directly targeted relation. */ + mtstate->mt_transition_capture = + MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc, + RelationGetRelid(targetRelInfo->ri_RelationDesc), + mtstate->operation); + if (plan->operation == CMD_INSERT && + plan->onConflictAction == ONCONFLICT_UPDATE) + mtstate->mt_oc_transition_capture = + MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc, + RelationGetRelid(targetRelInfo->ri_RelationDesc), + CMD_UPDATE); +} + +/* + * ExecPrepareTupleRouting --- prepare for routing one tuple + * + * Determine the partition in which the tuple in slot is to be inserted, + * and return its ResultRelInfo in *partRelInfo. The return value is + * a slot holding the tuple of the partition rowtype. + * + * This also sets the transition table information in mtstate based on the + * selected partition. + */ +static TupleTableSlot * +ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot, + ResultRelInfo **partRelInfo) +{ + ResultRelInfo *partrel; + TupleConversionMap *map; + + /* + * Lookup the target partition's ResultRelInfo. If ExecFindPartition does + * not find a valid partition for the tuple in 'slot' then an error is + * raised. An error may also be raised if the found partition is not a + * valid target for INSERTs. This is required since a partitioned table + * UPDATE to another partition becomes a DELETE+INSERT. + */ + partrel = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate); + + /* + * If we're capturing transition tuples, we might need to convert from the + * partition rowtype to root partitioned table's rowtype. 
But if there + * are no BEFORE triggers on the partition that could change the tuple, we + * can just remember the original unconverted tuple to avoid a needless + * round trip conversion. + */ + if (mtstate->mt_transition_capture != NULL) + { + bool has_before_insert_row_trig; + + has_before_insert_row_trig = (partrel->ri_TrigDesc && + partrel->ri_TrigDesc->trig_insert_before_row); + + mtstate->mt_transition_capture->tcs_original_insert_tuple = + !has_before_insert_row_trig ? slot : NULL; + } + + /* + * Convert the tuple, if necessary. + */ + map = partrel->ri_RootToPartitionMap; + if (map != NULL) + { + TupleTableSlot *new_slot = partrel->ri_PartitionTupleSlot; + + slot = execute_attr_map_slot(map->attrMap, slot, new_slot); + } + + *partRelInfo = partrel; + return slot; +} + +/* ---------------------------------------------------------------- + * ExecModifyTable + * + * Perform table modifications as required, and return RETURNING results + * if needed. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecModifyTable(PlanState *pstate) +{ + ModifyTableState *node = castNode(ModifyTableState, pstate); + EState *estate = node->ps.state; + CmdType operation = node->operation; + ResultRelInfo *resultRelInfo; + PlanState *subplanstate; + TupleTableSlot *slot; + TupleTableSlot *planSlot; + TupleTableSlot *oldSlot; + ItemPointer tupleid; + ItemPointerData tuple_ctid; + HeapTupleData oldtupdata; + HeapTuple oldtuple; + PartitionTupleRouting *proute = node->mt_partition_tuple_routing; + List *relinfos = NIL; + ListCell *lc; + + CHECK_FOR_INTERRUPTS(); + + /* + * This should NOT get called during EvalPlanQual; we should have passed a + * subplan tree to EvalPlanQual, instead. Use a runtime test not just + * Assert because this condition is easy to miss in testing. (Note: + * although ModifyTable should not get executed within an EvalPlanQual + * operation, we do have to allow it to be initialized and shut down in + * case it is within a CTE subplan. Hence this test must be here, not in + * ExecInitModifyTable.) + */ + if (estate->es_epq_active != NULL) + elog(ERROR, "ModifyTable should not be called during EvalPlanQual"); + + /* + * If we've already completed processing, don't try to do more. We need + * this test because ExecPostprocessPlan might call us an extra time, and + * our subplan's nodes aren't necessarily robust against being called + * extra times. + */ + if (node->mt_done) + return NULL; + + /* + * On first call, fire BEFORE STATEMENT triggers before proceeding. + */ + if (node->fireBSTriggers) + { + fireBSTriggers(node); + node->fireBSTriggers = false; + } + + /* Preload local variables */ + resultRelInfo = node->resultRelInfo + node->mt_lastResultIndex; + subplanstate = outerPlanState(node); + + /* + * Fetch rows from subplan, and execute the required table modification + * for each row. + */ + for (;;) + { + /* + * Reset the per-output-tuple exprcontext. This is needed because + * triggers expect to use that context as workspace. It's a bit ugly + * to do this below the top level of the plan, however. We might need + * to rethink this later. + */ + ResetPerTupleExprContext(estate); + + /* + * Reset per-tuple memory context used for processing on conflict and + * returning clauses, to free any expression evaluation storage + * allocated in the previous cycle. + */ + if (pstate->ps_ExprContext) + ResetExprContext(pstate->ps_ExprContext); + + planSlot = ExecProcNode(subplanstate); + + /* No more tuples to process? 
*/ + if (TupIsNull(planSlot)) + break; + + /* + * When there are multiple result relations, each tuple contains a + * junk column that gives the OID of the rel from which it came. + * Extract it and select the correct result relation. + */ + if (AttributeNumberIsValid(node->mt_resultOidAttno)) + { + Datum datum; + bool isNull; + Oid resultoid; + + datum = ExecGetJunkAttribute(planSlot, node->mt_resultOidAttno, + &isNull); + if (isNull) + elog(ERROR, "tableoid is NULL"); + resultoid = DatumGetObjectId(datum); + + /* If it's not the same as last time, we need to locate the rel */ + if (resultoid != node->mt_lastResultOid) + resultRelInfo = ExecLookupResultRelByOid(node, resultoid, + false, true); + } + + /* + * If resultRelInfo->ri_usesFdwDirectModify is true, all we need to do + * here is compute the RETURNING expressions. + */ + if (resultRelInfo->ri_usesFdwDirectModify) + { + Assert(resultRelInfo->ri_projectReturning); + + /* + * A scan slot containing the data that was actually inserted, + * updated or deleted has already been made available to + * ExecProcessReturning by IterateDirectModify, so no need to + * provide it here. + */ + slot = ExecProcessReturning(resultRelInfo, NULL, planSlot); + + return slot; + } + + EvalPlanQualSetSlot(&node->mt_epqstate, planSlot); + slot = planSlot; + + tupleid = NULL; + oldtuple = NULL; + + /* + * For UPDATE/DELETE, fetch the row identity info for the tuple to be + * updated/deleted. For a heap relation, that's a TID; otherwise we + * may have a wholerow junk attr that carries the old tuple in toto. + * Keep this in step with the part of ExecInitModifyTable that sets up + * ri_RowIdAttNo. + */ + if (operation == CMD_UPDATE || operation == CMD_DELETE) + { + char relkind; + Datum datum; + bool isNull; + + relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind; + if (relkind == RELKIND_RELATION || + relkind == RELKIND_MATVIEW || + relkind == RELKIND_PARTITIONED_TABLE) + { + /* ri_RowIdAttNo refers to a ctid attribute */ + Assert(AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)); + datum = ExecGetJunkAttribute(slot, + resultRelInfo->ri_RowIdAttNo, + &isNull); + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "ctid is NULL"); + + tupleid = (ItemPointer) DatumGetPointer(datum); + tuple_ctid = *tupleid; /* be sure we don't free ctid!! */ + tupleid = &tuple_ctid; + } + + /* + * Use the wholerow attribute, when available, to reconstruct the + * old relation tuple. The old tuple serves one or both of two + * purposes: 1) it serves as the OLD tuple for row triggers, 2) it + * provides values for any unchanged columns for the NEW tuple of + * an UPDATE, because the subplan does not produce all the columns + * of the target table. + * + * Note that the wholerow attribute does not carry system columns, + * so foreign table triggers miss seeing those, except that we + * know enough here to set t_tableOid. Quite separately from + * this, the FDW may fetch its own junk attrs to identify the row. + * + * Other relevant relkinds, currently limited to views, always + * have a wholerow attribute. + */ + else if (AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + { + datum = ExecGetJunkAttribute(slot, + resultRelInfo->ri_RowIdAttNo, + &isNull); + /* shouldn't ever get a null result... 
*/ + if (isNull) + elog(ERROR, "wholerow is NULL"); + + oldtupdata.t_data = DatumGetHeapTupleHeader(datum); + oldtupdata.t_len = + HeapTupleHeaderGetDatumLength(oldtupdata.t_data); + ItemPointerSetInvalid(&(oldtupdata.t_self)); + /* Historically, view triggers see invalid t_tableOid. */ + oldtupdata.t_tableOid = + (relkind == RELKIND_VIEW) ? InvalidOid : + RelationGetRelid(resultRelInfo->ri_RelationDesc); + + oldtuple = &oldtupdata; + } + else + { + /* Only foreign tables are allowed to omit a row-ID attr */ + Assert(relkind == RELKIND_FOREIGN_TABLE); + } + } + + switch (operation) + { + case CMD_INSERT: + /* Initialize projection info if first time for this table */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitInsertProjection(node, resultRelInfo); + slot = ExecGetInsertNewTuple(resultRelInfo, planSlot); + slot = ExecInsert(node, resultRelInfo, slot, planSlot, + estate, node->canSetTag); + break; + case CMD_UPDATE: + /* Initialize projection info if first time for this table */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(node, resultRelInfo); + + /* + * Make the new tuple by combining plan's output tuple with + * the old tuple being updated. + */ + oldSlot = resultRelInfo->ri_oldTupleSlot; + if (oldtuple != NULL) + { + /* Use the wholerow junk attr as the old tuple. */ + ExecForceStoreHeapTuple(oldtuple, oldSlot, false); + } + else + { + /* Fetch the most recent version of old tuple. */ + Relation relation = resultRelInfo->ri_RelationDesc; + + Assert(tupleid != NULL); + if (!table_tuple_fetch_row_version(relation, tupleid, + SnapshotAny, + oldSlot)) + elog(ERROR, "failed to fetch tuple being updated"); + } + slot = ExecGetUpdateNewTuple(resultRelInfo, planSlot, + oldSlot); + + /* Now apply the update. */ + slot = ExecUpdate(node, resultRelInfo, tupleid, oldtuple, slot, + planSlot, &node->mt_epqstate, estate, + node->canSetTag); + break; + case CMD_DELETE: + slot = ExecDelete(node, resultRelInfo, tupleid, oldtuple, + planSlot, &node->mt_epqstate, estate, + true, /* processReturning */ + node->canSetTag, + false, /* changingPart */ + NULL, NULL); + break; + default: + elog(ERROR, "unknown operation"); + break; + } + + /* + * If we got a RETURNING result, return it to caller. We'll continue + * the work on next call. + */ + if (slot) + return slot; + } + + /* + * Insert remaining tuples for batch insert. + */ + if (proute) + relinfos = estate->es_tuple_routing_result_relations; + else + relinfos = estate->es_opened_result_relations; + + foreach(lc, relinfos) + { + resultRelInfo = lfirst(lc); + if (resultRelInfo->ri_NumSlots > 0) + ExecBatchInsert(node, resultRelInfo, + resultRelInfo->ri_Slots, + resultRelInfo->ri_PlanSlots, + resultRelInfo->ri_NumSlots, + estate, node->canSetTag); + } + + /* + * We're done, but fire AFTER STATEMENT triggers before exiting. + */ + fireASTriggers(node); + + node->mt_done = true; + + return NULL; +} + +/* + * ExecLookupResultRelByOid + * If the table with given OID is among the result relations to be + * updated by the given ModifyTable node, return its ResultRelInfo. + * + * If not found, return NULL if missing_ok, else raise error. + * + * If update_cache is true, then upon successful lookup, update the node's + * one-element cache. ONLY ExecModifyTable may pass true for this. 
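+ *
+ * Typical call, as made per tuple from ExecModifyTable above:
+ *		resultRelInfo = ExecLookupResultRelByOid(node, resultoid,
+ *												 false, true);
+ * All other callers must pass update_cache = false.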
+ */ +ResultRelInfo * +ExecLookupResultRelByOid(ModifyTableState *node, Oid resultoid, + bool missing_ok, bool update_cache) +{ + if (node->mt_resultOidHash) + { + /* Use the pre-built hash table to locate the rel */ + MTTargetRelLookup *mtlookup; + + mtlookup = (MTTargetRelLookup *) + hash_search(node->mt_resultOidHash, &resultoid, HASH_FIND, NULL); + if (mtlookup) + { + if (update_cache) + { + node->mt_lastResultOid = resultoid; + node->mt_lastResultIndex = mtlookup->relationIndex; + } + return node->resultRelInfo + mtlookup->relationIndex; + } + } + else + { + /* With few target rels, just search the ResultRelInfo array */ + for (int ndx = 0; ndx < node->mt_nrels; ndx++) + { + ResultRelInfo *rInfo = node->resultRelInfo + ndx; + + if (RelationGetRelid(rInfo->ri_RelationDesc) == resultoid) + { + if (update_cache) + { + node->mt_lastResultOid = resultoid; + node->mt_lastResultIndex = ndx; + } + return rInfo; + } + } + } + + if (!missing_ok) + elog(ERROR, "incorrect result relation OID %u", resultoid); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitModifyTable + * ---------------------------------------------------------------- + */ +ModifyTableState * +ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) +{ + ModifyTableState *mtstate; + Plan *subplan = outerPlan(node); + CmdType operation = node->operation; + int nrels = list_length(node->resultRelations); + ResultRelInfo *resultRelInfo; + List *arowmarks; + ListCell *l; + int i; + Relation rel; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + mtstate = makeNode(ModifyTableState); + mtstate->ps.plan = (Plan *) node; + mtstate->ps.state = estate; + mtstate->ps.ExecProcNode = ExecModifyTable; + + mtstate->operation = operation; + mtstate->canSetTag = node->canSetTag; + mtstate->mt_done = false; + + mtstate->mt_nrels = nrels; + mtstate->resultRelInfo = (ResultRelInfo *) + palloc(nrels * sizeof(ResultRelInfo)); + + /*---------- + * Resolve the target relation. This is the same as: + * + * - the relation for which we will fire FOR STATEMENT triggers, + * - the relation into whose tuple format all captured transition tuples + * must be converted, and + * - the root partitioned table used for tuple routing. + * + * If it's a partitioned table, the root partition doesn't appear + * elsewhere in the plan and its RT index is given explicitly in + * node->rootRelation. Otherwise (i.e. table inheritance) the target + * relation is the first relation in the node->resultRelations list. + *---------- + */ + if (node->rootRelation > 0) + { + mtstate->rootResultRelInfo = makeNode(ResultRelInfo); + ExecInitResultRelation(estate, mtstate->rootResultRelInfo, + node->rootRelation); + } + else + { + mtstate->rootResultRelInfo = mtstate->resultRelInfo; + ExecInitResultRelation(estate, mtstate->resultRelInfo, + linitial_int(node->resultRelations)); + } + + /* set up epqstate with dummy subplan data for the moment */ + EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam); + mtstate->fireBSTriggers = true; + + /* + * Build state for collecting transition tuples. This requires having a + * valid trigger query context, so skip it in explain-only mode. + */ + if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + ExecSetupTransitionCaptureState(mtstate, estate); + + /* + * Open all the result relations and initialize the ResultRelInfo structs. 
+ * (But root relation was initialized above, if it's part of the array.) + * We must do this before initializing the subplan, because direct-modify + * FDWs expect their ResultRelInfos to be available. + */ + resultRelInfo = mtstate->resultRelInfo; + i = 0; + foreach(l, node->resultRelations) + { + Index resultRelation = lfirst_int(l); + + if (resultRelInfo != mtstate->rootResultRelInfo) + { + ExecInitResultRelation(estate, resultRelInfo, resultRelation); + + /* + * For child result relations, store the root result relation + * pointer. We do so for the convenience of places that want to + * look at the query's original target relation but don't have the + * mtstate handy. + */ + resultRelInfo->ri_RootResultRelInfo = mtstate->rootResultRelInfo; + } + + /* Initialize the usesFdwDirectModify flag */ + resultRelInfo->ri_usesFdwDirectModify = bms_is_member(i, + node->fdwDirectModifyPlans); + + /* + * Verify result relation is a valid target for the current operation + */ + CheckValidResultRel(resultRelInfo, operation); + + resultRelInfo++; + i++; + } + + /* + * Now we may initialize the subplan. + */ + outerPlanState(mtstate) = ExecInitNode(subplan, estate, eflags); + + /* + * Do additional per-result-relation initialization. + */ + for (i = 0; i < nrels; i++) + { + resultRelInfo = &mtstate->resultRelInfo[i]; + + /* Let FDWs init themselves for foreign-table result rels */ + if (!resultRelInfo->ri_usesFdwDirectModify && + resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->BeginForeignModify != NULL) + { + List *fdw_private = (List *) list_nth(node->fdwPrivLists, i); + + resultRelInfo->ri_FdwRoutine->BeginForeignModify(mtstate, + resultRelInfo, + fdw_private, + i, + eflags); + } + + /* + * For UPDATE/DELETE, find the appropriate junk attr now, either a + * 'ctid' or 'wholerow' attribute depending on relkind. For foreign + * tables, the FDW might have created additional junk attr(s), but + * those are no concern of ours. + */ + if (operation == CMD_UPDATE || operation == CMD_DELETE) + { + char relkind; + + relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind; + if (relkind == RELKIND_RELATION || + relkind == RELKIND_MATVIEW || + relkind == RELKIND_PARTITIONED_TABLE) + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk ctid column"); + } + else if (relkind == RELKIND_FOREIGN_TABLE) + { + /* + * When there is a row-level trigger, there should be a + * wholerow attribute. We also require it to be present in + * UPDATE, so we can get the values of unchanged columns. + */ + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, + "wholerow"); + if (mtstate->operation == CMD_UPDATE && + !AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk wholerow column"); + } + else + { + /* Other valid target relkinds must provide wholerow */ + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, + "wholerow"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk wholerow column"); + } + } + } + + /* + * If this is an inherited update/delete, there will be a junk attribute + * named "tableoid" present in the subplan's targetlist. It will be used + * to identify the result relation for a given tuple to be + * updated/deleted. 
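+ *
+ * For example (schematically), in
+ *		UPDATE parent SET val = val + 1;
+ * where parent has children c1 and c2 (by inheritance or partitioning),
+ * each subplan output row carries a resjunk "tableoid" column holding the
+ * OID of c1 or c2, which ExecModifyTable resolves per tuple via
+ * ExecLookupResultRelByOid().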
+ */ + mtstate->mt_resultOidAttno = + ExecFindJunkAttributeInTlist(subplan->targetlist, "tableoid"); + Assert(AttributeNumberIsValid(mtstate->mt_resultOidAttno) || nrels == 1); + mtstate->mt_lastResultOid = InvalidOid; /* force lookup at first tuple */ + mtstate->mt_lastResultIndex = 0; /* must be zero if no such attr */ + + /* Get the root target relation */ + rel = mtstate->rootResultRelInfo->ri_RelationDesc; + + /* + * Build state for tuple routing if it's a partitioned INSERT. An UPDATE + * might need this too, but only if it actually moves tuples between + * partitions; in that case setup is done by ExecCrossPartitionUpdate. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + operation == CMD_INSERT) + mtstate->mt_partition_tuple_routing = + ExecSetupPartitionTupleRouting(estate, rel); + + /* + * Initialize any WITH CHECK OPTION constraints if needed. + */ + resultRelInfo = mtstate->resultRelInfo; + foreach(l, node->withCheckOptionLists) + { + List *wcoList = (List *) lfirst(l); + List *wcoExprs = NIL; + ListCell *ll; + + foreach(ll, wcoList) + { + WithCheckOption *wco = (WithCheckOption *) lfirst(ll); + ExprState *wcoExpr = ExecInitQual((List *) wco->qual, + &mtstate->ps); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + resultRelInfo->ri_WithCheckOptions = wcoList; + resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; + resultRelInfo++; + } + + /* + * Initialize RETURNING projections if needed. + */ + if (node->returningLists) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * Initialize result tuple slot and assign its rowtype using the first + * RETURNING list. We assume the rest will look the same. + */ + mtstate->ps.plan->targetlist = (List *) linitial(node->returningLists); + + /* Set up a slot for the output of the RETURNING projection(s) */ + ExecInitResultTupleSlotTL(&mtstate->ps, &TTSOpsVirtual); + slot = mtstate->ps.ps_ResultTupleSlot; + + /* Need an econtext too */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + econtext = mtstate->ps.ps_ExprContext; + + /* + * Build a projection for each result rel. + */ + resultRelInfo = mtstate->resultRelInfo; + foreach(l, node->returningLists) + { + List *rlist = (List *) lfirst(l); + + resultRelInfo->ri_returningList = rlist; + resultRelInfo->ri_projectReturning = + ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, + resultRelInfo->ri_RelationDesc->rd_att); + resultRelInfo++; + } + } + else + { + /* + * We still must construct a dummy result tuple type, because InitPlan + * expects one (maybe should change that?). + */ + mtstate->ps.plan->targetlist = NIL; + ExecInitResultTypeTL(&mtstate->ps); + + mtstate->ps.ps_ExprContext = NULL; + } + + /* Set the list of arbiter indexes if needed for ON CONFLICT */ + resultRelInfo = mtstate->resultRelInfo; + if (node->onConflictAction != ONCONFLICT_NONE) + { + /* insert may only have one relation, inheritance is not expanded */ + Assert(nrels == 1); + resultRelInfo->ri_onConflictArbiterIndexes = node->arbiterIndexes; + } + + /* + * If needed, Initialize target list, projection and qual for ON CONFLICT + * DO UPDATE. 
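+ *
+ * For example (names illustrative), a statement such as
+ *		INSERT INTO t (k, v) VALUES (1, 'x')
+ *			ON CONFLICT (k) DO UPDATE SET v = EXCLUDED.v
+ *			WHERE t.v IS DISTINCT FROM EXCLUDED.v;
+ * takes this path: onConflictSet supplies the SET projection built below,
+ * and onConflictWhere the qual later evaluated by ExecOnConflictUpdate().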
+ */ + if (node->onConflictAction == ONCONFLICT_UPDATE) + { + OnConflictSetState *onconfl = makeNode(OnConflictSetState); + ExprContext *econtext; + TupleDesc relationDesc; + + /* already exists if created by RETURNING processing above */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + + econtext = mtstate->ps.ps_ExprContext; + relationDesc = resultRelInfo->ri_RelationDesc->rd_att; + + /* create state for DO UPDATE SET operation */ + resultRelInfo->ri_onConflict = onconfl; + + /* initialize slot for the existing tuple */ + onconfl->oc_Existing = + table_slot_create(resultRelInfo->ri_RelationDesc, + &mtstate->ps.state->es_tupleTable); + + /* + * Create the tuple slot for the UPDATE SET projection. We want a slot + * of the table's type here, because the slot will be used to insert + * into the table, and for RETURNING processing - which may access + * system attributes. + */ + onconfl->oc_ProjSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &mtstate->ps.state->es_tupleTable); + + /* build UPDATE SET projection state */ + onconfl->oc_ProjInfo = + ExecBuildUpdateProjection(node->onConflictSet, + true, + node->onConflictCols, + relationDesc, + econtext, + onconfl->oc_ProjSlot, + &mtstate->ps); + + /* initialize state to evaluate the WHERE clause, if any */ + if (node->onConflictWhere) + { + ExprState *qualexpr; + + qualexpr = ExecInitQual((List *) node->onConflictWhere, + &mtstate->ps); + onconfl->oc_WhereClause = qualexpr; + } + } + + /* + * If we have any secondary relations in an UPDATE or DELETE, they need to + * be treated like non-locked relations in SELECT FOR UPDATE, ie, the + * EvalPlanQual mechanism needs to be told about them. Locate the + * relevant ExecRowMarks. + */ + arowmarks = NIL; + foreach(l, node->rowMarks) + { + PlanRowMark *rc = lfirst_node(PlanRowMark, l); + ExecRowMark *erm; + ExecAuxRowMark *aerm; + + /* ignore "parent" rowmarks; they are irrelevant at runtime */ + if (rc->isParent) + continue; + + /* Find ExecRowMark and build ExecAuxRowMark */ + erm = ExecFindRowMark(estate, rc->rti, false); + aerm = ExecBuildAuxRowMark(erm, subplan->targetlist); + arowmarks = lappend(arowmarks, aerm); + } + + EvalPlanQualSetPlan(&mtstate->mt_epqstate, subplan, arowmarks); + + /* + * If there are a lot of result relations, use a hash table to speed the + * lookups. If there are not a lot, a simple linear search is faster. + * + * It's not clear where the threshold is, but try 64 for starters. In a + * debugging build, use a small threshold so that we get some test + * coverage of both code paths. 
+ */ +#ifdef USE_ASSERT_CHECKING +#define MT_NRELS_HASH 4 +#else +#define MT_NRELS_HASH 64 +#endif + if (nrels >= MT_NRELS_HASH) + { + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(MTTargetRelLookup); + hash_ctl.hcxt = CurrentMemoryContext; + mtstate->mt_resultOidHash = + hash_create("ModifyTable target hash", + nrels, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + for (i = 0; i < nrels; i++) + { + Oid hashkey; + MTTargetRelLookup *mtlookup; + bool found; + + resultRelInfo = &mtstate->resultRelInfo[i]; + hashkey = RelationGetRelid(resultRelInfo->ri_RelationDesc); + mtlookup = (MTTargetRelLookup *) + hash_search(mtstate->mt_resultOidHash, &hashkey, + HASH_ENTER, &found); + Assert(!found); + mtlookup->relationIndex = i; + } + } + else + mtstate->mt_resultOidHash = NULL; + + /* + * Determine if the FDW supports batch insert and determine the batch size + * (a FDW may support batching, but it may be disabled for the + * server/table). + * + * We only do this for INSERT, so that for UPDATE/DELETE the batch size + * remains set to 0. + */ + if (operation == CMD_INSERT) + { + /* insert may only have one relation, inheritance is not expanded */ + Assert(nrels == 1); + resultRelInfo = mtstate->resultRelInfo; + if (!resultRelInfo->ri_usesFdwDirectModify && + resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize && + resultRelInfo->ri_FdwRoutine->ExecForeignBatchInsert) + { + resultRelInfo->ri_BatchSize = + resultRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(resultRelInfo); + Assert(resultRelInfo->ri_BatchSize >= 1); + } + else + resultRelInfo->ri_BatchSize = 1; + } + + /* + * Lastly, if this is not the primary (canSetTag) ModifyTable node, add it + * to estate->es_auxmodifytables so that it will be run to completion by + * ExecPostprocessPlan. (It'd actually work fine to add the primary + * ModifyTable node too, but there's no need.) Note the use of lcons not + * lappend: we need later-initialized ModifyTable nodes to be shut down + * before earlier ones. This ensures that we don't throw away RETURNING + * rows that need to be seen by a later CTE subplan. + */ + if (!mtstate->canSetTag) + estate->es_auxmodifytables = lcons(mtstate, + estate->es_auxmodifytables); + + return mtstate; +} + +/* ---------------------------------------------------------------- + * ExecEndModifyTable + * + * Shuts down the plan. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndModifyTable(ModifyTableState *node) +{ + int i; + + /* + * Allow any FDWs to shut down + */ + for (i = 0; i < node->mt_nrels; i++) + { + int j; + ResultRelInfo *resultRelInfo = node->resultRelInfo + i; + + if (!resultRelInfo->ri_usesFdwDirectModify && + resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignModify(node->ps.state, + resultRelInfo); + + /* + * Cleanup the initialized batch slots. This only matters for FDWs + * with batching, but the other cases will have ri_NumSlotsInitialized + * == 0. + */ + for (j = 0; j < resultRelInfo->ri_NumSlotsInitialized; j++) + { + ExecDropSingleTupleTableSlot(resultRelInfo->ri_Slots[j]); + ExecDropSingleTupleTableSlot(resultRelInfo->ri_PlanSlots[j]); + } + } + + /* + * Close all the partitioned tables, leaf partitions, and their indices + * and release the slot used for tuple routing, if set. 
+ */ + if (node->mt_partition_tuple_routing) + { + ExecCleanupTupleRouting(node, node->mt_partition_tuple_routing); + + if (node->mt_root_tuple_slot) + ExecDropSingleTupleTableSlot(node->mt_root_tuple_slot); + } + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + if (node->ps.ps_ResultTupleSlot) + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * Terminate EPQ execution if active + */ + EvalPlanQualEnd(&node->mt_epqstate); + + /* + * shut down subplan + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanModifyTable(ModifyTableState *node) +{ + /* + * Currently, we don't need to support rescan on ModifyTable nodes. The + * semantics of that would be a bit debatable anyway. + */ + elog(ERROR, "ExecReScanModifyTable is not implemented"); +} diff --git a/src/backend/executor/nodeNamedtuplestorescan.c b/src/backend/executor/nodeNamedtuplestorescan.c new file mode 100644 index 0000000..c0d1069 --- /dev/null +++ b/src/backend/executor/nodeNamedtuplestorescan.c @@ -0,0 +1,201 @@ +/*------------------------------------------------------------------------- + * + * nodeNamedtuplestorescan.c + * routines to handle NamedTuplestoreScan nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeNamedtuplestorescan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeNamedtuplestorescan.h" +#include "miscadmin.h" +#include "utils/queryenvironment.h" + +static TupleTableSlot *NamedTuplestoreScanNext(NamedTuplestoreScanState *node); + +/* ---------------------------------------------------------------- + * NamedTuplestoreScanNext + * + * This is a workhorse for ExecNamedTuplestoreScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +NamedTuplestoreScanNext(NamedTuplestoreScanState *node) +{ + TupleTableSlot *slot; + + /* We intentionally do not support backward scan. */ + Assert(ScanDirectionIsForward(node->ss.ps.state->es_direction)); + + /* + * Get the next tuple from tuplestore. Return NULL if no more tuples. + */ + slot = node->ss.ss_ScanTupleSlot; + tuplestore_select_read_pointer(node->relation, node->readptr); + (void) tuplestore_gettupleslot(node->relation, true, false, slot); + return slot; +} + +/* + * NamedTuplestoreScanRecheck -- access method routine to recheck a tuple in + * EvalPlanQual + */ +static bool +NamedTuplestoreScanRecheck(NamedTuplestoreScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecNamedTuplestoreScan(node) + * + * Scans the CTE sequentially and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
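+ *		(Specifically, NamedTuplestoreScanNext as the access method and
+ *		NamedTuplestoreScanRecheck as the recheck method, as seen below.)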
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecNamedTuplestoreScan(PlanState *pstate) +{ + NamedTuplestoreScanState *node = castNode(NamedTuplestoreScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) NamedTuplestoreScanNext, + (ExecScanRecheckMtd) NamedTuplestoreScanRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitNamedTuplestoreScan + * ---------------------------------------------------------------- + */ +NamedTuplestoreScanState * +ExecInitNamedTuplestoreScan(NamedTuplestoreScan *node, EState *estate, int eflags) +{ + NamedTuplestoreScanState *scanstate; + EphemeralNamedRelation enr; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * NamedTuplestoreScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new NamedTuplestoreScanState for node + */ + scanstate = makeNode(NamedTuplestoreScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecNamedTuplestoreScan; + + enr = get_ENR(estate->es_queryEnv, node->enrname); + if (!enr) + elog(ERROR, "executor could not find named tuplestore \"%s\"", + node->enrname); + + Assert(enr->reldata); + scanstate->relation = (Tuplestorestate *) enr->reldata; + scanstate->tupdesc = ENRMetadataGetTupDesc(&(enr->md)); + scanstate->readptr = + tuplestore_alloc_read_pointer(scanstate->relation, EXEC_FLAG_REWIND); + + /* + * The new read pointer copies its position from read pointer 0, which + * could be anywhere, so explicitly rewind it. + */ + tuplestore_select_read_pointer(scanstate->relation, scanstate->readptr); + tuplestore_rescan(scanstate->relation); + + /* + * XXX: Should we add a function to free that read pointer when done? + * + * This was attempted, but it did not improve performance or memory usage + * in any tested cases. + */ + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * The scan tuple type is specified for the tuplestore. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, scanstate->tupdesc, + &TTSOpsMinimalTuple); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndNamedTuplestoreScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndNamedTuplestoreScan(NamedTuplestoreScanState *node) +{ + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanNamedTuplestoreScan + * + * Rescans the relation. 
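+ *		(This only rewinds this node's private read pointer; the
+ *		underlying tuplestore's contents are left untouched.)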
+ * ---------------------------------------------------------------- + */ +void +ExecReScanNamedTuplestoreScan(NamedTuplestoreScanState *node) +{ + Tuplestorestate *tuplestorestate = node->relation; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + /* + * Rewind my own pointer. + */ + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + tuplestore_rescan(tuplestorestate); +} diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c new file mode 100644 index 0000000..41e5eca --- /dev/null +++ b/src/backend/executor/nodeNestloop.c @@ -0,0 +1,411 @@ +/*------------------------------------------------------------------------- + * + * nodeNestloop.c + * routines to support nest-loop joins + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeNestloop.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecNestLoop - process a nestloop join of two plans + * ExecInitNestLoop - initialize the join + * ExecEndNestLoop - shut down the join + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeNestloop.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecNestLoop(node) + * + * old comments + * Returns the tuple joined from inner and outer tuples which + * satisfies the qualification clause. + * + * It scans the inner relation to join with current outer tuple. + * + * If none is found, next tuple from the outer relation is retrieved + * and the inner relation is scanned from the beginning again to join + * with the outer tuple. + * + * NULL is returned if all the remaining outer tuples are tried and + * all fail to join with the inner tuples. + * + * NULL is also returned if there is no tuple from inner relation. + * + * Conditions: + * -- outerTuple contains current tuple from outer relation and + * the right son(inner relation) maintains "cursor" at the tuple + * returned previously. + * This is achieved by maintaining a scan position on the outer + * relation. + * + * Initial States: + * -- the outer child and the inner child + * are prepared to return the first tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecNestLoop(PlanState *pstate) +{ + NestLoopState *node = castNode(NestLoopState, pstate); + NestLoop *nl; + PlanState *innerPlan; + PlanState *outerPlan; + TupleTableSlot *outerTupleSlot; + TupleTableSlot *innerTupleSlot; + ExprState *joinqual; + ExprState *otherqual; + ExprContext *econtext; + ListCell *lc; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + ENL1_printf("getting info from node"); + + nl = (NestLoop *) node->js.ps.plan; + joinqual = node->js.joinqual; + otherqual = node->js.ps.qual; + outerPlan = outerPlanState(node); + innerPlan = innerPlanState(node); + econtext = node->js.ps.ps_ExprContext; + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * Ok, everything is setup for the join so now loop until we return a + * qualifying join tuple. 
+ */ + ENL1_printf("entering main loop"); + + for (;;) + { + /* + * If we don't have an outer tuple, get the next one and reset the + * inner scan. + */ + if (node->nl_NeedNewOuter) + { + ENL1_printf("getting new outer tuple"); + outerTupleSlot = ExecProcNode(outerPlan); + + /* + * if there are no more outer tuples, then the join is complete.. + */ + if (TupIsNull(outerTupleSlot)) + { + ENL1_printf("no outer tuple, ending join"); + return NULL; + } + + ENL1_printf("saving new outer tuple information"); + econtext->ecxt_outertuple = outerTupleSlot; + node->nl_NeedNewOuter = false; + node->nl_MatchedOuter = false; + + /* + * fetch the values of any outer Vars that must be passed to the + * inner scan, and store them in the appropriate PARAM_EXEC slots. + */ + foreach(lc, nl->nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); + int paramno = nlp->paramno; + ParamExecData *prm; + + prm = &(econtext->ecxt_param_exec_vals[paramno]); + /* Param value should be an OUTER_VAR var */ + Assert(IsA(nlp->paramval, Var)); + Assert(nlp->paramval->varno == OUTER_VAR); + Assert(nlp->paramval->varattno > 0); + prm->value = slot_getattr(outerTupleSlot, + nlp->paramval->varattno, + &(prm->isnull)); + /* Flag parameter value as changed */ + innerPlan->chgParam = bms_add_member(innerPlan->chgParam, + paramno); + } + + /* + * now rescan the inner plan + */ + ENL1_printf("rescanning inner plan"); + ExecReScan(innerPlan); + } + + /* + * we have an outerTuple, try to get the next inner tuple. + */ + ENL1_printf("getting new inner tuple"); + + innerTupleSlot = ExecProcNode(innerPlan); + econtext->ecxt_innertuple = innerTupleSlot; + + if (TupIsNull(innerTupleSlot)) + { + ENL1_printf("no inner tuple, need new outer tuple"); + + node->nl_NeedNewOuter = true; + + if (!node->nl_MatchedOuter && + (node->js.jointype == JOIN_LEFT || + node->js.jointype == JOIN_ANTI)) + { + /* + * We are doing an outer join and there were no join matches + * for this outer tuple. Generate a fake join tuple with + * nulls for the inner tuple, and return it if it passes the + * non-join quals. + */ + econtext->ecxt_innertuple = node->nl_NullInnerTupleSlot; + + ENL1_printf("testing qualification for outer-join tuple"); + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return + * the slot containing the result tuple using + * ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + + /* + * Otherwise just return to top of loop for a new outer tuple. + */ + continue; + } + + /* + * at this point we have a new pair of inner and outer tuples so we + * test the inner and outer tuples to see if they satisfy the node's + * qualification. + * + * Only the joinquals determine MatchedOuter status, but all quals + * must pass to actually return the tuple. + */ + ENL1_printf("testing qualification"); + + if (ExecQual(joinqual, econtext)) + { + node->nl_MatchedOuter = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->nl_NeedNewOuter = true; + continue; /* return to top of loop */ + } + + /* + * If we only need to join to the first matching inner tuple, then + * consider returning this one, but after that continue with next + * outer tuple. 
+ */ + if (node->js.single_match) + node->nl_NeedNewOuter = true; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return the + * slot containing the result tuple using ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + + /* + * Tuple fails qual, so free per-tuple memory and try again. + */ + ResetExprContext(econtext); + + ENL1_printf("qualification failed, looping"); + } +} + +/* ---------------------------------------------------------------- + * ExecInitNestLoop + * ---------------------------------------------------------------- + */ +NestLoopState * +ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) +{ + NestLoopState *nlstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + NL1_printf("ExecInitNestLoop: %s\n", + "initializing node"); + + /* + * create state structure + */ + nlstate = makeNode(NestLoopState); + nlstate->js.ps.plan = (Plan *) node; + nlstate->js.ps.state = estate; + nlstate->js.ps.ExecProcNode = ExecNestLoop; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &nlstate->js.ps); + + /* + * initialize child nodes + * + * If we have no parameters to pass into the inner rel from the outer, + * tell the inner child that cheap rescans would be good. If we do have + * such parameters, then there is no point in REWIND support at all in the + * inner child, because it will always be rescanned with fresh parameter + * values. + */ + outerPlanState(nlstate) = ExecInitNode(outerPlan(node), estate, eflags); + if (node->nestParams == NIL) + eflags |= EXEC_FLAG_REWIND; + else + eflags &= ~EXEC_FLAG_REWIND; + innerPlanState(nlstate) = ExecInitNode(innerPlan(node), estate, eflags); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&nlstate->js.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&nlstate->js.ps, NULL); + + /* + * initialize child expressions + */ + nlstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); + nlstate->js.jointype = node->join.jointype; + nlstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) nlstate); + + /* + * detect whether we need only consider the first matching inner tuple + */ + nlstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + break; + case JOIN_LEFT: + case JOIN_ANTI: + nlstate->nl_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(nlstate)), + &TTSOpsVirtual); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * finally, wipe the current outer tuple clean. 
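Stripped of slots, quals and parameter passing, the loop in ExecNestLoop() above is the textbook nested-loop join. The standalone sketch below works over integer arrays with a made-up JoinKindSketch enum; it only illustrates how the matched-outer flag drives LEFT, SEMI (single_match) and ANTI behaviour, and is not the executor's API:

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum
    {
        SKETCH_JOIN_INNER, SKETCH_JOIN_LEFT, SKETCH_JOIN_SEMI, SKETCH_JOIN_ANTI
    } JoinKindSketch;

    static void
    nestloop_join(const int *outer, int nouter,
                  const int *inner, int ninner,
                  JoinKindSketch kind)
    {
        for (int i = 0; i < nouter; i++)
        {
            bool        matched = false;

            /* "rescan" the inner side from the start for every outer row */
            for (int j = 0; j < ninner; j++)
            {
                if (outer[i] != inner[j])       /* the join qual */
                    continue;
                matched = true;
                if (kind == SKETCH_JOIN_ANTI)
                    break;                      /* anti join never emits matches */
                printf("(%d, %d)\n", outer[i], inner[j]);
                if (kind == SKETCH_JOIN_SEMI)
                    break;                      /* single_match: first hit suffices */
            }
            if (!matched && kind == SKETCH_JOIN_LEFT)
                printf("(%d, NULL)\n", outer[i]);   /* null-extended inner side */
            if (!matched && kind == SKETCH_JOIN_ANTI)
                printf("(%d)\n", outer[i]);
        }
    }

    int
    main(void)
    {
        int         outer[] = {1, 2, 3};
        int         inner[] = {2, 2, 3};

        nestloop_join(outer, 3, inner, 3, SKETCH_JOIN_LEFT);
        return 0;
    }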
+ */ + nlstate->nl_NeedNewOuter = true; + nlstate->nl_MatchedOuter = false; + + NL1_printf("ExecInitNestLoop: %s\n", + "node initialized"); + + return nlstate; +} + +/* ---------------------------------------------------------------- + * ExecEndNestLoop + * + * closes down scans and frees allocated storage + * ---------------------------------------------------------------- + */ +void +ExecEndNestLoop(NestLoopState *node) +{ + NL1_printf("ExecEndNestLoop: %s\n", + "ending node processing"); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->js.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->js.ps.ps_ResultTupleSlot); + + /* + * close down subplans + */ + ExecEndNode(outerPlanState(node)); + ExecEndNode(innerPlanState(node)); + + NL1_printf("ExecEndNestLoop: %s\n", + "node processing ended"); +} + +/* ---------------------------------------------------------------- + * ExecReScanNestLoop + * ---------------------------------------------------------------- + */ +void +ExecReScanNestLoop(NestLoopState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * If outerPlan->chgParam is not null then plan will be automatically + * re-scanned by first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + /* + * innerPlan is re-scanned for each new outer tuple and MUST NOT be + * re-scanned from here or you'll get troubles from inner index scans when + * outer Vars are used as run-time keys... + */ + + node->nl_NeedNewOuter = true; + node->nl_MatchedOuter = false; +} diff --git a/src/backend/executor/nodeProjectSet.c b/src/backend/executor/nodeProjectSet.c new file mode 100644 index 0000000..07be814 --- /dev/null +++ b/src/backend/executor/nodeProjectSet.c @@ -0,0 +1,351 @@ +/*------------------------------------------------------------------------- + * + * nodeProjectSet.c + * support for evaluating targetlists containing set-returning functions + * + * DESCRIPTION + * + * ProjectSet nodes are inserted by the planner to evaluate set-returning + * functions in the targetlist. It's guaranteed that all set-returning + * functions are directly at the top level of the targetlist, i.e. they + * can't be inside more-complex expressions. If that'd otherwise be + * the case, the planner adds additional ProjectSet nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeProjectSet.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeProjectSet.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "utils/memutils.h" + + +static TupleTableSlot *ExecProjectSRF(ProjectSetState *node, bool continuing); + + +/* ---------------------------------------------------------------- + * ExecProjectSet(node) + * + * Return tuples after evaluating the targetlist (which contains set + * returning functions). 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecProjectSet(PlanState *pstate) +{ + ProjectSetState *node = castNode(ProjectSetState, pstate); + TupleTableSlot *outerTupleSlot; + TupleTableSlot *resultSlot; + PlanState *outerPlan; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + econtext = node->ps.ps_ExprContext; + + /* + * Reset per-tuple context to free expression-evaluation storage allocated + * for a potentially previously returned tuple. Note that the SRF argument + * context has a different lifetime and is reset below. + */ + ResetExprContext(econtext); + + /* + * Check to see if we're still projecting out tuples from a previous scan + * tuple (because there is a function-returning-set in the projection + * expressions). If so, try to project another one. + */ + if (node->pending_srf_tuples) + { + resultSlot = ExecProjectSRF(node, true); + + if (resultSlot != NULL) + return resultSlot; + } + + /* + * Reset argument context to free any expression evaluation storage + * allocated in the previous tuple cycle. Note this can't happen until + * we're done projecting out tuples from a scan tuple, as ValuePerCall + * functions are allowed to reference the arguments for each returned + * tuple. + */ + MemoryContextReset(node->argcontext); + + /* + * Get another input tuple and project SRFs from it. + */ + for (;;) + { + /* + * Retrieve tuples from the outer plan until there are no more. + */ + outerPlan = outerPlanState(node); + outerTupleSlot = ExecProcNode(outerPlan); + + if (TupIsNull(outerTupleSlot)) + return NULL; + + /* + * Prepare to compute projection expressions, which will expect to + * access the input tuples as varno OUTER. + */ + econtext->ecxt_outertuple = outerTupleSlot; + + /* Evaluate the expressions */ + resultSlot = ExecProjectSRF(node, false); + + /* + * Return the tuple unless the projection produced no rows (due to an + * empty set), in which case we must loop back to see if there are + * more outerPlan tuples. + */ + if (resultSlot) + return resultSlot; + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecProjectSRF + * + * Project a targetlist containing one or more set-returning functions. + * + * 'continuing' indicates whether to continue projecting rows for the + * same input tuple; or whether a new input tuple is being projected. + * + * Returns NULL if no output tuple has been produced. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecProjectSRF(ProjectSetState *node, bool continuing) +{ + TupleTableSlot *resultSlot = node->ps.ps_ResultTupleSlot; + ExprContext *econtext = node->ps.ps_ExprContext; + MemoryContext oldcontext; + bool hassrf PG_USED_FOR_ASSERTS_ONLY; + bool hasresult; + int argno; + + ExecClearTuple(resultSlot); + + /* Call SRFs, as well as plain expressions, in per-tuple context */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Assume no further tuples are produced unless an ExprMultipleResult is + * encountered from a set returning function. 
+ */ + node->pending_srf_tuples = false; + + hassrf = hasresult = false; + for (argno = 0; argno < node->nelems; argno++) + { + Node *elem = node->elems[argno]; + ExprDoneCond *isdone = &node->elemdone[argno]; + Datum *result = &resultSlot->tts_values[argno]; + bool *isnull = &resultSlot->tts_isnull[argno]; + + if (continuing && *isdone == ExprEndResult) + { + /* + * If we're continuing to project output rows from a source tuple, + * return NULLs once the SRF has been exhausted. + */ + *result = (Datum) 0; + *isnull = true; + hassrf = true; + } + else if (IsA(elem, SetExprState)) + { + /* + * Evaluate SRF - possibly continuing previously started output. + */ + *result = ExecMakeFunctionResultSet((SetExprState *) elem, + econtext, node->argcontext, + isnull, isdone); + + if (*isdone != ExprEndResult) + hasresult = true; + if (*isdone == ExprMultipleResult) + node->pending_srf_tuples = true; + hassrf = true; + } + else + { + /* Non-SRF tlist expression, just evaluate normally. */ + *result = ExecEvalExpr((ExprState *) elem, econtext, isnull); + *isdone = ExprSingleResult; + } + } + + MemoryContextSwitchTo(oldcontext); + + /* ProjectSet should not be used if there's no SRFs */ + Assert(hassrf); + + /* + * If all the SRFs returned ExprEndResult, we consider that as no row + * being produced. + */ + if (hasresult) + { + ExecStoreVirtualTuple(resultSlot); + return resultSlot; + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitProjectSet + * + * Creates the run-time state information for the ProjectSet node + * produced by the planner and initializes outer relations + * (child nodes). + * ---------------------------------------------------------------- + */ +ProjectSetState * +ExecInitProjectSet(ProjectSet *node, EState *estate, int eflags) +{ + ProjectSetState *state; + ListCell *lc; + int off; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD))); + + /* + * create state structure + */ + state = makeNode(ProjectSetState); + state->ps.plan = (Plan *) node; + state->ps.state = estate; + state->ps.ExecProcNode = ExecProjectSet; + + state->pending_srf_tuples = false; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &state->ps); + + /* + * initialize child nodes + */ + outerPlanState(state) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * we don't use inner plan + */ + Assert(innerPlan(node) == NULL); + + /* + * tuple table and result type initialization + */ + ExecInitResultTupleSlotTL(&state->ps, &TTSOpsVirtual); + + /* Create workspace for per-tlist-entry expr state & SRF-is-done state */ + state->nelems = list_length(node->plan.targetlist); + state->elems = (Node **) + palloc(sizeof(Node *) * state->nelems); + state->elemdone = (ExprDoneCond *) + palloc(sizeof(ExprDoneCond) * state->nelems); + + /* + * Build expressions to evaluate targetlist. We can't use + * ExecBuildProjectionInfo here, since that doesn't deal with SRFs. + * Instead compile each expression separately, using + * ExecInitFunctionResultSet where applicable. 
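The lockstep behaviour implemented by ExecProjectSRF() above means that, for example, SELECT generate_series(1, 2), generate_series(1, 3) produces three rows, with the exhausted SRF padded by NULLs. A simplified standalone sketch, with a hypothetical Counter type standing in for a SetExprState:

    #include <stdio.h>
    #include <stdbool.h>

    /* A stand-in for a SetExprState: yields 1..limit, then reports it is done. */
    typedef struct Counter
    {
        int         limit;
        int         next;
    } Counter;

    static bool
    counter_next(Counter *c, int *value)
    {
        if (c->next > c->limit)
            return false;               /* plays the role of ExprEndResult */
        *value = c->next++;
        return true;                    /* plays the role of ExprMultipleResult */
    }

    int
    main(void)
    {
        Counter     srfs[2] = {{2, 1}, {3, 1}}; /* like gs(1,2), gs(1,3) */
        bool        pending = true;

        while (pending)
        {
            int         values[2];
            bool        isnull[2];
            bool        hasresult = false;

            pending = false;
            for (int i = 0; i < 2; i++)
            {
                if (counter_next(&srfs[i], &values[i]))
                {
                    isnull[i] = false;
                    hasresult = true;
                    pending = true;     /* keep projecting from this input tuple */
                }
                else
                    isnull[i] = true;   /* exhausted SRF is padded with NULL */
            }
            if (hasresult)              /* all-ExprEndResult means no row at all */
            {
                for (int i = 0; i < 2; i++)
                {
                    if (isnull[i])
                        printf("NULL ");
                    else
                        printf("%d ", values[i]);
                }
                printf("\n");
            }
        }
        return 0;
    }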
+ */ + off = 0; + foreach(lc, node->plan.targetlist) + { + TargetEntry *te = (TargetEntry *) lfirst(lc); + Expr *expr = te->expr; + + if ((IsA(expr, FuncExpr) && ((FuncExpr *) expr)->funcretset) || + (IsA(expr, OpExpr) && ((OpExpr *) expr)->opretset)) + { + state->elems[off] = (Node *) + ExecInitFunctionResultSet(expr, state->ps.ps_ExprContext, + &state->ps); + } + else + { + Assert(!expression_returns_set((Node *) expr)); + state->elems[off] = (Node *) ExecInitExpr(expr, &state->ps); + } + + off++; + } + + /* We don't support any qual on ProjectSet nodes */ + Assert(node->plan.qual == NIL); + + /* + * Create a memory context that ExecMakeFunctionResultSet can use to + * evaluate function arguments in. We can't use the per-tuple context for + * this because it gets reset too often; but we don't want to leak + * evaluation results into the query-lifespan context either. We use one + * context for the arguments of all tSRFs, as they have roughly equivalent + * lifetimes. + */ + state->argcontext = AllocSetContextCreate(CurrentMemoryContext, + "tSRF function arguments", + ALLOCSET_DEFAULT_SIZES); + + return state; +} + +/* ---------------------------------------------------------------- + * ExecEndProjectSet + * + * frees up storage allocated through C routines + * ---------------------------------------------------------------- + */ +void +ExecEndProjectSet(ProjectSetState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * shut down subplans + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanProjectSet(ProjectSetState *node) +{ + /* Forget any incompletely-evaluated SRFs */ + node->pending_srf_tuples = false; + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c new file mode 100644 index 0000000..f9e91fd --- /dev/null +++ b/src/backend/executor/nodeRecursiveunion.c @@ -0,0 +1,331 @@ +/*------------------------------------------------------------------------- + * + * nodeRecursiveunion.c + * routines to handle RecursiveUnion nodes. + * + * To implement UNION (without ALL), we need a hashtable that stores tuples + * already seen. The hash key is computed from the grouping columns. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeRecursiveunion.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeRecursiveunion.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + + +/* + * Initialize the hash table to empty. 
+ */ +static void +build_hash_table(RecursiveUnionState *rustate) +{ + RecursiveUnion *node = (RecursiveUnion *) rustate->ps.plan; + TupleDesc desc = ExecGetResultType(outerPlanState(rustate)); + + Assert(node->numCols > 0); + Assert(node->numGroups > 0); + + rustate->hashtable = BuildTupleHashTableExt(&rustate->ps, + desc, + node->numCols, + node->dupColIdx, + rustate->eqfuncoids, + rustate->hashfunctions, + node->dupCollations, + node->numGroups, + 0, + rustate->ps.state->es_query_cxt, + rustate->tableContext, + rustate->tempContext, + false); +} + + +/* ---------------------------------------------------------------- + * ExecRecursiveUnion(node) + * + * Scans the recursive query sequentially and returns the next + * qualifying tuple. + * + * 1. evaluate non recursive term and assign the result to RT + * + * 2. execute recursive terms + * + * 2.1 WT := RT + * 2.2 while WT is not empty repeat 2.3 to 2.6. if WT is empty returns RT + * 2.3 replace the name of recursive term with WT + * 2.4 evaluate the recursive term and store into WT + * 2.5 append WT to RT + * 2.6 go back to 2.2 + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecRecursiveUnion(PlanState *pstate) +{ + RecursiveUnionState *node = castNode(RecursiveUnionState, pstate); + PlanState *outerPlan = outerPlanState(node); + PlanState *innerPlan = innerPlanState(node); + RecursiveUnion *plan = (RecursiveUnion *) node->ps.plan; + TupleTableSlot *slot; + bool isnew; + + CHECK_FOR_INTERRUPTS(); + + /* 1. Evaluate non-recursive term */ + if (!node->recursing) + { + for (;;) + { + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + break; + if (plan->numCols > 0) + { + /* Find or build hashtable entry for this tuple's group */ + LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL); + /* Must reset temp context after each hashtable lookup */ + MemoryContextReset(node->tempContext); + /* Ignore tuple if already seen */ + if (!isnew) + continue; + } + /* Each non-duplicate tuple goes to the working table ... */ + tuplestore_puttupleslot(node->working_table, slot); + /* ... and to the caller */ + return slot; + } + node->recursing = true; + } + + /* 2. Execute recursive term */ + for (;;) + { + slot = ExecProcNode(innerPlan); + if (TupIsNull(slot)) + { + /* Done if there's nothing in the intermediate table */ + if (node->intermediate_empty) + break; + + /* done with old working table ... */ + tuplestore_end(node->working_table); + + /* intermediate table becomes working table */ + node->working_table = node->intermediate_table; + + /* create new empty intermediate table */ + node->intermediate_table = tuplestore_begin_heap(false, false, + work_mem); + node->intermediate_empty = true; + + /* reset the recursive term */ + innerPlan->chgParam = bms_add_member(innerPlan->chgParam, + plan->wtParam); + + /* and continue fetching from recursive term */ + continue; + } + + if (plan->numCols > 0) + { + /* Find or build hashtable entry for this tuple's group */ + LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL); + /* Must reset temp context after each hashtable lookup */ + MemoryContextReset(node->tempContext); + /* Ignore tuple if already seen */ + if (!isnew) + continue; + } + + /* Else, tuple is good; stash it in intermediate table ... */ + node->intermediate_empty = false; + tuplestore_puttupleslot(node->intermediate_table, slot); + /* ... 
and return it */ + return slot; + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitRecursiveUnion + * ---------------------------------------------------------------- + */ +RecursiveUnionState * +ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags) +{ + RecursiveUnionState *rustate; + ParamExecData *prmdata; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + rustate = makeNode(RecursiveUnionState); + rustate->ps.plan = (Plan *) node; + rustate->ps.state = estate; + rustate->ps.ExecProcNode = ExecRecursiveUnion; + + rustate->eqfuncoids = NULL; + rustate->hashfunctions = NULL; + rustate->hashtable = NULL; + rustate->tempContext = NULL; + rustate->tableContext = NULL; + + /* initialize processing state */ + rustate->recursing = false; + rustate->intermediate_empty = true; + rustate->working_table = tuplestore_begin_heap(false, false, work_mem); + rustate->intermediate_table = tuplestore_begin_heap(false, false, work_mem); + + /* + * If hashing, we need a per-tuple memory context for comparisons, and a + * longer-lived context to store the hash table. The table can't just be + * kept in the per-query context because we want to be able to throw it + * away when rescanning. + */ + if (node->numCols > 0) + { + rustate->tempContext = + AllocSetContextCreate(CurrentMemoryContext, + "RecursiveUnion", + ALLOCSET_DEFAULT_SIZES); + rustate->tableContext = + AllocSetContextCreate(CurrentMemoryContext, + "RecursiveUnion hash table", + ALLOCSET_DEFAULT_SIZES); + } + + /* + * Make the state structure available to descendant WorkTableScan nodes + * via the Param slot reserved for it. + */ + prmdata = &(estate->es_param_exec_vals[node->wtParam]); + Assert(prmdata->execPlan == NULL); + prmdata->value = PointerGetDatum(rustate); + prmdata->isnull = false; + + /* + * Miscellaneous initialization + * + * RecursiveUnion plans don't have expression contexts because they never + * call ExecQual or ExecProject. + */ + Assert(node->plan.qual == NIL); + + /* + * RecursiveUnion nodes still have Result slots, which hold pointers to + * tuples, so we have to initialize them. + */ + ExecInitResultTypeTL(&rustate->ps); + + /* + * Initialize result tuple type. (Note: we have to set up the result type + * before initializing child nodes, because nodeWorktablescan.c expects it + * to be valid.) + */ + rustate->ps.ps_ProjInfo = NULL; + + /* + * initialize child nodes + */ + outerPlanState(rustate) = ExecInitNode(outerPlan(node), estate, eflags); + innerPlanState(rustate) = ExecInitNode(innerPlan(node), estate, eflags); + + /* + * If hashing, precompute fmgr lookup data for inner loop, and create the + * hash table. + */ + if (node->numCols > 0) + { + execTuplesHashPrepare(node->numCols, + node->dupOperators, + &rustate->eqfuncoids, + &rustate->hashfunctions); + build_hash_table(rustate); + } + + return rustate; +} + +/* ---------------------------------------------------------------- + * ExecEndRecursiveUnion + * + * frees any storage allocated through C routines. 
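The working-table iteration described in the header comment of ExecRecursiveUnion() (steps 2.1 through 2.6) can be shown with ordinary arrays. The sketch below evaluates WITH RECURSIVE t(n) AS (VALUES (1) UNION SELECT n + 3 FROM t WHERE n < 10) SELECT * FROM t; the seen[] array plays the role of the duplicate-eliminating hash table and the two arrays stand in for the working and intermediate tuplestores:

    #include <stdio.h>
    #include <stdbool.h>
    #include <string.h>

    #define MAXVAL 100

    int
    main(void)
    {
        bool        seen[MAXVAL] = {false};
        int         working[MAXVAL], nworking = 0;
        int         intermediate[MAXVAL], nintermediate;

        /* 1. non-recursive term: emit it and seed the working table */
        seen[1] = true;
        working[nworking++] = 1;
        printf("%d\n", 1);

        /* 2. recursive term, repeated until the working table comes up empty */
        while (nworking > 0)
        {
            nintermediate = 0;
            for (int i = 0; i < nworking; i++)
            {
                int         n = working[i];

                if (n < 10)                 /* the recursive query */
                {
                    int         next = n + 3;

                    if (!seen[next])        /* duplicate elimination for UNION */
                    {
                        seen[next] = true;
                        intermediate[nintermediate++] = next;
                        printf("%d\n", next);
                    }
                }
            }
            /* steps 2.5/2.2: intermediate table becomes the new working table */
            memcpy(working, intermediate, nintermediate * sizeof(int));
            nworking = nintermediate;
        }
        return 0;
    }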
+ * ---------------------------------------------------------------- + */ +void +ExecEndRecursiveUnion(RecursiveUnionState *node) +{ + /* Release tuplestores */ + tuplestore_end(node->working_table); + tuplestore_end(node->intermediate_table); + + /* free subsidiary stuff including hashtable */ + if (node->tempContext) + MemoryContextDelete(node->tempContext); + if (node->tableContext) + MemoryContextDelete(node->tableContext); + + /* + * close down subplans + */ + ExecEndNode(outerPlanState(node)); + ExecEndNode(innerPlanState(node)); +} + +/* ---------------------------------------------------------------- + * ExecReScanRecursiveUnion + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanRecursiveUnion(RecursiveUnionState *node) +{ + PlanState *outerPlan = outerPlanState(node); + PlanState *innerPlan = innerPlanState(node); + RecursiveUnion *plan = (RecursiveUnion *) node->ps.plan; + + /* + * Set recursive term's chgParam to tell it that we'll modify the working + * table and therefore it has to rescan. + */ + innerPlan->chgParam = bms_add_member(innerPlan->chgParam, plan->wtParam); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Because of above, we only have to do this to the + * non-recursive term. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + /* Release any hashtable storage */ + if (node->tableContext) + MemoryContextResetAndDeleteChildren(node->tableContext); + + /* Empty hashtable if needed */ + if (plan->numCols > 0) + ResetTupleHashTable(node->hashtable); + + /* reset processing state */ + node->recursing = false; + node->intermediate_empty = true; + tuplestore_clear(node->working_table); + tuplestore_clear(node->intermediate_table); +} diff --git a/src/backend/executor/nodeResult.c b/src/backend/executor/nodeResult.c new file mode 100644 index 0000000..0946af0 --- /dev/null +++ b/src/backend/executor/nodeResult.c @@ -0,0 +1,272 @@ +/*------------------------------------------------------------------------- + * + * nodeResult.c + * support for constant nodes needing special code. + * + * DESCRIPTION + * + * Result nodes are used in queries where no relations are scanned. + * Examples of such queries are: + * + * select 1 * 2 + * + * insert into emp values ('mike', 15000) + * + * (Remember that in an INSERT or UPDATE, we need a plan tree that + * generates the new rows.) + * + * Result nodes are also used to optimise queries with constant + * qualifications (ie, quals that do not depend on the scanned data), + * such as: + * + * select * from emp where 2 > 1 + * + * In this case, the plan generated is + * + * Result (with 2 > 1 qual) + * / + * SeqScan (emp.*) + * + * At runtime, the Result node evaluates the constant qual once, + * which is shown by EXPLAIN as a One-Time Filter. If it's + * false, we can return an empty result set without running the + * controlled plan at all. If it's true, we run the controlled + * plan normally and pass back the results. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeResult.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeResult.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecResult(node) + * + * returns the tuples from the outer plan which satisfy the + * qualification clause. Since result nodes with right + * subtrees are never planned, we ignore the right subtree + * entirely (for now).. -cim 10/7/89 + * + * The qualification containing only constant clauses are + * checked first before any processing is done. It always returns + * 'nil' if the constant qualification is not satisfied. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecResult(PlanState *pstate) +{ + ResultState *node = castNode(ResultState, pstate); + TupleTableSlot *outerTupleSlot; + PlanState *outerPlan; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + econtext = node->ps.ps_ExprContext; + + /* + * check constant qualifications like (2 > 1), if not already done + */ + if (node->rs_checkqual) + { + bool qualResult = ExecQual(node->resconstantqual, econtext); + + node->rs_checkqual = false; + if (!qualResult) + { + node->rs_done = true; + return NULL; + } + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * if rs_done is true then it means that we were asked to return a + * constant tuple and we already did the last time ExecResult() was + * called, OR that we failed the constant qual check. Either way, now we + * are through. + */ + if (!node->rs_done) + { + outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + { + /* + * retrieve tuples from the outer plan until there are no more. + */ + outerTupleSlot = ExecProcNode(outerPlan); + + if (TupIsNull(outerTupleSlot)) + return NULL; + + /* + * prepare to compute projection expressions, which will expect to + * access the input tuples as varno OUTER. + */ + econtext->ecxt_outertuple = outerTupleSlot; + } + else + { + /* + * if we don't have an outer plan, then we are just generating the + * results from a constant target list. Do it only once. 
+ */ + node->rs_done = true; + } + + /* form the result tuple using ExecProject(), and return it */ + return ExecProject(node->ps.ps_ProjInfo); + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecResultMarkPos + * ---------------------------------------------------------------- + */ +void +ExecResultMarkPos(ResultState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + ExecMarkPos(outerPlan); + else + elog(DEBUG2, "Result nodes do not support mark/restore"); +} + +/* ---------------------------------------------------------------- + * ExecResultRestrPos + * ---------------------------------------------------------------- + */ +void +ExecResultRestrPos(ResultState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + ExecRestrPos(outerPlan); + else + elog(ERROR, "Result nodes do not support mark/restore"); +} + +/* ---------------------------------------------------------------- + * ExecInitResult + * + * Creates the run-time state information for the result node + * produced by the planner and initializes outer relations + * (child nodes). + * ---------------------------------------------------------------- + */ +ResultState * +ExecInitResult(Result *node, EState *estate, int eflags) +{ + ResultState *resstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)) || + outerPlan(node) != NULL); + + /* + * create state structure + */ + resstate = makeNode(ResultState); + resstate->ps.plan = (Plan *) node; + resstate->ps.state = estate; + resstate->ps.ExecProcNode = ExecResult; + + resstate->rs_done = false; + resstate->rs_checkqual = (node->resconstantqual == NULL) ? false : true; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &resstate->ps); + + /* + * initialize child nodes + */ + outerPlanState(resstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * we don't use inner plan + */ + Assert(innerPlan(node) == NULL); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&resstate->ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&resstate->ps, NULL); + + /* + * initialize child expressions + */ + resstate->ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) resstate); + resstate->resconstantqual = + ExecInitQual((List *) node->resconstantqual, (PlanState *) resstate); + + return resstate; +} + +/* ---------------------------------------------------------------- + * ExecEndResult + * + * frees up storage allocated through C routines + * ---------------------------------------------------------------- + */ +void +ExecEndResult(ResultState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * shut down subplans + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanResult(ResultState *node) +{ + node->rs_done = false; + node->rs_checkqual = (node->resconstantqual == NULL) ? false : true; + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
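The one-time filter in ExecResult() boils down to evaluating the constant qual a single time and only then consuming the child plan. A deliberately small sketch, where next_row() is a hypothetical stand-in for ExecProcNode() on the outer plan:

    #include <stdio.h>
    #include <stdbool.h>

    static int  rows[] = {10, 20, 30};
    static int  pos = 0;

    static bool
    next_row(int *out)
    {
        if (pos >= 3)
            return false;
        *out = rows[pos++];
        return true;
    }

    int
    main(void)
    {
        bool        one_time_filter = (2 > 1);  /* the constant qual */
        int         value;

        if (one_time_filter)                    /* checked once, not per row */
        {
            while (next_row(&value))
                printf("%d\n", value);
        }
        /* had the qual been false, the child would never have been run at all */
        return 0;
    }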
+ */ + if (node->ps.lefttree && + node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c new file mode 100644 index 0000000..44232d5 --- /dev/null +++ b/src/backend/executor/nodeSamplescan.c @@ -0,0 +1,378 @@ +/*------------------------------------------------------------------------- + * + * nodeSamplescan.c + * Support routines for sample scans of relations (table sampling). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSamplescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/tsmapi.h" +#include "executor/executor.h" +#include "executor/nodeSamplescan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/builtins.h" +#include "utils/rel.h" + +static TupleTableSlot *SampleNext(SampleScanState *node); +static void tablesample_init(SampleScanState *scanstate); +static TupleTableSlot *tablesample_getnext(SampleScanState *scanstate); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * SampleNext + * + * This is a workhorse for ExecSampleScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SampleNext(SampleScanState *node) +{ + /* + * if this is first call within a scan, initialize + */ + if (!node->begun) + tablesample_init(node); + + /* + * get the next tuple, and store it in our result slot + */ + return tablesample_getnext(node); +} + +/* + * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SampleRecheck(SampleScanState *node, TupleTableSlot *slot) +{ + /* + * No need to recheck for SampleScan, since like SeqScan we don't pass any + * checkable keys to heap_beginscan. + */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecSampleScan(node) + * + * Scans the relation using the sampling method and returns + * the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSampleScan(PlanState *pstate) +{ + SampleScanState *node = castNode(SampleScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) SampleNext, + (ExecScanRecheckMtd) SampleRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitSampleScan + * ---------------------------------------------------------------- + */ +SampleScanState * +ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) +{ + SampleScanState *scanstate; + TableSampleClause *tsc = node->tablesample; + TsmRoutine *tsm; + + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + scanstate = makeNode(SampleScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecSampleScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation + */ + scanstate->ss.ss_currentRelation = + ExecOpenScanRelation(estate, + node->scan.scanrelid, + eflags); + + /* we won't set up the HeapScanDesc till later */ + scanstate->ss.ss_currentScanDesc = NULL; + + /* and create slot with appropriate rowtype */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + RelationGetDescr(scanstate->ss.ss_currentRelation), + table_slot_callbacks(scanstate->ss.ss_currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + scanstate->args = ExecInitExprList(tsc->args, (PlanState *) scanstate); + scanstate->repeatable = + ExecInitExpr(tsc->repeatable, (PlanState *) scanstate); + + /* + * If we don't have a REPEATABLE clause, select a random seed. We want to + * do this just once, since the seed shouldn't change over rescans. + */ + if (tsc->repeatable == NULL) + scanstate->seed = random(); + + /* + * Finally, initialize the TABLESAMPLE method handler. + */ + tsm = GetTsmRoutine(tsc->tsmhandler); + scanstate->tsmroutine = tsm; + scanstate->tsm_state = NULL; + + if (tsm->InitSampleScan) + tsm->InitSampleScan(scanstate, eflags); + + /* We'll do BeginSampleScan later; we can't evaluate params yet */ + scanstate->begun = false; + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSampleScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndSampleScan(SampleScanState *node) +{ + /* + * Tell sampling function that we finished the scan. + */ + if (node->tsmroutine->EndSampleScan) + node->tsmroutine->EndSampleScan(node); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + if (node->ss.ss_currentScanDesc) + table_endscan(node->ss.ss_currentScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecReScanSampleScan + * + * Rescans the relation. 
+ * + * ---------------------------------------------------------------- + */ +void +ExecReScanSampleScan(SampleScanState *node) +{ + /* Remember we need to do BeginSampleScan again (if we did it at all) */ + node->begun = false; + node->done = false; + node->haveblock = false; + node->donetuples = 0; + + ExecScanReScan(&node->ss); +} + + +/* + * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan. + */ +static void +tablesample_init(SampleScanState *scanstate) +{ + TsmRoutine *tsm = scanstate->tsmroutine; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + Datum *params; + Datum datum; + bool isnull; + uint32 seed; + bool allow_sync; + int i; + ListCell *arg; + + scanstate->donetuples = 0; + params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum)); + + i = 0; + foreach(arg, scanstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + params[i] = ExecEvalExprSwitchContext(argstate, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("TABLESAMPLE parameter cannot be null"))); + i++; + } + + if (scanstate->repeatable) + { + datum = ExecEvalExprSwitchContext(scanstate->repeatable, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT), + errmsg("TABLESAMPLE REPEATABLE parameter cannot be null"))); + + /* + * The REPEATABLE parameter has been coerced to float8 by the parser. + * The reason for using float8 at the SQL level is that it will + * produce unsurprising results both for users used to databases that + * accept only integers in the REPEATABLE clause and for those who + * might expect that REPEATABLE works like setseed() (a float in the + * range from -1 to 1). + * + * We use hashfloat8() to convert the supplied value into a suitable + * seed. For regression-testing purposes, that has the convenient + * property that REPEATABLE(0) gives a machine-independent result. + */ + seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum)); + } + else + { + /* Use the seed selected by ExecInitSampleScan */ + seed = scanstate->seed; + } + + /* Set default values for params that BeginSampleScan can adjust */ + scanstate->use_bulkread = true; + scanstate->use_pagemode = true; + + /* Let tablesample method do its thing */ + tsm->BeginSampleScan(scanstate, + params, + list_length(scanstate->args), + seed); + + /* We'll use syncscan if there's no NextSampleBlock function */ + allow_sync = (tsm->NextSampleBlock == NULL); + + /* Now we can create or reset the HeapScanDesc */ + if (scanstate->ss.ss_currentScanDesc == NULL) + { + scanstate->ss.ss_currentScanDesc = + table_beginscan_sampling(scanstate->ss.ss_currentRelation, + scanstate->ss.ps.state->es_snapshot, + 0, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + else + { + table_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + + pfree(params); + + /* And we're initialized. */ + scanstate->begun = true; +} + +/* + * Get next tuple from TABLESAMPLE method. 
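The REPEATABLE handling in tablesample_init() above hashes the float8 argument down to a 32-bit seed so that the same argument always selects the same sample. The sketch below uses a stand-in hash (FNV-1a over the raw bytes) and a toy PRNG rather than hashfloat8() and a real sampling method; only the "same input, same seed, same sample" property is the point:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Stand-in hash over the raw float8 bytes; not PostgreSQL's hashfloat8(). */
    static uint32_t
    seed_from_double(double repeatable)
    {
        uint64_t    bits;
        uint32_t    h = 2166136261u;

        memcpy(&bits, &repeatable, sizeof(bits));
        for (int i = 0; i < 8; i++)
        {
            h ^= (uint32_t) (bits >> (i * 8)) & 0xff;
            h *= 16777619u;
        }
        return h ? h : 1;           /* keep the xorshift state nonzero */
    }

    int
    main(void)
    {
        uint32_t    state = seed_from_double(42.0); /* REPEATABLE (42) */

        /* sample roughly half of ten "rows"; same seed => same rows every run */
        for (int row = 0; row < 10; row++)
        {
            state ^= state << 13;   /* xorshift32 step */
            state ^= state >> 17;
            state ^= state << 5;
            if (state % 100 < 50)
                printf("row %d sampled\n", row);
        }
        return 0;
    }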
+ */ +static TupleTableSlot * +tablesample_getnext(SampleScanState *scanstate) +{ + TableScanDesc scan = scanstate->ss.ss_currentScanDesc; + TupleTableSlot *slot = scanstate->ss.ss_ScanTupleSlot; + + ExecClearTuple(slot); + + if (scanstate->done) + return NULL; + + for (;;) + { + if (!scanstate->haveblock) + { + if (!table_scan_sample_next_block(scan, scanstate)) + { + scanstate->haveblock = false; + scanstate->done = true; + + /* exhausted relation */ + return NULL; + } + + scanstate->haveblock = true; + } + + if (!table_scan_sample_next_tuple(scan, scanstate, slot)) + { + /* + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. + */ + scanstate->haveblock = false; + continue; + } + + /* Found visible tuple, return it. */ + break; + } + + scanstate->donetuples++; + + return slot; +} diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c new file mode 100644 index 0000000..066f9ae --- /dev/null +++ b/src/backend/executor/nodeSeqscan.c @@ -0,0 +1,314 @@ +/*------------------------------------------------------------------------- + * + * nodeSeqscan.c + * Support routines for sequential scans of relations. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSeqscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecSeqScan sequentially scans a relation. + * ExecSeqNext retrieve next tuple in sequential order. + * ExecInitSeqScan creates and initializes a seqscan node. + * ExecEndSeqScan releases any storage allocated. + * ExecReScanSeqScan rescans the relation + * + * ExecSeqScanEstimate estimates DSM space needed for parallel scan + * ExecSeqScanInitializeDSM initialize DSM for parallel scan + * ExecSeqScanReInitializeDSM reinitialize DSM for fresh parallel scan + * ExecSeqScanInitializeWorker attach to DSM info in parallel worker + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/tableam.h" +#include "executor/execdebug.h" +#include "executor/nodeSeqscan.h" +#include "utils/rel.h" + +static TupleTableSlot *SeqNext(SeqScanState *node); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * SeqNext + * + * This is a workhorse for ExecSeqScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SeqNext(SeqScanState *node) +{ + TableScanDesc scandesc; + EState *estate; + ScanDirection direction; + TupleTableSlot *slot; + + /* + * get information from the estate and scan state + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + direction = estate->es_direction; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the scan is not parallel, or if we're serially + * executing a scan that was planned to be parallel. 
+ */ + scandesc = table_beginscan(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL); + node->ss.ss_currentScanDesc = scandesc; + } + + /* + * get the next tuple from the table + */ + if (table_scan_getnextslot(scandesc, direction, slot)) + return slot; + return NULL; +} + +/* + * SeqRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SeqRecheck(SeqScanState *node, TupleTableSlot *slot) +{ + /* + * Note that unlike IndexScan, SeqScan never use keys in heap_beginscan + * (and this is very bad) - so, here we do not check are keys ok or not. + */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecSeqScan(node) + * + * Scans the relation sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSeqScan(PlanState *pstate) +{ + SeqScanState *node = castNode(SeqScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) SeqNext, + (ExecScanRecheckMtd) SeqRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitSeqScan + * ---------------------------------------------------------------- + */ +SeqScanState * +ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) +{ + SeqScanState *scanstate; + + /* + * Once upon a time it was possible to have an outerPlan of a SeqScan, but + * not any more. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + scanstate = makeNode(SeqScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecSeqScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation + */ + scanstate->ss.ss_currentRelation = + ExecOpenScanRelation(estate, + node->scanrelid, + eflags); + + /* and create slot with the appropriate rowtype */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + RelationGetDescr(scanstate->ss.ss_currentRelation), + table_slot_callbacks(scanstate->ss.ss_currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) scanstate); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSeqScan + * + * frees any storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndSeqScan(SeqScanState *node) +{ + TableScanDesc scanDesc; + + /* + * get information from node + */ + scanDesc = node->ss.ss_currentScanDesc; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + if (scanDesc != NULL) + table_endscan(scanDesc); +} + +/* ---------------------------------------------------------------- + * Join Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecReScanSeqScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanSeqScan(SeqScanState *node) +{ + TableScanDesc scan; + + scan = node->ss.ss_currentScanDesc; + + if (scan != NULL) + table_rescan(scan, /* scan desc */ + NULL); /* new scan keys */ + + ExecScanReScan((ScanState *) node); +} + +/* ---------------------------------------------------------------- + * Parallel Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecSeqScanEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanEstimate(SeqScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->pscan_len = table_parallelscan_estimate(node->ss.ss_currentRelation, + estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecSeqScanInitializeDSM + * + * Set up a parallel heap scan descriptor. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanInitializeDSM(SeqScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + ParallelTableScanDesc pscan; + + pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); + table_parallelscan_initialize(node->ss.ss_currentRelation, + pscan, + estate->es_snapshot); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + node->ss.ss_currentScanDesc = + table_beginscan_parallel(node->ss.ss_currentRelation, pscan); +} + +/* ---------------------------------------------------------------- + * ExecSeqScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanReInitializeDSM(SeqScanState *node, + ParallelContext *pcxt) +{ + ParallelTableScanDesc pscan; + + pscan = node->ss.ss_currentScanDesc->rs_parallel; + table_parallelscan_reinitialize(node->ss.ss_currentRelation, pscan); +} + +/* ---------------------------------------------------------------- + * ExecSeqScanInitializeWorker + * + * Copy relevant information from TOC into planstate. 
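The parallel-scan hooks above follow a common pattern: the leader estimates and initializes one shared scan descriptor, and every participant then claims work from it so that no block is scanned twice. A plain-C sketch of that pattern, not the shm_toc / table_beginscan_parallel machinery:

    #include <stdio.h>
    #include <stdatomic.h>

    /* A stand-in for a parallel scan descriptor living in shared memory. */
    typedef struct ParallelScanSketch
    {
        int         nblocks;        /* total blocks in the relation */
        atomic_int  next_block;     /* next block nobody has claimed yet */
    } ParallelScanSketch;

    static int
    claim_next_block(ParallelScanSketch *pscan)
    {
        int         block = atomic_fetch_add(&pscan->next_block, 1);

        return (block < pscan->nblocks) ? block : -1;   /* -1: scan is done */
    }

    int
    main(void)
    {
        ParallelScanSketch pscan;
        int         block;

        /* "leader": size and initialize the shared descriptor once */
        pscan.nblocks = 5;
        atomic_init(&pscan.next_block, 0);

        /*
         * Each participant (leader and workers alike) would attach to this
         * same descriptor and run the loop concurrently; fetch-and-add hands
         * out every block exactly once.
         */
        while ((block = claim_next_block(&pscan)) != -1)
            printf("claimed block %d\n", block);
        return 0;
    }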
+ * ---------------------------------------------------------------- + */ +void +ExecSeqScanInitializeWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelTableScanDesc pscan; + + pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->ss.ss_currentScanDesc = + table_beginscan_parallel(node->ss.ss_currentRelation, pscan); +} diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c new file mode 100644 index 0000000..aad7ac0 --- /dev/null +++ b/src/backend/executor/nodeSetOp.c @@ -0,0 +1,651 @@ +/*------------------------------------------------------------------------- + * + * nodeSetOp.c + * Routines to handle INTERSECT and EXCEPT selection + * + * The input of a SetOp node consists of tuples from two relations, + * which have been combined into one dataset, with a junk attribute added + * that shows which relation each tuple came from. In SETOP_SORTED mode, + * the input has furthermore been sorted according to all the grouping + * columns (ie, all the non-junk attributes). The SetOp node scans each + * group of identical tuples to determine how many came from each input + * relation. Then it is a simple matter to emit the output demanded by the + * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL. + * + * In SETOP_HASHED mode, the input is delivered in no particular order, + * except that we know all the tuples from one input relation will come before + * all the tuples of the other. The planner guarantees that the first input + * relation is the left-hand one for EXCEPT, and tries to make the smaller + * input relation come first for INTERSECT. We build a hash table in memory + * with one entry for each group of identical tuples, and count the number of + * tuples in the group from each relation. After seeing all the input, we + * scan the hashtable and generate the correct output using those counts. + * We can avoid making hashtable entries for any tuples appearing only in the + * second input relation, since they cannot result in any output. + * + * This node type is not used for UNION or UNION ALL, since those can be + * implemented more cheaply (there's no need for the junk attribute to + * identify the source relation). + * + * Note that SetOp does no qual checking nor projection. The delivered + * output tuples are just copies of the first-to-arrive tuple in each + * input group. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSetOp.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/executor.h" +#include "executor/nodeSetOp.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* + * SetOpStatePerGroupData - per-group working state + * + * These values are working state that is initialized at the start of + * an input tuple group and updated for each input tuple. + * + * In SETOP_SORTED mode, we need only one of these structs, and it's kept in + * the plan state node. In SETOP_HASHED mode, the hash table contains one + * of these for each tuple group. 
+ */ +typedef struct SetOpStatePerGroupData +{ + long numLeft; /* number of left-input dups in group */ + long numRight; /* number of right-input dups in group */ +} SetOpStatePerGroupData; + + +static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate); +static void setop_fill_hash_table(SetOpState *setopstate); +static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate); + + +/* + * Initialize state for a new group of input values. + */ +static inline void +initialize_counts(SetOpStatePerGroup pergroup) +{ + pergroup->numLeft = pergroup->numRight = 0; +} + +/* + * Advance the appropriate counter for one input tuple. + */ +static inline void +advance_counts(SetOpStatePerGroup pergroup, int flag) +{ + if (flag) + pergroup->numRight++; + else + pergroup->numLeft++; +} + +/* + * Fetch the "flag" column from an input tuple. + * This is an integer column with value 0 for left side, 1 for right side. + */ +static int +fetch_tuple_flag(SetOpState *setopstate, TupleTableSlot *inputslot) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + int flag; + bool isNull; + + flag = DatumGetInt32(slot_getattr(inputslot, + node->flagColIdx, + &isNull)); + Assert(!isNull); + Assert(flag == 0 || flag == 1); + return flag; +} + +/* + * Initialize the hash table to empty. + */ +static void +build_hash_table(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + ExprContext *econtext = setopstate->ps.ps_ExprContext; + TupleDesc desc = ExecGetResultType(outerPlanState(setopstate)); + + Assert(node->strategy == SETOP_HASHED); + Assert(node->numGroups > 0); + + setopstate->hashtable = BuildTupleHashTableExt(&setopstate->ps, + desc, + node->numCols, + node->dupColIdx, + setopstate->eqfuncoids, + setopstate->hashfunctions, + node->dupCollations, + node->numGroups, + 0, + setopstate->ps.state->es_query_cxt, + setopstate->tableContext, + econtext->ecxt_per_tuple_memory, + false); +} + +/* + * We've completed processing a tuple group. Decide how many copies (if any) + * of its representative row to emit, and store the count into numOutput. + * This logic is straight from the SQL92 specification. + */ +static void +set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup) +{ + SetOp *plannode = (SetOp *) setopstate->ps.plan; + + switch (plannode->cmd) + { + case SETOPCMD_INTERSECT: + if (pergroup->numLeft > 0 && pergroup->numRight > 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_INTERSECT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? + pergroup->numLeft : pergroup->numRight; + break; + case SETOPCMD_EXCEPT: + if (pergroup->numLeft > 0 && pergroup->numRight == 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_EXCEPT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? 
+ 0 : (pergroup->numLeft - pergroup->numRight); + break; + default: + elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd); + break; + } +} + + +/* ---------------------------------------------------------------- + * ExecSetOp + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecSetOp(PlanState *pstate) +{ + SetOpState *node = castNode(SetOpState, pstate); + SetOp *plannode = (SetOp *) node->ps.plan; + TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot; + + CHECK_FOR_INTERRUPTS(); + + /* + * If the previously-returned tuple needs to be returned more than once, + * keep returning it. + */ + if (node->numOutput > 0) + { + node->numOutput--; + return resultTupleSlot; + } + + /* Otherwise, we're done if we are out of groups */ + if (node->setop_done) + return NULL; + + /* Fetch the next tuple group according to the correct strategy */ + if (plannode->strategy == SETOP_HASHED) + { + if (!node->table_filled) + setop_fill_hash_table(node); + return setop_retrieve_hash_table(node); + } + else + return setop_retrieve_direct(node); +} + +/* + * ExecSetOp for non-hashed case + */ +static TupleTableSlot * +setop_retrieve_direct(SetOpState *setopstate) +{ + PlanState *outerPlan; + SetOpStatePerGroup pergroup; + TupleTableSlot *outerslot; + TupleTableSlot *resultTupleSlot; + ExprContext *econtext = setopstate->ps.ps_ExprContext; + + /* + * get state info from node + */ + outerPlan = outerPlanState(setopstate); + pergroup = (SetOpStatePerGroup) setopstate->pergroup; + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) + { + /* + * If we don't already have the first tuple of the new group, fetch it + * from the outer plan. + */ + if (setopstate->grp_firstTuple == NULL) + { + outerslot = ExecProcNode(outerPlan); + if (!TupIsNull(outerslot)) + { + /* Make a copy of the first input tuple */ + setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + } + else + { + /* outer plan produced no tuples at all */ + setopstate->setop_done = true; + return NULL; + } + } + + /* + * Store the copied first input tuple in the tuple table slot reserved + * for it. The tuple will be deleted when it is cleared from the + * slot. + */ + ExecStoreHeapTuple(setopstate->grp_firstTuple, + resultTupleSlot, + true); + setopstate->grp_firstTuple = NULL; /* don't keep two pointers */ + + /* Initialize working state for a new input tuple group */ + initialize_counts(pergroup); + + /* Count the first input tuple */ + advance_counts(pergroup, + fetch_tuple_flag(setopstate, resultTupleSlot)); + + /* + * Scan the outer plan until we exhaust it or cross a group boundary. + */ + for (;;) + { + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + { + /* no more outer-plan tuples available */ + setopstate->setop_done = true; + break; + } + + /* + * Check whether we've crossed a group boundary. + */ + econtext->ecxt_outertuple = resultTupleSlot; + econtext->ecxt_innertuple = outerslot; + + if (!ExecQualAndReset(setopstate->eqfunction, econtext)) + { + /* + * Save the first input tuple of the next group. + */ + setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + break; + } + + /* Still in same group, so count this tuple */ + advance_counts(pergroup, + fetch_tuple_flag(setopstate, outerslot)); + } + + /* + * Done scanning input tuple group. 
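set_output_count above is a direct transcription of the SQL spec's rules for how many copies of a group's representative row each set operation emits, given how many duplicates arrived from each input. The standalone function below restates those four rules outside the executor so they can be checked in isolation; it takes plain longs rather than a SetOpStatePerGroup and is only an illustration of the arithmetic.

/*
 * Standalone restatement of the output-count rules implemented by
 * set_output_count: given the number of duplicates of one group seen in
 * the left and right inputs, how many copies does each set operation emit?
 */
#include <assert.h>
#include <stdio.h>

typedef enum
{
	CMD_INTERSECT,
	CMD_INTERSECT_ALL,
	CMD_EXCEPT,
	CMD_EXCEPT_ALL
} SetOpCmd;

static long
output_count(SetOpCmd cmd, long numLeft, long numRight)
{
	switch (cmd)
	{
		case CMD_INTERSECT:
			return (numLeft > 0 && numRight > 0) ? 1 : 0;
		case CMD_INTERSECT_ALL:
			return (numLeft < numRight) ? numLeft : numRight;
		case CMD_EXCEPT:
			return (numLeft > 0 && numRight == 0) ? 1 : 0;
		case CMD_EXCEPT_ALL:
			return (numLeft < numRight) ? 0 : numLeft - numRight;
	}
	return 0;					/* not reached */
}

int
main(void)
{
	/* a group with 3 copies in the left input and 1 in the right */
	assert(output_count(CMD_INTERSECT, 3, 1) == 1);
	assert(output_count(CMD_INTERSECT_ALL, 3, 1) == 1);
	assert(output_count(CMD_EXCEPT, 3, 1) == 0);
	assert(output_count(CMD_EXCEPT_ALL, 3, 1) == 2);
	printf("set-op counting rules check out\n");
	return 0;
}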
See if we should emit any copies + * of result tuple, and if so return the first copy. + */ + set_output_count(setopstate, pergroup); + + if (setopstate->numOutput > 0) + { + setopstate->numOutput--; + return resultTupleSlot; + } + } + + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; +} + +/* + * ExecSetOp for hashed case: phase 1, read input and build hash table + */ +static void +setop_fill_hash_table(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + PlanState *outerPlan; + int firstFlag; + bool in_first_rel PG_USED_FOR_ASSERTS_ONLY; + ExprContext *econtext = setopstate->ps.ps_ExprContext; + + /* + * get state info from node + */ + outerPlan = outerPlanState(setopstate); + firstFlag = node->firstFlag; + /* verify planner didn't mess up */ + Assert(firstFlag == 0 || + (firstFlag == 1 && + (node->cmd == SETOPCMD_INTERSECT || + node->cmd == SETOPCMD_INTERSECT_ALL))); + + /* + * Process each outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. + */ + in_first_rel = true; + for (;;) + { + TupleTableSlot *outerslot; + int flag; + TupleHashEntryData *entry; + bool isnew; + + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + break; + + /* Identify whether it's left or right input */ + flag = fetch_tuple_flag(setopstate, outerslot); + + if (flag == firstFlag) + { + /* (still) in first input relation */ + Assert(in_first_rel); + + /* Find or build hashtable entry for this tuple's group */ + entry = LookupTupleHashEntry(setopstate->hashtable, outerslot, + &isnew, NULL); + + /* If new tuple group, initialize counts */ + if (isnew) + { + entry->additional = (SetOpStatePerGroup) + MemoryContextAlloc(setopstate->hashtable->tablecxt, + sizeof(SetOpStatePerGroupData)); + initialize_counts((SetOpStatePerGroup) entry->additional); + } + + /* Advance the counts */ + advance_counts((SetOpStatePerGroup) entry->additional, flag); + } + else + { + /* reached second relation */ + in_first_rel = false; + + /* For tuples not seen previously, do not make hashtable entry */ + entry = LookupTupleHashEntry(setopstate->hashtable, outerslot, + NULL, NULL); + + /* Advance the counts if entry is already present */ + if (entry) + advance_counts((SetOpStatePerGroup) entry->additional, flag); + } + + /* Must reset expression context after each hashtable lookup */ + ResetExprContext(econtext); + } + + setopstate->table_filled = true; + /* Initialize to walk the hash table */ + ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter); +} + +/* + * ExecSetOp for hashed case: phase 2, retrieving groups from hash table + */ +static TupleTableSlot * +setop_retrieve_hash_table(SetOpState *setopstate) +{ + TupleHashEntryData *entry; + TupleTableSlot *resultTupleSlot; + + /* + * get state info from node + */ + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) + { + CHECK_FOR_INTERRUPTS(); + + /* + * Find the next entry in the hash table + */ + entry = ScanTupleHashTable(setopstate->hashtable, &setopstate->hashiter); + if (entry == NULL) + { + /* No more entries in hashtable, so done */ + setopstate->setop_done = true; + return NULL; + } + + /* + * See if we should emit any copies of this tuple, and if so return + * the first copy. 
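setop_fill_hash_table relies on the planner's guarantee that all tuples of the first input arrive before any tuple of the second, so a group seen only in the second input never needs a hash table entry. The sketch below mirrors that two-phase shape with small integer keys and a direct-addressed array standing in for the tuple hash table; it illustrates the control flow only and does not use the TupleHashTable API.

/*
 * Toy two-phase hashed set operation over small integer keys.  A plain
 * array indexed by key stands in for the tuple hash table: phase 1 creates
 * entries only for first-input keys, phase 2 updates existing entries and
 * deliberately never creates new ones, and the final pass applies the same
 * counting rule as INTERSECT ALL.
 */
#include <stdio.h>
#include <string.h>

#define NKEYS 8

typedef struct
{
	int			present;		/* does this group have an entry? */
	long		numLeft;
	long		numRight;
} Group;

int
main(void)
{
	Group		table[NKEYS];
	int			left[] = {1, 1, 2, 5};	/* first (left) input */
	int			right[] = {1, 2, 2, 7}; /* second (right) input */

	memset(table, 0, sizeof(table));

	/* Phase 1: first input -- create entries as needed. */
	for (int i = 0; i < 4; i++)
	{
		Group	   *g = &table[left[i]];

		g->present = 1;
		g->numLeft++;
	}

	/* Phase 2: second input -- only touch entries that already exist. */
	for (int i = 0; i < 4; i++)
	{
		Group	   *g = &table[right[i]];

		if (g->present)
			g->numRight++;
		/* key 7 appears only on the right: no entry, cannot affect output */
	}

	/* Emit per the INTERSECT ALL rule: min(numLeft, numRight) copies. */
	for (int k = 0; k < NKEYS; k++)
	{
		if (!table[k].present)
			continue;

		long		n = table[k].numLeft < table[k].numRight ?
			table[k].numLeft : table[k].numRight;

		for (long j = 0; j < n; j++)
			printf("emit key %d\n", k);
	}
	return 0;
}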
+ */ + set_output_count(setopstate, (SetOpStatePerGroup) entry->additional); + + if (setopstate->numOutput > 0) + { + setopstate->numOutput--; + return ExecStoreMinimalTuple(entry->firstTuple, + resultTupleSlot, + false); + } + } + + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitSetOp + * + * This initializes the setop node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +SetOpState * +ExecInitSetOp(SetOp *node, EState *estate, int eflags) +{ + SetOpState *setopstate; + TupleDesc outerDesc; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + setopstate = makeNode(SetOpState); + setopstate->ps.plan = (Plan *) node; + setopstate->ps.state = estate; + setopstate->ps.ExecProcNode = ExecSetOp; + + setopstate->eqfuncoids = NULL; + setopstate->hashfunctions = NULL; + setopstate->setop_done = false; + setopstate->numOutput = 0; + setopstate->pergroup = NULL; + setopstate->grp_firstTuple = NULL; + setopstate->hashtable = NULL; + setopstate->tableContext = NULL; + + /* + * create expression context + */ + ExecAssignExprContext(estate, &setopstate->ps); + + /* + * If hashing, we also need a longer-lived context to store the hash + * table. The table can't just be kept in the per-query context because + * we want to be able to throw it away in ExecReScanSetOp. + */ + if (node->strategy == SETOP_HASHED) + setopstate->tableContext = + AllocSetContextCreate(CurrentMemoryContext, + "SetOp hash table", + ALLOCSET_DEFAULT_SIZES); + + /* + * initialize child nodes + * + * If we are hashing then the child plan does not need to handle REWIND + * efficiently; see ExecReScanSetOp. + */ + if (node->strategy == SETOP_HASHED) + eflags &= ~EXEC_FLAG_REWIND; + outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags); + outerDesc = ExecGetResultType(outerPlanState(setopstate)); + + /* + * Initialize result slot and type. Setop nodes do no projections, so + * initialize projection info for this node appropriately. + */ + ExecInitResultTupleSlotTL(&setopstate->ps, + node->strategy == SETOP_HASHED ? + &TTSOpsMinimalTuple : &TTSOpsHeapTuple); + setopstate->ps.ps_ProjInfo = NULL; + + /* + * Precompute fmgr lookup data for inner loop. We need both equality and + * hashing functions to do it by hashing, but only equality if not + * hashing. + */ + if (node->strategy == SETOP_HASHED) + execTuplesHashPrepare(node->numCols, + node->dupOperators, + &setopstate->eqfuncoids, + &setopstate->hashfunctions); + else + setopstate->eqfunction = + execTuplesMatchPrepare(outerDesc, + node->numCols, + node->dupColIdx, + node->dupOperators, + node->dupCollations, + &setopstate->ps); + + if (node->strategy == SETOP_HASHED) + { + build_hash_table(setopstate); + setopstate->table_filled = false; + } + else + { + setopstate->pergroup = + (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData)); + } + + return setopstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSetOp + * + * This shuts down the subplan and frees resources allocated + * to this node. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndSetOp(SetOpState *node) +{ + /* clean up tuple table */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* free subsidiary stuff including hashtable */ + if (node->tableContext) + MemoryContextDelete(node->tableContext); + ExecFreeExprContext(&node->ps); + + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanSetOp(SetOpState *node) +{ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + node->setop_done = false; + node->numOutput = 0; + + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + /* + * In the hashed case, if we haven't yet built the hash table then we + * can just return; nothing done yet, so nothing to undo. If subnode's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. + */ + if (!node->table_filled) + return; + + /* + * If we do have the hash table and the subplan does not have any + * parameter changes, then we can just rescan the existing hash table; + * no need to build it again. + */ + if (node->ps.lefttree->chgParam == NULL) + { + ResetTupleHashIterator(node->hashtable, &node->hashiter); + return; + } + } + + /* Release first tuple of group, if we have made a copy */ + if (node->grp_firstTuple != NULL) + { + heap_freetuple(node->grp_firstTuple); + node->grp_firstTuple = NULL; + } + + /* Release any hashtable storage */ + if (node->tableContext) + MemoryContextResetAndDeleteChildren(node->tableContext); + + /* And rebuild empty hashtable if needed */ + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + ResetTupleHashTable(node->hashtable); + node->table_filled = false; + } + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c new file mode 100644 index 0000000..b99027e --- /dev/null +++ b/src/backend/executor/nodeSort.c @@ -0,0 +1,430 @@ +/*------------------------------------------------------------------------- + * + * nodeSort.c + * Routines to handle sorting of relations. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "executor/execdebug.h" +#include "executor/nodeSort.h" +#include "miscadmin.h" +#include "utils/tuplesort.h" + + +/* ---------------------------------------------------------------- + * ExecSort + * + * Sorts tuples from the outer subtree of the node using tuplesort, + * which saves the results in a temporary file or memory. After the + * initial call, returns a tuple from the file with each call. + * + * Conditions: + * -- none. + * + * Initial States: + * -- the outer child is prepared to return the first tuple. 
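ExecSort follows the classic "materialize on first call" pattern: the first call drains the child plan into the tuplesort, performs the sort, and sets sort_Done; every later call just fetches the next sorted row. The standalone sketch below shows the same shape with an integer array and qsort; it is only an illustration of the control flow, with all names invented for the example.

/*
 * Minimal load-once / fetch-many sketch of ExecSort's control flow: the
 * first fetch consumes the whole "child" and sorts it, later fetches just
 * step through the sorted result.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int	child_rows[] = {42, 7, 19, 3, 25};
static int	child_pos = 0;

/* stand-in for ExecProcNode on the outer plan; returns false at EOF */
static bool
child_next(int *val)
{
	if (child_pos >= 5)
		return false;
	*val = child_rows[child_pos++];
	return true;
}

static int
cmp_int(const void *a, const void *b)
{
	return (*(const int *) a > *(const int *) b) -
		(*(const int *) a < *(const int *) b);
}

typedef struct
{
	bool		sort_done;		/* have we sorted yet? (cf. sort_Done) */
	int			sorted[16];
	int			nsorted;
	int			next;			/* next index to return */
} ToySortState;

/* returns false when the sorted stream is exhausted */
static bool
toy_sort_next(ToySortState *state, int *val)
{
	if (!state->sort_done)
	{
		int			v;

		while (child_next(&v))
			state->sorted[state->nsorted++] = v;
		qsort(state->sorted, state->nsorted, sizeof(int), cmp_int);
		state->sort_done = true;
	}
	if (state->next >= state->nsorted)
		return false;
	*val = state->sorted[state->next++];
	return true;
}

int
main(void)
{
	ToySortState state = {0};
	int			v;

	while (toy_sort_next(&state, &v))
		printf("%d\n", v);
	return 0;
}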
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSort(PlanState *pstate) +{ + SortState *node = castNode(SortState, pstate); + EState *estate; + ScanDirection dir; + Tuplesortstate *tuplesortstate; + TupleTableSlot *slot; + + CHECK_FOR_INTERRUPTS(); + + /* + * get state info from node + */ + SO1_printf("ExecSort: %s\n", + "entering routine"); + + estate = node->ss.ps.state; + dir = estate->es_direction; + tuplesortstate = (Tuplesortstate *) node->tuplesortstate; + + /* + * If first time through, read all tuples from outer plan and pass them to + * tuplesort.c. Subsequent calls just fetch tuples from tuplesort. + */ + + if (!node->sort_Done) + { + Sort *plannode = (Sort *) node->ss.ps.plan; + PlanState *outerNode; + TupleDesc tupDesc; + + SO1_printf("ExecSort: %s\n", + "sorting subplan"); + + /* + * Want to scan subplan in the forward direction while creating the + * sorted data. + */ + estate->es_direction = ForwardScanDirection; + + /* + * Initialize tuplesort module. + */ + SO1_printf("ExecSort: %s\n", + "calling tuplesort_begin"); + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + tuplesortstate = tuplesort_begin_heap(tupDesc, + plannode->numCols, + plannode->sortColIdx, + plannode->sortOperators, + plannode->collations, + plannode->nullsFirst, + work_mem, + NULL, + node->randomAccess); + if (node->bounded) + tuplesort_set_bound(tuplesortstate, node->bound); + node->tuplesortstate = (void *) tuplesortstate; + + /* + * Scan the subplan and feed all the tuples to tuplesort. + */ + + for (;;) + { + slot = ExecProcNode(outerNode); + + if (TupIsNull(slot)) + break; + + tuplesort_puttupleslot(tuplesortstate, slot); + } + + /* + * Complete the sort. + */ + tuplesort_performsort(tuplesortstate); + + /* + * restore to user specified direction + */ + estate->es_direction = dir; + + /* + * finally set the sorted flag to true + */ + node->sort_Done = true; + node->bounded_Done = node->bounded; + node->bound_Done = node->bound; + if (node->shared_info && node->am_worker) + { + TuplesortInstrumentation *si; + + Assert(IsParallelWorker()); + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + tuplesort_get_stats(tuplesortstate, si); + } + SO1_printf("ExecSort: %s\n", "sorting done"); + } + + SO1_printf("ExecSort: %s\n", + "retrieving tuple from tuplesort"); + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. Note that we only rely on slot tuple remaining valid until the + * next fetch from the tuplesort. + */ + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(tuplesortstate, + ScanDirectionIsForward(dir), + false, slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. + * ---------------------------------------------------------------- + */ +SortState * +ExecInitSort(Sort *node, EState *estate, int eflags) +{ + SortState *sortstate; + + SO1_printf("ExecInitSort: %s\n", + "initializing sort node"); + + /* + * create state structure + */ + sortstate = makeNode(SortState); + sortstate->ss.ps.plan = (Plan *) node; + sortstate->ss.ps.state = estate; + sortstate->ss.ps.ExecProcNode = ExecSort; + + /* + * We must have random access to the sort output to do backward scan or + * mark/restore. 
We also prefer to materialize the sort output if we + * might be called on to rewind and replay it many times. + */ + sortstate->randomAccess = (eflags & (EXEC_FLAG_REWIND | + EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)) != 0; + + sortstate->bounded = false; + sortstate->sort_Done = false; + sortstate->tuplesortstate = NULL; + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * initialize child nodes + * + * We shield the child node from the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlanState(sortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &sortstate->ss, &TTSOpsVirtual); + + /* + * Initialize return slot and type. No need to initialize projection info + * because this node doesn't do projections. + */ + ExecInitResultTupleSlotTL(&sortstate->ss.ps, &TTSOpsMinimalTuple); + sortstate->ss.ps.ps_ProjInfo = NULL; + + SO1_printf("ExecInitSort: %s\n", + "sort node initialized"); + + return sortstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSort(node) + * ---------------------------------------------------------------- + */ +void +ExecEndSort(SortState *node) +{ + SO1_printf("ExecEndSort: %s\n", + "shutting down sort node"); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + /* + * Release tuplesort resources + */ + if (node->tuplesortstate != NULL) + tuplesort_end((Tuplesortstate *) node->tuplesortstate); + node->tuplesortstate = NULL; + + /* + * shut down the subplan + */ + ExecEndNode(outerPlanState(node)); + + SO1_printf("ExecEndSort: %s\n", + "sort node shutdown"); +} + +/* ---------------------------------------------------------------- + * ExecSortMarkPos + * + * Calls tuplesort to save the current position in the sorted file. + * ---------------------------------------------------------------- + */ +void +ExecSortMarkPos(SortState *node) +{ + /* + * if we haven't sorted yet, just return + */ + if (!node->sort_Done) + return; + + tuplesort_markpos((Tuplesortstate *) node->tuplesortstate); +} + +/* ---------------------------------------------------------------- + * ExecSortRestrPos + * + * Calls tuplesort to restore the last saved sort file position. + * ---------------------------------------------------------------- + */ +void +ExecSortRestrPos(SortState *node) +{ + /* + * if we haven't sorted yet, just return. + */ + if (!node->sort_Done) + return; + + /* + * restore the scan to the previously marked position + */ + tuplesort_restorepos((Tuplesortstate *) node->tuplesortstate); +} + +void +ExecReScanSort(SortState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * If we haven't sorted yet, just return. If outerplan's chgParam is not + * NULL then it will be re-scanned by ExecProcNode, else no reason to + * re-scan it at all. + */ + if (!node->sort_Done) + return; + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + /* + * If subnode is to be rescanned then we forget previous sort results; we + * have to re-read the subplan and re-sort. Also must re-sort if the + * bounded-sort parameters changed or we didn't select randomAccess. 
+ * + * Otherwise we can just rewind and rescan the sorted output. + */ + if (outerPlan->chgParam != NULL || + node->bounded != node->bounded_Done || + node->bound != node->bound_Done || + !node->randomAccess) + { + node->sort_Done = false; + tuplesort_end((Tuplesortstate *) node->tuplesortstate); + node->tuplesortstate = NULL; + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + } + else + tuplesort_rescan((Tuplesortstate *) node->tuplesortstate); +} + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecSortEstimate + * + * Estimate space required to propagate sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortEstimate(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(TuplesortInstrumentation)); + size = add_size(size, offsetof(SharedSortInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeDSM + * + * Initialize DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + pcxt->nworkers * sizeof(TuplesortInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeWorker + * + * Attach worker to DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeWorker(SortState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); + node->am_worker = true; +} + +/* ---------------------------------------------------------------- + * ExecSortRetrieveInstrumentation + * + * Transfer sort statistics from DSM to private memory. 
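ExecSortEstimate and ExecSortInitializeDSM size the shared instrumentation block as offsetof(SharedSortInfo, sinstrument) plus one TuplesortInstrumentation slot per worker, the standard idiom for a struct whose last member is a flexible array. The snippet below applies the same arithmetic to a standalone struct; the type names are invented stand-ins for the example.

/*
 * The sizing idiom used for SharedSortInfo: a header plus a flexible array
 * member, allocated as offsetof(header, array) + n * sizeof(element).
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct WorkerStats		/* stand-in for TuplesortInstrumentation */
{
	long		spaceUsed;
} WorkerStats;

typedef struct SharedStats		/* stand-in for SharedSortInfo */
{
	int			num_workers;
	WorkerStats sinstrument[];	/* per-worker slots follow the header */
} SharedStats;

int
main(void)
{
	int			nworkers = 4;
	size_t		size = offsetof(SharedStats, sinstrument) +
		nworkers * sizeof(WorkerStats);
	SharedStats *shared = malloc(size);

	/* ensure any unfilled slots will contain zeroes, as the executor does */
	memset(shared, 0, size);
	shared->num_workers = nworkers;

	shared->sinstrument[2].spaceUsed = 1024;	/* worker 2 reports in */
	printf("allocated %zu bytes for %d workers\n", size, shared->num_workers);

	free(shared);
	return 0;
}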
+ * ---------------------------------------------------------------- + */ +void +ExecSortRetrieveInstrumentation(SortState *node) +{ + Size size; + SharedSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + node->shared_info->num_workers * sizeof(TuplesortInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c new file mode 100644 index 0000000..d46227e --- /dev/null +++ b/src/backend/executor/nodeSubplan.c @@ -0,0 +1,1313 @@ +/*------------------------------------------------------------------------- + * + * nodeSubplan.c + * routines to support sub-selects appearing in expressions + * + * This module is concerned with executing SubPlan expression nodes, which + * should not be confused with sub-SELECTs appearing in FROM. SubPlans are + * divided into "initplans", which are those that need only one evaluation per + * query (among other restrictions, this requires that they don't use any + * direct correlation variables from the parent plan level), and "regular" + * subplans, which are re-evaluated every time their result is required. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeSubplan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecSubPlan - process a subselect + * ExecInitSubPlan - initialize a subselect + */ +#include "postgres.h" + +#include +#include + +#include "access/htup_details.h" +#include "executor/executor.h" +#include "executor/nodeSubplan.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "utils/array.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +static Datum ExecHashSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull); +static Datum ExecScanSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull); +static void buildSubPlanHash(SubPlanState *node, ExprContext *econtext); +static bool findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot, + FmgrInfo *eqfunctions); +static bool slotAllNulls(TupleTableSlot *slot); +static bool slotNoNulls(TupleTableSlot *slot); + + +/* ---------------------------------------------------------------- + * ExecSubPlan + * + * This is the main entry point for execution of a regular SubPlan. 
+ * ---------------------------------------------------------------- + */ +Datum +ExecSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull) +{ + SubPlan *subplan = node->subplan; + EState *estate = node->planstate->state; + ScanDirection dir = estate->es_direction; + Datum retval; + + CHECK_FOR_INTERRUPTS(); + + /* Set non-null as default */ + *isNull = false; + + /* Sanity checks */ + if (subplan->subLinkType == CTE_SUBLINK) + elog(ERROR, "CTE subplans should not be executed via ExecSubPlan"); + if (subplan->setParam != NIL && subplan->subLinkType != MULTIEXPR_SUBLINK) + elog(ERROR, "cannot set parent params from subquery"); + + /* Force forward-scan mode for evaluation */ + estate->es_direction = ForwardScanDirection; + + /* Select appropriate evaluation strategy */ + if (subplan->useHashTable) + retval = ExecHashSubPlan(node, econtext, isNull); + else + retval = ExecScanSubPlan(node, econtext, isNull); + + /* restore scan direction */ + estate->es_direction = dir; + + return retval; +} + +/* + * ExecHashSubPlan: store subselect result in an in-memory hash table + */ +static Datum +ExecHashSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + TupleTableSlot *slot; + + /* Shouldn't have any direct correlation Vars */ + if (subplan->parParam != NIL || node->args != NIL) + elog(ERROR, "hashed subplan with direct correlation not supported"); + + /* + * If first time through or we need to rescan the subplan, build the hash + * table. + */ + if (node->hashtable == NULL || planstate->chgParam != NULL) + buildSubPlanHash(node, econtext); + + /* + * The result for an empty subplan is always FALSE; no need to evaluate + * lefthand side. + */ + *isNull = false; + if (!node->havehashrows && !node->havenullrows) + return BoolGetDatum(false); + + /* + * Evaluate lefthand expressions and form a projection tuple. First we + * have to set the econtext to use (hack alert!). + */ + node->projLeft->pi_exprContext = econtext; + slot = ExecProject(node->projLeft); + + /* + * Note: because we are typically called in a per-tuple context, we have + * to explicitly clear the projected tuple before returning. Otherwise, + * we'll have a double-free situation: the per-tuple context will probably + * be reset before we're called again, and then the tuple slot will think + * it still needs to free the tuple. + */ + + /* + * If the LHS is all non-null, probe for an exact match in the main hash + * table. If we find one, the result is TRUE. Otherwise, scan the + * partly-null table to see if there are any rows that aren't provably + * unequal to the LHS; if so, the result is UNKNOWN. (We skip that part + * if we don't care about UNKNOWN.) Otherwise, the result is FALSE. + * + * Note: the reason we can avoid a full scan of the main hash table is + * that the combining operators are assumed never to yield NULL when both + * inputs are non-null. If they were to do so, we might need to produce + * UNKNOWN instead of FALSE because of an UNKNOWN result in comparing the + * LHS to some main-table entry --- which is a comparison we will not even + * make, unless there's a chance match of hash keys. 
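The comments in ExecHashSubPlan spell out a three-valued decision procedure: with a fully non-null LHS, an exact hit in the main table means TRUE, a partial match against stored partly-null rows means UNKNOWN, otherwise FALSE; with a partly or wholly null LHS the answer can only be FALSE or UNKNOWN. The helper below restates that decision table over plain booleans as a summary of the logic above; the tri-state type and flag names are invented for the illustration, and it glosses over the shortcut taken when UNKNOWN need not be distinguished from FALSE.

/*
 * Restatement of ExecHashSubPlan's result logic as a decision table.
 * "exact_hit" means the LHS matched a row in the main hash table;
 * "partial_null_hit" means some stored row is not provably unequal to the
 * LHS (see findPartialMatch), whichever table it lives in.
 */
#include <stdio.h>

typedef enum
{
	TRI_FALSE,
	TRI_TRUE,
	TRI_UNKNOWN
} TriState;

static TriState
hashed_in_result(int subplan_empty,
				 int lhs_no_nulls, int lhs_all_nulls,
				 int exact_hit, int partial_null_hit)
{
	if (subplan_empty)
		return TRI_FALSE;		/* IN over an empty set is FALSE */

	if (lhs_no_nulls)
	{
		if (exact_hit)
			return TRI_TRUE;
		return partial_null_hit ? TRI_UNKNOWN : TRI_FALSE;
	}

	/* LHS contains at least one NULL: TRUE is impossible */
	if (lhs_all_nulls)
		return TRI_UNKNOWN;
	return partial_null_hit ? TRI_UNKNOWN : TRI_FALSE;
}

int
main(void)
{
	printf("%d\n", hashed_in_result(0, 1, 0, 1, 0));	/* TRI_TRUE */
	printf("%d\n", hashed_in_result(0, 1, 0, 0, 1));	/* TRI_UNKNOWN */
	printf("%d\n", hashed_in_result(0, 0, 1, 0, 0));	/* TRI_UNKNOWN */
	printf("%d\n", hashed_in_result(1, 1, 0, 0, 0));	/* TRI_FALSE */
	return 0;
}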
+ */ + if (slotNoNulls(slot)) + { + if (node->havehashrows && + FindTupleHashEntry(node->hashtable, + slot, + node->cur_eq_comp, + node->lhs_hash_funcs) != NULL) + { + ExecClearTuple(slot); + return BoolGetDatum(true); + } + if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + ExecClearTuple(slot); + return BoolGetDatum(false); + } + + /* + * When the LHS is partly or wholly NULL, we can never return TRUE. If we + * don't care about UNKNOWN, just return FALSE. Otherwise, if the LHS is + * wholly NULL, immediately return UNKNOWN. (Since the combining + * operators are strict, the result could only be FALSE if the sub-select + * were empty, but we already handled that case.) Otherwise, we must scan + * both the main and partly-null tables to see if there are any rows that + * aren't provably unequal to the LHS; if so, the result is UNKNOWN. + * Otherwise, the result is FALSE. + */ + if (node->hashnulls == NULL) + { + ExecClearTuple(slot); + return BoolGetDatum(false); + } + if (slotAllNulls(slot)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + /* Scan partly-null table first, since more likely to get a match */ + if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + if (node->havehashrows && + findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + ExecClearTuple(slot); + return BoolGetDatum(false); +} + +/* + * ExecScanSubPlan: default case where we have to rescan subplan each time + */ +static Datum +ExecScanSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + SubLinkType subLinkType = subplan->subLinkType; + MemoryContext oldcontext; + TupleTableSlot *slot; + Datum result; + bool found = false; /* true if got at least one subplan tuple */ + ListCell *pvar; + ListCell *l; + ArrayBuildStateAny *astate = NULL; + + /* + * MULTIEXPR subplans, when "executed", just return NULL; but first we + * mark the subplan's output parameters as needing recalculation. (This + * is a bit of a hack: it relies on the subplan appearing later in its + * targetlist than any of the referencing Params, so that all the Params + * have been evaluated before we re-mark them for the next evaluation + * cycle. But in general resjunk tlist items appear after non-resjunk + * ones, so this should be safe.) Unlike ExecReScanSetParamPlan, we do + * *not* set bits in the parent plan node's chgParam, because we don't + * want to cause a rescan of the parent. + */ + if (subLinkType == MULTIEXPR_SUBLINK) + { + EState *estate = node->parent->state; + + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(estate->es_param_exec_vals[paramid]); + + prm->execPlan = node; + } + *isNull = true; + return (Datum) 0; + } + + /* Initialize ArrayBuildStateAny in caller's context, if needed */ + if (subLinkType == ARRAY_SUBLINK) + astate = initArrayResultAny(subplan->firstColType, + CurrentMemoryContext, true); + + /* + * We are probably in a short-lived expression-evaluation context. Switch + * to the per-query context for manipulating the child plan's chgParam, + * calling ExecProcNode on it, etc. 
+ */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * Set Params of this plan from parent plan correlation values. (Any + * calculation we have to do is done in the parent econtext, since the + * Param values don't need to have per-query lifetime.) + */ + Assert(list_length(subplan->parParam) == list_length(node->args)); + + forboth(l, subplan->parParam, pvar, node->args) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar), + econtext, + &(prm->isnull)); + planstate->chgParam = bms_add_member(planstate->chgParam, paramid); + } + + /* + * Now that we've set up its parameters, we can reset the subplan. + */ + ExecReScan(planstate); + + /* + * For all sublink types except EXPR_SUBLINK and ARRAY_SUBLINK, the result + * is boolean as are the results of the combining operators. We combine + * results across tuples (if the subplan produces more than one) using OR + * semantics for ANY_SUBLINK or AND semantics for ALL_SUBLINK. + * (ROWCOMPARE_SUBLINK doesn't allow multiple tuples from the subplan.) + * NULL results from the combining operators are handled according to the + * usual SQL semantics for OR and AND. The result for no input tuples is + * FALSE for ANY_SUBLINK, TRUE for ALL_SUBLINK, NULL for + * ROWCOMPARE_SUBLINK. + * + * For EXPR_SUBLINK we require the subplan to produce no more than one + * tuple, else an error is raised. If zero tuples are produced, we return + * NULL. Assuming we get a tuple, we just use its first column (there can + * be only one non-junk column in this case). + * + * For ARRAY_SUBLINK we allow the subplan to produce any number of tuples, + * and form an array of the first column's values. Note in particular + * that we produce a zero-element array if no tuples are produced (this is + * a change from pre-8.3 behavior of returning NULL). + */ + result = BoolGetDatum(subLinkType == ALL_SUBLINK); + *isNull = false; + + for (slot = ExecProcNode(planstate); + !TupIsNull(slot); + slot = ExecProcNode(planstate)) + { + TupleDesc tdesc = slot->tts_tupleDescriptor; + Datum rowresult; + bool rownull; + int col; + ListCell *plst; + + if (subLinkType == EXISTS_SUBLINK) + { + found = true; + result = BoolGetDatum(true); + break; + } + + if (subLinkType == EXPR_SUBLINK) + { + /* cannot allow multiple input tuples for EXPR sublink */ + if (found) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); + found = true; + + /* + * We need to copy the subplan's tuple in case the result is of + * pass-by-ref type --- our return value will point into this + * copied tuple! Can't use the subplan's instance of the tuple + * since it won't still be valid after next ExecProcNode() call. + * node->curTuple keeps track of the copied tuple for eventual + * freeing. 
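For ANY and ALL sublinks, the loop described above folds per-row boolean results together with SQL's three-valued OR and AND: a true row short-circuits ANY, a false row short-circuits ALL, NULL rows only matter if no deciding row ever appears, and the empty-input defaults are FALSE for ANY and TRUE for ALL. The fragment below performs the same fold over an array of tri-state row results; it is a sketch of the combining rule, not of the executor loop itself.

/*
 * Fold per-row results with SQL three-valued OR (ANY_SUBLINK) or AND
 * (ALL_SUBLINK) semantics, including the empty-input defaults.
 */
#include <stdio.h>

typedef enum
{
	ROW_FALSE,
	ROW_TRUE,
	ROW_NULL
} RowResult;

/* any = 1: OR semantics (ANY/IN); any = 0: AND semantics (ALL) */
static RowResult
combine_rows(const RowResult *rows, int nrows, int any)
{
	RowResult	result = any ? ROW_FALSE : ROW_TRUE;	/* empty-input default */

	for (int i = 0; i < nrows; i++)
	{
		if (rows[i] == ROW_NULL)
			result = ROW_NULL;	/* provisional UNKNOWN */
		else if (any && rows[i] == ROW_TRUE)
			return ROW_TRUE;	/* a TRUE row decides OR immediately */
		else if (!any && rows[i] == ROW_FALSE)
			return ROW_FALSE;	/* a FALSE row decides AND immediately */
	}
	return result;
}

int
main(void)
{
	RowResult	rows[] = {ROW_FALSE, ROW_NULL, ROW_TRUE};

	printf("ANY -> %d\n", combine_rows(rows, 3, 1));	/* ROW_TRUE */
	printf("ALL -> %d\n", combine_rows(rows, 3, 0));	/* ROW_FALSE */
	printf("empty ANY -> %d\n", combine_rows(rows, 0, 1)); /* ROW_FALSE */
	return 0;
}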
+ */ + if (node->curTuple) + heap_freetuple(node->curTuple); + node->curTuple = ExecCopySlotHeapTuple(slot); + + result = heap_getattr(node->curTuple, 1, tdesc, isNull); + /* keep scanning subplan to make sure there's only one tuple */ + continue; + } + + if (subLinkType == ARRAY_SUBLINK) + { + Datum dvalue; + bool disnull; + + found = true; + /* stash away current value */ + Assert(subplan->firstColType == TupleDescAttr(tdesc, 0)->atttypid); + dvalue = slot_getattr(slot, 1, &disnull); + astate = accumArrayResultAny(astate, dvalue, disnull, + subplan->firstColType, oldcontext); + /* keep scanning subplan to collect all values */ + continue; + } + + /* cannot allow multiple input tuples for ROWCOMPARE sublink either */ + if (subLinkType == ROWCOMPARE_SUBLINK && found) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); + + found = true; + + /* + * For ALL, ANY, and ROWCOMPARE sublinks, load up the Params + * representing the columns of the sub-select, and then evaluate the + * combining expression. + */ + col = 1; + foreach(plst, subplan->paramIds) + { + int paramid = lfirst_int(plst); + ParamExecData *prmdata; + + prmdata = &(econtext->ecxt_param_exec_vals[paramid]); + Assert(prmdata->execPlan == NULL); + prmdata->value = slot_getattr(slot, col, &(prmdata->isnull)); + col++; + } + + rowresult = ExecEvalExprSwitchContext(node->testexpr, econtext, + &rownull); + + if (subLinkType == ANY_SUBLINK) + { + /* combine across rows per OR semantics */ + if (rownull) + *isNull = true; + else if (DatumGetBool(rowresult)) + { + result = BoolGetDatum(true); + *isNull = false; + break; /* needn't look at any more rows */ + } + } + else if (subLinkType == ALL_SUBLINK) + { + /* combine across rows per AND semantics */ + if (rownull) + *isNull = true; + else if (!DatumGetBool(rowresult)) + { + result = BoolGetDatum(false); + *isNull = false; + break; /* needn't look at any more rows */ + } + } + else + { + /* must be ROWCOMPARE_SUBLINK */ + result = rowresult; + *isNull = rownull; + } + } + + MemoryContextSwitchTo(oldcontext); + + if (subLinkType == ARRAY_SUBLINK) + { + /* We return the result in the caller's context */ + result = makeArrayResultAny(astate, oldcontext, true); + } + else if (!found) + { + /* + * deal with empty subplan result. result/isNull were previously + * initialized correctly for all sublink types except EXPR and + * ROWCOMPARE; for those, return NULL. + */ + if (subLinkType == EXPR_SUBLINK || + subLinkType == ROWCOMPARE_SUBLINK) + { + result = (Datum) 0; + *isNull = true; + } + } + + return result; +} + +/* + * buildSubPlanHash: load hash table by scanning subplan output. + */ +static void +buildSubPlanHash(SubPlanState *node, ExprContext *econtext) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + int ncols = node->numCols; + ExprContext *innerecontext = node->innerecontext; + MemoryContext oldcontext; + long nbuckets; + TupleTableSlot *slot; + + Assert(subplan->subLinkType == ANY_SUBLINK); + + /* + * If we already had any hash tables, reset 'em; otherwise create empty + * hash table(s). + * + * If we need to distinguish accurately between FALSE and UNKNOWN (i.e., + * NULL) results of the IN operation, then we have to store subplan output + * rows that are partly or wholly NULL. We store such rows in a separate + * hash table that we expect will be much smaller than the main table. (We + * can use hashing to eliminate partly-null rows that are not distinct. 
We + * keep them separate to minimize the cost of the inevitable full-table + * searches; see findPartialMatch.) + * + * If it's not necessary to distinguish FALSE and UNKNOWN, then we don't + * need to store subplan output rows that contain NULL. + */ + MemoryContextReset(node->hashtablecxt); + node->havehashrows = false; + node->havenullrows = false; + + nbuckets = (long) Min(planstate->plan->plan_rows, (double) LONG_MAX); + if (nbuckets < 1) + nbuckets = 1; + + if (node->hashtable) + ResetTupleHashTable(node->hashtable); + else + node->hashtable = BuildTupleHashTableExt(node->parent, + node->descRight, + ncols, + node->keyColIdx, + node->tab_eq_funcoids, + node->tab_hash_funcs, + node->tab_collations, + nbuckets, + 0, + node->planstate->state->es_query_cxt, + node->hashtablecxt, + node->hashtempcxt, + false); + + if (!subplan->unknownEqFalse) + { + if (ncols == 1) + nbuckets = 1; /* there can only be one entry */ + else + { + nbuckets /= 16; + if (nbuckets < 1) + nbuckets = 1; + } + + if (node->hashnulls) + ResetTupleHashTable(node->hashnulls); + else + node->hashnulls = BuildTupleHashTableExt(node->parent, + node->descRight, + ncols, + node->keyColIdx, + node->tab_eq_funcoids, + node->tab_hash_funcs, + node->tab_collations, + nbuckets, + 0, + node->planstate->state->es_query_cxt, + node->hashtablecxt, + node->hashtempcxt, + false); + } + else + node->hashnulls = NULL; + + /* + * We are probably in a short-lived expression-evaluation context. Switch + * to the per-query context for manipulating the child plan. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * Reset subplan to start. + */ + ExecReScan(planstate); + + /* + * Scan the subplan and load the hash table(s). Note that when there are + * duplicate rows coming out of the sub-select, only one copy is stored. + */ + for (slot = ExecProcNode(planstate); + !TupIsNull(slot); + slot = ExecProcNode(planstate)) + { + int col = 1; + ListCell *plst; + bool isnew; + + /* + * Load up the Params representing the raw sub-select outputs, then + * form the projection tuple to store in the hashtable. + */ + foreach(plst, subplan->paramIds) + { + int paramid = lfirst_int(plst); + ParamExecData *prmdata; + + prmdata = &(innerecontext->ecxt_param_exec_vals[paramid]); + Assert(prmdata->execPlan == NULL); + prmdata->value = slot_getattr(slot, col, + &(prmdata->isnull)); + col++; + } + slot = ExecProject(node->projRight); + + /* + * If result contains any nulls, store separately or not at all. + */ + if (slotNoNulls(slot)) + { + (void) LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL); + node->havehashrows = true; + } + else if (node->hashnulls) + { + (void) LookupTupleHashEntry(node->hashnulls, slot, &isnew, NULL); + node->havenullrows = true; + } + + /* + * Reset innerecontext after each inner tuple to free any memory used + * during ExecProject. + */ + ResetExprContext(innerecontext); + } + + /* + * Since the projected tuples are in the sub-query's context and not the + * main context, we'd better clear the tuple slot before there's any + * chance of a reset of the sub-query's context. Else we will have the + * potential for a double free attempt. (XXX possibly no longer needed, + * but can't hurt.) + */ + ExecClearTuple(node->projRight->pi_state.resultslot); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * execTuplesUnequal + * Return true if two tuples are definitely unequal in the indicated + * fields. + * + * Nulls are neither equal nor unequal to anything else. 
A true result + * is obtained only if there are non-null fields that compare not-equal. + * + * slot1, slot2: the tuples to compare (must have same columns!) + * numCols: the number of attributes to be examined + * matchColIdx: array of attribute column numbers + * eqFunctions: array of fmgr lookup info for the equality functions to use + * evalContext: short-term memory context for executing the functions + */ +static bool +execTuplesUnequal(TupleTableSlot *slot1, + TupleTableSlot *slot2, + int numCols, + AttrNumber *matchColIdx, + FmgrInfo *eqfunctions, + const Oid *collations, + MemoryContext evalContext) +{ + MemoryContext oldContext; + bool result; + int i; + + /* Reset and switch into the temp context. */ + MemoryContextReset(evalContext); + oldContext = MemoryContextSwitchTo(evalContext); + + /* + * We cannot report a match without checking all the fields, but we can + * report a non-match as soon as we find unequal fields. So, start + * comparing at the last field (least significant sort key). That's the + * most likely to be different if we are dealing with sorted input. + */ + result = false; + + for (i = numCols; --i >= 0;) + { + AttrNumber att = matchColIdx[i]; + Datum attr1, + attr2; + bool isNull1, + isNull2; + + attr1 = slot_getattr(slot1, att, &isNull1); + + if (isNull1) + continue; /* can't prove anything here */ + + attr2 = slot_getattr(slot2, att, &isNull2); + + if (isNull2) + continue; /* can't prove anything here */ + + /* Apply the type-specific equality function */ + if (!DatumGetBool(FunctionCall2Coll(&eqfunctions[i], + collations[i], + attr1, attr2))) + { + result = true; /* they are unequal */ + break; + } + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * findPartialMatch: does the hashtable contain an entry that is not + * provably distinct from the tuple? + * + * We have to scan the whole hashtable; we can't usefully use hashkeys + * to guide probing, since we might get partial matches on tuples with + * hashkeys quite unrelated to what we'd get from the given tuple. + * + * Caller must provide the equality functions to use, since in cross-type + * cases these are different from the hashtable's internal functions. + */ +static bool +findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot, + FmgrInfo *eqfunctions) +{ + int numCols = hashtable->numCols; + AttrNumber *keyColIdx = hashtable->keyColIdx; + TupleHashIterator hashiter; + TupleHashEntry entry; + + InitTupleHashIterator(hashtable, &hashiter); + while ((entry = ScanTupleHashTable(hashtable, &hashiter)) != NULL) + { + CHECK_FOR_INTERRUPTS(); + + ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false); + if (!execTuplesUnequal(slot, hashtable->tableslot, + numCols, keyColIdx, + eqfunctions, + hashtable->tab_collations, + hashtable->tempcxt)) + { + TermTupleHashIterator(&hashiter); + return true; + } + } + /* No TermTupleHashIterator call needed here */ + return false; +} + +/* + * slotAllNulls: is the slot completely NULL? + * + * This does not test for dropped columns, which is OK because we only + * use it on projected tuples. + */ +static bool +slotAllNulls(TupleTableSlot *slot) +{ + int ncols = slot->tts_tupleDescriptor->natts; + int i; + + for (i = 1; i <= ncols; i++) + { + if (!slot_attisnull(slot, i)) + return false; + } + return true; +} + +/* + * slotNoNulls: is the slot entirely not NULL? + * + * This does not test for dropped columns, which is OK because we only + * use it on projected tuples. 
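execTuplesUnequal can only ever prove a non-match: a NULL in either tuple proves nothing for that column, and the columns are walked from the last one backward because, for sorted input, the least significant keys are the most likely to differ. The standalone version below applies the same rule to two arrays of ints with null flags; it is an illustration of the predicate, not of the slot API.

/*
 * NULL-aware "provably unequal" test in the style of execTuplesUnequal:
 * only non-null pairs can prove anything, and columns are examined from
 * the last (least significant) one backward.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
tuples_provably_unequal(const int *vals1, const bool *nulls1,
						const int *vals2, const bool *nulls2,
						int ncols)
{
	for (int i = ncols; --i >= 0;)
	{
		if (nulls1[i] || nulls2[i])
			continue;			/* a NULL proves nothing for this column */
		if (vals1[i] != vals2[i])
			return true;		/* found a definite mismatch */
	}
	return false;				/* could still be equal (or unknown) */
}

int
main(void)
{
	int			a[] = {1, 2, 3};
	bool		an[] = {false, false, false};
	int			b[] = {1, 2, 4};
	bool		bn[] = {false, false, false};
	int			c[] = {1, 2, 4};
	bool		cn[] = {false, false, true};	/* last column is NULL */

	printf("%d\n", tuples_provably_unequal(a, an, b, bn, 3));	/* 1 */
	printf("%d\n", tuples_provably_unequal(a, an, c, cn, 3));	/* 0 */
	return 0;
}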
+ */ +static bool +slotNoNulls(TupleTableSlot *slot) +{ + int ncols = slot->tts_tupleDescriptor->natts; + int i; + + for (i = 1; i <= ncols; i++) + { + if (slot_attisnull(slot, i)) + return false; + } + return true; +} + +/* ---------------------------------------------------------------- + * ExecInitSubPlan + * + * Create a SubPlanState for a SubPlan; this is the SubPlan-specific part + * of ExecInitExpr(). We split it out so that it can be used for InitPlans + * as well as regular SubPlans. Note that we don't link the SubPlan into + * the parent's subPlan list, because that shouldn't happen for InitPlans. + * Instead, ExecInitExpr() does that one part. + * ---------------------------------------------------------------- + */ +SubPlanState * +ExecInitSubPlan(SubPlan *subplan, PlanState *parent) +{ + SubPlanState *sstate = makeNode(SubPlanState); + EState *estate = parent->state; + + sstate->subplan = subplan; + + /* Link the SubPlanState to already-initialized subplan */ + sstate->planstate = (PlanState *) list_nth(estate->es_subplanstates, + subplan->plan_id - 1); + + /* + * This check can fail if the planner mistakenly puts a parallel-unsafe + * subplan into a parallelized subquery; see ExecSerializePlan. + */ + if (sstate->planstate == NULL) + elog(ERROR, "subplan \"%s\" was not initialized", + subplan->plan_name); + + /* Link to parent's state, too */ + sstate->parent = parent; + + /* Initialize subexpressions */ + sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent); + sstate->args = ExecInitExprList(subplan->args, parent); + + /* + * initialize my state + */ + sstate->curTuple = NULL; + sstate->curArray = PointerGetDatum(NULL); + sstate->projLeft = NULL; + sstate->projRight = NULL; + sstate->hashtable = NULL; + sstate->hashnulls = NULL; + sstate->hashtablecxt = NULL; + sstate->hashtempcxt = NULL; + sstate->innerecontext = NULL; + sstate->keyColIdx = NULL; + sstate->tab_eq_funcoids = NULL; + sstate->tab_hash_funcs = NULL; + sstate->tab_eq_funcs = NULL; + sstate->tab_collations = NULL; + sstate->lhs_hash_funcs = NULL; + sstate->cur_eq_funcs = NULL; + + /* + * If this is an initplan or MULTIEXPR subplan, it has output parameters + * that the parent plan will use, so mark those parameters as needing + * evaluation. We don't actually run the subplan until we first need one + * of its outputs. + * + * A CTE subplan's output parameter is never to be evaluated in the normal + * way, so skip this in that case. + * + * Note that we don't set parent->chgParam here: the parent plan hasn't + * been run yet, so no need to force it to re-run. + */ + if (subplan->setParam != NIL && subplan->subLinkType != CTE_SUBLINK) + { + ListCell *lst; + + foreach(lst, subplan->setParam) + { + int paramid = lfirst_int(lst); + ParamExecData *prm = &(estate->es_param_exec_vals[paramid]); + + prm->execPlan = sstate; + } + } + + /* + * If we are going to hash the subquery output, initialize relevant stuff. + * (We don't create the hashtable until needed, though.) 
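Marking prm->execPlan in ExecInitSubPlan is what makes initplans lazy: nothing runs at initialization, and the first parameter fetch that finds a non-NULL execPlan calls ExecSetParamPlan, which fills in the value and clears the field so the work happens at most once per (re)scan. The deferred creation of the hash table is the same "don't do it until needed" idea. The sketch below captures that self-clearing thunk pattern with ordinary function pointers; all names are invented for the illustration.

/*
 * The lazy-parameter pattern behind prm->execPlan: a parameter carries a
 * pointer to the code that can compute it; the first read runs that code,
 * stores the value, and clears the pointer so later reads are cheap.
 */
#include <stdio.h>

typedef struct LazyParam
{
	int			value;
	int			isnull;
	void		(*compute) (struct LazyParam *);	/* cf. prm->execPlan */
} LazyParam;

static int	subplan_runs = 0;

/* stand-in for ExecSetParamPlan running the initplan once */
static void
run_initplan(LazyParam *prm)
{
	subplan_runs++;
	prm->value = 42;			/* pretend the subquery returned 42 */
	prm->isnull = 0;
	prm->compute = NULL;		/* mark as evaluated */
}

/* stand-in for fetching a PARAM_EXEC value */
static int
param_value(LazyParam *prm)
{
	if (prm->compute != NULL)
		prm->compute(prm);		/* not evaluated yet: go do it */
	return prm->value;
}

int
main(void)
{
	LazyParam	prm = {0, 1, run_initplan};

	printf("%d %d\n", param_value(&prm), param_value(&prm));
	printf("initplan ran %d time(s)\n", subplan_runs);	/* 1 */
	return 0;
}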
+ */ + if (subplan->useHashTable) + { + int ncols, + i; + TupleDesc tupDescLeft; + TupleDesc tupDescRight; + Oid *cross_eq_funcoids; + TupleTableSlot *slot; + List *oplist, + *lefttlist, + *righttlist; + ListCell *l; + + /* We need a memory context to hold the hash table(s) */ + sstate->hashtablecxt = + AllocSetContextCreate(CurrentMemoryContext, + "Subplan HashTable Context", + ALLOCSET_DEFAULT_SIZES); + /* and a small one for the hash tables to use as temp storage */ + sstate->hashtempcxt = + AllocSetContextCreate(CurrentMemoryContext, + "Subplan HashTable Temp Context", + ALLOCSET_SMALL_SIZES); + /* and a short-lived exprcontext for function evaluation */ + sstate->innerecontext = CreateExprContext(estate); + + /* + * We use ExecProject to evaluate the lefthand and righthand + * expression lists and form tuples. (You might think that we could + * use the sub-select's output tuples directly, but that is not the + * case if we had to insert any run-time coercions of the sub-select's + * output datatypes; anyway this avoids storing any resjunk columns + * that might be in the sub-select's output.) Run through the + * combining expressions to build tlists for the lefthand and + * righthand sides. + * + * We also extract the combining operators themselves to initialize + * the equality and hashing functions for the hash tables. + */ + if (IsA(subplan->testexpr, OpExpr)) + { + /* single combining operator */ + oplist = list_make1(subplan->testexpr); + } + else if (is_andclause(subplan->testexpr)) + { + /* multiple combining operators */ + oplist = castNode(BoolExpr, subplan->testexpr)->args; + } + else + { + /* shouldn't see anything else in a hashable subplan */ + elog(ERROR, "unrecognized testexpr type: %d", + (int) nodeTag(subplan->testexpr)); + oplist = NIL; /* keep compiler quiet */ + } + ncols = list_length(oplist); + + lefttlist = righttlist = NIL; + sstate->numCols = ncols; + sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber)); + sstate->tab_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid)); + sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid)); + sstate->tab_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + sstate->tab_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + sstate->lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + sstate->cur_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + /* we'll need the cross-type equality fns below, but not in sstate */ + cross_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid)); + + i = 1; + foreach(l, oplist) + { + OpExpr *opexpr = lfirst_node(OpExpr, l); + Expr *expr; + TargetEntry *tle; + Oid rhs_eq_oper; + Oid left_hashfn; + Oid right_hashfn; + + Assert(list_length(opexpr->args) == 2); + + /* Process lefthand argument */ + expr = (Expr *) linitial(opexpr->args); + tle = makeTargetEntry(expr, + i, + NULL, + false); + lefttlist = lappend(lefttlist, tle); + + /* Process righthand argument */ + expr = (Expr *) lsecond(opexpr->args); + tle = makeTargetEntry(expr, + i, + NULL, + false); + righttlist = lappend(righttlist, tle); + + /* Lookup the equality function (potentially cross-type) */ + cross_eq_funcoids[i - 1] = opexpr->opfuncid; + fmgr_info(opexpr->opfuncid, &sstate->cur_eq_funcs[i - 1]); + fmgr_info_set_expr((Node *) opexpr, &sstate->cur_eq_funcs[i - 1]); + + /* Look up the equality function for the RHS type */ + if (!get_compatible_hash_operators(opexpr->opno, + NULL, &rhs_eq_oper)) + elog(ERROR, "could not find compatible hash operator for operator %u", + 
opexpr->opno); + sstate->tab_eq_funcoids[i - 1] = get_opcode(rhs_eq_oper); + fmgr_info(sstate->tab_eq_funcoids[i - 1], + &sstate->tab_eq_funcs[i - 1]); + + /* Lookup the associated hash functions */ + if (!get_op_hash_functions(opexpr->opno, + &left_hashfn, &right_hashfn)) + elog(ERROR, "could not find hash function for hash operator %u", + opexpr->opno); + fmgr_info(left_hashfn, &sstate->lhs_hash_funcs[i - 1]); + fmgr_info(right_hashfn, &sstate->tab_hash_funcs[i - 1]); + + /* Set collation */ + sstate->tab_collations[i - 1] = opexpr->inputcollid; + + /* keyColIdx is just column numbers 1..n */ + sstate->keyColIdx[i - 1] = i; + + i++; + } + + /* + * Construct tupdescs, slots and projection nodes for left and right + * sides. The lefthand expressions will be evaluated in the parent + * plan node's exprcontext, which we don't have access to here. + * Fortunately we can just pass NULL for now and fill it in later + * (hack alert!). The righthand expressions will be evaluated in our + * own innerecontext. + */ + tupDescLeft = ExecTypeFromTL(lefttlist); + slot = ExecInitExtraTupleSlot(estate, tupDescLeft, &TTSOpsVirtual); + sstate->projLeft = ExecBuildProjectionInfo(lefttlist, + NULL, + slot, + parent, + NULL); + + sstate->descRight = tupDescRight = ExecTypeFromTL(righttlist); + slot = ExecInitExtraTupleSlot(estate, tupDescRight, &TTSOpsVirtual); + sstate->projRight = ExecBuildProjectionInfo(righttlist, + sstate->innerecontext, + slot, + sstate->planstate, + NULL); + + /* + * Create comparator for lookups of rows in the table (potentially + * cross-type comparisons). + */ + sstate->cur_eq_comp = ExecBuildGroupingEqual(tupDescLeft, tupDescRight, + &TTSOpsVirtual, &TTSOpsMinimalTuple, + ncols, + sstate->keyColIdx, + cross_eq_funcoids, + sstate->tab_collations, + parent); + } + + return sstate; +} + +/* ---------------------------------------------------------------- + * ExecSetParamPlan + * + * Executes a subplan and sets its output parameters. + * + * This is called from ExecEvalParamExec() when the value of a PARAM_EXEC + * parameter is requested and the param's execPlan field is set (indicating + * that the param has not yet been evaluated). This allows lazy evaluation + * of initplans: we don't run the subplan until/unless we need its output. + * Note that this routine MUST clear the execPlan fields of the plan's + * output parameters after evaluating them! + * + * The results of this function are stored in the EState associated with the + * ExprContext (particularly, its ecxt_param_exec_vals); any pass-by-ref + * result Datums are allocated in the EState's per-query memory. The passed + * econtext can be any ExprContext belonging to that EState; which one is + * important only to the extent that the ExprContext's per-tuple memory + * context is used to evaluate any parameters passed down to the subplan. + * (Thus in principle, the shorter-lived the ExprContext the better, since + * that data isn't needed after we return. In practice, because initplan + * parameters are never more complex than Vars, Aggrefs, etc, evaluating them + * currently never leaks any memory anyway.) 
+ * ---------------------------------------------------------------- + */ +void +ExecSetParamPlan(SubPlanState *node, ExprContext *econtext) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + SubLinkType subLinkType = subplan->subLinkType; + EState *estate = planstate->state; + ScanDirection dir = estate->es_direction; + MemoryContext oldcontext; + TupleTableSlot *slot; + ListCell *pvar; + ListCell *l; + bool found = false; + ArrayBuildStateAny *astate = NULL; + + if (subLinkType == ANY_SUBLINK || + subLinkType == ALL_SUBLINK) + elog(ERROR, "ANY/ALL subselect unsupported as initplan"); + if (subLinkType == CTE_SUBLINK) + elog(ERROR, "CTE subplans should not be executed via ExecSetParamPlan"); + + /* + * Enforce forward scan direction regardless of caller. It's hard but not + * impossible to get here in backward scan, so make it work anyway. + */ + estate->es_direction = ForwardScanDirection; + + /* Initialize ArrayBuildStateAny in caller's context, if needed */ + if (subLinkType == ARRAY_SUBLINK) + astate = initArrayResultAny(subplan->firstColType, + CurrentMemoryContext, true); + + /* + * Must switch to per-query memory context. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * Set Params of this plan from parent plan correlation values. (Any + * calculation we have to do is done in the parent econtext, since the + * Param values don't need to have per-query lifetime.) Currently, we + * expect only MULTIEXPR_SUBLINK plans to have any correlation values. + */ + Assert(subplan->parParam == NIL || subLinkType == MULTIEXPR_SUBLINK); + Assert(list_length(subplan->parParam) == list_length(node->args)); + + forboth(l, subplan->parParam, pvar, node->args) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar), + econtext, + &(prm->isnull)); + planstate->chgParam = bms_add_member(planstate->chgParam, paramid); + } + + /* + * Run the plan. (If it needs to be rescanned, the first ExecProcNode + * call will take care of that.) + */ + for (slot = ExecProcNode(planstate); + !TupIsNull(slot); + slot = ExecProcNode(planstate)) + { + TupleDesc tdesc = slot->tts_tupleDescriptor; + int i = 1; + + if (subLinkType == EXISTS_SUBLINK) + { + /* There can be only one setParam... */ + int paramid = linitial_int(subplan->setParam); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = BoolGetDatum(true); + prm->isnull = false; + found = true; + break; + } + + if (subLinkType == ARRAY_SUBLINK) + { + Datum dvalue; + bool disnull; + + found = true; + /* stash away current value */ + Assert(subplan->firstColType == TupleDescAttr(tdesc, 0)->atttypid); + dvalue = slot_getattr(slot, 1, &disnull); + astate = accumArrayResultAny(astate, dvalue, disnull, + subplan->firstColType, oldcontext); + /* keep scanning subplan to collect all values */ + continue; + } + + if (found && + (subLinkType == EXPR_SUBLINK || + subLinkType == MULTIEXPR_SUBLINK || + subLinkType == ROWCOMPARE_SUBLINK)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); + + found = true; + + /* + * We need to copy the subplan's tuple into our own context, in case + * any of the params are pass-by-ref type --- the pointers stored in + * the param structs will point at this copied tuple! 
node->curTuple + * keeps track of the copied tuple for eventual freeing. + */ + if (node->curTuple) + heap_freetuple(node->curTuple); + node->curTuple = ExecCopySlotHeapTuple(slot); + + /* + * Now set all the setParam params from the columns of the tuple + */ + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = heap_getattr(node->curTuple, i, tdesc, + &(prm->isnull)); + i++; + } + } + + if (subLinkType == ARRAY_SUBLINK) + { + /* There can be only one setParam... */ + int paramid = linitial_int(subplan->setParam); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + /* + * We build the result array in query context so it won't disappear; + * to avoid leaking memory across repeated calls, we have to remember + * the latest value, much as for curTuple above. + */ + if (node->curArray != PointerGetDatum(NULL)) + pfree(DatumGetPointer(node->curArray)); + node->curArray = makeArrayResultAny(astate, + econtext->ecxt_per_query_memory, + true); + prm->execPlan = NULL; + prm->value = node->curArray; + prm->isnull = false; + } + else if (!found) + { + if (subLinkType == EXISTS_SUBLINK) + { + /* There can be only one setParam... */ + int paramid = linitial_int(subplan->setParam); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = BoolGetDatum(false); + prm->isnull = false; + } + else + { + /* For other sublink types, set all the output params to NULL */ + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = (Datum) 0; + prm->isnull = true; + } + } + } + + MemoryContextSwitchTo(oldcontext); + + /* restore scan direction */ + estate->es_direction = dir; +} + +/* + * ExecSetParamPlanMulti + * + * Apply ExecSetParamPlan to evaluate any not-yet-evaluated initplan output + * parameters whose ParamIDs are listed in "params". Any listed params that + * are not initplan outputs are ignored. + * + * As with ExecSetParamPlan, any ExprContext belonging to the current EState + * can be used, but in principle a shorter-lived ExprContext is better than a + * longer-lived one. + */ +void +ExecSetParamPlanMulti(const Bitmapset *params, ExprContext *econtext) +{ + int paramid; + + paramid = -1; + while ((paramid = bms_next_member(params, paramid)) >= 0) + { + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + if (prm->execPlan != NULL) + { + /* Parameter not evaluated yet, so go do it */ + ExecSetParamPlan(prm->execPlan, econtext); + /* ExecSetParamPlan should have processed this param... */ + Assert(prm->execPlan == NULL); + } + } +} + +/* + * Mark an initplan as needing recalculation + */ +void +ExecReScanSetParamPlan(SubPlanState *node, PlanState *parent) +{ + PlanState *planstate = node->planstate; + SubPlan *subplan = node->subplan; + EState *estate = parent->state; + ListCell *l; + + /* sanity checks */ + if (subplan->parParam != NIL) + elog(ERROR, "direct correlated subquery unsupported as initplan"); + if (subplan->setParam == NIL) + elog(ERROR, "setParam list of initplan is empty"); + if (bms_is_empty(planstate->plan->extParam)) + elog(ERROR, "extParam set of initplan is empty"); + + /* + * Don't actually re-scan: it'll happen inside ExecSetParamPlan if needed. + */ + + /* + * Mark this subplan's output parameters as needing recalculation. 
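+ * (Re-setting prm->execPlan to this SubPlanState is what makes the next
+ * evaluation of the corresponding Param re-run the subplan via
+ * ExecSetParamPlan.)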
+ * + * CTE subplans are never executed via parameter recalculation; instead + * they get run when called by nodeCtescan.c. So don't mark the output + * parameter of a CTE subplan as dirty, but do set the chgParam bit for it + * so that dependent plan nodes will get told to rescan. + */ + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(estate->es_param_exec_vals[paramid]); + + if (subplan->subLinkType != CTE_SUBLINK) + prm->execPlan = node; + + parent->chgParam = bms_add_member(parent->chgParam, paramid); + } +} diff --git a/src/backend/executor/nodeSubqueryscan.c b/src/backend/executor/nodeSubqueryscan.c new file mode 100644 index 0000000..c09f628 --- /dev/null +++ b/src/backend/executor/nodeSubqueryscan.c @@ -0,0 +1,213 @@ +/*------------------------------------------------------------------------- + * + * nodeSubqueryscan.c + * Support routines for scanning subqueries (subselects in rangetable). + * + * This is just enough different from sublinks (nodeSubplan.c) to mean that + * we need two sets of code. Ought to look at trying to unify the cases. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSubqueryscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecSubqueryScan scans a subquery. + * ExecSubqueryNext retrieve next tuple in sequential order. + * ExecInitSubqueryScan creates and initializes a subqueryscan node. + * ExecEndSubqueryScan releases any storage allocated. + * ExecReScanSubqueryScan rescans the relation + * + */ +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeSubqueryscan.h" + +static TupleTableSlot *SubqueryNext(SubqueryScanState *node); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ +/* ---------------------------------------------------------------- + * SubqueryNext + * + * This is a workhorse for ExecSubqueryScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SubqueryNext(SubqueryScanState *node) +{ + TupleTableSlot *slot; + + /* + * Get the next tuple from the sub-query. + */ + slot = ExecProcNode(node->subplan); + + /* + * We just return the subplan's result slot, rather than expending extra + * cycles for ExecCopySlot(). (Our own ScanTupleSlot is used only for + * EvalPlanQual rechecks.) + */ + return slot; +} + +/* + * SubqueryRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SubqueryRecheck(SubqueryScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecSubqueryScan(node) + * + * Scans the subquery sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
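+ *
+ * (A SubqueryScan node appears, for example, above a sub-SELECT in FROM
+ * that the planner could not flatten into the parent query.)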
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSubqueryScan(PlanState *pstate) +{ + SubqueryScanState *node = castNode(SubqueryScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) SubqueryNext, + (ExecScanRecheckMtd) SubqueryRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitSubqueryScan + * ---------------------------------------------------------------- + */ +SubqueryScanState * +ExecInitSubqueryScan(SubqueryScan *node, EState *estate, int eflags) +{ + SubqueryScanState *subquerystate; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* SubqueryScan should not have any "normal" children */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + subquerystate = makeNode(SubqueryScanState); + subquerystate->ss.ps.plan = (Plan *) node; + subquerystate->ss.ps.state = estate; + subquerystate->ss.ps.ExecProcNode = ExecSubqueryScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &subquerystate->ss.ps); + + /* + * initialize subquery + */ + subquerystate->subplan = ExecInitNode(node->subplan, estate, eflags); + + /* + * Initialize scan slot and type (needed by ExecAssignScanProjectionInfo) + */ + ExecInitScanTupleSlot(estate, &subquerystate->ss, + ExecGetResultType(subquerystate->subplan), + ExecGetResultSlotOps(subquerystate->subplan, NULL)); + + /* + * The slot used as the scantuple isn't the slot above (outside of EPQ), + * but the one from the node below. + */ + subquerystate->ss.ps.scanopsset = true; + subquerystate->ss.ps.scanops = ExecGetResultSlotOps(subquerystate->subplan, + &subquerystate->ss.ps.scanopsfixed); + subquerystate->ss.ps.resultopsset = true; + subquerystate->ss.ps.resultops = subquerystate->ss.ps.scanops; + subquerystate->ss.ps.resultopsfixed = subquerystate->ss.ps.scanopsfixed; + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&subquerystate->ss.ps); + ExecAssignScanProjectionInfo(&subquerystate->ss); + + /* + * initialize child expressions + */ + subquerystate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) subquerystate); + + return subquerystate; +} + +/* ---------------------------------------------------------------- + * ExecEndSubqueryScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndSubqueryScan(SubqueryScanState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the upper tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close down subquery + */ + ExecEndNode(node->subplan); +} + +/* ---------------------------------------------------------------- + * ExecReScanSubqueryScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanSubqueryScan(SubqueryScanState *node) +{ + ExecScanReScan(&node->ss); + + /* + * ExecReScan doesn't know about my subplan, so I have to do + * changed-parameter signaling myself. This is just as well, because the + * subplan has its own memory context in which its chgParam state lives. 
+ */ + if (node->ss.ps.chgParam != NULL) + UpdateChangedParamSet(node->subplan, node->ss.ps.chgParam); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->subplan->chgParam == NULL) + ExecReScan(node->subplan); +} diff --git a/src/backend/executor/nodeTableFuncscan.c b/src/backend/executor/nodeTableFuncscan.c new file mode 100644 index 0000000..4d7eca4 --- /dev/null +++ b/src/backend/executor/nodeTableFuncscan.c @@ -0,0 +1,523 @@ +/*------------------------------------------------------------------------- + * + * nodeTableFuncscan.c + * Support routines for scanning RangeTableFunc (XMLTABLE like functions). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTableFuncscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecTableFuncscan scans a function. + * ExecFunctionNext retrieve next tuple in sequential order. + * ExecInitTableFuncscan creates and initializes a TableFuncscan node. + * ExecEndTableFuncscan releases any storage allocated. + * ExecReScanTableFuncscan rescans the function + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeTableFuncscan.h" +#include "executor/tablefunc.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/xml.h" + +static TupleTableSlot *TableFuncNext(TableFuncScanState *node); +static bool TableFuncRecheck(TableFuncScanState *node, TupleTableSlot *slot); + +static void tfuncFetchRows(TableFuncScanState *tstate, ExprContext *econtext); +static void tfuncInitialize(TableFuncScanState *tstate, ExprContext *econtext, Datum doc); +static void tfuncLoadRows(TableFuncScanState *tstate, ExprContext *econtext); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ +/* ---------------------------------------------------------------- + * TableFuncNext + * + * This is a workhorse for ExecTableFuncscan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TableFuncNext(TableFuncScanState *node) +{ + TupleTableSlot *scanslot; + + scanslot = node->ss.ss_ScanTupleSlot; + + /* + * If first time through, read all tuples from function and put them in a + * tuplestore. Subsequent calls just fetch tuples from tuplestore. + */ + if (node->tupstore == NULL) + tfuncFetchRows(node, node->ss.ps.ps_ExprContext); + + /* + * Get the next tuple from tuplestore. + */ + (void) tuplestore_gettupleslot(node->tupstore, + true, + false, + scanslot); + return scanslot; +} + +/* + * TableFuncRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TableFuncRecheck(TableFuncScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecTableFuncscan(node) + * + * Scans the function sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
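+ *
+ * (Illustrative example, not part of this file: a query such as
+ *		SELECT t.* FROM xmldata,
+ *			XMLTABLE('/rows/row' PASSING data COLUMNS id int) AS t
+ * reaches the executor as a TableFuncScan handled here; per the note below,
+ * XMLTABLE is currently the only such table function.)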
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTableFuncScan(PlanState *pstate) +{ + TableFuncScanState *node = castNode(TableFuncScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TableFuncNext, + (ExecScanRecheckMtd) TableFuncRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitTableFuncscan + * ---------------------------------------------------------------- + */ +TableFuncScanState * +ExecInitTableFuncScan(TableFuncScan *node, EState *estate, int eflags) +{ + TableFuncScanState *scanstate; + TableFunc *tf = node->tablefunc; + TupleDesc tupdesc; + int i; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * TableFuncscan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new ScanState for node + */ + scanstate = makeNode(TableFuncScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecTableFuncScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * initialize source tuple type + */ + tupdesc = BuildDescFromLists(tf->colnames, + tf->coltypes, + tf->coltypmods, + tf->colcollations); + /* and the corresponding scan slot */ + ExecInitScanTupleSlot(estate, &scanstate->ss, tupdesc, + &TTSOpsMinimalTuple); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, &scanstate->ss.ps); + + /* Only XMLTABLE is supported currently */ + scanstate->routine = &XmlTableRoutine; + + scanstate->perTableCxt = + AllocSetContextCreate(CurrentMemoryContext, + "TableFunc per value context", + ALLOCSET_DEFAULT_SIZES); + scanstate->opaque = NULL; /* initialized at runtime */ + + scanstate->ns_names = tf->ns_names; + + scanstate->ns_uris = + ExecInitExprList(tf->ns_uris, (PlanState *) scanstate); + scanstate->docexpr = + ExecInitExpr((Expr *) tf->docexpr, (PlanState *) scanstate); + scanstate->rowexpr = + ExecInitExpr((Expr *) tf->rowexpr, (PlanState *) scanstate); + scanstate->colexprs = + ExecInitExprList(tf->colexprs, (PlanState *) scanstate); + scanstate->coldefexprs = + ExecInitExprList(tf->coldefexprs, (PlanState *) scanstate); + + scanstate->notnulls = tf->notnulls; + + /* these are allocated now and initialized later */ + scanstate->in_functions = palloc(sizeof(FmgrInfo) * tupdesc->natts); + scanstate->typioparams = palloc(sizeof(Oid) * tupdesc->natts); + + /* + * Fill in the necessary fmgr infos. + */ + for (i = 0; i < tupdesc->natts; i++) + { + Oid in_funcid; + + getTypeInputInfo(TupleDescAttr(tupdesc, i)->atttypid, + &in_funcid, &scanstate->typioparams[i]); + fmgr_info(in_funcid, &scanstate->in_functions[i]); + } + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndTableFuncscan + * + * frees any storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndTableFuncScan(TableFuncScanState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * Release tuplestore resources + */ + if (node->tupstore != NULL) + tuplestore_end(node->tupstore); + node->tupstore = NULL; +} + +/* ---------------------------------------------------------------- + * ExecReScanTableFuncscan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanTableFuncScan(TableFuncScanState *node) +{ + Bitmapset *chgparam = node->ss.ps.chgParam; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecScanReScan(&node->ss); + + /* + * Recompute when parameters are changed. + */ + if (chgparam) + { + if (node->tupstore != NULL) + { + tuplestore_end(node->tupstore); + node->tupstore = NULL; + } + } + + if (node->tupstore != NULL) + tuplestore_rescan(node->tupstore); +} + +/* ---------------------------------------------------------------- + * tfuncFetchRows + * + * Read rows from a TableFunc producer + * ---------------------------------------------------------------- + */ +static void +tfuncFetchRows(TableFuncScanState *tstate, ExprContext *econtext) +{ + const TableFuncRoutine *routine = tstate->routine; + MemoryContext oldcxt; + Datum value; + bool isnull; + + Assert(tstate->opaque == NULL); + + /* build tuplestore for the result */ + oldcxt = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + tstate->tupstore = tuplestore_begin_heap(false, false, work_mem); + + /* + * Each call to fetch a new set of rows - of which there may be very many + * if XMLTABLE is being used in a lateral join - will allocate a possibly + * substantial amount of memory, so we cannot use the per-query context + * here. perTableCxt now serves the same function as "argcontext" does in + * FunctionScan - a place to store per-one-call (i.e. one result table) + * lifetime data (as opposed to per-query or per-result-tuple). + */ + MemoryContextSwitchTo(tstate->perTableCxt); + + PG_TRY(); + { + routine->InitOpaque(tstate, + tstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor->natts); + + /* + * If evaluating the document expression returns NULL, the table + * expression is empty and we return immediately. + */ + value = ExecEvalExpr(tstate->docexpr, econtext, &isnull); + + if (!isnull) + { + /* otherwise, pass the document value to the table builder */ + tfuncInitialize(tstate, econtext, value); + + /* initialize ordinality counter */ + tstate->ordinal = 1; + + /* Load all rows into the tuplestore, and we're done */ + tfuncLoadRows(tstate, econtext); + } + } + PG_CATCH(); + { + if (tstate->opaque != NULL) + routine->DestroyOpaque(tstate); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* clean up and return to original memory context */ + + if (tstate->opaque != NULL) + { + routine->DestroyOpaque(tstate); + tstate->opaque = NULL; + } + + MemoryContextSwitchTo(oldcxt); + MemoryContextReset(tstate->perTableCxt); +} + +/* + * Fill in namespace declarations, the row filter, and column filters in a + * table expression builder context. 
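+ * For XMLTABLE these are the row-level XPath expression and the per-column
+ * PATH expressions (or, when a column has no PATH, the column name itself,
+ * as arranged below).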
+ */ +static void +tfuncInitialize(TableFuncScanState *tstate, ExprContext *econtext, Datum doc) +{ + const TableFuncRoutine *routine = tstate->routine; + TupleDesc tupdesc; + ListCell *lc1, + *lc2; + bool isnull; + int colno; + Datum value; + int ordinalitycol = + ((TableFuncScan *) (tstate->ss.ps.plan))->tablefunc->ordinalitycol; + + /* + * Install the document as a possibly-toasted Datum into the tablefunc + * context. + */ + routine->SetDocument(tstate, doc); + + /* Evaluate namespace specifications */ + forboth(lc1, tstate->ns_uris, lc2, tstate->ns_names) + { + ExprState *expr = (ExprState *) lfirst(lc1); + Value *ns_node = (Value *) lfirst(lc2); + char *ns_uri; + char *ns_name; + + value = ExecEvalExpr((ExprState *) expr, econtext, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("namespace URI must not be null"))); + ns_uri = TextDatumGetCString(value); + + /* DEFAULT is passed down to SetNamespace as NULL */ + ns_name = ns_node ? strVal(ns_node) : NULL; + + routine->SetNamespace(tstate, ns_name, ns_uri); + } + + /* Install the row filter expression into the table builder context */ + value = ExecEvalExpr(tstate->rowexpr, econtext, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("row filter expression must not be null"))); + + routine->SetRowFilter(tstate, TextDatumGetCString(value)); + + /* + * Install the column filter expressions into the table builder context. + * If an expression is given, use that; otherwise the column name itself + * is the column filter. + */ + colno = 0; + tupdesc = tstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + foreach(lc1, tstate->colexprs) + { + char *colfilter; + Form_pg_attribute att = TupleDescAttr(tupdesc, colno); + + if (colno != ordinalitycol) + { + ExprState *colexpr = lfirst(lc1); + + if (colexpr != NULL) + { + value = ExecEvalExpr(colexpr, econtext, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("column filter expression must not be null"), + errdetail("Filter for column \"%s\" is null.", + NameStr(att->attname)))); + colfilter = TextDatumGetCString(value); + } + else + colfilter = NameStr(att->attname); + + routine->SetColumnFilter(tstate, colfilter, colno); + } + + colno++; + } +} + +/* + * Load all the rows from the TableFunc table builder into a tuplestore. + */ +static void +tfuncLoadRows(TableFuncScanState *tstate, ExprContext *econtext) +{ + const TableFuncRoutine *routine = tstate->routine; + TupleTableSlot *slot = tstate->ss.ss_ScanTupleSlot; + TupleDesc tupdesc = slot->tts_tupleDescriptor; + Datum *values = slot->tts_values; + bool *nulls = slot->tts_isnull; + int natts = tupdesc->natts; + MemoryContext oldcxt; + int ordinalitycol; + + ordinalitycol = + ((TableFuncScan *) (tstate->ss.ps.plan))->tablefunc->ordinalitycol; + + /* + * We need a short-lived memory context that we can clean up each time + * around the loop, to avoid wasting space. Our default per-tuple context + * is fine for the job, since we won't have used it for anything yet in + * this tuple cycle. + */ + oldcxt = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Keep requesting rows from the table builder until there aren't any. 
+ */ + while (routine->FetchRow(tstate)) + { + ListCell *cell = list_head(tstate->coldefexprs); + int colno; + + CHECK_FOR_INTERRUPTS(); + + ExecClearTuple(tstate->ss.ss_ScanTupleSlot); + + /* + * Obtain the value of each column for this row, installing them into + * the slot; then add the tuple to the tuplestore. + */ + for (colno = 0; colno < natts; colno++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, colno); + + if (colno == ordinalitycol) + { + /* Fast path for ordinality column */ + values[colno] = Int32GetDatum(tstate->ordinal++); + nulls[colno] = false; + } + else + { + bool isnull; + + values[colno] = routine->GetValue(tstate, + colno, + att->atttypid, + att->atttypmod, + &isnull); + + /* No value? Evaluate and apply the default, if any */ + if (isnull && cell != NULL) + { + ExprState *coldefexpr = (ExprState *) lfirst(cell); + + if (coldefexpr != NULL) + values[colno] = ExecEvalExpr(coldefexpr, econtext, + &isnull); + } + + /* Verify a possible NOT NULL constraint */ + if (isnull && bms_is_member(colno, tstate->notnulls)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("null is not allowed in column \"%s\"", + NameStr(att->attname)))); + + nulls[colno] = isnull; + } + + /* advance list of default expressions */ + if (cell != NULL) + cell = lnext(tstate->coldefexprs, cell); + } + + tuplestore_putvalues(tstate->tupstore, tupdesc, values, nulls); + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + } + + MemoryContextSwitchTo(oldcxt); +} diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c new file mode 100644 index 0000000..2b0d205 --- /dev/null +++ b/src/backend/executor/nodeTidrangescan.c @@ -0,0 +1,413 @@ +/*------------------------------------------------------------------------- + * + * nodeTidrangescan.c + * Routines to support TID range scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTidrangescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "catalog/pg_operator.h" +#include "executor/execdebug.h" +#include "executor/nodeTidrangescan.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + + +#define IsCTIDVar(node) \ + ((node) != NULL && \ + IsA((node), Var) && \ + ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \ + ((Var *) (node))->varlevelsup == 0) + +typedef enum +{ + TIDEXPR_UPPER_BOUND, + TIDEXPR_LOWER_BOUND +} TidExprType; + +/* Upper or lower range bound for scan */ +typedef struct TidOpExpr +{ + TidExprType exprtype; /* type of op; lower or upper */ + ExprState *exprstate; /* ExprState for a TID-yielding subexpr */ + bool inclusive; /* whether op is inclusive */ +} TidOpExpr; + +/* + * For the given 'expr', build and return an appropriate TidOpExpr taking into + * account the expr's operator and operand order. 
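+ *
+ * For example (illustrative): in "ctid < '(10,1)'" the CTID Var is the left
+ * operand, so the right operand supplies an upper bound; in
+ * "'(10,1)' > ctid" the operands are swapped and the same upper bound is
+ * obtained by inverting the interpretation of the operator.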
+ */ +static TidOpExpr * +MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate) +{ + Node *arg1 = get_leftop((Expr *) expr); + Node *arg2 = get_rightop((Expr *) expr); + ExprState *exprstate = NULL; + bool invert = false; + TidOpExpr *tidopexpr; + + if (IsCTIDVar(arg1)) + exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps); + else if (IsCTIDVar(arg2)) + { + exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps); + invert = true; + } + else + elog(ERROR, "could not identify CTID variable"); + + tidopexpr = (TidOpExpr *) palloc(sizeof(TidOpExpr)); + tidopexpr->inclusive = false; /* for now */ + + switch (expr->opno) + { + case TIDLessEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDLessOperator: + tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND; + break; + case TIDGreaterEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDGreaterOperator: + tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND; + break; + default: + elog(ERROR, "could not identify CTID operator"); + } + + tidopexpr->exprstate = exprstate; + + return tidopexpr; +} + +/* + * Extract the qual subexpressions that yield TIDs to search for, + * and compile them into ExprStates if they're ordinary expressions. + */ +static void +TidExprListCreate(TidRangeScanState *tidrangestate) +{ + TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan; + List *tidexprs = NIL; + ListCell *l; + + foreach(l, node->tidrangequals) + { + OpExpr *opexpr = lfirst(l); + TidOpExpr *tidopexpr; + + if (!IsA(opexpr, OpExpr)) + elog(ERROR, "could not identify CTID expression"); + + tidopexpr = MakeTidOpExpr(opexpr, tidrangestate); + tidexprs = lappend(tidexprs, tidopexpr); + } + + tidrangestate->trss_tidexprs = tidexprs; +} + +/* ---------------------------------------------------------------- + * TidRangeEval + * + * Compute and set node's block and offset range to scan by evaluating + * the trss_tidexprs. Returns false if we detect the range cannot + * contain any tuples. Returns true if it's possible for the range to + * contain tuples. + * ---------------------------------------------------------------- + */ +static bool +TidRangeEval(TidRangeScanState *node) +{ + ExprContext *econtext = node->ss.ps.ps_ExprContext; + ItemPointerData lowerBound; + ItemPointerData upperBound; + ListCell *l; + + /* + * Set the upper and lower bounds to the absolute limits of the range of + * the ItemPointer type. Below we'll try to narrow this range on either + * side by looking at the TidOpExprs. + */ + ItemPointerSet(&lowerBound, 0, 0); + ItemPointerSet(&upperBound, InvalidBlockNumber, PG_UINT16_MAX); + + foreach(l, node->trss_tidexprs) + { + TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l); + ItemPointer itemptr; + bool isNull; + + /* Evaluate this bound. */ + itemptr = (ItemPointer) + DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate, + econtext, + &isNull)); + + /* If the bound is NULL, *nothing* matches the qual. */ + if (isNull) + return false; + + if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND) + { + ItemPointerData lb; + + ItemPointerCopy(itemptr, &lb); + + /* + * Normalize non-inclusive ranges to become inclusive. The + * resulting ItemPointer here may not be a valid item pointer. 
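+ * (Its offset number may be zero or lie beyond the last line pointer of its
+ * block; that is harmless, since it is used only as a comparison bound and
+ * never dereferenced.)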
+ */ + if (!tidopexpr->inclusive) + ItemPointerInc(&lb); + + /* Check if we can narrow the range using this qual */ + if (ItemPointerCompare(&lb, &lowerBound) > 0) + ItemPointerCopy(&lb, &lowerBound); + } + + else if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND) + { + ItemPointerData ub; + + ItemPointerCopy(itemptr, &ub); + + /* + * Normalize non-inclusive ranges to become inclusive. The + * resulting ItemPointer here may not be a valid item pointer. + */ + if (!tidopexpr->inclusive) + ItemPointerDec(&ub); + + /* Check if we can narrow the range using this qual */ + if (ItemPointerCompare(&ub, &upperBound) < 0) + ItemPointerCopy(&ub, &upperBound); + } + } + + ItemPointerCopy(&lowerBound, &node->trss_mintid); + ItemPointerCopy(&upperBound, &node->trss_maxtid); + + return true; +} + +/* ---------------------------------------------------------------- + * TidRangeNext + * + * Retrieve a tuple from the TidRangeScan node's currentRelation + * using the TIDs in the TidRangeScanState information. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TidRangeNext(TidRangeScanState *node) +{ + TableScanDesc scandesc; + EState *estate; + ScanDirection direction; + TupleTableSlot *slot; + + /* + * extract necessary information from TID scan node + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + slot = node->ss.ss_ScanTupleSlot; + direction = estate->es_direction; + + if (!node->trss_inScan) + { + /* First time through, compute TID range to scan */ + if (!TidRangeEval(node)) + return NULL; + + if (scandesc == NULL) + { + scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation, + estate->es_snapshot, + &node->trss_mintid, + &node->trss_maxtid); + node->ss.ss_currentScanDesc = scandesc; + } + else + { + /* rescan with the updated TID range */ + table_rescan_tidrange(scandesc, &node->trss_mintid, + &node->trss_maxtid); + } + + node->trss_inScan = true; + } + + /* Fetch the next tuple. */ + if (!table_scan_getnextslot_tidrange(scandesc, direction, slot)) + { + node->trss_inScan = false; + ExecClearTuple(slot); + } + + return slot; +} + +/* + * TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot) +{ + return true; +} + +/* ---------------------------------------------------------------- + * ExecTidRangeScan(node) + * + * Scans the relation using tids and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for TID range scanning. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTidRangeScan(PlanState *pstate) +{ + TidRangeScanState *node = castNode(TidRangeScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TidRangeNext, + (ExecScanRecheckMtd) TidRangeRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanTidRangeScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanTidRangeScan(TidRangeScanState *node) +{ + /* mark scan as not in progress, and tid range list as not computed yet */ + node->trss_inScan = false; + + /* + * We must wait until TidRangeNext before calling table_rescan_tidrange. 
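+ * (The range bounds may depend on parameters whose new values are only
+ * evaluated when the scan is next executed.)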
+ */ + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecEndTidRangeScan + * + * Releases any storage allocated through C routines. + * Returns nothing. + * ---------------------------------------------------------------- + */ +void +ExecEndTidRangeScan(TidRangeScanState *node) +{ + TableScanDesc scan = node->ss.ss_currentScanDesc; + + if (scan != NULL) + table_endscan(scan); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecInitTidRangeScan + * + * Initializes the tid range scan's state information, creates + * scan keys, and opens the scan relation. + * + * Parameters: + * node: TidRangeScan node produced by the planner. + * estate: the execution state initialized in InitPlan. + * ---------------------------------------------------------------- + */ +TidRangeScanState * +ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags) +{ + TidRangeScanState *tidrangestate; + Relation currentRelation; + + /* + * create state structure + */ + tidrangestate = makeNode(TidRangeScanState); + tidrangestate->ss.ps.plan = (Plan *) node; + tidrangestate->ss.ps.state = estate; + tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &tidrangestate->ss.ps); + + /* + * mark scan as not in progress, and TID range as not computed yet + */ + tidrangestate->trss_inScan = false; + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + tidrangestate->ss.ss_currentRelation = currentRelation; + tidrangestate->ss.ss_currentScanDesc = NULL; /* no table scan here */ + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &tidrangestate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&tidrangestate->ss.ps); + ExecAssignScanProjectionInfo(&tidrangestate->ss); + + /* + * initialize child expressions + */ + tidrangestate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate); + + TidExprListCreate(tidrangestate); + + /* + * all done. + */ + return tidrangestate; +} diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c new file mode 100644 index 0000000..48c3737 --- /dev/null +++ b/src/backend/executor/nodeTidscan.c @@ -0,0 +1,558 @@ +/*------------------------------------------------------------------------- + * + * nodeTidscan.c + * Routines to support direct tid scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTidscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * + * ExecTidScan scans a relation using tids + * ExecInitTidScan creates and initializes state info. + * ExecReScanTidScan rescans the tid relation. + * ExecEndTidScan releases all storage. 
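+ *
+ * (A TID scan is chosen when the quals pin ctid to specific values, for
+ * example "WHERE ctid = '(0,1)'", "WHERE ctid = ANY (...)", or
+ * WHERE CURRENT OF.)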
+ */ +#include "postgres.h" + +#include "access/sysattr.h" +#include "access/tableam.h" +#include "catalog/pg_type.h" +#include "executor/execdebug.h" +#include "executor/nodeTidscan.h" +#include "lib/qunique.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/array.h" +#include "utils/rel.h" + + +#define IsCTIDVar(node) \ + ((node) != NULL && \ + IsA((node), Var) && \ + ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \ + ((Var *) (node))->varlevelsup == 0) + +/* one element in tss_tidexprs */ +typedef struct TidExpr +{ + ExprState *exprstate; /* ExprState for a TID-yielding subexpr */ + bool isarray; /* if true, it yields tid[] not just tid */ + CurrentOfExpr *cexpr; /* alternatively, we can have CURRENT OF */ +} TidExpr; + +static void TidExprListCreate(TidScanState *tidstate); +static void TidListEval(TidScanState *tidstate); +static int itemptr_comparator(const void *a, const void *b); +static TupleTableSlot *TidNext(TidScanState *node); + + +/* + * Extract the qual subexpressions that yield TIDs to search for, + * and compile them into ExprStates if they're ordinary expressions. + * + * CURRENT OF is a special case that we can't compile usefully; + * just drop it into the TidExpr list as-is. + */ +static void +TidExprListCreate(TidScanState *tidstate) +{ + TidScan *node = (TidScan *) tidstate->ss.ps.plan; + ListCell *l; + + tidstate->tss_tidexprs = NIL; + tidstate->tss_isCurrentOf = false; + + foreach(l, node->tidquals) + { + Expr *expr = (Expr *) lfirst(l); + TidExpr *tidexpr = (TidExpr *) palloc0(sizeof(TidExpr)); + + if (is_opclause(expr)) + { + Node *arg1; + Node *arg2; + + arg1 = get_leftop(expr); + arg2 = get_rightop(expr); + if (IsCTIDVar(arg1)) + tidexpr->exprstate = ExecInitExpr((Expr *) arg2, + &tidstate->ss.ps); + else if (IsCTIDVar(arg2)) + tidexpr->exprstate = ExecInitExpr((Expr *) arg1, + &tidstate->ss.ps); + else + elog(ERROR, "could not identify CTID variable"); + tidexpr->isarray = false; + } + else if (expr && IsA(expr, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saex = (ScalarArrayOpExpr *) expr; + + Assert(IsCTIDVar(linitial(saex->args))); + tidexpr->exprstate = ExecInitExpr(lsecond(saex->args), + &tidstate->ss.ps); + tidexpr->isarray = true; + } + else if (expr && IsA(expr, CurrentOfExpr)) + { + CurrentOfExpr *cexpr = (CurrentOfExpr *) expr; + + tidexpr->cexpr = cexpr; + tidstate->tss_isCurrentOf = true; + } + else + elog(ERROR, "could not identify CTID expression"); + + tidstate->tss_tidexprs = lappend(tidstate->tss_tidexprs, tidexpr); + } + + /* CurrentOfExpr could never appear OR'd with something else */ + Assert(list_length(tidstate->tss_tidexprs) == 1 || + !tidstate->tss_isCurrentOf); +} + +/* + * Compute the list of TIDs to be visited, by evaluating the expressions + * for them. + * + * (The result is actually an array, not a list.) + */ +static void +TidListEval(TidScanState *tidstate) +{ + ExprContext *econtext = tidstate->ss.ps.ps_ExprContext; + TableScanDesc scan; + ItemPointerData *tidList; + int numAllocTids; + int numTids; + ListCell *l; + + /* + * Start scan on-demand - initializing a scan isn't free (e.g. heap stats + * the size of the table), so it makes sense to delay that until needed - + * the node might never get executed. 
+ */ + if (tidstate->ss.ss_currentScanDesc == NULL) + tidstate->ss.ss_currentScanDesc = + table_beginscan_tid(tidstate->ss.ss_currentRelation, + tidstate->ss.ps.state->es_snapshot); + scan = tidstate->ss.ss_currentScanDesc; + + /* + * We initialize the array with enough slots for the case that all quals + * are simple OpExprs or CurrentOfExprs. If there are any + * ScalarArrayOpExprs, we may have to enlarge the array. + */ + numAllocTids = list_length(tidstate->tss_tidexprs); + tidList = (ItemPointerData *) + palloc(numAllocTids * sizeof(ItemPointerData)); + numTids = 0; + + foreach(l, tidstate->tss_tidexprs) + { + TidExpr *tidexpr = (TidExpr *) lfirst(l); + ItemPointer itemptr; + bool isNull; + + if (tidexpr->exprstate && !tidexpr->isarray) + { + itemptr = (ItemPointer) + DatumGetPointer(ExecEvalExprSwitchContext(tidexpr->exprstate, + econtext, + &isNull)); + if (isNull) + continue; + + /* + * We silently discard any TIDs that the AM considers invalid + * (E.g. for heap, they could be out of range at the time of scan + * start. Since we hold at least AccessShareLock on the table, it + * won't be possible for someone to truncate away the blocks we + * intend to visit.). + */ + if (!table_tuple_tid_valid(scan, itemptr)) + continue; + + if (numTids >= numAllocTids) + { + numAllocTids *= 2; + tidList = (ItemPointerData *) + repalloc(tidList, + numAllocTids * sizeof(ItemPointerData)); + } + tidList[numTids++] = *itemptr; + } + else if (tidexpr->exprstate && tidexpr->isarray) + { + Datum arraydatum; + ArrayType *itemarray; + Datum *ipdatums; + bool *ipnulls; + int ndatums; + int i; + + arraydatum = ExecEvalExprSwitchContext(tidexpr->exprstate, + econtext, + &isNull); + if (isNull) + continue; + itemarray = DatumGetArrayTypeP(arraydatum); + deconstruct_array(itemarray, + TIDOID, sizeof(ItemPointerData), false, TYPALIGN_SHORT, + &ipdatums, &ipnulls, &ndatums); + if (numTids + ndatums > numAllocTids) + { + numAllocTids = numTids + ndatums; + tidList = (ItemPointerData *) + repalloc(tidList, + numAllocTids * sizeof(ItemPointerData)); + } + for (i = 0; i < ndatums; i++) + { + if (ipnulls[i]) + continue; + + itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]); + + if (!table_tuple_tid_valid(scan, itemptr)) + continue; + + tidList[numTids++] = *itemptr; + } + pfree(ipdatums); + pfree(ipnulls); + } + else + { + ItemPointerData cursor_tid; + + Assert(tidexpr->cexpr); + if (execCurrentOf(tidexpr->cexpr, econtext, + RelationGetRelid(tidstate->ss.ss_currentRelation), + &cursor_tid)) + { + if (numTids >= numAllocTids) + { + numAllocTids *= 2; + tidList = (ItemPointerData *) + repalloc(tidList, + numAllocTids * sizeof(ItemPointerData)); + } + tidList[numTids++] = cursor_tid; + } + } + } + + /* + * Sort the array of TIDs into order, and eliminate duplicates. + * Eliminating duplicates is necessary since we want OR semantics across + * the list. Sorting makes it easier to detect duplicates, and as a bonus + * ensures that we will visit the heap in the most efficient way. 
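+ *
+ * For example (illustrative): "WHERE ctid = '(0,2)' OR
+ * ctid = ANY ('{(0,1),(0,2)}')" should visit (0,1) and (0,2) exactly once
+ * each, in block/offset order.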
+ */ + if (numTids > 1) + { + /* CurrentOfExpr could never appear OR'd with something else */ + Assert(!tidstate->tss_isCurrentOf); + + qsort((void *) tidList, numTids, sizeof(ItemPointerData), + itemptr_comparator); + numTids = qunique(tidList, numTids, sizeof(ItemPointerData), + itemptr_comparator); + } + + tidstate->tss_TidList = tidList; + tidstate->tss_NumTids = numTids; + tidstate->tss_TidPtr = -1; +} + +/* + * qsort comparator for ItemPointerData items + */ +static int +itemptr_comparator(const void *a, const void *b) +{ + const ItemPointerData *ipa = (const ItemPointerData *) a; + const ItemPointerData *ipb = (const ItemPointerData *) b; + BlockNumber ba = ItemPointerGetBlockNumber(ipa); + BlockNumber bb = ItemPointerGetBlockNumber(ipb); + OffsetNumber oa = ItemPointerGetOffsetNumber(ipa); + OffsetNumber ob = ItemPointerGetOffsetNumber(ipb); + + if (ba < bb) + return -1; + if (ba > bb) + return 1; + if (oa < ob) + return -1; + if (oa > ob) + return 1; + return 0; +} + +/* ---------------------------------------------------------------- + * TidNext + * + * Retrieve a tuple from the TidScan node's currentRelation + * using the tids in the TidScanState information. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TidNext(TidScanState *node) +{ + EState *estate; + ScanDirection direction; + Snapshot snapshot; + TableScanDesc scan; + Relation heapRelation; + TupleTableSlot *slot; + ItemPointerData *tidList; + int numTids; + bool bBackward; + + /* + * extract necessary information from tid scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + snapshot = estate->es_snapshot; + heapRelation = node->ss.ss_currentRelation; + slot = node->ss.ss_ScanTupleSlot; + + /* + * First time through, compute the list of TIDs to be visited + */ + if (node->tss_TidList == NULL) + TidListEval(node); + + scan = node->ss.ss_currentScanDesc; + tidList = node->tss_TidList; + numTids = node->tss_NumTids; + + /* + * Initialize or advance scan position, depending on direction. + */ + bBackward = ScanDirectionIsBackward(direction); + if (bBackward) + { + if (node->tss_TidPtr < 0) + { + /* initialize for backward scan */ + node->tss_TidPtr = numTids - 1; + } + else + node->tss_TidPtr--; + } + else + { + if (node->tss_TidPtr < 0) + { + /* initialize for forward scan */ + node->tss_TidPtr = 0; + } + else + node->tss_TidPtr++; + } + + while (node->tss_TidPtr >= 0 && node->tss_TidPtr < numTids) + { + ItemPointerData tid = tidList[node->tss_TidPtr]; + + /* + * For WHERE CURRENT OF, the tuple retrieved from the cursor might + * since have been updated; if so, we should fetch the version that is + * current according to our snapshot. + */ + if (node->tss_isCurrentOf) + table_tuple_get_latest_tid(scan, &tid); + + if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot)) + return slot; + + /* Bad TID or failed snapshot qual; try next */ + if (bBackward) + node->tss_TidPtr--; + else + node->tss_TidPtr++; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * if we get here it means the tid scan failed so we are at the end of the + * scan.. + */ + return ExecClearTuple(slot); +} + +/* + * TidRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TidRecheck(TidScanState *node, TupleTableSlot *slot) +{ + /* + * XXX shouldn't we check here to make sure tuple matches TID list? In + * runtime-key case this is not certain, is it? However, in the WHERE + * CURRENT OF case it might not match anyway ... 
+ */ + return true; +} + + +/* ---------------------------------------------------------------- + * ExecTidScan(node) + * + * Scans the relation using tids and returns + * the next qualifying tuple in the direction specified. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for scanning so that the + * "cursor" is positioned before the first qualifying tuple. + * -- tss_TidPtr is -1. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTidScan(PlanState *pstate) +{ + TidScanState *node = castNode(TidScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TidNext, + (ExecScanRecheckMtd) TidRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanTidScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanTidScan(TidScanState *node) +{ + if (node->tss_TidList) + pfree(node->tss_TidList); + node->tss_TidList = NULL; + node->tss_NumTids = 0; + node->tss_TidPtr = -1; + + /* not really necessary, but seems good form */ + if (node->ss.ss_currentScanDesc) + table_rescan(node->ss.ss_currentScanDesc, NULL); + + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecEndTidScan + * + * Releases any storage allocated through C routines. + * Returns nothing. + * ---------------------------------------------------------------- + */ +void +ExecEndTidScan(TidScanState *node) +{ + if (node->ss.ss_currentScanDesc) + table_endscan(node->ss.ss_currentScanDesc); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecInitTidScan + * + * Initializes the tid scan's state information, creates + * scan keys, and opens the base and tid relations. + * + * Parameters: + * node: TidScan node produced by the planner. + * estate: the execution state initialized in InitPlan. + * ---------------------------------------------------------------- + */ +TidScanState * +ExecInitTidScan(TidScan *node, EState *estate, int eflags) +{ + TidScanState *tidstate; + Relation currentRelation; + + /* + * create state structure + */ + tidstate = makeNode(TidScanState); + tidstate->ss.ps.plan = (Plan *) node; + tidstate->ss.ps.state = estate; + tidstate->ss.ps.ExecProcNode = ExecTidScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &tidstate->ss.ps); + + /* + * mark tid list as not computed yet + */ + tidstate->tss_TidList = NULL; + tidstate->tss_NumTids = 0; + tidstate->tss_TidPtr = -1; + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + tidstate->ss.ss_currentRelation = currentRelation; + tidstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &tidstate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. 
+ */ + ExecInitResultTypeTL(&tidstate->ss.ps); + ExecAssignScanProjectionInfo(&tidstate->ss); + + /* + * initialize child expressions + */ + tidstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) tidstate); + + TidExprListCreate(tidstate); + + /* + * all done. + */ + return tidstate; +} diff --git a/src/backend/executor/nodeUnique.c b/src/backend/executor/nodeUnique.c new file mode 100644 index 0000000..9214d6f --- /dev/null +++ b/src/backend/executor/nodeUnique.c @@ -0,0 +1,192 @@ +/*------------------------------------------------------------------------- + * + * nodeUnique.c + * Routines to handle unique'ing of queries where appropriate + * + * Unique is a very simple node type that just filters out duplicate + * tuples from a stream of sorted tuples from its subplan. It's essentially + * a dumbed-down form of Group: the duplicate-removal functionality is + * identical. However, Unique doesn't do projection nor qual checking, + * so it's marginally more efficient for cases where neither is needed. + * (It's debatable whether the savings justifies carrying two plan node + * types, though.) + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeUnique.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecUnique - generate a unique'd temporary relation + * ExecInitUnique - initialize node and subnodes + * ExecEndUnique - shutdown node and subnodes + * + * NOTES + * Assumes tuples returned from subplan arrive in + * sorted order. + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeUnique.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecUnique + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecUnique(PlanState *pstate) +{ + UniqueState *node = castNode(UniqueState, pstate); + ExprContext *econtext = node->ps.ps_ExprContext; + TupleTableSlot *resultTupleSlot; + TupleTableSlot *slot; + PlanState *outerPlan; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + outerPlan = outerPlanState(node); + resultTupleSlot = node->ps.ps_ResultTupleSlot; + + /* + * now loop, returning only non-duplicate tuples. We assume that the + * tuples arrive in sorted order so we can detect duplicates easily. The + * first tuple of each group is returned. + */ + for (;;) + { + /* + * fetch a tuple from the outer subplan + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + /* end of subplan, so we're done */ + ExecClearTuple(resultTupleSlot); + return NULL; + } + + /* + * Always return the first tuple from the subplan. + */ + if (TupIsNull(resultTupleSlot)) + break; + + /* + * Else test if the new tuple and the previously returned tuple match. + * If so then we loop back and fetch another new tuple from the + * subplan. + */ + econtext->ecxt_innertuple = slot; + econtext->ecxt_outertuple = resultTupleSlot; + if (!ExecQualAndReset(node->eqfunction, econtext)) + break; + } + + /* + * We have a new tuple different from the previous saved tuple (if any). + * Save it and return it. We must copy it because the source subplan + * won't guarantee that this source tuple is still accessible after + * fetching the next source tuple. 
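+ * (ExecCopySlot materializes the tuple into resultTupleSlot's own storage,
+ * so it remains valid across the next ExecProcNode call and serves as the
+ * comparison value for the following iteration.)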
+ */ + return ExecCopySlot(resultTupleSlot, slot); +} + +/* ---------------------------------------------------------------- + * ExecInitUnique + * + * This initializes the unique node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +UniqueState * +ExecInitUnique(Unique *node, EState *estate, int eflags) +{ + UniqueState *uniquestate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + uniquestate = makeNode(UniqueState); + uniquestate->ps.plan = (Plan *) node; + uniquestate->ps.state = estate; + uniquestate->ps.ExecProcNode = ExecUnique; + + /* + * create expression context + */ + ExecAssignExprContext(estate, &uniquestate->ps); + + /* + * then initialize outer plan + */ + outerPlanState(uniquestate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize result slot and type. Unique nodes do no projections, so + * initialize projection info for this node appropriately. + */ + ExecInitResultTupleSlotTL(&uniquestate->ps, &TTSOpsMinimalTuple); + uniquestate->ps.ps_ProjInfo = NULL; + + /* + * Precompute fmgr lookup data for inner loop + */ + uniquestate->eqfunction = + execTuplesMatchPrepare(ExecGetResultType(outerPlanState(uniquestate)), + node->numCols, + node->uniqColIdx, + node->uniqOperators, + node->uniqCollations, + &uniquestate->ps); + + return uniquestate; +} + +/* ---------------------------------------------------------------- + * ExecEndUnique + * + * This shuts down the subplan and frees resources allocated + * to this node. + * ---------------------------------------------------------------- + */ +void +ExecEndUnique(UniqueState *node) +{ + /* clean up tuple table */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + ExecFreeExprContext(&node->ps); + + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanUnique(UniqueState *node) +{ + /* must clear result tuple so first input tuple is returned */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeValuesscan.c b/src/backend/executor/nodeValuesscan.c new file mode 100644 index 0000000..5de1429 --- /dev/null +++ b/src/backend/executor/nodeValuesscan.c @@ -0,0 +1,361 @@ +/*------------------------------------------------------------------------- + * + * nodeValuesscan.c + * Support routines for scanning Values lists + * ("VALUES (...), (...), ..." in rangetable). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeValuesscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecValuesScan scans a values list. + * ExecValuesNext retrieve next tuple in sequential order. + * ExecInitValuesScan creates and initializes a valuesscan node. + * ExecEndValuesScan releases any storage allocated. 
+ * ExecReScanValuesScan rescans the values list + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeValuesscan.h" +#include "jit/jit.h" +#include "optimizer/clauses.h" +#include "utils/expandeddatum.h" + + +static TupleTableSlot *ValuesNext(ValuesScanState *node); + + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ValuesNext + * + * This is a workhorse for ExecValuesScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ValuesNext(ValuesScanState *node) +{ + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + ScanDirection direction; + int curr_idx; + + /* + * get information from the estate and scan state + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + slot = node->ss.ss_ScanTupleSlot; + econtext = node->rowcontext; + + /* + * Get the next tuple. Return NULL if no more tuples. + */ + if (ScanDirectionIsForward(direction)) + { + if (node->curr_idx < node->array_len) + node->curr_idx++; + } + else + { + if (node->curr_idx >= 0) + node->curr_idx--; + } + + /* + * Always clear the result slot; this is appropriate if we are at the end + * of the data, and if we're not, we still need it as the first step of + * the store-virtual-tuple protocol. It seems wise to clear the slot + * before we reset the context it might have pointers into. + */ + ExecClearTuple(slot); + + curr_idx = node->curr_idx; + if (curr_idx >= 0 && curr_idx < node->array_len) + { + List *exprlist = node->exprlists[curr_idx]; + List *exprstatelist = node->exprstatelists[curr_idx]; + MemoryContext oldContext; + Datum *values; + bool *isnull; + ListCell *lc; + int resind; + + /* + * Get rid of any prior cycle's leftovers. We use ReScanExprContext + * not just ResetExprContext because we want any registered shutdown + * callbacks to be called. + */ + ReScanExprContext(econtext); + + /* + * Do per-VALUES-row work in the per-tuple context. + */ + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Unless we already made the expression eval state for this row, + * build it in the econtext's per-tuple memory. This is a tad + * unusual, but we want to delete the eval state again when we move to + * the next row, to avoid growth of memory requirements over a long + * values list. For rows in which that won't work, we already built + * the eval state at plan startup. + */ + if (exprstatelist == NIL) + { + /* + * Pass parent as NULL, not my plan node, because we don't want + * anything in this transient state linking into permanent state. + * The only expression type that might wish to do so is a SubPlan, + * and we already checked that there aren't any. + * + * Note that passing parent = NULL also disables JIT compilation + * of the expressions, which is a win, because they're only going + * to be used once under normal circumstances. + */ + exprstatelist = ExecInitExprList(exprlist, NULL); + } + + /* parser should have checked all sublists are the same length */ + Assert(list_length(exprstatelist) == slot->tts_tupleDescriptor->natts); + + /* + * Compute the expressions and build a virtual result tuple. We + * already did ExecClearTuple(slot). 
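The clear / fill / store-virtual protocol referred to here is the standard way executor nodes build virtual tuples. A minimal sketch, using the real slot APIs but a hypothetical helper name and assuming a slot whose tuple descriptor has exactly two attributes of suitable types:

#include "postgres.h"
#include "executor/tuptable.h"

/*
 * Hypothetical helper (not part of this file): build a two-column virtual
 * tuple, following the same clear -> fill tts_values/tts_isnull -> store
 * protocol used by ValuesNext().
 */
static void
store_two_column_virtual_tuple(TupleTableSlot *slot, Datum col1,
							   Datum col2, bool col2_isnull)
{
	ExecClearTuple(slot);			/* step 1: empty the slot */

	slot->tts_values[0] = col1;		/* step 2: fill the value/null arrays */
	slot->tts_isnull[0] = false;
	slot->tts_values[1] = col2;
	slot->tts_isnull[1] = col2_isnull;

	ExecStoreVirtualTuple(slot);	/* step 3: mark the slot as holding a
									 * complete virtual tuple */
}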
+ */ + values = slot->tts_values; + isnull = slot->tts_isnull; + + resind = 0; + foreach(lc, exprstatelist) + { + ExprState *estate = (ExprState *) lfirst(lc); + Form_pg_attribute attr = TupleDescAttr(slot->tts_tupleDescriptor, + resind); + + values[resind] = ExecEvalExpr(estate, + econtext, + &isnull[resind]); + + /* + * We must force any R/W expanded datums to read-only state, in + * case they are multiply referenced in the plan node's output + * expressions, or in case we skip the output projection and the + * output column is multiply referenced in higher plan nodes. + */ + values[resind] = MakeExpandedObjectReadOnly(values[resind], + isnull[resind], + attr->attlen); + + resind++; + } + + MemoryContextSwitchTo(oldContext); + + /* + * And return the virtual tuple. + */ + ExecStoreVirtualTuple(slot); + } + + return slot; +} + +/* + * ValuesRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +ValuesRecheck(ValuesScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecValuesScan(node) + * + * Scans the values lists sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecValuesScan(PlanState *pstate) +{ + ValuesScanState *node = castNode(ValuesScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) ValuesNext, + (ExecScanRecheckMtd) ValuesRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitValuesScan + * ---------------------------------------------------------------- + */ +ValuesScanState * +ExecInitValuesScan(ValuesScan *node, EState *estate, int eflags) +{ + ValuesScanState *scanstate; + TupleDesc tupdesc; + ListCell *vtl; + int i; + PlanState *planstate; + + /* + * ValuesScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new ScanState for node + */ + scanstate = makeNode(ValuesScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecValuesScan; + + /* + * Miscellaneous initialization + */ + planstate = &scanstate->ss.ps; + + /* + * Create expression contexts. We need two, one for per-sublist + * processing and one for execScan.c to use for quals and projections. We + * cheat a little by using ExecAssignExprContext() to build both. + */ + ExecAssignExprContext(estate, planstate); + scanstate->rowcontext = planstate->ps_ExprContext; + ExecAssignExprContext(estate, planstate); + + /* + * Get info about values list, initialize scan slot with it. + */ + tupdesc = ExecTypeFromExprList((List *) linitial(node->values_lists)); + ExecInitScanTupleSlot(estate, &scanstate->ss, tupdesc, &TTSOpsVirtual); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + /* + * Other node-specific setup + */ + scanstate->curr_idx = -1; + scanstate->array_len = list_length(node->values_lists); + + /* + * Convert the list of expression sublists into an array for easier + * addressing at runtime. 
Also, detect whether any sublists contain + * SubPlans; for just those sublists, go ahead and do expression + * initialization. (This avoids problems with SubPlans wanting to connect + * themselves up to the outer plan tree. Notably, EXPLAIN won't see the + * subplans otherwise; also we will have troubles with dangling pointers + * and/or leaked resources if we try to handle SubPlans the same as + * simpler expressions.) + */ + scanstate->exprlists = (List **) + palloc(scanstate->array_len * sizeof(List *)); + scanstate->exprstatelists = (List **) + palloc0(scanstate->array_len * sizeof(List *)); + i = 0; + foreach(vtl, node->values_lists) + { + List *exprs = castNode(List, lfirst(vtl)); + + scanstate->exprlists[i] = exprs; + + /* + * We can avoid the cost of a contain_subplans() scan in the simple + * case where there are no SubPlans anywhere. + */ + if (estate->es_subplanstates && + contain_subplans((Node *) exprs)) + { + int saved_jit_flags; + + /* + * As these expressions are only used once, disable JIT for them. + * This is worthwhile because it's common to insert significant + * amounts of data via VALUES(). Note that this doesn't prevent + * use of JIT *within* a subplan, since that's initialized + * separately; this just affects the upper-level subexpressions. + */ + saved_jit_flags = estate->es_jit_flags; + estate->es_jit_flags = PGJIT_NONE; + + scanstate->exprstatelists[i] = ExecInitExprList(exprs, + &scanstate->ss.ps); + + estate->es_jit_flags = saved_jit_flags; + } + i++; + } + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndValuesScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndValuesScan(ValuesScanState *node) +{ + /* + * Free both exprcontexts + */ + ExecFreeExprContext(&node->ss.ps); + node->ss.ps.ps_ExprContext = node->rowcontext; + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanValuesScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanValuesScan(ValuesScanState *node) +{ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + node->curr_idx = -1; +} diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c new file mode 100644 index 0000000..f8ea9e9 --- /dev/null +++ b/src/backend/executor/nodeWindowAgg.c @@ -0,0 +1,3463 @@ +/*------------------------------------------------------------------------- + * + * nodeWindowAgg.c + * routines to handle WindowAgg nodes. + * + * A WindowAgg node evaluates "window functions" across suitable partitions + * of the input tuple set. Any one WindowAgg works for just a single window + * specification, though it can evaluate multiple window functions sharing + * identical window specifications. The input tuples are required to be + * delivered in sorted order, with the PARTITION BY columns (if any) as + * major sort keys and the ORDER BY columns (if any) as minor sort keys. + * (The planner generates a stack of WindowAggs with intervening Sort nodes + * as needed, if a query involves more than one window specification.) 
+ * + * Since window functions can require access to any or all of the rows in + * the current partition, we accumulate rows of the partition into a + * tuplestore. The window functions are called using the WindowObject API + * so that they can access those rows as needed. + * + * We also support using plain aggregate functions as window functions. + * For these, the regular Agg-node environment is emulated for each partition. + * As required by the SQL spec, the output represents the value of the + * aggregate function over all rows in the current row's window frame. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeWindowAgg.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_proc.h" +#include "executor/executor.h" +#include "executor/nodeWindowAgg.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" +#include "parser/parse_agg.h" +#include "parser/parse_coerce.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/expandeddatum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/regproc.h" +#include "utils/syscache.h" +#include "windowapi.h" + +/* + * All the window function APIs are called with this object, which is passed + * to window functions as fcinfo->context. + */ +typedef struct WindowObjectData +{ + NodeTag type; + WindowAggState *winstate; /* parent WindowAggState */ + List *argstates; /* ExprState trees for fn's arguments */ + void *localmem; /* WinGetPartitionLocalMemory's chunk */ + int markptr; /* tuplestore mark pointer for this fn */ + int readptr; /* tuplestore read pointer for this fn */ + int64 markpos; /* row that markptr is positioned on */ + int64 seekpos; /* row that readptr is positioned on */ +} WindowObjectData; + +/* + * We have one WindowStatePerFunc struct for each window function and + * window aggregate handled by this node. + */ +typedef struct WindowStatePerFuncData +{ + /* Links to WindowFunc expr and state nodes this working state is for */ + WindowFuncExprState *wfuncstate; + WindowFunc *wfunc; + + int numArguments; /* number of arguments */ + + FmgrInfo flinfo; /* fmgr lookup data for window function */ + + Oid winCollation; /* collation derived for window function */ + + /* + * We need the len and byval info for the result of each function in order + * to know how to copy/delete values. + */ + int16 resulttypeLen; + bool resulttypeByVal; + + bool plain_agg; /* is it just a plain aggregate function? */ + int aggno; /* if so, index of its WindowStatePerAggData */ + + WindowObject winobj; /* object used in window function API */ +} WindowStatePerFuncData; + +/* + * For plain aggregate window functions, we also have one of these. + */ +typedef struct WindowStatePerAggData +{ + /* Oids of transition functions */ + Oid transfn_oid; + Oid invtransfn_oid; /* may be InvalidOid */ + Oid finalfn_oid; /* may be InvalidOid */ + + /* + * fmgr lookup data for transition functions --- only valid when + * corresponding oid is not InvalidOid. Note in particular that fn_strict + * flags are kept here. 
+ */ + FmgrInfo transfn; + FmgrInfo invtransfn; + FmgrInfo finalfn; + + int numFinalArgs; /* number of arguments to pass to finalfn */ + + /* + * initial value from pg_aggregate entry + */ + Datum initValue; + bool initValueIsNull; + + /* + * cached value for current frame boundaries + */ + Datum resultValue; + bool resultValueIsNull; + + /* + * We need the len and byval info for the agg's input, result, and + * transition data types in order to know how to copy/delete values. + */ + int16 inputtypeLen, + resulttypeLen, + transtypeLen; + bool inputtypeByVal, + resulttypeByVal, + transtypeByVal; + + int wfuncno; /* index of associated WindowStatePerFuncData */ + + /* Context holding transition value and possibly other subsidiary data */ + MemoryContext aggcontext; /* may be private, or winstate->aggcontext */ + + /* Current transition value */ + Datum transValue; /* current transition value */ + bool transValueIsNull; + + int64 transValueCount; /* number of currently-aggregated rows */ + + /* Data local to eval_windowaggregates() */ + bool restart; /* need to restart this agg in this cycle? */ +} WindowStatePerAggData; + +static void initialize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate); +static void advance_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate); +static bool advance_windowaggregate_base(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate); +static void finalize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate, + Datum *result, bool *isnull); + +static void eval_windowaggregates(WindowAggState *winstate); +static void eval_windowfunction(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + Datum *result, bool *isnull); + +static void begin_partition(WindowAggState *winstate); +static void spool_tuples(WindowAggState *winstate, int64 pos); +static void release_partition(WindowAggState *winstate); + +static int row_is_in_frame(WindowAggState *winstate, int64 pos, + TupleTableSlot *slot); +static void update_frameheadpos(WindowAggState *winstate); +static void update_frametailpos(WindowAggState *winstate); +static void update_grouptailpos(WindowAggState *winstate); + +static WindowStatePerAggData *initialize_peragg(WindowAggState *winstate, + WindowFunc *wfunc, + WindowStatePerAgg peraggstate); +static Datum GetAggInitVal(Datum textInitVal, Oid transtype); + +static bool are_peers(WindowAggState *winstate, TupleTableSlot *slot1, + TupleTableSlot *slot2); +static bool window_gettupleslot(WindowObject winobj, int64 pos, + TupleTableSlot *slot); + + +/* + * initialize_windowaggregate + * parallel to initialize_aggregates in nodeAgg.c + */ +static void +initialize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate) +{ + MemoryContext oldContext; + + /* + * If we're using a private aggcontext, we may reset it here. But if the + * context is shared, we don't know which other aggregates may still need + * it, so we must leave it to the caller to reset at an appropriate time. 
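The code below copies the initial transition value into the per-aggregate context with datumCopy() so that it survives per-row resets. In isolation, that idiom looks like the following sketch (hypothetical helper name; the by-value flag and type length would normally come from the type's catalog entry and are simply parameters here):

#include "postgres.h"
#include "utils/datum.h"

/*
 * Hypothetical helper: copy a Datum into 'target' so it outlives resets of
 * the context it was computed in.  datumCopy() needs the type's by-value
 * and length properties, just as the executor code below passes
 * transtypeByVal/transtypeLen.
 */
static Datum
copy_datum_into(MemoryContext target, Datum value, bool typByVal, int typLen)
{
	MemoryContext oldcxt = MemoryContextSwitchTo(target);
	Datum		result = datumCopy(value, typByVal, typLen);

	MemoryContextSwitchTo(oldcxt);
	return result;
}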
+ */ + if (peraggstate->aggcontext != winstate->aggcontext) + MemoryContextResetAndDeleteChildren(peraggstate->aggcontext); + + if (peraggstate->initValueIsNull) + peraggstate->transValue = peraggstate->initValue; + else + { + oldContext = MemoryContextSwitchTo(peraggstate->aggcontext); + peraggstate->transValue = datumCopy(peraggstate->initValue, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + MemoryContextSwitchTo(oldContext); + } + peraggstate->transValueIsNull = peraggstate->initValueIsNull; + peraggstate->transValueCount = 0; + peraggstate->resultValue = (Datum) 0; + peraggstate->resultValueIsNull = true; +} + +/* + * advance_windowaggregate + * parallel to advance_aggregates in nodeAgg.c + */ +static void +advance_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + WindowFuncExprState *wfuncstate = perfuncstate->wfuncstate; + int numArguments = perfuncstate->numArguments; + Datum newVal; + ListCell *arg; + int i; + MemoryContext oldContext; + ExprContext *econtext = winstate->tmpcontext; + ExprState *filter = wfuncstate->aggfilter; + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* Skip anything FILTERed out */ + if (filter) + { + bool isnull; + Datum res = ExecEvalExpr(filter, econtext, &isnull); + + if (isnull || !DatumGetBool(res)) + { + MemoryContextSwitchTo(oldContext); + return; + } + } + + /* We start from 1, since the 0th arg will be the transition value */ + i = 1; + foreach(arg, wfuncstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + fcinfo->args[i].value = ExecEvalExpr(argstate, econtext, + &fcinfo->args[i].isnull); + i++; + } + + if (peraggstate->transfn.fn_strict) + { + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. Note transValueCount doesn't + * change either. + */ + for (i = 1; i <= numArguments; i++) + { + if (fcinfo->args[i].isnull) + { + MemoryContextSwitchTo(oldContext); + return; + } + } + + /* + * For strict transition functions with initial value NULL we use the + * first non-NULL input as the initial state. (We already checked + * that the agg's input type is binary-compatible with its transtype, + * so straight copy here is OK.) + * + * We must copy the datum into aggcontext if it is pass-by-ref. We do + * not need to pfree the old transValue, since it's NULL. + */ + if (peraggstate->transValueCount == 0 && peraggstate->transValueIsNull) + { + MemoryContextSwitchTo(peraggstate->aggcontext); + peraggstate->transValue = datumCopy(fcinfo->args[1].value, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + peraggstate->transValueIsNull = false; + peraggstate->transValueCount = 1; + MemoryContextSwitchTo(oldContext); + return; + } + + if (peraggstate->transValueIsNull) + { + /* + * Don't call a strict function with NULL inputs. Note it is + * possible to get here despite the above tests, if the transfn is + * strict *and* returned a NULL on a prior cycle. If that happens + * we will propagate the NULL all the way to the end. That can + * only happen if there's no inverse transition function, though, + * since we disallow transitions back to NULL when there is one. + */ + MemoryContextSwitchTo(oldContext); + Assert(!OidIsValid(peraggstate->invtransfn_oid)); + return; + } + } + + /* + * OK to call the transition function. Set winstate->curaggcontext while + * calling it, for possible use by AggCheckCallContext. 
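Setting curaggcontext is what lets a transition function discover its memory context through AggCheckCallContext(), whether it runs under nodeAgg or under this node. A minimal hypothetical C-language transition function is sketched below; its name and its SQL-level CREATE FUNCTION/CREATE AGGREGATE declarations are assumptions, not part of this patch, and the usual extension boilerplate (PG_MODULE_MAGIC) is omitted.

#include "postgres.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(my_int4_sum_trans);

/*
 * Hypothetical transition function: int8 state, int4 input, nulls handled
 * explicitly.  AggCheckCallContext() succeeds for both plain and window
 * aggregation, because nodeWindowAgg sets curaggcontext around the call as
 * noted in the comment above.
 */
Datum
my_int4_sum_trans(PG_FUNCTION_ARGS)
{
	MemoryContext aggcontext;
	int64		state;

	if (!AggCheckCallContext(fcinfo, &aggcontext))
		elog(ERROR, "my_int4_sum_trans called in non-aggregate context");

	state = PG_ARGISNULL(0) ? 0 : PG_GETARG_INT64(0);
	if (!PG_ARGISNULL(1))
		state += PG_GETARG_INT32(1);

	PG_RETURN_INT64(state);
}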
+ */ + InitFunctionCallInfoData(*fcinfo, &(peraggstate->transfn), + numArguments + 1, + perfuncstate->winCollation, + (void *) winstate, NULL); + fcinfo->args[0].value = peraggstate->transValue; + fcinfo->args[0].isnull = peraggstate->transValueIsNull; + winstate->curaggcontext = peraggstate->aggcontext; + newVal = FunctionCallInvoke(fcinfo); + winstate->curaggcontext = NULL; + + /* + * Moving-aggregate transition functions must not return null, see + * advance_windowaggregate_base(). + */ + if (fcinfo->isnull && OidIsValid(peraggstate->invtransfn_oid)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("moving-aggregate transition function must not return null"))); + + /* + * We must track the number of rows included in transValue, since to + * remove the last input, advance_windowaggregate_base() mustn't call the + * inverse transition function, but simply reset transValue back to its + * initial value. + */ + peraggstate->transValueCount++; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. Also, if transfn returned a + * pointer to a R/W expanded object that is already a child of the + * aggcontext, assume we can adopt that value without copying it. + */ + if (!peraggstate->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(peraggstate->transValue)) + { + if (!fcinfo->isnull) + { + MemoryContextSwitchTo(peraggstate->aggcontext); + if (DatumIsReadWriteExpandedObject(newVal, + false, + peraggstate->transtypeLen) && + MemoryContextGetParent(DatumGetEOHP(newVal)->eoh_context) == CurrentMemoryContext) + /* do nothing */ ; + else + newVal = datumCopy(newVal, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + } + if (!peraggstate->transValueIsNull) + { + if (DatumIsReadWriteExpandedObject(peraggstate->transValue, + false, + peraggstate->transtypeLen)) + DeleteExpandedObject(peraggstate->transValue); + else + pfree(DatumGetPointer(peraggstate->transValue)); + } + } + + MemoryContextSwitchTo(oldContext); + peraggstate->transValue = newVal; + peraggstate->transValueIsNull = fcinfo->isnull; +} + +/* + * advance_windowaggregate_base + * Remove the oldest tuple from an aggregation. + * + * This is very much like advance_windowaggregate, except that we will call + * the inverse transition function (which caller must have checked is + * available). + * + * Returns true if we successfully removed the current row from this + * aggregate, false if not (in the latter case, caller is responsible + * for cleaning up by restarting the aggregation). 
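The add/remove protocol implemented by this function can be pictured with a toy moving sum, deliberately kept outside the executor (plain C, invented names). Note the same special case: removing the last remaining row re-initializes the state rather than inverting.

#include <stdio.h>

/* Toy model of a moving aggregate: forward "add" step, invertible "remove" step. */
typedef struct MovingSum
{
	long	sum;		/* transition value */
	int		count;		/* number of currently-aggregated rows */
} MovingSum;

static void
moving_sum_init(MovingSum *st)
{
	st->sum = 0;
	st->count = 0;
}

static void
moving_sum_add(MovingSum *st, int x)	/* forward transition */
{
	st->sum += x;
	st->count++;
}

static void
moving_sum_remove(MovingSum *st, int x) /* inverse transition */
{
	if (st->count == 1)
	{
		moving_sum_init(st);	/* last row: re-initialize, don't invert */
		return;
	}
	st->sum -= x;
	st->count--;
}

int
main(void)
{
	int			data[] = {3, 1, 4, 1, 5};
	MovingSum	st;
	int			head = 0;

	moving_sum_init(&st);
	for (int tail = 0; tail < 5; tail++)
	{
		moving_sum_add(&st, data[tail]);
		while (tail - head + 1 > 3)		/* frame: ROWS 2 PRECEDING */
			moving_sum_remove(&st, data[head++]);
		printf("frame [%d..%d] sum=%ld\n", head, tail, st.sum);
	}
	return 0;
}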
+ */ +static bool +advance_windowaggregate_base(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + WindowFuncExprState *wfuncstate = perfuncstate->wfuncstate; + int numArguments = perfuncstate->numArguments; + Datum newVal; + ListCell *arg; + int i; + MemoryContext oldContext; + ExprContext *econtext = winstate->tmpcontext; + ExprState *filter = wfuncstate->aggfilter; + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* Skip anything FILTERed out */ + if (filter) + { + bool isnull; + Datum res = ExecEvalExpr(filter, econtext, &isnull); + + if (isnull || !DatumGetBool(res)) + { + MemoryContextSwitchTo(oldContext); + return true; + } + } + + /* We start from 1, since the 0th arg will be the transition value */ + i = 1; + foreach(arg, wfuncstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + fcinfo->args[i].value = ExecEvalExpr(argstate, econtext, + &fcinfo->args[i].isnull); + i++; + } + + if (peraggstate->invtransfn.fn_strict) + { + /* + * For a strict (inv)transfn, nothing happens when there's a NULL + * input; we just keep the prior transValue. Note transValueCount + * doesn't change either. + */ + for (i = 1; i <= numArguments; i++) + { + if (fcinfo->args[i].isnull) + { + MemoryContextSwitchTo(oldContext); + return true; + } + } + } + + /* There should still be an added but not yet removed value */ + Assert(peraggstate->transValueCount > 0); + + /* + * In moving-aggregate mode, the state must never be NULL, except possibly + * before any rows have been aggregated (which is surely not the case at + * this point). This restriction allows us to interpret a NULL result + * from the inverse function as meaning "sorry, can't do an inverse + * transition in this case". We already checked this in + * advance_windowaggregate, but just for safety, check again. + */ + if (peraggstate->transValueIsNull) + elog(ERROR, "aggregate transition value is NULL before inverse transition"); + + /* + * We mustn't use the inverse transition function to remove the last + * input. Doing so would yield a non-NULL state, whereas we should be in + * the initial state afterwards which may very well be NULL. So instead, + * we simply re-initialize the aggregate in this case. + */ + if (peraggstate->transValueCount == 1) + { + MemoryContextSwitchTo(oldContext); + initialize_windowaggregate(winstate, + &winstate->perfunc[peraggstate->wfuncno], + peraggstate); + return true; + } + + /* + * OK to call the inverse transition function. Set + * winstate->curaggcontext while calling it, for possible use by + * AggCheckCallContext. + */ + InitFunctionCallInfoData(*fcinfo, &(peraggstate->invtransfn), + numArguments + 1, + perfuncstate->winCollation, + (void *) winstate, NULL); + fcinfo->args[0].value = peraggstate->transValue; + fcinfo->args[0].isnull = peraggstate->transValueIsNull; + winstate->curaggcontext = peraggstate->aggcontext; + newVal = FunctionCallInvoke(fcinfo); + winstate->curaggcontext = NULL; + + /* + * If the function returns NULL, report failure, forcing a restart. + */ + if (fcinfo->isnull) + { + MemoryContextSwitchTo(oldContext); + return false; + } + + /* Update number of rows included in transValue */ + peraggstate->transValueCount--; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if invtransfn returned a pointer to its + * first input, we don't need to do anything. 
Also, if invtransfn + * returned a pointer to a R/W expanded object that is already a child of + * the aggcontext, assume we can adopt that value without copying it. + * + * Note: the checks for null values here will never fire, but it seems + * best to have this stanza look just like advance_windowaggregate. + */ + if (!peraggstate->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(peraggstate->transValue)) + { + if (!fcinfo->isnull) + { + MemoryContextSwitchTo(peraggstate->aggcontext); + if (DatumIsReadWriteExpandedObject(newVal, + false, + peraggstate->transtypeLen) && + MemoryContextGetParent(DatumGetEOHP(newVal)->eoh_context) == CurrentMemoryContext) + /* do nothing */ ; + else + newVal = datumCopy(newVal, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + } + if (!peraggstate->transValueIsNull) + { + if (DatumIsReadWriteExpandedObject(peraggstate->transValue, + false, + peraggstate->transtypeLen)) + DeleteExpandedObject(peraggstate->transValue); + else + pfree(DatumGetPointer(peraggstate->transValue)); + } + } + + MemoryContextSwitchTo(oldContext); + peraggstate->transValue = newVal; + peraggstate->transValueIsNull = fcinfo->isnull; + + return true; +} + +/* + * finalize_windowaggregate + * parallel to finalize_aggregate in nodeAgg.c + */ +static void +finalize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate, + Datum *result, bool *isnull) +{ + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * Apply the agg's finalfn if one is provided, else return transValue. + */ + if (OidIsValid(peraggstate->finalfn_oid)) + { + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + int numFinalArgs = peraggstate->numFinalArgs; + bool anynull; + int i; + + InitFunctionCallInfoData(fcinfodata.fcinfo, &(peraggstate->finalfn), + numFinalArgs, + perfuncstate->winCollation, + (void *) winstate, NULL); + fcinfo->args[0].value = + MakeExpandedObjectReadOnly(peraggstate->transValue, + peraggstate->transValueIsNull, + peraggstate->transtypeLen); + fcinfo->args[0].isnull = peraggstate->transValueIsNull; + anynull = peraggstate->transValueIsNull; + + /* Fill any remaining argument positions with nulls */ + for (i = 1; i < numFinalArgs; i++) + { + fcinfo->args[i].value = (Datum) 0; + fcinfo->args[i].isnull = true; + anynull = true; + } + + if (fcinfo->flinfo->fn_strict && anynull) + { + /* don't call a strict function with NULL inputs */ + *result = (Datum) 0; + *isnull = true; + } + else + { + winstate->curaggcontext = peraggstate->aggcontext; + *result = FunctionCallInvoke(fcinfo); + winstate->curaggcontext = NULL; + *isnull = fcinfo->isnull; + } + } + else + { + /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */ + *result = peraggstate->transValue; + *isnull = peraggstate->transValueIsNull; + } + + /* + * If result is pass-by-ref, make sure it is in the right context. + */ + if (!peraggstate->resulttypeByVal && !*isnull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*result))) + *result = datumCopy(*result, + peraggstate->resulttypeByVal, + peraggstate->resulttypeLen); + MemoryContextSwitchTo(oldContext); +} + +/* + * eval_windowaggregates + * evaluate plain aggregates being used as window functions + * + * This differs from nodeAgg.c in two ways. First, if the window's frame + * start position moves, we use the inverse transition function (if it exists) + * to remove rows from the transition value. 
And second, we expect to be + * able to call aggregate final functions repeatedly after aggregating more + * data onto the same transition value. This is not a behavior required by + * nodeAgg.c. + */ +static void +eval_windowaggregates(WindowAggState *winstate) +{ + WindowStatePerAgg peraggstate; + int wfuncno, + numaggs, + numaggs_restart, + i; + int64 aggregatedupto_nonrestarted; + MemoryContext oldContext; + ExprContext *econtext; + WindowObject agg_winobj; + TupleTableSlot *agg_row_slot; + TupleTableSlot *temp_slot; + + numaggs = winstate->numaggs; + if (numaggs == 0) + return; /* nothing to do */ + + /* final output execution is in ps_ExprContext */ + econtext = winstate->ss.ps.ps_ExprContext; + agg_winobj = winstate->agg_winobj; + agg_row_slot = winstate->agg_row_slot; + temp_slot = winstate->temp_slot_1; + + /* + * If the window's frame start clause is UNBOUNDED_PRECEDING and no + * exclusion clause is specified, then the window frame consists of a + * contiguous group of rows extending forward from the start of the + * partition, and rows only enter the frame, never exit it, as the current + * row advances forward. This makes it possible to use an incremental + * strategy for evaluating aggregates: we run the transition function for + * each row added to the frame, and run the final function whenever we + * need the current aggregate value. This is considerably more efficient + * than the naive approach of re-running the entire aggregate calculation + * for each current row. It does assume that the final function doesn't + * damage the running transition value, but we have the same assumption in + * nodeAgg.c too (when it rescans an existing hash table). + * + * If the frame start does sometimes move, we can still optimize as above + * whenever successive rows share the same frame head, but if the frame + * head moves beyond the previous head we try to remove those rows using + * the aggregate's inverse transition function. This function restores + * the aggregate's current state to what it would be if the removed row + * had never been aggregated in the first place. Inverse transition + * functions may optionally return NULL, indicating that the function was + * unable to remove the tuple from aggregation. If this happens, or if + * the aggregate doesn't have an inverse transition function at all, we + * must perform the aggregation all over again for all tuples within the + * new frame boundaries. + * + * If there's any exclusion clause, then we may have to aggregate over a + * non-contiguous set of rows, so we punt and recalculate for every row. + * (For some frame end choices, it might be that the frame is always + * contiguous anyway, but that's an optimization to investigate later.) + * + * In many common cases, multiple rows share the same frame and hence the + * same aggregate value. (In particular, if there's no ORDER BY in a RANGE + * window, then all rows are peers and so they all have window frame equal + * to the whole partition.) We optimize such cases by calculating the + * aggregate value once when we reach the first row of a peer group, and + * then returning the saved value for all subsequent rows. + * + * 'aggregatedupto' keeps track of the first row that has not yet been + * accumulated into the aggregate transition values. Whenever we start a + * new peer group, we accumulate forward to the end of the peer group. + */ + + /* + * First, update the frame head position. 
+ * + * The frame head should never move backwards, and the code below wouldn't + * cope if it did, so for safety we complain if it does. + */ + update_frameheadpos(winstate); + if (winstate->frameheadpos < winstate->aggregatedbase) + elog(ERROR, "window frame head moved backward"); + + /* + * If the frame didn't change compared to the previous row, we can re-use + * the result values that were previously saved at the bottom of this + * function. Since we don't know the current frame's end yet, this is not + * possible to check for fully. But if the frame end mode is UNBOUNDED + * FOLLOWING or CURRENT ROW, no exclusion clause is specified, and the + * current row lies within the previous row's frame, then the two frames' + * ends must coincide. Note that on the first row aggregatedbase == + * aggregatedupto, meaning this test must fail, so we don't need to check + * the "there was no previous row" case explicitly here. + */ + if (winstate->aggregatedbase == winstate->frameheadpos && + (winstate->frameOptions & (FRAMEOPTION_END_UNBOUNDED_FOLLOWING | + FRAMEOPTION_END_CURRENT_ROW)) && + !(winstate->frameOptions & FRAMEOPTION_EXCLUSION) && + winstate->aggregatedbase <= winstate->currentpos && + winstate->aggregatedupto > winstate->currentpos) + { + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + wfuncno = peraggstate->wfuncno; + econtext->ecxt_aggvalues[wfuncno] = peraggstate->resultValue; + econtext->ecxt_aggnulls[wfuncno] = peraggstate->resultValueIsNull; + } + return; + } + + /*---------- + * Initialize restart flags. + * + * We restart the aggregation: + * - if we're processing the first row in the partition, or + * - if the frame's head moved and we cannot use an inverse + * transition function, or + * - we have an EXCLUSION clause, or + * - if the new frame doesn't overlap the old one + * + * Note that we don't strictly need to restart in the last case, but if + * we're going to remove all rows from the aggregation anyway, a restart + * surely is faster. + *---------- + */ + numaggs_restart = 0; + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + if (winstate->currentpos == 0 || + (winstate->aggregatedbase != winstate->frameheadpos && + !OidIsValid(peraggstate->invtransfn_oid)) || + (winstate->frameOptions & FRAMEOPTION_EXCLUSION) || + winstate->aggregatedupto <= winstate->frameheadpos) + { + peraggstate->restart = true; + numaggs_restart++; + } + else + peraggstate->restart = false; + } + + /* + * If we have any possibly-moving aggregates, attempt to advance + * aggregatedbase to match the frame's head by removing input rows that + * fell off the top of the frame from the aggregations. This can fail, + * i.e. advance_windowaggregate_base() can return false, in which case + * we'll restart that aggregate below. + */ + while (numaggs_restart < numaggs && + winstate->aggregatedbase < winstate->frameheadpos) + { + /* + * Fetch the next tuple of those being removed. This should never fail + * as we should have been here before. + */ + if (!window_gettupleslot(agg_winobj, winstate->aggregatedbase, + temp_slot)) + elog(ERROR, "could not re-fetch previously fetched frame row"); + + /* Set tuple context for evaluation of aggregate arguments */ + winstate->tmpcontext->ecxt_outertuple = temp_slot; + + /* + * Perform the inverse transition for each aggregate function in the + * window, unless it has already been marked as needing a restart. 
+ */ + for (i = 0; i < numaggs; i++) + { + bool ok; + + peraggstate = &winstate->peragg[i]; + if (peraggstate->restart) + continue; + + wfuncno = peraggstate->wfuncno; + ok = advance_windowaggregate_base(winstate, + &winstate->perfunc[wfuncno], + peraggstate); + if (!ok) + { + /* Inverse transition function has failed, must restart */ + peraggstate->restart = true; + numaggs_restart++; + } + } + + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(winstate->tmpcontext); + + /* And advance the aggregated-row state */ + winstate->aggregatedbase++; + ExecClearTuple(temp_slot); + } + + /* + * If we successfully advanced the base rows of all the aggregates, + * aggregatedbase now equals frameheadpos; but if we failed for any, we + * must forcibly update aggregatedbase. + */ + winstate->aggregatedbase = winstate->frameheadpos; + + /* + * If we created a mark pointer for aggregates, keep it pushed up to frame + * head, so that tuplestore can discard unnecessary rows. + */ + if (agg_winobj->markptr >= 0) + WinSetMarkPosition(agg_winobj, winstate->frameheadpos); + + /* + * Now restart the aggregates that require it. + * + * We assume that aggregates using the shared context always restart if + * *any* aggregate restarts, and we may thus clean up the shared + * aggcontext if that is the case. Private aggcontexts are reset by + * initialize_windowaggregate() if their owning aggregate restarts. If we + * aren't restarting an aggregate, we need to free any previously saved + * result for it, else we'll leak memory. + */ + if (numaggs_restart > 0) + MemoryContextResetAndDeleteChildren(winstate->aggcontext); + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + + /* Aggregates using the shared ctx must restart if *any* agg does */ + Assert(peraggstate->aggcontext != winstate->aggcontext || + numaggs_restart == 0 || + peraggstate->restart); + + if (peraggstate->restart) + { + wfuncno = peraggstate->wfuncno; + initialize_windowaggregate(winstate, + &winstate->perfunc[wfuncno], + peraggstate); + } + else if (!peraggstate->resultValueIsNull) + { + if (!peraggstate->resulttypeByVal) + pfree(DatumGetPointer(peraggstate->resultValue)); + peraggstate->resultValue = (Datum) 0; + peraggstate->resultValueIsNull = true; + } + } + + /* + * Non-restarted aggregates now contain the rows between aggregatedbase + * (i.e., frameheadpos) and aggregatedupto, while restarted aggregates + * contain no rows. If there are any restarted aggregates, we must thus + * begin aggregating anew at frameheadpos, otherwise we may simply + * continue at aggregatedupto. We must remember the old value of + * aggregatedupto to know how long to skip advancing non-restarted + * aggregates. If we modify aggregatedupto, we must also clear + * agg_row_slot, per the loop invariant below. + */ + aggregatedupto_nonrestarted = winstate->aggregatedupto; + if (numaggs_restart > 0 && + winstate->aggregatedupto != winstate->frameheadpos) + { + winstate->aggregatedupto = winstate->frameheadpos; + ExecClearTuple(agg_row_slot); + } + + /* + * Advance until we reach a row not in frame (or end of partition). + * + * Note the loop invariant: agg_row_slot is either empty or holds the row + * at position aggregatedupto. We advance aggregatedupto after processing + * a row. 
+ */ + for (;;) + { + int ret; + + /* Fetch next row if we didn't already */ + if (TupIsNull(agg_row_slot)) + { + if (!window_gettupleslot(agg_winobj, winstate->aggregatedupto, + agg_row_slot)) + break; /* must be end of partition */ + } + + /* + * Exit loop if no more rows can be in frame. Skip aggregation if + * current row is not in frame but there might be more in the frame. + */ + ret = row_is_in_frame(winstate, winstate->aggregatedupto, agg_row_slot); + if (ret < 0) + break; + if (ret == 0) + goto next_tuple; + + /* Set tuple context for evaluation of aggregate arguments */ + winstate->tmpcontext->ecxt_outertuple = agg_row_slot; + + /* Accumulate row into the aggregates */ + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + + /* Non-restarted aggs skip until aggregatedupto_nonrestarted */ + if (!peraggstate->restart && + winstate->aggregatedupto < aggregatedupto_nonrestarted) + continue; + + wfuncno = peraggstate->wfuncno; + advance_windowaggregate(winstate, + &winstate->perfunc[wfuncno], + peraggstate); + } + +next_tuple: + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(winstate->tmpcontext); + + /* And advance the aggregated-row state */ + winstate->aggregatedupto++; + ExecClearTuple(agg_row_slot); + } + + /* The frame's end is not supposed to move backwards, ever */ + Assert(aggregatedupto_nonrestarted <= winstate->aggregatedupto); + + /* + * finalize aggregates and fill result/isnull fields. + */ + for (i = 0; i < numaggs; i++) + { + Datum *result; + bool *isnull; + + peraggstate = &winstate->peragg[i]; + wfuncno = peraggstate->wfuncno; + result = &econtext->ecxt_aggvalues[wfuncno]; + isnull = &econtext->ecxt_aggnulls[wfuncno]; + finalize_windowaggregate(winstate, + &winstate->perfunc[wfuncno], + peraggstate, + result, isnull); + + /* + * save the result in case next row shares the same frame. + * + * XXX in some framing modes, eg ROWS/END_CURRENT_ROW, we can know in + * advance that the next row can't possibly share the same frame. Is + * it worth detecting that and skipping this code? + */ + if (!peraggstate->resulttypeByVal && !*isnull) + { + oldContext = MemoryContextSwitchTo(peraggstate->aggcontext); + peraggstate->resultValue = + datumCopy(*result, + peraggstate->resulttypeByVal, + peraggstate->resulttypeLen); + MemoryContextSwitchTo(oldContext); + } + else + { + peraggstate->resultValue = *result; + } + peraggstate->resultValueIsNull = *isnull; + } +} + +/* + * eval_windowfunction + * + * Arguments of window functions are not evaluated here, because a window + * function can need random access to arbitrary rows in the partition. + * The window function uses the special WinGetFuncArgInPartition and + * WinGetFuncArgInFrame functions to evaluate the arguments for the rows + * it wants. + */ +static void +eval_windowfunction(WindowAggState *winstate, WindowStatePerFunc perfuncstate, + Datum *result, bool *isnull) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * We don't pass any normal arguments to a window function, but we do pass + * it the number of arguments, in order to permit window function + * implementations to support varying numbers of arguments. The real info + * goes through the WindowObject, which is passed via fcinfo->context. 
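Seen from the other side of this calling convention, a user-defined window function pulls everything it needs through the WindowObject API rather than through its nominal arguments. A minimal hypothetical example mirroring the built-in row_number() is sketched below; the function name and its SQL-level declaration as a WINDOW function are assumptions.

#include "postgres.h"
#include "fmgr.h"
#include "windowapi.h"

PG_FUNCTION_INFO_V1(my_row_number);

/*
 * Hypothetical window function: returns the 1-based position of the current
 * row within its partition.  All information comes from the WindowObject
 * passed via fcinfo->context, as described above; the regular argument
 * slots are not consulted.
 */
Datum
my_row_number(PG_FUNCTION_ARGS)
{
	WindowObject winobj = PG_WINDOW_OBJECT();
	int64		curpos = WinGetCurrentPosition(winobj);

	/* Rows before the current one will never be needed again */
	WinSetMarkPosition(winobj, curpos);

	PG_RETURN_INT64(curpos + 1);
}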
+ */ + InitFunctionCallInfoData(*fcinfo, &(perfuncstate->flinfo), + perfuncstate->numArguments, + perfuncstate->winCollation, + (void *) perfuncstate->winobj, NULL); + /* Just in case, make all the regular argument slots be null */ + for (int argno = 0; argno < perfuncstate->numArguments; argno++) + fcinfo->args[argno].isnull = true; + /* Window functions don't have a current aggregate context, either */ + winstate->curaggcontext = NULL; + + *result = FunctionCallInvoke(fcinfo); + *isnull = fcinfo->isnull; + + /* + * Make sure pass-by-ref data is allocated in the appropriate context. (We + * need this in case the function returns a pointer into some short-lived + * tuple, as is entirely possible.) + */ + if (!perfuncstate->resulttypeByVal && !fcinfo->isnull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*result))) + *result = datumCopy(*result, + perfuncstate->resulttypeByVal, + perfuncstate->resulttypeLen); + + MemoryContextSwitchTo(oldContext); +} + +/* + * begin_partition + * Start buffering rows of the next partition. + */ +static void +begin_partition(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + PlanState *outerPlan = outerPlanState(winstate); + int frameOptions = winstate->frameOptions; + int numfuncs = winstate->numfuncs; + int i; + + winstate->partition_spooled = false; + winstate->framehead_valid = false; + winstate->frametail_valid = false; + winstate->grouptail_valid = false; + winstate->spooled_rows = 0; + winstate->currentpos = 0; + winstate->frameheadpos = 0; + winstate->frametailpos = 0; + winstate->currentgroup = 0; + winstate->frameheadgroup = 0; + winstate->frametailgroup = 0; + winstate->groupheadpos = 0; + winstate->grouptailpos = -1; /* see update_grouptailpos */ + ExecClearTuple(winstate->agg_row_slot); + if (winstate->framehead_slot) + ExecClearTuple(winstate->framehead_slot); + if (winstate->frametail_slot) + ExecClearTuple(winstate->frametail_slot); + + /* + * If this is the very first partition, we need to fetch the first input + * row to store in first_part_slot. + */ + if (TupIsNull(winstate->first_part_slot)) + { + TupleTableSlot *outerslot = ExecProcNode(outerPlan); + + if (!TupIsNull(outerslot)) + ExecCopySlot(winstate->first_part_slot, outerslot); + else + { + /* outer plan is empty, so we have nothing to do */ + winstate->partition_spooled = true; + winstate->more_partitions = false; + return; + } + } + + /* Create new tuplestore for this partition */ + winstate->buffer = tuplestore_begin_heap(false, false, work_mem); + + /* + * Set up read pointers for the tuplestore. The current pointer doesn't + * need BACKWARD capability, but the per-window-function read pointers do, + * and the aggregate pointer does if we might need to restart aggregation. + */ + winstate->current_ptr = 0; /* read pointer 0 is pre-allocated */ + + /* reset default REWIND capability bit for current ptr */ + tuplestore_set_eflags(winstate->buffer, 0); + + /* create read pointers for aggregates, if needed */ + if (winstate->numaggs > 0) + { + WindowObject agg_winobj = winstate->agg_winobj; + int readptr_flags = 0; + + /* + * If the frame head is potentially movable, or we have an EXCLUSION + * clause, we might need to restart aggregation ... + */ + if (!(frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) || + (frameOptions & FRAMEOPTION_EXCLUSION)) + { + /* ... 
so create a mark pointer to track the frame head */ + agg_winobj->markptr = tuplestore_alloc_read_pointer(winstate->buffer, 0); + /* and the read pointer will need BACKWARD capability */ + readptr_flags |= EXEC_FLAG_BACKWARD; + } + + agg_winobj->readptr = tuplestore_alloc_read_pointer(winstate->buffer, + readptr_flags); + agg_winobj->markpos = -1; + agg_winobj->seekpos = -1; + + /* Also reset the row counters for aggregates */ + winstate->aggregatedbase = 0; + winstate->aggregatedupto = 0; + } + + /* create mark and read pointers for each real window function */ + for (i = 0; i < numfuncs; i++) + { + WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]); + + if (!perfuncstate->plain_agg) + { + WindowObject winobj = perfuncstate->winobj; + + winobj->markptr = tuplestore_alloc_read_pointer(winstate->buffer, + 0); + winobj->readptr = tuplestore_alloc_read_pointer(winstate->buffer, + EXEC_FLAG_BACKWARD); + winobj->markpos = -1; + winobj->seekpos = -1; + } + } + + /* + * If we are in RANGE or GROUPS mode, then determining frame boundaries + * requires physical access to the frame endpoint rows, except in certain + * degenerate cases. We create read pointers to point to those rows, to + * simplify access and ensure that the tuplestore doesn't discard the + * endpoint rows prematurely. (Must create pointers in exactly the same + * cases that update_frameheadpos and update_frametailpos need them.) + */ + winstate->framehead_ptr = winstate->frametail_ptr = -1; /* if not used */ + + if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + if (((frameOptions & FRAMEOPTION_START_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_START_OFFSET)) + winstate->framehead_ptr = + tuplestore_alloc_read_pointer(winstate->buffer, 0); + if (((frameOptions & FRAMEOPTION_END_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_END_OFFSET)) + winstate->frametail_ptr = + tuplestore_alloc_read_pointer(winstate->buffer, 0); + } + + /* + * If we have an exclusion clause that requires knowing the boundaries of + * the current row's peer group, we create a read pointer to track the + * tail position of the peer group (i.e., first row of the next peer + * group). The head position does not require its own pointer because we + * maintain that as a side effect of advancing the current row. + */ + winstate->grouptail_ptr = -1; + + if ((frameOptions & (FRAMEOPTION_EXCLUDE_GROUP | + FRAMEOPTION_EXCLUDE_TIES)) && + node->ordNumCols != 0) + { + winstate->grouptail_ptr = + tuplestore_alloc_read_pointer(winstate->buffer, 0); + } + + /* + * Store the first tuple into the tuplestore (it's always available now; + * we either read it above, or saved it at the end of previous partition) + */ + tuplestore_puttupleslot(winstate->buffer, winstate->first_part_slot); + winstate->spooled_rows++; +} + +/* + * Read tuples from the outer node, up to and including position 'pos', and + * store them into the tuplestore. If pos is -1, reads the whole partition. + */ +static void +spool_tuples(WindowAggState *winstate, int64 pos) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + PlanState *outerPlan; + TupleTableSlot *outerslot; + MemoryContext oldcontext; + + if (!winstate->buffer) + return; /* just a safety check */ + if (winstate->partition_spooled) + return; /* whole partition done already */ + + /* + * If the tuplestore has spilled to disk, alternate reading and writing + * becomes quite expensive due to frequent buffer flushes. 
It's cheaper + * to force the entire partition to get spooled in one go. + * + * XXX this is a horrid kluge --- it'd be better to fix the performance + * problem inside tuplestore. FIXME + */ + if (!tuplestore_in_memory(winstate->buffer)) + pos = -1; + + outerPlan = outerPlanState(winstate); + + /* Must be in query context to call outerplan */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + while (winstate->spooled_rows <= pos || pos == -1) + { + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + { + /* reached the end of the last partition */ + winstate->partition_spooled = true; + winstate->more_partitions = false; + break; + } + + if (node->partNumCols > 0) + { + ExprContext *econtext = winstate->tmpcontext; + + econtext->ecxt_innertuple = winstate->first_part_slot; + econtext->ecxt_outertuple = outerslot; + + /* Check if this tuple still belongs to the current partition */ + if (!ExecQualAndReset(winstate->partEqfunction, econtext)) + { + /* + * end of partition; copy the tuple for the next cycle. + */ + ExecCopySlot(winstate->first_part_slot, outerslot); + winstate->partition_spooled = true; + winstate->more_partitions = true; + break; + } + } + + /* Still in partition, so save it into the tuplestore */ + tuplestore_puttupleslot(winstate->buffer, outerslot); + winstate->spooled_rows++; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * release_partition + * clear information kept within a partition, including + * tuplestore and aggregate results. + */ +static void +release_partition(WindowAggState *winstate) +{ + int i; + + for (i = 0; i < winstate->numfuncs; i++) + { + WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]); + + /* Release any partition-local state of this window function */ + if (perfuncstate->winobj) + perfuncstate->winobj->localmem = NULL; + } + + /* + * Release all partition-local memory (in particular, any partition-local + * state that we might have trashed our pointers to in the above loop, and + * any aggregate temp data). We don't rely on retail pfree because some + * aggregates might have allocated data we don't have direct pointers to. + */ + MemoryContextResetAndDeleteChildren(winstate->partcontext); + MemoryContextResetAndDeleteChildren(winstate->aggcontext); + for (i = 0; i < winstate->numaggs; i++) + { + if (winstate->peragg[i].aggcontext != winstate->aggcontext) + MemoryContextResetAndDeleteChildren(winstate->peragg[i].aggcontext); + } + + if (winstate->buffer) + tuplestore_end(winstate->buffer); + winstate->buffer = NULL; + winstate->partition_spooled = false; +} + +/* + * row_is_in_frame + * Determine whether a row is in the current row's window frame according + * to our window framing rule + * + * The caller must have already determined that the row is in the partition + * and fetched it into a slot. This function just encapsulates the framing + * rules. + * + * Returns: + * -1, if the row is out of frame and no succeeding rows can be in frame + * 0, if the row is out of frame but succeeding rows might be in frame + * 1, if the row is in frame + * + * May clobber winstate->temp_slot_2. + */ +static int +row_is_in_frame(WindowAggState *winstate, int64 pos, TupleTableSlot *slot) +{ + int frameOptions = winstate->frameOptions; + + Assert(pos >= 0); /* else caller error */ + + /* + * First, check frame starting conditions. We might as well delegate this + * to update_frameheadpos always; it doesn't add any notable cost. 
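Callers consume row_is_in_frame()'s three-way result exactly as its header above describes: stop on -1, skip but keep scanning on 0, accumulate on 1. A toy standalone illustration of that contract (plain C, invented names):

#include <stdio.h>

/*
 * Toy classifier: positions against a frame [frame_head, frame_tail] with
 * one excluded position, where -1 also promises that no later position can
 * be in frame.
 */
static int
toy_row_is_in_frame(int pos, int frame_head, int frame_tail, int excluded)
{
	if (pos > frame_tail)
		return -1;				/* out of frame, and so is everything later */
	if (pos < frame_head || pos == excluded)
		return 0;				/* out of frame, but keep scanning */
	return 1;					/* in frame */
}

int
main(void)
{
	long	sum = 0;

	for (int pos = 0;; pos++)
	{
		int		ret = toy_row_is_in_frame(pos, 2, 6, 4);

		if (ret < 0)
			break;				/* nothing further can be in frame */
		if (ret == 0)
			continue;			/* skip, but later rows may still qualify */
		sum += pos;				/* "accumulate" the in-frame row */
	}
	printf("sum of in-frame positions = %ld\n", sum);	/* 2+3+5+6 = 16 */
	return 0;
}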
+ */ + update_frameheadpos(winstate); + if (pos < winstate->frameheadpos) + return 0; + + /* + * Okay so far, now check frame ending conditions. Here, we avoid calling + * update_frametailpos in simple cases, so as not to spool tuples further + * ahead than necessary. + */ + if (frameOptions & FRAMEOPTION_END_CURRENT_ROW) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* rows after current row are out of frame */ + if (pos > winstate->currentpos) + return -1; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* following row that is not peer is out of frame */ + if (pos > winstate->currentpos && + !are_peers(winstate, slot, winstate->ss.ss_ScanTupleSlot)) + return -1; + } + else + Assert(false); + } + else if (frameOptions & FRAMEOPTION_END_OFFSET) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + int64 offset = DatumGetInt64(winstate->endOffsetValue); + + /* rows after current row + offset are out of frame */ + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + offset = -offset; + + if (pos > winstate->currentpos + offset) + return -1; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* hard cases, so delegate to update_frametailpos */ + update_frametailpos(winstate); + if (pos >= winstate->frametailpos) + return -1; + } + else + Assert(false); + } + + /* Check exclusion clause */ + if (frameOptions & FRAMEOPTION_EXCLUDE_CURRENT_ROW) + { + if (pos == winstate->currentpos) + return 0; + } + else if ((frameOptions & FRAMEOPTION_EXCLUDE_GROUP) || + ((frameOptions & FRAMEOPTION_EXCLUDE_TIES) && + pos != winstate->currentpos)) + { + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + return 0; + /* Otherwise, check the group boundaries */ + if (pos >= winstate->groupheadpos) + { + update_grouptailpos(winstate); + if (pos < winstate->grouptailpos) + return 0; + } + } + + /* If we get here, it's in frame */ + return 1; +} + +/* + * update_frameheadpos + * make frameheadpos valid for the current row + * + * Note that frameheadpos is computed without regard for any window exclusion + * clause; the current row and/or its peers are considered part of the frame + * for this purpose even if they must be excluded later. + * + * May clobber winstate->temp_slot_2. 
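+ *
+ * framehead_valid is cleared each time the current row advances, so the
+ * recomputation below happens at most once per output row.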
+ */ +static void +update_frameheadpos(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + int frameOptions = winstate->frameOptions; + MemoryContext oldcontext; + + if (winstate->framehead_valid) + return; /* already known for current row */ + + /* We may be called in a short-lived context */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + if (frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) + { + /* In UNBOUNDED PRECEDING mode, frame head is always row 0 */ + winstate->frameheadpos = 0; + winstate->framehead_valid = true; + } + else if (frameOptions & FRAMEOPTION_START_CURRENT_ROW) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, frame head is the same as current */ + winstate->frameheadpos = winstate->currentpos; + winstate->framehead_valid = true; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + { + winstate->frameheadpos = 0; + winstate->framehead_valid = true; + MemoryContextSwitchTo(oldcontext); + return; + } + + /* + * In RANGE or GROUPS START_CURRENT_ROW mode, frame head is the + * first row that is a peer of current row. We keep a copy of the + * last-known frame head row in framehead_slot, and advance as + * necessary. Note that if we reach end of partition, we will + * leave frameheadpos = end+1 and framehead_slot empty. + */ + tuplestore_select_read_pointer(winstate->buffer, + winstate->framehead_ptr); + if (winstate->frameheadpos == 0 && + TupIsNull(winstate->framehead_slot)) + { + /* fetch first row into framehead_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->framehead_slot)) + { + if (are_peers(winstate, winstate->framehead_slot, + winstate->ss.ss_ScanTupleSlot)) + break; /* this row is the correct frame head */ + /* Note we advance frameheadpos even if the fetch fails */ + winstate->frameheadpos++; + spool_tuples(winstate, winstate->frameheadpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + break; /* end of partition */ + } + winstate->framehead_valid = true; + } + else + Assert(false); + } + else if (frameOptions & FRAMEOPTION_START_OFFSET) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, bound is physically n before/after current */ + int64 offset = DatumGetInt64(winstate->startOffsetValue); + + if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING) + offset = -offset; + + winstate->frameheadpos = winstate->currentpos + offset; + /* frame head can't go before first row */ + if (winstate->frameheadpos < 0) + winstate->frameheadpos = 0; + else if (winstate->frameheadpos > winstate->currentpos + 1) + { + /* make sure frameheadpos is not past end of partition */ + spool_tuples(winstate, winstate->frameheadpos - 1); + if (winstate->frameheadpos > winstate->spooled_rows) + winstate->frameheadpos = winstate->spooled_rows; + } + winstate->framehead_valid = true; + } + else if (frameOptions & FRAMEOPTION_RANGE) + { + /* + * In RANGE START_OFFSET mode, frame head is the first row that + * satisfies the in_range constraint relative to the current row. + * We keep a copy of the last-known frame head row in + * framehead_slot, and advance as necessary. 
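+ * (For instance, with an ascending integer ORDER BY column and RANGE
+ * BETWEEN 5 PRECEDING AND CURRENT ROW, the head is the first row whose
+ * sort value is >= the current row's value minus 5.)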
Note that if we + * reach end of partition, we will leave frameheadpos = end+1 and + * framehead_slot empty. + */ + int sortCol = node->ordColIdx[0]; + bool sub, + less; + + /* We must have an ordering column */ + Assert(node->ordNumCols == 1); + + /* Precompute flags for in_range checks */ + if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING) + sub = true; /* subtract startOffset from current row */ + else + sub = false; /* add it */ + less = false; /* normally, we want frame head >= sum */ + /* If sort order is descending, flip both flags */ + if (!winstate->inRangeAsc) + { + sub = !sub; + less = true; + } + + tuplestore_select_read_pointer(winstate->buffer, + winstate->framehead_ptr); + if (winstate->frameheadpos == 0 && + TupIsNull(winstate->framehead_slot)) + { + /* fetch first row into framehead_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->framehead_slot)) + { + Datum headval, + currval; + bool headisnull, + currisnull; + + headval = slot_getattr(winstate->framehead_slot, sortCol, + &headisnull); + currval = slot_getattr(winstate->ss.ss_ScanTupleSlot, sortCol, + &currisnull); + if (headisnull || currisnull) + { + /* order of the rows depends only on nulls_first */ + if (winstate->inRangeNullsFirst) + { + /* advance head if head is null and curr is not */ + if (!headisnull || currisnull) + break; + } + else + { + /* advance head if head is not null and curr is null */ + if (headisnull || !currisnull) + break; + } + } + else + { + if (DatumGetBool(FunctionCall5Coll(&winstate->startInRangeFunc, + winstate->inRangeColl, + headval, + currval, + winstate->startOffsetValue, + BoolGetDatum(sub), + BoolGetDatum(less)))) + break; /* this row is the correct frame head */ + } + /* Note we advance frameheadpos even if the fetch fails */ + winstate->frameheadpos++; + spool_tuples(winstate, winstate->frameheadpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + break; /* end of partition */ + } + winstate->framehead_valid = true; + } + else if (frameOptions & FRAMEOPTION_GROUPS) + { + /* + * In GROUPS START_OFFSET mode, frame head is the first row of the + * first peer group whose number satisfies the offset constraint. + * We keep a copy of the last-known frame head row in + * framehead_slot, and advance as necessary. Note that if we + * reach end of partition, we will leave frameheadpos = end+1 and + * framehead_slot empty. 
+ */ + int64 offset = DatumGetInt64(winstate->startOffsetValue); + int64 minheadgroup; + + if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING) + minheadgroup = winstate->currentgroup - offset; + else + minheadgroup = winstate->currentgroup + offset; + + tuplestore_select_read_pointer(winstate->buffer, + winstate->framehead_ptr); + if (winstate->frameheadpos == 0 && + TupIsNull(winstate->framehead_slot)) + { + /* fetch first row into framehead_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->framehead_slot)) + { + if (winstate->frameheadgroup >= minheadgroup) + break; /* this row is the correct frame head */ + ExecCopySlot(winstate->temp_slot_2, winstate->framehead_slot); + /* Note we advance frameheadpos even if the fetch fails */ + winstate->frameheadpos++; + spool_tuples(winstate, winstate->frameheadpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + break; /* end of partition */ + if (!are_peers(winstate, winstate->temp_slot_2, + winstate->framehead_slot)) + winstate->frameheadgroup++; + } + ExecClearTuple(winstate->temp_slot_2); + winstate->framehead_valid = true; + } + else + Assert(false); + } + else + Assert(false); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * update_frametailpos + * make frametailpos valid for the current row + * + * Note that frametailpos is computed without regard for any window exclusion + * clause; the current row and/or its peers are considered part of the frame + * for this purpose even if they must be excluded later. + * + * May clobber winstate->temp_slot_2. + */ +static void +update_frametailpos(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + int frameOptions = winstate->frameOptions; + MemoryContext oldcontext; + + if (winstate->frametail_valid) + return; /* already known for current row */ + + /* We may be called in a short-lived context */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + if (frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING) + { + /* In UNBOUNDED FOLLOWING mode, all partition rows are in frame */ + spool_tuples(winstate, -1); + winstate->frametailpos = winstate->spooled_rows; + winstate->frametail_valid = true; + } + else if (frameOptions & FRAMEOPTION_END_CURRENT_ROW) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, exactly the rows up to current are in frame */ + winstate->frametailpos = winstate->currentpos + 1; + winstate->frametail_valid = true; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + { + spool_tuples(winstate, -1); + winstate->frametailpos = winstate->spooled_rows; + winstate->frametail_valid = true; + MemoryContextSwitchTo(oldcontext); + return; + } + + /* + * In RANGE or GROUPS END_CURRENT_ROW mode, frame end is the last + * row that is a peer of current row, frame tail is the row after + * that (if any). We keep a copy of the last-known frame tail row + * in frametail_slot, and advance as necessary. Note that if we + * reach end of partition, we will leave frametailpos = end+1 and + * frametail_slot empty. 
+ */ + tuplestore_select_read_pointer(winstate->buffer, + winstate->frametail_ptr); + if (winstate->frametailpos == 0 && + TupIsNull(winstate->frametail_slot)) + { + /* fetch first row into frametail_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->frametail_slot)) + { + if (winstate->frametailpos > winstate->currentpos && + !are_peers(winstate, winstate->frametail_slot, + winstate->ss.ss_ScanTupleSlot)) + break; /* this row is the frame tail */ + /* Note we advance frametailpos even if the fetch fails */ + winstate->frametailpos++; + spool_tuples(winstate, winstate->frametailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + break; /* end of partition */ + } + winstate->frametail_valid = true; + } + else + Assert(false); + } + else if (frameOptions & FRAMEOPTION_END_OFFSET) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, bound is physically n before/after current */ + int64 offset = DatumGetInt64(winstate->endOffsetValue); + + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + offset = -offset; + + winstate->frametailpos = winstate->currentpos + offset + 1; + /* smallest allowable value of frametailpos is 0 */ + if (winstate->frametailpos < 0) + winstate->frametailpos = 0; + else if (winstate->frametailpos > winstate->currentpos + 1) + { + /* make sure frametailpos is not past end of partition */ + spool_tuples(winstate, winstate->frametailpos - 1); + if (winstate->frametailpos > winstate->spooled_rows) + winstate->frametailpos = winstate->spooled_rows; + } + winstate->frametail_valid = true; + } + else if (frameOptions & FRAMEOPTION_RANGE) + { + /* + * In RANGE END_OFFSET mode, frame end is the last row that + * satisfies the in_range constraint relative to the current row, + * frame tail is the row after that (if any). We keep a copy of + * the last-known frame tail row in frametail_slot, and advance as + * necessary. Note that if we reach end of partition, we will + * leave frametailpos = end+1 and frametail_slot empty. 
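+ *
+ * (For instance, with an ascending integer ORDER BY column and RANGE
+ * BETWEEN CURRENT ROW AND 5 FOLLOWING, the loop below stops just past the
+ * last row whose sort value is <= the current row's value plus 5.)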
+ */ + int sortCol = node->ordColIdx[0]; + bool sub, + less; + + /* We must have an ordering column */ + Assert(node->ordNumCols == 1); + + /* Precompute flags for in_range checks */ + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + sub = true; /* subtract endOffset from current row */ + else + sub = false; /* add it */ + less = true; /* normally, we want frame tail <= sum */ + /* If sort order is descending, flip both flags */ + if (!winstate->inRangeAsc) + { + sub = !sub; + less = false; + } + + tuplestore_select_read_pointer(winstate->buffer, + winstate->frametail_ptr); + if (winstate->frametailpos == 0 && + TupIsNull(winstate->frametail_slot)) + { + /* fetch first row into frametail_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->frametail_slot)) + { + Datum tailval, + currval; + bool tailisnull, + currisnull; + + tailval = slot_getattr(winstate->frametail_slot, sortCol, + &tailisnull); + currval = slot_getattr(winstate->ss.ss_ScanTupleSlot, sortCol, + &currisnull); + if (tailisnull || currisnull) + { + /* order of the rows depends only on nulls_first */ + if (winstate->inRangeNullsFirst) + { + /* advance tail if tail is null or curr is not */ + if (!tailisnull) + break; + } + else + { + /* advance tail if tail is not null or curr is null */ + if (!currisnull) + break; + } + } + else + { + if (!DatumGetBool(FunctionCall5Coll(&winstate->endInRangeFunc, + winstate->inRangeColl, + tailval, + currval, + winstate->endOffsetValue, + BoolGetDatum(sub), + BoolGetDatum(less)))) + break; /* this row is the correct frame tail */ + } + /* Note we advance frametailpos even if the fetch fails */ + winstate->frametailpos++; + spool_tuples(winstate, winstate->frametailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + break; /* end of partition */ + } + winstate->frametail_valid = true; + } + else if (frameOptions & FRAMEOPTION_GROUPS) + { + /* + * In GROUPS END_OFFSET mode, frame end is the last row of the + * last peer group whose number satisfies the offset constraint, + * and frame tail is the row after that (if any). We keep a copy + * of the last-known frame tail row in frametail_slot, and advance + * as necessary. Note that if we reach end of partition, we will + * leave frametailpos = end+1 and frametail_slot empty. 
+ */ + int64 offset = DatumGetInt64(winstate->endOffsetValue); + int64 maxtailgroup; + + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + maxtailgroup = winstate->currentgroup - offset; + else + maxtailgroup = winstate->currentgroup + offset; + + tuplestore_select_read_pointer(winstate->buffer, + winstate->frametail_ptr); + if (winstate->frametailpos == 0 && + TupIsNull(winstate->frametail_slot)) + { + /* fetch first row into frametail_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->frametail_slot)) + { + if (winstate->frametailgroup > maxtailgroup) + break; /* this row is the correct frame tail */ + ExecCopySlot(winstate->temp_slot_2, winstate->frametail_slot); + /* Note we advance frametailpos even if the fetch fails */ + winstate->frametailpos++; + spool_tuples(winstate, winstate->frametailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + break; /* end of partition */ + if (!are_peers(winstate, winstate->temp_slot_2, + winstate->frametail_slot)) + winstate->frametailgroup++; + } + ExecClearTuple(winstate->temp_slot_2); + winstate->frametail_valid = true; + } + else + Assert(false); + } + else + Assert(false); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * update_grouptailpos + * make grouptailpos valid for the current row + * + * May clobber winstate->temp_slot_2. + */ +static void +update_grouptailpos(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + MemoryContext oldcontext; + + if (winstate->grouptail_valid) + return; /* already known for current row */ + + /* We may be called in a short-lived context */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + { + spool_tuples(winstate, -1); + winstate->grouptailpos = winstate->spooled_rows; + winstate->grouptail_valid = true; + MemoryContextSwitchTo(oldcontext); + return; + } + + /* + * Because grouptail_valid is reset only when current row advances into a + * new peer group, we always reach here knowing that grouptailpos needs to + * be advanced by at least one row. Hence, unlike the otherwise similar + * case for frame tail tracking, we do not need persistent storage of the + * group tail row. + */ + Assert(winstate->grouptailpos <= winstate->currentpos); + tuplestore_select_read_pointer(winstate->buffer, + winstate->grouptail_ptr); + for (;;) + { + /* Note we advance grouptailpos even if the fetch fails */ + winstate->grouptailpos++; + spool_tuples(winstate, winstate->grouptailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->temp_slot_2)) + break; /* end of partition */ + if (winstate->grouptailpos > winstate->currentpos && + !are_peers(winstate, winstate->temp_slot_2, + winstate->ss.ss_ScanTupleSlot)) + break; /* this row is the group tail */ + } + ExecClearTuple(winstate->temp_slot_2); + winstate->grouptail_valid = true; + + MemoryContextSwitchTo(oldcontext); +} + + +/* ----------------- + * ExecWindowAgg + * + * ExecWindowAgg receives tuples from its outer subplan and + * stores them into a tuplestore, then processes window functions. + * This node doesn't reduce nor qualify any row so the number of + * returned rows is exactly the same as its outer subplan's result. 
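+ *
+ * Each call spools input rows as needed, advances the current row within
+ * its partition (moving on to the next partition when the current one is
+ * exhausted), evaluates the window functions and aggregates for that row,
+ * and projects a single output tuple.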
+ * ----------------- + */ +static TupleTableSlot * +ExecWindowAgg(PlanState *pstate) +{ + WindowAggState *winstate = castNode(WindowAggState, pstate); + ExprContext *econtext; + int i; + int numfuncs; + + CHECK_FOR_INTERRUPTS(); + + if (winstate->all_done) + return NULL; + + /* + * Compute frame offset values, if any, during first call (or after a + * rescan). These are assumed to hold constant throughout the scan; if + * user gives us a volatile expression, we'll only use its initial value. + */ + if (winstate->all_first) + { + int frameOptions = winstate->frameOptions; + ExprContext *econtext = winstate->ss.ps.ps_ExprContext; + Datum value; + bool isnull; + int16 len; + bool byval; + + if (frameOptions & FRAMEOPTION_START_OFFSET) + { + Assert(winstate->startOffset != NULL); + value = ExecEvalExprSwitchContext(winstate->startOffset, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("frame starting offset must not be null"))); + /* copy value into query-lifespan context */ + get_typlenbyval(exprType((Node *) winstate->startOffset->expr), + &len, &byval); + winstate->startOffsetValue = datumCopy(value, byval, len); + if (frameOptions & (FRAMEOPTION_ROWS | FRAMEOPTION_GROUPS)) + { + /* value is known to be int8 */ + int64 offset = DatumGetInt64(value); + + if (offset < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PRECEDING_OR_FOLLOWING_SIZE), + errmsg("frame starting offset must not be negative"))); + } + } + if (frameOptions & FRAMEOPTION_END_OFFSET) + { + Assert(winstate->endOffset != NULL); + value = ExecEvalExprSwitchContext(winstate->endOffset, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("frame ending offset must not be null"))); + /* copy value into query-lifespan context */ + get_typlenbyval(exprType((Node *) winstate->endOffset->expr), + &len, &byval); + winstate->endOffsetValue = datumCopy(value, byval, len); + if (frameOptions & (FRAMEOPTION_ROWS | FRAMEOPTION_GROUPS)) + { + /* value is known to be int8 */ + int64 offset = DatumGetInt64(value); + + if (offset < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PRECEDING_OR_FOLLOWING_SIZE), + errmsg("frame ending offset must not be negative"))); + } + } + winstate->all_first = false; + } + + if (winstate->buffer == NULL) + { + /* Initialize for first partition and set current row = 0 */ + begin_partition(winstate); + /* If there are no input rows, we'll detect that and exit below */ + } + else + { + /* Advance current row within partition */ + winstate->currentpos++; + /* This might mean that the frame moves, too */ + winstate->framehead_valid = false; + winstate->frametail_valid = false; + /* we don't need to invalidate grouptail here; see below */ + } + + /* + * Spool all tuples up to and including the current row, if we haven't + * already + */ + spool_tuples(winstate, winstate->currentpos); + + /* Move to the next partition if we reached the end of this partition */ + if (winstate->partition_spooled && + winstate->currentpos >= winstate->spooled_rows) + { + release_partition(winstate); + + if (winstate->more_partitions) + { + begin_partition(winstate); + Assert(winstate->spooled_rows > 0); + } + else + { + winstate->all_done = true; + return NULL; + } + } + + /* final output execution is in ps_ExprContext */ + econtext = winstate->ss.ps.ps_ExprContext; + + /* Clear the per-output-tuple context for current row */ + ResetExprContext(econtext); + + /* + * Read the current row from the tuplestore, and save in 
ScanTupleSlot. + * (We can't rely on the outerplan's output slot because we may have to + * read beyond the current row. Also, we have to actually copy the row + * out of the tuplestore, since window function evaluation might cause the + * tuplestore to dump its state to disk.) + * + * In GROUPS mode, or when tracking a group-oriented exclusion clause, we + * must also detect entering a new peer group and update associated state + * when that happens. We use temp_slot_2 to temporarily hold the previous + * row for this purpose. + * + * Current row must be in the tuplestore, since we spooled it above. + */ + tuplestore_select_read_pointer(winstate->buffer, winstate->current_ptr); + if ((winstate->frameOptions & (FRAMEOPTION_GROUPS | + FRAMEOPTION_EXCLUDE_GROUP | + FRAMEOPTION_EXCLUDE_TIES)) && + winstate->currentpos > 0) + { + ExecCopySlot(winstate->temp_slot_2, winstate->ss.ss_ScanTupleSlot); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->ss.ss_ScanTupleSlot)) + elog(ERROR, "unexpected end of tuplestore"); + if (!are_peers(winstate, winstate->temp_slot_2, + winstate->ss.ss_ScanTupleSlot)) + { + winstate->currentgroup++; + winstate->groupheadpos = winstate->currentpos; + winstate->grouptail_valid = false; + } + ExecClearTuple(winstate->temp_slot_2); + } + else + { + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->ss.ss_ScanTupleSlot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + /* + * Evaluate true window functions + */ + numfuncs = winstate->numfuncs; + for (i = 0; i < numfuncs; i++) + { + WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]); + + if (perfuncstate->plain_agg) + continue; + eval_windowfunction(winstate, perfuncstate, + &(econtext->ecxt_aggvalues[perfuncstate->wfuncstate->wfuncno]), + &(econtext->ecxt_aggnulls[perfuncstate->wfuncstate->wfuncno])); + } + + /* + * Evaluate aggregates + */ + if (winstate->numaggs > 0) + eval_windowaggregates(winstate); + + /* + * If we have created auxiliary read pointers for the frame or group + * boundaries, force them to be kept up-to-date, because we don't know + * whether the window function(s) will do anything that requires that. + * Failing to advance the pointers would result in being unable to trim + * data from the tuplestore, which is bad. (If we could know in advance + * whether the window functions will use frame boundary info, we could + * skip creating these pointers in the first place ... but unfortunately + * the window function API doesn't require that.) + */ + if (winstate->framehead_ptr >= 0) + update_frameheadpos(winstate); + if (winstate->frametail_ptr >= 0) + update_frametailpos(winstate); + if (winstate->grouptail_ptr >= 0) + update_grouptailpos(winstate); + + /* + * Truncate any no-longer-needed rows from the tuplestore. + */ + tuplestore_trim(winstate->buffer); + + /* + * Form and return a projection tuple using the windowfunc results and the + * current row. Setting ecxt_outertuple arranges that any Vars will be + * evaluated with respect to that row. 
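+ * The window function results themselves are picked up from
+ * ecxt_aggvalues/ecxt_aggnulls by the WindowFunc expressions in the
+ * projection's targetlist.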
+ */ + econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot; + + return ExecProject(winstate->ss.ps.ps_ProjInfo); +} + +/* ----------------- + * ExecInitWindowAgg + * + * Creates the run-time information for the WindowAgg node produced by the + * planner and initializes its outer subtree + * ----------------- + */ +WindowAggState * +ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags) +{ + WindowAggState *winstate; + Plan *outerPlan; + ExprContext *econtext; + ExprContext *tmpcontext; + WindowStatePerFunc perfunc; + WindowStatePerAgg peragg; + int frameOptions = node->frameOptions; + int numfuncs, + wfuncno, + numaggs, + aggno; + TupleDesc scanDesc; + ListCell *l; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + winstate = makeNode(WindowAggState); + winstate->ss.ps.plan = (Plan *) node; + winstate->ss.ps.state = estate; + winstate->ss.ps.ExecProcNode = ExecWindowAgg; + + /* + * Create expression contexts. We need two, one for per-input-tuple + * processing and one for per-output-tuple processing. We cheat a little + * by using ExecAssignExprContext() to build both. + */ + ExecAssignExprContext(estate, &winstate->ss.ps); + tmpcontext = winstate->ss.ps.ps_ExprContext; + winstate->tmpcontext = tmpcontext; + ExecAssignExprContext(estate, &winstate->ss.ps); + + /* Create long-lived context for storage of partition-local memory etc */ + winstate->partcontext = + AllocSetContextCreate(CurrentMemoryContext, + "WindowAgg Partition", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create mid-lived context for aggregate trans values etc. + * + * Note that moving aggregates each use their own private context, not + * this one. + */ + winstate->aggcontext = + AllocSetContextCreate(CurrentMemoryContext, + "WindowAgg Aggregates", + ALLOCSET_DEFAULT_SIZES); + + /* + * WindowAgg nodes never have quals, since they can only occur at the + * logical top level of a query (ie, after any WHERE or HAVING filters) + */ + Assert(node->plan.qual == NIL); + winstate->ss.ps.qual = NULL; + + /* + * initialize child nodes + */ + outerPlan = outerPlan(node); + outerPlanState(winstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * initialize source tuple type (which is also the tuple type that we'll + * store in the tuplestore and use in all our working slots). 
+ */ + ExecCreateScanSlotFromOuterPlan(estate, &winstate->ss, &TTSOpsMinimalTuple); + scanDesc = winstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + + /* the outer tuple isn't the child's tuple, but always a minimal tuple */ + winstate->ss.ps.outeropsset = true; + winstate->ss.ps.outerops = &TTSOpsMinimalTuple; + winstate->ss.ps.outeropsfixed = true; + + /* + * tuple table initialization + */ + winstate->first_part_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + winstate->agg_row_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + winstate->temp_slot_1 = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + winstate->temp_slot_2 = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + + /* + * create frame head and tail slots only if needed (must create slots in + * exactly the same cases that update_frameheadpos and update_frametailpos + * need them) + */ + winstate->framehead_slot = winstate->frametail_slot = NULL; + + if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + if (((frameOptions & FRAMEOPTION_START_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_START_OFFSET)) + winstate->framehead_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + if (((frameOptions & FRAMEOPTION_END_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_END_OFFSET)) + winstate->frametail_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + } + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&winstate->ss.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&winstate->ss.ps, NULL); + + /* Set up data for comparing tuples */ + if (node->partNumCols > 0) + winstate->partEqfunction = + execTuplesMatchPrepare(scanDesc, + node->partNumCols, + node->partColIdx, + node->partOperators, + node->partCollations, + &winstate->ss.ps); + + if (node->ordNumCols > 0) + winstate->ordEqfunction = + execTuplesMatchPrepare(scanDesc, + node->ordNumCols, + node->ordColIdx, + node->ordOperators, + node->ordCollations, + &winstate->ss.ps); + + /* + * WindowAgg nodes use aggvalues and aggnulls as well as Agg nodes. + */ + numfuncs = winstate->numfuncs; + numaggs = winstate->numaggs; + econtext = winstate->ss.ps.ps_ExprContext; + econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numfuncs); + econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numfuncs); + + /* + * allocate per-wfunc/per-agg state information. + */ + perfunc = (WindowStatePerFunc) palloc0(sizeof(WindowStatePerFuncData) * numfuncs); + peragg = (WindowStatePerAgg) palloc0(sizeof(WindowStatePerAggData) * numaggs); + winstate->perfunc = perfunc; + winstate->peragg = peragg; + + wfuncno = -1; + aggno = -1; + foreach(l, winstate->funcs) + { + WindowFuncExprState *wfuncstate = (WindowFuncExprState *) lfirst(l); + WindowFunc *wfunc = wfuncstate->wfunc; + WindowStatePerFunc perfuncstate; + AclResult aclresult; + int i; + + if (wfunc->winref != node->winref) /* planner screwed up? 
*/ + elog(ERROR, "WindowFunc with winref %u assigned to WindowAgg with winref %u", + wfunc->winref, node->winref); + + /* Look for a previous duplicate window function */ + for (i = 0; i <= wfuncno; i++) + { + if (equal(wfunc, perfunc[i].wfunc) && + !contain_volatile_functions((Node *) wfunc)) + break; + } + if (i <= wfuncno) + { + /* Found a match to an existing entry, so just mark it */ + wfuncstate->wfuncno = i; + continue; + } + + /* Nope, so assign a new PerAgg record */ + perfuncstate = &perfunc[++wfuncno]; + + /* Mark WindowFunc state node with assigned index in the result array */ + wfuncstate->wfuncno = wfuncno; + + /* Check permission to call window function */ + aclresult = pg_proc_aclcheck(wfunc->winfnoid, GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(wfunc->winfnoid)); + InvokeFunctionExecuteHook(wfunc->winfnoid); + + /* Fill in the perfuncstate data */ + perfuncstate->wfuncstate = wfuncstate; + perfuncstate->wfunc = wfunc; + perfuncstate->numArguments = list_length(wfuncstate->args); + perfuncstate->winCollation = wfunc->inputcollid; + + get_typlenbyval(wfunc->wintype, + &perfuncstate->resulttypeLen, + &perfuncstate->resulttypeByVal); + + /* + * If it's really just a plain aggregate function, we'll emulate the + * Agg environment for it. + */ + perfuncstate->plain_agg = wfunc->winagg; + if (wfunc->winagg) + { + WindowStatePerAgg peraggstate; + + perfuncstate->aggno = ++aggno; + peraggstate = &winstate->peragg[aggno]; + initialize_peragg(winstate, wfunc, peraggstate); + peraggstate->wfuncno = wfuncno; + } + else + { + WindowObject winobj = makeNode(WindowObjectData); + + winobj->winstate = winstate; + winobj->argstates = wfuncstate->args; + winobj->localmem = NULL; + perfuncstate->winobj = winobj; + + /* It's a real window function, so set up to call it. */ + fmgr_info_cxt(wfunc->winfnoid, &perfuncstate->flinfo, + econtext->ecxt_per_query_memory); + fmgr_info_set_expr((Node *) wfunc, &perfuncstate->flinfo); + } + } + + /* Update numfuncs, numaggs to match number of unique functions found */ + winstate->numfuncs = wfuncno + 1; + winstate->numaggs = aggno + 1; + + /* Set up WindowObject for aggregates, if needed */ + if (winstate->numaggs > 0) + { + WindowObject agg_winobj = makeNode(WindowObjectData); + + agg_winobj->winstate = winstate; + agg_winobj->argstates = NIL; + agg_winobj->localmem = NULL; + /* make sure markptr = -1 to invalidate. 
It may not get used */ + agg_winobj->markptr = -1; + agg_winobj->readptr = -1; + winstate->agg_winobj = agg_winobj; + } + + /* copy frame options to state node for easy access */ + winstate->frameOptions = frameOptions; + + /* initialize frame bound offset expressions */ + winstate->startOffset = ExecInitExpr((Expr *) node->startOffset, + (PlanState *) winstate); + winstate->endOffset = ExecInitExpr((Expr *) node->endOffset, + (PlanState *) winstate); + + /* Lookup in_range support functions if needed */ + if (OidIsValid(node->startInRangeFunc)) + fmgr_info(node->startInRangeFunc, &winstate->startInRangeFunc); + if (OidIsValid(node->endInRangeFunc)) + fmgr_info(node->endInRangeFunc, &winstate->endInRangeFunc); + winstate->inRangeColl = node->inRangeColl; + winstate->inRangeAsc = node->inRangeAsc; + winstate->inRangeNullsFirst = node->inRangeNullsFirst; + + winstate->all_first = true; + winstate->partition_spooled = false; + winstate->more_partitions = false; + + return winstate; +} + +/* ----------------- + * ExecEndWindowAgg + * ----------------- + */ +void +ExecEndWindowAgg(WindowAggState *node) +{ + PlanState *outerPlan; + int i; + + release_partition(node); + + ExecClearTuple(node->ss.ss_ScanTupleSlot); + ExecClearTuple(node->first_part_slot); + ExecClearTuple(node->agg_row_slot); + ExecClearTuple(node->temp_slot_1); + ExecClearTuple(node->temp_slot_2); + if (node->framehead_slot) + ExecClearTuple(node->framehead_slot); + if (node->frametail_slot) + ExecClearTuple(node->frametail_slot); + + /* + * Free both the expr contexts. + */ + ExecFreeExprContext(&node->ss.ps); + node->ss.ps.ps_ExprContext = node->tmpcontext; + ExecFreeExprContext(&node->ss.ps); + + for (i = 0; i < node->numaggs; i++) + { + if (node->peragg[i].aggcontext != node->aggcontext) + MemoryContextDelete(node->peragg[i].aggcontext); + } + MemoryContextDelete(node->partcontext); + MemoryContextDelete(node->aggcontext); + + pfree(node->perfunc); + pfree(node->peragg); + + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + +/* ----------------- + * ExecReScanWindowAgg + * ----------------- + */ +void +ExecReScanWindowAgg(WindowAggState *node) +{ + PlanState *outerPlan = outerPlanState(node); + ExprContext *econtext = node->ss.ps.ps_ExprContext; + + node->all_done = false; + node->all_first = true; + + /* release tuplestore et al */ + release_partition(node); + + /* release all temp tuples, but especially first_part_slot */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + ExecClearTuple(node->first_part_slot); + ExecClearTuple(node->agg_row_slot); + ExecClearTuple(node->temp_slot_1); + ExecClearTuple(node->temp_slot_2); + if (node->framehead_slot) + ExecClearTuple(node->framehead_slot); + if (node->frametail_slot) + ExecClearTuple(node->frametail_slot); + + /* Forget current wfunc values */ + MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numfuncs); + MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numfuncs); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* + * initialize_peragg + * + * Almost same as in nodeAgg.c, except we don't support DISTINCT currently. 
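+ * (Ordered-set and variadic aggregates are likewise not possible here;
+ * note the zero direct arguments and non-variadic argument list passed to
+ * build_aggregate_transfn_expr below.)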
+ */ +static WindowStatePerAggData * +initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc, + WindowStatePerAgg peraggstate) +{ + Oid inputTypes[FUNC_MAX_ARGS]; + int numArguments; + HeapTuple aggTuple; + Form_pg_aggregate aggform; + Oid aggtranstype; + AttrNumber initvalAttNo; + AclResult aclresult; + bool use_ma_code; + Oid transfn_oid, + invtransfn_oid, + finalfn_oid; + bool finalextra; + char finalmodify; + Expr *transfnexpr, + *invtransfnexpr, + *finalfnexpr; + Datum textInitVal; + int i; + ListCell *lc; + + numArguments = list_length(wfunc->args); + + i = 0; + foreach(lc, wfunc->args) + { + inputTypes[i++] = exprType((Node *) lfirst(lc)); + } + + aggTuple = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(wfunc->winfnoid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", + wfunc->winfnoid); + aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); + + /* + * Figure out whether we want to use the moving-aggregate implementation, + * and collect the right set of fields from the pg_attribute entry. + * + * It's possible that an aggregate would supply a safe moving-aggregate + * implementation and an unsafe normal one, in which case our hand is + * forced. Otherwise, if the frame head can't move, we don't need + * moving-aggregate code. Even if we'd like to use it, don't do so if the + * aggregate's arguments (and FILTER clause if any) contain any calls to + * volatile functions. Otherwise, the difference between restarting and + * not restarting the aggregation would be user-visible. + */ + if (!OidIsValid(aggform->aggminvtransfn)) + use_ma_code = false; /* sine qua non */ + else if (aggform->aggmfinalmodify == AGGMODIFY_READ_ONLY && + aggform->aggfinalmodify != AGGMODIFY_READ_ONLY) + use_ma_code = true; /* decision forced by safety */ + else if (winstate->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) + use_ma_code = false; /* non-moving frame head */ + else if (contain_volatile_functions((Node *) wfunc)) + use_ma_code = false; /* avoid possible behavioral change */ + else + use_ma_code = true; /* yes, let's use it */ + if (use_ma_code) + { + peraggstate->transfn_oid = transfn_oid = aggform->aggmtransfn; + peraggstate->invtransfn_oid = invtransfn_oid = aggform->aggminvtransfn; + peraggstate->finalfn_oid = finalfn_oid = aggform->aggmfinalfn; + finalextra = aggform->aggmfinalextra; + finalmodify = aggform->aggmfinalmodify; + aggtranstype = aggform->aggmtranstype; + initvalAttNo = Anum_pg_aggregate_aggminitval; + } + else + { + peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn; + peraggstate->invtransfn_oid = invtransfn_oid = InvalidOid; + peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn; + finalextra = aggform->aggfinalextra; + finalmodify = aggform->aggfinalmodify; + aggtranstype = aggform->aggtranstype; + initvalAttNo = Anum_pg_aggregate_agginitval; + } + + /* + * ExecInitWindowAgg already checked permission to call aggregate function + * ... 
but we still need to check the component functions + */ + + /* Check that aggregate owner has permission to call component fns */ + { + HeapTuple procTuple; + Oid aggOwner; + + procTuple = SearchSysCache1(PROCOID, + ObjectIdGetDatum(wfunc->winfnoid)); + if (!HeapTupleIsValid(procTuple)) + elog(ERROR, "cache lookup failed for function %u", + wfunc->winfnoid); + aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner; + ReleaseSysCache(procTuple); + + aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(transfn_oid)); + InvokeFunctionExecuteHook(transfn_oid); + + if (OidIsValid(invtransfn_oid)) + { + aclresult = pg_proc_aclcheck(invtransfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(invtransfn_oid)); + InvokeFunctionExecuteHook(invtransfn_oid); + } + + if (OidIsValid(finalfn_oid)) + { + aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(finalfn_oid)); + InvokeFunctionExecuteHook(finalfn_oid); + } + } + + /* + * If the selected finalfn isn't read-only, we can't run this aggregate as + * a window function. This is a user-facing error, so we take a bit more + * care with the error message than elsewhere in this function. + */ + if (finalmodify != AGGMODIFY_READ_ONLY) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("aggregate function %s does not support use as a window function", + format_procedure(wfunc->winfnoid)))); + + /* Detect how many arguments to pass to the finalfn */ + if (finalextra) + peraggstate->numFinalArgs = numArguments + 1; + else + peraggstate->numFinalArgs = 1; + + /* resolve actual type of transition state, if polymorphic */ + aggtranstype = resolve_aggregate_transtype(wfunc->winfnoid, + aggtranstype, + inputTypes, + numArguments); + + /* build expression trees using actual argument & result types */ + build_aggregate_transfn_expr(inputTypes, + numArguments, + 0, /* no ordered-set window functions yet */ + false, /* no variadic window functions yet */ + aggtranstype, + wfunc->inputcollid, + transfn_oid, + invtransfn_oid, + &transfnexpr, + &invtransfnexpr); + + /* set up infrastructure for calling the transfn(s) and finalfn */ + fmgr_info(transfn_oid, &peraggstate->transfn); + fmgr_info_set_expr((Node *) transfnexpr, &peraggstate->transfn); + + if (OidIsValid(invtransfn_oid)) + { + fmgr_info(invtransfn_oid, &peraggstate->invtransfn); + fmgr_info_set_expr((Node *) invtransfnexpr, &peraggstate->invtransfn); + } + + if (OidIsValid(finalfn_oid)) + { + build_aggregate_finalfn_expr(inputTypes, + peraggstate->numFinalArgs, + aggtranstype, + wfunc->wintype, + wfunc->inputcollid, + finalfn_oid, + &finalfnexpr); + fmgr_info(finalfn_oid, &peraggstate->finalfn); + fmgr_info_set_expr((Node *) finalfnexpr, &peraggstate->finalfn); + } + + /* get info about relevant datatypes */ + get_typlenbyval(wfunc->wintype, + &peraggstate->resulttypeLen, + &peraggstate->resulttypeByVal); + get_typlenbyval(aggtranstype, + &peraggstate->transtypeLen, + &peraggstate->transtypeByVal); + + /* + * initval is potentially null, so don't try to access it as a struct + * field. Must do it the hard way with SysCacheGetAttr. 
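+ * The value is stored as text in pg_aggregate; GetAggInitVal converts it
+ * to the transition data type by calling that type's input function.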
+ */ + textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, initvalAttNo, + &peraggstate->initValueIsNull); + + if (peraggstate->initValueIsNull) + peraggstate->initValue = (Datum) 0; + else + peraggstate->initValue = GetAggInitVal(textInitVal, + aggtranstype); + + /* + * If the transfn is strict and the initval is NULL, make sure input type + * and transtype are the same (or at least binary-compatible), so that + * it's OK to use the first input value as the initial transValue. This + * should have been checked at agg definition time, but we must check + * again in case the transfn's strictness property has been changed. + */ + if (peraggstate->transfn.fn_strict && peraggstate->initValueIsNull) + { + if (numArguments < 1 || + !IsBinaryCoercible(inputTypes[0], aggtranstype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate %u needs to have compatible input type and transition type", + wfunc->winfnoid))); + } + + /* + * Insist that forward and inverse transition functions have the same + * strictness setting. Allowing them to differ would require handling + * more special cases in advance_windowaggregate and + * advance_windowaggregate_base, for no discernible benefit. This should + * have been checked at agg definition time, but we must check again in + * case either function's strictness property has been changed. + */ + if (OidIsValid(invtransfn_oid) && + peraggstate->transfn.fn_strict != peraggstate->invtransfn.fn_strict) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("strictness of aggregate's forward and inverse transition functions must match"))); + + /* + * Moving aggregates use their own aggcontext. + * + * This is necessary because they might restart at different times, so we + * might never be able to reset the shared context otherwise. We can't + * make it the aggregates' responsibility to clean up after themselves, + * because strict aggregates must be restarted whenever we remove their + * last non-NULL input, which the aggregate won't be aware is happening. + * Also, just pfree()ing the transValue upon restarting wouldn't help, + * since we'd miss any indirectly referenced data. We could, in theory, + * make the memory allocation rules for moving aggregates different than + * they have historically been for plain aggregates, but that seems grotty + * and likely to lead to memory leaks. + */ + if (OidIsValid(invtransfn_oid)) + peraggstate->aggcontext = + AllocSetContextCreate(CurrentMemoryContext, + "WindowAgg Per Aggregate", + ALLOCSET_DEFAULT_SIZES); + else + peraggstate->aggcontext = winstate->aggcontext; + + ReleaseSysCache(aggTuple); + + return peraggstate; +} + +static Datum +GetAggInitVal(Datum textInitVal, Oid transtype) +{ + Oid typinput, + typioparam; + char *strInitVal; + Datum initVal; + + getTypeInputInfo(transtype, &typinput, &typioparam); + strInitVal = TextDatumGetCString(textInitVal); + initVal = OidInputFunctionCall(typinput, strInitVal, + typioparam, -1); + pfree(strInitVal); + return initVal; +} + +/* + * are_peers + * compare two rows to see if they are equal according to the ORDER BY clause + * + * NB: this does not consider the window frame mode. 
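+ *
+ * Two rows are peers if they compare equal on all ORDER BY columns, per
+ * ordEqfunction; with no ORDER BY clause, every row in the partition is a
+ * peer of every other.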
+ */ +static bool +are_peers(WindowAggState *winstate, TupleTableSlot *slot1, + TupleTableSlot *slot2) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + ExprContext *econtext = winstate->tmpcontext; + + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + return true; + + econtext->ecxt_outertuple = slot1; + econtext->ecxt_innertuple = slot2; + return ExecQualAndReset(winstate->ordEqfunction, econtext); +} + +/* + * window_gettupleslot + * Fetch the pos'th tuple of the current partition into the slot, + * using the winobj's read pointer + * + * Returns true if successful, false if no such row + */ +static bool +window_gettupleslot(WindowObject winobj, int64 pos, TupleTableSlot *slot) +{ + WindowAggState *winstate = winobj->winstate; + MemoryContext oldcontext; + + /* often called repeatedly in a row */ + CHECK_FOR_INTERRUPTS(); + + /* Don't allow passing -1 to spool_tuples here */ + if (pos < 0) + return false; + + /* If necessary, fetch the tuple into the spool */ + spool_tuples(winstate, pos); + + if (pos >= winstate->spooled_rows) + return false; + + if (pos < winobj->markpos) + elog(ERROR, "cannot fetch row before WindowObject's mark position"); + + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + tuplestore_select_read_pointer(winstate->buffer, winobj->readptr); + + /* + * Advance or rewind until we are within one tuple of the one we want. + */ + if (winobj->seekpos < pos - 1) + { + if (!tuplestore_skiptuples(winstate->buffer, + pos - 1 - winobj->seekpos, + true)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos = pos - 1; + } + else if (winobj->seekpos > pos + 1) + { + if (!tuplestore_skiptuples(winstate->buffer, + winobj->seekpos - (pos + 1), + false)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos = pos + 1; + } + else if (winobj->seekpos == pos) + { + /* + * There's no API to refetch the tuple at the current position. We + * have to move one tuple forward, and then one backward. (We don't + * do it the other way because we might try to fetch the row before + * our mark, which isn't allowed.) XXX this case could stand to be + * optimized. + */ + tuplestore_advance(winstate->buffer, true); + winobj->seekpos++; + } + + /* + * Now we should be on the tuple immediately before or after the one we + * want, so just fetch forwards or backwards as appropriate. + */ + if (winobj->seekpos > pos) + { + if (!tuplestore_gettupleslot(winstate->buffer, false, true, slot)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos--; + } + else + { + if (!tuplestore_gettupleslot(winstate->buffer, true, true, slot)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos++; + } + + Assert(winobj->seekpos == pos); + + MemoryContextSwitchTo(oldcontext); + + return true; +} + + +/*********************************************************************** + * API exposed to window functions + ***********************************************************************/ + + +/* + * WinGetPartitionLocalMemory + * Get working memory that lives till end of partition processing + * + * On first call within a given partition, this allocates and zeroes the + * requested amount of space. Subsequent calls just return the same chunk. + * + * Memory obtained this way is normally used to hold state that should be + * automatically reset for each new partition. 
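+ * (The chunk is allocated in winstate->partcontext; release_partition
+ * resets that context and clears winobj->localmem, so a fresh chunk is
+ * handed out on the first call in the next partition.)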
If a window function wants + * to hold state across the whole query, fcinfo->fn_extra can be used in the + * usual way for that. + */ +void * +WinGetPartitionLocalMemory(WindowObject winobj, Size sz) +{ + Assert(WindowObjectIsValid(winobj)); + if (winobj->localmem == NULL) + winobj->localmem = + MemoryContextAllocZero(winobj->winstate->partcontext, sz); + return winobj->localmem; +} + +/* + * WinGetCurrentPosition + * Return the current row's position (counting from 0) within the current + * partition. + */ +int64 +WinGetCurrentPosition(WindowObject winobj) +{ + Assert(WindowObjectIsValid(winobj)); + return winobj->winstate->currentpos; +} + +/* + * WinGetPartitionRowCount + * Return total number of rows contained in the current partition. + * + * Note: this is a relatively expensive operation because it forces the + * whole partition to be "spooled" into the tuplestore at once. Once + * executed, however, additional calls within the same partition are cheap. + */ +int64 +WinGetPartitionRowCount(WindowObject winobj) +{ + Assert(WindowObjectIsValid(winobj)); + spool_tuples(winobj->winstate, -1); + return winobj->winstate->spooled_rows; +} + +/* + * WinSetMarkPosition + * Set the "mark" position for the window object, which is the oldest row + * number (counting from 0) it is allowed to fetch during all subsequent + * operations within the current partition. + * + * Window functions do not have to call this, but are encouraged to move the + * mark forward when possible to keep the tuplestore size down and prevent + * having to spill rows to disk. + */ +void +WinSetMarkPosition(WindowObject winobj, int64 markpos) +{ + WindowAggState *winstate; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + + if (markpos < winobj->markpos) + elog(ERROR, "cannot move WindowObject's mark position backward"); + tuplestore_select_read_pointer(winstate->buffer, winobj->markptr); + if (markpos > winobj->markpos) + { + tuplestore_skiptuples(winstate->buffer, + markpos - winobj->markpos, + true); + winobj->markpos = markpos; + } + tuplestore_select_read_pointer(winstate->buffer, winobj->readptr); + if (markpos > winobj->seekpos) + { + tuplestore_skiptuples(winstate->buffer, + markpos - winobj->seekpos, + true); + winobj->seekpos = markpos; + } +} + +/* + * WinRowsArePeers + * Compare two rows (specified by absolute position in partition) to see + * if they are equal according to the ORDER BY clause. + * + * NB: this does not consider the window frame mode. + */ +bool +WinRowsArePeers(WindowObject winobj, int64 pos1, int64 pos2) +{ + WindowAggState *winstate; + WindowAgg *node; + TupleTableSlot *slot1; + TupleTableSlot *slot2; + bool res; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + node = (WindowAgg *) winstate->ss.ps.plan; + + /* If no ORDER BY, all rows are peers; don't bother to fetch them */ + if (node->ordNumCols == 0) + return true; + + /* + * Note: OK to use temp_slot_2 here because we aren't calling any + * frame-related functions (those tend to clobber temp_slot_2). 
+ */ + slot1 = winstate->temp_slot_1; + slot2 = winstate->temp_slot_2; + + if (!window_gettupleslot(winobj, pos1, slot1)) + elog(ERROR, "specified position is out of window: " INT64_FORMAT, + pos1); + if (!window_gettupleslot(winobj, pos2, slot2)) + elog(ERROR, "specified position is out of window: " INT64_FORMAT, + pos2); + + res = are_peers(winstate, slot1, slot2); + + ExecClearTuple(slot1); + ExecClearTuple(slot2); + + return res; +} + +/* + * WinGetFuncArgInPartition + * Evaluate a window function's argument expression on a specified + * row of the partition. The row is identified in lseek(2) style, + * i.e. relative to the current, first, or last row. + * + * argno: argument number to evaluate (counted from 0) + * relpos: signed rowcount offset from the seek position + * seektype: WINDOW_SEEK_CURRENT, WINDOW_SEEK_HEAD, or WINDOW_SEEK_TAIL + * set_mark: If the row is found and set_mark is true, the mark is moved to + * the row as a side-effect. + * isnull: output argument, receives isnull status of result + * isout: output argument, set to indicate whether target row position + * is out of partition (can pass NULL if caller doesn't care about this) + * + * Specifying a nonexistent row is not an error, it just causes a null result + * (plus setting *isout true, if isout isn't NULL). + */ +Datum +WinGetFuncArgInPartition(WindowObject winobj, int argno, + int relpos, int seektype, bool set_mark, + bool *isnull, bool *isout) +{ + WindowAggState *winstate; + ExprContext *econtext; + TupleTableSlot *slot; + bool gottuple; + int64 abs_pos; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + econtext = winstate->ss.ps.ps_ExprContext; + slot = winstate->temp_slot_1; + + switch (seektype) + { + case WINDOW_SEEK_CURRENT: + abs_pos = winstate->currentpos + relpos; + break; + case WINDOW_SEEK_HEAD: + abs_pos = relpos; + break; + case WINDOW_SEEK_TAIL: + spool_tuples(winstate, -1); + abs_pos = winstate->spooled_rows - 1 + relpos; + break; + default: + elog(ERROR, "unrecognized window seek type: %d", seektype); + abs_pos = 0; /* keep compiler quiet */ + break; + } + + gottuple = window_gettupleslot(winobj, abs_pos, slot); + + if (!gottuple) + { + if (isout) + *isout = true; + *isnull = true; + return (Datum) 0; + } + else + { + if (isout) + *isout = false; + if (set_mark) + WinSetMarkPosition(winobj, abs_pos); + econtext->ecxt_outertuple = slot; + return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno), + econtext, isnull); + } +} + +/* + * WinGetFuncArgInFrame + * Evaluate a window function's argument expression on a specified + * row of the window frame. The row is identified in lseek(2) style, + * i.e. relative to the first or last row of the frame. (We do not + * support WINDOW_SEEK_CURRENT here, because it's not very clear what + * that should mean if the current row isn't part of the frame.) + * + * argno: argument number to evaluate (counted from 0) + * relpos: signed rowcount offset from the seek position + * seektype: WINDOW_SEEK_HEAD or WINDOW_SEEK_TAIL + * set_mark: If the row is found/in frame and set_mark is true, the mark is + * moved to the row as a side-effect. + * isnull: output argument, receives isnull status of result + * isout: output argument, set to indicate whether target row position + * is out of frame (can pass NULL if caller doesn't care about this) + * + * Specifying a nonexistent or not-in-frame row is not an error, it just + * causes a null result (plus setting *isout true, if isout isn't NULL). 
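+ *
+ * For example, first_value() is implemented essentially as
+ * WinGetFuncArgInFrame(winobj, 0, 0, WINDOW_SEEK_HEAD, true, &isnull, NULL);
+ * while last_value() seeks from WINDOW_SEEK_TAIL and nth_value(x, n)
+ * passes relpos = n - 1 from WINDOW_SEEK_HEAD.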
+ * + * Note that some exclusion-clause options lead to situations where the + * rows that are in-frame are not consecutive in the partition. But we + * count only in-frame rows when measuring relpos. + * + * The set_mark flag is interpreted as meaning that the caller will specify + * a constant (or, perhaps, monotonically increasing) relpos in successive + * calls, so that *if there is no exclusion clause* there will be no need + * to fetch a row before the previously fetched row. But we do not expect + * the caller to know how to account for exclusion clauses. Therefore, + * if there is an exclusion clause we take responsibility for adjusting the + * mark request to something that will be safe given the above assumption + * about relpos. + */ +Datum +WinGetFuncArgInFrame(WindowObject winobj, int argno, + int relpos, int seektype, bool set_mark, + bool *isnull, bool *isout) +{ + WindowAggState *winstate; + ExprContext *econtext; + TupleTableSlot *slot; + int64 abs_pos; + int64 mark_pos; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + econtext = winstate->ss.ps.ps_ExprContext; + slot = winstate->temp_slot_1; + + switch (seektype) + { + case WINDOW_SEEK_CURRENT: + elog(ERROR, "WINDOW_SEEK_CURRENT is not supported for WinGetFuncArgInFrame"); + abs_pos = mark_pos = 0; /* keep compiler quiet */ + break; + case WINDOW_SEEK_HEAD: + /* rejecting relpos < 0 is easy and simplifies code below */ + if (relpos < 0) + goto out_of_frame; + update_frameheadpos(winstate); + abs_pos = winstate->frameheadpos + relpos; + mark_pos = abs_pos; + + /* + * Account for exclusion option if one is active, but advance only + * abs_pos not mark_pos. This prevents changes of the current + * row's peer group from resulting in trying to fetch a row before + * some previous mark position. + * + * Note that in some corner cases such as current row being + * outside frame, these calculations are theoretically too simple, + * but it doesn't matter because we'll end up deciding the row is + * out of frame. We do not attempt to avoid fetching rows past + * end of frame; that would happen in some cases anyway. + */ + switch (winstate->frameOptions & FRAMEOPTION_EXCLUSION) + { + case 0: + /* no adjustment needed */ + break; + case FRAMEOPTION_EXCLUDE_CURRENT_ROW: + if (abs_pos >= winstate->currentpos && + winstate->currentpos >= winstate->frameheadpos) + abs_pos++; + break; + case FRAMEOPTION_EXCLUDE_GROUP: + update_grouptailpos(winstate); + if (abs_pos >= winstate->groupheadpos && + winstate->grouptailpos > winstate->frameheadpos) + { + int64 overlapstart = Max(winstate->groupheadpos, + winstate->frameheadpos); + + abs_pos += winstate->grouptailpos - overlapstart; + } + break; + case FRAMEOPTION_EXCLUDE_TIES: + update_grouptailpos(winstate); + if (abs_pos >= winstate->groupheadpos && + winstate->grouptailpos > winstate->frameheadpos) + { + int64 overlapstart = Max(winstate->groupheadpos, + winstate->frameheadpos); + + if (abs_pos == overlapstart) + abs_pos = winstate->currentpos; + else + abs_pos += winstate->grouptailpos - overlapstart - 1; + } + break; + default: + elog(ERROR, "unrecognized frame option state: 0x%x", + winstate->frameOptions); + break; + } + break; + case WINDOW_SEEK_TAIL: + /* rejecting relpos > 0 is easy and simplifies code below */ + if (relpos > 0) + goto out_of_frame; + update_frametailpos(winstate); + abs_pos = winstate->frametailpos - 1 + relpos; + + /* + * Account for exclusion option if one is active. 
If there is no + * exclusion, we can safely set the mark at the accessed row. But + * if there is, we can only mark the frame start, because we can't + * be sure how far back in the frame the exclusion might cause us + * to fetch in future. Furthermore, we have to actually check + * against frameheadpos here, since it's unsafe to try to fetch a + * row before frame start if the mark might be there already. + */ + switch (winstate->frameOptions & FRAMEOPTION_EXCLUSION) + { + case 0: + /* no adjustment needed */ + mark_pos = abs_pos; + break; + case FRAMEOPTION_EXCLUDE_CURRENT_ROW: + if (abs_pos <= winstate->currentpos && + winstate->currentpos < winstate->frametailpos) + abs_pos--; + update_frameheadpos(winstate); + if (abs_pos < winstate->frameheadpos) + goto out_of_frame; + mark_pos = winstate->frameheadpos; + break; + case FRAMEOPTION_EXCLUDE_GROUP: + update_grouptailpos(winstate); + if (abs_pos < winstate->grouptailpos && + winstate->groupheadpos < winstate->frametailpos) + { + int64 overlapend = Min(winstate->grouptailpos, + winstate->frametailpos); + + abs_pos -= overlapend - winstate->groupheadpos; + } + update_frameheadpos(winstate); + if (abs_pos < winstate->frameheadpos) + goto out_of_frame; + mark_pos = winstate->frameheadpos; + break; + case FRAMEOPTION_EXCLUDE_TIES: + update_grouptailpos(winstate); + if (abs_pos < winstate->grouptailpos && + winstate->groupheadpos < winstate->frametailpos) + { + int64 overlapend = Min(winstate->grouptailpos, + winstate->frametailpos); + + if (abs_pos == overlapend - 1) + abs_pos = winstate->currentpos; + else + abs_pos -= overlapend - 1 - winstate->groupheadpos; + } + update_frameheadpos(winstate); + if (abs_pos < winstate->frameheadpos) + goto out_of_frame; + mark_pos = winstate->frameheadpos; + break; + default: + elog(ERROR, "unrecognized frame option state: 0x%x", + winstate->frameOptions); + mark_pos = 0; /* keep compiler quiet */ + break; + } + break; + default: + elog(ERROR, "unrecognized window seek type: %d", seektype); + abs_pos = mark_pos = 0; /* keep compiler quiet */ + break; + } + + if (!window_gettupleslot(winobj, abs_pos, slot)) + goto out_of_frame; + + /* The code above does not detect all out-of-frame cases, so check */ + if (row_is_in_frame(winstate, abs_pos, slot) <= 0) + goto out_of_frame; + + if (isout) + *isout = false; + if (set_mark) + WinSetMarkPosition(winobj, mark_pos); + econtext->ecxt_outertuple = slot; + return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno), + econtext, isnull); + +out_of_frame: + if (isout) + *isout = true; + *isnull = true; + return (Datum) 0; +} + +/* + * WinGetFuncArgCurrent + * Evaluate a window function's argument expression on the current row. + * + * argno: argument number to evaluate (counted from 0) + * isnull: output argument, receives isnull status of result + * + * Note: this isn't quite equivalent to WinGetFuncArgInPartition or + * WinGetFuncArgInFrame targeting the current row, because it will succeed + * even if the WindowObject's mark has been set beyond the current row. + * This should generally be used for "ordinary" arguments of a window + * function, such as the offset argument of lead() or lag(). 
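/*
 * Illustrative sketch (not part of this file): a simplified nth_value(val, n)
 * combining the two functions documented above.  The "ordinary" second
 * argument is read with WinGetFuncArgCurrent(), then the n'th row of the
 * frame is fetched with WinGetFuncArgInFrame().  The function name is
 * invented, error handling is minimal, and set_mark is passed as false
 * because n may differ from row to row.
 */
#include "postgres.h"
#include "fmgr.h"
#include "windowapi.h"

PG_FUNCTION_INFO_V1(my_nth_value);

Datum
my_nth_value(PG_FUNCTION_ARGS)
{
	WindowObject winobj = PG_WINDOW_OBJECT();
	Datum		result;
	bool		isnull;
	int32		nth;

	nth = DatumGetInt32(WinGetFuncArgCurrent(winobj, 1, &isnull));
	if (isnull)
		PG_RETURN_NULL();
	if (nth <= 0)
		elog(ERROR, "argument of nth_value must be greater than zero");

	/* n is 1-based for the user, 0-based relative to WINDOW_SEEK_HEAD */
	result = WinGetFuncArgInFrame(winobj, 0,
								  nth - 1, WINDOW_SEEK_HEAD, false,
								  &isnull, NULL);
	if (isnull)
		PG_RETURN_NULL();

	PG_RETURN_DATUM(result);
}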
+ */ +Datum +WinGetFuncArgCurrent(WindowObject winobj, int argno, bool *isnull) +{ + WindowAggState *winstate; + ExprContext *econtext; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + + econtext = winstate->ss.ps.ps_ExprContext; + + econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot; + return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno), + econtext, isnull); +} diff --git a/src/backend/executor/nodeWorktablescan.c b/src/backend/executor/nodeWorktablescan.c new file mode 100644 index 0000000..91d3bf3 --- /dev/null +++ b/src/backend/executor/nodeWorktablescan.c @@ -0,0 +1,223 @@ +/*------------------------------------------------------------------------- + * + * nodeWorktablescan.c + * routines to handle WorkTableScan nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeWorktablescan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeWorktablescan.h" + +static TupleTableSlot *WorkTableScanNext(WorkTableScanState *node); + +/* ---------------------------------------------------------------- + * WorkTableScanNext + * + * This is a workhorse for ExecWorkTableScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +WorkTableScanNext(WorkTableScanState *node) +{ + TupleTableSlot *slot; + Tuplestorestate *tuplestorestate; + + /* + * get information from the estate and scan state + * + * Note: we intentionally do not support backward scan. Although it would + * take only a couple more lines here, it would force nodeRecursiveunion.c + * to create the tuplestore with backward scan enabled, which has a + * performance cost. In practice backward scan is never useful for a + * worktable plan node, since it cannot appear high enough in the plan + * tree of a scrollable cursor to be exposed to a backward-scan + * requirement. So it's not worth expending effort to support it. + * + * Note: we are also assuming that this node is the only reader of the + * worktable. Therefore, we don't need a private read pointer for the + * tuplestore, nor do we need to tell tuplestore_gettupleslot to copy. + */ + Assert(ScanDirectionIsForward(node->ss.ps.state->es_direction)); + + tuplestorestate = node->rustate->working_table; + + /* + * Get the next tuple from tuplestore. Return NULL if no more tuples. + */ + slot = node->ss.ss_ScanTupleSlot; + (void) tuplestore_gettupleslot(tuplestorestate, true, false, slot); + return slot; +} + +/* + * WorkTableScanRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +WorkTableScanRecheck(WorkTableScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecWorkTableScan(node) + * + * Scans the worktable sequentially and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecWorkTableScan(PlanState *pstate) +{ + WorkTableScanState *node = castNode(WorkTableScanState, pstate); + + /* + * On the first call, find the ancestor RecursiveUnion's state via the + * Param slot reserved for it. 
(We can't do this during node init because + * there are corner cases where we'll get the init call before the + * RecursiveUnion does.) + */ + if (node->rustate == NULL) + { + WorkTableScan *plan = (WorkTableScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + ParamExecData *param; + + param = &(estate->es_param_exec_vals[plan->wtParam]); + Assert(param->execPlan == NULL); + Assert(!param->isnull); + node->rustate = castNode(RecursiveUnionState, DatumGetPointer(param->value)); + Assert(node->rustate); + + /* + * The scan tuple type (ie, the rowtype we expect to find in the work + * table) is the same as the result rowtype of the ancestor + * RecursiveUnion node. Note this depends on the assumption that + * RecursiveUnion doesn't allow projection. + */ + ExecAssignScanType(&node->ss, + ExecGetResultType(&node->rustate->ps)); + + /* + * Now we can initialize the projection info. This must be completed + * before we can call ExecScan(). + */ + ExecAssignScanProjectionInfo(&node->ss); + } + + return ExecScan(&node->ss, + (ExecScanAccessMtd) WorkTableScanNext, + (ExecScanRecheckMtd) WorkTableScanRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitWorkTableScan + * ---------------------------------------------------------------- + */ +WorkTableScanState * +ExecInitWorkTableScan(WorkTableScan *node, EState *estate, int eflags) +{ + WorkTableScanState *scanstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * WorkTableScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new WorkTableScanState for node + */ + scanstate = makeNode(WorkTableScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecWorkTableScan; + scanstate->rustate = NULL; /* we'll set this later */ + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * tuple table initialization + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + + /* signal that return type is not yet known */ + scanstate->ss.ps.resultopsset = true; + scanstate->ss.ps.resultopsfixed = false; + + ExecInitScanTupleSlot(estate, &scanstate->ss, NULL, &TTSOpsMinimalTuple); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + /* + * Do not yet initialize projection info, see ExecWorkTableScan() for + * details. + */ + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndWorkTableScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndWorkTableScan(WorkTableScanState *node) +{ + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanWorkTableScan + * + * Rescans the relation. 
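/*
 * Illustrative note (not part of this file): the producer side of this
 * wtParam handshake lives in nodeRecursiveunion.c.  During its own
 * initialization, the RecursiveUnion node stores a pointer to its planstate
 * in the reserved Param slot, roughly (paraphrased):
 *
 *		prmdata = &(estate->es_param_exec_vals[node->wtParam]);
 *		prmdata->value = PointerGetDatum(rustate);
 *		prmdata->isnull = false;
 *
 * which is what ExecWorkTableScan() above reads back on its first call to
 * locate the shared working_table tuplestore.
 */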
+ * ---------------------------------------------------------------- + */ +void +ExecReScanWorkTableScan(WorkTableScanState *node) +{ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + /* No need (or way) to rescan if ExecWorkTableScan not called yet */ + if (node->rustate) + tuplestore_rescan(node->rustate->working_table); +} diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c new file mode 100644 index 0000000..f73c1e7 --- /dev/null +++ b/src/backend/executor/spi.c @@ -0,0 +1,3383 @@ +/*------------------------------------------------------------------------- + * + * spi.c + * Server Programming Interface + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/spi.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/printtup.h" +#include "access/sysattr.h" +#include "access/xact.h" +#include "catalog/heap.h" +#include "catalog/pg_type.h" +#include "commands/trigger.h" +#include "executor/executor.h" +#include "executor/spi_priv.h" +#include "miscadmin.h" +#include "tcop/pquery.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + + +/* + * These global variables are part of the API for various SPI functions + * (a horrible API choice, but it's too late now). To reduce the risk of + * interference between different SPI callers, we save and restore them + * when entering/exiting a SPI nesting level. 
+ */ +uint64 SPI_processed = 0; +SPITupleTable *SPI_tuptable = NULL; +int SPI_result = 0; + +static _SPI_connection *_SPI_stack = NULL; +static _SPI_connection *_SPI_current = NULL; +static int _SPI_stack_depth = 0; /* allocated size of _SPI_stack */ +static int _SPI_connected = -1; /* current stack index */ + +typedef struct SPICallbackArg +{ + const char *query; + RawParseMode mode; +} SPICallbackArg; + +static Portal SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, + ParamListInfo paramLI, bool read_only); + +static void _SPI_prepare_plan(const char *src, SPIPlanPtr plan); + +static void _SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan); + +static int _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, + Snapshot snapshot, Snapshot crosscheck_snapshot, + bool fire_triggers); + +static ParamListInfo _SPI_convert_params(int nargs, Oid *argtypes, + Datum *Values, const char *Nulls); + +static int _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount); + +static void _SPI_error_callback(void *arg); + +static void _SPI_cursor_operation(Portal portal, + FetchDirection direction, long count, + DestReceiver *dest); + +static SPIPlanPtr _SPI_make_plan_non_temp(SPIPlanPtr plan); +static SPIPlanPtr _SPI_save_plan(SPIPlanPtr plan); + +static int _SPI_begin_call(bool use_exec); +static int _SPI_end_call(bool use_exec); +static MemoryContext _SPI_execmem(void); +static MemoryContext _SPI_procmem(void); +static bool _SPI_checktuples(void); + + +/* =================== interface functions =================== */ + +int +SPI_connect(void) +{ + return SPI_connect_ext(0); +} + +int +SPI_connect_ext(int options) +{ + int newdepth; + + /* Enlarge stack if necessary */ + if (_SPI_stack == NULL) + { + if (_SPI_connected != -1 || _SPI_stack_depth != 0) + elog(ERROR, "SPI stack corrupted"); + newdepth = 16; + _SPI_stack = (_SPI_connection *) + MemoryContextAlloc(TopMemoryContext, + newdepth * sizeof(_SPI_connection)); + _SPI_stack_depth = newdepth; + } + else + { + if (_SPI_stack_depth <= 0 || _SPI_stack_depth <= _SPI_connected) + elog(ERROR, "SPI stack corrupted"); + if (_SPI_stack_depth == _SPI_connected + 1) + { + newdepth = _SPI_stack_depth * 2; + _SPI_stack = (_SPI_connection *) + repalloc(_SPI_stack, + newdepth * sizeof(_SPI_connection)); + _SPI_stack_depth = newdepth; + } + } + + /* Enter new stack level */ + _SPI_connected++; + Assert(_SPI_connected >= 0 && _SPI_connected < _SPI_stack_depth); + + _SPI_current = &(_SPI_stack[_SPI_connected]); + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + _SPI_current->execSubid = InvalidSubTransactionId; + slist_init(&_SPI_current->tuptables); + _SPI_current->procCxt = NULL; /* in case we fail to create 'em */ + _SPI_current->execCxt = NULL; + _SPI_current->connectSubid = GetCurrentSubTransactionId(); + _SPI_current->queryEnv = NULL; + _SPI_current->atomic = (options & SPI_OPT_NONATOMIC ? false : true); + _SPI_current->internal_xact = false; + _SPI_current->outer_processed = SPI_processed; + _SPI_current->outer_tuptable = SPI_tuptable; + _SPI_current->outer_result = SPI_result; + + /* + * Create memory contexts for this procedure + * + * In atomic contexts (the normal case), we use TopTransactionContext, + * otherwise PortalContext, so that it lives across transaction + * boundaries. + * + * XXX It could be better to use PortalContext as the parent context in + * all cases, but we may not be inside a portal (consider deferred-trigger + * execution). Perhaps CurTransactionContext could be an option? 
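/*
 * Illustrative sketch (not part of this file): the canonical calling pattern
 * the machinery above supports.  All SPI work is bracketed between
 * SPI_connect() and SPI_finish(), and the per-nesting-level globals
 * (SPI_processed, SPI_tuptable) are consumed in between.  The function and
 * the "widgets" table are invented.
 */
#include "postgres.h"
#include "executor/spi.h"

static void
count_widgets(void)
{
	int			ret;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	ret = SPI_execute("SELECT count(*) FROM widgets", true, 0);
	if (ret != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute failed: error code %d", ret);

	if (SPI_processed > 0)
	{
		bool		isnull;
		Datum		count;

		count = SPI_getbinval(SPI_tuptable->vals[0],
							  SPI_tuptable->tupdesc,
							  1, &isnull);
		if (!isnull)
			elog(INFO, "widgets: " INT64_FORMAT, DatumGetInt64(count));
	}

	SPI_finish();
}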
For now + * it doesn't matter because we clean up explicitly in AtEOSubXact_SPI(); + * but see also AtEOXact_SPI(). + */ + _SPI_current->procCxt = AllocSetContextCreate(_SPI_current->atomic ? TopTransactionContext : PortalContext, + "SPI Proc", + ALLOCSET_DEFAULT_SIZES); + _SPI_current->execCxt = AllocSetContextCreate(_SPI_current->atomic ? TopTransactionContext : _SPI_current->procCxt, + "SPI Exec", + ALLOCSET_DEFAULT_SIZES); + /* ... and switch to procedure's context */ + _SPI_current->savedcxt = MemoryContextSwitchTo(_SPI_current->procCxt); + + /* + * Reset API global variables so that current caller cannot accidentally + * depend on state of an outer caller. + */ + SPI_processed = 0; + SPI_tuptable = NULL; + SPI_result = 0; + + return SPI_OK_CONNECT; +} + +int +SPI_finish(void) +{ + int res; + + res = _SPI_begin_call(false); /* just check we're connected */ + if (res < 0) + return res; + + /* Restore memory context as it was before procedure call */ + MemoryContextSwitchTo(_SPI_current->savedcxt); + + /* Release memory used in procedure call (including tuptables) */ + MemoryContextDelete(_SPI_current->execCxt); + _SPI_current->execCxt = NULL; + MemoryContextDelete(_SPI_current->procCxt); + _SPI_current->procCxt = NULL; + + /* + * Restore outer API variables, especially SPI_tuptable which is probably + * pointing at a just-deleted tuptable + */ + SPI_processed = _SPI_current->outer_processed; + SPI_tuptable = _SPI_current->outer_tuptable; + SPI_result = _SPI_current->outer_result; + + /* Exit stack level */ + _SPI_connected--; + if (_SPI_connected < 0) + _SPI_current = NULL; + else + _SPI_current = &(_SPI_stack[_SPI_connected]); + + return SPI_OK_FINISH; +} + +/* + * SPI_start_transaction is a no-op, kept for backwards compatibility. + * SPI callers are *always* inside a transaction. + */ +void +SPI_start_transaction(void) +{ +} + +static void +_SPI_commit(bool chain) +{ + MemoryContext oldcontext = CurrentMemoryContext; + + /* + * Complain if we are in a context that doesn't permit transaction + * termination. (Note: here and _SPI_rollback should be the only places + * that throw ERRCODE_INVALID_TRANSACTION_TERMINATION, so that callers can + * test for that with security that they know what happened.) + */ + if (_SPI_current->atomic) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("invalid transaction termination"))); + + /* + * This restriction is required by PLs implemented on top of SPI. They + * use subtransactions to establish exception blocks that are supposed to + * be rolled back together if there is an error. Terminating the + * top-level transaction in such a block violates that idea. A future PL + * implementation might have different ideas about this, in which case + * this restriction would have to be refined or the check possibly be + * moved out of SPI into the PLs. Note however that the code below relies + * on not being within a subtransaction. + */ + if (IsSubTransaction()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("cannot commit while a subtransaction is active"))); + + /* XXX this ain't re-entrant enough for my taste */ + if (chain) + SaveTransactionCharacteristics(); + + /* Catch any error occurring during the COMMIT */ + PG_TRY(); + { + /* Protect current SPI stack entry against deletion */ + _SPI_current->internal_xact = true; + + /* + * Hold any pinned portals that any PLs might be using. 
We have to do + * this before changing transaction state, since this will run + * user-defined code that might throw an error. + */ + HoldPinnedPortals(); + + /* Release snapshots associated with portals */ + ForgetPortalSnapshots(); + + /* Do the deed */ + CommitTransactionCommand(); + + /* Immediately start a new transaction */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + } + PG_CATCH(); + { + ErrorData *edata; + + /* Save error info in caller's context */ + MemoryContextSwitchTo(oldcontext); + edata = CopyErrorData(); + FlushErrorState(); + + /* + * Abort the failed transaction. If this fails too, we'll just + * propagate the error out ... there's not that much we can do. + */ + AbortCurrentTransaction(); + + /* ... and start a new one */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + + /* Now that we've cleaned up the transaction, re-throw the error */ + ReThrowError(edata); + } + PG_END_TRY(); +} + +void +SPI_commit(void) +{ + _SPI_commit(false); +} + +void +SPI_commit_and_chain(void) +{ + _SPI_commit(true); +} + +static void +_SPI_rollback(bool chain) +{ + MemoryContext oldcontext = CurrentMemoryContext; + + /* see under SPI_commit() */ + if (_SPI_current->atomic) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("invalid transaction termination"))); + + /* see under SPI_commit() */ + if (IsSubTransaction()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("cannot roll back while a subtransaction is active"))); + + /* XXX this ain't re-entrant enough for my taste */ + if (chain) + SaveTransactionCharacteristics(); + + /* Catch any error occurring during the ROLLBACK */ + PG_TRY(); + { + /* Protect current SPI stack entry against deletion */ + _SPI_current->internal_xact = true; + + /* + * Hold any pinned portals that any PLs might be using. We have to do + * this before changing transaction state, since this will run + * user-defined code that might throw an error, and in any case + * couldn't be run in an already-aborted transaction. + */ + HoldPinnedPortals(); + + /* Release snapshots associated with portals */ + ForgetPortalSnapshots(); + + /* Do the deed */ + AbortCurrentTransaction(); + + /* Immediately start a new transaction */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + } + PG_CATCH(); + { + ErrorData *edata; + + /* Save error info in caller's context */ + MemoryContextSwitchTo(oldcontext); + edata = CopyErrorData(); + FlushErrorState(); + + /* + * Try again to abort the failed transaction. If this fails too, + * we'll just propagate the error out ... there's not that much we can + * do. + */ + AbortCurrentTransaction(); + + /* ... and start a new one */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + + /* Now that we've cleaned up the transaction, re-throw the error */ + ReThrowError(edata); + } + PG_END_TRY(); +} + +void +SPI_rollback(void) +{ + _SPI_rollback(false); +} + +void +SPI_rollback_and_chain(void) +{ + _SPI_rollback(true); +} + +/* + * SPICleanup is a no-op, kept for backwards compatibility. We rely on + * AtEOXact_SPI to cleanup. 
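/*
 * Illustrative sketch (not part of this file): how a procedural language
 * would use the nonatomic machinery above.  Connecting with
 * SPI_OPT_NONATOMIC makes SPI_commit()/SPI_rollback() permissible; this
 * still only succeeds when the surrounding context actually allows
 * transaction control (e.g. a CALL outside an explicit transaction block),
 * otherwise _SPI_commit() raises the error shown above.  The function and
 * table names are invented.
 */
#include "postgres.h"
#include "executor/spi.h"

static void
log_and_commit(void)
{
	if (SPI_connect_ext(SPI_OPT_NONATOMIC) != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect_ext failed");

	if (SPI_execute("INSERT INTO audit_log DEFAULT VALUES", false, 0) < 0)
		elog(ERROR, "SPI_execute failed");

	/* Commit the surrounding transaction; a fresh one is started for us. */
	SPI_commit();

	SPI_finish();
}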
Extensions should not (need to) fiddle with the + * internal SPI state directly. + */ +void +SPICleanup(void) +{ +} + +/* + * Clean up SPI state at transaction commit or abort. + */ +void +AtEOXact_SPI(bool isCommit) +{ + bool found = false; + + /* + * Pop stack entries, stopping if we find one marked internal_xact (that + * one belongs to the caller of SPI_commit or SPI_abort). + */ + while (_SPI_connected >= 0) + { + _SPI_connection *connection = &(_SPI_stack[_SPI_connected]); + + if (connection->internal_xact) + break; + + found = true; + + /* + * We need not release the procedure's memory contexts explicitly, as + * they'll go away automatically when their parent context does; see + * notes in SPI_connect_ext. + */ + + /* + * Restore outer global variables and pop the stack entry. Unlike + * SPI_finish(), we don't risk switching to memory contexts that might + * be already gone. + */ + SPI_processed = connection->outer_processed; + SPI_tuptable = connection->outer_tuptable; + SPI_result = connection->outer_result; + + _SPI_connected--; + if (_SPI_connected < 0) + _SPI_current = NULL; + else + _SPI_current = &(_SPI_stack[_SPI_connected]); + } + + /* We should only find entries to pop during an ABORT. */ + if (found && isCommit) + ereport(WARNING, + (errcode(ERRCODE_WARNING), + errmsg("transaction left non-empty SPI stack"), + errhint("Check for missing \"SPI_finish\" calls."))); +} + +/* + * Clean up SPI state at subtransaction commit or abort. + * + * During commit, there shouldn't be any unclosed entries remaining from + * the current subtransaction; we emit a warning if any are found. + */ +void +AtEOSubXact_SPI(bool isCommit, SubTransactionId mySubid) +{ + bool found = false; + + while (_SPI_connected >= 0) + { + _SPI_connection *connection = &(_SPI_stack[_SPI_connected]); + + if (connection->connectSubid != mySubid) + break; /* couldn't be any underneath it either */ + + if (connection->internal_xact) + break; + + found = true; + + /* + * Release procedure memory explicitly (see note in SPI_connect) + */ + if (connection->execCxt) + { + MemoryContextDelete(connection->execCxt); + connection->execCxt = NULL; + } + if (connection->procCxt) + { + MemoryContextDelete(connection->procCxt); + connection->procCxt = NULL; + } + + /* + * Restore outer global variables and pop the stack entry. Unlike + * SPI_finish(), we don't risk switching to memory contexts that might + * be already gone. + */ + SPI_processed = connection->outer_processed; + SPI_tuptable = connection->outer_tuptable; + SPI_result = connection->outer_result; + + _SPI_connected--; + if (_SPI_connected < 0) + _SPI_current = NULL; + else + _SPI_current = &(_SPI_stack[_SPI_connected]); + } + + if (found && isCommit) + ereport(WARNING, + (errcode(ERRCODE_WARNING), + errmsg("subtransaction left non-empty SPI stack"), + errhint("Check for missing \"SPI_finish\" calls."))); + + /* + * If we are aborting a subtransaction and there is an open SPI context + * surrounding the subxact, clean up to prevent memory leakage. + */ + if (_SPI_current && !isCommit) + { + slist_mutable_iter siter; + + /* + * Throw away executor state if current executor operation was started + * within current subxact (essentially, force a _SPI_end_call(true)). 
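/*
 * Illustrative note (not part of this file): the WARNING paths above fire
 * when a C function connects to SPI but returns without calling
 * SPI_finish(), for example:
 *
 *		SPI_connect();
 *		(void) SPI_execute("SELECT 1", true, 0);
 *		return;		(missing SPI_finish() here)
 *
 * At transaction commit, AtEOXact_SPI() pops the leftover stack entry and
 * reports "transaction left non-empty SPI stack", with a hint to check for
 * missing SPI_finish calls.
 */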
+ */ + if (_SPI_current->execSubid >= mySubid) + { + _SPI_current->execSubid = InvalidSubTransactionId; + MemoryContextResetAndDeleteChildren(_SPI_current->execCxt); + } + + /* throw away any tuple tables created within current subxact */ + slist_foreach_modify(siter, &_SPI_current->tuptables) + { + SPITupleTable *tuptable; + + tuptable = slist_container(SPITupleTable, next, siter.cur); + if (tuptable->subid >= mySubid) + { + /* + * If we used SPI_freetuptable() here, its internal search of + * the tuptables list would make this operation O(N^2). + * Instead, just free the tuptable manually. This should + * match what SPI_freetuptable() does. + */ + slist_delete_current(&siter); + if (tuptable == _SPI_current->tuptable) + _SPI_current->tuptable = NULL; + if (tuptable == SPI_tuptable) + SPI_tuptable = NULL; + MemoryContextDelete(tuptable->tuptabcxt); + } + } + } +} + +/* + * Are we executing inside a procedure (that is, a nonatomic SPI context)? + */ +bool +SPI_inside_nonatomic_context(void) +{ + if (_SPI_current == NULL) + return false; /* not in any SPI context at all */ + if (_SPI_current->atomic) + return false; /* it's atomic (ie function not procedure) */ + return true; +} + + +/* Parse, plan, and execute a query string */ +int +SPI_execute(const char *src, bool read_only, long tcount) +{ + _SPI_plan plan; + SPIExecuteOptions options; + int res; + + if (src == NULL || tcount < 0) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = CURSOR_OPT_PARALLEL_OK; + + _SPI_prepare_oneshot_plan(src, &plan); + + memset(&options, 0, sizeof(options)); + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(&plan, &options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* Obsolete version of SPI_execute */ +int +SPI_exec(const char *src, long tcount) +{ + return SPI_execute(src, false, tcount); +} + +/* Parse, plan, and execute a query string, with extensible options */ +int +SPI_execute_extended(const char *src, + const SPIExecuteOptions *options) +{ + int res; + _SPI_plan plan; + + if (src == NULL || options == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = CURSOR_OPT_PARALLEL_OK; + if (options->params) + { + plan.parserSetup = options->params->parserSetup; + plan.parserSetupArg = options->params->parserSetupArg; + } + + _SPI_prepare_oneshot_plan(src, &plan); + + res = _SPI_execute_plan(&plan, options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* Execute a previously prepared plan */ +int +SPI_execute_plan(SPIPlanPtr plan, Datum *Values, const char *Nulls, + bool read_only, long tcount) +{ + SPIExecuteOptions options; + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0) + return SPI_ERROR_ARGUMENT; + + if (plan->nargs > 0 && Values == NULL) + return SPI_ERROR_PARAM; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&options, 0, sizeof(options)); + options.params = _SPI_convert_params(plan->nargs, plan->argtypes, + Values, Nulls); + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(plan, &options, + InvalidSnapshot, InvalidSnapshot, + 
true); + + _SPI_end_call(true); + return res; +} + +/* Obsolete version of SPI_execute_plan */ +int +SPI_execp(SPIPlanPtr plan, Datum *Values, const char *Nulls, long tcount) +{ + return SPI_execute_plan(plan, Values, Nulls, false, tcount); +} + +/* Execute a previously prepared plan */ +int +SPI_execute_plan_extended(SPIPlanPtr plan, + const SPIExecuteOptions *options) +{ + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || options == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + res = _SPI_execute_plan(plan, options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* Execute a previously prepared plan */ +int +SPI_execute_plan_with_paramlist(SPIPlanPtr plan, ParamListInfo params, + bool read_only, long tcount) +{ + SPIExecuteOptions options; + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&options, 0, sizeof(options)); + options.params = params; + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(plan, &options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* + * SPI_execute_snapshot -- identical to SPI_execute_plan, except that we allow + * the caller to specify exactly which snapshots to use, which will be + * registered here. Also, the caller may specify that AFTER triggers should be + * queued as part of the outer query rather than being fired immediately at the + * end of the command. + * + * This is currently not documented in spi.sgml because it is only intended + * for use by RI triggers. + * + * Passing snapshot == InvalidSnapshot will select the normal behavior of + * fetching a new snapshot for each query. + */ +int +SPI_execute_snapshot(SPIPlanPtr plan, + Datum *Values, const char *Nulls, + Snapshot snapshot, Snapshot crosscheck_snapshot, + bool read_only, bool fire_triggers, long tcount) +{ + SPIExecuteOptions options; + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0) + return SPI_ERROR_ARGUMENT; + + if (plan->nargs > 0 && Values == NULL) + return SPI_ERROR_PARAM; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&options, 0, sizeof(options)); + options.params = _SPI_convert_params(plan->nargs, plan->argtypes, + Values, Nulls); + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(plan, &options, + snapshot, crosscheck_snapshot, + fire_triggers); + + _SPI_end_call(true); + return res; +} + +/* + * SPI_execute_with_args -- plan and execute a query with supplied arguments + * + * This is functionally equivalent to SPI_prepare followed by + * SPI_execute_plan. 
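/*
 * Illustrative sketch (not part of this file): the prepared-plan variant of
 * the calling pattern.  A plan is built once with SPI_prepare() (shown just
 * below) and run with SPI_execute_plan(), passing parameter values
 * positionally; a NULL "Nulls" string means no parameter is null.  The
 * function, table, and column names are invented.
 */
#include "postgres.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"

static uint64
delete_old_orders(int32 cutoff_id)
{
	Oid			argtypes[1] = {INT4OID};
	Datum		values[1];
	SPIPlanPtr	plan;
	uint64		ndeleted;
	int			ret;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	plan = SPI_prepare("DELETE FROM orders WHERE order_id < $1",
					   1, argtypes);
	if (plan == NULL)
		elog(ERROR, "SPI_prepare failed: %s",
			 SPI_result_code_string(SPI_result));

	values[0] = Int32GetDatum(cutoff_id);

	ret = SPI_execute_plan(plan, values, NULL, false, 0);
	if (ret != SPI_OK_DELETE)
		elog(ERROR, "SPI_execute_plan failed: %s",
			 SPI_result_code_string(ret));

	ndeleted = SPI_processed;	/* save before SPI_finish() restores it */

	SPI_finish();
	return ndeleted;
}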
+ */ +int +SPI_execute_with_args(const char *src, + int nargs, Oid *argtypes, + Datum *Values, const char *Nulls, + bool read_only, long tcount) +{ + int res; + _SPI_plan plan; + ParamListInfo paramLI; + SPIExecuteOptions options; + + if (src == NULL || nargs < 0 || tcount < 0) + return SPI_ERROR_ARGUMENT; + + if (nargs > 0 && (argtypes == NULL || Values == NULL)) + return SPI_ERROR_PARAM; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = CURSOR_OPT_PARALLEL_OK; + plan.nargs = nargs; + plan.argtypes = argtypes; + plan.parserSetup = NULL; + plan.parserSetupArg = NULL; + + paramLI = _SPI_convert_params(nargs, argtypes, + Values, Nulls); + + _SPI_prepare_oneshot_plan(src, &plan); + + memset(&options, 0, sizeof(options)); + options.params = paramLI; + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(&plan, &options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +SPIPlanPtr +SPI_prepare(const char *src, int nargs, Oid *argtypes) +{ + return SPI_prepare_cursor(src, nargs, argtypes, 0); +} + +SPIPlanPtr +SPI_prepare_cursor(const char *src, int nargs, Oid *argtypes, + int cursorOptions) +{ + _SPI_plan plan; + SPIPlanPtr result; + + if (src == NULL || nargs < 0 || (nargs > 0 && argtypes == NULL)) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + return NULL; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = cursorOptions; + plan.nargs = nargs; + plan.argtypes = argtypes; + plan.parserSetup = NULL; + plan.parserSetupArg = NULL; + + _SPI_prepare_plan(src, &plan); + + /* copy plan to procedure context */ + result = _SPI_make_plan_non_temp(&plan); + + _SPI_end_call(true); + + return result; +} + +SPIPlanPtr +SPI_prepare_extended(const char *src, + const SPIPrepareOptions *options) +{ + _SPI_plan plan; + SPIPlanPtr result; + + if (src == NULL || options == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + return NULL; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = options->parseMode; + plan.cursor_options = options->cursorOptions; + plan.nargs = 0; + plan.argtypes = NULL; + plan.parserSetup = options->parserSetup; + plan.parserSetupArg = options->parserSetupArg; + + _SPI_prepare_plan(src, &plan); + + /* copy plan to procedure context */ + result = _SPI_make_plan_non_temp(&plan); + + _SPI_end_call(true); + + return result; +} + +SPIPlanPtr +SPI_prepare_params(const char *src, + ParserSetupHook parserSetup, + void *parserSetupArg, + int cursorOptions) +{ + _SPI_plan plan; + SPIPlanPtr result; + + if (src == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + return NULL; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = cursorOptions; + plan.nargs = 0; + plan.argtypes = NULL; + plan.parserSetup = parserSetup; + plan.parserSetupArg = parserSetupArg; + + _SPI_prepare_plan(src, &plan); + + /* copy plan to procedure context */ + result = _SPI_make_plan_non_temp(&plan); + + _SPI_end_call(true); + + return result; +} + +int +SPI_keepplan(SPIPlanPtr 
plan) +{ + ListCell *lc; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || + plan->saved || plan->oneshot) + return SPI_ERROR_ARGUMENT; + + /* + * Mark it saved, reparent it under CacheMemoryContext, and mark all the + * component CachedPlanSources as saved. This sequence cannot fail + * partway through, so there's no risk of long-term memory leakage. + */ + plan->saved = true; + MemoryContextSetParent(plan->plancxt, CacheMemoryContext); + + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + SaveCachedPlan(plansource); + } + + return 0; +} + +SPIPlanPtr +SPI_saveplan(SPIPlanPtr plan) +{ + SPIPlanPtr newplan; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(false); /* don't change context */ + if (SPI_result < 0) + return NULL; + + newplan = _SPI_save_plan(plan); + + SPI_result = _SPI_end_call(false); + + return newplan; +} + +int +SPI_freeplan(SPIPlanPtr plan) +{ + ListCell *lc; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + return SPI_ERROR_ARGUMENT; + + /* Release the plancache entries */ + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + DropCachedPlan(plansource); + } + + /* Now get rid of the _SPI_plan and subsidiary data in its plancxt */ + MemoryContextDelete(plan->plancxt); + + return 0; +} + +HeapTuple +SPI_copytuple(HeapTuple tuple) +{ + MemoryContext oldcxt; + HeapTuple ctuple; + + if (tuple == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + if (_SPI_current == NULL) + { + SPI_result = SPI_ERROR_UNCONNECTED; + return NULL; + } + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + ctuple = heap_copytuple(tuple); + + MemoryContextSwitchTo(oldcxt); + + return ctuple; +} + +HeapTupleHeader +SPI_returntuple(HeapTuple tuple, TupleDesc tupdesc) +{ + MemoryContext oldcxt; + HeapTupleHeader dtup; + + if (tuple == NULL || tupdesc == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + if (_SPI_current == NULL) + { + SPI_result = SPI_ERROR_UNCONNECTED; + return NULL; + } + + /* For RECORD results, make sure a typmod has been assigned */ + if (tupdesc->tdtypeid == RECORDOID && + tupdesc->tdtypmod < 0) + assign_record_type_typmod(tupdesc); + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + dtup = DatumGetHeapTupleHeader(heap_copy_tuple_as_datum(tuple, tupdesc)); + + MemoryContextSwitchTo(oldcxt); + + return dtup; +} + +HeapTuple +SPI_modifytuple(Relation rel, HeapTuple tuple, int natts, int *attnum, + Datum *Values, const char *Nulls) +{ + MemoryContext oldcxt; + HeapTuple mtuple; + int numberOfAttributes; + Datum *v; + bool *n; + int i; + + if (rel == NULL || tuple == NULL || natts < 0 || attnum == NULL || Values == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + if (_SPI_current == NULL) + { + SPI_result = SPI_ERROR_UNCONNECTED; + return NULL; + } + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + SPI_result = 0; + + numberOfAttributes = rel->rd_att->natts; + v = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); + n = (bool *) palloc(numberOfAttributes * sizeof(bool)); + + /* fetch old values and nulls */ + heap_deform_tuple(tuple, rel->rd_att, v, n); + + /* replace values and nulls */ + for (i = 0; i < natts; i++) + { + if (attnum[i] <= 0 || attnum[i] > numberOfAttributes) + break; + v[attnum[i] - 1] = Values[i]; + n[attnum[i] - 1] = (Nulls && Nulls[i] == 'n') ? 
true : false; + } + + if (i == natts) /* no errors in *attnum */ + { + mtuple = heap_form_tuple(rel->rd_att, v, n); + + /* + * copy the identification info of the old tuple: t_ctid, t_self, and + * OID (if any) + */ + mtuple->t_data->t_ctid = tuple->t_data->t_ctid; + mtuple->t_self = tuple->t_self; + mtuple->t_tableOid = tuple->t_tableOid; + } + else + { + mtuple = NULL; + SPI_result = SPI_ERROR_NOATTRIBUTE; + } + + pfree(v); + pfree(n); + + MemoryContextSwitchTo(oldcxt); + + return mtuple; +} + +int +SPI_fnumber(TupleDesc tupdesc, const char *fname) +{ + int res; + const FormData_pg_attribute *sysatt; + + for (res = 0; res < tupdesc->natts; res++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, res); + + if (namestrcmp(&attr->attname, fname) == 0 && + !attr->attisdropped) + return res + 1; + } + + sysatt = SystemAttributeByName(fname); + if (sysatt != NULL) + return sysatt->attnum; + + /* SPI_ERROR_NOATTRIBUTE is different from all sys column numbers */ + return SPI_ERROR_NOATTRIBUTE; +} + +char * +SPI_fname(TupleDesc tupdesc, int fnumber) +{ + const FormData_pg_attribute *att; + + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return NULL; + } + + if (fnumber > 0) + att = TupleDescAttr(tupdesc, fnumber - 1); + else + att = SystemAttributeDefinition(fnumber); + + return pstrdup(NameStr(att->attname)); +} + +char * +SPI_getvalue(HeapTuple tuple, TupleDesc tupdesc, int fnumber) +{ + Datum val; + bool isnull; + Oid typoid, + foutoid; + bool typisvarlena; + + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return NULL; + } + + val = heap_getattr(tuple, fnumber, tupdesc, &isnull); + if (isnull) + return NULL; + + if (fnumber > 0) + typoid = TupleDescAttr(tupdesc, fnumber - 1)->atttypid; + else + typoid = (SystemAttributeDefinition(fnumber))->atttypid; + + getTypeOutputInfo(typoid, &foutoid, &typisvarlena); + + return OidOutputFunctionCall(foutoid, val); +} + +Datum +SPI_getbinval(HeapTuple tuple, TupleDesc tupdesc, int fnumber, bool *isnull) +{ + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + *isnull = true; + return (Datum) NULL; + } + + return heap_getattr(tuple, fnumber, tupdesc, isnull); +} + +char * +SPI_gettype(TupleDesc tupdesc, int fnumber) +{ + Oid typoid; + HeapTuple typeTuple; + char *result; + + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return NULL; + } + + if (fnumber > 0) + typoid = TupleDescAttr(tupdesc, fnumber - 1)->atttypid; + else + typoid = (SystemAttributeDefinition(fnumber))->atttypid; + + typeTuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); + + if (!HeapTupleIsValid(typeTuple)) + { + SPI_result = SPI_ERROR_TYPUNKNOWN; + return NULL; + } + + result = pstrdup(NameStr(((Form_pg_type) GETSTRUCT(typeTuple))->typname)); + ReleaseSysCache(typeTuple); + return result; +} + +/* + * Get the data type OID for a column. + * + * There's nothing similar for typmod and typcollation. The rare consumers + * thereof should inspect the TupleDesc directly. 
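/*
 * Illustrative sketch (not part of this file): tying the column accessors
 * above together.  A result column is looked up by name with SPI_fnumber()
 * and then read as text with SPI_getvalue().  The function name is invented;
 * pg_user/usename are the regular system view and column.
 */
#include "postgres.h"
#include "executor/spi.h"

static void
print_user_names(void)
{
	int			ret;
	int			col;
	uint64		i;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	ret = SPI_execute("SELECT usename FROM pg_user", true, 0);
	if (ret != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute failed: %s", SPI_result_code_string(ret));

	col = SPI_fnumber(SPI_tuptable->tupdesc, "usename");
	if (col <= 0)
		elog(ERROR, "column \"usename\" not found");

	for (i = 0; i < SPI_processed; i++)
	{
		char	   *val = SPI_getvalue(SPI_tuptable->vals[i],
									   SPI_tuptable->tupdesc, col);

		elog(INFO, "user: %s", val ? val : "(null)");
	}

	SPI_finish();
}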
+ */ +Oid +SPI_gettypeid(TupleDesc tupdesc, int fnumber) +{ + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return InvalidOid; + } + + if (fnumber > 0) + return TupleDescAttr(tupdesc, fnumber - 1)->atttypid; + else + return (SystemAttributeDefinition(fnumber))->atttypid; +} + +char * +SPI_getrelname(Relation rel) +{ + return pstrdup(RelationGetRelationName(rel)); +} + +char * +SPI_getnspname(Relation rel) +{ + return get_namespace_name(RelationGetNamespace(rel)); +} + +void * +SPI_palloc(Size size) +{ + if (_SPI_current == NULL) + elog(ERROR, "SPI_palloc called while not connected to SPI"); + + return MemoryContextAlloc(_SPI_current->savedcxt, size); +} + +void * +SPI_repalloc(void *pointer, Size size) +{ + /* No longer need to worry which context chunk was in... */ + return repalloc(pointer, size); +} + +void +SPI_pfree(void *pointer) +{ + /* No longer need to worry which context chunk was in... */ + pfree(pointer); +} + +Datum +SPI_datumTransfer(Datum value, bool typByVal, int typLen) +{ + MemoryContext oldcxt; + Datum result; + + if (_SPI_current == NULL) + elog(ERROR, "SPI_datumTransfer called while not connected to SPI"); + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + result = datumTransfer(value, typByVal, typLen); + + MemoryContextSwitchTo(oldcxt); + + return result; +} + +void +SPI_freetuple(HeapTuple tuple) +{ + /* No longer need to worry which context tuple was in... */ + heap_freetuple(tuple); +} + +void +SPI_freetuptable(SPITupleTable *tuptable) +{ + bool found = false; + + /* ignore call if NULL pointer */ + if (tuptable == NULL) + return; + + /* + * Search only the topmost SPI context for a matching tuple table. + */ + if (_SPI_current != NULL) + { + slist_mutable_iter siter; + + /* find tuptable in active list, then remove it */ + slist_foreach_modify(siter, &_SPI_current->tuptables) + { + SPITupleTable *tt; + + tt = slist_container(SPITupleTable, next, siter.cur); + if (tt == tuptable) + { + slist_delete_current(&siter); + found = true; + break; + } + } + } + + /* + * Refuse the deletion if we didn't find it in the topmost SPI context. + * This is primarily a guard against double deletion, but might prevent + * other errors as well. Since the worst consequence of not deleting a + * tuptable would be a transient memory leak, this is just a WARNING. + */ + if (!found) + { + elog(WARNING, "attempt to delete invalid SPITupleTable %p", tuptable); + return; + } + + /* for safety, reset global variables that might point at tuptable */ + if (tuptable == _SPI_current->tuptable) + _SPI_current->tuptable = NULL; + if (tuptable == SPI_tuptable) + SPI_tuptable = NULL; + + /* release all memory belonging to tuptable */ + MemoryContextDelete(tuptable->tuptabcxt); +} + + +/* + * SPI_cursor_open() + * + * Open a prepared SPI plan as a portal + */ +Portal +SPI_cursor_open(const char *name, SPIPlanPtr plan, + Datum *Values, const char *Nulls, + bool read_only) +{ + Portal portal; + ParamListInfo paramLI; + + /* build transient ParamListInfo in caller's context */ + paramLI = _SPI_convert_params(plan->nargs, plan->argtypes, + Values, Nulls); + + portal = SPI_cursor_open_internal(name, plan, paramLI, read_only); + + /* done with the transient ParamListInfo */ + if (paramLI) + pfree(paramLI); + + return portal; +} + + +/* + * SPI_cursor_open_with_args() + * + * Parse and plan a query and open it as a portal. 
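/*
 * Illustrative sketch (not part of this file): the intended use of
 * SPI_palloc() above.  Anything that must outlive SPI_finish() (here, a
 * string handed back to the caller) is allocated in the upper executor
 * context instead of the SPI procedure context.  The function name is
 * invented.
 */
#include "postgres.h"
#include "executor/spi.h"

static char *
current_database_name(void)
{
	char	   *result = NULL;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	if (SPI_execute("SELECT current_database()", true, 1) != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute failed");

	if (SPI_processed > 0)
	{
		char	   *tmp = SPI_getvalue(SPI_tuptable->vals[0],
									   SPI_tuptable->tupdesc, 1);

		if (tmp != NULL)
		{
			/* copy into the caller's (pre-SPI_connect) memory context */
			result = SPI_palloc(strlen(tmp) + 1);
			strcpy(result, tmp);
		}
	}

	SPI_finish();				/* frees tmp along with the SPI contexts */
	return result;
}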
+ */ +Portal +SPI_cursor_open_with_args(const char *name, + const char *src, + int nargs, Oid *argtypes, + Datum *Values, const char *Nulls, + bool read_only, int cursorOptions) +{ + Portal result; + _SPI_plan plan; + ParamListInfo paramLI; + + if (src == NULL || nargs < 0) + elog(ERROR, "SPI_cursor_open_with_args called with invalid arguments"); + + if (nargs > 0 && (argtypes == NULL || Values == NULL)) + elog(ERROR, "SPI_cursor_open_with_args called with missing parameters"); + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + elog(ERROR, "SPI_cursor_open_with_args called while not connected"); + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = cursorOptions; + plan.nargs = nargs; + plan.argtypes = argtypes; + plan.parserSetup = NULL; + plan.parserSetupArg = NULL; + + /* build transient ParamListInfo in executor context */ + paramLI = _SPI_convert_params(nargs, argtypes, + Values, Nulls); + + _SPI_prepare_plan(src, &plan); + + /* We needn't copy the plan; SPI_cursor_open_internal will do so */ + + result = SPI_cursor_open_internal(name, &plan, paramLI, read_only); + + /* And clean up */ + _SPI_end_call(true); + + return result; +} + + +/* + * SPI_cursor_open_with_paramlist() + * + * Same as SPI_cursor_open except that parameters (if any) are passed + * as a ParamListInfo, which supports dynamic parameter set determination + */ +Portal +SPI_cursor_open_with_paramlist(const char *name, SPIPlanPtr plan, + ParamListInfo params, bool read_only) +{ + return SPI_cursor_open_internal(name, plan, params, read_only); +} + +/* Parse a query and open it as a cursor */ +Portal +SPI_cursor_parse_open(const char *name, + const char *src, + const SPIParseOpenOptions *options) +{ + Portal result; + _SPI_plan plan; + + if (src == NULL || options == NULL) + elog(ERROR, "SPI_cursor_parse_open called with invalid arguments"); + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + elog(ERROR, "SPI_cursor_parse_open called while not connected"); + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = options->cursorOptions; + if (options->params) + { + plan.parserSetup = options->params->parserSetup; + plan.parserSetupArg = options->params->parserSetupArg; + } + + _SPI_prepare_plan(src, &plan); + + /* We needn't copy the plan; SPI_cursor_open_internal will do so */ + + result = SPI_cursor_open_internal(name, &plan, + options->params, options->read_only); + + /* And clean up */ + _SPI_end_call(true); + + return result; +} + + +/* + * SPI_cursor_open_internal() + * + * Common code for SPI_cursor_open variants + */ +static Portal +SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, + ParamListInfo paramLI, bool read_only) +{ + CachedPlanSource *plansource; + CachedPlan *cplan; + List *stmt_list; + char *query_string; + Snapshot snapshot; + MemoryContext oldcontext; + Portal portal; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + /* + * Check that the plan is something the Portal code will special-case as + * returning one tupleset. 
+ */ + if (!SPI_is_cursor_plan(plan)) + { + /* try to give a good error message */ + const char *cmdtag; + + if (list_length(plan->plancache_list) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_DEFINITION), + errmsg("cannot open multi-query plan as cursor"))); + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + /* A SELECT that fails SPI_is_cursor_plan() must be SELECT INTO */ + if (plansource->commandTag == CMDTAG_SELECT) + cmdtag = "SELECT INTO"; + else + cmdtag = GetCommandTagName(plansource->commandTag); + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_DEFINITION), + /* translator: %s is name of a SQL command, eg INSERT */ + errmsg("cannot open %s query as cursor", cmdtag))); + } + + Assert(list_length(plan->plancache_list) == 1); + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + + /* Push the SPI stack */ + if (_SPI_begin_call(true) < 0) + elog(ERROR, "SPI_cursor_open called while not connected"); + + /* Reset SPI result (note we deliberately don't touch lastoid) */ + SPI_processed = 0; + SPI_tuptable = NULL; + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + + /* Create the portal */ + if (name == NULL || name[0] == '\0') + { + /* Use a random nonconflicting name */ + portal = CreateNewPortal(); + } + else + { + /* In this path, error if portal of same name already exists */ + portal = CreatePortal(name, false, false); + } + + /* Copy the plan's query string into the portal */ + query_string = MemoryContextStrdup(portal->portalContext, + plansource->query_string); + + /* + * Setup error traceback support for ereport(), in case GetCachedPlan + * throws an error. + */ + spicallbackarg.query = plansource->query_string; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * Note: for a saved plan, we mustn't have any failure occur between + * GetCachedPlan and PortalDefineQuery; that would result in leaking our + * plancache refcount. + */ + + /* Replan if needed, and increment plan refcount for portal */ + cplan = GetCachedPlan(plansource, paramLI, NULL, _SPI_current->queryEnv); + stmt_list = cplan->stmt_list; + + if (!plan->saved) + { + /* + * We don't want the portal to depend on an unsaved CachedPlanSource, + * so must copy the plan into the portal's context. An error here + * will result in leaking our refcount on the plan, but it doesn't + * matter because the plan is unsaved and hence transient anyway. + */ + oldcontext = MemoryContextSwitchTo(portal->portalContext); + stmt_list = copyObject(stmt_list); + MemoryContextSwitchTo(oldcontext); + ReleaseCachedPlan(cplan, NULL); + cplan = NULL; /* portal shouldn't depend on cplan */ + } + + /* + * Set up the portal. + */ + PortalDefineQuery(portal, + NULL, /* no statement name */ + query_string, + plansource->commandTag, + stmt_list, + cplan); + + /* + * Set up options for portal. Default SCROLL type is chosen the same way + * as PerformCursorOpen does it. 
+ */ + portal->cursorOptions = plan->cursor_options; + if (!(portal->cursorOptions & (CURSOR_OPT_SCROLL | CURSOR_OPT_NO_SCROLL))) + { + if (list_length(stmt_list) == 1 && + linitial_node(PlannedStmt, stmt_list)->commandType != CMD_UTILITY && + linitial_node(PlannedStmt, stmt_list)->rowMarks == NIL && + ExecSupportsBackwardScan(linitial_node(PlannedStmt, stmt_list)->planTree)) + portal->cursorOptions |= CURSOR_OPT_SCROLL; + else + portal->cursorOptions |= CURSOR_OPT_NO_SCROLL; + } + + /* + * Disallow SCROLL with SELECT FOR UPDATE. This is not redundant with the + * check in transformDeclareCursorStmt because the cursor options might + * not have come through there. + */ + if (portal->cursorOptions & CURSOR_OPT_SCROLL) + { + if (list_length(stmt_list) == 1 && + linitial_node(PlannedStmt, stmt_list)->commandType != CMD_UTILITY && + linitial_node(PlannedStmt, stmt_list)->rowMarks != NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DECLARE SCROLL CURSOR ... FOR UPDATE/SHARE is not supported"), + errdetail("Scrollable cursors must be READ ONLY."))); + } + + /* Make current query environment available to portal at execution time. */ + portal->queryEnv = _SPI_current->queryEnv; + + /* + * If told to be read-only, we'd better check for read-only queries. This + * can't be done earlier because we need to look at the finished, planned + * queries. (In particular, we don't want to do it between GetCachedPlan + * and PortalDefineQuery, because throwing an error between those steps + * would result in leaking our plancache refcount.) + */ + if (read_only) + { + ListCell *lc; + + foreach(lc, stmt_list) + { + PlannedStmt *pstmt = lfirst_node(PlannedStmt, lc); + + if (!CommandIsReadOnly(pstmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in a non-volatile function", + CreateCommandName((Node *) pstmt)))); + } + } + + /* Set up the snapshot to use. */ + if (read_only) + snapshot = GetActiveSnapshot(); + else + { + CommandCounterIncrement(); + snapshot = GetTransactionSnapshot(); + } + + /* + * If the plan has parameters, copy them into the portal. Note that this + * must be done after revalidating the plan, because in dynamic parameter + * cases the set of parameters could have changed during re-parsing. + */ + if (paramLI) + { + oldcontext = MemoryContextSwitchTo(portal->portalContext); + paramLI = copyParamList(paramLI); + MemoryContextSwitchTo(oldcontext); + } + + /* + * Start portal execution. + */ + PortalStart(portal, paramLI, 0, snapshot); + + Assert(portal->strategy != PORTAL_MULTI_QUERY); + + /* Pop the error context stack */ + error_context_stack = spierrcontext.previous; + + /* Pop the SPI stack */ + _SPI_end_call(true); + + /* Return the created portal */ + return portal; +} + + +/* + * SPI_cursor_find() + * + * Find the portal of an existing open cursor + */ +Portal +SPI_cursor_find(const char *name) +{ + return GetPortalByName(name); +} + + +/* + * SPI_cursor_fetch() + * + * Fetch rows in a cursor + */ +void +SPI_cursor_fetch(Portal portal, bool forward, long count) +{ + _SPI_cursor_operation(portal, + forward ? FETCH_FORWARD : FETCH_BACKWARD, count, + CreateDestReceiver(DestSPI)); + /* we know that the DestSPI receiver doesn't need a destroy call */ +} + + +/* + * SPI_cursor_move() + * + * Move in a cursor + */ +void +SPI_cursor_move(Portal portal, bool forward, long count) +{ + _SPI_cursor_operation(portal, + forward ? 
FETCH_FORWARD : FETCH_BACKWARD, count, + None_Receiver); +} + + +/* + * SPI_scroll_cursor_fetch() + * + * Fetch rows in a scrollable cursor + */ +void +SPI_scroll_cursor_fetch(Portal portal, FetchDirection direction, long count) +{ + _SPI_cursor_operation(portal, + direction, count, + CreateDestReceiver(DestSPI)); + /* we know that the DestSPI receiver doesn't need a destroy call */ +} + + +/* + * SPI_scroll_cursor_move() + * + * Move in a scrollable cursor + */ +void +SPI_scroll_cursor_move(Portal portal, FetchDirection direction, long count) +{ + _SPI_cursor_operation(portal, direction, count, None_Receiver); +} + + +/* + * SPI_cursor_close() + * + * Close a cursor + */ +void +SPI_cursor_close(Portal portal) +{ + if (!PortalIsValid(portal)) + elog(ERROR, "invalid portal in SPI cursor operation"); + + PortalDrop(portal, false); +} + +/* + * Returns the Oid representing the type id for argument at argIndex. First + * parameter is at index zero. + */ +Oid +SPI_getargtypeid(SPIPlanPtr plan, int argIndex) +{ + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || + argIndex < 0 || argIndex >= plan->nargs) + { + SPI_result = SPI_ERROR_ARGUMENT; + return InvalidOid; + } + return plan->argtypes[argIndex]; +} + +/* + * Returns the number of arguments for the prepared plan. + */ +int +SPI_getargcount(SPIPlanPtr plan) +{ + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + { + SPI_result = SPI_ERROR_ARGUMENT; + return -1; + } + return plan->nargs; +} + +/* + * Returns true if the plan contains exactly one command + * and that command returns tuples to the caller (eg, SELECT or + * INSERT ... RETURNING, but not SELECT ... INTO). In essence, + * the result indicates if the command can be used with SPI_cursor_open + * + * Parameters + * plan: A plan previously prepared using SPI_prepare + */ +bool +SPI_is_cursor_plan(SPIPlanPtr plan) +{ + CachedPlanSource *plansource; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + { + SPI_result = SPI_ERROR_ARGUMENT; + return false; + } + + if (list_length(plan->plancache_list) != 1) + { + SPI_result = 0; + return false; /* not exactly 1 pre-rewrite command */ + } + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + + /* + * We used to force revalidation of the cached plan here, but that seems + * unnecessary: invalidation could mean a change in the rowtype of the + * tuples returned by a plan, but not whether it returns tuples at all. + */ + SPI_result = 0; + + /* Does it return tuples? */ + if (plansource->resultDesc) + return true; + + return false; +} + +/* + * SPI_plan_is_valid --- test whether a SPI plan is currently valid + * (that is, not marked as being in need of revalidation). + * + * See notes for CachedPlanIsValid before using this. + */ +bool +SPI_plan_is_valid(SPIPlanPtr plan) +{ + ListCell *lc; + + Assert(plan->magic == _SPI_PLAN_MAGIC); + + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + if (!CachedPlanIsValid(plansource)) + return false; + } + return true; +} + +/* + * SPI_result_code_string --- convert any SPI return code to a string + * + * This is often useful in error messages. Most callers will probably + * only pass negative (error-case) codes, but for generality we recognize + * the success codes too. 
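The return-code convention is easiest to see from the caller's side. A minimal sketch, in the style of a C extension, of turning an SPI failure code into an error message with SPI_result_code_string(); the helper name my_run_command is hypothetical:

#include "postgres.h"
#include "executor/spi.h"

/* Hypothetical helper: run one SQL command via SPI, erroring out on failure */
static void
my_run_command(const char *sql)
{
	int		rc;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	rc = SPI_execute(sql, false, 0);
	if (rc < 0)
		elog(ERROR, "SPI_execute failed: %s", SPI_result_code_string(rc));

	SPI_finish();
}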
+ */ +const char * +SPI_result_code_string(int code) +{ + static char buf[64]; + + switch (code) + { + case SPI_ERROR_CONNECT: + return "SPI_ERROR_CONNECT"; + case SPI_ERROR_COPY: + return "SPI_ERROR_COPY"; + case SPI_ERROR_OPUNKNOWN: + return "SPI_ERROR_OPUNKNOWN"; + case SPI_ERROR_UNCONNECTED: + return "SPI_ERROR_UNCONNECTED"; + case SPI_ERROR_ARGUMENT: + return "SPI_ERROR_ARGUMENT"; + case SPI_ERROR_PARAM: + return "SPI_ERROR_PARAM"; + case SPI_ERROR_TRANSACTION: + return "SPI_ERROR_TRANSACTION"; + case SPI_ERROR_NOATTRIBUTE: + return "SPI_ERROR_NOATTRIBUTE"; + case SPI_ERROR_NOOUTFUNC: + return "SPI_ERROR_NOOUTFUNC"; + case SPI_ERROR_TYPUNKNOWN: + return "SPI_ERROR_TYPUNKNOWN"; + case SPI_ERROR_REL_DUPLICATE: + return "SPI_ERROR_REL_DUPLICATE"; + case SPI_ERROR_REL_NOT_FOUND: + return "SPI_ERROR_REL_NOT_FOUND"; + case SPI_OK_CONNECT: + return "SPI_OK_CONNECT"; + case SPI_OK_FINISH: + return "SPI_OK_FINISH"; + case SPI_OK_FETCH: + return "SPI_OK_FETCH"; + case SPI_OK_UTILITY: + return "SPI_OK_UTILITY"; + case SPI_OK_SELECT: + return "SPI_OK_SELECT"; + case SPI_OK_SELINTO: + return "SPI_OK_SELINTO"; + case SPI_OK_INSERT: + return "SPI_OK_INSERT"; + case SPI_OK_DELETE: + return "SPI_OK_DELETE"; + case SPI_OK_UPDATE: + return "SPI_OK_UPDATE"; + case SPI_OK_CURSOR: + return "SPI_OK_CURSOR"; + case SPI_OK_INSERT_RETURNING: + return "SPI_OK_INSERT_RETURNING"; + case SPI_OK_DELETE_RETURNING: + return "SPI_OK_DELETE_RETURNING"; + case SPI_OK_UPDATE_RETURNING: + return "SPI_OK_UPDATE_RETURNING"; + case SPI_OK_REWRITTEN: + return "SPI_OK_REWRITTEN"; + case SPI_OK_REL_REGISTER: + return "SPI_OK_REL_REGISTER"; + case SPI_OK_REL_UNREGISTER: + return "SPI_OK_REL_UNREGISTER"; + } + /* Unrecognized code ... return something useful ... */ + sprintf(buf, "Unrecognized SPI code %d", code); + return buf; +} + +/* + * SPI_plan_get_plan_sources --- get a SPI plan's underlying list of + * CachedPlanSources. + * + * This is exported so that PL/pgSQL can use it (this beats letting PL/pgSQL + * look directly into the SPIPlan for itself). It's not documented in + * spi.sgml because we'd just as soon not have too many places using this. + */ +List * +SPI_plan_get_plan_sources(SPIPlanPtr plan) +{ + Assert(plan->magic == _SPI_PLAN_MAGIC); + return plan->plancache_list; +} + +/* + * SPI_plan_get_cached_plan --- get a SPI plan's generic CachedPlan, + * if the SPI plan contains exactly one CachedPlanSource. If not, + * return NULL. + * + * The plan's refcount is incremented (and logged in CurrentResourceOwner, + * if it's a saved plan). Caller is responsible for doing ReleaseCachedPlan. + * + * This is exported so that PL/pgSQL can use it (this beats letting PL/pgSQL + * look directly into the SPIPlan for itself). It's not documented in + * spi.sgml because we'd just as soon not have too many places using this. 
+ */ +CachedPlan * +SPI_plan_get_cached_plan(SPIPlanPtr plan) +{ + CachedPlanSource *plansource; + CachedPlan *cplan; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + Assert(plan->magic == _SPI_PLAN_MAGIC); + + /* Can't support one-shot plans here */ + if (plan->oneshot) + return NULL; + + /* Must have exactly one CachedPlanSource */ + if (list_length(plan->plancache_list) != 1) + return NULL; + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + + /* Setup error traceback support for ereport() */ + spicallbackarg.query = plansource->query_string; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* Get the generic plan for the query */ + cplan = GetCachedPlan(plansource, NULL, + plan->saved ? CurrentResourceOwner : NULL, + _SPI_current->queryEnv); + Assert(cplan == plansource->gplan); + + /* Pop the error context stack */ + error_context_stack = spierrcontext.previous; + + return cplan; +} + + +/* =================== private functions =================== */ + +/* + * spi_dest_startup + * Initialize to receive tuples from Executor into SPITupleTable + * of current SPI procedure + */ +void +spi_dest_startup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + SPITupleTable *tuptable; + MemoryContext oldcxt; + MemoryContext tuptabcxt; + + if (_SPI_current == NULL) + elog(ERROR, "spi_dest_startup called while not connected to SPI"); + + if (_SPI_current->tuptable != NULL) + elog(ERROR, "improper call to spi_dest_startup"); + + /* We create the tuple table context as a child of procCxt */ + + oldcxt = _SPI_procmem(); /* switch to procedure memory context */ + + tuptabcxt = AllocSetContextCreate(CurrentMemoryContext, + "SPI TupTable", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(tuptabcxt); + + _SPI_current->tuptable = tuptable = (SPITupleTable *) + palloc0(sizeof(SPITupleTable)); + tuptable->tuptabcxt = tuptabcxt; + tuptable->subid = GetCurrentSubTransactionId(); + + /* + * The tuptable is now valid enough to be freed by AtEOSubXact_SPI, so put + * it onto the SPI context's tuptables list. This will ensure it's not + * leaked even in the unlikely event the following few lines fail. 
+ */ + slist_push_head(&_SPI_current->tuptables, &tuptable->next); + + /* set up initial allocations */ + tuptable->alloced = 128; + tuptable->vals = (HeapTuple *) palloc(tuptable->alloced * sizeof(HeapTuple)); + tuptable->numvals = 0; + tuptable->tupdesc = CreateTupleDescCopy(typeinfo); + + MemoryContextSwitchTo(oldcxt); +} + +/* + * spi_printtup + * store tuple retrieved by Executor into SPITupleTable + * of current SPI procedure + */ +bool +spi_printtup(TupleTableSlot *slot, DestReceiver *self) +{ + SPITupleTable *tuptable; + MemoryContext oldcxt; + + if (_SPI_current == NULL) + elog(ERROR, "spi_printtup called while not connected to SPI"); + + tuptable = _SPI_current->tuptable; + if (tuptable == NULL) + elog(ERROR, "improper call to spi_printtup"); + + oldcxt = MemoryContextSwitchTo(tuptable->tuptabcxt); + + if (tuptable->numvals >= tuptable->alloced) + { + /* Double the size of the pointer array */ + uint64 newalloced = tuptable->alloced * 2; + + tuptable->vals = (HeapTuple *) repalloc_huge(tuptable->vals, + newalloced * sizeof(HeapTuple)); + tuptable->alloced = newalloced; + } + + tuptable->vals[tuptable->numvals] = ExecCopySlotHeapTuple(slot); + (tuptable->numvals)++; + + MemoryContextSwitchTo(oldcxt); + + return true; +} + +/* + * Static functions + */ + +/* + * Parse and analyze a querystring. + * + * At entry, plan->argtypes and plan->nargs (or alternatively plan->parserSetup + * and plan->parserSetupArg) must be valid, as must plan->parse_mode and + * plan->cursor_options. + * + * Results are stored into *plan (specifically, plan->plancache_list). + * Note that the result data is all in CurrentMemoryContext or child contexts + * thereof; in practice this means it is in the SPI executor context, and + * what we are creating is a "temporary" SPIPlan. Cruft generated during + * parsing is also left in CurrentMemoryContext. + */ +static void +_SPI_prepare_plan(const char *src, SPIPlanPtr plan) +{ + List *raw_parsetree_list; + List *plancache_list; + ListCell *list_item; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + /* + * Setup error traceback support for ereport() + */ + spicallbackarg.query = src; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * Parse the request string into a list of raw parse trees. + */ + raw_parsetree_list = raw_parser(src, plan->parse_mode); + + /* + * Do parse analysis and rule rewrite for each raw parsetree, storing the + * results into unsaved plancache entries. + */ + plancache_list = NIL; + + foreach(list_item, raw_parsetree_list) + { + RawStmt *parsetree = lfirst_node(RawStmt, list_item); + List *stmt_list; + CachedPlanSource *plansource; + + /* + * Create the CachedPlanSource before we do parse analysis, since it + * needs to see the unmodified raw parse tree. + */ + plansource = CreateCachedPlan(parsetree, + src, + CreateCommandTag(parsetree->stmt)); + + /* + * Parameter datatypes are driven by parserSetup hook if provided, + * otherwise we use the fixed parameter list. 
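As a concrete illustration of the fixed-parameter-list path, a minimal sketch (helper name hypothetical) of preparing a parameterized query through SPI_prepare(), which is the usual way plan->argtypes and plan->nargs get populated before reaching this code:

#include "postgres.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"

/* Hypothetical: prepare a query with one fixed-type parameter */
static SPIPlanPtr
my_prepare_lookup(void)
{
	Oid			argtypes[1] = {OIDOID};
	SPIPlanPtr	plan;

	plan = SPI_prepare("SELECT relname FROM pg_class WHERE oid = $1",
					   1, argtypes);
	if (plan == NULL)
		elog(ERROR, "SPI_prepare failed: %s",
			 SPI_result_code_string(SPI_result));
	return plan;
}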
+ */ + if (plan->parserSetup != NULL) + { + Assert(plan->nargs == 0); + stmt_list = pg_analyze_and_rewrite_params(parsetree, + src, + plan->parserSetup, + plan->parserSetupArg, + _SPI_current->queryEnv); + } + else + { + stmt_list = pg_analyze_and_rewrite(parsetree, + src, + plan->argtypes, + plan->nargs, + _SPI_current->queryEnv); + } + + /* Finish filling in the CachedPlanSource */ + CompleteCachedPlan(plansource, + stmt_list, + NULL, + plan->argtypes, + plan->nargs, + plan->parserSetup, + plan->parserSetupArg, + plan->cursor_options, + false); /* not fixed result */ + + plancache_list = lappend(plancache_list, plansource); + } + + plan->plancache_list = plancache_list; + plan->oneshot = false; + + /* + * Pop the error context stack + */ + error_context_stack = spierrcontext.previous; +} + +/* + * Parse, but don't analyze, a querystring. + * + * This is a stripped-down version of _SPI_prepare_plan that only does the + * initial raw parsing. It creates "one shot" CachedPlanSources + * that still require parse analysis before execution is possible. + * + * The advantage of using the "one shot" form of CachedPlanSource is that + * we eliminate data copying and invalidation overhead. Postponing parse + * analysis also prevents issues if some of the raw parsetrees are DDL + * commands that affect validity of later parsetrees. Both of these + * attributes are good things for SPI_execute() and similar cases. + * + * Results are stored into *plan (specifically, plan->plancache_list). + * Note that the result data is all in CurrentMemoryContext or child contexts + * thereof; in practice this means it is in the SPI executor context, and + * what we are creating is a "temporary" SPIPlan. Cruft generated during + * parsing is also left in CurrentMemoryContext. + */ +static void +_SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan) +{ + List *raw_parsetree_list; + List *plancache_list; + ListCell *list_item; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + /* + * Setup error traceback support for ereport() + */ + spicallbackarg.query = src; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * Parse the request string into a list of raw parse trees. + */ + raw_parsetree_list = raw_parser(src, plan->parse_mode); + + /* + * Construct plancache entries, but don't do parse analysis yet. 
+ */ + plancache_list = NIL; + + foreach(list_item, raw_parsetree_list) + { + RawStmt *parsetree = lfirst_node(RawStmt, list_item); + CachedPlanSource *plansource; + + plansource = CreateOneShotCachedPlan(parsetree, + src, + CreateCommandTag(parsetree->stmt)); + + plancache_list = lappend(plancache_list, plansource); + } + + plan->plancache_list = plancache_list; + plan->oneshot = true; + + /* + * Pop the error context stack + */ + error_context_stack = spierrcontext.previous; +} + +/* + * _SPI_execute_plan: execute the given plan with the given options + * + * options contains options accessible from outside SPI: + * params: parameter values to pass to query + * read_only: true for read-only execution (no CommandCounterIncrement) + * allow_nonatomic: true to allow nonatomic CALL/DO execution + * must_return_tuples: throw error if query doesn't return tuples + * tcount: execution tuple-count limit, or 0 for none + * dest: DestReceiver to receive output, or NULL for normal SPI output + * owner: ResourceOwner that will be used to hold refcount on plan; + * if NULL, CurrentResourceOwner is used (ignored for non-saved plan) + * + * Additional, only-internally-accessible options: + * snapshot: query snapshot to use, or InvalidSnapshot for the normal + * behavior of taking a new snapshot for each query. + * crosscheck_snapshot: for RI use, all others pass InvalidSnapshot + * fire_triggers: true to fire AFTER triggers at end of query (normal case); + * false means any AFTER triggers are postponed to end of outer query + */ +static int +_SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, + Snapshot snapshot, Snapshot crosscheck_snapshot, + bool fire_triggers) +{ + int my_res = 0; + uint64 my_processed = 0; + SPITupleTable *my_tuptable = NULL; + int res = 0; + bool pushed_active_snap = false; + ResourceOwner plan_owner = options->owner; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + CachedPlan *cplan = NULL; + ListCell *lc1; + + /* + * Setup error traceback support for ereport() + */ + spicallbackarg.query = NULL; /* we'll fill this below */ + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * We support four distinct snapshot management behaviors: + * + * snapshot != InvalidSnapshot, read_only = true: use exactly the given + * snapshot. + * + * snapshot != InvalidSnapshot, read_only = false: use the given snapshot, + * modified by advancing its command ID before each querytree. + * + * snapshot == InvalidSnapshot, read_only = true: use the entry-time + * ActiveSnapshot, if any (if there isn't one, we run with no snapshot). + * + * snapshot == InvalidSnapshot, read_only = false: take a full new + * snapshot for each user command, and advance its command ID before each + * querytree within the command. + * + * In the first two cases, we can just push the snap onto the stack once + * for the whole plan list. + * + * Note that snapshot != InvalidSnapshot implies an atomic execution + * context. 
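The read_only flag that selects among these behaviors normally comes from the calling function's volatility; a minimal sketch of a hypothetical PL handler fragment, modeled loosely on what PL/pgSQL does:

#include "postgres.h"
#include "catalog/pg_proc.h"
#include "executor/spi.h"

/* Hypothetical: non-volatile functions run their SPI queries read-only */
static int
my_execute_plan(SPIPlanPtr plan, Datum *values, const char *nulls,
				Form_pg_proc procStruct)
{
	bool		read_only = (procStruct->provolatile != PROVOLATILE_VOLATILE);

	/* read_only = true selects the "entry-time ActiveSnapshot" case above */
	return SPI_execute_plan(plan, values, nulls, read_only, 0);
}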
+ */ + if (snapshot != InvalidSnapshot) + { + Assert(!options->allow_nonatomic); + if (options->read_only) + { + PushActiveSnapshot(snapshot); + pushed_active_snap = true; + } + else + { + /* Make sure we have a private copy of the snapshot to modify */ + PushCopiedSnapshot(snapshot); + pushed_active_snap = true; + } + } + + /* + * Ensure that we have a resource owner if plan is saved, and not if it + * isn't. + */ + if (!plan->saved) + plan_owner = NULL; + else if (plan_owner == NULL) + plan_owner = CurrentResourceOwner; + + /* + * We interpret must_return_tuples as "there must be at least one query, + * and all of them must return tuples". This is a bit laxer than + * SPI_is_cursor_plan's check, but there seems no reason to enforce that + * there be only one query. + */ + if (options->must_return_tuples && plan->plancache_list == NIL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("empty query does not return tuples"))); + + foreach(lc1, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc1); + List *stmt_list; + ListCell *lc2; + + spicallbackarg.query = plansource->query_string; + + /* + * If this is a one-shot plan, we still need to do parse analysis. + */ + if (plan->oneshot) + { + RawStmt *parsetree = plansource->raw_parse_tree; + const char *src = plansource->query_string; + List *stmt_list; + + /* + * Parameter datatypes are driven by parserSetup hook if provided, + * otherwise we use the fixed parameter list. + */ + if (parsetree == NULL) + stmt_list = NIL; + else if (plan->parserSetup != NULL) + { + Assert(plan->nargs == 0); + stmt_list = pg_analyze_and_rewrite_params(parsetree, + src, + plan->parserSetup, + plan->parserSetupArg, + _SPI_current->queryEnv); + } + else + { + stmt_list = pg_analyze_and_rewrite(parsetree, + src, + plan->argtypes, + plan->nargs, + _SPI_current->queryEnv); + } + + /* Finish filling in the CachedPlanSource */ + CompleteCachedPlan(plansource, + stmt_list, + NULL, + plan->argtypes, + plan->nargs, + plan->parserSetup, + plan->parserSetupArg, + plan->cursor_options, + false); /* not fixed result */ + } + + /* + * If asked to, complain when query does not return tuples. + * (Replanning can't change this, so we can check it before that. + * However, we can't check it till after parse analysis, so in the + * case of a one-shot plan this is the earliest we could check.) + */ + if (options->must_return_tuples && !plansource->resultDesc) + { + /* try to give a good error message */ + const char *cmdtag; + + /* A SELECT without resultDesc must be SELECT INTO */ + if (plansource->commandTag == CMDTAG_SELECT) + cmdtag = "SELECT INTO"; + else + cmdtag = GetCommandTagName(plansource->commandTag); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + /* translator: %s is name of a SQL command, eg INSERT */ + errmsg("%s query does not return tuples", cmdtag))); + } + + /* + * Replan if needed, and increment plan refcount. If it's a saved + * plan, the refcount must be backed by the plan_owner. + */ + cplan = GetCachedPlan(plansource, options->params, + plan_owner, _SPI_current->queryEnv); + + stmt_list = cplan->stmt_list; + + /* + * If we weren't given a specific snapshot to use, and the statement + * list requires a snapshot, set that up. + */ + if (snapshot == InvalidSnapshot && + (list_length(stmt_list) > 1 || + (list_length(stmt_list) == 1 && + PlannedStmtRequiresSnapshot(linitial_node(PlannedStmt, + stmt_list))))) + { + /* + * First, ensure there's a Portal-level snapshot. 
This back-fills + * the snapshot stack in case the previous operation was a COMMIT + * or ROLLBACK inside a procedure or DO block. (We can't put back + * the Portal snapshot any sooner, or we'd break cases like doing + * SET or LOCK just after COMMIT.) It's enough to check once per + * statement list, since COMMIT/ROLLBACK/CALL/DO can't appear + * within a multi-statement list. + */ + EnsurePortalSnapshotExists(); + + /* + * In the default non-read-only case, get a new per-statement-list + * snapshot, replacing any that we pushed in a previous cycle. + * Skip it when doing non-atomic execution, though (we rely + * entirely on the Portal snapshot in that case). + */ + if (!options->read_only && !options->allow_nonatomic) + { + if (pushed_active_snap) + PopActiveSnapshot(); + PushActiveSnapshot(GetTransactionSnapshot()); + pushed_active_snap = true; + } + } + + foreach(lc2, stmt_list) + { + PlannedStmt *stmt = lfirst_node(PlannedStmt, lc2); + bool canSetTag = stmt->canSetTag; + DestReceiver *dest; + + /* + * Reset output state. (Note that if a non-SPI receiver is used, + * _SPI_current->processed will stay zero, and that's what we'll + * report to the caller. It's the receiver's job to count tuples + * in that case.) + */ + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + + /* Check for unsupported cases. */ + if (stmt->utilityStmt) + { + if (IsA(stmt->utilityStmt, CopyStmt)) + { + CopyStmt *cstmt = (CopyStmt *) stmt->utilityStmt; + + if (cstmt->filename == NULL) + { + my_res = SPI_ERROR_COPY; + goto fail; + } + } + else if (IsA(stmt->utilityStmt, TransactionStmt)) + { + my_res = SPI_ERROR_TRANSACTION; + goto fail; + } + } + + if (options->read_only && !CommandIsReadOnly(stmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in a non-volatile function", + CreateCommandName((Node *) stmt)))); + + /* + * If not read-only mode, advance the command counter before each + * command and update the snapshot. (But skip it if the snapshot + * isn't under our control.) + */ + if (!options->read_only && pushed_active_snap) + { + CommandCounterIncrement(); + UpdateActiveSnapshotCommandId(); + } + + /* + * Select appropriate tuple receiver. Output from non-canSetTag + * subqueries always goes to the bit bucket. + */ + if (!canSetTag) + dest = CreateDestReceiver(DestNone); + else if (options->dest) + dest = options->dest; + else + dest = CreateDestReceiver(DestSPI); + + if (stmt->utilityStmt == NULL) + { + QueryDesc *qdesc; + Snapshot snap; + + if (ActiveSnapshotSet()) + snap = GetActiveSnapshot(); + else + snap = InvalidSnapshot; + + qdesc = CreateQueryDesc(stmt, + plansource->query_string, + snap, crosscheck_snapshot, + dest, + options->params, + _SPI_current->queryEnv, + 0); + res = _SPI_pquery(qdesc, fire_triggers, + canSetTag ? options->tcount : 0); + FreeQueryDesc(qdesc); + } + else + { + ProcessUtilityContext context; + QueryCompletion qc; + + /* + * If the SPI context is atomic, or we were not told to allow + * nonatomic operations, tell ProcessUtility this is an atomic + * execution context. 
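Whether _SPI_current->atomic is set is decided at connection time, not here. A minimal sketch (hypothetical handler fragment) of how a procedural language opts in to nonatomic execution so that CALL and transaction control statements can work; the nonatomic flag is typically derived from the CallContext node the caller passes in fcinfo->context:

#include "postgres.h"
#include "executor/spi.h"

/*
 * Hypothetical PL call-handler fragment: connect in nonatomic mode when the
 * calling context allows transaction control.
 */
static void
my_connect_spi(bool nonatomic)
{
	if (SPI_connect_ext(nonatomic ? SPI_OPT_NONATOMIC : 0) != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect_ext failed");
}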
+ */ + if (_SPI_current->atomic || !options->allow_nonatomic) + context = PROCESS_UTILITY_QUERY; + else + context = PROCESS_UTILITY_QUERY_NONATOMIC; + + InitializeQueryCompletion(&qc); + ProcessUtility(stmt, + plansource->query_string, + true, /* protect plancache's node tree */ + context, + options->params, + _SPI_current->queryEnv, + dest, + &qc); + + /* Update "processed" if stmt returned tuples */ + if (_SPI_current->tuptable) + _SPI_current->processed = _SPI_current->tuptable->numvals; + + res = SPI_OK_UTILITY; + + /* + * Some utility statements return a row count, even though the + * tuples are not returned to the caller. + */ + if (IsA(stmt->utilityStmt, CreateTableAsStmt)) + { + CreateTableAsStmt *ctastmt = (CreateTableAsStmt *) stmt->utilityStmt; + + if (qc.commandTag == CMDTAG_SELECT) + _SPI_current->processed = qc.nprocessed; + else + { + /* + * Must be an IF NOT EXISTS that did nothing, or a + * CREATE ... WITH NO DATA. + */ + Assert(ctastmt->if_not_exists || + ctastmt->into->skipData); + _SPI_current->processed = 0; + } + + /* + * For historical reasons, if CREATE TABLE AS was spelled + * as SELECT INTO, return a special return code. + */ + if (ctastmt->is_select_into) + res = SPI_OK_SELINTO; + } + else if (IsA(stmt->utilityStmt, CopyStmt)) + { + Assert(qc.commandTag == CMDTAG_COPY); + _SPI_current->processed = qc.nprocessed; + } + } + + /* + * The last canSetTag query sets the status values returned to the + * caller. Be careful to free any tuptables not returned, to + * avoid intra-transaction memory leak. + */ + if (canSetTag) + { + my_processed = _SPI_current->processed; + SPI_freetuptable(my_tuptable); + my_tuptable = _SPI_current->tuptable; + my_res = res; + } + else + { + SPI_freetuptable(_SPI_current->tuptable); + _SPI_current->tuptable = NULL; + } + + /* + * We don't issue a destroy call to the receiver. The SPI and + * None receivers would ignore it anyway, while if the caller + * supplied a receiver, it's not our job to destroy it. + */ + + if (res < 0) + { + my_res = res; + goto fail; + } + } + + /* Done with this plan, so release refcount */ + ReleaseCachedPlan(cplan, plan_owner); + cplan = NULL; + + /* + * If not read-only mode, advance the command counter after the last + * command. This ensures that its effects are visible, in case it was + * DDL that would affect the next CachedPlanSource. + */ + if (!options->read_only) + CommandCounterIncrement(); + } + +fail: + + /* Pop the snapshot off the stack if we pushed one */ + if (pushed_active_snap) + PopActiveSnapshot(); + + /* We no longer need the cached plan refcount, if any */ + if (cplan) + ReleaseCachedPlan(cplan, plan_owner); + + /* + * Pop the error context stack + */ + error_context_stack = spierrcontext.previous; + + /* Save results for caller */ + SPI_processed = my_processed; + SPI_tuptable = my_tuptable; + + /* tuptable now is caller's responsibility, not SPI's */ + _SPI_current->tuptable = NULL; + + /* + * If none of the queries had canSetTag, return SPI_OK_REWRITTEN. Prior to + * 8.4, we used return the last query's result code, but not its auxiliary + * results, but that's confusing. 
+ */ + if (my_res == 0) + my_res = SPI_OK_REWRITTEN; + + return my_res; +} + +/* + * Convert arrays of query parameters to form wanted by planner and executor + */ +static ParamListInfo +_SPI_convert_params(int nargs, Oid *argtypes, + Datum *Values, const char *Nulls) +{ + ParamListInfo paramLI; + + if (nargs > 0) + { + paramLI = makeParamList(nargs); + + for (int i = 0; i < nargs; i++) + { + ParamExternData *prm = ¶mLI->params[i]; + + prm->value = Values[i]; + prm->isnull = (Nulls && Nulls[i] == 'n'); + prm->pflags = PARAM_FLAG_CONST; + prm->ptype = argtypes[i]; + } + } + else + paramLI = NULL; + return paramLI; +} + +static int +_SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount) +{ + int operation = queryDesc->operation; + int eflags; + int res; + + switch (operation) + { + case CMD_SELECT: + if (queryDesc->dest->mydest == DestNone) + { + /* Don't return SPI_OK_SELECT if we're discarding result */ + res = SPI_OK_UTILITY; + } + else + res = SPI_OK_SELECT; + break; + case CMD_INSERT: + if (queryDesc->plannedstmt->hasReturning) + res = SPI_OK_INSERT_RETURNING; + else + res = SPI_OK_INSERT; + break; + case CMD_DELETE: + if (queryDesc->plannedstmt->hasReturning) + res = SPI_OK_DELETE_RETURNING; + else + res = SPI_OK_DELETE; + break; + case CMD_UPDATE: + if (queryDesc->plannedstmt->hasReturning) + res = SPI_OK_UPDATE_RETURNING; + else + res = SPI_OK_UPDATE; + break; + default: + return SPI_ERROR_OPUNKNOWN; + } + +#ifdef SPI_EXECUTOR_STATS + if (ShowExecutorStats) + ResetUsage(); +#endif + + /* Select execution options */ + if (fire_triggers) + eflags = 0; /* default run-to-completion flags */ + else + eflags = EXEC_FLAG_SKIP_TRIGGERS; + + ExecutorStart(queryDesc, eflags); + + ExecutorRun(queryDesc, ForwardScanDirection, tcount, true); + + _SPI_current->processed = queryDesc->estate->es_processed; + + if ((res == SPI_OK_SELECT || queryDesc->plannedstmt->hasReturning) && + queryDesc->dest->mydest == DestSPI) + { + if (_SPI_checktuples()) + elog(ERROR, "consistency check on SPI tuple count failed"); + } + + ExecutorFinish(queryDesc); + ExecutorEnd(queryDesc); + /* FreeQueryDesc is done by the caller */ + +#ifdef SPI_EXECUTOR_STATS + if (ShowExecutorStats) + ShowUsage("SPI EXECUTOR STATS"); +#endif + + return res; +} + +/* + * _SPI_error_callback + * + * Add context information when a query invoked via SPI fails + */ +static void +_SPI_error_callback(void *arg) +{ + SPICallbackArg *carg = (SPICallbackArg *) arg; + const char *query = carg->query; + int syntaxerrposition; + + if (query == NULL) /* in case arg wasn't set yet */ + return; + + /* + * If there is a syntax error position, convert to internal syntax error; + * otherwise treat the query as an item of context stack + */ + syntaxerrposition = geterrposition(); + if (syntaxerrposition > 0) + { + errposition(0); + internalerrposition(syntaxerrposition); + internalerrquery(query); + } + else + { + /* Use the parse mode to decide how to describe the query */ + switch (carg->mode) + { + case RAW_PARSE_PLPGSQL_EXPR: + errcontext("SQL expression \"%s\"", query); + break; + case RAW_PARSE_PLPGSQL_ASSIGN1: + case RAW_PARSE_PLPGSQL_ASSIGN2: + case RAW_PARSE_PLPGSQL_ASSIGN3: + errcontext("PL/pgSQL assignment \"%s\"", query); + break; + default: + errcontext("SQL statement \"%s\"", query); + break; + } + } +} + +/* + * _SPI_cursor_operation() + * + * Do a FETCH or MOVE in a cursor + */ +static void +_SPI_cursor_operation(Portal portal, FetchDirection direction, long count, + DestReceiver *dest) +{ + uint64 nfetched; + + /* Check that 
the portal is valid */ + if (!PortalIsValid(portal)) + elog(ERROR, "invalid portal in SPI cursor operation"); + + /* Push the SPI stack */ + if (_SPI_begin_call(true) < 0) + elog(ERROR, "SPI cursor operation called while not connected"); + + /* Reset the SPI result (note we deliberately don't touch lastoid) */ + SPI_processed = 0; + SPI_tuptable = NULL; + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + + /* Run the cursor */ + nfetched = PortalRunFetch(portal, + direction, + count, + dest); + + /* + * Think not to combine this store with the preceding function call. If + * the portal contains calls to functions that use SPI, then _SPI_stack is + * likely to move around while the portal runs. When control returns, + * _SPI_current will point to the correct stack entry... but the pointer + * may be different than it was beforehand. So we must be sure to re-fetch + * the pointer after the function call completes. + */ + _SPI_current->processed = nfetched; + + if (dest->mydest == DestSPI && _SPI_checktuples()) + elog(ERROR, "consistency check on SPI tuple count failed"); + + /* Put the result into place for access by caller */ + SPI_processed = _SPI_current->processed; + SPI_tuptable = _SPI_current->tuptable; + + /* tuptable now is caller's responsibility, not SPI's */ + _SPI_current->tuptable = NULL; + + /* Pop the SPI stack */ + _SPI_end_call(true); +} + + +static MemoryContext +_SPI_execmem(void) +{ + return MemoryContextSwitchTo(_SPI_current->execCxt); +} + +static MemoryContext +_SPI_procmem(void) +{ + return MemoryContextSwitchTo(_SPI_current->procCxt); +} + +/* + * _SPI_begin_call: begin a SPI operation within a connected procedure + * + * use_exec is true if we intend to make use of the procedure's execCxt + * during this SPI operation. We'll switch into that context, and arrange + * for it to be cleaned up at _SPI_end_call or if an error occurs. + */ +static int +_SPI_begin_call(bool use_exec) +{ + if (_SPI_current == NULL) + return SPI_ERROR_UNCONNECTED; + + if (use_exec) + { + /* remember when the Executor operation started */ + _SPI_current->execSubid = GetCurrentSubTransactionId(); + /* switch to the Executor memory context */ + _SPI_execmem(); + } + + return 0; +} + +/* + * _SPI_end_call: end a SPI operation within a connected procedure + * + * use_exec must be the same as in the previous _SPI_begin_call + * + * Note: this currently has no failure return cases, so callers don't check + */ +static int +_SPI_end_call(bool use_exec) +{ + if (use_exec) + { + /* switch to the procedure memory context */ + _SPI_procmem(); + /* mark Executor context no longer in use */ + _SPI_current->execSubid = InvalidSubTransactionId; + /* and free Executor memory */ + MemoryContextResetAndDeleteChildren(_SPI_current->execCxt); + } + + return 0; +} + +static bool +_SPI_checktuples(void) +{ + uint64 processed = _SPI_current->processed; + SPITupleTable *tuptable = _SPI_current->tuptable; + bool failed = false; + + if (tuptable == NULL) /* spi_dest_startup was not called */ + failed = true; + else if (processed != tuptable->numvals) + failed = true; + + return failed; +} + +/* + * Convert a "temporary" SPIPlan into an "unsaved" plan. + * + * The passed _SPI_plan struct is on the stack, and all its subsidiary data + * is in or under the current SPI executor context. Copy the plan into the + * SPI procedure context so it will survive _SPI_end_call(). 
To minimize + * data copying, this destructively modifies the input plan, by taking the + * plancache entries away from it and reparenting them to the new SPIPlan. + */ +static SPIPlanPtr +_SPI_make_plan_non_temp(SPIPlanPtr plan) +{ + SPIPlanPtr newplan; + MemoryContext parentcxt = _SPI_current->procCxt; + MemoryContext plancxt; + MemoryContext oldcxt; + ListCell *lc; + + /* Assert the input is a temporary SPIPlan */ + Assert(plan->magic == _SPI_PLAN_MAGIC); + Assert(plan->plancxt == NULL); + /* One-shot plans can't be saved */ + Assert(!plan->oneshot); + + /* + * Create a memory context for the plan, underneath the procedure context. + * We don't expect the plan to be very large. + */ + plancxt = AllocSetContextCreate(parentcxt, + "SPI Plan", + ALLOCSET_SMALL_SIZES); + oldcxt = MemoryContextSwitchTo(plancxt); + + /* Copy the _SPI_plan struct and subsidiary data into the new context */ + newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan)); + newplan->magic = _SPI_PLAN_MAGIC; + newplan->plancxt = plancxt; + newplan->parse_mode = plan->parse_mode; + newplan->cursor_options = plan->cursor_options; + newplan->nargs = plan->nargs; + if (plan->nargs > 0) + { + newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid)); + memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid)); + } + else + newplan->argtypes = NULL; + newplan->parserSetup = plan->parserSetup; + newplan->parserSetupArg = plan->parserSetupArg; + + /* + * Reparent all the CachedPlanSources into the procedure context. In + * theory this could fail partway through due to the pallocs, but we don't + * care too much since both the procedure context and the executor context + * would go away on error. + */ + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + CachedPlanSetParentContext(plansource, parentcxt); + + /* Build new list, with list cells in plancxt */ + newplan->plancache_list = lappend(newplan->plancache_list, plansource); + } + + MemoryContextSwitchTo(oldcxt); + + /* For safety, unlink the CachedPlanSources from the temporary plan */ + plan->plancache_list = NIL; + + return newplan; +} + +/* + * Make a "saved" copy of the given plan. + */ +static SPIPlanPtr +_SPI_save_plan(SPIPlanPtr plan) +{ + SPIPlanPtr newplan; + MemoryContext plancxt; + MemoryContext oldcxt; + ListCell *lc; + + /* One-shot plans can't be saved */ + Assert(!plan->oneshot); + + /* + * Create a memory context for the plan. We don't expect the plan to be + * very large, so use smaller-than-default alloc parameters. It's a + * transient context until we finish copying everything. 
+ */ + plancxt = AllocSetContextCreate(CurrentMemoryContext, + "SPI Plan", + ALLOCSET_SMALL_SIZES); + oldcxt = MemoryContextSwitchTo(plancxt); + + /* Copy the SPI plan into its own context */ + newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan)); + newplan->magic = _SPI_PLAN_MAGIC; + newplan->plancxt = plancxt; + newplan->parse_mode = plan->parse_mode; + newplan->cursor_options = plan->cursor_options; + newplan->nargs = plan->nargs; + if (plan->nargs > 0) + { + newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid)); + memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid)); + } + else + newplan->argtypes = NULL; + newplan->parserSetup = plan->parserSetup; + newplan->parserSetupArg = plan->parserSetupArg; + + /* Copy all the plancache entries */ + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + CachedPlanSource *newsource; + + newsource = CopyCachedPlan(plansource); + newplan->plancache_list = lappend(newplan->plancache_list, newsource); + } + + MemoryContextSwitchTo(oldcxt); + + /* + * Mark it saved, reparent it under CacheMemoryContext, and mark all the + * component CachedPlanSources as saved. This sequence cannot fail + * partway through, so there's no risk of long-term memory leakage. + */ + newplan->saved = true; + MemoryContextSetParent(newplan->plancxt, CacheMemoryContext); + + foreach(lc, newplan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + SaveCachedPlan(plansource); + } + + return newplan; +} + +/* + * Internal lookup of ephemeral named relation by name. + */ +static EphemeralNamedRelation +_SPI_find_ENR_by_name(const char *name) +{ + /* internal static function; any error is bug in SPI itself */ + Assert(name != NULL); + + /* fast exit if no tuplestores have been added */ + if (_SPI_current->queryEnv == NULL) + return NULL; + + return get_ENR(_SPI_current->queryEnv, name); +} + +/* + * Register an ephemeral named relation for use by the planner and executor on + * subsequent calls using this SPI connection. + */ +int +SPI_register_relation(EphemeralNamedRelation enr) +{ + EphemeralNamedRelation match; + int res; + + if (enr == NULL || enr->md.name == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(false); /* keep current memory context */ + if (res < 0) + return res; + + match = _SPI_find_ENR_by_name(enr->md.name); + if (match) + res = SPI_ERROR_REL_DUPLICATE; + else + { + if (_SPI_current->queryEnv == NULL) + _SPI_current->queryEnv = create_queryEnv(); + + register_ENR(_SPI_current->queryEnv, enr); + res = SPI_OK_REL_REGISTER; + } + + _SPI_end_call(false); + + return res; +} + +/* + * Unregister an ephemeral named relation by name. This will probably be a + * rarely used function, since SPI_finish will clear it automatically. + */ +int +SPI_unregister_relation(const char *name) +{ + EphemeralNamedRelation match; + int res; + + if (name == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(false); /* keep current memory context */ + if (res < 0) + return res; + + match = _SPI_find_ENR_by_name(name); + if (match) + { + unregister_ENR(_SPI_current->queryEnv, match->md.name); + res = SPI_OK_REL_UNREGISTER; + } + else + res = SPI_ERROR_REL_NOT_FOUND; + + _SPI_end_call(false); + + return res; +} + +/* + * Register the transient relations from 'tdata' using this SPI connection. 
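A usage sketch for the function defined just below (the handler fragment and its name are hypothetical): a PL's trigger handler would call SPI_register_trigger_data() right after connecting, so that the queries it runs can see the transition tables named in the trigger definition:

#include "postgres.h"
#include "commands/trigger.h"
#include "executor/spi.h"

/* Hypothetical PL trigger-handler fragment */
static void
my_trigger_begin(TriggerData *trigdata)
{
	int			rc;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	rc = SPI_register_trigger_data(trigdata);
	if (rc != SPI_OK_TD_REGISTER)
		elog(ERROR, "SPI_register_trigger_data failed: %s",
			 SPI_result_code_string(rc));
}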
+ * This should be called by PL implementations' trigger handlers after + * connecting, in order to make transition tables visible to any queries run + * in this connection. + */ +int +SPI_register_trigger_data(TriggerData *tdata) +{ + if (tdata == NULL) + return SPI_ERROR_ARGUMENT; + + if (tdata->tg_newtable) + { + EphemeralNamedRelation enr = + palloc(sizeof(EphemeralNamedRelationData)); + int rc; + + enr->md.name = tdata->tg_trigger->tgnewtable; + enr->md.reliddesc = tdata->tg_relation->rd_id; + enr->md.tupdesc = NULL; + enr->md.enrtype = ENR_NAMED_TUPLESTORE; + enr->md.enrtuples = tuplestore_tuple_count(tdata->tg_newtable); + enr->reldata = tdata->tg_newtable; + rc = SPI_register_relation(enr); + if (rc != SPI_OK_REL_REGISTER) + return rc; + } + + if (tdata->tg_oldtable) + { + EphemeralNamedRelation enr = + palloc(sizeof(EphemeralNamedRelationData)); + int rc; + + enr->md.name = tdata->tg_trigger->tgoldtable; + enr->md.reliddesc = tdata->tg_relation->rd_id; + enr->md.tupdesc = NULL; + enr->md.enrtype = ENR_NAMED_TUPLESTORE; + enr->md.enrtuples = tuplestore_tuple_count(tdata->tg_oldtable); + enr->reldata = tdata->tg_oldtable; + rc = SPI_register_relation(enr); + if (rc != SPI_OK_REL_REGISTER) + return rc; + } + + return SPI_OK_TD_REGISTER; +} diff --git a/src/backend/executor/tqueue.c b/src/backend/executor/tqueue.c new file mode 100644 index 0000000..7af9fbe --- /dev/null +++ b/src/backend/executor/tqueue.c @@ -0,0 +1,210 @@ +/*------------------------------------------------------------------------- + * + * tqueue.c + * Use shm_mq to send & receive tuples between parallel backends + * + * A DestReceiver of type DestTupleQueue, which is a TQueueDestReceiver + * under the hood, writes tuples from the executor to a shm_mq. + * + * A TupleQueueReader reads tuples from a shm_mq and returns the tuples. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/tqueue.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/tqueue.h" + +/* + * DestReceiver object's private contents + * + * queue is a pointer to data supplied by DestReceiver's caller. + */ +typedef struct TQueueDestReceiver +{ + DestReceiver pub; /* public fields */ + shm_mq_handle *queue; /* shm_mq to send to */ +} TQueueDestReceiver; + +/* + * TupleQueueReader object's private contents + * + * queue is a pointer to data supplied by reader's caller. + * + * "typedef struct TupleQueueReader TupleQueueReader" is in tqueue.h + */ +struct TupleQueueReader +{ + shm_mq_handle *queue; /* shm_mq to receive from */ +}; + +/* + * Receive a tuple from a query, and send it to the designated shm_mq. + * + * Returns true if successful, false if shm_mq has been detached. + */ +static bool +tqueueReceiveSlot(TupleTableSlot *slot, DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self; + MinimalTuple tuple; + shm_mq_result result; + bool should_free; + + /* Send the tuple itself. */ + tuple = ExecFetchSlotMinimalTuple(slot, &should_free); + result = shm_mq_send(tqueue->queue, tuple->t_len, tuple, false); + + if (should_free) + pfree(tuple); + + /* Check for failure. 
*/ + if (result == SHM_MQ_DETACHED) + return false; + else if (result != SHM_MQ_SUCCESS) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not send tuple to shared-memory queue"))); + + return true; +} + +/* + * Prepare to receive tuples from executor. + */ +static void +tqueueStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + /* do nothing */ +} + +/* + * Clean up at end of an executor run + */ +static void +tqueueShutdownReceiver(DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self; + + if (tqueue->queue != NULL) + shm_mq_detach(tqueue->queue); + tqueue->queue = NULL; +} + +/* + * Destroy receiver when done with it + */ +static void +tqueueDestroyReceiver(DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self; + + /* We probably already detached from queue, but let's be sure */ + if (tqueue->queue != NULL) + shm_mq_detach(tqueue->queue); + pfree(self); +} + +/* + * Create a DestReceiver that writes tuples to a tuple queue. + */ +DestReceiver * +CreateTupleQueueDestReceiver(shm_mq_handle *handle) +{ + TQueueDestReceiver *self; + + self = (TQueueDestReceiver *) palloc0(sizeof(TQueueDestReceiver)); + + self->pub.receiveSlot = tqueueReceiveSlot; + self->pub.rStartup = tqueueStartupReceiver; + self->pub.rShutdown = tqueueShutdownReceiver; + self->pub.rDestroy = tqueueDestroyReceiver; + self->pub.mydest = DestTupleQueue; + self->queue = handle; + + return (DestReceiver *) self; +} + +/* + * Create a tuple queue reader. + */ +TupleQueueReader * +CreateTupleQueueReader(shm_mq_handle *handle) +{ + TupleQueueReader *reader = palloc0(sizeof(TupleQueueReader)); + + reader->queue = handle; + + return reader; +} + +/* + * Destroy a tuple queue reader. + * + * Note: cleaning up the underlying shm_mq is the caller's responsibility. + * We won't access it here, as it may be detached already. + */ +void +DestroyTupleQueueReader(TupleQueueReader *reader) +{ + pfree(reader); +} + +/* + * Fetch a tuple from a tuple queue reader. + * + * The return value is NULL if there are no remaining tuples or if + * nowait = true and no tuple is ready to return. *done, if not NULL, + * is set to true when there are no remaining tuples and otherwise to false. + * + * The returned tuple, if any, is either in shared memory or a private buffer + * and should not be freed. The pointer is invalid after the next call to + * TupleQueueReaderNext(). + * + * Even when shm_mq_receive() returns SHM_MQ_WOULD_BLOCK, this can still + * accumulate bytes from a partially-read message, so it's useful to call + * this with nowait = true even if nothing is returned. + */ +MinimalTuple +TupleQueueReaderNext(TupleQueueReader *reader, bool nowait, bool *done) +{ + MinimalTuple tuple; + shm_mq_result result; + Size nbytes; + void *data; + + if (done != NULL) + *done = false; + + /* Attempt to read a message. */ + result = shm_mq_receive(reader->queue, &nbytes, &data, nowait); + + /* If queue is detached, set *done and return NULL. */ + if (result == SHM_MQ_DETACHED) + { + if (done != NULL) + *done = true; + return NULL; + } + + /* In non-blocking mode, bail out if no message ready yet. */ + if (result == SHM_MQ_WOULD_BLOCK) + return NULL; + Assert(result == SHM_MQ_SUCCESS); + + /* + * Return a pointer to the queue memory directly (which had better be + * sufficiently aligned). 
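A minimal consumer-side sketch of the contract described in this function's header comment (loosely modeled on how Gather drains its workers; latch handling is elided and the helper name is hypothetical):

#include "postgres.h"
#include "access/htup_details.h"
#include "executor/tqueue.h"

/* Hypothetical: drain one tuple queue until the writer detaches */
static void
my_drain_queue(TupleQueueReader *reader)
{
	for (;;)
	{
		bool		done;
		MinimalTuple tup = TupleQueueReaderNext(reader, true, &done);

		if (done)
			break;				/* writer detached: no more tuples */
		if (tup == NULL)
			continue;			/* nothing ready; a real caller would wait on
								 * its latch here rather than spin */
		/* process tup; it is not ours to pfree */
	}
}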
+ */ + tuple = (MinimalTuple) data; + Assert(tuple->t_len == nbytes); + + return tuple; +} diff --git a/src/backend/executor/tstoreReceiver.c b/src/backend/executor/tstoreReceiver.c new file mode 100644 index 0000000..e07664f --- /dev/null +++ b/src/backend/executor/tstoreReceiver.c @@ -0,0 +1,283 @@ +/*------------------------------------------------------------------------- + * + * tstoreReceiver.c + * An implementation of DestReceiver that stores the result tuples in + * a Tuplestore. + * + * Optionally, we can force detoasting (but not decompression) of out-of-line + * toasted values. This is to support cursors WITH HOLD, which must retain + * data even if the underlying table is dropped. + * + * Also optionally, we can apply a tuple conversion map before storing. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/tstoreReceiver.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/tupconvert.h" +#include "executor/tstoreReceiver.h" + + +typedef struct +{ + DestReceiver pub; + /* parameters: */ + Tuplestorestate *tstore; /* where to put the data */ + MemoryContext cxt; /* context containing tstore */ + bool detoast; /* were we told to detoast? */ + TupleDesc target_tupdesc; /* target tupdesc, or NULL if none */ + const char *map_failure_msg; /* tupdesc mapping failure message */ + /* workspace: */ + Datum *outvalues; /* values array for result tuple */ + Datum *tofree; /* temp values to be pfree'd */ + TupleConversionMap *tupmap; /* conversion map, if needed */ + TupleTableSlot *mapslot; /* slot for mapped tuples */ +} TStoreState; + + +static bool tstoreReceiveSlot_notoast(TupleTableSlot *slot, DestReceiver *self); +static bool tstoreReceiveSlot_detoast(TupleTableSlot *slot, DestReceiver *self); +static bool tstoreReceiveSlot_tupmap(TupleTableSlot *slot, DestReceiver *self); + + +/* + * Prepare to receive tuples from executor. 
+ */ +static void +tstoreStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + TStoreState *myState = (TStoreState *) self; + bool needtoast = false; + int natts = typeinfo->natts; + int i; + + /* Check if any columns require detoast work */ + if (myState->detoast) + { + for (i = 0; i < natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(typeinfo, i); + + if (attr->attisdropped) + continue; + if (attr->attlen == -1) + { + needtoast = true; + break; + } + } + } + + /* Check if tuple conversion is needed */ + if (myState->target_tupdesc) + myState->tupmap = convert_tuples_by_position(typeinfo, + myState->target_tupdesc, + myState->map_failure_msg); + else + myState->tupmap = NULL; + + /* Set up appropriate callback */ + if (needtoast) + { + Assert(!myState->tupmap); + myState->pub.receiveSlot = tstoreReceiveSlot_detoast; + /* Create workspace */ + myState->outvalues = (Datum *) + MemoryContextAlloc(myState->cxt, natts * sizeof(Datum)); + myState->tofree = (Datum *) + MemoryContextAlloc(myState->cxt, natts * sizeof(Datum)); + myState->mapslot = NULL; + } + else if (myState->tupmap) + { + myState->pub.receiveSlot = tstoreReceiveSlot_tupmap; + myState->outvalues = NULL; + myState->tofree = NULL; + myState->mapslot = MakeSingleTupleTableSlot(myState->target_tupdesc, + &TTSOpsVirtual); + } + else + { + myState->pub.receiveSlot = tstoreReceiveSlot_notoast; + myState->outvalues = NULL; + myState->tofree = NULL; + myState->mapslot = NULL; + } +} + +/* + * Receive a tuple from the executor and store it in the tuplestore. + * This is for the easy case where we don't have to detoast nor map anything. + */ +static bool +tstoreReceiveSlot_notoast(TupleTableSlot *slot, DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + + tuplestore_puttupleslot(myState->tstore, slot); + + return true; +} + +/* + * Receive a tuple from the executor and store it in the tuplestore. + * This is for the case where we have to detoast any toasted values. + */ +static bool +tstoreReceiveSlot_detoast(TupleTableSlot *slot, DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + TupleDesc typeinfo = slot->tts_tupleDescriptor; + int natts = typeinfo->natts; + int nfree; + int i; + MemoryContext oldcxt; + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + /* + * Fetch back any out-of-line datums. We build the new datums array in + * myState->outvalues[] (but we can re-use the slot's isnull array). Also, + * remember the fetched values to free afterwards. + */ + nfree = 0; + for (i = 0; i < natts; i++) + { + Datum val = slot->tts_values[i]; + Form_pg_attribute attr = TupleDescAttr(typeinfo, i); + + if (!attr->attisdropped && attr->attlen == -1 && !slot->tts_isnull[i]) + { + if (VARATT_IS_EXTERNAL(DatumGetPointer(val))) + { + val = PointerGetDatum(detoast_external_attr((struct varlena *) + DatumGetPointer(val))); + myState->tofree[nfree++] = val; + } + } + + myState->outvalues[i] = val; + } + + /* + * Push the modified tuple into the tuplestore. + */ + oldcxt = MemoryContextSwitchTo(myState->cxt); + tuplestore_putvalues(myState->tstore, typeinfo, + myState->outvalues, slot->tts_isnull); + MemoryContextSwitchTo(oldcxt); + + /* And release any temporary detoasted values */ + for (i = 0; i < nfree; i++) + pfree(DatumGetPointer(myState->tofree[i])); + + return true; +} + +/* + * Receive a tuple from the executor and store it in the tuplestore. + * This is for the case where we must apply a tuple conversion map. 
+ */ +static bool +tstoreReceiveSlot_tupmap(TupleTableSlot *slot, DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + + execute_attr_map_slot(myState->tupmap->attrMap, slot, myState->mapslot); + tuplestore_puttupleslot(myState->tstore, myState->mapslot); + + return true; +} + +/* + * Clean up at end of an executor run + */ +static void +tstoreShutdownReceiver(DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + + /* Release workspace if any */ + if (myState->outvalues) + pfree(myState->outvalues); + myState->outvalues = NULL; + if (myState->tofree) + pfree(myState->tofree); + myState->tofree = NULL; + if (myState->tupmap) + free_conversion_map(myState->tupmap); + myState->tupmap = NULL; + if (myState->mapslot) + ExecDropSingleTupleTableSlot(myState->mapslot); + myState->mapslot = NULL; +} + +/* + * Destroy receiver when done with it + */ +static void +tstoreDestroyReceiver(DestReceiver *self) +{ + pfree(self); +} + +/* + * Initially create a DestReceiver object. + */ +DestReceiver * +CreateTuplestoreDestReceiver(void) +{ + TStoreState *self = (TStoreState *) palloc0(sizeof(TStoreState)); + + self->pub.receiveSlot = tstoreReceiveSlot_notoast; /* might change */ + self->pub.rStartup = tstoreStartupReceiver; + self->pub.rShutdown = tstoreShutdownReceiver; + self->pub.rDestroy = tstoreDestroyReceiver; + self->pub.mydest = DestTuplestore; + + /* private fields will be set by SetTuplestoreDestReceiverParams */ + + return (DestReceiver *) self; +} + +/* + * Set parameters for a TuplestoreDestReceiver + * + * tStore: where to store the tuples + * tContext: memory context containing tStore + * detoast: forcibly detoast contained data? + * target_tupdesc: if not NULL, forcibly convert tuples to this rowtype + * map_failure_msg: error message to use if mapping to target_tupdesc fails + * + * We don't currently support both detoast and target_tupdesc at the same + * time, just because no existing caller needs that combination. + */ +void +SetTuplestoreDestReceiverParams(DestReceiver *self, + Tuplestorestate *tStore, + MemoryContext tContext, + bool detoast, + TupleDesc target_tupdesc, + const char *map_failure_msg) +{ + TStoreState *myState = (TStoreState *) self; + + Assert(!(detoast && target_tupdesc)); + + Assert(myState->pub.mydest == DestTuplestore); + myState->tstore = tStore; + myState->cxt = tContext; + myState->detoast = detoast; + myState->target_tupdesc = target_tupdesc; + myState->map_failure_msg = map_failure_msg; +} -- cgit v1.2.3
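To round out the receiver's API, a minimal sketch (helper name hypothetical) of creating and configuring a tuplestore DestReceiver with the parameters documented above, here with forced detoasting and tuple conversion both disabled:

#include "postgres.h"
#include "executor/tstoreReceiver.h"
#include "utils/tuplestore.h"

/* Hypothetical: capture query output into an existing tuplestore */
static DestReceiver *
my_make_tstore_dest(Tuplestorestate *store, MemoryContext cxt)
{
	DestReceiver *dest = CreateTuplestoreDestReceiver();

	/* no forced detoasting, no tuple conversion */
	SetTuplestoreDestReceiverParams(dest, store, cxt, false, NULL, NULL);
	return dest;
}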