From 46651ce6fe013220ed397add242004d764fc0153 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 14:15:05 +0200 Subject: Adding upstream version 14.5. Signed-off-by: Daniel Baumann --- src/backend/executor/Makefile | 82 + src/backend/executor/README | 405 ++ src/backend/executor/execAmi.c | 662 ++++ src/backend/executor/execAsync.c | 154 + src/backend/executor/execCurrent.c | 426 +++ src/backend/executor/execExpr.c | 3965 +++++++++++++++++++ src/backend/executor/execExprInterp.c | 4373 +++++++++++++++++++++ src/backend/executor/execGrouping.c | 560 +++ src/backend/executor/execIndexing.c | 921 +++++ src/backend/executor/execJunk.c | 304 ++ src/backend/executor/execMain.c | 2886 ++++++++++++++ src/backend/executor/execParallel.c | 1498 ++++++++ src/backend/executor/execPartition.c | 2107 +++++++++++ src/backend/executor/execProcnode.c | 981 +++++ src/backend/executor/execReplication.c | 629 +++ src/backend/executor/execSRF.c | 980 +++++ src/backend/executor/execScan.c | 342 ++ src/backend/executor/execTuples.c | 2339 ++++++++++++ src/backend/executor/execUtils.c | 1351 +++++++ src/backend/executor/functions.c | 2103 +++++++++++ src/backend/executor/instrument.c | 279 ++ src/backend/executor/nodeAgg.c | 4829 ++++++++++++++++++++++++ src/backend/executor/nodeAppend.c | 1186 ++++++ src/backend/executor/nodeBitmapAnd.c | 223 ++ src/backend/executor/nodeBitmapHeapscan.c | 954 +++++ src/backend/executor/nodeBitmapIndexscan.c | 330 ++ src/backend/executor/nodeBitmapOr.c | 241 ++ src/backend/executor/nodeCtescan.c | 351 ++ src/backend/executor/nodeCustom.c | 228 ++ src/backend/executor/nodeForeignscan.c | 504 +++ src/backend/executor/nodeFunctionscan.c | 620 +++ src/backend/executor/nodeGather.c | 477 +++ src/backend/executor/nodeGatherMerge.c | 789 ++++ src/backend/executor/nodeGroup.c | 255 ++ src/backend/executor/nodeHash.c | 3434 +++++++++++++++++ src/backend/executor/nodeHashjoin.c | 1551 ++++++++ src/backend/executor/nodeIncrementalSort.c | 1257 ++++++ src/backend/executor/nodeIndexonlyscan.c | 735 ++++ src/backend/executor/nodeIndexscan.c | 1747 +++++++++ src/backend/executor/nodeLimit.c | 558 +++ src/backend/executor/nodeLockRows.c | 403 ++ src/backend/executor/nodeMaterial.c | 368 ++ src/backend/executor/nodeMemoize.c | 1225 ++++++ src/backend/executor/nodeMergeAppend.c | 389 ++ src/backend/executor/nodeMergejoin.c | 1678 ++++++++ src/backend/executor/nodeModifyTable.c | 3243 ++++++++++++++++ src/backend/executor/nodeNamedtuplestorescan.c | 201 + src/backend/executor/nodeNestloop.c | 411 ++ src/backend/executor/nodeProjectSet.c | 351 ++ src/backend/executor/nodeRecursiveunion.c | 331 ++ src/backend/executor/nodeResult.c | 272 ++ src/backend/executor/nodeSamplescan.c | 378 ++ src/backend/executor/nodeSeqscan.c | 314 ++ src/backend/executor/nodeSetOp.c | 651 ++++ src/backend/executor/nodeSort.c | 430 +++ src/backend/executor/nodeSubplan.c | 1313 +++++++ src/backend/executor/nodeSubqueryscan.c | 213 ++ src/backend/executor/nodeTableFuncscan.c | 523 +++ src/backend/executor/nodeTidrangescan.c | 413 ++ src/backend/executor/nodeTidscan.c | 558 +++ src/backend/executor/nodeUnique.c | 192 + src/backend/executor/nodeValuesscan.c | 361 ++ src/backend/executor/nodeWindowAgg.c | 3463 +++++++++++++++++ src/backend/executor/nodeWorktablescan.c | 223 ++ src/backend/executor/spi.c | 3383 +++++++++++++++++ src/backend/executor/tqueue.c | 210 ++ src/backend/executor/tstoreReceiver.c | 283 ++ 67 files changed, 69396 insertions(+) create mode 100644 src/backend/executor/Makefile create mode 
100644 src/backend/executor/README create mode 100644 src/backend/executor/execAmi.c create mode 100644 src/backend/executor/execAsync.c create mode 100644 src/backend/executor/execCurrent.c create mode 100644 src/backend/executor/execExpr.c create mode 100644 src/backend/executor/execExprInterp.c create mode 100644 src/backend/executor/execGrouping.c create mode 100644 src/backend/executor/execIndexing.c create mode 100644 src/backend/executor/execJunk.c create mode 100644 src/backend/executor/execMain.c create mode 100644 src/backend/executor/execParallel.c create mode 100644 src/backend/executor/execPartition.c create mode 100644 src/backend/executor/execProcnode.c create mode 100644 src/backend/executor/execReplication.c create mode 100644 src/backend/executor/execSRF.c create mode 100644 src/backend/executor/execScan.c create mode 100644 src/backend/executor/execTuples.c create mode 100644 src/backend/executor/execUtils.c create mode 100644 src/backend/executor/functions.c create mode 100644 src/backend/executor/instrument.c create mode 100644 src/backend/executor/nodeAgg.c create mode 100644 src/backend/executor/nodeAppend.c create mode 100644 src/backend/executor/nodeBitmapAnd.c create mode 100644 src/backend/executor/nodeBitmapHeapscan.c create mode 100644 src/backend/executor/nodeBitmapIndexscan.c create mode 100644 src/backend/executor/nodeBitmapOr.c create mode 100644 src/backend/executor/nodeCtescan.c create mode 100644 src/backend/executor/nodeCustom.c create mode 100644 src/backend/executor/nodeForeignscan.c create mode 100644 src/backend/executor/nodeFunctionscan.c create mode 100644 src/backend/executor/nodeGather.c create mode 100644 src/backend/executor/nodeGatherMerge.c create mode 100644 src/backend/executor/nodeGroup.c create mode 100644 src/backend/executor/nodeHash.c create mode 100644 src/backend/executor/nodeHashjoin.c create mode 100644 src/backend/executor/nodeIncrementalSort.c create mode 100644 src/backend/executor/nodeIndexonlyscan.c create mode 100644 src/backend/executor/nodeIndexscan.c create mode 100644 src/backend/executor/nodeLimit.c create mode 100644 src/backend/executor/nodeLockRows.c create mode 100644 src/backend/executor/nodeMaterial.c create mode 100644 src/backend/executor/nodeMemoize.c create mode 100644 src/backend/executor/nodeMergeAppend.c create mode 100644 src/backend/executor/nodeMergejoin.c create mode 100644 src/backend/executor/nodeModifyTable.c create mode 100644 src/backend/executor/nodeNamedtuplestorescan.c create mode 100644 src/backend/executor/nodeNestloop.c create mode 100644 src/backend/executor/nodeProjectSet.c create mode 100644 src/backend/executor/nodeRecursiveunion.c create mode 100644 src/backend/executor/nodeResult.c create mode 100644 src/backend/executor/nodeSamplescan.c create mode 100644 src/backend/executor/nodeSeqscan.c create mode 100644 src/backend/executor/nodeSetOp.c create mode 100644 src/backend/executor/nodeSort.c create mode 100644 src/backend/executor/nodeSubplan.c create mode 100644 src/backend/executor/nodeSubqueryscan.c create mode 100644 src/backend/executor/nodeTableFuncscan.c create mode 100644 src/backend/executor/nodeTidrangescan.c create mode 100644 src/backend/executor/nodeTidscan.c create mode 100644 src/backend/executor/nodeUnique.c create mode 100644 src/backend/executor/nodeValuesscan.c create mode 100644 src/backend/executor/nodeWindowAgg.c create mode 100644 src/backend/executor/nodeWorktablescan.c create mode 100644 src/backend/executor/spi.c create mode 100644 src/backend/executor/tqueue.c 
create mode 100644 src/backend/executor/tstoreReceiver.c (limited to 'src/backend/executor') diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile new file mode 100644 index 0000000..11118d0 --- /dev/null +++ b/src/backend/executor/Makefile @@ -0,0 +1,82 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for executor +# +# IDENTIFICATION +# src/backend/executor/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/executor +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + execAmi.o \ + execAsync.o \ + execCurrent.o \ + execExpr.o \ + execExprInterp.o \ + execGrouping.o \ + execIndexing.o \ + execJunk.o \ + execMain.o \ + execParallel.o \ + execPartition.o \ + execProcnode.o \ + execReplication.o \ + execSRF.o \ + execScan.o \ + execTuples.o \ + execUtils.o \ + functions.o \ + instrument.o \ + nodeAgg.o \ + nodeAppend.o \ + nodeBitmapAnd.o \ + nodeBitmapHeapscan.o \ + nodeBitmapIndexscan.o \ + nodeBitmapOr.o \ + nodeCtescan.o \ + nodeCustom.o \ + nodeForeignscan.o \ + nodeFunctionscan.o \ + nodeGather.o \ + nodeGatherMerge.o \ + nodeGroup.o \ + nodeHash.o \ + nodeHashjoin.o \ + nodeIncrementalSort.o \ + nodeIndexonlyscan.o \ + nodeIndexscan.o \ + nodeLimit.o \ + nodeLockRows.o \ + nodeMaterial.o \ + nodeMemoize.o \ + nodeMergeAppend.o \ + nodeMergejoin.o \ + nodeModifyTable.o \ + nodeNamedtuplestorescan.o \ + nodeNestloop.o \ + nodeProjectSet.o \ + nodeRecursiveunion.o \ + nodeResult.o \ + nodeSamplescan.o \ + nodeSeqscan.o \ + nodeSetOp.o \ + nodeSort.o \ + nodeSubplan.o \ + nodeSubqueryscan.o \ + nodeTableFuncscan.o \ + nodeTidrangescan.o \ + nodeTidscan.o \ + nodeUnique.o \ + nodeValuesscan.o \ + nodeWindowAgg.o \ + nodeWorktablescan.o \ + spi.o \ + tqueue.o \ + tstoreReceiver.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/executor/README b/src/backend/executor/README new file mode 100644 index 0000000..bf5e708 --- /dev/null +++ b/src/backend/executor/README @@ -0,0 +1,405 @@ +src/backend/executor/README + +The Postgres Executor +===================== + +The executor processes a tree of "plan nodes". The plan tree is essentially +a demand-pull pipeline of tuple processing operations. Each node, when +called, will produce the next tuple in its output sequence, or NULL if no +more tuples are available. If the node is not a primitive relation-scanning +node, it will have child node(s) that it calls in turn to obtain input +tuples. + +Refinements on this basic model include: + +* Choice of scan direction (forwards or backwards). Caution: this is not +currently well-supported. It works for primitive scan nodes, but not very +well for joins, aggregates, etc. + +* Rescan command to reset a node and make it generate its output sequence +over again. + +* Parameters that can alter a node's results. After adjusting a parameter, +the rescan command must be applied to that node and all nodes above it. +There is a moderately intelligent scheme to avoid rescanning nodes +unnecessarily (for example, Sort does not rescan its input if no parameters +of the input have changed, since it can just reread its stored sorted data). + +For a SELECT, it is only necessary to deliver the top-level result tuples +to the client. For INSERT/UPDATE/DELETE, the actual table modification +operations happen in a top-level ModifyTable plan node. 
If the query +includes a RETURNING clause, the ModifyTable node delivers the computed +RETURNING rows as output, otherwise it returns nothing. Handling INSERT +is pretty straightforward: the tuples returned from the plan tree below +ModifyTable are inserted into the correct result relation. For UPDATE, +the plan tree returns the new values of the updated columns, plus "junk" +(hidden) column(s) identifying which table row is to be updated. The +ModifyTable node must fetch that row to extract values for the unchanged +columns, combine the values into a new row, and apply the update. (For a +heap table, the row-identity junk column is a CTID, but other things may +be used for other table types.) For DELETE, the plan tree need only deliver +junk row-identity column(s), and the ModifyTable node visits each of those +rows and marks the row deleted. + +XXX a great deal more documentation needs to be written here... + + +Plan Trees and State Trees +-------------------------- + +The plan tree delivered by the planner contains a tree of Plan nodes (struct +types derived from struct Plan). During executor startup we build a parallel +tree of identical structure containing executor state nodes --- generally, +every plan node type has a corresponding executor state node type. Each node +in the state tree has a pointer to its corresponding node in the plan tree, +plus executor state data as needed to implement that node type. This +arrangement allows the plan tree to be completely read-only so far as the +executor is concerned: all data that is modified during execution is in the +state tree. Read-only plan trees make life much simpler for plan caching and +reuse. + +A corresponding executor state node may not be created during executor startup +if the executor determines that an entire subplan is not required due to +execution time partition pruning determining that no matching records will be +found there. This currently only occurs for Append and MergeAppend nodes. In +this case the non-required subplans are ignored and the executor state's +subnode array will become out of sequence to the plan's subplan list. + +Each Plan node may have expression trees associated with it, to represent +its target list, qualification conditions, etc. These trees are also +read-only to the executor, but the executor state for expression evaluation +does not mirror the Plan expression's tree shape, as explained below. +Rather, there's just one ExprState node per expression tree, although this +may have sub-nodes for some complex expression node types. + +Altogether there are four classes of nodes used in these trees: Plan nodes, +their corresponding PlanState nodes, Expr nodes, and ExprState nodes. +(Actually, there are also List nodes, which are used as "glue" in all +three tree-based representations.) + + +Expression Trees and ExprState nodes +------------------------------------ + +Expression trees, in contrast to Plan trees, are not mirrored into a +corresponding tree of state nodes. Instead each separately executable +expression tree (e.g. a Plan's qual or targetlist) is represented by one +ExprState node. The ExprState node contains the information needed to +evaluate the expression in a compact, linear form. That compact form is +stored as a flat array in ExprState->steps[] (an array of ExprEvalStep, +not ExprEvalStep *). 
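+
+For illustration only (this sketch is not part of the sources), the flat
+program stored in a compiled ExprState can be inspected by walking the
+steps[] array; ExecEvalStepOp() recovers each step's logical opcode even
+after the interpreter has replaced opcodes with computed-goto addresses:
+
+    #include "postgres.h"
+    #include "executor/execExpr.h"      /* ExprEvalStep, ExecEvalStepOp() */
+    #include "nodes/execnodes.h"        /* ExprState */
+
+    /* debugging-style sketch; assumes it runs inside the backend */
+    static void
+    print_expr_program(ExprState *state)
+    {
+        for (int i = 0; i < state->steps_len; i++)
+        {
+            ExprEvalStep *step = &state->steps[i];
+
+            elog(DEBUG1, "step %d: opcode %d",
+                 i, (int) ExecEvalStepOp(state, step));
+        }
+    }
+
+For a simple qual or arithmetic expression this typically shows a
+tuple-deforming step, a couple of variable-fetch steps, a function-call
+step, and a final EEOP_DONE step.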
+ +The reasons for choosing such a representation include: +- commonly the amount of work needed to evaluate one Expr-type node is + small enough that the overhead of having to perform a tree-walk + during evaluation is significant. +- the flat representation can be evaluated non-recursively within a single + function, reducing stack depth and function call overhead. +- such a representation is usable both for fast interpreted execution, + and for compiling into native code. + +The Plan-tree representation of an expression is compiled into an +ExprState node by ExecInitExpr(). As much complexity as possible should +be handled by ExecInitExpr() (and helpers), instead of execution time +where both interpreted and compiled versions would need to deal with the +complexity. Besides duplicating effort between execution approaches, +runtime initialization checks also have a small but noticeable cost every +time the expression is evaluated. Therefore, we allow ExecInitExpr() to +precompute information that we do not expect to vary across execution of a +single query, for example the set of CHECK constraint expressions to be +applied to a domain type. This could not be done at plan time without +greatly increasing the number of events that require plan invalidation. +(Previously, some information of this kind was rechecked on each +expression evaluation, but that seems like unnecessary overhead.) + + +Expression Initialization +------------------------- + +During ExecInitExpr() and similar routines, Expr trees are converted +into the flat representation. Each Expr node might be represented by +zero, one, or more ExprEvalSteps. + +Each ExprEvalStep's work is determined by its opcode (of enum ExprEvalOp) +and it stores the result of its work into the Datum variable and boolean +null flag variable pointed to by ExprEvalStep->resvalue/resnull. +Complex expressions are performed by chaining together several steps. +For example, "a + b" (one OpExpr, with two Var expressions) would be +represented as two steps to fetch the Var values, and one step for the +evaluation of the function underlying the + operator. The steps for the +Vars would have their resvalue/resnull pointing directly to the appropriate +args[].value .isnull elements in the FunctionCallInfoBaseData struct that +is used by the function evaluation step, thus avoiding extra work to copy +the result values around. + +The last entry in a completed ExprState->steps array is always an +EEOP_DONE step; this removes the need to test for end-of-array while +iterating. Also, if the expression contains any variable references (to +user columns of the ExprContext's INNER, OUTER, or SCAN tuples), the steps +array begins with EEOP_*_FETCHSOME steps that ensure that the relevant +tuples have been deconstructed to make the required columns directly +available (cf. slot_getsomeattrs()). This allows individual Var-fetching +steps to be little more than an array lookup. + +Most of ExecInitExpr()'s work is done by the recursive function +ExecInitExprRec() and its subroutines. ExecInitExprRec() maps one Expr +node into the steps required for execution, recursing as needed for +sub-expressions. + +Each ExecInitExprRec() call has to specify where that subexpression's +results are to be stored (via the resv/resnull parameters). 
This allows +the above scenario of evaluating a (sub-)expression directly into +fcinfo->args[].value/isnull, but also requires some care: target Datum/isnull +variables may not be shared with another ExecInitExprRec() unless the +results are only needed by steps executing before further usages of those +target Datum/isnull variables. Due to the non-recursiveness of the +ExprEvalStep representation that's usually easy to guarantee. + +ExecInitExprRec() pushes new operations into the ExprState->steps array +using ExprEvalPushStep(). To keep the steps as a consecutively laid out +array, ExprEvalPushStep() has to repalloc the entire array when there's +not enough space. Because of that it is *not* allowed to point directly +into any of the steps during expression initialization. Therefore, the +resv/resnull for a subexpression usually point to some storage that is +palloc'd separately from the steps array. For instance, the +FunctionCallInfoBaseData for a function call step is separately allocated +rather than being part of the ExprEvalStep array. The overall result +of a complete expression is typically returned into the resvalue/resnull +fields of the ExprState node itself. + +Some steps, e.g. boolean expressions, allow skipping evaluation of +certain subexpressions. In the flat representation this amounts to +jumping to some later step rather than just continuing consecutively +with the next step. The target for such a jump is represented by +the integer index in the ExprState->steps array of the step to execute +next. (Compare the EEO_NEXT and EEO_JUMP macros in execExprInterp.c.) + +Typically, ExecInitExprRec() has to push a jumping step into the steps +array, then recursively generate steps for the subexpression that might +get skipped over, then go back and fix up the jump target index using +the now-known length of the subexpression's steps. This is handled by +adjust_jumps lists in execExpr.c. + +The last step in constructing an ExprState is to apply ExecReadyExpr(), +which readies it for execution using whichever execution method has been +selected. + + +Expression Evaluation +--------------------- + +To allow for different methods of expression evaluation, and for +better branch/jump target prediction, expressions are evaluated by +calling ExprState->evalfunc (via ExecEvalExpr() and friends). + +ExecReadyExpr() can choose the method of interpretation by setting +evalfunc to an appropriate function. The default execution function, +ExecInterpExpr, is implemented in execExprInterp.c; see its header +comment for details. Special-case evalfuncs are used for certain +especially-simple expressions. + +Note that a lot of the more complex expression evaluation steps, which are +less performance-critical than the simpler ones, are implemented as +separate functions outside the fast-path of expression execution, allowing +their implementation to be shared between interpreted and compiled +expression evaluation. This means that these helper functions are not +allowed to perform expression step dispatch themselves, as the method of +dispatch will vary based on the caller. The helpers therefore cannot call +for the execution of subexpressions; all subexpression results they need +must be computed by earlier steps. And dispatch to the following +expression step must be performed after returning from the helper. + + +Targetlist Evaluation +--------------------- + +ExecBuildProjectionInfo builds an ExprState that has the effect of +evaluating a targetlist into ExprState->resultslot. 
A generic targetlist +expression is executed by evaluating it as discussed above (storing the +result into the ExprState's resvalue/resnull fields) and then using an +EEOP_ASSIGN_TMP step to move the result into the appropriate tts_values[] +and tts_isnull[] array elements of the result slot. There are special +fast-path step types (EEOP_ASSIGN_*_VAR) to handle targetlist entries that +are simple Vars using only one step instead of two. + + +Memory Management +----------------- + +A "per query" memory context is created during CreateExecutorState(); +all storage allocated during an executor invocation is allocated in that +context or a child context. This allows easy reclamation of storage +during executor shutdown --- rather than messing with retail pfree's and +probable storage leaks, we just destroy the memory context. + +In particular, the plan state trees and expression state trees described +in the previous section are allocated in the per-query memory context. + +To avoid intra-query memory leaks, most processing while a query runs +is done in "per tuple" memory contexts, which are so-called because they +are typically reset to empty once per tuple. Per-tuple contexts are usually +associated with ExprContexts, and commonly each PlanState node has its own +ExprContext to evaluate its qual and targetlist expressions in. + + +Query Processing Control Flow +----------------------------- + +This is a sketch of control flow for full query processing: + + CreateQueryDesc + + ExecutorStart + CreateExecutorState + creates per-query context + switch to per-query context to run ExecInitNode + AfterTriggerBeginQuery + ExecInitNode --- recursively scans plan tree + ExecInitNode + recurse into subsidiary nodes + CreateExprContext + creates per-tuple context + ExecInitExpr + + ExecutorRun + ExecProcNode --- recursively called in per-query context + ExecEvalExpr --- called in per-tuple context + ResetExprContext --- to free memory + + ExecutorFinish + ExecPostprocessPlan --- run any unfinished ModifyTable nodes + AfterTriggerEndQuery + + ExecutorEnd + ExecEndNode --- recursively releases resources + FreeExecutorState + frees per-query context and child contexts + + FreeQueryDesc + +Per above comments, it's not really critical for ExecEndNode to free any +memory; it'll all go away in FreeExecutorState anyway. However, we do need to +be careful to close relations, drop buffer pins, etc, so we do need to scan +the plan state tree to find these sorts of resources. + + +The executor can also be used to evaluate simple expressions without any Plan +tree ("simple" meaning "no aggregates and no sub-selects", though such might +be hidden inside function calls). 
This case has a flow of control like + + CreateExecutorState + creates per-query context + + CreateExprContext -- or use GetPerTupleExprContext(estate) + creates per-tuple context + + ExecPrepareExpr + temporarily switch to per-query context + run the expression through expression_planner + ExecInitExpr + + Repeatedly do: + ExecEvalExprSwitchContext + ExecEvalExpr --- called in per-tuple context + ResetExprContext --- to free memory + + FreeExecutorState + frees per-query context, as well as ExprContext + (a separate FreeExprContext call is not necessary) + + +EvalPlanQual (READ COMMITTED Update Checking) +--------------------------------------------- + +For simple SELECTs, the executor need only pay attention to tuples that are +valid according to the snapshot seen by the current transaction (ie, they +were inserted by a previously committed transaction, and not deleted by any +previously committed transaction). However, for UPDATE and DELETE it is not +cool to modify or delete a tuple that's been modified by an open or +concurrently-committed transaction. If we are running in SERIALIZABLE +isolation level then we just raise an error when this condition is seen to +occur. In READ COMMITTED isolation level, we must work a lot harder. + +The basic idea in READ COMMITTED mode is to take the modified tuple +committed by the concurrent transaction (after waiting for it to commit, +if need be) and re-evaluate the query qualifications to see if it would +still meet the quals. If so, we regenerate the updated tuple (if we are +doing an UPDATE) from the modified tuple, and finally update/delete the +modified tuple. SELECT FOR UPDATE/SHARE behaves similarly, except that its +action is just to lock the modified tuple and return results based on that +version of the tuple. + +To implement this checking, we actually re-run the query from scratch for +each modified tuple (or set of tuples, for SELECT FOR UPDATE), with the +relation scan nodes tweaked to return only the current tuples --- either +the original ones, or the updated (and now locked) versions of the modified +tuple(s). If this query returns a tuple, then the modified tuple(s) pass +the quals (and the query output is the suitably modified update tuple, if +we're doing UPDATE). If no tuple is returned, then the modified tuple(s) +fail the quals, so we ignore the current result tuple and continue the +original query. + +In UPDATE/DELETE, only the target relation needs to be handled this way. +In SELECT FOR UPDATE, there may be multiple relations flagged FOR UPDATE, +so we obtain lock on the current tuple version in each such relation before +executing the recheck. + +It is also possible that there are relations in the query that are not +to be locked (they are neither the UPDATE/DELETE target nor specified to +be locked in SELECT FOR UPDATE/SHARE). When re-running the test query +we want to use the same rows from these relations that were joined to +the locked rows. For ordinary relations this can be implemented relatively +cheaply by including the row TID in the join outputs and re-fetching that +TID. (The re-fetch is expensive, but we're trying to optimize the normal +case where no re-test is needed.) We have also to consider non-table +relations, such as a ValuesScan or FunctionScan. For these, since there +is no equivalent of TID, the only practical solution seems to be to include +the entire row value in the join output row. 
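+
+For illustration, the recheck described above is driven roughly like this
+by the ModifyTable node (see nodeModifyTable.c later in this patch; the
+sketch below is simplified and not verbatim).  Here "epqstate" is assumed
+to have been set up with EvalPlanQualInit() at node startup, and
+"inputslot" holds the newest committed version of the concurrently
+modified row:
+
+    TupleTableSlot *epqslot;
+
+    epqslot = EvalPlanQual(epqstate,
+                           resultRelationDesc,                /* target rel */
+                           resultRelInfo->ri_RangeTableIndex, /* its RT index */
+                           inputslot);
+    if (TupIsNull(epqslot))
+    {
+        /* row no longer passes the quals: skip the UPDATE/DELETE */
+    }
+    else
+    {
+        /* quals still pass: rebuild the new tuple from epqslot and retry */
+    }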
+ +We disallow set-returning functions in the targetlist of SELECT FOR UPDATE, +so as to ensure that at most one tuple can be returned for any particular +set of scan tuples. Otherwise we'd get duplicates due to the original +query returning the same set of scan tuples multiple times. Likewise, +SRFs are disallowed in an UPDATE's targetlist. There, they would have the +effect of the same row being updated multiple times, which is not very +useful --- and updates after the first would have no effect anyway. + + +Asynchronous Execution +---------------------- + +In cases where a node is waiting on an event external to the database system, +such as a ForeignScan awaiting network I/O, it's desirable for the node to +indicate that it cannot return any tuple immediately but may be able to do so +at a later time. A process which discovers this type of situation can always +handle it simply by blocking, but this may waste time that could be spent +executing some other part of the plan tree where progress could be made +immediately. This is particularly likely to occur when the plan tree contains +an Append node. Asynchronous execution runs multiple parts of an Append node +concurrently rather than serially to improve performance. + +For asynchronous execution, an Append node must first request a tuple from an +async-capable child node using ExecAsyncRequest. Next, it must execute the +asynchronous event loop using ExecAppendAsyncEventWait. Eventually, when a +child node to which an asynchronous request has been made produces a tuple, +the Append node will receive it from the event loop via ExecAsyncResponse. In +the current implementation of asynchronous execution, the only node type that +requests tuples from an async-capable child node is an Append, while the only +node type that might be async-capable is a ForeignScan. + +Typically, the ExecAsyncResponse callback is the only one required for nodes +that wish to request tuples asynchronously. On the other hand, async-capable +nodes generally need to implement three methods: + +1. When an asynchronous request is made, the node's ExecAsyncRequest callback + will be invoked; it should use ExecAsyncRequestPending to indicate that the + request is pending for a callback described below. Alternatively, it can + instead use ExecAsyncRequestDone if a result is available immediately. + +2. When the event loop wishes to wait or poll for file descriptor events, the + node's ExecAsyncConfigureWait callback will be invoked to configure the + file descriptor event for which the node wishes to wait. + +3. When the file descriptor becomes ready, the node's ExecAsyncNotify callback + will be invoked; like #1, it should use ExecAsyncRequestPending for another + callback or ExecAsyncRequestDone to return a result immediately. 
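+
+As a concrete (hypothetical) skeleton of those three callbacks, an
+async-capable node type might look roughly like the following.  Only
+ExecAsyncRequestDone(), ExecAsyncRequestPending() and AddWaitEventToSet()
+are real APIs here; the "MyScan" names and my_scan_* helpers are
+placeholders for node-specific code:
+
+    #include "postgres.h"
+    #include "executor/execAsync.h"   /* ExecAsyncRequestDone/Pending */
+    #include "nodes/execnodes.h"      /* AsyncRequest, PlanState */
+    #include "storage/latch.h"        /* AddWaitEventToSet, WL_SOCKET_READABLE */
+
+    /* hypothetical helpers standing in for node-specific state access */
+    extern TupleTableSlot *my_scan_try_fetch(PlanState *node);
+    extern WaitEventSet *my_scan_event_set(AsyncRequest *areq);
+    extern pgsocket my_scan_socket(AsyncRequest *areq);
+
+    /* 1. asynchronous request: return a tuple now, or declare it pending */
+    void
+    ExecAsyncMyScanRequest(AsyncRequest *areq)
+    {
+        TupleTableSlot *slot = my_scan_try_fetch(areq->requestee);
+
+        if (slot != NULL)
+            ExecAsyncRequestDone(areq, slot);
+        else
+            ExecAsyncRequestPending(areq);
+    }
+
+    /* 2. tell the event loop which file descriptor event to wait for */
+    void
+    ExecAsyncMyScanConfigureWait(AsyncRequest *areq)
+    {
+        AddWaitEventToSet(my_scan_event_set(areq), WL_SOCKET_READABLE,
+                          my_scan_socket(areq), NULL, areq);
+    }
+
+    /* 3. the file descriptor became ready: try to deliver a tuple */
+    void
+    ExecAsyncMyScanNotify(AsyncRequest *areq)
+    {
+        TupleTableSlot *slot = my_scan_try_fetch(areq->requestee);
+
+        if (slot != NULL)
+            ExecAsyncRequestDone(areq, slot);
+        else
+            ExecAsyncRequestPending(areq);
+    }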
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c new file mode 100644 index 0000000..c3aa650 --- /dev/null +++ b/src/backend/executor/execAmi.c @@ -0,0 +1,662 @@ +/*------------------------------------------------------------------------- + * + * execAmi.c + * miscellaneous executor access method routines + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/executor/execAmi.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amapi.h" +#include "access/htup_details.h" +#include "executor/execdebug.h" +#include "executor/nodeAgg.h" +#include "executor/nodeAppend.h" +#include "executor/nodeBitmapAnd.h" +#include "executor/nodeBitmapHeapscan.h" +#include "executor/nodeBitmapIndexscan.h" +#include "executor/nodeBitmapOr.h" +#include "executor/nodeCtescan.h" +#include "executor/nodeCustom.h" +#include "executor/nodeForeignscan.h" +#include "executor/nodeFunctionscan.h" +#include "executor/nodeGather.h" +#include "executor/nodeGatherMerge.h" +#include "executor/nodeGroup.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "executor/nodeLimit.h" +#include "executor/nodeLockRows.h" +#include "executor/nodeMaterial.h" +#include "executor/nodeMemoize.h" +#include "executor/nodeMergeAppend.h" +#include "executor/nodeMergejoin.h" +#include "executor/nodeModifyTable.h" +#include "executor/nodeNamedtuplestorescan.h" +#include "executor/nodeNestloop.h" +#include "executor/nodeProjectSet.h" +#include "executor/nodeRecursiveunion.h" +#include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" +#include "executor/nodeSeqscan.h" +#include "executor/nodeSetOp.h" +#include "executor/nodeSort.h" +#include "executor/nodeSubplan.h" +#include "executor/nodeSubqueryscan.h" +#include "executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" +#include "executor/nodeTidscan.h" +#include "executor/nodeUnique.h" +#include "executor/nodeValuesscan.h" +#include "executor/nodeWindowAgg.h" +#include "executor/nodeWorktablescan.h" +#include "nodes/extensible.h" +#include "nodes/nodeFuncs.h" +#include "nodes/pathnodes.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +static bool IndexSupportsBackwardScan(Oid indexid); + + +/* + * ExecReScan + * Reset a plan node so that its output can be re-scanned. + * + * Note that if the plan node has parameters that have changed value, + * the output might be different from last time. + */ +void +ExecReScan(PlanState *node) +{ + /* If collecting timing stats, update them */ + if (node->instrument) + InstrEndLoop(node->instrument); + + /* + * If we have changed parameters, propagate that info. + * + * Note: ExecReScanSetParamPlan() can add bits to node->chgParam, + * corresponding to the output param(s) that the InitPlan will update. + * Since we make only one pass over the list, that means that an InitPlan + * can depend on the output param(s) of a sibling InitPlan only if that + * sibling appears earlier in the list. This is workable for now given + * the limited ways in which one InitPlan could depend on another, but + * eventually we might need to work harder (or else make the planner + * enlarge the extParam/allParam sets to include the params of depended-on + * InitPlans). 
+ */ + if (node->chgParam != NULL) + { + ListCell *l; + + foreach(l, node->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + + if (splan->plan->extParam != NULL) /* don't care about child + * local Params */ + UpdateChangedParamSet(splan, node->chgParam); + if (splan->chgParam != NULL) + ExecReScanSetParamPlan(sstate, node); + } + foreach(l, node->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(l); + PlanState *splan = sstate->planstate; + + if (splan->plan->extParam != NULL) + UpdateChangedParamSet(splan, node->chgParam); + } + /* Well. Now set chgParam for left/right trees. */ + if (node->lefttree != NULL) + UpdateChangedParamSet(node->lefttree, node->chgParam); + if (node->righttree != NULL) + UpdateChangedParamSet(node->righttree, node->chgParam); + } + + /* Call expression callbacks */ + if (node->ps_ExprContext) + ReScanExprContext(node->ps_ExprContext); + + /* And do node-type-specific processing */ + switch (nodeTag(node)) + { + case T_ResultState: + ExecReScanResult((ResultState *) node); + break; + + case T_ProjectSetState: + ExecReScanProjectSet((ProjectSetState *) node); + break; + + case T_ModifyTableState: + ExecReScanModifyTable((ModifyTableState *) node); + break; + + case T_AppendState: + ExecReScanAppend((AppendState *) node); + break; + + case T_MergeAppendState: + ExecReScanMergeAppend((MergeAppendState *) node); + break; + + case T_RecursiveUnionState: + ExecReScanRecursiveUnion((RecursiveUnionState *) node); + break; + + case T_BitmapAndState: + ExecReScanBitmapAnd((BitmapAndState *) node); + break; + + case T_BitmapOrState: + ExecReScanBitmapOr((BitmapOrState *) node); + break; + + case T_SeqScanState: + ExecReScanSeqScan((SeqScanState *) node); + break; + + case T_SampleScanState: + ExecReScanSampleScan((SampleScanState *) node); + break; + + case T_GatherState: + ExecReScanGather((GatherState *) node); + break; + + case T_GatherMergeState: + ExecReScanGatherMerge((GatherMergeState *) node); + break; + + case T_IndexScanState: + ExecReScanIndexScan((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecReScanIndexOnlyScan((IndexOnlyScanState *) node); + break; + + case T_BitmapIndexScanState: + ExecReScanBitmapIndexScan((BitmapIndexScanState *) node); + break; + + case T_BitmapHeapScanState: + ExecReScanBitmapHeapScan((BitmapHeapScanState *) node); + break; + + case T_TidScanState: + ExecReScanTidScan((TidScanState *) node); + break; + + case T_TidRangeScanState: + ExecReScanTidRangeScan((TidRangeScanState *) node); + break; + + case T_SubqueryScanState: + ExecReScanSubqueryScan((SubqueryScanState *) node); + break; + + case T_FunctionScanState: + ExecReScanFunctionScan((FunctionScanState *) node); + break; + + case T_TableFuncScanState: + ExecReScanTableFuncScan((TableFuncScanState *) node); + break; + + case T_ValuesScanState: + ExecReScanValuesScan((ValuesScanState *) node); + break; + + case T_CteScanState: + ExecReScanCteScan((CteScanState *) node); + break; + + case T_NamedTuplestoreScanState: + ExecReScanNamedTuplestoreScan((NamedTuplestoreScanState *) node); + break; + + case T_WorkTableScanState: + ExecReScanWorkTableScan((WorkTableScanState *) node); + break; + + case T_ForeignScanState: + ExecReScanForeignScan((ForeignScanState *) node); + break; + + case T_CustomScanState: + ExecReScanCustomScan((CustomScanState *) node); + break; + + case T_NestLoopState: + ExecReScanNestLoop((NestLoopState *) node); + break; + + case T_MergeJoinState: + 
ExecReScanMergeJoin((MergeJoinState *) node); + break; + + case T_HashJoinState: + ExecReScanHashJoin((HashJoinState *) node); + break; + + case T_MaterialState: + ExecReScanMaterial((MaterialState *) node); + break; + + case T_MemoizeState: + ExecReScanMemoize((MemoizeState *) node); + break; + + case T_SortState: + ExecReScanSort((SortState *) node); + break; + + case T_IncrementalSortState: + ExecReScanIncrementalSort((IncrementalSortState *) node); + break; + + case T_GroupState: + ExecReScanGroup((GroupState *) node); + break; + + case T_AggState: + ExecReScanAgg((AggState *) node); + break; + + case T_WindowAggState: + ExecReScanWindowAgg((WindowAggState *) node); + break; + + case T_UniqueState: + ExecReScanUnique((UniqueState *) node); + break; + + case T_HashState: + ExecReScanHash((HashState *) node); + break; + + case T_SetOpState: + ExecReScanSetOp((SetOpState *) node); + break; + + case T_LockRowsState: + ExecReScanLockRows((LockRowsState *) node); + break; + + case T_LimitState: + ExecReScanLimit((LimitState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } + + if (node->chgParam != NULL) + { + bms_free(node->chgParam); + node->chgParam = NULL; + } +} + +/* + * ExecMarkPos + * + * Marks the current scan position. + * + * NOTE: mark/restore capability is currently needed only for plan nodes + * that are the immediate inner child of a MergeJoin node. Since MergeJoin + * requires sorted input, there is never any need to support mark/restore in + * node types that cannot produce sorted output. There are some cases in + * which a node can pass through sorted data from its child; if we don't + * implement mark/restore for such a node type, the planner compensates by + * inserting a Material node above that node. + */ +void +ExecMarkPos(PlanState *node) +{ + switch (nodeTag(node)) + { + case T_IndexScanState: + ExecIndexMarkPos((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecIndexOnlyMarkPos((IndexOnlyScanState *) node); + break; + + case T_CustomScanState: + ExecCustomMarkPos((CustomScanState *) node); + break; + + case T_MaterialState: + ExecMaterialMarkPos((MaterialState *) node); + break; + + case T_SortState: + ExecSortMarkPos((SortState *) node); + break; + + case T_ResultState: + ExecResultMarkPos((ResultState *) node); + break; + + default: + /* don't make hard error unless caller asks to restore... */ + elog(DEBUG2, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * ExecRestrPos + * + * restores the scan position previously saved with ExecMarkPos() + * + * NOTE: the semantics of this are that the first ExecProcNode following + * the restore operation will yield the same tuple as the first one following + * the mark operation. It is unspecified what happens to the plan node's + * result TupleTableSlot. (In most cases the result slot is unchanged by + * a restore, but the node may choose to clear it or to load it with the + * restored-to tuple.) Hence the caller should discard any previously + * returned TupleTableSlot after doing a restore. 
+ */ +void +ExecRestrPos(PlanState *node) +{ + switch (nodeTag(node)) + { + case T_IndexScanState: + ExecIndexRestrPos((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecIndexOnlyRestrPos((IndexOnlyScanState *) node); + break; + + case T_CustomScanState: + ExecCustomRestrPos((CustomScanState *) node); + break; + + case T_MaterialState: + ExecMaterialRestrPos((MaterialState *) node); + break; + + case T_SortState: + ExecSortRestrPos((SortState *) node); + break; + + case T_ResultState: + ExecResultRestrPos((ResultState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * ExecSupportsMarkRestore - does a Path support mark/restore? + * + * This is used during planning and so must accept a Path, not a Plan. + * We keep it here to be adjacent to the routines above, which also must + * know which plan types support mark/restore. + */ +bool +ExecSupportsMarkRestore(Path *pathnode) +{ + /* + * For consistency with the routines above, we do not examine the nodeTag + * but rather the pathtype, which is the Plan node type the Path would + * produce. + */ + switch (pathnode->pathtype) + { + case T_IndexScan: + case T_IndexOnlyScan: + + /* + * Not all index types support mark/restore. + */ + return castNode(IndexPath, pathnode)->indexinfo->amcanmarkpos; + + case T_Material: + case T_Sort: + return true; + + case T_CustomScan: + { + CustomPath *customPath = castNode(CustomPath, pathnode); + + if (customPath->flags & CUSTOMPATH_SUPPORT_MARK_RESTORE) + return true; + return false; + } + case T_Result: + + /* + * Result supports mark/restore iff it has a child plan that does. + * + * We have to be careful here because there is more than one Path + * type that can produce a Result plan node. + */ + if (IsA(pathnode, ProjectionPath)) + return ExecSupportsMarkRestore(((ProjectionPath *) pathnode)->subpath); + else if (IsA(pathnode, MinMaxAggPath)) + return false; /* childless Result */ + else if (IsA(pathnode, GroupResultPath)) + return false; /* childless Result */ + else + { + /* Simple RTE_RESULT base relation */ + Assert(IsA(pathnode, Path)); + return false; /* childless Result */ + } + + case T_Append: + { + AppendPath *appendPath = castNode(AppendPath, pathnode); + + /* + * If there's exactly one child, then there will be no Append + * in the final plan, so we can handle mark/restore if the + * child plan node can. + */ + if (list_length(appendPath->subpaths) == 1) + return ExecSupportsMarkRestore((Path *) linitial(appendPath->subpaths)); + /* Otherwise, Append can't handle it */ + return false; + } + + case T_MergeAppend: + { + MergeAppendPath *mapath = castNode(MergeAppendPath, pathnode); + + /* + * Like the Append case above, single-subpath MergeAppends + * won't be in the final plan, so just return the child's + * mark/restore ability. + */ + if (list_length(mapath->subpaths) == 1) + return ExecSupportsMarkRestore((Path *) linitial(mapath->subpaths)); + /* Otherwise, MergeAppend can't handle it */ + return false; + } + + default: + break; + } + + return false; +} + +/* + * ExecSupportsBackwardScan - does a plan type support backwards scanning? + * + * Ideally, all plan types would support backwards scan, but that seems + * unlikely to happen soon. In some cases, a plan node passes the backwards + * scan down to its children, and so supports backwards scan only if its + * children do. Therefore, this routine must be passed a complete plan tree. 
+ */ +bool +ExecSupportsBackwardScan(Plan *node) +{ + if (node == NULL) + return false; + + /* + * Parallel-aware nodes return a subset of the tuples in each worker, and + * in general we can't expect to have enough bookkeeping state to know + * which ones we returned in this worker as opposed to some other worker. + */ + if (node->parallel_aware) + return false; + + switch (nodeTag(node)) + { + case T_Result: + if (outerPlan(node) != NULL) + return ExecSupportsBackwardScan(outerPlan(node)); + else + return false; + + case T_Append: + { + ListCell *l; + + /* With async, tuples may be interleaved, so can't back up. */ + if (((Append *) node)->nasyncplans > 0) + return false; + + foreach(l, ((Append *) node)->appendplans) + { + if (!ExecSupportsBackwardScan((Plan *) lfirst(l))) + return false; + } + /* need not check tlist because Append doesn't evaluate it */ + return true; + } + + case T_SampleScan: + /* Simplify life for tablesample methods by disallowing this */ + return false; + + case T_Gather: + return false; + + case T_IndexScan: + return IndexSupportsBackwardScan(((IndexScan *) node)->indexid); + + case T_IndexOnlyScan: + return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid); + + case T_SubqueryScan: + return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan); + + case T_CustomScan: + { + uint32 flags = ((CustomScan *) node)->flags; + + if (flags & CUSTOMPATH_SUPPORT_BACKWARD_SCAN) + return true; + } + return false; + + case T_SeqScan: + case T_TidScan: + case T_TidRangeScan: + case T_FunctionScan: + case T_ValuesScan: + case T_CteScan: + case T_Material: + case T_Sort: + /* these don't evaluate tlist */ + return true; + + case T_IncrementalSort: + + /* + * Unlike full sort, incremental sort keeps only a single group of + * tuples in memory, so it can't scan backwards. + */ + return false; + + case T_LockRows: + case T_Limit: + return ExecSupportsBackwardScan(outerPlan(node)); + + default: + return false; + } +} + +/* + * An IndexScan or IndexOnlyScan node supports backward scan only if the + * index's AM does. + */ +static bool +IndexSupportsBackwardScan(Oid indexid) +{ + bool result; + HeapTuple ht_idxrel; + Form_pg_class idxrelrec; + IndexAmRoutine *amroutine; + + /* Fetch the pg_class tuple of the index relation */ + ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexid)); + if (!HeapTupleIsValid(ht_idxrel)) + elog(ERROR, "cache lookup failed for relation %u", indexid); + idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); + + /* Fetch the index AM's API struct */ + amroutine = GetIndexAmRoutineByAmId(idxrelrec->relam, false); + + result = amroutine->amcanbackward; + + pfree(amroutine); + ReleaseSysCache(ht_idxrel); + + return result; +} + +/* + * ExecMaterializesOutput - does a plan type materialize its output? + * + * Returns true if the plan node type is one that automatically materializes + * its output (typically by keeping it in a tuplestore). For such plans, + * a rescan without any parameter change will have zero startup cost and + * very low per-tuple cost. 
+ */ +bool +ExecMaterializesOutput(NodeTag plantype) +{ + switch (plantype) + { + case T_Material: + case T_FunctionScan: + case T_TableFuncScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_Sort: + return true; + + default: + break; + } + + return false; +} diff --git a/src/backend/executor/execAsync.c b/src/backend/executor/execAsync.c new file mode 100644 index 0000000..94a284a --- /dev/null +++ b/src/backend/executor/execAsync.c @@ -0,0 +1,154 @@ +/*------------------------------------------------------------------------- + * + * execAsync.c + * Support routines for asynchronous execution + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execAsync.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execAsync.h" +#include "executor/executor.h" +#include "executor/nodeAppend.h" +#include "executor/nodeForeignscan.h" + +/* + * Asynchronously request a tuple from a designed async-capable node. + */ +void +ExecAsyncRequest(AsyncRequest *areq) +{ + if (areq->requestee->chgParam != NULL) /* something changed? */ + ExecReScan(areq->requestee); /* let ReScan handle this */ + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStartNode(areq->requestee->instrument); + + switch (nodeTag(areq->requestee)) + { + case T_ForeignScanState: + ExecAsyncForeignScanRequest(areq); + break; + default: + /* If the node doesn't support async, caller messed up. */ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestee)); + } + + ExecAsyncResponse(areq); + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStopNode(areq->requestee->instrument, + TupIsNull(areq->result) ? 0.0 : 1.0); +} + +/* + * Give the asynchronous node a chance to configure the file descriptor event + * for which it wishes to wait. We expect the node-type specific callback to + * make a single call of the following form: + * + * AddWaitEventToSet(set, WL_SOCKET_READABLE, fd, NULL, areq); + */ +void +ExecAsyncConfigureWait(AsyncRequest *areq) +{ + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStartNode(areq->requestee->instrument); + + switch (nodeTag(areq->requestee)) + { + case T_ForeignScanState: + ExecAsyncForeignScanConfigureWait(areq); + break; + default: + /* If the node doesn't support async, caller messed up. */ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestee)); + } + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStopNode(areq->requestee->instrument, 0.0); +} + +/* + * Call the asynchronous node back when a relevant event has occurred. + */ +void +ExecAsyncNotify(AsyncRequest *areq) +{ + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStartNode(areq->requestee->instrument); + + switch (nodeTag(areq->requestee)) + { + case T_ForeignScanState: + ExecAsyncForeignScanNotify(areq); + break; + default: + /* If the node doesn't support async, caller messed up. 
*/ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestee)); + } + + ExecAsyncResponse(areq); + + /* must provide our own instrumentation support */ + if (areq->requestee->instrument) + InstrStopNode(areq->requestee->instrument, + TupIsNull(areq->result) ? 0.0 : 1.0); +} + +/* + * Call the requestor back when an asynchronous node has produced a result. + */ +void +ExecAsyncResponse(AsyncRequest *areq) +{ + switch (nodeTag(areq->requestor)) + { + case T_AppendState: + ExecAsyncAppendResponse(areq); + break; + default: + /* If the node doesn't support async, caller messed up. */ + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(areq->requestor)); + } +} + +/* + * A requestee node should call this function to deliver the tuple to its + * requestor node. The requestee node can call this from its ExecAsyncRequest + * or ExecAsyncNotify callback. + */ +void +ExecAsyncRequestDone(AsyncRequest *areq, TupleTableSlot *result) +{ + areq->request_complete = true; + areq->result = result; +} + +/* + * A requestee node should call this function to indicate that it is pending + * for a callback. The requestee node can call this from its ExecAsyncRequest + * or ExecAsyncNotify callback. + */ +void +ExecAsyncRequestPending(AsyncRequest *areq) +{ + areq->callback_pending = true; + areq->request_complete = false; + areq->result = NULL; +} diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c new file mode 100644 index 0000000..4f430fb --- /dev/null +++ b/src/backend/executor/execCurrent.c @@ -0,0 +1,426 @@ +/*------------------------------------------------------------------------- + * + * execCurrent.c + * executor support for WHERE CURRENT OF cursor + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/executor/execCurrent.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/sysattr.h" +#include "catalog/pg_type.h" +#include "executor/executor.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/portal.h" +#include "utils/rel.h" + + +static char *fetch_cursor_param_value(ExprContext *econtext, int paramId); +static ScanState *search_plan_tree(PlanState *node, Oid table_oid, + bool *pending_rescan); + + +/* + * execCurrentOf + * + * Given a CURRENT OF expression and the OID of a table, determine which row + * of the table is currently being scanned by the cursor named by CURRENT OF, + * and return the row's TID into *current_tid. + * + * Returns true if a row was identified. Returns false if the cursor is valid + * for the table but is not currently scanning a row of the table (this is a + * legal situation in inheritance cases). Raises error if cursor is not a + * valid updatable scan of the specified table. 
+ */ +bool +execCurrentOf(CurrentOfExpr *cexpr, + ExprContext *econtext, + Oid table_oid, + ItemPointer current_tid) +{ + char *cursor_name; + char *table_name; + Portal portal; + QueryDesc *queryDesc; + + /* Get the cursor name --- may have to look up a parameter reference */ + if (cexpr->cursor_name) + cursor_name = cexpr->cursor_name; + else + cursor_name = fetch_cursor_param_value(econtext, cexpr->cursor_param); + + /* Fetch table name for possible use in error messages */ + table_name = get_rel_name(table_oid); + if (table_name == NULL) + elog(ERROR, "cache lookup failed for relation %u", table_oid); + + /* Find the cursor's portal */ + portal = GetPortalByName(cursor_name); + if (!PortalIsValid(portal)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_CURSOR), + errmsg("cursor \"%s\" does not exist", cursor_name))); + + /* + * We have to watch out for non-SELECT queries as well as held cursors, + * both of which may have null queryDesc. + */ + if (portal->strategy != PORTAL_ONE_SELECT) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a SELECT query", + cursor_name))); + queryDesc = portal->queryDesc; + if (queryDesc == NULL || queryDesc->estate == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is held from a previous transaction", + cursor_name))); + + /* + * We have two different strategies depending on whether the cursor uses + * FOR UPDATE/SHARE or not. The reason for supporting both is that the + * FOR UPDATE code is able to identify a target table in many cases where + * the other code can't, while the non-FOR-UPDATE case allows use of WHERE + * CURRENT OF with an insensitive cursor. + */ + if (queryDesc->estate->es_rowmarks) + { + ExecRowMark *erm; + Index i; + + /* + * Here, the query must have exactly one FOR UPDATE/SHARE reference to + * the target table, and we dig the ctid info out of that. + */ + erm = NULL; + for (i = 0; i < queryDesc->estate->es_range_table_size; i++) + { + ExecRowMark *thiserm = queryDesc->estate->es_rowmarks[i]; + + if (thiserm == NULL || + !RowMarkRequiresRowShareLock(thiserm->markType)) + continue; /* ignore non-FOR UPDATE/SHARE items */ + + if (thiserm->relid == table_oid) + { + if (erm) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" has multiple FOR UPDATE/SHARE references to table \"%s\"", + cursor_name, table_name))); + erm = thiserm; + } + } + + if (erm == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" does not have a FOR UPDATE/SHARE reference to table \"%s\"", + cursor_name, table_name))); + + /* + * The cursor must have a current result row: per the SQL spec, it's + * an error if not. + */ + if (portal->atStart || portal->atEnd) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not positioned on a row", + cursor_name))); + + /* Return the currently scanned TID, if there is one */ + if (ItemPointerIsValid(&(erm->curCtid))) + { + *current_tid = erm->curCtid; + return true; + } + + /* + * This table didn't produce the cursor's current row; some other + * inheritance child of the same parent must have. Signal caller to + * do nothing on this table. + */ + return false; + } + else + { + /* + * Without FOR UPDATE, we dig through the cursor's plan to find the + * scan node. Fail if it's not there or buried underneath + * aggregation. 
+ */ + ScanState *scanstate; + bool pending_rescan = false; + + scanstate = search_plan_tree(queryDesc->planstate, table_oid, + &pending_rescan); + if (!scanstate) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"", + cursor_name, table_name))); + + /* + * The cursor must have a current result row: per the SQL spec, it's + * an error if not. We test this at the top level, rather than at the + * scan node level, because in inheritance cases any one table scan + * could easily not be on a row. We want to return false, not raise + * error, if the passed-in table OID is for one of the inactive scans. + */ + if (portal->atStart || portal->atEnd) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not positioned on a row", + cursor_name))); + + /* + * Now OK to return false if we found an inactive scan. It is + * inactive either if it's not positioned on a row, or there's a + * rescan pending for it. + */ + if (TupIsNull(scanstate->ss_ScanTupleSlot) || pending_rescan) + return false; + + /* + * Extract TID of the scan's current row. The mechanism for this is + * in principle scan-type-dependent, but for most scan types, we can + * just dig the TID out of the physical scan tuple. + */ + if (IsA(scanstate, IndexOnlyScanState)) + { + /* + * For IndexOnlyScan, the tuple stored in ss_ScanTupleSlot may be + * a virtual tuple that does not have the ctid column, so we have + * to get the TID from xs_ctup.t_self. + */ + IndexScanDesc scan = ((IndexOnlyScanState *) scanstate)->ioss_ScanDesc; + + *current_tid = scan->xs_heaptid; + } + else + { + /* + * Default case: try to fetch TID from the scan node's current + * tuple. As an extra cross-check, verify tableoid in the current + * tuple. If the scan hasn't provided a physical tuple, we have + * to fail. + */ + Datum ldatum; + bool lisnull; + ItemPointer tuple_tid; + +#ifdef USE_ASSERT_CHECKING + ldatum = slot_getsysattr(scanstate->ss_ScanTupleSlot, + TableOidAttributeNumber, + &lisnull); + if (lisnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"", + cursor_name, table_name))); + Assert(DatumGetObjectId(ldatum) == table_oid); +#endif + + ldatum = slot_getsysattr(scanstate->ss_ScanTupleSlot, + SelfItemPointerAttributeNumber, + &lisnull); + if (lisnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cursor \"%s\" is not a simply updatable scan of table \"%s\"", + cursor_name, table_name))); + tuple_tid = (ItemPointer) DatumGetPointer(ldatum); + + *current_tid = *tuple_tid; + } + + Assert(ItemPointerIsValid(current_tid)); + + return true; + } +} + +/* + * fetch_cursor_param_value + * + * Fetch the string value of a param, verifying it is of type REFCURSOR. 
+ */ +static char * +fetch_cursor_param_value(ExprContext *econtext, int paramId) +{ + ParamListInfo paramInfo = econtext->ecxt_param_list_info; + + if (paramInfo && + paramId > 0 && paramId <= paramInfo->numParams) + { + ParamExternData *prm; + ParamExternData prmdata; + + /* give hook a chance in case parameter is dynamic */ + if (paramInfo->paramFetch != NULL) + prm = paramInfo->paramFetch(paramInfo, paramId, false, &prmdata); + else + prm = &paramInfo->params[paramId - 1]; + + if (OidIsValid(prm->ptype) && !prm->isnull) + { + /* safety check in case hook did something unexpected */ + if (prm->ptype != REFCURSOROID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("type of parameter %d (%s) does not match that when preparing the plan (%s)", + paramId, + format_type_be(prm->ptype), + format_type_be(REFCURSOROID)))); + + /* We know that refcursor uses text's I/O routines */ + return TextDatumGetCString(prm->value); + } + } + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("no value found for parameter %d", paramId))); + return NULL; +} + +/* + * search_plan_tree + * + * Search through a PlanState tree for a scan node on the specified table. + * Return NULL if not found or multiple candidates. + * + * CAUTION: this function is not charged simply with finding some candidate + * scan, but with ensuring that that scan returned the plan tree's current + * output row. That's why we must reject multiple-match cases. + * + * If a candidate is found, set *pending_rescan to true if that candidate + * or any node above it has a pending rescan action, i.e. chgParam != NULL. + * That indicates that we shouldn't consider the node to be positioned on a + * valid tuple, even if its own state would indicate that it is. (Caller + * must initialize *pending_rescan to false, and should not trust its state + * if multiple candidates are found.) + */ +static ScanState * +search_plan_tree(PlanState *node, Oid table_oid, + bool *pending_rescan) +{ + ScanState *result = NULL; + + if (node == NULL) + return NULL; + switch (nodeTag(node)) + { + /* + * Relation scan nodes can all be treated alike: check to see if + * they are scanning the specified table. + * + * ForeignScan and CustomScan might not have a currentRelation, in + * which case we just ignore them. (We dare not descend to any + * child plan nodes they might have, since we do not know the + * relationship of such a node's current output tuple to the + * children's current outputs.) + */ + case T_SeqScanState: + case T_SampleScanState: + case T_IndexScanState: + case T_IndexOnlyScanState: + case T_BitmapHeapScanState: + case T_TidScanState: + case T_TidRangeScanState: + case T_ForeignScanState: + case T_CustomScanState: + { + ScanState *sstate = (ScanState *) node; + + if (sstate->ss_currentRelation && + RelationGetRelid(sstate->ss_currentRelation) == table_oid) + result = sstate; + break; + } + + /* + * For Append, we can check each input node. It is safe to + * descend to the inputs because only the input that resulted in + * the Append's current output node could be positioned on a tuple + * at all; the other inputs are either at EOF or not yet started. + * Hence, if the desired table is scanned by some + * currently-inactive input node, we will find that node but then + * our caller will realize that it didn't emit the tuple of + * interest. + * + * We do need to watch out for multiple matches (possible if + * Append was from UNION ALL rather than an inheritance tree).
+ * + * Note: we can NOT descend through MergeAppend similarly, since + * its inputs are likely all active, and we don't know which one + * returned the current output tuple. (Perhaps that could be + * fixed if we were to let this code know more about MergeAppend's + * internal state, but it does not seem worth the trouble. Users + * should not expect plans for ORDER BY queries to be considered + * simply-updatable, since they won't be if the sorting is + * implemented by a Sort node.) + */ + case T_AppendState: + { + AppendState *astate = (AppendState *) node; + int i; + + for (i = 0; i < astate->as_nplans; i++) + { + ScanState *elem = search_plan_tree(astate->appendplans[i], + table_oid, + pending_rescan); + + if (!elem) + continue; + if (result) + return NULL; /* multiple matches */ + result = elem; + } + break; + } + + /* + * Result and Limit can be descended through (these are safe + * because they always return their input's current row) + */ + case T_ResultState: + case T_LimitState: + result = search_plan_tree(node->lefttree, + table_oid, + pending_rescan); + break; + + /* + * SubqueryScan too, but it keeps the child in a different place + */ + case T_SubqueryScanState: + result = search_plan_tree(((SubqueryScanState *) node)->subplan, + table_oid, + pending_rescan); + break; + + default: + /* Otherwise, assume we can't descend through it */ + break; + } + + /* + * If we found a candidate at or below this node, then this node's + * chgParam indicates a pending rescan that will affect the candidate. + */ + if (result && node->chgParam != NULL) + *pending_rescan = true; + + return result; +} diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c new file mode 100644 index 0000000..bec249f --- /dev/null +++ b/src/backend/executor/execExpr.c @@ -0,0 +1,3965 @@ +/*------------------------------------------------------------------------- + * + * execExpr.c + * Expression evaluation infrastructure. + * + * During executor startup, we compile each expression tree (which has + * previously been processed by the parser and planner) into an ExprState, + * using ExecInitExpr() et al. This converts the tree into a flat array + * of ExprEvalSteps, which may be thought of as instructions in a program. + * At runtime, we'll execute steps, starting with the first, until we reach + * an EEOP_DONE opcode. + * + * This file contains the "compilation" logic. It is independent of the + * specific execution technology we use (switch statement, computed goto, + * JIT compilation, etc). + * + * See src/backend/executor/README for some background, specifically the + * "Expression Trees and ExprState nodes", "Expression Initialization", + * and "Expression Evaluation" sections. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execExpr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_type.h" +#include "executor/execExpr.h" +#include "executor/nodeSubplan.h" +#include "funcapi.h" +#include "jit/jit.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "nodes/subscripting.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/typcache.h" + + +typedef struct LastAttnumInfo +{ + AttrNumber last_inner; + AttrNumber last_outer; + AttrNumber last_scan; +} LastAttnumInfo; + +static void ExecReadyExpr(ExprState *state); +static void ExecInitExprRec(Expr *node, ExprState *state, + Datum *resv, bool *resnull); +static void ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, + Oid funcid, Oid inputcollid, + ExprState *state); +static void ExecInitExprSlots(ExprState *state, Node *node); +static void ExecPushExprSlots(ExprState *state, LastAttnumInfo *info); +static bool get_last_attnums_walker(Node *node, LastAttnumInfo *info); +static bool ExecComputeSlotInfo(ExprState *state, ExprEvalStep *op); +static void ExecInitWholeRowVar(ExprEvalStep *scratch, Var *variable, + ExprState *state); +static void ExecInitSubscriptingRef(ExprEvalStep *scratch, + SubscriptingRef *sbsref, + ExprState *state, + Datum *resv, bool *resnull); +static bool isAssignmentIndirectionExpr(Expr *expr); +static void ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, + ExprState *state, + Datum *resv, bool *resnull); +static void ExecBuildAggTransCall(ExprState *state, AggState *aggstate, + ExprEvalStep *scratch, + FunctionCallInfo fcinfo, AggStatePerTrans pertrans, + int transno, int setno, int setoff, bool ishash, + bool nullcheck); + + +/* + * ExecInitExpr: prepare an expression tree for execution + * + * This function builds and returns an ExprState implementing the given + * Expr node tree. The return ExprState can then be handed to ExecEvalExpr + * for execution. Because the Expr tree itself is read-only as far as + * ExecInitExpr and ExecEvalExpr are concerned, several different executions + * of the same plan tree can occur concurrently. (But note that an ExprState + * does mutate at runtime, so it can't be re-used concurrently.) + * + * This must be called in a memory context that will last as long as repeated + * executions of the expression are needed. Typically the context will be + * the same as the per-query context of the associated ExprContext. + * + * Any Aggref, WindowFunc, or SubPlan nodes found in the tree are added to + * the lists of such nodes held by the parent PlanState. + * + * Note: there is no ExecEndExpr function; we assume that any resource + * cleanup needed will be handled by just releasing the memory context + * in which the state tree is built. Functions that require additional + * cleanup work can register a shutdown callback in the ExprContext. + * + * 'node' is the root of the expression tree to compile. + * 'parent' is the PlanState node that owns the expression. 
+ * + * 'parent' may be NULL if we are preparing an expression that is not + * associated with a plan tree. (If so, it can't have aggs or subplans.) + * Such cases should usually come through ExecPrepareExpr, not directly here. + * + * Also, if 'node' is NULL, we just return NULL. This is convenient for some + * callers that may or may not have an expression that needs to be compiled. + * Note that a NULL ExprState pointer *cannot* be handed to ExecEvalExpr, + * although ExecQual and ExecCheck will accept one (and treat it as "true"). + */ +ExprState * +ExecInitExpr(Expr *node, PlanState *parent) +{ + ExprState *state; + ExprEvalStep scratch = {0}; + + /* Special case: NULL expression produces a NULL ExprState pointer */ + if (node == NULL) + return NULL; + + /* Initialize ExprState with empty step list */ + state = makeNode(ExprState); + state->expr = node; + state->parent = parent; + state->ext_params = NULL; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) node); + + /* Compile the expression proper */ + ExecInitExprRec(node, state, &state->resvalue, &state->resnull); + + /* Finally, append a DONE step */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * ExecInitExprWithParams: prepare a standalone expression tree for execution + * + * This is the same as ExecInitExpr, except that there is no parent PlanState, + * and instead we may have a ParamListInfo describing PARAM_EXTERN Params. + */ +ExprState * +ExecInitExprWithParams(Expr *node, ParamListInfo ext_params) +{ + ExprState *state; + ExprEvalStep scratch = {0}; + + /* Special case: NULL expression produces a NULL ExprState pointer */ + if (node == NULL) + return NULL; + + /* Initialize ExprState with empty step list */ + state = makeNode(ExprState); + state->expr = node; + state->parent = NULL; + state->ext_params = ext_params; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) node); + + /* Compile the expression proper */ + ExecInitExprRec(node, state, &state->resvalue, &state->resnull); + + /* Finally, append a DONE step */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * ExecInitQual: prepare a qual for execution by ExecQual + * + * Prepares for the evaluation of a conjunctive boolean expression (qual list + * with implicit AND semantics) that returns true if none of the + * subexpressions are false. + * + * We must return true if the list is empty. Since that's a very common case, + * we optimize it a bit further by translating to a NULL ExprState pointer + * rather than setting up an ExprState that computes constant TRUE. (Some + * especially hot-spot callers of ExecQual detect this and avoid calling + * ExecQual at all.) + * + * If any of the subexpressions yield NULL, then the result of the conjunction + * is false. This makes ExecQual primarily useful for evaluating WHERE + * clauses, since SQL specifies that tuples with null WHERE results do not + * get selected. 
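+ *
+ * As an illustrative sketch (names are examples only), a typical plan
+ * node compiles its qual once at startup and then tests it per input
+ * tuple:
+ *		planstate->qual = ExecInitQual(plan->qual, (PlanState *) planstate);
+ *		...
+ *		if (!ExecQual(planstate->qual, econtext))
+ *			continue;
+ * skipping tuples whose WHERE clause does not evaluate to true; see
+ * ExecScan() and the various node Init functions for real usage.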
+ */ +ExprState * +ExecInitQual(List *qual, PlanState *parent) +{ + ExprState *state; + ExprEvalStep scratch = {0}; + List *adjust_jumps = NIL; + ListCell *lc; + + /* short-circuit (here and in ExecQual) for empty restriction list */ + if (qual == NIL) + return NULL; + + Assert(IsA(qual, List)); + + state = makeNode(ExprState); + state->expr = (Expr *) qual; + state->parent = parent; + state->ext_params = NULL; + + /* mark expression as to be used with ExecQual() */ + state->flags = EEO_FLAG_IS_QUAL; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) qual); + + /* + * ExecQual() needs to return false for an expression returning NULL. That + * allows us to short-circuit the evaluation the first time a NULL is + * encountered. As qual evaluation is a hot-path this warrants using a + * special opcode for qual evaluation that's simpler than BOOL_AND (which + * has more complex NULL handling). + */ + scratch.opcode = EEOP_QUAL; + + /* + * We can use ExprState's resvalue/resnull as target for each qual expr. + */ + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + foreach(lc, qual) + { + Expr *node = (Expr *) lfirst(lc); + + /* first evaluate expression */ + ExecInitExprRec(node, state, &state->resvalue, &state->resnull); + + /* then emit EEOP_QUAL to detect if it's false (or null) */ + scratch.d.qualexpr.jumpdone = -1; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + /* + * At the end, we don't need to do anything more. The last qual expr must + * have yielded TRUE, and since its result is stored in the desired output + * location, we're done. + */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * ExecInitCheck: prepare a check constraint for execution by ExecCheck + * + * This is much like ExecInitQual/ExecQual, except that a null result from + * the conjunction is treated as TRUE. This behavior is appropriate for + * evaluating CHECK constraints, since SQL specifies that NULL constraint + * conditions are not failures. + * + * Note that like ExecInitQual, this expects input in implicit-AND format. + * Users of ExecCheck that have expressions in normal explicit-AND format + * can just apply ExecInitExpr to produce suitable input for ExecCheck. + */ +ExprState * +ExecInitCheck(List *qual, PlanState *parent) +{ + /* short-circuit (here and in ExecCheck) for empty restriction list */ + if (qual == NIL) + return NULL; + + Assert(IsA(qual, List)); + + /* + * Just convert the implicit-AND list to an explicit AND (if there's more + * than one entry), and compile normally. Unlike ExecQual, we can't + * short-circuit on NULL results, so the regular AND behavior is needed. + */ + return ExecInitExpr(make_ands_explicit(qual), parent); +} + +/* + * Call ExecInitExpr() on a list of expressions, return a list of ExprStates. 
+ */ +List * +ExecInitExprList(List *nodes, PlanState *parent) +{ + List *result = NIL; + ListCell *lc; + + foreach(lc, nodes) + { + Expr *e = lfirst(lc); + + result = lappend(result, ExecInitExpr(e, parent)); + } + + return result; +} + +/* + * ExecBuildProjectionInfo + * + * Build a ProjectionInfo node for evaluating the given tlist in the given + * econtext, and storing the result into the tuple slot. (Caller must have + * ensured that tuple slot has a descriptor matching the tlist!) + * + * inputDesc can be NULL, but if it is not, we check to see whether simple + * Vars in the tlist match the descriptor. It is important to provide + * inputDesc for relation-scan plan nodes, as a cross check that the relation + * hasn't been changed since the plan was made. At higher levels of a plan, + * there is no need to recheck. + * + * This is implemented by internally building an ExprState that performs the + * whole projection in one go. + * + * Caution: before PG v10, the targetList was a list of ExprStates; now it + * should be the planner-created targetlist, since we do the compilation here. + */ +ProjectionInfo * +ExecBuildProjectionInfo(List *targetList, + ExprContext *econtext, + TupleTableSlot *slot, + PlanState *parent, + TupleDesc inputDesc) +{ + ProjectionInfo *projInfo = makeNode(ProjectionInfo); + ExprState *state; + ExprEvalStep scratch = {0}; + ListCell *lc; + + projInfo->pi_exprContext = econtext; + /* We embed ExprState into ProjectionInfo instead of doing extra palloc */ + projInfo->pi_state.tag = T_ExprState; + state = &projInfo->pi_state; + state->expr = (Expr *) targetList; + state->parent = parent; + state->ext_params = NULL; + + state->resultslot = slot; + + /* Insert EEOP_*_FETCHSOME steps as needed */ + ExecInitExprSlots(state, (Node *) targetList); + + /* Now compile each tlist column */ + foreach(lc, targetList) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + Var *variable = NULL; + AttrNumber attnum = 0; + bool isSafeVar = false; + + /* + * If tlist expression is a safe non-system Var, use the fast-path + * ASSIGN_*_VAR opcodes. "Safe" means that we don't need to apply + * CheckVarSlotCompatibility() during plan startup. If a source slot + * was provided, we make the equivalent tests here; if a slot was not + * provided, we assume that no check is needed because we're dealing + * with a non-relation-scan-level expression. + */ + if (tle->expr != NULL && + IsA(tle->expr, Var) && + ((Var *) tle->expr)->varattno > 0) + { + /* Non-system Var, but how safe is it? */ + variable = (Var *) tle->expr; + attnum = variable->varattno; + + if (inputDesc == NULL) + isSafeVar = true; /* can't check, just assume OK */ + else if (attnum <= inputDesc->natts) + { + Form_pg_attribute attr = TupleDescAttr(inputDesc, attnum - 1); + + /* + * If user attribute is dropped or has a type mismatch, don't + * use ASSIGN_*_VAR. Instead let the normal expression + * machinery handle it (which'll possibly error out). 
+ */ + if (!attr->attisdropped && variable->vartype == attr->atttypid) + { + isSafeVar = true; + } + } + } + + if (isSafeVar) + { + /* Fast-path: just generate an EEOP_ASSIGN_*_VAR step */ + switch (variable->varno) + { + case INNER_VAR: + /* get the tuple from the inner node */ + scratch.opcode = EEOP_ASSIGN_INNER_VAR; + break; + + case OUTER_VAR: + /* get the tuple from the outer node */ + scratch.opcode = EEOP_ASSIGN_OUTER_VAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + /* get the tuple from the relation being scanned */ + scratch.opcode = EEOP_ASSIGN_SCAN_VAR; + break; + } + + scratch.d.assign_var.attnum = attnum - 1; + scratch.d.assign_var.resultnum = tle->resno - 1; + ExprEvalPushStep(state, &scratch); + } + else + { + /* + * Otherwise, compile the column expression normally. + * + * We can't tell the expression to evaluate directly into the + * result slot, as the result slot (and the exprstate for that + * matter) can change between executions. We instead evaluate + * into the ExprState's resvalue/resnull and then move. + */ + ExecInitExprRec(tle->expr, state, + &state->resvalue, &state->resnull); + + /* + * Column might be referenced multiple times in upper nodes, so + * force value to R/O - but only if it could be an expanded datum. + */ + if (get_typlen(exprType((Node *) tle->expr)) == -1) + scratch.opcode = EEOP_ASSIGN_TMP_MAKE_RO; + else + scratch.opcode = EEOP_ASSIGN_TMP; + scratch.d.assign_tmp.resultnum = tle->resno - 1; + ExprEvalPushStep(state, &scratch); + } + } + + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return projInfo; +} + +/* + * ExecBuildUpdateProjection + * + * Build a ProjectionInfo node for constructing a new tuple during UPDATE. + * The projection will be executed in the given econtext and the result will + * be stored into the given tuple slot. (Caller must have ensured that tuple + * slot has a descriptor matching the target rel!) + * + * When evalTargetList is false, targetList contains the UPDATE ... SET + * expressions that have already been computed by a subplan node; the values + * from this tlist are assumed to be available in the "outer" tuple slot. + * When evalTargetList is true, targetList contains the UPDATE ... SET + * expressions that must be computed (which could contain references to + * the outer, inner, or scan tuple slots). + * + * In either case, targetColnos contains a list of the target column numbers + * corresponding to the non-resjunk entries of targetList. The tlist values + * are assigned into these columns of the result tuple slot. Target columns + * not listed in targetColnos are filled from the UPDATE's old tuple, which + * is assumed to be available in the "scan" tuple slot. + * + * targetList can also contain resjunk columns. These must be evaluated + * if evalTargetList is true, but their values are discarded. + * + * relDesc must describe the relation we intend to update. + * + * This is basically a specialized variant of ExecBuildProjectionInfo. + * However, it also performs sanity checks equivalent to ExecCheckPlanOutput. + * Since we never make a normal tlist equivalent to the whole + * tuple-to-be-assigned, there is no convenient way to apply + * ExecCheckPlanOutput, so we must do our safety checks here. 
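+ *
+ * As an illustrative example: for "UPDATE t SET b = ..., d = ..." on a
+ * table with columns (a, b, c, d), targetColnos would be the integer
+ * list (2, 4); the new values of b and d come from targetList, while a
+ * and c are copied from the old tuple in the "scan" slot.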
+ */ +ProjectionInfo * +ExecBuildUpdateProjection(List *targetList, + bool evalTargetList, + List *targetColnos, + TupleDesc relDesc, + ExprContext *econtext, + TupleTableSlot *slot, + PlanState *parent) +{ + ProjectionInfo *projInfo = makeNode(ProjectionInfo); + ExprState *state; + int nAssignableCols; + bool sawJunk; + Bitmapset *assignedCols; + LastAttnumInfo deform = {0, 0, 0}; + ExprEvalStep scratch = {0}; + int outerattnum; + ListCell *lc, + *lc2; + + projInfo->pi_exprContext = econtext; + /* We embed ExprState into ProjectionInfo instead of doing extra palloc */ + projInfo->pi_state.tag = T_ExprState; + state = &projInfo->pi_state; + if (evalTargetList) + state->expr = (Expr *) targetList; + else + state->expr = NULL; /* not used */ + state->parent = parent; + state->ext_params = NULL; + + state->resultslot = slot; + + /* + * Examine the targetList to see how many non-junk columns there are, and + * to verify that the non-junk columns come before the junk ones. + */ + nAssignableCols = 0; + sawJunk = false; + foreach(lc, targetList) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + + if (tle->resjunk) + sawJunk = true; + else + { + if (sawJunk) + elog(ERROR, "subplan target list is out of order"); + nAssignableCols++; + } + } + + /* We should have one targetColnos entry per non-junk column */ + if (nAssignableCols != list_length(targetColnos)) + elog(ERROR, "targetColnos does not match subplan target list"); + + /* + * Build a bitmapset of the columns in targetColnos. (We could just use + * list_member_int() tests, but that risks O(N^2) behavior with many + * columns.) + */ + assignedCols = NULL; + foreach(lc, targetColnos) + { + AttrNumber targetattnum = lfirst_int(lc); + + assignedCols = bms_add_member(assignedCols, targetattnum); + } + + /* + * We need to insert EEOP_*_FETCHSOME steps to ensure the input tuples are + * sufficiently deconstructed. The scan tuple must be deconstructed at + * least as far as the last old column we need. + */ + for (int attnum = relDesc->natts; attnum > 0; attnum--) + { + Form_pg_attribute attr = TupleDescAttr(relDesc, attnum - 1); + + if (attr->attisdropped) + continue; + if (bms_is_member(attnum, assignedCols)) + continue; + deform.last_scan = attnum; + break; + } + + /* + * If we're actually evaluating the tlist, incorporate its input + * requirements too; otherwise, we'll just need to fetch the appropriate + * number of columns of the "outer" tuple. + */ + if (evalTargetList) + get_last_attnums_walker((Node *) targetList, &deform); + else + deform.last_outer = nAssignableCols; + + ExecPushExprSlots(state, &deform); + + /* + * Now generate code to evaluate the tlist's assignable expressions or + * fetch them from the outer tuple, incidentally validating that they'll + * be of the right data type. The checks above ensure that the forboth() + * will iterate over exactly the non-junk columns. + */ + outerattnum = 0; + forboth(lc, targetList, lc2, targetColnos) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + AttrNumber targetattnum = lfirst_int(lc2); + Form_pg_attribute attr; + + Assert(!tle->resjunk); + + /* + * Apply sanity checks comparable to ExecCheckPlanOutput(). 
+ */ + if (targetattnum <= 0 || targetattnum > relDesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query has too many columns."))); + attr = TupleDescAttr(relDesc, targetattnum - 1); + + if (attr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query provides a value for a dropped column at ordinal position %d.", + targetattnum))); + if (exprType((Node *) tle->expr) != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Table has type %s at ordinal position %d, but query expects %s.", + format_type_be(attr->atttypid), + targetattnum, + format_type_be(exprType((Node *) tle->expr))))); + + /* OK, generate code to perform the assignment. */ + if (evalTargetList) + { + /* + * We must evaluate the TLE's expression and assign it. We do not + * bother jumping through hoops for "safe" Vars like + * ExecBuildProjectionInfo does; this is a relatively less-used + * path and it doesn't seem worth expending code for that. + */ + ExecInitExprRec(tle->expr, state, + &state->resvalue, &state->resnull); + /* Needn't worry about read-only-ness here, either. */ + scratch.opcode = EEOP_ASSIGN_TMP; + scratch.d.assign_tmp.resultnum = targetattnum - 1; + ExprEvalPushStep(state, &scratch); + } + else + { + /* Just assign from the outer tuple. */ + scratch.opcode = EEOP_ASSIGN_OUTER_VAR; + scratch.d.assign_var.attnum = outerattnum; + scratch.d.assign_var.resultnum = targetattnum - 1; + ExprEvalPushStep(state, &scratch); + } + outerattnum++; + } + + /* + * If we're evaluating the tlist, must evaluate any resjunk columns too. + * (This matters for things like MULTIEXPR_SUBLINK SubPlans.) + */ + if (evalTargetList) + { + for_each_cell(lc, targetList, lc) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + + Assert(tle->resjunk); + ExecInitExprRec(tle->expr, state, + &state->resvalue, &state->resnull); + } + } + + /* + * Now generate code to copy over any old columns that were not assigned + * to, and to ensure that dropped columns are set to NULL. + */ + for (int attnum = 1; attnum <= relDesc->natts; attnum++) + { + Form_pg_attribute attr = TupleDescAttr(relDesc, attnum - 1); + + if (attr->attisdropped) + { + /* Put a null into the ExprState's resvalue/resnull ... */ + scratch.opcode = EEOP_CONST; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + scratch.d.constval.value = (Datum) 0; + scratch.d.constval.isnull = true; + ExprEvalPushStep(state, &scratch); + /* ... then assign it to the result slot */ + scratch.opcode = EEOP_ASSIGN_TMP; + scratch.d.assign_tmp.resultnum = attnum - 1; + ExprEvalPushStep(state, &scratch); + } + else if (!bms_is_member(attnum, assignedCols)) + { + /* Certainly the right type, so needn't check */ + scratch.opcode = EEOP_ASSIGN_SCAN_VAR; + scratch.d.assign_var.attnum = attnum - 1; + scratch.d.assign_var.resultnum = attnum - 1; + ExprEvalPushStep(state, &scratch); + } + } + + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return projInfo; +} + +/* + * ExecPrepareExpr --- initialize for expression execution outside a normal + * Plan tree context. + * + * This differs from ExecInitExpr in that we don't assume the caller is + * already running in the EState's per-query context. 
Also, we run the + * passed expression tree through expression_planner() to prepare it for + * execution. (In ordinary Plan trees the regular planning process will have + * made the appropriate transformations on expressions, but for standalone + * expressions this won't have happened.) + */ +ExprState * +ExecPrepareExpr(Expr *node, EState *estate) +{ + ExprState *result; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + node = expression_planner(node); + + result = ExecInitExpr(node, NULL); + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * ExecPrepareQual --- initialize for qual execution outside a normal + * Plan tree context. + * + * This differs from ExecInitQual in that we don't assume the caller is + * already running in the EState's per-query context. Also, we run the + * passed expression tree through expression_planner() to prepare it for + * execution. (In ordinary Plan trees the regular planning process will have + * made the appropriate transformations on expressions, but for standalone + * expressions this won't have happened.) + */ +ExprState * +ExecPrepareQual(List *qual, EState *estate) +{ + ExprState *result; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + qual = (List *) expression_planner((Expr *) qual); + + result = ExecInitQual(qual, NULL); + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * ExecPrepareCheck -- initialize check constraint for execution outside a + * normal Plan tree context. + * + * See ExecPrepareExpr() and ExecInitCheck() for details. + */ +ExprState * +ExecPrepareCheck(List *qual, EState *estate) +{ + ExprState *result; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + qual = (List *) expression_planner((Expr *) qual); + + result = ExecInitCheck(qual, NULL); + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * Call ExecPrepareExpr() on each member of a list of Exprs, and return + * a list of ExprStates. + * + * See ExecPrepareExpr() for details. + */ +List * +ExecPrepareExprList(List *nodes, EState *estate) +{ + List *result = NIL; + MemoryContext oldcontext; + ListCell *lc; + + /* Ensure that the list cell nodes are in the right context too */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + foreach(lc, nodes) + { + Expr *e = (Expr *) lfirst(lc); + + result = lappend(result, ExecPrepareExpr(e, estate)); + } + + MemoryContextSwitchTo(oldcontext); + + return result; +} + +/* + * ExecCheck - evaluate a check constraint + * + * For check constraints, a null result is taken as TRUE, ie the constraint + * passes. + * + * The check constraint may have been prepared with ExecInitCheck + * (possibly via ExecPrepareCheck) if the caller had it in implicit-AND + * format, but a regular boolean expression prepared with ExecInitExpr or + * ExecPrepareExpr works too. + */ +bool +ExecCheck(ExprState *state, ExprContext *econtext) +{ + Datum ret; + bool isnull; + + /* short-circuit (here and in ExecInitCheck) for empty restriction list */ + if (state == NULL) + return true; + + /* verify that expression was not compiled using ExecInitQual */ + Assert(!(state->flags & EEO_FLAG_IS_QUAL)); + + ret = ExecEvalExprSwitchContext(state, econtext, &isnull); + + if (isnull) + return true; + + return DatumGetBool(ret); +} + +/* + * Prepare a compiled expression for execution. This has to be called for + * every ExprState before it can be executed. 
+ * + * NB: While this currently only calls ExecReadyInterpretedExpr(), + * this will likely get extended to further expression evaluation methods. + * Therefore this should be used instead of directly calling + * ExecReadyInterpretedExpr(). + */ +static void +ExecReadyExpr(ExprState *state) +{ + if (jit_compile_expr(state)) + return; + + ExecReadyInterpretedExpr(state); +} + +/* + * Append the steps necessary for the evaluation of node to ExprState->steps, + * possibly recursing into sub-expressions of node. + * + * node - expression to evaluate + * state - ExprState to whose ->steps to append the necessary operations + * resv / resnull - where to store the result of the node into + */ +static void +ExecInitExprRec(Expr *node, ExprState *state, + Datum *resv, bool *resnull) +{ + ExprEvalStep scratch = {0}; + + /* Guard against stack overflow due to overly complex expressions */ + check_stack_depth(); + + /* Step's output location is always what the caller gave us */ + Assert(resv != NULL && resnull != NULL); + scratch.resvalue = resv; + scratch.resnull = resnull; + + /* cases should be ordered as they are in enum NodeTag */ + switch (nodeTag(node)) + { + case T_Var: + { + Var *variable = (Var *) node; + + if (variable->varattno == InvalidAttrNumber) + { + /* whole-row Var */ + ExecInitWholeRowVar(&scratch, variable, state); + } + else if (variable->varattno <= 0) + { + /* system column */ + scratch.d.var.attnum = variable->varattno; + scratch.d.var.vartype = variable->vartype; + switch (variable->varno) + { + case INNER_VAR: + scratch.opcode = EEOP_INNER_SYSVAR; + break; + case OUTER_VAR: + scratch.opcode = EEOP_OUTER_SYSVAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + scratch.opcode = EEOP_SCAN_SYSVAR; + break; + } + } + else + { + /* regular user column */ + scratch.d.var.attnum = variable->varattno - 1; + scratch.d.var.vartype = variable->vartype; + switch (variable->varno) + { + case INNER_VAR: + scratch.opcode = EEOP_INNER_VAR; + break; + case OUTER_VAR: + scratch.opcode = EEOP_OUTER_VAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + scratch.opcode = EEOP_SCAN_VAR; + break; + } + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_Const: + { + Const *con = (Const *) node; + + scratch.opcode = EEOP_CONST; + scratch.d.constval.value = con->constvalue; + scratch.d.constval.isnull = con->constisnull; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_Param: + { + Param *param = (Param *) node; + ParamListInfo params; + + switch (param->paramkind) + { + case PARAM_EXEC: + scratch.opcode = EEOP_PARAM_EXEC; + scratch.d.param.paramid = param->paramid; + scratch.d.param.paramtype = param->paramtype; + ExprEvalPushStep(state, &scratch); + break; + case PARAM_EXTERN: + + /* + * If we have a relevant ParamCompileHook, use it; + * otherwise compile a standard EEOP_PARAM_EXTERN + * step. ext_params, if supplied, takes precedence + * over info from the parent node's EState (if any). 
+ */ + if (state->ext_params) + params = state->ext_params; + else if (state->parent && + state->parent->state) + params = state->parent->state->es_param_list_info; + else + params = NULL; + if (params && params->paramCompile) + { + params->paramCompile(params, param, state, + resv, resnull); + } + else + { + scratch.opcode = EEOP_PARAM_EXTERN; + scratch.d.param.paramid = param->paramid; + scratch.d.param.paramtype = param->paramtype; + ExprEvalPushStep(state, &scratch); + } + break; + default: + elog(ERROR, "unrecognized paramkind: %d", + (int) param->paramkind); + break; + } + break; + } + + case T_Aggref: + { + Aggref *aggref = (Aggref *) node; + + scratch.opcode = EEOP_AGGREF; + scratch.d.aggref.aggno = aggref->aggno; + + if (state->parent && IsA(state->parent, AggState)) + { + AggState *aggstate = (AggState *) state->parent; + + aggstate->aggs = lappend(aggstate->aggs, aggref); + } + else + { + /* planner messed up */ + elog(ERROR, "Aggref found in non-Agg plan node"); + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_GroupingFunc: + { + GroupingFunc *grp_node = (GroupingFunc *) node; + Agg *agg; + + if (!state->parent || !IsA(state->parent, AggState) || + !IsA(state->parent->plan, Agg)) + elog(ERROR, "GroupingFunc found in non-Agg plan node"); + + scratch.opcode = EEOP_GROUPING_FUNC; + + agg = (Agg *) (state->parent->plan); + + if (agg->groupingSets) + scratch.d.grouping_func.clauses = grp_node->cols; + else + scratch.d.grouping_func.clauses = NIL; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_WindowFunc: + { + WindowFunc *wfunc = (WindowFunc *) node; + WindowFuncExprState *wfstate = makeNode(WindowFuncExprState); + + wfstate->wfunc = wfunc; + + if (state->parent && IsA(state->parent, WindowAggState)) + { + WindowAggState *winstate = (WindowAggState *) state->parent; + int nfuncs; + + winstate->funcs = lappend(winstate->funcs, wfstate); + nfuncs = ++winstate->numfuncs; + if (wfunc->winagg) + winstate->numaggs++; + + /* for now initialize agg using old style expressions */ + wfstate->args = ExecInitExprList(wfunc->args, + state->parent); + wfstate->aggfilter = ExecInitExpr(wfunc->aggfilter, + state->parent); + + /* + * Complain if the windowfunc's arguments contain any + * windowfuncs; nested window functions are semantically + * nonsensical. (This should have been caught earlier, + * but we defend against it here anyway.) 
+ */ + if (nfuncs != winstate->numfuncs) + ereport(ERROR, + (errcode(ERRCODE_WINDOWING_ERROR), + errmsg("window function calls cannot be nested"))); + } + else + { + /* planner messed up */ + elog(ERROR, "WindowFunc found in non-WindowAgg plan node"); + } + + scratch.opcode = EEOP_WINDOW_FUNC; + scratch.d.window_func.wfstate = wfstate; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_SubscriptingRef: + { + SubscriptingRef *sbsref = (SubscriptingRef *) node; + + ExecInitSubscriptingRef(&scratch, sbsref, state, resv, resnull); + break; + } + + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) node; + + ExecInitFunc(&scratch, node, + func->args, func->funcid, func->inputcollid, + state); + ExprEvalPushStep(state, &scratch); + break; + } + + case T_OpExpr: + { + OpExpr *op = (OpExpr *) node; + + ExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + state); + ExprEvalPushStep(state, &scratch); + break; + } + + case T_DistinctExpr: + { + DistinctExpr *op = (DistinctExpr *) node; + + ExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + state); + + /* + * Change opcode of call instruction to EEOP_DISTINCT. + * + * XXX: historically we've not called the function usage + * pgstat infrastructure - that seems inconsistent given that + * we do so for normal function *and* operator evaluation. If + * we decided to do that here, we'd probably want separate + * opcodes for FUSAGE or not. + */ + scratch.opcode = EEOP_DISTINCT; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_NullIfExpr: + { + NullIfExpr *op = (NullIfExpr *) node; + + ExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + state); + + /* + * Change opcode of call instruction to EEOP_NULLIF. + * + * XXX: historically we've not called the function usage + * pgstat infrastructure - that seems inconsistent given that + * we do so for normal function *and* operator evaluation. If + * we decided to do that here, we'd probably want separate + * opcodes for FUSAGE or not. + */ + scratch.opcode = EEOP_NULLIF; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ScalarArrayOpExpr: + { + ScalarArrayOpExpr *opexpr = (ScalarArrayOpExpr *) node; + Expr *scalararg; + Expr *arrayarg; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + FmgrInfo *hash_finfo; + FunctionCallInfo hash_fcinfo; + + Assert(list_length(opexpr->args) == 2); + scalararg = (Expr *) linitial(opexpr->args); + arrayarg = (Expr *) lsecond(opexpr->args); + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(opexpr->opfuncid, + GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(opexpr->opfuncid)); + InvokeFunctionExecuteHook(opexpr->opfuncid); + + if (OidIsValid(opexpr->hashfuncid)) + { + aclresult = pg_proc_aclcheck(opexpr->hashfuncid, + GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(opexpr->hashfuncid)); + InvokeFunctionExecuteHook(opexpr->hashfuncid); + } + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(opexpr->opfuncid, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + opexpr->inputcollid, NULL, NULL); + + /* + * If hashfuncid is set, we create a EEOP_HASHED_SCALARARRAYOP + * step instead of a EEOP_SCALARARRAYOP. 
This provides much + * faster lookup performance than the normal linear search + * when the number of items in the array is anything but very + * small. + */ + if (OidIsValid(opexpr->hashfuncid)) + { + hash_finfo = palloc0(sizeof(FmgrInfo)); + hash_fcinfo = palloc0(SizeForFunctionCallInfo(1)); + fmgr_info(opexpr->hashfuncid, hash_finfo); + fmgr_info_set_expr((Node *) node, hash_finfo); + InitFunctionCallInfoData(*hash_fcinfo, hash_finfo, + 1, opexpr->inputcollid, NULL, + NULL); + + scratch.d.hashedscalararrayop.hash_finfo = hash_finfo; + scratch.d.hashedscalararrayop.hash_fcinfo_data = hash_fcinfo; + scratch.d.hashedscalararrayop.hash_fn_addr = hash_finfo->fn_addr; + + /* Evaluate scalar directly into left function argument */ + ExecInitExprRec(scalararg, state, + &fcinfo->args[0].value, &fcinfo->args[0].isnull); + + /* + * Evaluate array argument into our return value. There's + * no danger in that, because the return value is + * guaranteed to be overwritten by + * EEOP_HASHED_SCALARARRAYOP, and will not be passed to + * any other expression. + */ + ExecInitExprRec(arrayarg, state, resv, resnull); + + /* And perform the operation */ + scratch.opcode = EEOP_HASHED_SCALARARRAYOP; + scratch.d.hashedscalararrayop.finfo = finfo; + scratch.d.hashedscalararrayop.fcinfo_data = fcinfo; + scratch.d.hashedscalararrayop.fn_addr = finfo->fn_addr; + + scratch.d.hashedscalararrayop.hash_finfo = hash_finfo; + scratch.d.hashedscalararrayop.hash_fcinfo_data = hash_fcinfo; + scratch.d.hashedscalararrayop.hash_fn_addr = hash_finfo->fn_addr; + + ExprEvalPushStep(state, &scratch); + } + else + { + /* Evaluate scalar directly into left function argument */ + ExecInitExprRec(scalararg, state, + &fcinfo->args[0].value, + &fcinfo->args[0].isnull); + + /* + * Evaluate array argument into our return value. There's + * no danger in that, because the return value is + * guaranteed to be overwritten by EEOP_SCALARARRAYOP, and + * will not be passed to any other expression. + */ + ExecInitExprRec(arrayarg, state, resv, resnull); + + /* And perform the operation */ + scratch.opcode = EEOP_SCALARARRAYOP; + scratch.d.scalararrayop.element_type = InvalidOid; + scratch.d.scalararrayop.useOr = opexpr->useOr; + scratch.d.scalararrayop.finfo = finfo; + scratch.d.scalararrayop.fcinfo_data = fcinfo; + scratch.d.scalararrayop.fn_addr = finfo->fn_addr; + ExprEvalPushStep(state, &scratch); + } + break; + } + + case T_BoolExpr: + { + BoolExpr *boolexpr = (BoolExpr *) node; + int nargs = list_length(boolexpr->args); + List *adjust_jumps = NIL; + int off; + ListCell *lc; + + /* allocate scratch memory used by all steps of AND/OR */ + if (boolexpr->boolop != NOT_EXPR) + scratch.d.boolexpr.anynull = (bool *) palloc(sizeof(bool)); + + /* + * For each argument evaluate the argument itself, then + * perform the bool operation's appropriate handling. + * + * We can evaluate each argument into our result area, since + * the short-circuiting logic means we only need to remember + * previous NULL values. + * + * AND/OR is split into separate STEP_FIRST (one) / STEP (zero + * or more) / STEP_LAST (one) steps, as each of those has to + * perform different work. The FIRST/LAST split is valid + * because AND/OR have at least two arguments. 
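+ *
+ * For instance, a three-argument AND compiles roughly to
+ *		evaluate arg1; EEOP_BOOL_AND_STEP_FIRST
+ *		evaluate arg2; EEOP_BOOL_AND_STEP
+ *		evaluate arg3; EEOP_BOOL_AND_STEP_LAST
+ * where each step can jump straight to the end of the AND once a FALSE
+ * argument has been seen.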
+ */ + off = 0; + foreach(lc, boolexpr->args) + { + Expr *arg = (Expr *) lfirst(lc); + + /* Evaluate argument into our output variable */ + ExecInitExprRec(arg, state, resv, resnull); + + /* Perform the appropriate step type */ + switch (boolexpr->boolop) + { + case AND_EXPR: + Assert(nargs >= 2); + + if (off == 0) + scratch.opcode = EEOP_BOOL_AND_STEP_FIRST; + else if (off + 1 == nargs) + scratch.opcode = EEOP_BOOL_AND_STEP_LAST; + else + scratch.opcode = EEOP_BOOL_AND_STEP; + break; + case OR_EXPR: + Assert(nargs >= 2); + + if (off == 0) + scratch.opcode = EEOP_BOOL_OR_STEP_FIRST; + else if (off + 1 == nargs) + scratch.opcode = EEOP_BOOL_OR_STEP_LAST; + else + scratch.opcode = EEOP_BOOL_OR_STEP; + break; + case NOT_EXPR: + Assert(nargs == 1); + + scratch.opcode = EEOP_BOOL_NOT_STEP; + break; + default: + elog(ERROR, "unrecognized boolop: %d", + (int) boolexpr->boolop); + break; + } + + scratch.d.boolexpr.jumpdone = -1; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + off++; + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->d.boolexpr.jumpdone == -1); + as->d.boolexpr.jumpdone = state->steps_len; + } + + break; + } + + case T_SubPlan: + { + SubPlan *subplan = (SubPlan *) node; + SubPlanState *sstate; + + if (!state->parent) + elog(ERROR, "SubPlan found with no parent plan"); + + sstate = ExecInitSubPlan(subplan, state->parent); + + /* add SubPlanState nodes to state->parent->subPlan */ + state->parent->subPlan = lappend(state->parent->subPlan, + sstate); + + scratch.opcode = EEOP_SUBPLAN; + scratch.d.subplan.sstate = sstate; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_FieldSelect: + { + FieldSelect *fselect = (FieldSelect *) node; + + /* evaluate row/record argument into result area */ + ExecInitExprRec(fselect->arg, state, resv, resnull); + + /* and extract field */ + scratch.opcode = EEOP_FIELDSELECT; + scratch.d.fieldselect.fieldnum = fselect->fieldnum; + scratch.d.fieldselect.resulttype = fselect->resulttype; + scratch.d.fieldselect.rowcache.cacheptr = NULL; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_FieldStore: + { + FieldStore *fstore = (FieldStore *) node; + TupleDesc tupDesc; + ExprEvalRowtypeCache *rowcachep; + Datum *values; + bool *nulls; + int ncolumns; + ListCell *l1, + *l2; + + /* find out the number of columns in the composite type */ + tupDesc = lookup_rowtype_tupdesc(fstore->resulttype, -1); + ncolumns = tupDesc->natts; + DecrTupleDescRefCount(tupDesc); + + /* create workspace for column values */ + values = (Datum *) palloc(sizeof(Datum) * ncolumns); + nulls = (bool *) palloc(sizeof(bool) * ncolumns); + + /* create shared composite-type-lookup cache struct */ + rowcachep = palloc(sizeof(ExprEvalRowtypeCache)); + rowcachep->cacheptr = NULL; + + /* emit code to evaluate the composite input value */ + ExecInitExprRec(fstore->arg, state, resv, resnull); + + /* next, deform the input tuple into our workspace */ + scratch.opcode = EEOP_FIELDSTORE_DEFORM; + scratch.d.fieldstore.fstore = fstore; + scratch.d.fieldstore.rowcache = rowcachep; + scratch.d.fieldstore.values = values; + scratch.d.fieldstore.nulls = nulls; + scratch.d.fieldstore.ncolumns = ncolumns; + ExprEvalPushStep(state, &scratch); + + /* evaluate new field values, store in workspace columns */ + forboth(l1, fstore->newvals, l2, fstore->fieldnums) + { + Expr *e = (Expr *) lfirst(l1); + AttrNumber fieldnum = lfirst_int(l2); + Datum 
*save_innermost_caseval; + bool *save_innermost_casenull; + + if (fieldnum <= 0 || fieldnum > ncolumns) + elog(ERROR, "field number %d is out of range in FieldStore", + fieldnum); + + /* + * Use the CaseTestExpr mechanism to pass down the old + * value of the field being replaced; this is needed in + * case the newval is itself a FieldStore or + * SubscriptingRef that has to obtain and modify the old + * value. It's safe to reuse the CASE mechanism because + * there cannot be a CASE between here and where the value + * would be needed, and a field assignment can't be within + * a CASE either. (So saving and restoring + * innermost_caseval is just paranoia, but let's do it + * anyway.) + * + * Another non-obvious point is that it's safe to use the + * field's values[]/nulls[] entries as both the caseval + * source and the result address for this subexpression. + * That's okay only because (1) both FieldStore and + * SubscriptingRef evaluate their arg or refexpr inputs + * first, and (2) any such CaseTestExpr is directly the + * arg or refexpr input. So any read of the caseval will + * occur before there's a chance to overwrite it. Also, + * if multiple entries in the newvals/fieldnums lists + * target the same field, they'll effectively be applied + * left-to-right which is what we want. + */ + save_innermost_caseval = state->innermost_caseval; + save_innermost_casenull = state->innermost_casenull; + state->innermost_caseval = &values[fieldnum - 1]; + state->innermost_casenull = &nulls[fieldnum - 1]; + + ExecInitExprRec(e, state, + &values[fieldnum - 1], + &nulls[fieldnum - 1]); + + state->innermost_caseval = save_innermost_caseval; + state->innermost_casenull = save_innermost_casenull; + } + + /* finally, form result tuple */ + scratch.opcode = EEOP_FIELDSTORE_FORM; + scratch.d.fieldstore.fstore = fstore; + scratch.d.fieldstore.rowcache = rowcachep; + scratch.d.fieldstore.values = values; + scratch.d.fieldstore.nulls = nulls; + scratch.d.fieldstore.ncolumns = ncolumns; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_RelabelType: + { + /* relabel doesn't need to do anything at runtime */ + RelabelType *relabel = (RelabelType *) node; + + ExecInitExprRec(relabel->arg, state, resv, resnull); + break; + } + + case T_CoerceViaIO: + { + CoerceViaIO *iocoerce = (CoerceViaIO *) node; + Oid iofunc; + bool typisvarlena; + Oid typioparam; + FunctionCallInfo fcinfo_in; + + /* evaluate argument into step's result area */ + ExecInitExprRec(iocoerce->arg, state, resv, resnull); + + /* + * Prepare both output and input function calls, to be + * evaluated inside a single evaluation step for speed - this + * can be a very common operation. + * + * We don't check permissions here as a type's input/output + * function are assumed to be executable by everyone. 
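+ *
+ * Conceptually the coercion amounts to (a sketch only; the step uses
+ * the FunctionCallInfos prepared below rather than these convenience
+ * wrappers)
+ *		str = OutputFunctionCall(finfo_out, value);
+ *		result = InputFunctionCall(finfo_in, str, typioparam, -1);
+ * performed as a single expression step.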
+ */ + scratch.opcode = EEOP_IOCOERCE; + + /* lookup the source type's output function */ + scratch.d.iocoerce.finfo_out = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.fcinfo_data_out = palloc0(SizeForFunctionCallInfo(1)); + + getTypeOutputInfo(exprType((Node *) iocoerce->arg), + &iofunc, &typisvarlena); + fmgr_info(iofunc, scratch.d.iocoerce.finfo_out); + fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_out); + InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_out, + scratch.d.iocoerce.finfo_out, + 1, InvalidOid, NULL, NULL); + + /* lookup the result type's input function */ + scratch.d.iocoerce.finfo_in = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.fcinfo_data_in = palloc0(SizeForFunctionCallInfo(3)); + + getTypeInputInfo(iocoerce->resulttype, + &iofunc, &typioparam); + fmgr_info(iofunc, scratch.d.iocoerce.finfo_in); + fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_in); + InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_in, + scratch.d.iocoerce.finfo_in, + 3, InvalidOid, NULL, NULL); + + /* + * We can preload the second and third arguments for the input + * function, since they're constants. + */ + fcinfo_in = scratch.d.iocoerce.fcinfo_data_in; + fcinfo_in->args[1].value = ObjectIdGetDatum(typioparam); + fcinfo_in->args[1].isnull = false; + fcinfo_in->args[2].value = Int32GetDatum(-1); + fcinfo_in->args[2].isnull = false; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ArrayCoerceExpr: + { + ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; + Oid resultelemtype; + ExprState *elemstate; + + /* evaluate argument into step's result area */ + ExecInitExprRec(acoerce->arg, state, resv, resnull); + + resultelemtype = get_element_type(acoerce->resulttype); + if (!OidIsValid(resultelemtype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("target type is not an array"))); + + /* + * Construct a sub-expression for the per-element expression; + * but don't ready it until after we check it for triviality. + * We assume it hasn't any Var references, but does have a + * CaseTestExpr representing the source array element values. 
+ */ + elemstate = makeNode(ExprState); + elemstate->expr = acoerce->elemexpr; + elemstate->parent = state->parent; + elemstate->ext_params = state->ext_params; + + elemstate->innermost_caseval = (Datum *) palloc(sizeof(Datum)); + elemstate->innermost_casenull = (bool *) palloc(sizeof(bool)); + + ExecInitExprRec(acoerce->elemexpr, elemstate, + &elemstate->resvalue, &elemstate->resnull); + + if (elemstate->steps_len == 1 && + elemstate->steps[0].opcode == EEOP_CASE_TESTVAL) + { + /* Trivial, so we need no per-element work at runtime */ + elemstate = NULL; + } + else + { + /* Not trivial, so append a DONE step */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(elemstate, &scratch); + /* and ready the subexpression */ + ExecReadyExpr(elemstate); + } + + scratch.opcode = EEOP_ARRAYCOERCE; + scratch.d.arraycoerce.elemexprstate = elemstate; + scratch.d.arraycoerce.resultelemtype = resultelemtype; + + if (elemstate) + { + /* Set up workspace for array_map */ + scratch.d.arraycoerce.amstate = + (ArrayMapState *) palloc0(sizeof(ArrayMapState)); + } + else + { + /* Don't need workspace if there's no subexpression */ + scratch.d.arraycoerce.amstate = NULL; + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ConvertRowtypeExpr: + { + ConvertRowtypeExpr *convert = (ConvertRowtypeExpr *) node; + ExprEvalRowtypeCache *rowcachep; + + /* cache structs must be out-of-line for space reasons */ + rowcachep = palloc(2 * sizeof(ExprEvalRowtypeCache)); + rowcachep[0].cacheptr = NULL; + rowcachep[1].cacheptr = NULL; + + /* evaluate argument into step's result area */ + ExecInitExprRec(convert->arg, state, resv, resnull); + + /* and push conversion step */ + scratch.opcode = EEOP_CONVERT_ROWTYPE; + scratch.d.convert_rowtype.inputtype = + exprType((Node *) convert->arg); + scratch.d.convert_rowtype.outputtype = convert->resulttype; + scratch.d.convert_rowtype.incache = &rowcachep[0]; + scratch.d.convert_rowtype.outcache = &rowcachep[1]; + scratch.d.convert_rowtype.map = NULL; + + ExprEvalPushStep(state, &scratch); + break; + } + + /* note that CaseWhen expressions are handled within this block */ + case T_CaseExpr: + { + CaseExpr *caseExpr = (CaseExpr *) node; + List *adjust_jumps = NIL; + Datum *caseval = NULL; + bool *casenull = NULL; + ListCell *lc; + + /* + * If there's a test expression, we have to evaluate it and + * save the value where the CaseTestExpr placeholders can find + * it. + */ + if (caseExpr->arg != NULL) + { + /* Evaluate testexpr into caseval/casenull workspace */ + caseval = palloc(sizeof(Datum)); + casenull = palloc(sizeof(bool)); + + ExecInitExprRec(caseExpr->arg, state, + caseval, casenull); + + /* + * Since value might be read multiple times, force to R/O + * - but only if it could be an expanded datum. + */ + if (get_typlen(exprType((Node *) caseExpr->arg)) == -1) + { + /* change caseval in-place */ + scratch.opcode = EEOP_MAKE_READONLY; + scratch.resvalue = caseval; + scratch.resnull = casenull; + scratch.d.make_readonly.value = caseval; + scratch.d.make_readonly.isnull = casenull; + ExprEvalPushStep(state, &scratch); + /* restore normal settings of scratch fields */ + scratch.resvalue = resv; + scratch.resnull = resnull; + } + } + + /* + * Prepare to evaluate each of the WHEN clauses in turn; as + * soon as one is true we return the value of the + * corresponding THEN clause. If none are true then we return + * the value of the ELSE clause, or NULL if there is none. 
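+ *
+ * Roughly, each WHEN/THEN arm compiles to
+ *		evaluate WHEN condition
+ *		EEOP_JUMP_IF_NOT_TRUE	(to the next arm, or to the ELSE)
+ *		evaluate THEN result
+ *		EEOP_JUMP				(to the end of the CASE)
+ * with the ELSE expression (transformCaseExpr supplies a NULL default
+ * if none was written) evaluated last.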
+ */ + foreach(lc, caseExpr->args) + { + CaseWhen *when = (CaseWhen *) lfirst(lc); + Datum *save_innermost_caseval; + bool *save_innermost_casenull; + int whenstep; + + /* + * Make testexpr result available to CaseTestExpr nodes + * within the condition. We must save and restore prior + * setting of innermost_caseval fields, in case this node + * is itself within a larger CASE. + * + * If there's no test expression, we don't actually need + * to save and restore these fields; but it's less code to + * just do so unconditionally. + */ + save_innermost_caseval = state->innermost_caseval; + save_innermost_casenull = state->innermost_casenull; + state->innermost_caseval = caseval; + state->innermost_casenull = casenull; + + /* evaluate condition into CASE's result variables */ + ExecInitExprRec(when->expr, state, resv, resnull); + + state->innermost_caseval = save_innermost_caseval; + state->innermost_casenull = save_innermost_casenull; + + /* If WHEN result isn't true, jump to next CASE arm */ + scratch.opcode = EEOP_JUMP_IF_NOT_TRUE; + scratch.d.jump.jumpdone = -1; /* computed later */ + ExprEvalPushStep(state, &scratch); + whenstep = state->steps_len - 1; + + /* + * If WHEN result is true, evaluate THEN result, storing + * it into the CASE's result variables. + */ + ExecInitExprRec(when->result, state, resv, resnull); + + /* Emit JUMP step to jump to end of CASE's code */ + scratch.opcode = EEOP_JUMP; + scratch.d.jump.jumpdone = -1; /* computed later */ + ExprEvalPushStep(state, &scratch); + + /* + * Don't know address for that jump yet, compute once the + * whole CASE expression is built. + */ + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + + /* + * But we can set WHEN test's jump target now, to make it + * jump to the next WHEN subexpression or the ELSE. + */ + state->steps[whenstep].d.jump.jumpdone = state->steps_len; + } + + /* transformCaseExpr always adds a default */ + Assert(caseExpr->defresult); + + /* evaluate ELSE expr into CASE's result variables */ + ExecInitExprRec(caseExpr->defresult, state, + resv, resnull); + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_JUMP); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + + break; + } + + case T_CaseTestExpr: + { + /* + * Read from location identified by innermost_caseval. Note + * that innermost_caseval could be NULL, if this node isn't + * actually within a CaseExpr, ArrayCoerceExpr, etc structure. + * That can happen because some parts of the system abuse + * CaseTestExpr to cause a read of a value externally supplied + * in econtext->caseValue_datum. We'll take care of that + * scenario at runtime. + */ + scratch.opcode = EEOP_CASE_TESTVAL; + scratch.d.casetest.value = state->innermost_caseval; + scratch.d.casetest.isnull = state->innermost_casenull; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_ArrayExpr: + { + ArrayExpr *arrayexpr = (ArrayExpr *) node; + int nelems = list_length(arrayexpr->elements); + ListCell *lc; + int elemoff; + + /* + * Evaluate by computing each element, and then forming the + * array. Elements are computed into scratch arrays + * associated with the ARRAYEXPR step. 
+ */ + scratch.opcode = EEOP_ARRAYEXPR; + scratch.d.arrayexpr.elemvalues = + (Datum *) palloc(sizeof(Datum) * nelems); + scratch.d.arrayexpr.elemnulls = + (bool *) palloc(sizeof(bool) * nelems); + scratch.d.arrayexpr.nelems = nelems; + + /* fill remaining fields of step */ + scratch.d.arrayexpr.multidims = arrayexpr->multidims; + scratch.d.arrayexpr.elemtype = arrayexpr->element_typeid; + + /* do one-time catalog lookup for type info */ + get_typlenbyvalalign(arrayexpr->element_typeid, + &scratch.d.arrayexpr.elemlength, + &scratch.d.arrayexpr.elembyval, + &scratch.d.arrayexpr.elemalign); + + /* prepare to evaluate all arguments */ + elemoff = 0; + foreach(lc, arrayexpr->elements) + { + Expr *e = (Expr *) lfirst(lc); + + ExecInitExprRec(e, state, + &scratch.d.arrayexpr.elemvalues[elemoff], + &scratch.d.arrayexpr.elemnulls[elemoff]); + elemoff++; + } + + /* and then collect all into an array */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_RowExpr: + { + RowExpr *rowexpr = (RowExpr *) node; + int nelems = list_length(rowexpr->args); + TupleDesc tupdesc; + int i; + ListCell *l; + + /* Build tupdesc to describe result tuples */ + if (rowexpr->row_typeid == RECORDOID) + { + /* generic record, use types of given expressions */ + tupdesc = ExecTypeFromExprList(rowexpr->args); + /* ... but adopt RowExpr's column aliases */ + ExecTypeSetColNames(tupdesc, rowexpr->colnames); + /* Bless the tupdesc so it can be looked up later */ + BlessTupleDesc(tupdesc); + } + else + { + /* it's been cast to a named type, use that */ + tupdesc = lookup_rowtype_tupdesc_copy(rowexpr->row_typeid, -1); + } + + /* + * In the named-type case, the tupdesc could have more columns + * than are in the args list, since the type might have had + * columns added since the ROW() was parsed. We want those + * extra columns to go to nulls, so we make sure that the + * workspace arrays are large enough and then initialize any + * extra columns to read as NULLs. + */ + Assert(nelems <= tupdesc->natts); + nelems = Max(nelems, tupdesc->natts); + + /* + * Evaluate by first building datums for each field, and then + * a final step forming the composite datum. + */ + scratch.opcode = EEOP_ROW; + scratch.d.row.tupdesc = tupdesc; + + /* space for the individual field datums */ + scratch.d.row.elemvalues = + (Datum *) palloc(sizeof(Datum) * nelems); + scratch.d.row.elemnulls = + (bool *) palloc(sizeof(bool) * nelems); + /* as explained above, make sure any extra columns are null */ + memset(scratch.d.row.elemnulls, true, sizeof(bool) * nelems); + + /* Set up evaluation, skipping any deleted columns */ + i = 0; + foreach(l, rowexpr->args) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + Expr *e = (Expr *) lfirst(l); + + if (!att->attisdropped) + { + /* + * Guard against ALTER COLUMN TYPE on rowtype since + * the RowExpr was created. XXX should we check + * typmod too? Not sure we can be sure it'll be the + * same. + */ + if (exprType((Node *) e) != att->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("ROW() column has type %s instead of type %s", + format_type_be(exprType((Node *) e)), + format_type_be(att->atttypid)))); + } + else + { + /* + * Ignore original expression and insert a NULL. We + * don't really care what type of NULL it is, so + * always make an int4 NULL. 
+ */ + e = (Expr *) makeNullConst(INT4OID, -1, InvalidOid); + } + + /* Evaluate column expr into appropriate workspace slot */ + ExecInitExprRec(e, state, + &scratch.d.row.elemvalues[i], + &scratch.d.row.elemnulls[i]); + i++; + } + + /* And finally build the row value */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_RowCompareExpr: + { + RowCompareExpr *rcexpr = (RowCompareExpr *) node; + int nopers = list_length(rcexpr->opnos); + List *adjust_jumps = NIL; + ListCell *l_left_expr, + *l_right_expr, + *l_opno, + *l_opfamily, + *l_inputcollid; + ListCell *lc; + + /* + * Iterate over each field, prepare comparisons. To handle + * NULL results, prepare jumps to after the expression. If a + * comparison yields a != 0 result, jump to the final step. + */ + Assert(list_length(rcexpr->largs) == nopers); + Assert(list_length(rcexpr->rargs) == nopers); + Assert(list_length(rcexpr->opfamilies) == nopers); + Assert(list_length(rcexpr->inputcollids) == nopers); + + forfive(l_left_expr, rcexpr->largs, + l_right_expr, rcexpr->rargs, + l_opno, rcexpr->opnos, + l_opfamily, rcexpr->opfamilies, + l_inputcollid, rcexpr->inputcollids) + { + Expr *left_expr = (Expr *) lfirst(l_left_expr); + Expr *right_expr = (Expr *) lfirst(l_right_expr); + Oid opno = lfirst_oid(l_opno); + Oid opfamily = lfirst_oid(l_opfamily); + Oid inputcollid = lfirst_oid(l_inputcollid); + int strategy; + Oid lefttype; + Oid righttype; + Oid proc; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + + get_op_opfamily_properties(opno, opfamily, false, + &strategy, + &lefttype, + &righttype); + proc = get_opfamily_proc(opfamily, + lefttype, + righttype, + BTORDER_PROC); + if (!OidIsValid(proc)) + elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, lefttype, righttype, opfamily); + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(proc, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + inputcollid, NULL, NULL); + + /* + * If we enforced permissions checks on index support + * functions, we'd need to make a check here. But the + * index support machinery doesn't do that, and thus + * neither does this code. + */ + + /* evaluate left and right args directly into fcinfo */ + ExecInitExprRec(left_expr, state, + &fcinfo->args[0].value, &fcinfo->args[0].isnull); + ExecInitExprRec(right_expr, state, + &fcinfo->args[1].value, &fcinfo->args[1].isnull); + + scratch.opcode = EEOP_ROWCOMPARE_STEP; + scratch.d.rowcompare_step.finfo = finfo; + scratch.d.rowcompare_step.fcinfo_data = fcinfo; + scratch.d.rowcompare_step.fn_addr = finfo->fn_addr; + /* jump targets filled below */ + scratch.d.rowcompare_step.jumpnull = -1; + scratch.d.rowcompare_step.jumpdone = -1; + + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* + * We could have a zero-column rowtype, in which case the rows + * necessarily compare equal. 
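
/*
 * The control flow compiled for RowCompareExpr above, restated as a plain
 * loop over already-evaluated field values: the first pair containing a NULL
 * makes the whole comparison NULL, the first nonzero comparison decides it,
 * and only if every pair compares equal does the result fall through as 0
 * (which also covers the zero-column case).  cmp() is a stand-in for the
 * looked-up btree support function, and long stands in for Datum.
 */
#include <stdbool.h>

typedef struct RowCmpResult
{
	bool	isnull;
	int		cmp;			/* interpreted by the final step (rctype) */
} RowCmpResult;

static RowCmpResult
compare_rows(const long *a, const bool *anull,
			 const long *b, const bool *bnull,
			 int nfields, int (*cmp) (long, long))
{
	RowCmpResult result = {false, 0};

	for (int i = 0; i < nfields; i++)
	{
		if (anull[i] || bnull[i])
		{
			/* corresponds to jumping to the "jumpnull" target */
			result.isnull = true;
			return result;
		}
		result.cmp = cmp(a[i], b[i]);
		if (result.cmp != 0)
			return result;	/* corresponds to jumping to "jumpdone" */
	}
	return result;			/* all fields compared equal */
}
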
+ */ + if (nopers == 0) + { + scratch.opcode = EEOP_CONST; + scratch.d.constval.value = Int32GetDatum(0); + scratch.d.constval.isnull = false; + ExprEvalPushStep(state, &scratch); + } + + /* Finally, examine the last comparison result */ + scratch.opcode = EEOP_ROWCOMPARE_FINAL; + scratch.d.rowcompare_final.rctype = rcexpr->rctype; + ExprEvalPushStep(state, &scratch); + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_ROWCOMPARE_STEP); + Assert(as->d.rowcompare_step.jumpdone == -1); + Assert(as->d.rowcompare_step.jumpnull == -1); + + /* jump to comparison evaluation */ + as->d.rowcompare_step.jumpdone = state->steps_len - 1; + /* jump to the following expression */ + as->d.rowcompare_step.jumpnull = state->steps_len; + } + + break; + } + + case T_CoalesceExpr: + { + CoalesceExpr *coalesce = (CoalesceExpr *) node; + List *adjust_jumps = NIL; + ListCell *lc; + + /* We assume there's at least one arg */ + Assert(coalesce->args != NIL); + + /* + * Prepare evaluation of all coalesced arguments, after each + * one push a step that short-circuits if not null. + */ + foreach(lc, coalesce->args) + { + Expr *e = (Expr *) lfirst(lc); + + /* evaluate argument, directly into result datum */ + ExecInitExprRec(e, state, resv, resnull); + + /* if it's not null, skip to end of COALESCE expr */ + scratch.opcode = EEOP_JUMP_IF_NOT_NULL; + scratch.d.jump.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, &scratch); + + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* + * No need to add a constant NULL return - we only can get to + * the end of the expression if a NULL already is being + * returned. + */ + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_JUMP_IF_NOT_NULL); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + + break; + } + + case T_MinMaxExpr: + { + MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; + int nelems = list_length(minmaxexpr->args); + TypeCacheEntry *typentry; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + ListCell *lc; + int off; + + /* Look up the btree comparison function for the datatype */ + typentry = lookup_type_cache(minmaxexpr->minmaxtype, + TYPECACHE_CMP_PROC); + if (!OidIsValid(typentry->cmp_proc)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(minmaxexpr->minmaxtype)))); + + /* + * If we enforced permissions checks on index support + * functions, we'd need to make a check here. But the index + * support machinery doesn't do that, and thus neither does + * this code. 
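
/*
 * The short-circuit shape of the COALESCE compilation above, restated as a
 * plain loop: each argument is "evaluated" into the shared result slot and
 * evaluation stops at the first non-NULL value; if every argument is NULL
 * the result simply stays NULL, which is why no trailing NULL constant has
 * to be emitted.  Nullable is a simplified stand-in for resvalue/resnull.
 */
#include <stdbool.h>

typedef struct Nullable
{
	bool	isnull;
	long	value;
} Nullable;

static Nullable
coalesce_values(const Nullable *args, int nargs)
{
	Nullable	result = {true, 0};

	for (int i = 0; i < nargs; i++)
	{
		result = args[i];	/* evaluate argument into the result slot */
		if (!result.isnull)
			break;			/* EEOP_JUMP_IF_NOT_NULL: skip to the end */
	}
	return result;
}
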
+ */ + + /* Perform function lookup */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(typentry->cmp_proc, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + minmaxexpr->inputcollid, NULL, NULL); + + scratch.opcode = EEOP_MINMAX; + /* allocate space to store arguments */ + scratch.d.minmax.values = + (Datum *) palloc(sizeof(Datum) * nelems); + scratch.d.minmax.nulls = + (bool *) palloc(sizeof(bool) * nelems); + scratch.d.minmax.nelems = nelems; + + scratch.d.minmax.op = minmaxexpr->op; + scratch.d.minmax.finfo = finfo; + scratch.d.minmax.fcinfo_data = fcinfo; + + /* evaluate expressions into minmax->values/nulls */ + off = 0; + foreach(lc, minmaxexpr->args) + { + Expr *e = (Expr *) lfirst(lc); + + ExecInitExprRec(e, state, + &scratch.d.minmax.values[off], + &scratch.d.minmax.nulls[off]); + off++; + } + + /* and push the final comparison */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_SQLValueFunction: + { + SQLValueFunction *svf = (SQLValueFunction *) node; + + scratch.opcode = EEOP_SQLVALUEFUNCTION; + scratch.d.sqlvaluefunction.svf = svf; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_XmlExpr: + { + XmlExpr *xexpr = (XmlExpr *) node; + int nnamed = list_length(xexpr->named_args); + int nargs = list_length(xexpr->args); + int off; + ListCell *arg; + + scratch.opcode = EEOP_XMLEXPR; + scratch.d.xmlexpr.xexpr = xexpr; + + /* allocate space for storing all the arguments */ + if (nnamed) + { + scratch.d.xmlexpr.named_argvalue = + (Datum *) palloc(sizeof(Datum) * nnamed); + scratch.d.xmlexpr.named_argnull = + (bool *) palloc(sizeof(bool) * nnamed); + } + else + { + scratch.d.xmlexpr.named_argvalue = NULL; + scratch.d.xmlexpr.named_argnull = NULL; + } + + if (nargs) + { + scratch.d.xmlexpr.argvalue = + (Datum *) palloc(sizeof(Datum) * nargs); + scratch.d.xmlexpr.argnull = + (bool *) palloc(sizeof(bool) * nargs); + } + else + { + scratch.d.xmlexpr.argvalue = NULL; + scratch.d.xmlexpr.argnull = NULL; + } + + /* prepare argument execution */ + off = 0; + foreach(arg, xexpr->named_args) + { + Expr *e = (Expr *) lfirst(arg); + + ExecInitExprRec(e, state, + &scratch.d.xmlexpr.named_argvalue[off], + &scratch.d.xmlexpr.named_argnull[off]); + off++; + } + + off = 0; + foreach(arg, xexpr->args) + { + Expr *e = (Expr *) lfirst(arg); + + ExecInitExprRec(e, state, + &scratch.d.xmlexpr.argvalue[off], + &scratch.d.xmlexpr.argnull[off]); + off++; + } + + /* and evaluate the actual XML expression */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_NullTest: + { + NullTest *ntest = (NullTest *) node; + + if (ntest->nulltesttype == IS_NULL) + { + if (ntest->argisrow) + scratch.opcode = EEOP_NULLTEST_ROWISNULL; + else + scratch.opcode = EEOP_NULLTEST_ISNULL; + } + else if (ntest->nulltesttype == IS_NOT_NULL) + { + if (ntest->argisrow) + scratch.opcode = EEOP_NULLTEST_ROWISNOTNULL; + else + scratch.opcode = EEOP_NULLTEST_ISNOTNULL; + } + else + { + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + } + /* initialize cache in case it's a row test */ + scratch.d.nulltest_row.rowcache.cacheptr = NULL; + + /* first evaluate argument into result variable */ + ExecInitExprRec(ntest->arg, state, + resv, resnull); + + /* then push the test of that argument */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_BooleanTest: + { + BooleanTest *btest = (BooleanTest *) node; + + /* + * Evaluate argument, directly into result datum. 
That's ok, + * because resv/resnull is definitely not used anywhere else, + * and will get overwritten by the below EEOP_BOOLTEST_IS_* + * step. + */ + ExecInitExprRec(btest->arg, state, resv, resnull); + + switch (btest->booltesttype) + { + case IS_TRUE: + scratch.opcode = EEOP_BOOLTEST_IS_TRUE; + break; + case IS_NOT_TRUE: + scratch.opcode = EEOP_BOOLTEST_IS_NOT_TRUE; + break; + case IS_FALSE: + scratch.opcode = EEOP_BOOLTEST_IS_FALSE; + break; + case IS_NOT_FALSE: + scratch.opcode = EEOP_BOOLTEST_IS_NOT_FALSE; + break; + case IS_UNKNOWN: + /* Same as scalar IS NULL test */ + scratch.opcode = EEOP_NULLTEST_ISNULL; + break; + case IS_NOT_UNKNOWN: + /* Same as scalar IS NOT NULL test */ + scratch.opcode = EEOP_NULLTEST_ISNOTNULL; + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) btest->booltesttype); + } + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_CoerceToDomain: + { + CoerceToDomain *ctest = (CoerceToDomain *) node; + + ExecInitCoerceToDomain(&scratch, ctest, state, + resv, resnull); + break; + } + + case T_CoerceToDomainValue: + { + /* + * Read from location identified by innermost_domainval. Note + * that innermost_domainval could be NULL, if we're compiling + * a standalone domain check rather than one embedded in a + * larger expression. In that case we must read from + * econtext->domainValue_datum. We'll take care of that + * scenario at runtime. + */ + scratch.opcode = EEOP_DOMAIN_TESTVAL; + /* we share instruction union variant with case testval */ + scratch.d.casetest.value = state->innermost_domainval; + scratch.d.casetest.isnull = state->innermost_domainnull; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_CurrentOfExpr: + { + scratch.opcode = EEOP_CURRENTOFEXPR; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_NextValueExpr: + { + NextValueExpr *nve = (NextValueExpr *) node; + + scratch.opcode = EEOP_NEXTVALUEEXPR; + scratch.d.nextvalueexpr.seqid = nve->seqid; + scratch.d.nextvalueexpr.seqtypid = nve->typeId; + + ExprEvalPushStep(state, &scratch); + break; + } + + default: + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(node)); + break; + } +} + +/* + * Add another expression evaluation step to ExprState->steps. + * + * Note that this potentially re-allocates es->steps, therefore no pointer + * into that array may be used while the expression is still being built. + */ +void +ExprEvalPushStep(ExprState *es, const ExprEvalStep *s) +{ + if (es->steps_alloc == 0) + { + es->steps_alloc = 16; + es->steps = palloc(sizeof(ExprEvalStep) * es->steps_alloc); + } + else if (es->steps_alloc == es->steps_len) + { + es->steps_alloc *= 2; + es->steps = repalloc(es->steps, + sizeof(ExprEvalStep) * es->steps_alloc); + } + + memcpy(&es->steps[es->steps_len++], s, sizeof(ExprEvalStep)); +} + +/* + * Perform setup necessary for the evaluation of a function-like expression, + * appending argument evaluation steps to the steps list in *state, and + * setting up *scratch so it is ready to be pushed. + * + * *scratch is not pushed here, so that callers may override the opcode, + * which is useful for function-like cases like DISTINCT. 
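
/*
 * A minimal sketch of the growth strategy ExprEvalPushStep uses for the
 * steps array: start with room for 16 entries and double whenever the array
 * is full.  IntVec and intvec_push are illustrative names only, and plain
 * malloc/realloc stand in for palloc/repalloc.  As with the real function, a
 * push may move the array, so pointers previously taken into it must not be
 * reused afterwards.
 */
#include <stdlib.h>

typedef struct IntVec
{
	int	   *items;
	int		len;
	int		alloc;
} IntVec;

static void
intvec_push(IntVec *v, int value)
{
	if (v->alloc == 0)
	{
		v->alloc = 16;
		v->items = malloc(sizeof(int) * v->alloc);
	}
	else if (v->alloc == v->len)
	{
		v->alloc *= 2;
		v->items = realloc(v->items, sizeof(int) * v->alloc);
	}
	v->items[v->len++] = value;
}

int
main(void)
{
	IntVec		v = {NULL, 0, 0};

	for (int i = 0; i < 40; i++)
		intvec_push(&v, i);		/* grows 16 -> 32 -> 64 */
	free(v.items);
	return 0;
}
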
+ */ +static void +ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, Oid funcid, + Oid inputcollid, ExprState *state) +{ + int nargs = list_length(args); + AclResult aclresult; + FmgrInfo *flinfo; + FunctionCallInfo fcinfo; + int argno; + ListCell *lc; + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(funcid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(funcid)); + InvokeFunctionExecuteHook(funcid); + + /* + * Safety check on nargs. Under normal circumstances this should never + * fail, as parser should check sooner. But possibly it might fail if + * server has been compiled with FUNC_MAX_ARGS smaller than some functions + * declared in pg_proc? + */ + if (nargs > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("cannot pass more than %d argument to a function", + "cannot pass more than %d arguments to a function", + FUNC_MAX_ARGS, + FUNC_MAX_ARGS))); + + /* Allocate function lookup data and parameter workspace for this call */ + scratch->d.func.finfo = palloc0(sizeof(FmgrInfo)); + scratch->d.func.fcinfo_data = palloc0(SizeForFunctionCallInfo(nargs)); + flinfo = scratch->d.func.finfo; + fcinfo = scratch->d.func.fcinfo_data; + + /* Set up the primary fmgr lookup information */ + fmgr_info(funcid, flinfo); + fmgr_info_set_expr((Node *) node, flinfo); + + /* Initialize function call parameter structure too */ + InitFunctionCallInfoData(*fcinfo, flinfo, + nargs, inputcollid, NULL, NULL); + + /* Keep extra copies of this info to save an indirection at runtime */ + scratch->d.func.fn_addr = flinfo->fn_addr; + scratch->d.func.nargs = nargs; + + /* We only support non-set functions here */ + if (flinfo->fn_retset) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"), + state->parent ? + executor_errposition(state->parent->state, + exprLocation((Node *) node)) : 0)); + + /* Build code to evaluate arguments directly into the fcinfo struct */ + argno = 0; + foreach(lc, args) + { + Expr *arg = (Expr *) lfirst(lc); + + if (IsA(arg, Const)) + { + /* + * Don't evaluate const arguments every round; especially + * interesting for constants in comparisons. + */ + Const *con = (Const *) arg; + + fcinfo->args[argno].value = con->constvalue; + fcinfo->args[argno].isnull = con->constisnull; + } + else + { + ExecInitExprRec(arg, state, + &fcinfo->args[argno].value, + &fcinfo->args[argno].isnull); + } + argno++; + } + + /* Insert appropriate opcode depending on strictness and stats level */ + if (pgstat_track_functions <= flinfo->fn_stats) + { + if (flinfo->fn_strict && nargs > 0) + scratch->opcode = EEOP_FUNCEXPR_STRICT; + else + scratch->opcode = EEOP_FUNCEXPR; + } + else + { + if (flinfo->fn_strict && nargs > 0) + scratch->opcode = EEOP_FUNCEXPR_STRICT_FUSAGE; + else + scratch->opcode = EEOP_FUNCEXPR_FUSAGE; + } +} + +/* + * Add expression steps deforming the ExprState's inner/outer/scan slots + * as much as required by the expression. + */ +static void +ExecInitExprSlots(ExprState *state, Node *node) +{ + LastAttnumInfo info = {0, 0, 0}; + + /* + * Figure out which attributes we're going to need. + */ + get_last_attnums_walker(node, &info); + + ExecPushExprSlots(state, &info); +} + +/* + * Add steps deforming the ExprState's inner/out/scan slots as much as + * indicated by info. This is useful when building an ExprState covering more + * than one expression. 
+ */ +static void +ExecPushExprSlots(ExprState *state, LastAttnumInfo *info) +{ + ExprEvalStep scratch = {0}; + + scratch.resvalue = NULL; + scratch.resnull = NULL; + + /* Emit steps as needed */ + if (info->last_inner > 0) + { + scratch.opcode = EEOP_INNER_FETCHSOME; + scratch.d.fetch.last_var = info->last_inner; + scratch.d.fetch.fixed = false; + scratch.d.fetch.kind = NULL; + scratch.d.fetch.known_desc = NULL; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + } + if (info->last_outer > 0) + { + scratch.opcode = EEOP_OUTER_FETCHSOME; + scratch.d.fetch.last_var = info->last_outer; + scratch.d.fetch.fixed = false; + scratch.d.fetch.kind = NULL; + scratch.d.fetch.known_desc = NULL; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + } + if (info->last_scan > 0) + { + scratch.opcode = EEOP_SCAN_FETCHSOME; + scratch.d.fetch.last_var = info->last_scan; + scratch.d.fetch.fixed = false; + scratch.d.fetch.kind = NULL; + scratch.d.fetch.known_desc = NULL; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + } +} + +/* + * get_last_attnums_walker: expression walker for ExecInitExprSlots + */ +static bool +get_last_attnums_walker(Node *node, LastAttnumInfo *info) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *variable = (Var *) node; + AttrNumber attnum = variable->varattno; + + switch (variable->varno) + { + case INNER_VAR: + info->last_inner = Max(info->last_inner, attnum); + break; + + case OUTER_VAR: + info->last_outer = Max(info->last_outer, attnum); + break; + + /* INDEX_VAR is handled by default case */ + + default: + info->last_scan = Max(info->last_scan, attnum); + break; + } + return false; + } + + /* + * Don't examine the arguments or filters of Aggrefs or WindowFuncs, + * because those do not represent expressions to be evaluated within the + * calling expression's econtext. GroupingFunc arguments are never + * evaluated at all. + */ + if (IsA(node, Aggref)) + return false; + if (IsA(node, WindowFunc)) + return false; + if (IsA(node, GroupingFunc)) + return false; + return expression_tree_walker(node, get_last_attnums_walker, + (void *) info); +} + +/* + * Compute additional information for EEOP_*_FETCHSOME ops. + * + * The goal is to determine whether a slot is 'fixed', that is, every + * evaluation of the expression will have the same type of slot, with an + * equivalent descriptor. + * + * Returns true if the deforming step is required, false otherwise. 
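
/*
 * A sketch of the idea behind get_last_attnums_walker and ExecPushExprSlots:
 * walk the variable references an expression contains and remember only the
 * highest attribute number used per slot, so that each FETCHSOME step needs
 * to deform its tuple only that far.  VarRef, SlotKind and LastAttnums are
 * simplified stand-ins for Var, varno, and LastAttnumInfo.
 */
typedef enum { SLOT_INNER, SLOT_OUTER, SLOT_SCAN } SlotKind;

typedef struct VarRef
{
	SlotKind	slot;
	int			attno;
} VarRef;

typedef struct LastAttnums
{
	int			last_inner;
	int			last_outer;
	int			last_scan;
} LastAttnums;

static LastAttnums
collect_last_attnums(const VarRef *vars, int nvars)
{
	LastAttnums info = {0, 0, 0};

	for (int i = 0; i < nvars; i++)
	{
		int			attno = vars[i].attno;

		switch (vars[i].slot)
		{
			case SLOT_INNER:
				info.last_inner = attno > info.last_inner ? attno : info.last_inner;
				break;
			case SLOT_OUTER:
				info.last_outer = attno > info.last_outer ? attno : info.last_outer;
				break;
			case SLOT_SCAN:
				info.last_scan = attno > info.last_scan ? attno : info.last_scan;
				break;
		}
	}
	return info;
}
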
+ */ +static bool +ExecComputeSlotInfo(ExprState *state, ExprEvalStep *op) +{ + PlanState *parent = state->parent; + TupleDesc desc = NULL; + const TupleTableSlotOps *tts_ops = NULL; + bool isfixed = false; + ExprEvalOp opcode = op->opcode; + + Assert(opcode == EEOP_INNER_FETCHSOME || + opcode == EEOP_OUTER_FETCHSOME || + opcode == EEOP_SCAN_FETCHSOME); + + if (op->d.fetch.known_desc != NULL) + { + desc = op->d.fetch.known_desc; + tts_ops = op->d.fetch.kind; + isfixed = op->d.fetch.kind != NULL; + } + else if (!parent) + { + isfixed = false; + } + else if (opcode == EEOP_INNER_FETCHSOME) + { + PlanState *is = innerPlanState(parent); + + if (parent->inneropsset && !parent->inneropsfixed) + { + isfixed = false; + } + else if (parent->inneropsset && parent->innerops) + { + isfixed = true; + tts_ops = parent->innerops; + desc = ExecGetResultType(is); + } + else if (is) + { + tts_ops = ExecGetResultSlotOps(is, &isfixed); + desc = ExecGetResultType(is); + } + } + else if (opcode == EEOP_OUTER_FETCHSOME) + { + PlanState *os = outerPlanState(parent); + + if (parent->outeropsset && !parent->outeropsfixed) + { + isfixed = false; + } + else if (parent->outeropsset && parent->outerops) + { + isfixed = true; + tts_ops = parent->outerops; + desc = ExecGetResultType(os); + } + else if (os) + { + tts_ops = ExecGetResultSlotOps(os, &isfixed); + desc = ExecGetResultType(os); + } + } + else if (opcode == EEOP_SCAN_FETCHSOME) + { + desc = parent->scandesc; + + if (parent->scanops) + tts_ops = parent->scanops; + + if (parent->scanopsset) + isfixed = parent->scanopsfixed; + } + + if (isfixed && desc != NULL && tts_ops != NULL) + { + op->d.fetch.fixed = true; + op->d.fetch.kind = tts_ops; + op->d.fetch.known_desc = desc; + } + else + { + op->d.fetch.fixed = false; + op->d.fetch.kind = NULL; + op->d.fetch.known_desc = NULL; + } + + /* if the slot is known to always virtual we never need to deform */ + if (op->d.fetch.fixed && op->d.fetch.kind == &TTSOpsVirtual) + return false; + + return true; +} + +/* + * Prepare step for the evaluation of a whole-row variable. + * The caller still has to push the step. + */ +static void +ExecInitWholeRowVar(ExprEvalStep *scratch, Var *variable, ExprState *state) +{ + PlanState *parent = state->parent; + + /* fill in all but the target */ + scratch->opcode = EEOP_WHOLEROW; + scratch->d.wholerow.var = variable; + scratch->d.wholerow.first = true; + scratch->d.wholerow.slow = false; + scratch->d.wholerow.tupdesc = NULL; /* filled at runtime */ + scratch->d.wholerow.junkFilter = NULL; + + /* + * If the input tuple came from a subquery, it might contain "resjunk" + * columns (such as GROUP BY or ORDER BY columns), which we don't want to + * keep in the whole-row result. We can get rid of such columns by + * passing the tuple through a JunkFilter --- but to make one, we have to + * lay our hands on the subquery's targetlist. Fortunately, there are not + * very many cases where this can happen, and we can identify all of them + * by examining our parent PlanState. We assume this is not an issue in + * standalone expressions that don't have parent plans. (Whole-row Vars + * can occur in such expressions, but they will always be referencing + * table rows.) 
+ */ + if (parent) + { + PlanState *subplan = NULL; + + switch (nodeTag(parent)) + { + case T_SubqueryScanState: + subplan = ((SubqueryScanState *) parent)->subplan; + break; + case T_CteScanState: + subplan = ((CteScanState *) parent)->cteplanstate; + break; + default: + break; + } + + if (subplan) + { + bool junk_filter_needed = false; + ListCell *tlist; + + /* Detect whether subplan tlist actually has any junk columns */ + foreach(tlist, subplan->plan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(tlist); + + if (tle->resjunk) + { + junk_filter_needed = true; + break; + } + } + + /* If so, build the junkfilter now */ + if (junk_filter_needed) + { + scratch->d.wholerow.junkFilter = + ExecInitJunkFilter(subplan->plan->targetlist, + ExecInitExtraTupleSlot(parent->state, NULL, + &TTSOpsVirtual)); + } + } + } +} + +/* + * Prepare evaluation of a SubscriptingRef expression. + */ +static void +ExecInitSubscriptingRef(ExprEvalStep *scratch, SubscriptingRef *sbsref, + ExprState *state, Datum *resv, bool *resnull) +{ + bool isAssignment = (sbsref->refassgnexpr != NULL); + int nupper = list_length(sbsref->refupperindexpr); + int nlower = list_length(sbsref->reflowerindexpr); + const SubscriptRoutines *sbsroutines; + SubscriptingRefState *sbsrefstate; + SubscriptExecSteps methods; + char *ptr; + List *adjust_jumps = NIL; + ListCell *lc; + int i; + + /* Look up the subscripting support methods */ + sbsroutines = getSubscriptingRoutines(sbsref->refcontainertype, NULL); + if (!sbsroutines) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("cannot subscript type %s because it does not support subscripting", + format_type_be(sbsref->refcontainertype)), + state->parent ? + executor_errposition(state->parent->state, + exprLocation((Node *) sbsref)) : 0)); + + /* Allocate sbsrefstate, with enough space for per-subscript arrays too */ + sbsrefstate = palloc0(MAXALIGN(sizeof(SubscriptingRefState)) + + (nupper + nlower) * (sizeof(Datum) + + 2 * sizeof(bool))); + + /* Fill constant fields of SubscriptingRefState */ + sbsrefstate->isassignment = isAssignment; + sbsrefstate->numupper = nupper; + sbsrefstate->numlower = nlower; + /* Set up per-subscript arrays */ + ptr = ((char *) sbsrefstate) + MAXALIGN(sizeof(SubscriptingRefState)); + sbsrefstate->upperindex = (Datum *) ptr; + ptr += nupper * sizeof(Datum); + sbsrefstate->lowerindex = (Datum *) ptr; + ptr += nlower * sizeof(Datum); + sbsrefstate->upperprovided = (bool *) ptr; + ptr += nupper * sizeof(bool); + sbsrefstate->lowerprovided = (bool *) ptr; + ptr += nlower * sizeof(bool); + sbsrefstate->upperindexnull = (bool *) ptr; + ptr += nupper * sizeof(bool); + sbsrefstate->lowerindexnull = (bool *) ptr; + /* ptr += nlower * sizeof(bool); */ + + /* + * Let the container-type-specific code have a chance. It must fill the + * "methods" struct with function pointers for us to possibly use in + * execution steps below; and it can optionally set up some data pointed + * to by the workspace field. + */ + memset(&methods, 0, sizeof(methods)); + sbsroutines->exec_setup(sbsref, sbsrefstate, &methods); + + /* + * Evaluate array input. It's safe to do so into resv/resnull, because we + * won't use that as target for any of the other subexpressions, and it'll + * be overwritten by the final EEOP_SBSREF_FETCH/ASSIGN step, which is + * pushed last. + */ + ExecInitExprRec(sbsref->refexpr, state, resv, resnull); + + /* + * If refexpr yields NULL, and the operation should be strict, then result + * is NULL. 
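
/*
 * A sketch of the single-allocation layout used for SubscriptingRefState
 * above: one allocation covers the struct itself plus the per-subscript
 * arrays, and a char pointer is bumped past each array in turn.  Plain
 * calloc and an 8-byte alignment macro stand in for palloc0 and MAXALIGN,
 * and the field set is reduced to two arrays.
 */
#include <stdlib.h>

#define ALIGN8(x)	(((x) + 7) & ~(size_t) 7)

typedef struct DemoSbsState
{
	int			nupper;
	long	   *upperindex;		/* [nupper] */
	char	   *upperprovided;	/* [nupper] */
} DemoSbsState;

static DemoSbsState *
demo_sbs_state_create(int nupper)
{
	DemoSbsState *st;
	char	   *ptr;

	st = calloc(1, ALIGN8(sizeof(DemoSbsState)) +
				nupper * (sizeof(long) + sizeof(char)));
	st->nupper = nupper;

	/* carve the trailing arrays out of the same allocation */
	ptr = (char *) st + ALIGN8(sizeof(DemoSbsState));
	st->upperindex = (long *) ptr;
	ptr += nupper * sizeof(long);
	st->upperprovided = ptr;

	return st;
}
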
We can implement this with just JUMP_IF_NULL, since we + * evaluated the array into the desired target location. + */ + if (!isAssignment && sbsroutines->fetch_strict) + { + scratch->opcode = EEOP_JUMP_IF_NULL; + scratch->d.jump.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* Evaluate upper subscripts */ + i = 0; + foreach(lc, sbsref->refupperindexpr) + { + Expr *e = (Expr *) lfirst(lc); + + /* When slicing, individual subscript bounds can be omitted */ + if (!e) + { + sbsrefstate->upperprovided[i] = false; + sbsrefstate->upperindexnull[i] = true; + } + else + { + sbsrefstate->upperprovided[i] = true; + /* Each subscript is evaluated into appropriate array entry */ + ExecInitExprRec(e, state, + &sbsrefstate->upperindex[i], + &sbsrefstate->upperindexnull[i]); + } + i++; + } + + /* Evaluate lower subscripts similarly */ + i = 0; + foreach(lc, sbsref->reflowerindexpr) + { + Expr *e = (Expr *) lfirst(lc); + + /* When slicing, individual subscript bounds can be omitted */ + if (!e) + { + sbsrefstate->lowerprovided[i] = false; + sbsrefstate->lowerindexnull[i] = true; + } + else + { + sbsrefstate->lowerprovided[i] = true; + /* Each subscript is evaluated into appropriate array entry */ + ExecInitExprRec(e, state, + &sbsrefstate->lowerindex[i], + &sbsrefstate->lowerindexnull[i]); + } + i++; + } + + /* SBSREF_SUBSCRIPTS checks and converts all the subscripts at once */ + if (methods.sbs_check_subscripts) + { + scratch->opcode = EEOP_SBSREF_SUBSCRIPTS; + scratch->d.sbsref_subscript.subscriptfunc = methods.sbs_check_subscripts; + scratch->d.sbsref_subscript.state = sbsrefstate; + scratch->d.sbsref_subscript.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + if (isAssignment) + { + Datum *save_innermost_caseval; + bool *save_innermost_casenull; + + /* Check for unimplemented methods */ + if (!methods.sbs_assign) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("type %s does not support subscripted assignment", + format_type_be(sbsref->refcontainertype)))); + + /* + * We might have a nested-assignment situation, in which the + * refassgnexpr is itself a FieldStore or SubscriptingRef that needs + * to obtain and modify the previous value of the array element or + * slice being replaced. If so, we have to extract that value from + * the array and pass it down via the CaseTestExpr mechanism. It's + * safe to reuse the CASE mechanism because there cannot be a CASE + * between here and where the value would be needed, and an array + * assignment can't be within a CASE either. (So saving and restoring + * innermost_caseval is just paranoia, but let's do it anyway.) + * + * Since fetching the old element might be a nontrivial expense, do it + * only if the argument actually needs it. 
+ */ + if (isAssignmentIndirectionExpr(sbsref->refassgnexpr)) + { + if (!methods.sbs_fetch_old) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("type %s does not support subscripted assignment", + format_type_be(sbsref->refcontainertype)))); + scratch->opcode = EEOP_SBSREF_OLD; + scratch->d.sbsref.subscriptfunc = methods.sbs_fetch_old; + scratch->d.sbsref.state = sbsrefstate; + ExprEvalPushStep(state, scratch); + } + + /* SBSREF_OLD puts extracted value into prevvalue/prevnull */ + save_innermost_caseval = state->innermost_caseval; + save_innermost_casenull = state->innermost_casenull; + state->innermost_caseval = &sbsrefstate->prevvalue; + state->innermost_casenull = &sbsrefstate->prevnull; + + /* evaluate replacement value into replacevalue/replacenull */ + ExecInitExprRec(sbsref->refassgnexpr, state, + &sbsrefstate->replacevalue, &sbsrefstate->replacenull); + + state->innermost_caseval = save_innermost_caseval; + state->innermost_casenull = save_innermost_casenull; + + /* and perform the assignment */ + scratch->opcode = EEOP_SBSREF_ASSIGN; + scratch->d.sbsref.subscriptfunc = methods.sbs_assign; + scratch->d.sbsref.state = sbsrefstate; + ExprEvalPushStep(state, scratch); + } + else + { + /* array fetch is much simpler */ + scratch->opcode = EEOP_SBSREF_FETCH; + scratch->d.sbsref.subscriptfunc = methods.sbs_fetch; + scratch->d.sbsref.state = sbsrefstate; + ExprEvalPushStep(state, scratch); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + if (as->opcode == EEOP_SBSREF_SUBSCRIPTS) + { + Assert(as->d.sbsref_subscript.jumpdone == -1); + as->d.sbsref_subscript.jumpdone = state->steps_len; + } + else + { + Assert(as->opcode == EEOP_JUMP_IF_NULL); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + } +} + +/* + * Helper for preparing SubscriptingRef expressions for evaluation: is expr + * a nested FieldStore or SubscriptingRef that needs the old element value + * passed down? + * + * (We could use this in FieldStore too, but in that case passing the old + * value is so cheap there's no need.) + * + * Note: it might seem that this needs to recurse, but in most cases it does + * not; the CaseTestExpr, if any, will be directly the arg or refexpr of the + * top-level node. Nested-assignment situations give rise to expression + * trees in which each level of assignment has its own CaseTestExpr, and the + * recursive structure appears within the newvals or refassgnexpr field. + * There is an exception, though: if the array is an array-of-domain, we will + * have a CoerceToDomain as the refassgnexpr, and we need to be able to look + * through that. + */ +static bool +isAssignmentIndirectionExpr(Expr *expr) +{ + if (expr == NULL) + return false; /* just paranoia */ + if (IsA(expr, FieldStore)) + { + FieldStore *fstore = (FieldStore *) expr; + + if (fstore->arg && IsA(fstore->arg, CaseTestExpr)) + return true; + } + else if (IsA(expr, SubscriptingRef)) + { + SubscriptingRef *sbsRef = (SubscriptingRef *) expr; + + if (sbsRef->refexpr && IsA(sbsRef->refexpr, CaseTestExpr)) + return true; + } + else if (IsA(expr, CoerceToDomain)) + { + CoerceToDomain *cd = (CoerceToDomain *) expr; + + return isAssignmentIndirectionExpr(cd->arg); + } + return false; +} + +/* + * Prepare evaluation of a CoerceToDomain expression. 
+ */ +static void +ExecInitCoerceToDomain(ExprEvalStep *scratch, CoerceToDomain *ctest, + ExprState *state, Datum *resv, bool *resnull) +{ + DomainConstraintRef *constraint_ref; + Datum *domainval = NULL; + bool *domainnull = NULL; + ListCell *l; + + scratch->d.domaincheck.resulttype = ctest->resulttype; + /* we'll allocate workspace only if needed */ + scratch->d.domaincheck.checkvalue = NULL; + scratch->d.domaincheck.checknull = NULL; + + /* + * Evaluate argument - it's fine to directly store it into resv/resnull, + * if there's constraint failures there'll be errors, otherwise it's what + * needs to be returned. + */ + ExecInitExprRec(ctest->arg, state, resv, resnull); + + /* + * Note: if the argument is of varlena type, it could be a R/W expanded + * object. We want to return the R/W pointer as the final result, but we + * have to pass a R/O pointer as the value to be tested by any functions + * in check expressions. We don't bother to emit a MAKE_READONLY step + * unless there's actually at least one check expression, though. Until + * we've tested that, domainval/domainnull are NULL. + */ + + /* + * Collect the constraints associated with the domain. + * + * Note: before PG v10 we'd recheck the set of constraints during each + * evaluation of the expression. Now we bake them into the ExprState + * during executor initialization. That means we don't need typcache.c to + * provide compiled exprs. + */ + constraint_ref = (DomainConstraintRef *) + palloc(sizeof(DomainConstraintRef)); + InitDomainConstraintRef(ctest->resulttype, + constraint_ref, + CurrentMemoryContext, + false); + + /* + * Compile code to check each domain constraint. NOTNULL constraints can + * just be applied on the resv/resnull value, but for CHECK constraints we + * need more pushups. + */ + foreach(l, constraint_ref->constraints) + { + DomainConstraintState *con = (DomainConstraintState *) lfirst(l); + Datum *save_innermost_domainval; + bool *save_innermost_domainnull; + + scratch->d.domaincheck.constraintname = con->name; + + switch (con->constrainttype) + { + case DOM_CONSTRAINT_NOTNULL: + scratch->opcode = EEOP_DOMAIN_NOTNULL; + ExprEvalPushStep(state, scratch); + break; + case DOM_CONSTRAINT_CHECK: + /* Allocate workspace for CHECK output if we didn't yet */ + if (scratch->d.domaincheck.checkvalue == NULL) + { + scratch->d.domaincheck.checkvalue = + (Datum *) palloc(sizeof(Datum)); + scratch->d.domaincheck.checknull = + (bool *) palloc(sizeof(bool)); + } + + /* + * If first time through, determine where CoerceToDomainValue + * nodes should read from. + */ + if (domainval == NULL) + { + /* + * Since value might be read multiple times, force to R/O + * - but only if it could be an expanded datum. + */ + if (get_typlen(ctest->resulttype) == -1) + { + ExprEvalStep scratch2 = {0}; + + /* Yes, so make output workspace for MAKE_READONLY */ + domainval = (Datum *) palloc(sizeof(Datum)); + domainnull = (bool *) palloc(sizeof(bool)); + + /* Emit MAKE_READONLY */ + scratch2.opcode = EEOP_MAKE_READONLY; + scratch2.resvalue = domainval; + scratch2.resnull = domainnull; + scratch2.d.make_readonly.value = resv; + scratch2.d.make_readonly.isnull = resnull; + ExprEvalPushStep(state, &scratch2); + } + else + { + /* No, so it's fine to read from resv/resnull */ + domainval = resv; + domainnull = resnull; + } + } + + /* + * Set up value to be returned by CoerceToDomainValue nodes. + * We must save and restore innermost_domainval/null fields, + * in case this node is itself within a check expression for + * another domain. 
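
/*
 * The test applied above, both for CASE test values and for domain check
 * values, when deciding whether to emit a MAKE_READONLY step: only varlena
 * types (typlen == -1) can carry expanded, read-write datums, so only values
 * of those types need to be forced read-only before being read more than
 * once.  Here typlen is passed in directly instead of being fetched with
 * get_typlen().
 */
#include <stdbool.h>

static bool
multiple_reads_need_readonly_copy(int typlen)
{
	/* only a varlena value can be an expanded (read-write) datum */
	return typlen == -1;
}
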
+ */ + save_innermost_domainval = state->innermost_domainval; + save_innermost_domainnull = state->innermost_domainnull; + state->innermost_domainval = domainval; + state->innermost_domainnull = domainnull; + + /* evaluate check expression value */ + ExecInitExprRec(con->check_expr, state, + scratch->d.domaincheck.checkvalue, + scratch->d.domaincheck.checknull); + + state->innermost_domainval = save_innermost_domainval; + state->innermost_domainnull = save_innermost_domainnull; + + /* now test result */ + scratch->opcode = EEOP_DOMAIN_CHECK; + ExprEvalPushStep(state, scratch); + + break; + default: + elog(ERROR, "unrecognized constraint type: %d", + (int) con->constrainttype); + break; + } + } +} + +/* + * Build transition/combine function invocations for all aggregate transition + * / combination function invocations in a grouping sets phase. This has to + * invoke all sort based transitions in a phase (if doSort is true), all hash + * based transitions (if doHash is true), or both (both true). + * + * The resulting expression will, for each set of transition values, first + * check for filters, evaluate aggregate input, check that that input is not + * NULL for a strict transition function, and then finally invoke the + * transition for each of the concurrently computed grouping sets. + * + * If nullcheck is true, the generated code will check for a NULL pointer to + * the array of AggStatePerGroup, and skip evaluation if so. + */ +ExprState * +ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase, + bool doSort, bool doHash, bool nullcheck) +{ + ExprState *state = makeNode(ExprState); + PlanState *parent = &aggstate->ss.ps; + ExprEvalStep scratch = {0}; + bool isCombine = DO_AGGSPLIT_COMBINE(aggstate->aggsplit); + LastAttnumInfo deform = {0, 0, 0}; + + state->expr = (Expr *) aggstate; + state->parent = parent; + + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + /* + * First figure out which slots, and how many columns from each, we're + * going to need. + */ + for (int transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + + get_last_attnums_walker((Node *) pertrans->aggref->aggdirectargs, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->args, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->aggorder, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->aggdistinct, + &deform); + get_last_attnums_walker((Node *) pertrans->aggref->aggfilter, + &deform); + } + ExecPushExprSlots(state, &deform); + + /* + * Emit instructions for each transition value / grouping set combination. + */ + for (int transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + FunctionCallInfo trans_fcinfo = pertrans->transfn_fcinfo; + List *adjust_bailout = NIL; + NullableDatum *strictargs = NULL; + bool *strictnulls = NULL; + int argno; + ListCell *bail; + + /* + * If filter present, emit. Do so before evaluating the input, to + * avoid potentially unneeded computations, or even worse, unintended + * side-effects. When combining, all the necessary filtering has + * already been done. 
+ */ + if (pertrans->aggref->aggfilter && !isCombine) + { + /* evaluate filter expression */ + ExecInitExprRec(pertrans->aggref->aggfilter, state, + &state->resvalue, &state->resnull); + /* and jump out if false */ + scratch.opcode = EEOP_JUMP_IF_NOT_TRUE; + scratch.d.jump.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, &scratch); + adjust_bailout = lappend_int(adjust_bailout, + state->steps_len - 1); + } + + /* + * Evaluate arguments to aggregate/combine function. + */ + argno = 0; + if (isCombine) + { + /* + * Combining two aggregate transition values. Instead of directly + * coming from a tuple the input is a, potentially deserialized, + * transition value. + */ + TargetEntry *source_tle; + + Assert(pertrans->numSortCols == 0); + Assert(list_length(pertrans->aggref->args) == 1); + + strictargs = trans_fcinfo->args + 1; + source_tle = (TargetEntry *) linitial(pertrans->aggref->args); + + /* + * deserialfn_oid will be set if we must deserialize the input + * state before calling the combine function. + */ + if (!OidIsValid(pertrans->deserialfn_oid)) + { + /* + * Start from 1, since the 0th arg will be the transition + * value + */ + ExecInitExprRec(source_tle->expr, state, + &trans_fcinfo->args[argno + 1].value, + &trans_fcinfo->args[argno + 1].isnull); + } + else + { + FunctionCallInfo ds_fcinfo = pertrans->deserialfn_fcinfo; + + /* evaluate argument */ + ExecInitExprRec(source_tle->expr, state, + &ds_fcinfo->args[0].value, + &ds_fcinfo->args[0].isnull); + + /* Dummy second argument for type-safety reasons */ + ds_fcinfo->args[1].value = PointerGetDatum(NULL); + ds_fcinfo->args[1].isnull = false; + + /* + * Don't call a strict deserialization function with NULL + * input + */ + if (pertrans->deserialfn.fn_strict) + scratch.opcode = EEOP_AGG_STRICT_DESERIALIZE; + else + scratch.opcode = EEOP_AGG_DESERIALIZE; + + scratch.d.agg_deserialize.fcinfo_data = ds_fcinfo; + scratch.d.agg_deserialize.jumpnull = -1; /* adjust later */ + scratch.resvalue = &trans_fcinfo->args[argno + 1].value; + scratch.resnull = &trans_fcinfo->args[argno + 1].isnull; + + ExprEvalPushStep(state, &scratch); + /* don't add an adjustment unless the function is strict */ + if (pertrans->deserialfn.fn_strict) + adjust_bailout = lappend_int(adjust_bailout, + state->steps_len - 1); + + /* restore normal settings of scratch fields */ + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + } + argno++; + } + else if (pertrans->numSortCols == 0) + { + ListCell *arg; + + /* + * Normal transition function without ORDER BY / DISTINCT. + */ + strictargs = trans_fcinfo->args + 1; + + foreach(arg, pertrans->aggref->args) + { + TargetEntry *source_tle = (TargetEntry *) lfirst(arg); + + /* + * Start from 1, since the 0th arg will be the transition + * value + */ + ExecInitExprRec(source_tle->expr, state, + &trans_fcinfo->args[argno + 1].value, + &trans_fcinfo->args[argno + 1].isnull); + argno++; + } + } + else if (pertrans->numInputs == 1) + { + /* + * DISTINCT and/or ORDER BY case, with a single column sorted on. + */ + TargetEntry *source_tle = + (TargetEntry *) linitial(pertrans->aggref->args); + + Assert(list_length(pertrans->aggref->args) == 1); + + ExecInitExprRec(source_tle->expr, state, + &state->resvalue, + &state->resnull); + strictnulls = &state->resnull; + argno++; + } + else + { + /* + * DISTINCT and/or ORDER BY case, with multiple columns sorted on. 
+ */ + Datum *values = pertrans->sortslot->tts_values; + bool *nulls = pertrans->sortslot->tts_isnull; + ListCell *arg; + + strictnulls = nulls; + + foreach(arg, pertrans->aggref->args) + { + TargetEntry *source_tle = (TargetEntry *) lfirst(arg); + + ExecInitExprRec(source_tle->expr, state, + &values[argno], &nulls[argno]); + argno++; + } + } + Assert(pertrans->numInputs == argno); + + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. This is true for both plain and + * sorted/distinct aggregates. + */ + if (trans_fcinfo->flinfo->fn_strict && pertrans->numTransInputs > 0) + { + if (strictnulls) + scratch.opcode = EEOP_AGG_STRICT_INPUT_CHECK_NULLS; + else + scratch.opcode = EEOP_AGG_STRICT_INPUT_CHECK_ARGS; + scratch.d.agg_strict_input_check.nulls = strictnulls; + scratch.d.agg_strict_input_check.args = strictargs; + scratch.d.agg_strict_input_check.jumpnull = -1; /* adjust later */ + scratch.d.agg_strict_input_check.nargs = pertrans->numTransInputs; + ExprEvalPushStep(state, &scratch); + adjust_bailout = lappend_int(adjust_bailout, + state->steps_len - 1); + } + + /* + * Call transition function (once for each concurrently evaluated + * grouping set). Do so for both sort and hash based computations, as + * applicable. + */ + if (doSort) + { + int processGroupingSets = Max(phase->numsets, 1); + int setoff = 0; + + for (int setno = 0; setno < processGroupingSets; setno++) + { + ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo, + pertrans, transno, setno, setoff, false, + nullcheck); + setoff++; + } + } + + if (doHash) + { + int numHashes = aggstate->num_hashes; + int setoff; + + /* in MIXED mode, there'll be preceding transition values */ + if (aggstate->aggstrategy != AGG_HASHED) + setoff = aggstate->maxsets; + else + setoff = 0; + + for (int setno = 0; setno < numHashes; setno++) + { + ExecBuildAggTransCall(state, aggstate, &scratch, trans_fcinfo, + pertrans, transno, setno, setoff, true, + nullcheck); + setoff++; + } + } + + /* adjust early bail out jump target(s) */ + foreach(bail, adjust_bailout) + { + ExprEvalStep *as = &state->steps[lfirst_int(bail)]; + + if (as->opcode == EEOP_JUMP_IF_NOT_TRUE) + { + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + else if (as->opcode == EEOP_AGG_STRICT_INPUT_CHECK_ARGS || + as->opcode == EEOP_AGG_STRICT_INPUT_CHECK_NULLS) + { + Assert(as->d.agg_strict_input_check.jumpnull == -1); + as->d.agg_strict_input_check.jumpnull = state->steps_len; + } + else if (as->opcode == EEOP_AGG_STRICT_DESERIALIZE) + { + Assert(as->d.agg_deserialize.jumpnull == -1); + as->d.agg_deserialize.jumpnull = state->steps_len; + } + else + Assert(false); + } + } + + scratch.resvalue = NULL; + scratch.resnull = NULL; + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * Build transition/combine function invocation for a single transition + * value. This is separated from ExecBuildAggTrans() because there are + * multiple callsites (hash and sort in some grouping set cases). + */ +static void +ExecBuildAggTransCall(ExprState *state, AggState *aggstate, + ExprEvalStep *scratch, + FunctionCallInfo fcinfo, AggStatePerTrans pertrans, + int transno, int setno, int setoff, bool ishash, + bool nullcheck) +{ + ExprContext *aggcontext; + int adjust_jumpnull = -1; + + if (ishash) + aggcontext = aggstate->hashcontext; + else + aggcontext = aggstate->aggcontexts[setno]; + + /* add check for NULL pointer? 
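
/*
 * The strict-input rule emitted above, restated as a helper: for a strict
 * transition function, any NULL among the aggregate's inputs means the
 * transition call is skipped and the previous transition value is kept.
 * Here the NullableDatum / nulls arrays are reduced to a bare array of
 * isnull flags.
 */
#include <stdbool.h>

static bool
skip_strict_transition(const bool *argnulls, int nargs)
{
	for (int i = 0; i < nargs; i++)
	{
		if (argnulls[i])
			return true;	/* jump past the transition-function call */
	}
	return false;
}
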
*/ + if (nullcheck) + { + scratch->opcode = EEOP_AGG_PLAIN_PERGROUP_NULLCHECK; + scratch->d.agg_plain_pergroup_nullcheck.setoff = setoff; + /* adjust later */ + scratch->d.agg_plain_pergroup_nullcheck.jumpnull = -1; + ExprEvalPushStep(state, scratch); + adjust_jumpnull = state->steps_len - 1; + } + + /* + * Determine appropriate transition implementation. + * + * For non-ordered aggregates: + * + * If the initial value for the transition state doesn't exist in the + * pg_aggregate table then we will let the first non-NULL value returned + * from the outer procNode become the initial value. (This is useful for + * aggregates like max() and min().) The noTransValue flag signals that we + * need to do so. If true, generate a + * EEOP_AGG_INIT_STRICT_PLAIN_TRANS{,_BYVAL} step. This step also needs to + * do the work described next: + * + * If the function is strict, but does have an initial value, choose + * EEOP_AGG_STRICT_PLAIN_TRANS{,_BYVAL}, which skips the transition + * function if the transition value has become NULL (because a previous + * transition function returned NULL). This step also needs to do the work + * described next: + * + * Otherwise we call EEOP_AGG_PLAIN_TRANS{,_BYVAL}, which does not have to + * perform either of the above checks. + * + * Having steps with overlapping responsibilities is not nice, but + * aggregations are very performance sensitive, making this worthwhile. + * + * For ordered aggregates: + * + * Only need to choose between the faster path for a single ordered + * column, and the one between multiple columns. Checking strictness etc + * is done when finalizing the aggregate. See + * process_ordered_aggregate_{single, multi} and + * advance_transition_function. + */ + if (pertrans->numSortCols == 0) + { + if (pertrans->transtypeByVal) + { + if (fcinfo->flinfo->fn_strict && + pertrans->initValueIsNull) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL; + else if (fcinfo->flinfo->fn_strict) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL; + else + scratch->opcode = EEOP_AGG_PLAIN_TRANS_BYVAL; + } + else + { + if (fcinfo->flinfo->fn_strict && + pertrans->initValueIsNull) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF; + else if (fcinfo->flinfo->fn_strict) + scratch->opcode = EEOP_AGG_PLAIN_TRANS_STRICT_BYREF; + else + scratch->opcode = EEOP_AGG_PLAIN_TRANS_BYREF; + } + } + else if (pertrans->numInputs == 1) + scratch->opcode = EEOP_AGG_ORDERED_TRANS_DATUM; + else + scratch->opcode = EEOP_AGG_ORDERED_TRANS_TUPLE; + + scratch->d.agg_trans.pertrans = pertrans; + scratch->d.agg_trans.setno = setno; + scratch->d.agg_trans.setoff = setoff; + scratch->d.agg_trans.transno = transno; + scratch->d.agg_trans.aggcontext = aggcontext; + ExprEvalPushStep(state, scratch); + + /* fix up jumpnull */ + if (adjust_jumpnull != -1) + { + ExprEvalStep *as = &state->steps[adjust_jumpnull]; + + Assert(as->opcode == EEOP_AGG_PLAIN_PERGROUP_NULLCHECK); + Assert(as->d.agg_plain_pergroup_nullcheck.jumpnull == -1); + as->d.agg_plain_pergroup_nullcheck.jumpnull = state->steps_len; + } +} + +/* + * Build equality expression that can be evaluated using ExecQual(), returning + * true if the expression context's inner/outer tuple are NOT DISTINCT. I.e + * two nulls match, a null and a not-null don't match. 
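
/*
 * The opcode choice made in ExecBuildAggTransCall above for non-ordered
 * aggregates, distilled into a standalone decision function: strictness
 * combined with a missing initial value selects the INIT_STRICT variant,
 * strictness alone the STRICT variant, and the by-value/by-reference
 * property of the transition type picks the BYVAL or BYREF flavor.  The
 * enum values are simplified stand-ins for the EEOP_AGG_PLAIN_TRANS_*
 * opcodes.
 */
#include <stdbool.h>

typedef enum PlainTransOp
{
	PLAIN_TRANS_INIT_STRICT_BYVAL,
	PLAIN_TRANS_STRICT_BYVAL,
	PLAIN_TRANS_BYVAL,
	PLAIN_TRANS_INIT_STRICT_BYREF,
	PLAIN_TRANS_STRICT_BYREF,
	PLAIN_TRANS_BYREF
} PlainTransOp;

static PlainTransOp
choose_plain_trans_op(bool fn_strict, bool init_value_is_null, bool byval)
{
	if (byval)
	{
		if (fn_strict && init_value_is_null)
			return PLAIN_TRANS_INIT_STRICT_BYVAL;
		if (fn_strict)
			return PLAIN_TRANS_STRICT_BYVAL;
		return PLAIN_TRANS_BYVAL;
	}
	if (fn_strict && init_value_is_null)
		return PLAIN_TRANS_INIT_STRICT_BYREF;
	if (fn_strict)
		return PLAIN_TRANS_STRICT_BYREF;
	return PLAIN_TRANS_BYREF;
}
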
+ * + * desc: tuple descriptor of the to-be-compared tuples + * numCols: the number of attributes to be examined + * keyColIdx: array of attribute column numbers + * eqFunctions: array of function oids of the equality functions to use + * parent: parent executor node + */ +ExprState * +ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc, + const TupleTableSlotOps *lops, const TupleTableSlotOps *rops, + int numCols, + const AttrNumber *keyColIdx, + const Oid *eqfunctions, + const Oid *collations, + PlanState *parent) +{ + ExprState *state = makeNode(ExprState); + ExprEvalStep scratch = {0}; + int maxatt = -1; + List *adjust_jumps = NIL; + ListCell *lc; + + /* + * When no columns are actually compared, the result's always true. See + * special case in ExecQual(). + */ + if (numCols == 0) + return NULL; + + state->expr = NULL; + state->flags = EEO_FLAG_IS_QUAL; + state->parent = parent; + + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + /* compute max needed attribute */ + for (int natt = 0; natt < numCols; natt++) + { + int attno = keyColIdx[natt]; + + if (attno > maxatt) + maxatt = attno; + } + Assert(maxatt >= 0); + + /* push deform steps */ + scratch.opcode = EEOP_INNER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = ldesc; + scratch.d.fetch.kind = lops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + scratch.opcode = EEOP_OUTER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = rdesc; + scratch.d.fetch.kind = rops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + /* + * Start comparing at the last field (least significant sort key). That's + * the most likely to be different if we are dealing with sorted input. 
+ */ + for (int natt = numCols; --natt >= 0;) + { + int attno = keyColIdx[natt]; + Form_pg_attribute latt = TupleDescAttr(ldesc, attno - 1); + Form_pg_attribute ratt = TupleDescAttr(rdesc, attno - 1); + Oid foid = eqfunctions[natt]; + Oid collid = collations[natt]; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid)); + + InvokeFunctionExecuteHook(foid); + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(foid, finfo); + fmgr_info_set_expr(NULL, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + collid, NULL, NULL); + + /* left arg */ + scratch.opcode = EEOP_INNER_VAR; + scratch.d.var.attnum = attno - 1; + scratch.d.var.vartype = latt->atttypid; + scratch.resvalue = &fcinfo->args[0].value; + scratch.resnull = &fcinfo->args[0].isnull; + ExprEvalPushStep(state, &scratch); + + /* right arg */ + scratch.opcode = EEOP_OUTER_VAR; + scratch.d.var.attnum = attno - 1; + scratch.d.var.vartype = ratt->atttypid; + scratch.resvalue = &fcinfo->args[1].value; + scratch.resnull = &fcinfo->args[1].isnull; + ExprEvalPushStep(state, &scratch); + + /* evaluate distinctness */ + scratch.opcode = EEOP_NOT_DISTINCT; + scratch.d.func.finfo = finfo; + scratch.d.func.fcinfo_data = fcinfo; + scratch.d.func.fn_addr = finfo->fn_addr; + scratch.d.func.nargs = 2; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + + /* then emit EEOP_QUAL to detect if result is false (or null) */ + scratch.opcode = EEOP_QUAL; + scratch.d.qualexpr.jumpdone = -1; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + scratch.resvalue = NULL; + scratch.resnull = NULL; + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * Build equality expression that can be evaluated using ExecQual(), returning + * true if the expression context's inner/outer tuples are equal. Datums in + * the inner/outer slots are assumed to be in the same order and quantity as + * the 'eqfunctions' parameter. NULLs are treated as equal. + * + * desc: tuple descriptor of the to-be-compared tuples + * lops: the slot ops for the inner tuple slots + * rops: the slot ops for the outer tuple slots + * eqFunctions: array of function oids of the equality functions to use + * this must be the same length as the 'param_exprs' list. + * collations: collation Oids to use for equality comparison. Must be the + * same length as the 'param_exprs' list. 
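
/*
 * The NOT DISTINCT semantics the grouping-equality expression implements,
 * restated as a tiny helper: two NULLs match, a NULL and a non-NULL do not,
 * and two non-NULL values are compared with the ordinary equality function.
 * The eq callback and the long arguments are stand-ins for the looked-up
 * equality proc and Datum.
 */
#include <stdbool.h>

static bool
not_distinct(bool anull, long a, bool bnull, long b,
			 bool (*eq) (long, long))
{
	if (anull && bnull)
		return true;		/* both NULL: not distinct */
	if (anull || bnull)
		return false;		/* exactly one NULL: distinct */
	return eq(a, b);		/* neither NULL: use ordinary equality */
}
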
+ * parent: parent executor node + */ +ExprState * +ExecBuildParamSetEqual(TupleDesc desc, + const TupleTableSlotOps *lops, + const TupleTableSlotOps *rops, + const Oid *eqfunctions, + const Oid *collations, + const List *param_exprs, + PlanState *parent) +{ + ExprState *state = makeNode(ExprState); + ExprEvalStep scratch = {0}; + int maxatt = list_length(param_exprs); + List *adjust_jumps = NIL; + ListCell *lc; + + state->expr = NULL; + state->flags = EEO_FLAG_IS_QUAL; + state->parent = parent; + + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + /* push deform steps */ + scratch.opcode = EEOP_INNER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = desc; + scratch.d.fetch.kind = lops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + scratch.opcode = EEOP_OUTER_FETCHSOME; + scratch.d.fetch.last_var = maxatt; + scratch.d.fetch.fixed = false; + scratch.d.fetch.known_desc = desc; + scratch.d.fetch.kind = rops; + if (ExecComputeSlotInfo(state, &scratch)) + ExprEvalPushStep(state, &scratch); + + for (int attno = 0; attno < maxatt; attno++) + { + Form_pg_attribute att = TupleDescAttr(desc, attno); + Oid foid = eqfunctions[attno]; + Oid collid = collations[attno]; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid)); + + InvokeFunctionExecuteHook(foid); + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(foid, finfo); + fmgr_info_set_expr(NULL, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + collid, NULL, NULL); + + /* left arg */ + scratch.opcode = EEOP_INNER_VAR; + scratch.d.var.attnum = attno; + scratch.d.var.vartype = att->atttypid; + scratch.resvalue = &fcinfo->args[0].value; + scratch.resnull = &fcinfo->args[0].isnull; + ExprEvalPushStep(state, &scratch); + + /* right arg */ + scratch.opcode = EEOP_OUTER_VAR; + scratch.d.var.attnum = attno; + scratch.d.var.vartype = att->atttypid; + scratch.resvalue = &fcinfo->args[1].value; + scratch.resnull = &fcinfo->args[1].isnull; + ExprEvalPushStep(state, &scratch); + + /* evaluate distinctness */ + scratch.opcode = EEOP_NOT_DISTINCT; + scratch.d.func.finfo = finfo; + scratch.d.func.fcinfo_data = fcinfo; + scratch.d.func.fn_addr = finfo->fn_addr; + scratch.d.func.nargs = 2; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + + /* then emit EEOP_QUAL to detect if result is false (or null) */ + scratch.opcode = EEOP_QUAL; + scratch.d.qualexpr.jumpdone = -1; + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + scratch.resvalue = NULL; + scratch.resnull = NULL; + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c new file mode 
100644 index 0000000..6b63f93 --- /dev/null +++ b/src/backend/executor/execExprInterp.c @@ -0,0 +1,4373 @@ +/*------------------------------------------------------------------------- + * + * execExprInterp.c + * Interpreted evaluation of an expression step list. + * + * This file provides either a "direct threaded" (for gcc, clang and + * compatible) or a "switch threaded" (for all compilers) implementation of + * expression evaluation. The former is amongst the fastest known methods + * of interpreting programs without resorting to assembly level work, or + * just-in-time compilation, but it requires support for computed gotos. + * The latter is amongst the fastest approaches doable in standard C. + * + * In either case we use ExprEvalStep->opcode to dispatch to the code block + * within ExecInterpExpr() that implements the specific opcode type. + * + * Switch-threading uses a plain switch() statement to perform the + * dispatch. This has the advantages of being plain C and allowing the + * compiler to warn if implementation of a specific opcode has been forgotten. + * The disadvantage is that dispatches will, as commonly implemented by + * compilers, happen from a single location, requiring more jumps and causing + * bad branch prediction. + * + * In direct threading, we use gcc's label-as-values extension - also adopted + * by some other compilers - to replace ExprEvalStep->opcode with the address + * of the block implementing the instruction. Dispatch to the next instruction + * is done by a "computed goto". This allows for better branch prediction + * (as the jumps are happening from different locations) and fewer jumps + * (as no preparatory jump to a common dispatch location is needed). + * + * When using direct threading, ExecReadyInterpretedExpr will replace + * each step's opcode field with the address of the relevant code block and + * ExprState->flags will contain EEO_FLAG_DIRECT_THREADED to remember that + * that's been done. + * + * For very simple instructions the overhead of the full interpreter + * "startup", as minimal as it is, is noticeable. Therefore + * ExecReadyInterpretedExpr will choose to implement certain simple + * opcode patterns using special fast-path routines (ExecJust*). + * + * Complex or uncommon instructions are not implemented in-line in + * ExecInterpExpr(), rather we call out to a helper function appearing later + * in this file. For one reason, there'd not be a noticeable performance + * benefit, but more importantly those complex routines are intended to be + * shared between different expression evaluation approaches. For instance + * a JIT compiler would generate calls to them. (This is why they are + * exported rather than being "static" in this file.) 
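The header comment above contrasts the two dispatch strategies. The standalone sketch below shows both on a three-opcode toy bytecode (names and opcodes are invented for illustration): a central switch loop, and, where the GNU labels-as-values extension is available, a direct-threaded loop in which every opcode ends with a computed goto straight to the next one.

#include <stdio.h>

/* Toy opcodes: load a constant, add the top two values, stop. */
enum { OP_CONST, OP_ADD, OP_DONE };

typedef struct Instr
{
    int op;
    int arg;
} Instr;

/* Switch-threaded: one central switch dispatches every step. */
static int
run_switch(const Instr *prog)
{
    int stack[16], sp = 0;

    for (const Instr *ip = prog;; ip++)
    {
        switch (ip->op)
        {
            case OP_CONST:
                stack[sp++] = ip->arg;
                break;
            case OP_ADD:
                sp--;
                stack[sp - 1] += stack[sp];
                break;
            case OP_DONE:
                return stack[sp - 1];
        }
    }
}

#if defined(__GNUC__)
/*
 * Direct-threaded: each instruction jumps directly to the label of the
 * next instruction, so there is no central dispatch point at all.
 */
static int
run_threaded(const Instr *prog)
{
    static void *const dispatch[] = {&&do_const, &&do_add, &&do_done};
    int         stack[16], sp = 0;
    const Instr *ip = prog;

    goto *dispatch[ip->op];

do_const:
    stack[sp++] = ip->arg;
    ip++;
    goto *dispatch[ip->op];
do_add:
    sp--;
    stack[sp - 1] += stack[sp];
    ip++;
    goto *dispatch[ip->op];
do_done:
    return stack[sp - 1];
}
#endif

int
main(void)
{
    Instr prog[] = {{OP_CONST, 2}, {OP_CONST, 3}, {OP_ADD, 0}, {OP_DONE, 0}};

    printf("switch:   %d\n", run_switch(prog));
#if defined(__GNUC__)
    printf("threaded: %d\n", run_threaded(prog));
#endif
    return 0;
}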
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execExprInterp.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heaptoast.h" +#include "catalog/pg_type.h" +#include "commands/sequence.h" +#include "executor/execExpr.h" +#include "executor/nodeSubplan.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "parser/parsetree.h" +#include "pgstat.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/date.h" +#include "utils/datum.h" +#include "utils/expandedrecord.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" +#include "utils/typcache.h" +#include "utils/xml.h" + +/* + * Use computed-goto-based opcode dispatch when computed gotos are available. + * But use a separate symbol so that it's easy to adjust locally in this file + * for development and testing. + */ +#ifdef HAVE_COMPUTED_GOTO +#define EEO_USE_COMPUTED_GOTO +#endif /* HAVE_COMPUTED_GOTO */ + +/* + * Macros for opcode dispatch. + * + * EEO_SWITCH - just hides the switch if not in use. + * EEO_CASE - labels the implementation of named expression step type. + * EEO_DISPATCH - jump to the implementation of the step type for 'op'. + * EEO_OPCODE - compute opcode required by used expression evaluation method. + * EEO_NEXT - increment 'op' and jump to correct next step type. + * EEO_JUMP - jump to the specified step number within the current expression. + */ +#if defined(EEO_USE_COMPUTED_GOTO) + +/* struct for jump target -> opcode lookup table */ +typedef struct ExprEvalOpLookup +{ + const void *opcode; + ExprEvalOp op; +} ExprEvalOpLookup; + +/* to make dispatch_table accessible outside ExecInterpExpr() */ +static const void **dispatch_table = NULL; + +/* jump target -> opcode lookup table */ +static ExprEvalOpLookup reverse_dispatch_table[EEOP_LAST]; + +#define EEO_SWITCH() +#define EEO_CASE(name) CASE_##name: +#define EEO_DISPATCH() goto *((void *) op->opcode) +#define EEO_OPCODE(opcode) ((intptr_t) dispatch_table[opcode]) + +#else /* !EEO_USE_COMPUTED_GOTO */ + +#define EEO_SWITCH() starteval: switch ((ExprEvalOp) op->opcode) +#define EEO_CASE(name) case name: +#define EEO_DISPATCH() goto starteval +#define EEO_OPCODE(opcode) (opcode) + +#endif /* EEO_USE_COMPUTED_GOTO */ + +#define EEO_NEXT() \ + do { \ + op++; \ + EEO_DISPATCH(); \ + } while (0) + +#define EEO_JUMP(stepno) \ + do { \ + op = &state->steps[stepno]; \ + EEO_DISPATCH(); \ + } while (0) + + +static Datum ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull); +static void ExecInitInterpreter(void); + +/* support functions */ +static void CheckVarSlotCompatibility(TupleTableSlot *slot, int attnum, Oid vartype); +static void CheckOpSlotCompatibility(ExprEvalStep *op, TupleTableSlot *slot); +static TupleDesc get_cached_rowtype(Oid type_id, int32 typmod, + ExprEvalRowtypeCache *rowcache, + bool *changed); +static void ExecEvalRowNullInt(ExprState *state, ExprEvalStep *op, + ExprContext *econtext, bool checkisnull); + +/* fast-path evaluation functions */ +static Datum ExecJustInnerVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustOuterVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustScanVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum 
ExecJustAssignInnerVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignOuterVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignScanVar(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustApplyFuncToCase(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustConst(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); +static Datum ExecJustAssignScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull); + +/* execution helper functions */ +static pg_attribute_always_inline void ExecAggPlainTransByVal(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, + int setno); +static pg_attribute_always_inline void ExecAggPlainTransByRef(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, + int setno); + +/* + * ScalarArrayOpExprHashEntry + * Hash table entry type used during EEOP_HASHED_SCALARARRAYOP + */ +typedef struct ScalarArrayOpExprHashEntry +{ + Datum key; + uint32 status; /* hash status */ + uint32 hash; /* hash value (cached) */ +} ScalarArrayOpExprHashEntry; + +#define SH_PREFIX saophash +#define SH_ELEMENT_TYPE ScalarArrayOpExprHashEntry +#define SH_KEY_TYPE Datum +#define SH_SCOPE static inline +#define SH_DECLARE +#include "lib/simplehash.h" + +static bool saop_hash_element_match(struct saophash_hash *tb, Datum key1, + Datum key2); +static uint32 saop_element_hash(struct saophash_hash *tb, Datum key); + +/* + * ScalarArrayOpExprHashTable + * Hash table for EEOP_HASHED_SCALARARRAYOP + */ +typedef struct ScalarArrayOpExprHashTable +{ + saophash_hash *hashtab; /* underlying hash table */ + struct ExprEvalStep *op; +} ScalarArrayOpExprHashTable; + +/* Define parameters for ScalarArrayOpExpr hash table code generation. */ +#define SH_PREFIX saophash +#define SH_ELEMENT_TYPE ScalarArrayOpExprHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY key +#define SH_HASH_KEY(tb, key) saop_element_hash(tb, key) +#define SH_EQUAL(tb, a, b) saop_hash_element_match(tb, a, b) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + +/* + * Prepare ExprState for interpreted execution. + */ +void +ExecReadyInterpretedExpr(ExprState *state) +{ + /* Ensure one-time interpreter setup has been done */ + ExecInitInterpreter(); + + /* Simple validity checks on expression */ + Assert(state->steps_len >= 1); + Assert(state->steps[state->steps_len - 1].opcode == EEOP_DONE); + + /* + * Don't perform redundant initialization. This is unreachable in current + * cases, but might be hit if there's additional expression evaluation + * methods that rely on interpreted execution to work. + */ + if (state->flags & EEO_FLAG_INTERPRETER_INITIALIZED) + return; + + /* + * First time through, check whether attribute matches Var. Might not be + * ok anymore, due to schema changes. 
We do that by setting up a callback + * that does checking on the first call, which then sets the evalfunc + * callback to the actual method of execution. + */ + state->evalfunc = ExecInterpExprStillValid; + + /* DIRECT_THREADED should not already be set */ + Assert((state->flags & EEO_FLAG_DIRECT_THREADED) == 0); + + /* + * There shouldn't be any errors before the expression is fully + * initialized, and even if so, it'd lead to the expression being + * abandoned. So we can set the flag now and save some code. + */ + state->flags |= EEO_FLAG_INTERPRETER_INITIALIZED; + + /* + * Select fast-path evalfuncs for very simple expressions. "Starting up" + * the full interpreter is a measurable overhead for these, and these + * patterns occur often enough to be worth optimizing. + */ + if (state->steps_len == 3) + { + ExprEvalOp step0 = state->steps[0].opcode; + ExprEvalOp step1 = state->steps[1].opcode; + + if (step0 == EEOP_INNER_FETCHSOME && + step1 == EEOP_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustInnerVar; + return; + } + else if (step0 == EEOP_OUTER_FETCHSOME && + step1 == EEOP_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustOuterVar; + return; + } + else if (step0 == EEOP_SCAN_FETCHSOME && + step1 == EEOP_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustScanVar; + return; + } + else if (step0 == EEOP_INNER_FETCHSOME && + step1 == EEOP_ASSIGN_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignInnerVar; + return; + } + else if (step0 == EEOP_OUTER_FETCHSOME && + step1 == EEOP_ASSIGN_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignOuterVar; + return; + } + else if (step0 == EEOP_SCAN_FETCHSOME && + step1 == EEOP_ASSIGN_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignScanVar; + return; + } + else if (step0 == EEOP_CASE_TESTVAL && + step1 == EEOP_FUNCEXPR_STRICT && + state->steps[0].d.casetest.value) + { + state->evalfunc_private = (void *) ExecJustApplyFuncToCase; + return; + } + } + else if (state->steps_len == 2) + { + ExprEvalOp step0 = state->steps[0].opcode; + + if (step0 == EEOP_CONST) + { + state->evalfunc_private = (void *) ExecJustConst; + return; + } + else if (step0 == EEOP_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustInnerVarVirt; + return; + } + else if (step0 == EEOP_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustOuterVarVirt; + return; + } + else if (step0 == EEOP_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustScanVarVirt; + return; + } + else if (step0 == EEOP_ASSIGN_INNER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignInnerVarVirt; + return; + } + else if (step0 == EEOP_ASSIGN_OUTER_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignOuterVarVirt; + return; + } + else if (step0 == EEOP_ASSIGN_SCAN_VAR) + { + state->evalfunc_private = (void *) ExecJustAssignScanVarVirt; + return; + } + } + +#if defined(EEO_USE_COMPUTED_GOTO) + + /* + * In the direct-threaded implementation, replace each opcode with the + * address to jump to. (Use ExecEvalStepOp() to get back the opcode.) + */ + for (int off = 0; off < state->steps_len; off++) + { + ExprEvalStep *op = &state->steps[off]; + + op->opcode = EEO_OPCODE(op->opcode); + } + + state->flags |= EEO_FLAG_DIRECT_THREADED; +#endif /* EEO_USE_COMPUTED_GOTO */ + + state->evalfunc_private = (void *) ExecInterpExpr; +} + + +/* + * Evaluate expression identified by "state" in the execution context + * given by "econtext". 
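ExecReadyInterpretedExpr above recognizes a handful of two- and three-step shapes and installs a specialized ExecJust* evaluator instead of the general interpreter. A small standalone sketch of that select-a-fast-path-by-pattern idea, with invented step names and a toy state struct:

#include <stdio.h>

enum { STEP_FETCH, STEP_VAR, STEP_CONST, STEP_DONE };

typedef struct ToyState ToyState;
typedef int (*EvalFn) (const ToyState *);

struct ToyState
{
    int    steps[4];            /* opcode list, terminated by STEP_DONE */
    int    nsteps;
    int    constval;
    EvalFn evalfn;              /* chosen evaluator */
};

/* general path: would walk the whole step list (details elided) */
static int
eval_general(const ToyState *s)
{
    return s->constval;
}

/* fast path for the two-step pattern CONST, DONE */
static int
eval_just_const(const ToyState *s)
{
    return s->constval;
}

static void
ready(ToyState *s)
{
    /* recognize a trivial shape and install the specialized evaluator */
    if (s->nsteps == 2 && s->steps[0] == STEP_CONST)
        s->evalfn = eval_just_const;
    else
        s->evalfn = eval_general;
}

int
main(void)
{
    ToyState s = {{STEP_CONST, STEP_DONE}, 2, 41, NULL};

    ready(&s);
    printf("%d (via %s)\n", s.evalfn(&s),
           s.evalfn == eval_just_const ? "fast path" : "general path");
    return 0;
}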
*isnull is set to the is-null flag for the result, + * and the Datum value is the function result. + * + * As a special case, return the dispatch table's address if state is NULL. + * This is used by ExecInitInterpreter to set up the dispatch_table global. + * (Only applies when EEO_USE_COMPUTED_GOTO is defined.) + */ +static Datum +ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) +{ + ExprEvalStep *op; + TupleTableSlot *resultslot; + TupleTableSlot *innerslot; + TupleTableSlot *outerslot; + TupleTableSlot *scanslot; + + /* + * This array has to be in the same order as enum ExprEvalOp. + */ +#if defined(EEO_USE_COMPUTED_GOTO) + static const void *const dispatch_table[] = { + &&CASE_EEOP_DONE, + &&CASE_EEOP_INNER_FETCHSOME, + &&CASE_EEOP_OUTER_FETCHSOME, + &&CASE_EEOP_SCAN_FETCHSOME, + &&CASE_EEOP_INNER_VAR, + &&CASE_EEOP_OUTER_VAR, + &&CASE_EEOP_SCAN_VAR, + &&CASE_EEOP_INNER_SYSVAR, + &&CASE_EEOP_OUTER_SYSVAR, + &&CASE_EEOP_SCAN_SYSVAR, + &&CASE_EEOP_WHOLEROW, + &&CASE_EEOP_ASSIGN_INNER_VAR, + &&CASE_EEOP_ASSIGN_OUTER_VAR, + &&CASE_EEOP_ASSIGN_SCAN_VAR, + &&CASE_EEOP_ASSIGN_TMP, + &&CASE_EEOP_ASSIGN_TMP_MAKE_RO, + &&CASE_EEOP_CONST, + &&CASE_EEOP_FUNCEXPR, + &&CASE_EEOP_FUNCEXPR_STRICT, + &&CASE_EEOP_FUNCEXPR_FUSAGE, + &&CASE_EEOP_FUNCEXPR_STRICT_FUSAGE, + &&CASE_EEOP_BOOL_AND_STEP_FIRST, + &&CASE_EEOP_BOOL_AND_STEP, + &&CASE_EEOP_BOOL_AND_STEP_LAST, + &&CASE_EEOP_BOOL_OR_STEP_FIRST, + &&CASE_EEOP_BOOL_OR_STEP, + &&CASE_EEOP_BOOL_OR_STEP_LAST, + &&CASE_EEOP_BOOL_NOT_STEP, + &&CASE_EEOP_QUAL, + &&CASE_EEOP_JUMP, + &&CASE_EEOP_JUMP_IF_NULL, + &&CASE_EEOP_JUMP_IF_NOT_NULL, + &&CASE_EEOP_JUMP_IF_NOT_TRUE, + &&CASE_EEOP_NULLTEST_ISNULL, + &&CASE_EEOP_NULLTEST_ISNOTNULL, + &&CASE_EEOP_NULLTEST_ROWISNULL, + &&CASE_EEOP_NULLTEST_ROWISNOTNULL, + &&CASE_EEOP_BOOLTEST_IS_TRUE, + &&CASE_EEOP_BOOLTEST_IS_NOT_TRUE, + &&CASE_EEOP_BOOLTEST_IS_FALSE, + &&CASE_EEOP_BOOLTEST_IS_NOT_FALSE, + &&CASE_EEOP_PARAM_EXEC, + &&CASE_EEOP_PARAM_EXTERN, + &&CASE_EEOP_PARAM_CALLBACK, + &&CASE_EEOP_CASE_TESTVAL, + &&CASE_EEOP_MAKE_READONLY, + &&CASE_EEOP_IOCOERCE, + &&CASE_EEOP_DISTINCT, + &&CASE_EEOP_NOT_DISTINCT, + &&CASE_EEOP_NULLIF, + &&CASE_EEOP_SQLVALUEFUNCTION, + &&CASE_EEOP_CURRENTOFEXPR, + &&CASE_EEOP_NEXTVALUEEXPR, + &&CASE_EEOP_ARRAYEXPR, + &&CASE_EEOP_ARRAYCOERCE, + &&CASE_EEOP_ROW, + &&CASE_EEOP_ROWCOMPARE_STEP, + &&CASE_EEOP_ROWCOMPARE_FINAL, + &&CASE_EEOP_MINMAX, + &&CASE_EEOP_FIELDSELECT, + &&CASE_EEOP_FIELDSTORE_DEFORM, + &&CASE_EEOP_FIELDSTORE_FORM, + &&CASE_EEOP_SBSREF_SUBSCRIPTS, + &&CASE_EEOP_SBSREF_OLD, + &&CASE_EEOP_SBSREF_ASSIGN, + &&CASE_EEOP_SBSREF_FETCH, + &&CASE_EEOP_DOMAIN_TESTVAL, + &&CASE_EEOP_DOMAIN_NOTNULL, + &&CASE_EEOP_DOMAIN_CHECK, + &&CASE_EEOP_CONVERT_ROWTYPE, + &&CASE_EEOP_SCALARARRAYOP, + &&CASE_EEOP_HASHED_SCALARARRAYOP, + &&CASE_EEOP_XMLEXPR, + &&CASE_EEOP_AGGREF, + &&CASE_EEOP_GROUPING_FUNC, + &&CASE_EEOP_WINDOW_FUNC, + &&CASE_EEOP_SUBPLAN, + &&CASE_EEOP_AGG_STRICT_DESERIALIZE, + &&CASE_EEOP_AGG_DESERIALIZE, + &&CASE_EEOP_AGG_STRICT_INPUT_CHECK_ARGS, + &&CASE_EEOP_AGG_STRICT_INPUT_CHECK_NULLS, + &&CASE_EEOP_AGG_PLAIN_PERGROUP_NULLCHECK, + &&CASE_EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL, + &&CASE_EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL, + &&CASE_EEOP_AGG_PLAIN_TRANS_BYVAL, + &&CASE_EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF, + &&CASE_EEOP_AGG_PLAIN_TRANS_STRICT_BYREF, + &&CASE_EEOP_AGG_PLAIN_TRANS_BYREF, + &&CASE_EEOP_AGG_ORDERED_TRANS_DATUM, + &&CASE_EEOP_AGG_ORDERED_TRANS_TUPLE, + &&CASE_EEOP_LAST + }; + + StaticAssertStmt(EEOP_LAST + 1 == lengthof(dispatch_table), + 
"dispatch_table out of whack with ExprEvalOp"); + + if (unlikely(state == NULL)) + return PointerGetDatum(dispatch_table); +#else + Assert(state != NULL); +#endif /* EEO_USE_COMPUTED_GOTO */ + + /* setup state */ + op = state->steps; + resultslot = state->resultslot; + innerslot = econtext->ecxt_innertuple; + outerslot = econtext->ecxt_outertuple; + scanslot = econtext->ecxt_scantuple; + +#if defined(EEO_USE_COMPUTED_GOTO) + EEO_DISPATCH(); +#endif + + EEO_SWITCH() + { + EEO_CASE(EEOP_DONE) + { + goto out; + } + + EEO_CASE(EEOP_INNER_FETCHSOME) + { + CheckOpSlotCompatibility(op, innerslot); + + slot_getsomeattrs(innerslot, op->d.fetch.last_var); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_OUTER_FETCHSOME) + { + CheckOpSlotCompatibility(op, outerslot); + + slot_getsomeattrs(outerslot, op->d.fetch.last_var); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCAN_FETCHSOME) + { + CheckOpSlotCompatibility(op, scanslot); + + slot_getsomeattrs(scanslot, op->d.fetch.last_var); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_INNER_VAR) + { + int attnum = op->d.var.attnum; + + /* + * Since we already extracted all referenced columns from the + * tuple with a FETCHSOME step, we can just grab the value + * directly out of the slot's decomposed-data arrays. But let's + * have an Assert to check that that did happen. + */ + Assert(attnum >= 0 && attnum < innerslot->tts_nvalid); + *op->resvalue = innerslot->tts_values[attnum]; + *op->resnull = innerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_OUTER_VAR) + { + int attnum = op->d.var.attnum; + + /* See EEOP_INNER_VAR comments */ + + Assert(attnum >= 0 && attnum < outerslot->tts_nvalid); + *op->resvalue = outerslot->tts_values[attnum]; + *op->resnull = outerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCAN_VAR) + { + int attnum = op->d.var.attnum; + + /* See EEOP_INNER_VAR comments */ + + Assert(attnum >= 0 && attnum < scanslot->tts_nvalid); + *op->resvalue = scanslot->tts_values[attnum]; + *op->resnull = scanslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_INNER_SYSVAR) + { + ExecEvalSysVar(state, op, econtext, innerslot); + EEO_NEXT(); + } + + EEO_CASE(EEOP_OUTER_SYSVAR) + { + ExecEvalSysVar(state, op, econtext, outerslot); + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCAN_SYSVAR) + { + ExecEvalSysVar(state, op, econtext, scanslot); + EEO_NEXT(); + } + + EEO_CASE(EEOP_WHOLEROW) + { + /* too complex for an inline implementation */ + ExecEvalWholeRowVar(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_INNER_VAR) + { + int resultnum = op->d.assign_var.resultnum; + int attnum = op->d.assign_var.attnum; + + /* + * We do not need CheckVarSlotCompatibility here; that was taken + * care of at compilation time. But see EEOP_INNER_VAR comments. + */ + Assert(attnum >= 0 && attnum < innerslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = innerslot->tts_values[attnum]; + resultslot->tts_isnull[resultnum] = innerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_OUTER_VAR) + { + int resultnum = op->d.assign_var.resultnum; + int attnum = op->d.assign_var.attnum; + + /* + * We do not need CheckVarSlotCompatibility here; that was taken + * care of at compilation time. But see EEOP_INNER_VAR comments. 
+ */ + Assert(attnum >= 0 && attnum < outerslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = outerslot->tts_values[attnum]; + resultslot->tts_isnull[resultnum] = outerslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_SCAN_VAR) + { + int resultnum = op->d.assign_var.resultnum; + int attnum = op->d.assign_var.attnum; + + /* + * We do not need CheckVarSlotCompatibility here; that was taken + * care of at compilation time. But see EEOP_INNER_VAR comments. + */ + Assert(attnum >= 0 && attnum < scanslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = scanslot->tts_values[attnum]; + resultslot->tts_isnull[resultnum] = scanslot->tts_isnull[attnum]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_TMP) + { + int resultnum = op->d.assign_tmp.resultnum; + + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_values[resultnum] = state->resvalue; + resultslot->tts_isnull[resultnum] = state->resnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ASSIGN_TMP_MAKE_RO) + { + int resultnum = op->d.assign_tmp.resultnum; + + Assert(resultnum >= 0 && resultnum < resultslot->tts_tupleDescriptor->natts); + resultslot->tts_isnull[resultnum] = state->resnull; + if (!resultslot->tts_isnull[resultnum]) + resultslot->tts_values[resultnum] = + MakeExpandedObjectReadOnlyInternal(state->resvalue); + else + resultslot->tts_values[resultnum] = state->resvalue; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_CONST) + { + *op->resnull = op->d.constval.isnull; + *op->resvalue = op->d.constval.value; + + EEO_NEXT(); + } + + /* + * Function-call implementations. Arguments have previously been + * evaluated directly into fcinfo->args. + * + * As both STRICT checks and function-usage are noticeable performance + * wise, and function calls are a very hot-path (they also back + * operators!), it's worth having so many separate opcodes. + * + * Note: the reason for using a temporary variable "d", here and in + * other places, is that some compilers think "*op->resvalue = f();" + * requires them to evaluate op->resvalue into a register before + * calling f(), just in case f() is able to modify op->resvalue + * somehow. The extra line of code can save a useless register spill + * and reload across the function call. 
+ */ + EEO_CASE(EEOP_FUNCEXPR) + { + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + Datum d; + + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FUNCEXPR_STRICT) + { + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + NullableDatum *args = fcinfo->args; + int nargs = op->d.func.nargs; + Datum d; + + /* strict function, so check for NULL args */ + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + { + *op->resnull = true; + goto strictfail; + } + } + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + strictfail: + EEO_NEXT(); + } + + EEO_CASE(EEOP_FUNCEXPR_FUSAGE) + { + /* not common enough to inline */ + ExecEvalFuncExprFusage(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FUNCEXPR_STRICT_FUSAGE) + { + /* not common enough to inline */ + ExecEvalFuncExprStrictFusage(state, op, econtext); + + EEO_NEXT(); + } + + /* + * If any of its clauses is FALSE, an AND's result is FALSE regardless + * of the states of the rest of the clauses, so we can stop evaluating + * and return FALSE immediately. If none are FALSE and one or more is + * NULL, we return NULL; otherwise we return TRUE. This makes sense + * when you interpret NULL as "don't know": perhaps one of the "don't + * knows" would have been FALSE if we'd known its value. Only when + * all the inputs are known to be TRUE can we state confidently that + * the AND's result is TRUE. + */ + EEO_CASE(EEOP_BOOL_AND_STEP_FIRST) + { + *op->d.boolexpr.anynull = false; + + /* + * EEOP_BOOL_AND_STEP_FIRST resets anynull, otherwise it's the + * same as EEOP_BOOL_AND_STEP - so fall through to that. + */ + + /* FALL THROUGH */ + } + + EEO_CASE(EEOP_BOOL_AND_STEP) + { + if (*op->resnull) + { + *op->d.boolexpr.anynull = true; + } + else if (!DatumGetBool(*op->resvalue)) + { + /* result is already set to FALSE, need not change it */ + /* bail out early */ + EEO_JUMP(op->d.boolexpr.jumpdone); + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOL_AND_STEP_LAST) + { + if (*op->resnull) + { + /* result is already set to NULL, need not change it */ + } + else if (!DatumGetBool(*op->resvalue)) + { + /* result is already set to FALSE, need not change it */ + + /* + * No point jumping early to jumpdone - would be same target + * (as this is the last argument to the AND expression), + * except more expensive. + */ + } + else if (*op->d.boolexpr.anynull) + { + *op->resvalue = (Datum) 0; + *op->resnull = true; + } + else + { + /* result is already set to TRUE, need not change it */ + } + + EEO_NEXT(); + } + + /* + * If any of its clauses is TRUE, an OR's result is TRUE regardless of + * the states of the rest of the clauses, so we can stop evaluating + * and return TRUE immediately. If none are TRUE and one or more is + * NULL, we return NULL; otherwise we return FALSE. This makes sense + * when you interpret NULL as "don't know": perhaps one of the "don't + * knows" would have been TRUE if we'd known its value. Only when all + * the inputs are known to be FALSE can we state confidently that the + * OR's result is FALSE. + */ + EEO_CASE(EEOP_BOOL_OR_STEP_FIRST) + { + *op->d.boolexpr.anynull = false; + + /* + * EEOP_BOOL_OR_STEP_FIRST resets anynull, otherwise it's the same + * as EEOP_BOOL_OR_STEP - so fall through to that. 
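The comment above spells out SQL's three-valued AND: any FALSE clause forces FALSE immediately, otherwise any NULL clause forces NULL, and only all-TRUE inputs yield TRUE; the steps track this with the anynull flag plus an early-exit jump. A self-contained sketch of the same rule over a toy tri-valued type:

#include <stdbool.h>
#include <stdio.h>

typedef enum { TRI_FALSE, TRI_TRUE, TRI_NULL } TriBool;

/* AND over a clause list: FALSE wins immediately, then NULL, then TRUE */
static TriBool
tri_and(const TriBool *clauses, int n)
{
    bool anynull = false;

    for (int i = 0; i < n; i++)
    {
        if (clauses[i] == TRI_FALSE)
            return TRI_FALSE;   /* early exit, like EEO_JUMP(jumpdone)      */
        if (clauses[i] == TRI_NULL)
            anynull = true;     /* remember, like *op->d.boolexpr.anynull   */
    }
    return anynull ? TRI_NULL : TRI_TRUE;
}

int
main(void)
{
    TriBool a[] = {TRI_TRUE, TRI_NULL, TRI_TRUE};
    TriBool b[] = {TRI_TRUE, TRI_NULL, TRI_FALSE};

    printf("%d %d\n", tri_and(a, 3), tri_and(b, 3));    /* NULL, FALSE */
    return 0;
}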
+ */ + + /* FALL THROUGH */ + } + + EEO_CASE(EEOP_BOOL_OR_STEP) + { + if (*op->resnull) + { + *op->d.boolexpr.anynull = true; + } + else if (DatumGetBool(*op->resvalue)) + { + /* result is already set to TRUE, need not change it */ + /* bail out early */ + EEO_JUMP(op->d.boolexpr.jumpdone); + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOL_OR_STEP_LAST) + { + if (*op->resnull) + { + /* result is already set to NULL, need not change it */ + } + else if (DatumGetBool(*op->resvalue)) + { + /* result is already set to TRUE, need not change it */ + + /* + * No point jumping to jumpdone - would be same target (as + * this is the last argument to the AND expression), except + * more expensive. + */ + } + else if (*op->d.boolexpr.anynull) + { + *op->resvalue = (Datum) 0; + *op->resnull = true; + } + else + { + /* result is already set to FALSE, need not change it */ + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOL_NOT_STEP) + { + /* + * Evaluation of 'not' is simple... if expr is false, then return + * 'true' and vice versa. It's safe to do this even on a + * nominally null value, so we ignore resnull; that means that + * NULL in produces NULL out, which is what we want. + */ + *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue)); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_QUAL) + { + /* simplified version of BOOL_AND_STEP for use by ExecQual() */ + + /* If argument (also result) is false or null ... */ + if (*op->resnull || + !DatumGetBool(*op->resvalue)) + { + /* ... bail out early, returning FALSE */ + *op->resnull = false; + *op->resvalue = BoolGetDatum(false); + EEO_JUMP(op->d.qualexpr.jumpdone); + } + + /* + * Otherwise, leave the TRUE value in place, in case this is the + * last qual. Then, TRUE is the correct answer. + */ + + EEO_NEXT(); + } + + EEO_CASE(EEOP_JUMP) + { + /* Unconditionally jump to target step */ + EEO_JUMP(op->d.jump.jumpdone); + } + + EEO_CASE(EEOP_JUMP_IF_NULL) + { + /* Transfer control if current result is null */ + if (*op->resnull) + EEO_JUMP(op->d.jump.jumpdone); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_JUMP_IF_NOT_NULL) + { + /* Transfer control if current result is non-null */ + if (!*op->resnull) + EEO_JUMP(op->d.jump.jumpdone); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_JUMP_IF_NOT_TRUE) + { + /* Transfer control if current result is null or false */ + if (*op->resnull || !DatumGetBool(*op->resvalue)) + EEO_JUMP(op->d.jump.jumpdone); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ISNULL) + { + *op->resvalue = BoolGetDatum(*op->resnull); + *op->resnull = false; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ISNOTNULL) + { + *op->resvalue = BoolGetDatum(!*op->resnull); + *op->resnull = false; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ROWISNULL) + { + /* out of line implementation: too large */ + ExecEvalRowNull(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLTEST_ROWISNOTNULL) + { + /* out of line implementation: too large */ + ExecEvalRowNotNull(state, op, econtext); + + EEO_NEXT(); + } + + /* BooleanTest implementations for all booltesttypes */ + + EEO_CASE(EEOP_BOOLTEST_IS_TRUE) + { + if (*op->resnull) + { + *op->resvalue = BoolGetDatum(false); + *op->resnull = false; + } + /* else, input value is the correct output as well */ + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOLTEST_IS_NOT_TRUE) + { + if (*op->resnull) + { + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + else + *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue)); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOLTEST_IS_FALSE) + { + if (*op->resnull) + { + *op->resvalue = 
BoolGetDatum(false); + *op->resnull = false; + } + else + *op->resvalue = BoolGetDatum(!DatumGetBool(*op->resvalue)); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_BOOLTEST_IS_NOT_FALSE) + { + if (*op->resnull) + { + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + /* else, input value is the correct output as well */ + + EEO_NEXT(); + } + + EEO_CASE(EEOP_PARAM_EXEC) + { + /* out of line implementation: too large */ + ExecEvalParamExec(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_PARAM_EXTERN) + { + /* out of line implementation: too large */ + ExecEvalParamExtern(state, op, econtext); + EEO_NEXT(); + } + + EEO_CASE(EEOP_PARAM_CALLBACK) + { + /* allow an extension module to supply a PARAM_EXTERN value */ + op->d.cparam.paramfunc(state, op, econtext); + EEO_NEXT(); + } + + EEO_CASE(EEOP_CASE_TESTVAL) + { + /* + * Normally upper parts of the expression tree have setup the + * values to be returned here, but some parts of the system + * currently misuse {caseValue,domainValue}_{datum,isNull} to set + * run-time data. So if no values have been set-up, use + * ExprContext's. This isn't pretty, but also not *that* ugly, + * and this is unlikely to be performance sensitive enough to + * worry about an extra branch. + */ + if (op->d.casetest.value) + { + *op->resvalue = *op->d.casetest.value; + *op->resnull = *op->d.casetest.isnull; + } + else + { + *op->resvalue = econtext->caseValue_datum; + *op->resnull = econtext->caseValue_isNull; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DOMAIN_TESTVAL) + { + /* + * See EEOP_CASE_TESTVAL comment. + */ + if (op->d.casetest.value) + { + *op->resvalue = *op->d.casetest.value; + *op->resnull = *op->d.casetest.isnull; + } + else + { + *op->resvalue = econtext->domainValue_datum; + *op->resnull = econtext->domainValue_isNull; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_MAKE_READONLY) + { + /* + * Force a varlena value that might be read multiple times to R/O + */ + if (!*op->d.make_readonly.isnull) + *op->resvalue = + MakeExpandedObjectReadOnlyInternal(*op->d.make_readonly.value); + *op->resnull = *op->d.make_readonly.isnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_IOCOERCE) + { + /* + * Evaluate a CoerceViaIO node. This can be quite a hot path, so + * inline as much work as possible. The source value is in our + * result variable. 
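The BOOLTEST_* cases above implement IS [NOT] TRUE/FALSE, which never return NULL: a NULL input simply fails IS TRUE and IS FALSE, and passes their negations. A small standalone sketch of those rules (toy SQL-boolean type, illustrative only):

#include <stdbool.h>
#include <stdio.h>

typedef enum { B_FALSE, B_TRUE, B_NULL } SqlBool;

/* x IS TRUE: a NULL input yields false, never NULL */
static bool
is_true(SqlBool x)
{
    return x == B_TRUE;
}

/* x IS NOT FALSE: a NULL input yields true */
static bool
is_not_false(SqlBool x)
{
    return x != B_FALSE;
}

int
main(void)
{
    SqlBool vals[] = {B_FALSE, B_TRUE, B_NULL};

    for (int i = 0; i < 3; i++)
        printf("input=%d  IS TRUE=%d  IS NOT FALSE=%d\n",
               vals[i], is_true(vals[i]), is_not_false(vals[i]));
    return 0;
}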
+ */ + char *str; + + /* call output function (similar to OutputFunctionCall) */ + if (*op->resnull) + { + /* output functions are not called on nulls */ + str = NULL; + } + else + { + FunctionCallInfo fcinfo_out; + + fcinfo_out = op->d.iocoerce.fcinfo_data_out; + fcinfo_out->args[0].value = *op->resvalue; + fcinfo_out->args[0].isnull = false; + + fcinfo_out->isnull = false; + str = DatumGetCString(FunctionCallInvoke(fcinfo_out)); + + /* OutputFunctionCall assumes result isn't null */ + Assert(!fcinfo_out->isnull); + } + + /* call input function (similar to InputFunctionCall) */ + if (!op->d.iocoerce.finfo_in->fn_strict || str != NULL) + { + FunctionCallInfo fcinfo_in; + Datum d; + + fcinfo_in = op->d.iocoerce.fcinfo_data_in; + fcinfo_in->args[0].value = PointerGetDatum(str); + fcinfo_in->args[0].isnull = *op->resnull; + /* second and third arguments are already set up */ + + fcinfo_in->isnull = false; + d = FunctionCallInvoke(fcinfo_in); + *op->resvalue = d; + + /* Should get null result if and only if str is NULL */ + if (str == NULL) + { + Assert(*op->resnull); + Assert(fcinfo_in->isnull); + } + else + { + Assert(!*op->resnull); + Assert(!fcinfo_in->isnull); + } + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DISTINCT) + { + /* + * IS DISTINCT FROM must evaluate arguments (already done into + * fcinfo->args) to determine whether they are NULL; if either is + * NULL then the result is determined. If neither is NULL, then + * proceed to evaluate the comparison function, which is just the + * type's standard equality operator. We need not care whether + * that function is strict. Because the handling of nulls is + * different, we can't just reuse EEOP_FUNCEXPR. + */ + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + + /* check function arguments for NULLness */ + if (fcinfo->args[0].isnull && fcinfo->args[1].isnull) + { + /* Both NULL? Then is not distinct... */ + *op->resvalue = BoolGetDatum(false); + *op->resnull = false; + } + else if (fcinfo->args[0].isnull || fcinfo->args[1].isnull) + { + /* Only one is NULL? Then is distinct... */ + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + else + { + /* Neither null, so apply the equality function */ + Datum eqresult; + + fcinfo->isnull = false; + eqresult = op->d.func.fn_addr(fcinfo); + /* Must invert result of "="; safe to do even if null */ + *op->resvalue = BoolGetDatum(!DatumGetBool(eqresult)); + *op->resnull = fcinfo->isnull; + } + + EEO_NEXT(); + } + + /* see EEOP_DISTINCT for comments, this is just inverted */ + EEO_CASE(EEOP_NOT_DISTINCT) + { + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + + if (fcinfo->args[0].isnull && fcinfo->args[1].isnull) + { + *op->resvalue = BoolGetDatum(true); + *op->resnull = false; + } + else if (fcinfo->args[0].isnull || fcinfo->args[1].isnull) + { + *op->resvalue = BoolGetDatum(false); + *op->resnull = false; + } + else + { + Datum eqresult; + + fcinfo->isnull = false; + eqresult = op->d.func.fn_addr(fcinfo); + *op->resvalue = eqresult; + *op->resnull = fcinfo->isnull; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NULLIF) + { + /* + * The arguments are already evaluated into fcinfo->args. 
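EEOP_DISTINCT and EEOP_NOT_DISTINCT treat NULLs as comparable values: two NULLs are not distinct, exactly one NULL makes the pair distinct, and only two non-NULL inputs reach the type's equality function. A standalone sketch over nullable ints (illustrative types, not Datums):

#include <stdbool.h>
#include <stdio.h>

typedef struct NullableInt
{
    int  value;
    bool isnull;
} NullableInt;

/* a IS NOT DISTINCT FROM b: NULLs compare equal to each other */
static bool
not_distinct(NullableInt a, NullableInt b)
{
    if (a.isnull && b.isnull)
        return true;            /* both NULL: not distinct           */
    if (a.isnull || b.isnull)
        return false;           /* exactly one NULL: distinct        */
    return a.value == b.value;  /* neither NULL: ordinary equality   */
}

int
main(void)
{
    NullableInt n = {0, true}, one = {1, false}, also_one = {1, false};

    printf("%d %d %d\n",
           not_distinct(n, n),              /* 1: NULL vs NULL */
           not_distinct(n, one),            /* 0: NULL vs 1    */
           not_distinct(one, also_one));    /* 1: 1 vs 1       */
    return 0;
}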
+ */ + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + + /* if either argument is NULL they can't be equal */ + if (!fcinfo->args[0].isnull && !fcinfo->args[1].isnull) + { + Datum result; + + fcinfo->isnull = false; + result = op->d.func.fn_addr(fcinfo); + + /* if the arguments are equal return null */ + if (!fcinfo->isnull && DatumGetBool(result)) + { + *op->resvalue = (Datum) 0; + *op->resnull = true; + + EEO_NEXT(); + } + } + + /* Arguments aren't equal, so return the first one */ + *op->resvalue = fcinfo->args[0].value; + *op->resnull = fcinfo->args[0].isnull; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SQLVALUEFUNCTION) + { + /* + * Doesn't seem worthwhile to have an inline implementation + * efficiency-wise. + */ + ExecEvalSQLValueFunction(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_CURRENTOFEXPR) + { + /* error invocation uses space, and shouldn't ever occur */ + ExecEvalCurrentOfExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_NEXTVALUEEXPR) + { + /* + * Doesn't seem worthwhile to have an inline implementation + * efficiency-wise. + */ + ExecEvalNextValueExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ARRAYEXPR) + { + /* too complex for an inline implementation */ + ExecEvalArrayExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ARRAYCOERCE) + { + /* too complex for an inline implementation */ + ExecEvalArrayCoerce(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ROW) + { + /* too complex for an inline implementation */ + ExecEvalRow(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ROWCOMPARE_STEP) + { + FunctionCallInfo fcinfo = op->d.rowcompare_step.fcinfo_data; + Datum d; + + /* force NULL result if strict fn and NULL input */ + if (op->d.rowcompare_step.finfo->fn_strict && + (fcinfo->args[0].isnull || fcinfo->args[1].isnull)) + { + *op->resnull = true; + EEO_JUMP(op->d.rowcompare_step.jumpnull); + } + + /* Apply comparison function */ + fcinfo->isnull = false; + d = op->d.rowcompare_step.fn_addr(fcinfo); + *op->resvalue = d; + + /* force NULL result if NULL function result */ + if (fcinfo->isnull) + { + *op->resnull = true; + EEO_JUMP(op->d.rowcompare_step.jumpnull); + } + *op->resnull = false; + + /* If unequal, no need to compare remaining columns */ + if (DatumGetInt32(*op->resvalue) != 0) + { + EEO_JUMP(op->d.rowcompare_step.jumpdone); + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_ROWCOMPARE_FINAL) + { + int32 cmpresult = DatumGetInt32(*op->resvalue); + RowCompareType rctype = op->d.rowcompare_final.rctype; + + *op->resnull = false; + switch (rctype) + { + /* EQ and NE cases aren't allowed here */ + case ROWCOMPARE_LT: + *op->resvalue = BoolGetDatum(cmpresult < 0); + break; + case ROWCOMPARE_LE: + *op->resvalue = BoolGetDatum(cmpresult <= 0); + break; + case ROWCOMPARE_GE: + *op->resvalue = BoolGetDatum(cmpresult >= 0); + break; + case ROWCOMPARE_GT: + *op->resvalue = BoolGetDatum(cmpresult > 0); + break; + default: + Assert(false); + break; + } + + EEO_NEXT(); + } + + EEO_CASE(EEOP_MINMAX) + { + /* too complex for an inline implementation */ + ExecEvalMinMax(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FIELDSELECT) + { + /* too complex for an inline implementation */ + ExecEvalFieldSelect(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FIELDSTORE_DEFORM) + { + /* too complex for an inline implementation */ + ExecEvalFieldStoreDeForm(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_FIELDSTORE_FORM) + { + /* too complex for an inline implementation */ + ExecEvalFieldStoreForm(state, op, econtext); + + EEO_NEXT(); + } + 
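EEOP_ROWCOMPARE_STEP compares one column pair at a time and jumps out as soon as a pair is unequal (or a NULL makes the result unknown); EEOP_ROWCOMPARE_FINAL then maps the sign of the last comparison onto the requested <, <=, >= or > answer. A compact standalone sketch of that pairwise, early-exit scheme with plain ints and no NULL handling:

#include <stdbool.h>
#include <stdio.h>

/* compare rows column by column; stop at the first unequal pair */
static int
row_cmp(const int *a, const int *b, int ncols)
{
    for (int i = 0; i < ncols; i++)
    {
        int c = (a[i] > b[i]) - (a[i] < b[i]);

        if (c != 0)
            return c;   /* like EEO_JUMP(jumpdone): later columns ignored */
    }
    return 0;
}

/* ROWCOMPARE_FINAL analogue: turn the sign into the requested operator */
static bool
row_less_equal(const int *a, const int *b, int ncols)
{
    return row_cmp(a, b, ncols) <= 0;
}

int
main(void)
{
    int r1[] = {1, 2, 3};
    int r2[] = {1, 3, 0};

    printf("cmp=%d  (r1 <= r2)=%d\n",
           row_cmp(r1, r2, 3), row_less_equal(r1, r2, 3));
    return 0;
}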
+ EEO_CASE(EEOP_SBSREF_SUBSCRIPTS) + { + /* Precheck SubscriptingRef subscript(s) */ + if (op->d.sbsref_subscript.subscriptfunc(state, op, econtext)) + { + EEO_NEXT(); + } + else + { + /* Subscript is null, short-circuit SubscriptingRef to NULL */ + EEO_JUMP(op->d.sbsref_subscript.jumpdone); + } + } + + EEO_CASE(EEOP_SBSREF_OLD) + EEO_CASE(EEOP_SBSREF_ASSIGN) + EEO_CASE(EEOP_SBSREF_FETCH) + { + /* Perform a SubscriptingRef fetch or assignment */ + op->d.sbsref.subscriptfunc(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_CONVERT_ROWTYPE) + { + /* too complex for an inline implementation */ + ExecEvalConvertRowtype(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SCALARARRAYOP) + { + /* too complex for an inline implementation */ + ExecEvalScalarArrayOp(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_HASHED_SCALARARRAYOP) + { + /* too complex for an inline implementation */ + ExecEvalHashedScalarArrayOp(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DOMAIN_NOTNULL) + { + /* too complex for an inline implementation */ + ExecEvalConstraintNotNull(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_DOMAIN_CHECK) + { + /* too complex for an inline implementation */ + ExecEvalConstraintCheck(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_XMLEXPR) + { + /* too complex for an inline implementation */ + ExecEvalXmlExpr(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_AGGREF) + { + /* + * Returns a Datum whose value is the precomputed aggregate value + * found in the given expression context. + */ + int aggno = op->d.aggref.aggno; + + Assert(econtext->ecxt_aggvalues != NULL); + + *op->resvalue = econtext->ecxt_aggvalues[aggno]; + *op->resnull = econtext->ecxt_aggnulls[aggno]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_GROUPING_FUNC) + { + /* too complex/uncommon for an inline implementation */ + ExecEvalGroupingFunc(state, op); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_WINDOW_FUNC) + { + /* + * Like Aggref, just return a precomputed value from the econtext. + */ + WindowFuncExprState *wfunc = op->d.window_func.wfstate; + + Assert(econtext->ecxt_aggvalues != NULL); + + *op->resvalue = econtext->ecxt_aggvalues[wfunc->wfuncno]; + *op->resnull = econtext->ecxt_aggnulls[wfunc->wfuncno]; + + EEO_NEXT(); + } + + EEO_CASE(EEOP_SUBPLAN) + { + /* too complex for an inline implementation */ + ExecEvalSubPlan(state, op, econtext); + + EEO_NEXT(); + } + + /* evaluate a strict aggregate deserialization function */ + EEO_CASE(EEOP_AGG_STRICT_DESERIALIZE) + { + /* Don't call a strict deserialization function with NULL input */ + if (op->d.agg_deserialize.fcinfo_data->args[0].isnull) + EEO_JUMP(op->d.agg_deserialize.jumpnull); + + /* fallthrough */ + } + + /* evaluate aggregate deserialization function (non-strict portion) */ + EEO_CASE(EEOP_AGG_DESERIALIZE) + { + FunctionCallInfo fcinfo = op->d.agg_deserialize.fcinfo_data; + AggState *aggstate = castNode(AggState, state->parent); + MemoryContext oldContext; + + /* + * We run the deserialization functions in per-input-tuple memory + * context. + */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + fcinfo->isnull = false; + *op->resvalue = FunctionCallInvoke(fcinfo); + *op->resnull = fcinfo->isnull; + MemoryContextSwitchTo(oldContext); + + EEO_NEXT(); + } + + /* + * Check that a strict aggregate transition / combination function's + * input is not NULL. 
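EEOP_AGG_DESERIALIZE above runs the deserialization call inside the per-input-tuple memory context and restores the previous context afterwards. The real code uses MemoryContextSwitchTo(); the standalone sketch below only mimics that save-switch-restore discipline with a toy "current context" pointer:

#include <stdio.h>

/* toy stand-ins for MemoryContext and MemoryContextSwitchTo() */
typedef struct ToyContext { const char *name; } ToyContext;

static ToyContext *CurrentToyContext;

static ToyContext *
toy_switch_to(ToyContext *ctx)
{
    ToyContext *old = CurrentToyContext;

    CurrentToyContext = ctx;
    return old;
}

static void
toy_work(void)
{
    printf("allocating in: %s\n", CurrentToyContext->name);
}

int
main(void)
{
    ToyContext  per_query = {"per-query"};
    ToyContext  per_tuple = {"per-tuple"};
    ToyContext *old;

    CurrentToyContext = &per_query;

    /* switch for the duration of the call, then restore the old context */
    old = toy_switch_to(&per_tuple);
    toy_work();
    toy_switch_to(old);

    printf("back in: %s\n", CurrentToyContext->name);
    return 0;
}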
+ */ + + EEO_CASE(EEOP_AGG_STRICT_INPUT_CHECK_ARGS) + { + NullableDatum *args = op->d.agg_strict_input_check.args; + int nargs = op->d.agg_strict_input_check.nargs; + + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + EEO_JUMP(op->d.agg_strict_input_check.jumpnull); + } + EEO_NEXT(); + } + + EEO_CASE(EEOP_AGG_STRICT_INPUT_CHECK_NULLS) + { + bool *nulls = op->d.agg_strict_input_check.nulls; + int nargs = op->d.agg_strict_input_check.nargs; + + for (int argno = 0; argno < nargs; argno++) + { + if (nulls[argno]) + EEO_JUMP(op->d.agg_strict_input_check.jumpnull); + } + EEO_NEXT(); + } + + /* + * Check for a NULL pointer to the per-group states. + */ + + EEO_CASE(EEOP_AGG_PLAIN_PERGROUP_NULLCHECK) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerGroup pergroup_allaggs = + aggstate->all_pergroups[op->d.agg_plain_pergroup_nullcheck.setoff]; + + if (pergroup_allaggs == NULL) + EEO_JUMP(op->d.agg_plain_pergroup_nullcheck.jumpnull); + + EEO_NEXT(); + } + + /* + * Different types of aggregate transition functions are implemented + * as different types of steps, to avoid incurring unnecessary + * overhead. There's a step type for each valid combination of having + * a by value / by reference transition type, [not] needing to the + * initialize the transition value for the first row in a group from + * input, and [not] strict transition function. + * + * Could optimize further by splitting off by-reference for + * fixed-length types, but currently that doesn't seem worth it. + */ + + EEO_CASE(EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(pertrans->transtypeByVal); + + if (pergroup->noTransValue) + { + /* If transValue has not yet been initialized, do so now. 
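The comment above explains why one step type exists per combination of by-value/by-reference transition type, first-row initialization, and strictness: those properties are fixed when the expression is built, so re-testing them on every input row would be wasted work. A rough standalone analogue, in which the variant is chosen once and the per-row loop carries no such test (toy accumulator, not an aggregate transition):

#include <stdbool.h>
#include <stdio.h>

typedef struct NullableLong
{
    long value;
    bool isnull;
} NullableLong;

typedef long (*TransFn) (long acc, NullableLong input);

/* strict flavour: NULL inputs leave the accumulator untouched */
static long
trans_strict(long acc, NullableLong input)
{
    return input.isnull ? acc : acc + input.value;
}

/* non-strict flavour: the function itself decides what NULL means */
static long
trans_nonstrict(long acc, NullableLong input)
{
    return acc + (input.isnull ? 0 : input.value);
}

int
main(void)
{
    NullableLong rows[] = {{1, false}, {0, true}, {2, false}};
    bool    func_is_strict = true;  /* known when the expression is built */
    TransFn fn = func_is_strict ? trans_strict : trans_nonstrict;
    long    acc = 0;

    /* the per-row loop no longer inspects the strictness property */
    for (int i = 0; i < 3; i++)
        acc = fn(acc, rows[i]);

    printf("acc = %ld\n", acc);
    return 0;
}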
*/ + ExecAggInitGroup(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext); + /* copied trans value from input, done this round */ + } + else if (likely(!pergroup->transValueIsNull)) + { + /* invoke transition function, unless prevented by strictness */ + ExecAggPlainTransByVal(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + } + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_STRICT_BYVAL) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(pertrans->transtypeByVal); + + if (likely(!pergroup->transValueIsNull)) + ExecAggPlainTransByVal(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_BYVAL) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(pertrans->transtypeByVal); + + ExecAggPlainTransByVal(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(!pertrans->transtypeByVal); + + if (pergroup->noTransValue) + ExecAggInitGroup(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext); + else if (likely(!pergroup->transValueIsNull)) + ExecAggPlainTransByRef(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_STRICT_BYREF) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(!pertrans->transtypeByVal); + + if (likely(!pergroup->transValueIsNull)) + ExecAggPlainTransByRef(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + EEO_NEXT(); + } + + /* see comments above EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYVAL */ + EEO_CASE(EEOP_AGG_PLAIN_TRANS_BYREF) + { + AggState *aggstate = castNode(AggState, state->parent); + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + AggStatePerGroup pergroup = + &aggstate->all_pergroups[op->d.agg_trans.setoff][op->d.agg_trans.transno]; + + Assert(!pertrans->transtypeByVal); + + ExecAggPlainTransByRef(aggstate, pertrans, pergroup, + op->d.agg_trans.aggcontext, + op->d.agg_trans.setno); + + EEO_NEXT(); + } + + /* process single-column ordered aggregate datum */ + EEO_CASE(EEOP_AGG_ORDERED_TRANS_DATUM) + { + /* too complex for an inline implementation */ + ExecEvalAggOrderedTransDatum(state, op, econtext); + + EEO_NEXT(); + } + + /* process multi-column ordered aggregate tuple */ + EEO_CASE(EEOP_AGG_ORDERED_TRANS_TUPLE) + { + /* too complex for an inline implementation */ 
+ ExecEvalAggOrderedTransTuple(state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_LAST) + { + /* unreachable */ + Assert(false); + goto out; + } + } + +out: + *isnull = state->resnull; + return state->resvalue; +} + +/* + * Expression evaluation callback that performs extra checks before executing + * the expression. Declared extern so other methods of execution can use it + * too. + */ +Datum +ExecInterpExprStillValid(ExprState *state, ExprContext *econtext, bool *isNull) +{ + /* + * First time through, check whether attribute matches Var. Might not be + * ok anymore, due to schema changes. + */ + CheckExprStillValid(state, econtext); + + /* skip the check during further executions */ + state->evalfunc = (ExprStateEvalFunc) state->evalfunc_private; + + /* and actually execute */ + return state->evalfunc(state, econtext, isNull); +} + +/* + * Check that an expression is still valid in the face of potential schema + * changes since the plan has been created. + */ +void +CheckExprStillValid(ExprState *state, ExprContext *econtext) +{ + TupleTableSlot *innerslot; + TupleTableSlot *outerslot; + TupleTableSlot *scanslot; + + innerslot = econtext->ecxt_innertuple; + outerslot = econtext->ecxt_outertuple; + scanslot = econtext->ecxt_scantuple; + + for (int i = 0; i < state->steps_len; i++) + { + ExprEvalStep *op = &state->steps[i]; + + switch (ExecEvalStepOp(state, op)) + { + case EEOP_INNER_VAR: + { + int attnum = op->d.var.attnum; + + CheckVarSlotCompatibility(innerslot, attnum + 1, op->d.var.vartype); + break; + } + + case EEOP_OUTER_VAR: + { + int attnum = op->d.var.attnum; + + CheckVarSlotCompatibility(outerslot, attnum + 1, op->d.var.vartype); + break; + } + + case EEOP_SCAN_VAR: + { + int attnum = op->d.var.attnum; + + CheckVarSlotCompatibility(scanslot, attnum + 1, op->d.var.vartype); + break; + } + default: + break; + } + } +} + +/* + * Check whether a user attribute in a slot can be referenced by a Var + * expression. This should succeed unless there have been schema changes + * since the expression tree has been created. + */ +static void +CheckVarSlotCompatibility(TupleTableSlot *slot, int attnum, Oid vartype) +{ + /* + * What we have to check for here is the possibility of an attribute + * having been dropped or changed in type since the plan tree was created. + * Ideally the plan will get invalidated and not re-used, but just in + * case, we keep these defenses. Fortunately it's sufficient to check + * once on the first time through. + * + * Note: ideally we'd check typmod as well as typid, but that seems + * impractical at the moment: in many cases the tupdesc will have been + * generated by ExecTypeFromTL(), and that can't guarantee to generate an + * accurate typmod in all cases, because some expression node types don't + * carry typmod. Fortunately, for precisely that reason, there should be + * no places with a critical dependency on the typmod of a value. + * + * System attributes don't require checking since their types never + * change. 
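ExecInterpExprStillValid above is a first-call trampoline: it runs CheckExprStillValid once, then overwrites state->evalfunc with the real evaluator so later calls skip the checks entirely. A standalone sketch of that install-then-replace callback pattern on a toy state struct:

#include <stdio.h>

typedef struct ToyExpr ToyExpr;
typedef int (*ToyEvalFn) (ToyExpr *);

struct ToyExpr
{
    ToyEvalFn evalfunc;         /* what callers invoke   */
    ToyEvalFn evalfunc_private; /* the real evaluator    */
    int       value;
};

static int
eval_real(ToyExpr *e)
{
    return e->value;
}

/* first-call trampoline: validate once, then get out of the way */
static int
eval_still_valid(ToyExpr *e)
{
    printf("running one-time validity checks\n");
    e->evalfunc = e->evalfunc_private;  /* future calls go direct */
    return e->evalfunc(e);
}

int
main(void)
{
    ToyExpr e = {eval_still_valid, eval_real, 7};

    printf("%d\n", e.evalfunc(&e));     /* checks, then evaluate  */
    printf("%d\n", e.evalfunc(&e));     /* straight to eval_real  */
    return 0;
}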
+ */ + if (attnum > 0) + { + TupleDesc slot_tupdesc = slot->tts_tupleDescriptor; + Form_pg_attribute attr; + + if (attnum > slot_tupdesc->natts) /* should never happen */ + elog(ERROR, "attribute number %d exceeds number of columns %d", + attnum, slot_tupdesc->natts); + + attr = TupleDescAttr(slot_tupdesc, attnum - 1); + + if (attr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("attribute %d of type %s has been dropped", + attnum, format_type_be(slot_tupdesc->tdtypeid)))); + + if (vartype != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("attribute %d of type %s has wrong type", + attnum, format_type_be(slot_tupdesc->tdtypeid)), + errdetail("Table has type %s, but query expects %s.", + format_type_be(attr->atttypid), + format_type_be(vartype)))); + } +} + +/* + * Verify that the slot is compatible with a EEOP_*_FETCHSOME operation. + */ +static void +CheckOpSlotCompatibility(ExprEvalStep *op, TupleTableSlot *slot) +{ +#ifdef USE_ASSERT_CHECKING + /* there's nothing to check */ + if (!op->d.fetch.fixed) + return; + + /* + * Should probably fixed at some point, but for now it's easier to allow + * buffer and heap tuples to be used interchangeably. + */ + if (slot->tts_ops == &TTSOpsBufferHeapTuple && + op->d.fetch.kind == &TTSOpsHeapTuple) + return; + if (slot->tts_ops == &TTSOpsHeapTuple && + op->d.fetch.kind == &TTSOpsBufferHeapTuple) + return; + + /* + * At the moment we consider it OK if a virtual slot is used instead of a + * specific type of slot, as a virtual slot never needs to be deformed. + */ + if (slot->tts_ops == &TTSOpsVirtual) + return; + + Assert(op->d.fetch.kind == slot->tts_ops); +#endif +} + +/* + * get_cached_rowtype: utility function to lookup a rowtype tupdesc + * + * type_id, typmod: identity of the rowtype + * rowcache: space for caching identity info + * (rowcache->cacheptr must be initialized to NULL) + * changed: if not NULL, *changed is set to true on any update + * + * The returned TupleDesc is not guaranteed pinned; caller must pin it + * to use it across any operation that might incur cache invalidation. + * (The TupleDesc is always refcounted, so just use IncrTupleDescRefCount.) + * + * NOTE: because composite types can change contents, we must be prepared + * to re-do this during any node execution; cannot call just once during + * expression initialization. + */ +static TupleDesc +get_cached_rowtype(Oid type_id, int32 typmod, + ExprEvalRowtypeCache *rowcache, + bool *changed) +{ + if (type_id != RECORDOID) + { + /* + * It's a named composite type, so use the regular typcache. Do a + * lookup first time through, or if the composite type changed. Note: + * "tupdesc_id == 0" may look redundant, but it protects against the + * admittedly-theoretical possibility that type_id was RECORDOID the + * last time through, so that the cacheptr isn't TypeCacheEntry *. 
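get_cached_rowtype, whose comment continues just below, keeps the last descriptor it looked up together with an identifier and repeats the lookup only when that identifier no longer matches, since composite types can change. A generic standalone sketch of the cache-with-version-token pattern (a toy catalog, not the typcache):

#include <stdint.h>
#include <stdio.h>

/* toy "catalog" entry: a descriptor plus a version that bumps on change */
typedef struct ToyDesc
{
    uint64_t    version;
    const char *layout;
} ToyDesc;

typedef struct ToyCache
{
    const ToyDesc *cached;          /* like rowcache->cacheptr   */
    uint64_t       cached_version;  /* like rowcache->tupdesc_id */
} ToyCache;

static const ToyDesc *
lookup_desc(const ToyDesc *catalog)
{
    printf("(expensive lookup)\n");
    return catalog;
}

static const ToyDesc *
get_cached_desc(ToyCache *cache, const ToyDesc *catalog)
{
    /* redo the lookup only if we have nothing cached or the version moved */
    if (cache->cached == NULL || cache->cached_version != catalog->version)
    {
        cache->cached = lookup_desc(catalog);
        cache->cached_version = catalog->version;
    }
    return cache->cached;
}

int
main(void)
{
    ToyDesc  catalog = {1, "(a int, b text)"};
    ToyCache cache = {NULL, 0};

    get_cached_desc(&cache, &catalog);      /* miss: does the lookup */
    get_cached_desc(&cache, &catalog);      /* hit: no lookup        */
    catalog.version = 2;                    /* the type was altered  */
    catalog.layout = "(a int, b text, c date)";
    printf("%s\n", get_cached_desc(&cache, &catalog)->layout);
    return 0;
}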
+ */ + TypeCacheEntry *typentry = (TypeCacheEntry *) rowcache->cacheptr; + + if (unlikely(typentry == NULL || + rowcache->tupdesc_id == 0 || + typentry->tupDesc_identifier != rowcache->tupdesc_id)) + { + typentry = lookup_type_cache(type_id, TYPECACHE_TUPDESC); + if (typentry->tupDesc == NULL) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("type %s is not composite", + format_type_be(type_id)))); + rowcache->cacheptr = (void *) typentry; + rowcache->tupdesc_id = typentry->tupDesc_identifier; + if (changed) + *changed = true; + } + return typentry->tupDesc; + } + else + { + /* + * A RECORD type, once registered, doesn't change for the life of the + * backend. So we don't need a typcache entry as such, which is good + * because there isn't one. It's possible that the caller is asking + * about a different type than before, though. + */ + TupleDesc tupDesc = (TupleDesc) rowcache->cacheptr; + + if (unlikely(tupDesc == NULL || + rowcache->tupdesc_id != 0 || + type_id != tupDesc->tdtypeid || + typmod != tupDesc->tdtypmod)) + { + tupDesc = lookup_rowtype_tupdesc(type_id, typmod); + /* Drop pin acquired by lookup_rowtype_tupdesc */ + ReleaseTupleDesc(tupDesc); + rowcache->cacheptr = (void *) tupDesc; + rowcache->tupdesc_id = 0; /* not a valid value for non-RECORD */ + if (changed) + *changed = true; + } + return tupDesc; + } +} + + +/* + * Fast-path functions, for very simple expressions + */ + +/* implementation of ExecJust(Inner|Outer|Scan)Var */ +static pg_attribute_always_inline Datum +ExecJustVarImpl(ExprState *state, TupleTableSlot *slot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[1]; + int attnum = op->d.var.attnum + 1; + + CheckOpSlotCompatibility(&state->steps[0], slot); + + /* + * Since we use slot_getattr(), we don't need to implement the FETCHSOME + * step explicitly, and we also needn't Assert that the attnum is in range + * --- slot_getattr() will take care of any problems. + */ + return slot_getattr(slot, attnum, isnull); +} + +/* Simple reference to inner Var */ +static Datum +ExecJustInnerVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Simple reference to outer Var */ +static Datum +ExecJustOuterVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Simple reference to scan Var */ +static Datum +ExecJustScanVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarImpl(state, econtext->ecxt_scantuple, isnull); +} + +/* implementation of ExecJustAssign(Inner|Outer|Scan)Var */ +static pg_attribute_always_inline Datum +ExecJustAssignVarImpl(ExprState *state, TupleTableSlot *inslot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[1]; + int attnum = op->d.assign_var.attnum + 1; + int resultnum = op->d.assign_var.resultnum; + TupleTableSlot *outslot = state->resultslot; + + CheckOpSlotCompatibility(&state->steps[0], inslot); + + /* + * We do not need CheckVarSlotCompatibility here; that was taken care of + * at compilation time. + * + * Since we use slot_getattr(), we don't need to implement the FETCHSOME + * step explicitly, and we also needn't Assert that the attnum is in range + * --- slot_getattr() will take care of any problems. Nonetheless, check + * that resultnum is in range. 
+ */ + Assert(resultnum >= 0 && resultnum < outslot->tts_tupleDescriptor->natts); + outslot->tts_values[resultnum] = + slot_getattr(inslot, attnum, &outslot->tts_isnull[resultnum]); + return 0; +} + +/* Evaluate inner Var and assign to appropriate column of result tuple */ +static Datum +ExecJustAssignInnerVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Evaluate outer Var and assign to appropriate column of result tuple */ +static Datum +ExecJustAssignOuterVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Evaluate scan Var and assign to appropriate column of result tuple */ +static Datum +ExecJustAssignScanVar(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarImpl(state, econtext->ecxt_scantuple, isnull); +} + +/* Evaluate CASE_TESTVAL and apply a strict function to it */ +static Datum +ExecJustApplyFuncToCase(ExprState *state, ExprContext *econtext, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + FunctionCallInfo fcinfo; + NullableDatum *args; + int nargs; + Datum d; + + /* + * XXX with some redesign of the CaseTestExpr mechanism, maybe we could + * get rid of this data shuffling? + */ + *op->resvalue = *op->d.casetest.value; + *op->resnull = *op->d.casetest.isnull; + + op++; + + nargs = op->d.func.nargs; + fcinfo = op->d.func.fcinfo_data; + args = fcinfo->args; + + /* strict function, so check for NULL args */ + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + { + *isnull = true; + return (Datum) 0; + } + } + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *isnull = fcinfo->isnull; + return d; +} + +/* Simple Const expression */ +static Datum +ExecJustConst(ExprState *state, ExprContext *econtext, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + + *isnull = op->d.constval.isnull; + return op->d.constval.value; +} + +/* implementation of ExecJust(Inner|Outer|Scan)VarVirt */ +static pg_attribute_always_inline Datum +ExecJustVarVirtImpl(ExprState *state, TupleTableSlot *slot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + int attnum = op->d.var.attnum; + + /* + * As it is guaranteed that a virtual slot is used, there never is a need + * to perform tuple deforming (nor would it be possible). Therefore + * execExpr.c has not emitted an EEOP_*_FETCHSOME step. Verify, as much as + * possible, that that determination was accurate. 
+ */ + Assert(TTS_IS_VIRTUAL(slot)); + Assert(TTS_FIXED(slot)); + Assert(attnum >= 0 && attnum < slot->tts_nvalid); + + *isnull = slot->tts_isnull[attnum]; + + return slot->tts_values[attnum]; +} + +/* Like ExecJustInnerVar, optimized for virtual slots */ +static Datum +ExecJustInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarVirtImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Like ExecJustOuterVar, optimized for virtual slots */ +static Datum +ExecJustOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarVirtImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Like ExecJustScanVar, optimized for virtual slots */ +static Datum +ExecJustScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustVarVirtImpl(state, econtext->ecxt_scantuple, isnull); +} + +/* implementation of ExecJustAssign(Inner|Outer|Scan)VarVirt */ +static pg_attribute_always_inline Datum +ExecJustAssignVarVirtImpl(ExprState *state, TupleTableSlot *inslot, bool *isnull) +{ + ExprEvalStep *op = &state->steps[0]; + int attnum = op->d.assign_var.attnum; + int resultnum = op->d.assign_var.resultnum; + TupleTableSlot *outslot = state->resultslot; + + /* see ExecJustVarVirtImpl for comments */ + + Assert(TTS_IS_VIRTUAL(inslot)); + Assert(TTS_FIXED(inslot)); + Assert(attnum >= 0 && attnum < inslot->tts_nvalid); + Assert(resultnum >= 0 && resultnum < outslot->tts_tupleDescriptor->natts); + + outslot->tts_values[resultnum] = inslot->tts_values[attnum]; + outslot->tts_isnull[resultnum] = inslot->tts_isnull[attnum]; + + return 0; +} + +/* Like ExecJustAssignInnerVar, optimized for virtual slots */ +static Datum +ExecJustAssignInnerVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarVirtImpl(state, econtext->ecxt_innertuple, isnull); +} + +/* Like ExecJustAssignOuterVar, optimized for virtual slots */ +static Datum +ExecJustAssignOuterVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarVirtImpl(state, econtext->ecxt_outertuple, isnull); +} + +/* Like ExecJustAssignScanVar, optimized for virtual slots */ +static Datum +ExecJustAssignScanVarVirt(ExprState *state, ExprContext *econtext, bool *isnull) +{ + return ExecJustAssignVarVirtImpl(state, econtext->ecxt_scantuple, isnull); +} + +#if defined(EEO_USE_COMPUTED_GOTO) +/* + * Comparator used when building address->opcode lookup table for + * ExecEvalStepOp() in the threaded dispatch case. + */ +static int +dispatch_compare_ptr(const void *a, const void *b) +{ + const ExprEvalOpLookup *la = (const ExprEvalOpLookup *) a; + const ExprEvalOpLookup *lb = (const ExprEvalOpLookup *) b; + + if (la->opcode < lb->opcode) + return -1; + else if (la->opcode > lb->opcode) + return 1; + return 0; +} +#endif + +/* + * Do one-time initialization of interpretation machinery. 
+ */ +static void +ExecInitInterpreter(void) +{ +#if defined(EEO_USE_COMPUTED_GOTO) + /* Set up externally-visible pointer to dispatch table */ + if (dispatch_table == NULL) + { + dispatch_table = (const void **) + DatumGetPointer(ExecInterpExpr(NULL, NULL, NULL)); + + /* build reverse lookup table */ + for (int i = 0; i < EEOP_LAST; i++) + { + reverse_dispatch_table[i].opcode = dispatch_table[i]; + reverse_dispatch_table[i].op = (ExprEvalOp) i; + } + + /* make it bsearch()able */ + qsort(reverse_dispatch_table, + EEOP_LAST /* nmembers */ , + sizeof(ExprEvalOpLookup), + dispatch_compare_ptr); + } +#endif +} + +/* + * Function to return the opcode of an expression step. + * + * When direct-threading is in use, ExprState->opcode isn't easily + * decipherable. This function returns the appropriate enum member. + */ +ExprEvalOp +ExecEvalStepOp(ExprState *state, ExprEvalStep *op) +{ +#if defined(EEO_USE_COMPUTED_GOTO) + if (state->flags & EEO_FLAG_DIRECT_THREADED) + { + ExprEvalOpLookup key; + ExprEvalOpLookup *res; + + key.opcode = (void *) op->opcode; + res = bsearch(&key, + reverse_dispatch_table, + EEOP_LAST /* nmembers */ , + sizeof(ExprEvalOpLookup), + dispatch_compare_ptr); + Assert(res); /* unknown ops shouldn't get looked up */ + return res->op; + } +#endif + return (ExprEvalOp) op->opcode; +} + + +/* + * Out-of-line helper functions for complex instructions. + */ + +/* + * Evaluate EEOP_FUNCEXPR_FUSAGE + */ +void +ExecEvalFuncExprFusage(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + PgStat_FunctionCallUsage fcusage; + Datum d; + + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + pgstat_end_function_usage(&fcusage, true); +} + +/* + * Evaluate EEOP_FUNCEXPR_STRICT_FUSAGE + */ +void +ExecEvalFuncExprStrictFusage(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + + FunctionCallInfo fcinfo = op->d.func.fcinfo_data; + PgStat_FunctionCallUsage fcusage; + NullableDatum *args = fcinfo->args; + int nargs = op->d.func.nargs; + Datum d; + + /* strict function, so check for NULL args */ + for (int argno = 0; argno < nargs; argno++) + { + if (args[argno].isnull) + { + *op->resnull = true; + return; + } + } + + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + d = op->d.func.fn_addr(fcinfo); + *op->resvalue = d; + *op->resnull = fcinfo->isnull; + + pgstat_end_function_usage(&fcusage, true); +} + +/* + * Evaluate a PARAM_EXEC parameter. + * + * PARAM_EXEC params (internal executor parameters) are stored in the + * ecxt_param_exec_vals array, and can be accessed by array index. + */ +void +ExecEvalParamExec(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ParamExecData *prm; + + prm = &(econtext->ecxt_param_exec_vals[op->d.param.paramid]); + if (unlikely(prm->execPlan != NULL)) + { + /* Parameter not evaluated yet, so go do it */ + ExecSetParamPlan(prm->execPlan, econtext); + /* ExecSetParamPlan should have processed this param... */ + Assert(prm->execPlan == NULL); + } + *op->resvalue = prm->value; + *op->resnull = prm->isnull; +} + +/* + * Evaluate a PARAM_EXTERN parameter. + * + * PARAM_EXTERN parameters must be sought in ecxt_param_list_info. 
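+ *
+ * For example (hypothetical object names), the $1 in
+ *     PREPARE q(int) AS SELECT * FROM tab WHERE col = $1;
+ * is a PARAM_EXTERN parameter; its value is supplied at EXECUTE time via
+ * the ParamListInfo hung on the expression context.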
+ */
+void
+ExecEvalParamExtern(ExprState *state, ExprEvalStep *op, ExprContext *econtext)
+{
+    ParamListInfo paramInfo = econtext->ecxt_param_list_info;
+    int paramId = op->d.param.paramid;
+
+    if (likely(paramInfo &&
+               paramId > 0 && paramId <= paramInfo->numParams))
+    {
+        ParamExternData *prm;
+        ParamExternData prmdata;
+
+        /* give hook a chance in case parameter is dynamic */
+        if (paramInfo->paramFetch != NULL)
+            prm = paramInfo->paramFetch(paramInfo, paramId, false, &prmdata);
+        else
+            prm = &paramInfo->params[paramId - 1];
+
+        if (likely(OidIsValid(prm->ptype)))
+        {
+            /* safety check in case hook did something unexpected */
+            if (unlikely(prm->ptype != op->d.param.paramtype))
+                ereport(ERROR,
+                        (errcode(ERRCODE_DATATYPE_MISMATCH),
+                         errmsg("type of parameter %d (%s) does not match that when preparing the plan (%s)",
+                                paramId,
+                                format_type_be(prm->ptype),
+                                format_type_be(op->d.param.paramtype))));
+            *op->resvalue = prm->value;
+            *op->resnull = prm->isnull;
+            return;
+        }
+    }
+
+    ereport(ERROR,
+            (errcode(ERRCODE_UNDEFINED_OBJECT),
+             errmsg("no value found for parameter %d", paramId)));
+}
+
+/*
+ * Evaluate a SQLValueFunction expression.
+ */
+void
+ExecEvalSQLValueFunction(ExprState *state, ExprEvalStep *op)
+{
+    LOCAL_FCINFO(fcinfo, 0);
+    SQLValueFunction *svf = op->d.sqlvaluefunction.svf;
+
+    *op->resnull = false;
+
+    /*
+     * Note: current_schema() can return NULL. current_user() etc currently
+     * cannot, but might as well code those cases the same way for safety.
+     */
+    switch (svf->op)
+    {
+        case SVFOP_CURRENT_DATE:
+            *op->resvalue = DateADTGetDatum(GetSQLCurrentDate());
+            break;
+        case SVFOP_CURRENT_TIME:
+        case SVFOP_CURRENT_TIME_N:
+            *op->resvalue = TimeTzADTPGetDatum(GetSQLCurrentTime(svf->typmod));
+            break;
+        case SVFOP_CURRENT_TIMESTAMP:
+        case SVFOP_CURRENT_TIMESTAMP_N:
+            *op->resvalue = TimestampTzGetDatum(GetSQLCurrentTimestamp(svf->typmod));
+            break;
+        case SVFOP_LOCALTIME:
+        case SVFOP_LOCALTIME_N:
+            *op->resvalue = TimeADTGetDatum(GetSQLLocalTime(svf->typmod));
+            break;
+        case SVFOP_LOCALTIMESTAMP:
+        case SVFOP_LOCALTIMESTAMP_N:
+            *op->resvalue = TimestampGetDatum(GetSQLLocalTimestamp(svf->typmod));
+            break;
+        case SVFOP_CURRENT_ROLE:
+        case SVFOP_CURRENT_USER:
+        case SVFOP_USER:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = current_user(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+        case SVFOP_SESSION_USER:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = session_user(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+        case SVFOP_CURRENT_CATALOG:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = current_database(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+        case SVFOP_CURRENT_SCHEMA:
+            InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL);
+            *op->resvalue = current_schema(fcinfo);
+            *op->resnull = fcinfo->isnull;
+            break;
+    }
+}
+
+/*
+ * Raise error if a CURRENT OF expression is evaluated.
+ *
+ * The planner should convert CURRENT OF into a TidScan qualification, or some
+ * other special handling in a ForeignScan node. So we have to be able to do
+ * ExecInitExpr on a CurrentOfExpr, but we shouldn't ever actually execute it.
+ * If we get here, we suppose we must be dealing with CURRENT OF on a foreign
+ * table whose FDW doesn't handle it, and complain accordingly.
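+ *
+ * Illustrative example (hypothetical names): an
+ *     UPDATE ft SET x = 0 WHERE CURRENT OF cur;
+ * against a foreign table whose FDW does not absorb the CURRENT OF
+ * qualification ends up raising the error below.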
+ */ +void +ExecEvalCurrentOfExpr(ExprState *state, ExprEvalStep *op) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("WHERE CURRENT OF is not supported for this table type"))); +} + +/* + * Evaluate NextValueExpr. + */ +void +ExecEvalNextValueExpr(ExprState *state, ExprEvalStep *op) +{ + int64 newval = nextval_internal(op->d.nextvalueexpr.seqid, false); + + switch (op->d.nextvalueexpr.seqtypid) + { + case INT2OID: + *op->resvalue = Int16GetDatum((int16) newval); + break; + case INT4OID: + *op->resvalue = Int32GetDatum((int32) newval); + break; + case INT8OID: + *op->resvalue = Int64GetDatum((int64) newval); + break; + default: + elog(ERROR, "unsupported sequence type %u", + op->d.nextvalueexpr.seqtypid); + } + *op->resnull = false; +} + +/* + * Evaluate NullTest / IS NULL for rows. + */ +void +ExecEvalRowNull(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ExecEvalRowNullInt(state, op, econtext, true); +} + +/* + * Evaluate NullTest / IS NOT NULL for rows. + */ +void +ExecEvalRowNotNull(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ExecEvalRowNullInt(state, op, econtext, false); +} + +/* Common code for IS [NOT] NULL on a row value */ +static void +ExecEvalRowNullInt(ExprState *state, ExprEvalStep *op, + ExprContext *econtext, bool checkisnull) +{ + Datum value = *op->resvalue; + bool isnull = *op->resnull; + HeapTupleHeader tuple; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + HeapTupleData tmptup; + + *op->resnull = false; + + /* NULL row variables are treated just as NULL scalar columns */ + if (isnull) + { + *op->resvalue = BoolGetDatum(checkisnull); + return; + } + + /* + * The SQL standard defines IS [NOT] NULL for a non-null rowtype argument + * as: + * + * "R IS NULL" is true if every field is the null value. + * + * "R IS NOT NULL" is true if no field is the null value. + * + * This definition is (apparently intentionally) not recursive; so our + * tests on the fields are primitive attisnull tests, not recursive checks + * to see if they are all-nulls or no-nulls rowtypes. + * + * The standard does not consider the possibility of zero-field rows, but + * here we consider them to vacuously satisfy both predicates. + */ + + tuple = DatumGetHeapTupleHeader(value); + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + + /* Lookup tupdesc if first time through or if type changes */ + tupDesc = get_cached_rowtype(tupType, tupTypmod, + &op->d.nulltest_row.rowcache, NULL); + + /* + * heap_attisnull needs a HeapTuple not a bare HeapTupleHeader. + */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + tmptup.t_data = tuple; + + for (int att = 1; att <= tupDesc->natts; att++) + { + /* ignore dropped columns */ + if (TupleDescAttr(tupDesc, att - 1)->attisdropped) + continue; + if (heap_attisnull(&tmptup, att, tupDesc)) + { + /* null field disproves IS NOT NULL */ + if (!checkisnull) + { + *op->resvalue = BoolGetDatum(false); + return; + } + } + else + { + /* non-null field disproves IS NULL */ + if (checkisnull) + { + *op->resvalue = BoolGetDatum(false); + return; + } + } + } + + *op->resvalue = BoolGetDatum(true); +} + +/* + * Evaluate an ARRAY[] expression. + * + * The individual array elements (or subarrays) have already been evaluated + * into op->d.arrayexpr.elemvalues[]/elemnulls[]. 
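+ *
+ * For example, ARRAY[1,2,3] is handled by the simple (non-multidims)
+ * branch below, while ARRAY[ARRAY[1,2],ARRAY[3,4]] takes the nested-array
+ * path that stitches the already-built sub-arrays into a 2-D result.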
+ */ +void +ExecEvalArrayExpr(ExprState *state, ExprEvalStep *op) +{ + ArrayType *result; + Oid element_type = op->d.arrayexpr.elemtype; + int nelems = op->d.arrayexpr.nelems; + int ndims = 0; + int dims[MAXDIM]; + int lbs[MAXDIM]; + + /* Set non-null as default */ + *op->resnull = false; + + if (!op->d.arrayexpr.multidims) + { + /* Elements are presumably of scalar type */ + Datum *dvalues = op->d.arrayexpr.elemvalues; + bool *dnulls = op->d.arrayexpr.elemnulls; + + /* setup for 1-D array of the given length */ + ndims = 1; + dims[0] = nelems; + lbs[0] = 1; + + result = construct_md_array(dvalues, dnulls, ndims, dims, lbs, + element_type, + op->d.arrayexpr.elemlength, + op->d.arrayexpr.elembyval, + op->d.arrayexpr.elemalign); + } + else + { + /* Must be nested array expressions */ + int nbytes = 0; + int nitems = 0; + int outer_nelems = 0; + int elem_ndims = 0; + int *elem_dims = NULL; + int *elem_lbs = NULL; + bool firstone = true; + bool havenulls = false; + bool haveempty = false; + char **subdata; + bits8 **subbitmaps; + int *subbytes; + int *subnitems; + int32 dataoffset; + char *dat; + int iitem; + + subdata = (char **) palloc(nelems * sizeof(char *)); + subbitmaps = (bits8 **) palloc(nelems * sizeof(bits8 *)); + subbytes = (int *) palloc(nelems * sizeof(int)); + subnitems = (int *) palloc(nelems * sizeof(int)); + + /* loop through and get data area from each element */ + for (int elemoff = 0; elemoff < nelems; elemoff++) + { + Datum arraydatum; + bool eisnull; + ArrayType *array; + int this_ndims; + + arraydatum = op->d.arrayexpr.elemvalues[elemoff]; + eisnull = op->d.arrayexpr.elemnulls[elemoff]; + + /* temporarily ignore null subarrays */ + if (eisnull) + { + haveempty = true; + continue; + } + + array = DatumGetArrayTypeP(arraydatum); + + /* run-time double-check on element type */ + if (element_type != ARR_ELEMTYPE(array)) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("cannot merge incompatible arrays"), + errdetail("Array with element type %s cannot be " + "included in ARRAY construct with element type %s.", + format_type_be(ARR_ELEMTYPE(array)), + format_type_be(element_type)))); + + this_ndims = ARR_NDIM(array); + /* temporarily ignore zero-dimensional subarrays */ + if (this_ndims <= 0) + { + haveempty = true; + continue; + } + + if (firstone) + { + /* Get sub-array details from first member */ + elem_ndims = this_ndims; + ndims = elem_ndims + 1; + if (ndims <= 0 || ndims > MAXDIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("number of array dimensions (%d) exceeds the maximum allowed (%d)", + ndims, MAXDIM))); + + elem_dims = (int *) palloc(elem_ndims * sizeof(int)); + memcpy(elem_dims, ARR_DIMS(array), elem_ndims * sizeof(int)); + elem_lbs = (int *) palloc(elem_ndims * sizeof(int)); + memcpy(elem_lbs, ARR_LBOUND(array), elem_ndims * sizeof(int)); + + firstone = false; + } + else + { + /* Check other sub-arrays are compatible */ + if (elem_ndims != this_ndims || + memcmp(elem_dims, ARR_DIMS(array), + elem_ndims * sizeof(int)) != 0 || + memcmp(elem_lbs, ARR_LBOUND(array), + elem_ndims * sizeof(int)) != 0) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("multidimensional arrays must have array " + "expressions with matching dimensions"))); + } + + subdata[outer_nelems] = ARR_DATA_PTR(array); + subbitmaps[outer_nelems] = ARR_NULLBITMAP(array); + subbytes[outer_nelems] = ARR_SIZE(array) - ARR_DATA_OFFSET(array); + nbytes += subbytes[outer_nelems]; + subnitems[outer_nelems] = ArrayGetNItems(this_ndims, + 
ARR_DIMS(array)); + nitems += subnitems[outer_nelems]; + havenulls |= ARR_HASNULL(array); + outer_nelems++; + } + + /* + * If all items were null or empty arrays, return an empty array; + * otherwise, if some were and some weren't, raise error. (Note: we + * must special-case this somehow to avoid trying to generate a 1-D + * array formed from empty arrays. It's not ideal...) + */ + if (haveempty) + { + if (ndims == 0) /* didn't find any nonempty array */ + { + *op->resvalue = PointerGetDatum(construct_empty_array(element_type)); + return; + } + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("multidimensional arrays must have array " + "expressions with matching dimensions"))); + } + + /* setup for multi-D array */ + dims[0] = outer_nelems; + lbs[0] = 1; + for (int i = 1; i < ndims; i++) + { + dims[i] = elem_dims[i - 1]; + lbs[i] = elem_lbs[i - 1]; + } + + /* check for subscript overflow */ + (void) ArrayGetNItems(ndims, dims); + ArrayCheckBounds(ndims, dims, lbs); + + if (havenulls) + { + dataoffset = ARR_OVERHEAD_WITHNULLS(ndims, nitems); + nbytes += dataoffset; + } + else + { + dataoffset = 0; /* marker for no null bitmap */ + nbytes += ARR_OVERHEAD_NONULLS(ndims); + } + + result = (ArrayType *) palloc(nbytes); + SET_VARSIZE(result, nbytes); + result->ndim = ndims; + result->dataoffset = dataoffset; + result->elemtype = element_type; + memcpy(ARR_DIMS(result), dims, ndims * sizeof(int)); + memcpy(ARR_LBOUND(result), lbs, ndims * sizeof(int)); + + dat = ARR_DATA_PTR(result); + iitem = 0; + for (int i = 0; i < outer_nelems; i++) + { + memcpy(dat, subdata[i], subbytes[i]); + dat += subbytes[i]; + if (havenulls) + array_bitmap_copy(ARR_NULLBITMAP(result), iitem, + subbitmaps[i], 0, + subnitems[i]); + iitem += subnitems[i]; + } + } + + *op->resvalue = PointerGetDatum(result); +} + +/* + * Evaluate an ArrayCoerceExpr expression. + * + * Source array is in step's result variable. + */ +void +ExecEvalArrayCoerce(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + Datum arraydatum; + + /* NULL array -> NULL result */ + if (*op->resnull) + return; + + arraydatum = *op->resvalue; + + /* + * If it's binary-compatible, modify the element type in the array header, + * but otherwise leave the array as we received it. + */ + if (op->d.arraycoerce.elemexprstate == NULL) + { + /* Detoast input array if necessary, and copy in any case */ + ArrayType *array = DatumGetArrayTypePCopy(arraydatum); + + ARR_ELEMTYPE(array) = op->d.arraycoerce.resultelemtype; + *op->resvalue = PointerGetDatum(array); + return; + } + + /* + * Use array_map to apply the sub-expression to each array element. + */ + *op->resvalue = array_map(arraydatum, + op->d.arraycoerce.elemexprstate, + econtext, + op->d.arraycoerce.resultelemtype, + op->d.arraycoerce.amstate); +} + +/* + * Evaluate a ROW() expression. + * + * The individual columns have already been evaluated into + * op->d.row.elemvalues[]/elemnulls[]. + */ +void +ExecEvalRow(ExprState *state, ExprEvalStep *op) +{ + HeapTuple tuple; + + /* build tuple from evaluated field values */ + tuple = heap_form_tuple(op->d.row.tupdesc, + op->d.row.elemvalues, + op->d.row.elemnulls); + + *op->resvalue = HeapTupleGetDatum(tuple); + *op->resnull = false; +} + +/* + * Evaluate GREATEST() or LEAST() expression (note this is *not* MIN()/MAX()). + * + * All of the to-be-compared expressions have already been evaluated into + * op->d.minmax.values[]/nulls[]. 
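+ *
+ * For example, GREATEST(1, NULL, 3) yields 3: NULL inputs are simply
+ * skipped, and the result is NULL only when every input is NULL.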
+ */ +void +ExecEvalMinMax(ExprState *state, ExprEvalStep *op) +{ + Datum *values = op->d.minmax.values; + bool *nulls = op->d.minmax.nulls; + FunctionCallInfo fcinfo = op->d.minmax.fcinfo_data; + MinMaxOp operator = op->d.minmax.op; + + /* set at initialization */ + Assert(fcinfo->args[0].isnull == false); + Assert(fcinfo->args[1].isnull == false); + + /* default to null result */ + *op->resnull = true; + + for (int off = 0; off < op->d.minmax.nelems; off++) + { + /* ignore NULL inputs */ + if (nulls[off]) + continue; + + if (*op->resnull) + { + /* first nonnull input, adopt value */ + *op->resvalue = values[off]; + *op->resnull = false; + } + else + { + int cmpresult; + + /* apply comparison function */ + fcinfo->args[0].value = *op->resvalue; + fcinfo->args[1].value = values[off]; + + fcinfo->isnull = false; + cmpresult = DatumGetInt32(FunctionCallInvoke(fcinfo)); + if (fcinfo->isnull) /* probably should not happen */ + continue; + + if (cmpresult > 0 && operator == IS_LEAST) + *op->resvalue = values[off]; + else if (cmpresult < 0 && operator == IS_GREATEST) + *op->resvalue = values[off]; + } + } +} + +/* + * Evaluate a FieldSelect node. + * + * Source record is in step's result variable. + */ +void +ExecEvalFieldSelect(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + AttrNumber fieldnum = op->d.fieldselect.fieldnum; + Datum tupDatum; + HeapTupleHeader tuple; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + Form_pg_attribute attr; + HeapTupleData tmptup; + + /* NULL record -> NULL result */ + if (*op->resnull) + return; + + tupDatum = *op->resvalue; + + /* We can special-case expanded records for speed */ + if (VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(tupDatum))) + { + ExpandedRecordHeader *erh = (ExpandedRecordHeader *) DatumGetEOHP(tupDatum); + + Assert(erh->er_magic == ER_MAGIC); + + /* Extract record's TupleDesc */ + tupDesc = expanded_record_get_tupdesc(erh); + + /* + * Find field's attr record. Note we don't support system columns + * here: a datum tuple doesn't have valid values for most of the + * interesting system columns anyway. + */ + if (fieldnum <= 0) /* should never happen */ + elog(ERROR, "unsupported reference to system column %d in FieldSelect", + fieldnum); + if (fieldnum > tupDesc->natts) /* should never happen */ + elog(ERROR, "attribute number %d exceeds number of columns %d", + fieldnum, tupDesc->natts); + attr = TupleDescAttr(tupDesc, fieldnum - 1); + + /* Check for dropped column, and force a NULL result if so */ + if (attr->attisdropped) + { + *op->resnull = true; + return; + } + + /* Check for type mismatch --- possible after ALTER COLUMN TYPE? */ + /* As in CheckVarSlotCompatibility, we should but can't check typmod */ + if (op->d.fieldselect.resulttype != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("attribute %d has wrong type", fieldnum), + errdetail("Table has type %s, but query expects %s.", + format_type_be(attr->atttypid), + format_type_be(op->d.fieldselect.resulttype)))); + + /* extract the field */ + *op->resvalue = expanded_record_get_field(erh, fieldnum, + op->resnull); + } + else + { + /* Get the composite datum and extract its type fields */ + tuple = DatumGetHeapTupleHeader(tupDatum); + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + + /* Lookup tupdesc if first time through or if type changes */ + tupDesc = get_cached_rowtype(tupType, tupTypmod, + &op->d.fieldselect.rowcache, NULL); + + /* + * Find field's attr record. 
Note we don't support system columns + * here: a datum tuple doesn't have valid values for most of the + * interesting system columns anyway. + */ + if (fieldnum <= 0) /* should never happen */ + elog(ERROR, "unsupported reference to system column %d in FieldSelect", + fieldnum); + if (fieldnum > tupDesc->natts) /* should never happen */ + elog(ERROR, "attribute number %d exceeds number of columns %d", + fieldnum, tupDesc->natts); + attr = TupleDescAttr(tupDesc, fieldnum - 1); + + /* Check for dropped column, and force a NULL result if so */ + if (attr->attisdropped) + { + *op->resnull = true; + return; + } + + /* Check for type mismatch --- possible after ALTER COLUMN TYPE? */ + /* As in CheckVarSlotCompatibility, we should but can't check typmod */ + if (op->d.fieldselect.resulttype != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("attribute %d has wrong type", fieldnum), + errdetail("Table has type %s, but query expects %s.", + format_type_be(attr->atttypid), + format_type_be(op->d.fieldselect.resulttype)))); + + /* heap_getattr needs a HeapTuple not a bare HeapTupleHeader */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + tmptup.t_data = tuple; + + /* extract the field */ + *op->resvalue = heap_getattr(&tmptup, + fieldnum, + tupDesc, + op->resnull); + } +} + +/* + * Deform source tuple, filling in the step's values/nulls arrays, before + * evaluating individual new values as part of a FieldStore expression. + * Subsequent steps will overwrite individual elements of the values/nulls + * arrays with the new field values, and then FIELDSTORE_FORM will build the + * new tuple value. + * + * Source record is in step's result variable. + */ +void +ExecEvalFieldStoreDeForm(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + TupleDesc tupDesc; + + /* Lookup tupdesc if first time through or if type changes */ + tupDesc = get_cached_rowtype(op->d.fieldstore.fstore->resulttype, -1, + op->d.fieldstore.rowcache, NULL); + + /* Check that current tupdesc doesn't have more fields than we allocated */ + if (unlikely(tupDesc->natts > op->d.fieldstore.ncolumns)) + elog(ERROR, "too many columns in composite type %u", + op->d.fieldstore.fstore->resulttype); + + if (*op->resnull) + { + /* Convert null input tuple into an all-nulls row */ + memset(op->d.fieldstore.nulls, true, + op->d.fieldstore.ncolumns * sizeof(bool)); + } + else + { + /* + * heap_deform_tuple needs a HeapTuple not a bare HeapTupleHeader. We + * set all the fields in the struct just in case. + */ + Datum tupDatum = *op->resvalue; + HeapTupleHeader tuphdr; + HeapTupleData tmptup; + + tuphdr = DatumGetHeapTupleHeader(tupDatum); + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuphdr); + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tuphdr; + + heap_deform_tuple(&tmptup, tupDesc, + op->d.fieldstore.values, + op->d.fieldstore.nulls); + } +} + +/* + * Compute the new composite datum after each individual field value of a + * FieldStore expression has been evaluated. 
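+ *
+ * FieldStore expressions arise from statements such as
+ *     UPDATE t SET composite_col.some_field = 42;
+ * (hypothetical names): the DEFORM step above filled values[]/nulls[],
+ * the per-field assignments overwrote selected entries, and this step
+ * re-forms the tuple.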
+ */ +void +ExecEvalFieldStoreForm(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + TupleDesc tupDesc; + HeapTuple tuple; + + /* Lookup tupdesc (should be valid already) */ + tupDesc = get_cached_rowtype(op->d.fieldstore.fstore->resulttype, -1, + op->d.fieldstore.rowcache, NULL); + + tuple = heap_form_tuple(tupDesc, + op->d.fieldstore.values, + op->d.fieldstore.nulls); + + *op->resvalue = HeapTupleGetDatum(tuple); + *op->resnull = false; +} + +/* + * Evaluate a rowtype coercion operation. + * This may require rearranging field positions. + * + * Source record is in step's result variable. + */ +void +ExecEvalConvertRowtype(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + HeapTuple result; + Datum tupDatum; + HeapTupleHeader tuple; + HeapTupleData tmptup; + TupleDesc indesc, + outdesc; + bool changed = false; + + /* NULL in -> NULL out */ + if (*op->resnull) + return; + + tupDatum = *op->resvalue; + tuple = DatumGetHeapTupleHeader(tupDatum); + + /* + * Lookup tupdescs if first time through or if type changes. We'd better + * pin them since type conversion functions could do catalog lookups and + * hence cause cache invalidation. + */ + indesc = get_cached_rowtype(op->d.convert_rowtype.inputtype, -1, + op->d.convert_rowtype.incache, + &changed); + IncrTupleDescRefCount(indesc); + outdesc = get_cached_rowtype(op->d.convert_rowtype.outputtype, -1, + op->d.convert_rowtype.outcache, + &changed); + IncrTupleDescRefCount(outdesc); + + /* + * We used to be able to assert that incoming tuples are marked with + * exactly the rowtype of indesc. However, now that ExecEvalWholeRowVar + * might change the tuples' marking to plain RECORD due to inserting + * aliases, we can only make this weak test: + */ + Assert(HeapTupleHeaderGetTypeId(tuple) == indesc->tdtypeid || + HeapTupleHeaderGetTypeId(tuple) == RECORDOID); + + /* if first time through, or after change, initialize conversion map */ + if (changed) + { + MemoryContext old_cxt; + + /* allocate map in long-lived memory context */ + old_cxt = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* prepare map from old to new attribute numbers */ + op->d.convert_rowtype.map = convert_tuples_by_name(indesc, outdesc); + + MemoryContextSwitchTo(old_cxt); + } + + /* Following steps need a HeapTuple not a bare HeapTupleHeader */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + tmptup.t_data = tuple; + + if (op->d.convert_rowtype.map != NULL) + { + /* Full conversion with attribute rearrangement needed */ + result = execute_attr_map_tuple(&tmptup, op->d.convert_rowtype.map); + /* Result already has appropriate composite-datum header fields */ + *op->resvalue = HeapTupleGetDatum(result); + } + else + { + /* + * The tuple is physically compatible as-is, but we need to insert the + * destination rowtype OID in its composite-datum header field, so we + * have to copy it anyway. heap_copy_tuple_as_datum() is convenient + * for this since it will both make the physical copy and insert the + * correct composite header fields. Note that we aren't expecting to + * have to flatten any toasted fields: the input was a composite + * datum, so it shouldn't contain any. So heap_copy_tuple_as_datum() + * is overkill here, but its check for external fields is cheap. + */ + *op->resvalue = heap_copy_tuple_as_datum(&tmptup, outdesc); + } + + DecrTupleDescRefCount(indesc); + DecrTupleDescRefCount(outdesc); +} + +/* + * Evaluate "scalar op ANY/ALL (array)". 
+ * + * Source array is in our result area, scalar arg is already evaluated into + * fcinfo->args[0]. + * + * The operator always yields boolean, and we combine the results across all + * array elements using OR and AND (for ANY and ALL respectively). Of course + * we short-circuit as soon as the result is known. + */ +void +ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) +{ + FunctionCallInfo fcinfo = op->d.scalararrayop.fcinfo_data; + bool useOr = op->d.scalararrayop.useOr; + bool strictfunc = op->d.scalararrayop.finfo->fn_strict; + ArrayType *arr; + int nitems; + Datum result; + bool resultnull; + int16 typlen; + bool typbyval; + char typalign; + char *s; + bits8 *bitmap; + int bitmask; + + /* + * If the array is NULL then we return NULL --- it's not very meaningful + * to do anything else, even if the operator isn't strict. + */ + if (*op->resnull) + return; + + /* Else okay to fetch and detoast the array */ + arr = DatumGetArrayTypeP(*op->resvalue); + + /* + * If the array is empty, we return either FALSE or TRUE per the useOr + * flag. This is correct even if the scalar is NULL; since we would + * evaluate the operator zero times, it matters not whether it would want + * to return NULL. + */ + nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + if (nitems <= 0) + { + *op->resvalue = BoolGetDatum(!useOr); + *op->resnull = false; + return; + } + + /* + * If the scalar is NULL, and the function is strict, return NULL; no + * point in iterating the loop. + */ + if (fcinfo->args[0].isnull && strictfunc) + { + *op->resnull = true; + return; + } + + /* + * We arrange to look up info about the element type only once per series + * of calls, assuming the element type doesn't change underneath us. + */ + if (op->d.scalararrayop.element_type != ARR_ELEMTYPE(arr)) + { + get_typlenbyvalalign(ARR_ELEMTYPE(arr), + &op->d.scalararrayop.typlen, + &op->d.scalararrayop.typbyval, + &op->d.scalararrayop.typalign); + op->d.scalararrayop.element_type = ARR_ELEMTYPE(arr); + } + + typlen = op->d.scalararrayop.typlen; + typbyval = op->d.scalararrayop.typbyval; + typalign = op->d.scalararrayop.typalign; + + /* Initialize result appropriately depending on useOr */ + result = BoolGetDatum(!useOr); + resultnull = false; + + /* Loop over the array elements */ + s = (char *) ARR_DATA_PTR(arr); + bitmap = ARR_NULLBITMAP(arr); + bitmask = 1; + + for (int i = 0; i < nitems; i++) + { + Datum elt; + Datum thisresult; + + /* Get array element, checking for NULL */ + if (bitmap && (*bitmap & bitmask) == 0) + { + fcinfo->args[1].value = (Datum) 0; + fcinfo->args[1].isnull = true; + } + else + { + elt = fetch_att(s, typbyval, typlen); + s = att_addlength_pointer(s, typlen, s); + s = (char *) att_align_nominal(s, typalign); + fcinfo->args[1].value = elt; + fcinfo->args[1].isnull = false; + } + + /* Call comparison function */ + if (fcinfo->args[1].isnull && strictfunc) + { + fcinfo->isnull = true; + thisresult = (Datum) 0; + } + else + { + fcinfo->isnull = false; + thisresult = op->d.scalararrayop.fn_addr(fcinfo); + } + + /* Combine results per OR or AND semantics */ + if (fcinfo->isnull) + resultnull = true; + else if (useOr) + { + if (DatumGetBool(thisresult)) + { + result = BoolGetDatum(true); + resultnull = false; + break; /* needn't look at any more elements */ + } + } + else + { + if (!DatumGetBool(thisresult)) + { + result = BoolGetDatum(false); + resultnull = false; + break; /* needn't look at any more elements */ + } + } + + /* advance bitmap pointer if any */ + if (bitmap) + { + bitmask <<= 1; + if 
(bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + + *op->resvalue = result; + *op->resnull = resultnull; +} + +/* + * Hash function for scalar array hash op elements. + * + * We use the element type's default hash opclass, and the column collation + * if the type is collation-sensitive. + */ +static uint32 +saop_element_hash(struct saophash_hash *tb, Datum key) +{ + ScalarArrayOpExprHashTable *elements_tab = (ScalarArrayOpExprHashTable *) tb->private_data; + FunctionCallInfo fcinfo = elements_tab->op->d.hashedscalararrayop.hash_fcinfo_data; + Datum hash; + + fcinfo->args[0].value = key; + fcinfo->args[0].isnull = false; + + hash = elements_tab->op->d.hashedscalararrayop.hash_fn_addr(fcinfo); + + return DatumGetUInt32(hash); +} + +/* + * Matching function for scalar array hash op elements, to be used in hashtable + * lookups. + */ +static bool +saop_hash_element_match(struct saophash_hash *tb, Datum key1, Datum key2) +{ + Datum result; + + ScalarArrayOpExprHashTable *elements_tab = (ScalarArrayOpExprHashTable *) tb->private_data; + FunctionCallInfo fcinfo = elements_tab->op->d.hashedscalararrayop.fcinfo_data; + + fcinfo->args[0].value = key1; + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = key2; + fcinfo->args[1].isnull = false; + + result = elements_tab->op->d.hashedscalararrayop.fn_addr(fcinfo); + + return DatumGetBool(result); +} + +/* + * Evaluate "scalar op ANY (const array)". + * + * Similar to ExecEvalScalarArrayOp, but optimized for faster repeat lookups + * by building a hashtable on the first lookup. This hashtable will be reused + * by subsequent lookups. Unlike ExecEvalScalarArrayOp, this version only + * supports OR semantics. + * + * Source array is in our result area, scalar arg is already evaluated into + * fcinfo->args[0]. + * + * The operator always yields boolean. + */ +void +ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + ScalarArrayOpExprHashTable *elements_tab = op->d.hashedscalararrayop.elements_tab; + FunctionCallInfo fcinfo = op->d.hashedscalararrayop.fcinfo_data; + bool strictfunc = op->d.hashedscalararrayop.finfo->fn_strict; + Datum scalar = fcinfo->args[0].value; + bool scalar_isnull = fcinfo->args[0].isnull; + Datum result; + bool resultnull; + bool hashfound; + + /* We don't setup a hashed scalar array op if the array const is null. */ + Assert(!*op->resnull); + + /* + * If the scalar is NULL, and the function is strict, return NULL; no + * point in executing the search. + */ + if (fcinfo->args[0].isnull && strictfunc) + { + *op->resnull = true; + return; + } + + /* Build the hash table on first evaluation */ + if (elements_tab == NULL) + { + int16 typlen; + bool typbyval; + char typalign; + int nitems; + bool has_nulls = false; + char *s; + bits8 *bitmap; + int bitmask; + MemoryContext oldcontext; + ArrayType *arr; + + arr = DatumGetArrayTypeP(*op->resvalue); + nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + + get_typlenbyvalalign(ARR_ELEMTYPE(arr), + &typlen, + &typbyval, + &typalign); + + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + elements_tab = (ScalarArrayOpExprHashTable *) + palloc(sizeof(ScalarArrayOpExprHashTable)); + op->d.hashedscalararrayop.elements_tab = elements_tab; + elements_tab->op = op; + + /* + * Create the hash table sizing it according to the number of elements + * in the array. This does assume that the array has no duplicates. 
+ * If the array happens to contain many duplicate values then it'll + * just mean that we sized the table a bit on the large side. + */ + elements_tab->hashtab = saophash_create(CurrentMemoryContext, nitems, + elements_tab); + + MemoryContextSwitchTo(oldcontext); + + s = (char *) ARR_DATA_PTR(arr); + bitmap = ARR_NULLBITMAP(arr); + bitmask = 1; + for (int i = 0; i < nitems; i++) + { + /* Get array element, checking for NULL. */ + if (bitmap && (*bitmap & bitmask) == 0) + { + has_nulls = true; + } + else + { + Datum element; + + element = fetch_att(s, typbyval, typlen); + s = att_addlength_pointer(s, typlen, s); + s = (char *) att_align_nominal(s, typalign); + + saophash_insert(elements_tab->hashtab, element, &hashfound); + } + + /* Advance bitmap pointer if any. */ + if (bitmap) + { + bitmask <<= 1; + if (bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + + /* + * Remember if we had any nulls so that we know if we need to execute + * non-strict functions with a null lhs value if no match is found. + */ + op->d.hashedscalararrayop.has_nulls = has_nulls; + } + + /* Check the hash to see if we have a match. */ + hashfound = NULL != saophash_lookup(elements_tab->hashtab, scalar); + + result = BoolGetDatum(hashfound); + resultnull = false; + + /* + * If we didn't find a match in the array, we still might need to handle + * the possibility of null values. We didn't put any NULLs into the + * hashtable, but instead marked if we found any when building the table + * in has_nulls. + */ + if (!DatumGetBool(result) && op->d.hashedscalararrayop.has_nulls) + { + if (strictfunc) + { + + /* + * We have nulls in the array so a non-null lhs and no match must + * yield NULL. + */ + result = (Datum) 0; + resultnull = true; + } + else + { + /* + * Execute function will null rhs just once. + * + * The hash lookup path will have scribbled on the lhs argument so + * we need to set it up also (even though we entered this function + * with it already set). + */ + fcinfo->args[0].value = scalar; + fcinfo->args[0].isnull = scalar_isnull; + fcinfo->args[1].value = (Datum) 0; + fcinfo->args[1].isnull = true; + + result = op->d.hashedscalararrayop.fn_addr(fcinfo); + resultnull = fcinfo->isnull; + } + } + + *op->resvalue = result; + *op->resnull = resultnull; +} + +/* + * Evaluate a NOT NULL domain constraint. + */ +void +ExecEvalConstraintNotNull(ExprState *state, ExprEvalStep *op) +{ + if (*op->resnull) + ereport(ERROR, + (errcode(ERRCODE_NOT_NULL_VIOLATION), + errmsg("domain %s does not allow null values", + format_type_be(op->d.domaincheck.resulttype)), + errdatatype(op->d.domaincheck.resulttype))); +} + +/* + * Evaluate a CHECK domain constraint. + */ +void +ExecEvalConstraintCheck(ExprState *state, ExprEvalStep *op) +{ + if (!*op->d.domaincheck.checknull && + !DatumGetBool(*op->d.domaincheck.checkvalue)) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("value for domain %s violates check constraint \"%s\"", + format_type_be(op->d.domaincheck.resulttype), + op->d.domaincheck.constraintname), + errdomainconstraint(op->d.domaincheck.resulttype, + op->d.domaincheck.constraintname))); +} + +/* + * Evaluate the various forms of XmlExpr. + * + * Arguments have been evaluated into named_argvalue/named_argnull + * and/or argvalue/argnull arrays. 
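+ *
+ * For example, XMLCONCAT('<a/>'::xml, NULL, '<b/>'::xml) concatenates only
+ * the non-NULL arguments (yielding <a/><b/>), because the IS_XMLCONCAT
+ * case below skips any argument whose argnull[] flag is set.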
+ */
+void
+ExecEvalXmlExpr(ExprState *state, ExprEvalStep *op)
+{
+    XmlExpr *xexpr = op->d.xmlexpr.xexpr;
+    Datum value;
+
+    *op->resnull = true;        /* until we get a result */
+    *op->resvalue = (Datum) 0;
+
+    switch (xexpr->op)
+    {
+        case IS_XMLCONCAT:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+                List *values = NIL;
+
+                for (int i = 0; i < list_length(xexpr->args); i++)
+                {
+                    if (!argnull[i])
+                        values = lappend(values, DatumGetPointer(argvalue[i]));
+                }
+
+                if (values != NIL)
+                {
+                    *op->resvalue = PointerGetDatum(xmlconcat(values));
+                    *op->resnull = false;
+                }
+            }
+            break;
+
+        case IS_XMLFOREST:
+            {
+                Datum *argvalue = op->d.xmlexpr.named_argvalue;
+                bool *argnull = op->d.xmlexpr.named_argnull;
+                StringInfoData buf;
+                ListCell *lc;
+                ListCell *lc2;
+                int i;
+
+                initStringInfo(&buf);
+
+                i = 0;
+                forboth(lc, xexpr->named_args, lc2, xexpr->arg_names)
+                {
+                    Expr *e = (Expr *) lfirst(lc);
+                    char *argname = strVal(lfirst(lc2));
+
+                    if (!argnull[i])
+                    {
+                        value = argvalue[i];
+                        appendStringInfo(&buf, "<%s>%s</%s>",
+                                         argname,
+                                         map_sql_value_to_xml_value(value,
+                                                                    exprType((Node *) e), true),
+                                         argname);
+                        *op->resnull = false;
+                    }
+                    i++;
+                }
+
+                if (!*op->resnull)
+                {
+                    text *result;
+
+                    result = cstring_to_text_with_len(buf.data, buf.len);
+                    *op->resvalue = PointerGetDatum(result);
+                }
+
+                pfree(buf.data);
+            }
+            break;
+
+        case IS_XMLELEMENT:
+            *op->resvalue = PointerGetDatum(xmlelement(xexpr,
+                                                       op->d.xmlexpr.named_argvalue,
+                                                       op->d.xmlexpr.named_argnull,
+                                                       op->d.xmlexpr.argvalue,
+                                                       op->d.xmlexpr.argnull));
+            *op->resnull = false;
+            break;
+
+        case IS_XMLPARSE:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+                text *data;
+                bool preserve_whitespace;
+
+                /* arguments are known to be text, bool */
+                Assert(list_length(xexpr->args) == 2);
+
+                if (argnull[0])
+                    return;
+                value = argvalue[0];
+                data = DatumGetTextPP(value);
+
+                if (argnull[1])    /* probably can't happen */
+                    return;
+                value = argvalue[1];
+                preserve_whitespace = DatumGetBool(value);
+
+                *op->resvalue = PointerGetDatum(xmlparse(data,
+                                                         xexpr->xmloption,
+                                                         preserve_whitespace));
+                *op->resnull = false;
+            }
+            break;
+
+        case IS_XMLPI:
+            {
+                text *arg;
+                bool isnull;
+
+                /* optional argument is known to be text */
+                Assert(list_length(xexpr->args) <= 1);
+
+                if (xexpr->args)
+                {
+                    isnull = op->d.xmlexpr.argnull[0];
+                    if (isnull)
+                        arg = NULL;
+                    else
+                        arg = DatumGetTextPP(op->d.xmlexpr.argvalue[0]);
+                }
+                else
+                {
+                    arg = NULL;
+                    isnull = false;
+                }
+
+                *op->resvalue = PointerGetDatum(xmlpi(xexpr->name,
+                                                      arg,
+                                                      isnull,
+                                                      op->resnull));
+            }
+            break;
+
+        case IS_XMLROOT:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+                xmltype *data;
+                text *version;
+                int standalone;
+
+                /* arguments are known to be xml, text, int */
+                Assert(list_length(xexpr->args) == 3);
+
+                if (argnull[0])
+                    return;
+                data = DatumGetXmlP(argvalue[0]);
+
+                if (argnull[1])
+                    version = NULL;
+                else
+                    version = DatumGetTextPP(argvalue[1]);
+
+                Assert(!argnull[2]);    /* always present */
+                standalone = DatumGetInt32(argvalue[2]);
+
+                *op->resvalue = PointerGetDatum(xmlroot(data,
+                                                        version,
+                                                        standalone));
+                *op->resnull = false;
+            }
+            break;
+
+        case IS_XMLSERIALIZE:
+            {
+                Datum *argvalue = op->d.xmlexpr.argvalue;
+                bool *argnull = op->d.xmlexpr.argnull;
+
+                /* argument type is known to be xml */
+                Assert(list_length(xexpr->args) == 1);
+
+                if (argnull[0])
+                    return;
+                value = argvalue[0];
+
+                *op->resvalue =
PointerGetDatum(xmltotext_with_xmloption(DatumGetXmlP(value), + xexpr->xmloption)); + *op->resnull = false; + } + break; + + case IS_DOCUMENT: + { + Datum *argvalue = op->d.xmlexpr.argvalue; + bool *argnull = op->d.xmlexpr.argnull; + + /* optional argument is known to be xml */ + Assert(list_length(xexpr->args) == 1); + + if (argnull[0]) + return; + value = argvalue[0]; + + *op->resvalue = + BoolGetDatum(xml_is_document(DatumGetXmlP(value))); + *op->resnull = false; + } + break; + + default: + elog(ERROR, "unrecognized XML operation"); + break; + } +} + +/* + * ExecEvalGroupingFunc + * + * Computes a bitmask with a bit for each (unevaluated) argument expression + * (rightmost arg is least significant bit). + * + * A bit is set if the corresponding expression is NOT part of the set of + * grouping expressions in the current grouping set. + */ +void +ExecEvalGroupingFunc(ExprState *state, ExprEvalStep *op) +{ + AggState *aggstate = castNode(AggState, state->parent); + int result = 0; + Bitmapset *grouped_cols = aggstate->grouped_cols; + ListCell *lc; + + foreach(lc, op->d.grouping_func.clauses) + { + int attnum = lfirst_int(lc); + + result <<= 1; + + if (!bms_is_member(attnum, grouped_cols)) + result |= 1; + } + + *op->resvalue = Int32GetDatum(result); + *op->resnull = false; +} + +/* + * Hand off evaluation of a subplan to nodeSubplan.c + */ +void +ExecEvalSubPlan(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + SubPlanState *sstate = op->d.subplan.sstate; + + /* could potentially be nested, so make sure there's enough stack */ + check_stack_depth(); + + *op->resvalue = ExecSubPlan(sstate, econtext, op->resnull); +} + +/* + * Evaluate a wholerow Var expression. + * + * Returns a Datum whose value is the value of a whole-row range variable + * with respect to given expression context. + */ +void +ExecEvalWholeRowVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + Var *variable = op->d.wholerow.var; + TupleTableSlot *slot; + TupleDesc output_tupdesc; + MemoryContext oldcontext; + HeapTupleHeader dtuple; + HeapTuple tuple; + + /* This was checked by ExecInitExpr */ + Assert(variable->varattno == InvalidAttrNumber); + + /* Get the input slot we want */ + switch (variable->varno) + { + case INNER_VAR: + /* get the tuple from the inner node */ + slot = econtext->ecxt_innertuple; + break; + + case OUTER_VAR: + /* get the tuple from the outer node */ + slot = econtext->ecxt_outertuple; + break; + + /* INDEX_VAR is handled by default case */ + + default: + /* get the tuple from the relation being scanned */ + slot = econtext->ecxt_scantuple; + break; + } + + /* Apply the junkfilter if any */ + if (op->d.wholerow.junkFilter != NULL) + slot = ExecFilterJunk(op->d.wholerow.junkFilter, slot); + + /* + * If first time through, obtain tuple descriptor and check compatibility. + * + * XXX: It'd be great if this could be moved to the expression + * initialization phase, but due to using slots that's currently not + * feasible. + */ + if (op->d.wholerow.first) + { + /* optimistically assume we don't need slow path */ + op->d.wholerow.slow = false; + + /* + * If the Var identifies a named composite type, we must check that + * the actual tuple type is compatible with it. + */ + if (variable->vartype != RECORDOID) + { + TupleDesc var_tupdesc; + TupleDesc slot_tupdesc; + + /* + * We really only care about numbers of attributes and data types. 
+ * Also, we can ignore type mismatch on columns that are dropped + * in the destination type, so long as (1) the physical storage + * matches or (2) the actual column value is NULL. Case (1) is + * helpful in some cases involving out-of-date cached plans, while + * case (2) is expected behavior in situations such as an INSERT + * into a table with dropped columns (the planner typically + * generates an INT4 NULL regardless of the dropped column type). + * If we find a dropped column and cannot verify that case (1) + * holds, we have to use the slow path to check (2) for each row. + * + * If vartype is a domain over composite, just look through that + * to the base composite type. + */ + var_tupdesc = lookup_rowtype_tupdesc_domain(variable->vartype, + -1, false); + + slot_tupdesc = slot->tts_tupleDescriptor; + + if (var_tupdesc->natts != slot_tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail_plural("Table row contains %d attribute, but query expects %d.", + "Table row contains %d attributes, but query expects %d.", + slot_tupdesc->natts, + slot_tupdesc->natts, + var_tupdesc->natts))); + + for (int i = 0; i < var_tupdesc->natts; i++) + { + Form_pg_attribute vattr = TupleDescAttr(var_tupdesc, i); + Form_pg_attribute sattr = TupleDescAttr(slot_tupdesc, i); + + if (vattr->atttypid == sattr->atttypid) + continue; /* no worries */ + if (!vattr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Table has type %s at ordinal position %d, but query expects %s.", + format_type_be(sattr->atttypid), + i + 1, + format_type_be(vattr->atttypid)))); + + if (vattr->attlen != sattr->attlen || + vattr->attalign != sattr->attalign) + op->d.wholerow.slow = true; /* need to check for nulls */ + } + + /* + * Use the variable's declared rowtype as the descriptor for the + * output values. In particular, we *must* absorb any + * attisdropped markings. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + output_tupdesc = CreateTupleDescCopy(var_tupdesc); + MemoryContextSwitchTo(oldcontext); + + ReleaseTupleDesc(var_tupdesc); + } + else + { + /* + * In the RECORD case, we use the input slot's rowtype as the + * descriptor for the output values, modulo possibly assigning new + * column names below. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + output_tupdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor); + MemoryContextSwitchTo(oldcontext); + + /* + * It's possible that the input slot is a relation scan slot and + * so is marked with that relation's rowtype. But we're supposed + * to be returning RECORD, so reset to that. + */ + output_tupdesc->tdtypeid = RECORDOID; + output_tupdesc->tdtypmod = -1; + + /* + * We already got the correct physical datatype info above, but + * now we should try to find the source RTE and adopt its column + * aliases, since it's unlikely that the input slot has the + * desired names. + * + * If we can't locate the RTE, assume the column names we've got + * are OK. (As of this writing, the only cases where we can't + * locate the RTE are in execution of trigger WHEN clauses, and + * then the Var will have the trigger's relation's rowtype, so its + * names are fine.) Also, if the creator of the RTE didn't bother + * to fill in an eref field, assume our column names are OK. (This + * happens in COPY, and perhaps other places.) 
+ */ + if (econtext->ecxt_estate && + variable->varno <= econtext->ecxt_estate->es_range_table_size) + { + RangeTblEntry *rte = exec_rt_fetch(variable->varno, + econtext->ecxt_estate); + + if (rte->eref) + ExecTypeSetColNames(output_tupdesc, rte->eref->colnames); + } + } + + /* Bless the tupdesc if needed, and save it in the execution state */ + op->d.wholerow.tupdesc = BlessTupleDesc(output_tupdesc); + + op->d.wholerow.first = false; + } + + /* + * Make sure all columns of the slot are accessible in the slot's + * Datum/isnull arrays. + */ + slot_getallattrs(slot); + + if (op->d.wholerow.slow) + { + /* Check to see if any dropped attributes are non-null */ + TupleDesc tupleDesc = slot->tts_tupleDescriptor; + TupleDesc var_tupdesc = op->d.wholerow.tupdesc; + + Assert(var_tupdesc->natts == tupleDesc->natts); + + for (int i = 0; i < var_tupdesc->natts; i++) + { + Form_pg_attribute vattr = TupleDescAttr(var_tupdesc, i); + Form_pg_attribute sattr = TupleDescAttr(tupleDesc, i); + + if (!vattr->attisdropped) + continue; /* already checked non-dropped cols */ + if (slot->tts_isnull[i]) + continue; /* null is always okay */ + if (vattr->attlen != sattr->attlen || + vattr->attalign != sattr->attalign) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Physical storage mismatch on dropped attribute at ordinal position %d.", + i + 1))); + } + } + + /* + * Build a composite datum, making sure any toasted fields get detoasted. + * + * (Note: it is critical that we not change the slot's state here.) + */ + tuple = toast_build_flattened_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + dtuple = tuple->t_data; + + /* + * Label the datum with the composite type info we identified before. + * + * (Note: we could skip doing this by passing op->d.wholerow.tupdesc to + * the tuple build step; but that seems a tad risky so let's not.) + */ + HeapTupleHeaderSetTypeId(dtuple, op->d.wholerow.tupdesc->tdtypeid); + HeapTupleHeaderSetTypMod(dtuple, op->d.wholerow.tupdesc->tdtypmod); + + *op->resvalue = PointerGetDatum(dtuple); + *op->resnull = false; +} + +void +ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext, + TupleTableSlot *slot) +{ + Datum d; + + /* slot_getsysattr has sufficient defenses against bad attnums */ + d = slot_getsysattr(slot, + op->d.var.attnum, + op->resnull); + *op->resvalue = d; + /* this ought to be unreachable, but it's cheap enough to check */ + if (unlikely(*op->resnull)) + elog(ERROR, "failed to fetch attribute from slot"); +} + +/* + * Transition value has not been initialized. This is the first non-NULL input + * value for a group. We use it as the initial value for transValue. + */ +void +ExecAggInitGroup(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroup, + ExprContext *aggcontext) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + + /* + * We must copy the datum into aggcontext if it is pass-by-ref. We do not + * need to pfree the old transValue, since it's NULL. (We already checked + * that the agg's input type is binary-compatible with its transtype, so + * straight copy here is OK.) 
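+ *
+ * For instance, an aggregate such as max(), whose transition function is
+ * strict and which has no initcond, reaches this function for the first
+ * non-NULL input of each group; that value is datumCopy'd to become the
+ * starting transition value.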
+ */ + oldContext = MemoryContextSwitchTo(aggcontext->ecxt_per_tuple_memory); + pergroup->transValue = datumCopy(fcinfo->args[1].value, + pertrans->transtypeByVal, + pertrans->transtypeLen); + pergroup->transValueIsNull = false; + pergroup->noTransValue = false; + MemoryContextSwitchTo(oldContext); +} + +/* + * Ensure that the current transition value is a child of the aggcontext, + * rather than the per-tuple context. + * + * NB: This can change the current memory context. + */ +Datum +ExecAggTransReparent(AggState *aggstate, AggStatePerTrans pertrans, + Datum newValue, bool newValueIsNull, + Datum oldValue, bool oldValueIsNull) +{ + Assert(newValue != oldValue); + + if (!newValueIsNull) + { + MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory); + if (DatumIsReadWriteExpandedObject(newValue, + false, + pertrans->transtypeLen) && + MemoryContextGetParent(DatumGetEOHP(newValue)->eoh_context) == CurrentMemoryContext) + /* do nothing */ ; + else + newValue = datumCopy(newValue, + pertrans->transtypeByVal, + pertrans->transtypeLen); + } + else + { + /* + * Ensure that AggStatePerGroup->transValue ends up being 0, so + * callers can safely compare newValue/oldValue without having to + * check their respective nullness. + */ + newValue = (Datum) 0; + } + + if (!oldValueIsNull) + { + if (DatumIsReadWriteExpandedObject(oldValue, + false, + pertrans->transtypeLen)) + DeleteExpandedObject(oldValue); + else + pfree(DatumGetPointer(oldValue)); + } + + return newValue; +} + +/* + * Invoke ordered transition function, with a datum argument. + */ +void +ExecEvalAggOrderedTransDatum(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + int setno = op->d.agg_trans.setno; + + tuplesort_putdatum(pertrans->sortstates[setno], + *op->resvalue, *op->resnull); +} + +/* + * Invoke ordered transition function, with a tuple argument. + */ +void +ExecEvalAggOrderedTransTuple(ExprState *state, ExprEvalStep *op, + ExprContext *econtext) +{ + AggStatePerTrans pertrans = op->d.agg_trans.pertrans; + int setno = op->d.agg_trans.setno; + + ExecClearTuple(pertrans->sortslot); + pertrans->sortslot->tts_nvalid = pertrans->numInputs; + ExecStoreVirtualTuple(pertrans->sortslot); + tuplesort_puttupleslot(pertrans->sortstates[setno], pertrans->sortslot); +} + +/* implementation of transition function invocation for byval types */ +static pg_attribute_always_inline void +ExecAggPlainTransByVal(AggState *aggstate, AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, int setno) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + Datum newVal; + + /* cf. 
select_current_set() */ + aggstate->curaggcontext = aggcontext; + aggstate->current_set = setno; + + /* set up aggstate->curpertrans for AggGetAggref() */ + aggstate->curpertrans = pertrans; + + /* invoke transition function in per-tuple context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + fcinfo->args[0].value = pergroup->transValue; + fcinfo->args[0].isnull = pergroup->transValueIsNull; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + newVal = FunctionCallInvoke(fcinfo); + + pergroup->transValue = newVal; + pergroup->transValueIsNull = fcinfo->isnull; + + MemoryContextSwitchTo(oldContext); +} + +/* implementation of transition function invocation for byref types */ +static pg_attribute_always_inline void +ExecAggPlainTransByRef(AggState *aggstate, AggStatePerTrans pertrans, + AggStatePerGroup pergroup, + ExprContext *aggcontext, int setno) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + Datum newVal; + + /* cf. select_current_set() */ + aggstate->curaggcontext = aggcontext; + aggstate->current_set = setno; + + /* set up aggstate->curpertrans for AggGetAggref() */ + aggstate->curpertrans = pertrans; + + /* invoke transition function in per-tuple context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + fcinfo->args[0].value = pergroup->transValue; + fcinfo->args[0].isnull = pergroup->transValueIsNull; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + newVal = FunctionCallInvoke(fcinfo); + + /* + * For pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. Also, if transfn returned a + * pointer to a R/W expanded object that is already a child of the + * aggcontext, assume we can adopt that value without copying it. + * + * It's safe to compare newVal with pergroup->transValue without regard + * for either being NULL, because ExecAggTransReparent() takes care to set + * transValue to 0 when NULL. Otherwise we could end up accidentally not + * reparenting, when the transValue has the same numerical value as + * newValue, despite being NULL. This is a somewhat hot path, making it + * undesirable to instead solve this with another branch for the common + * case of the transition function returning its (modified) input + * argument. 
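+	 *
+	 * To make the "returned a pointer to its first input" case concrete,
+	 * here is a hypothetical pass-by-ref transition function (not one that
+	 * exists in core; assume a STRICT aggregate over text, so neither
+	 * argument is NULL here).  If it hands back the very same pointer it
+	 * was given, the pointer test below skips the copy; otherwise
+	 * ExecAggTransReparent() copies the returned value into the aggcontext
+	 * and frees the old one:
+	 *
+	 *		Datum
+	 *		longest_text_transfn(PG_FUNCTION_ARGS)
+	 *		{
+	 *			text   *state = PG_GETARG_TEXT_PP(0);
+	 *			text   *value = PG_GETARG_TEXT_PP(1);
+	 *
+	 *			if (VARSIZE_ANY_EXHDR(state) >= VARSIZE_ANY_EXHDR(value))
+	 *				PG_RETURN_TEXT_P(state);
+	 *			PG_RETURN_TEXT_P(value);
+	 *		}
+	 *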
+ */ + if (DatumGetPointer(newVal) != DatumGetPointer(pergroup->transValue)) + newVal = ExecAggTransReparent(aggstate, pertrans, + newVal, fcinfo->isnull, + pergroup->transValue, + pergroup->transValueIsNull); + + pergroup->transValue = newVal; + pergroup->transValueIsNull = fcinfo->isnull; + + MemoryContextSwitchTo(oldContext); +} diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c new file mode 100644 index 0000000..c11427a --- /dev/null +++ b/src/backend/executor/execGrouping.c @@ -0,0 +1,560 @@ +/*------------------------------------------------------------------------- + * + * execGrouping.c + * executor utility routines for grouping, hashing, and aggregation + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execGrouping.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/parallel.h" +#include "common/hashfn.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2); +static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb, + const MinimalTuple tuple); +static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable, + TupleTableSlot *slot, + bool *isnew, uint32 hash); + +/* + * Define parameters for tuple hash table code generation. The interface is + * *also* declared in execnodes.h (to generate the types, which are externally + * visible). + */ +#define SH_PREFIX tuplehash +#define SH_ELEMENT_TYPE TupleHashEntryData +#define SH_KEY_TYPE MinimalTuple +#define SH_KEY firstTuple +#define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key) +#define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0 +#define SH_SCOPE extern +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + + +/***************************************************************************** + * Utility routines for grouping tuples together + *****************************************************************************/ + +/* + * execTuplesMatchPrepare + * Build expression that can be evaluated using ExecQual(), returning + * whether an ExprContext's inner/outer tuples are NOT DISTINCT + */ +ExprState * +execTuplesMatchPrepare(TupleDesc desc, + int numCols, + const AttrNumber *keyColIdx, + const Oid *eqOperators, + const Oid *collations, + PlanState *parent) +{ + Oid *eqFunctions = (Oid *) palloc(numCols * sizeof(Oid)); + int i; + ExprState *expr; + + if (numCols == 0) + return NULL; + + /* lookup equality functions */ + for (i = 0; i < numCols; i++) + eqFunctions[i] = get_opcode(eqOperators[i]); + + /* build actual expression */ + expr = ExecBuildGroupingEqual(desc, desc, NULL, NULL, + numCols, keyColIdx, eqFunctions, collations, + parent); + + return expr; +} + +/* + * execTuplesHashPrepare + * Look up the equality and hashing functions needed for a TupleHashTable. + * + * This is similar to execTuplesMatchPrepare, but we also need to find the + * hash functions associated with the equality operators. *eqFunctions and + * *hashFunctions receive the palloc'd result arrays. + * + * Note: we expect that the given operators are not cross-type comparisons. 
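+ *
+ * As a rough usage sketch (hypothetical caller: dupOperators, dupCollations,
+ * keyColIdx, 'node' and the memory contexts are assumed to come from the
+ * plan node's state), the two prepare steps typically feed the hash table
+ * constructor below:
+ *
+ *		Oid		   *eqfuncoids;
+ *		FmgrInfo   *hashfunctions;
+ *
+ *		execTuplesHashPrepare(numCols, dupOperators,
+ *							  &eqfuncoids, &hashfunctions);
+ *		hashtable = BuildTupleHashTableExt(&node->ps, inputDesc,
+ *										   numCols, keyColIdx,
+ *										   eqfuncoids, hashfunctions,
+ *										   dupCollations, nbuckets, 0,
+ *										   metacxt, tablecxt, tempcxt,
+ *										   false);
+ *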
+ */ +void +execTuplesHashPrepare(int numCols, + const Oid *eqOperators, + Oid **eqFuncOids, + FmgrInfo **hashFunctions) +{ + int i; + + *eqFuncOids = (Oid *) palloc(numCols * sizeof(Oid)); + *hashFunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo)); + + for (i = 0; i < numCols; i++) + { + Oid eq_opr = eqOperators[i]; + Oid eq_function; + Oid left_hash_function; + Oid right_hash_function; + + eq_function = get_opcode(eq_opr); + if (!get_op_hash_functions(eq_opr, + &left_hash_function, &right_hash_function)) + elog(ERROR, "could not find hash function for hash operator %u", + eq_opr); + /* We're not supporting cross-type cases here */ + Assert(left_hash_function == right_hash_function); + (*eqFuncOids)[i] = eq_function; + fmgr_info(right_hash_function, &(*hashFunctions)[i]); + } +} + + +/***************************************************************************** + * Utility routines for all-in-memory hash tables + * + * These routines build hash tables for grouping tuples together (eg, for + * hash aggregation). There is one entry for each not-distinct set of tuples + * presented. + *****************************************************************************/ + +/* + * Construct an empty TupleHashTable + * + * numCols, keyColIdx: identify the tuple fields to use as lookup key + * eqfunctions: equality comparison functions to use + * hashfunctions: datatype-specific hashing functions to use + * nbuckets: initial estimate of hashtable size + * additionalsize: size of data stored in ->additional + * metacxt: memory context for long-lived allocation, but not per-entry data + * tablecxt: memory context in which to store table entries + * tempcxt: short-lived context for evaluation hash and comparison functions + * + * The function arrays may be made with execTuplesHashPrepare(). Note they + * are not cross-type functions, but expect to see the table datatype(s) + * on both sides. + * + * Note that keyColIdx, eqfunctions, and hashfunctions must be allocated in + * storage that will live as long as the hashtable does. + */ +TupleHashTable +BuildTupleHashTableExt(PlanState *parent, + TupleDesc inputDesc, + int numCols, AttrNumber *keyColIdx, + const Oid *eqfuncoids, + FmgrInfo *hashfunctions, + Oid *collations, + long nbuckets, Size additionalsize, + MemoryContext metacxt, + MemoryContext tablecxt, + MemoryContext tempcxt, + bool use_variable_hash_iv) +{ + TupleHashTable hashtable; + Size entrysize = sizeof(TupleHashEntryData) + additionalsize; + Size hash_mem_limit; + MemoryContext oldcontext; + bool allow_jit; + + Assert(nbuckets > 0); + + /* Limit initial table size request to not more than hash_mem */ + hash_mem_limit = get_hash_memory_limit() / entrysize; + if (nbuckets > hash_mem_limit) + nbuckets = hash_mem_limit; + + oldcontext = MemoryContextSwitchTo(metacxt); + + hashtable = (TupleHashTable) palloc(sizeof(TupleHashTableData)); + + hashtable->numCols = numCols; + hashtable->keyColIdx = keyColIdx; + hashtable->tab_hash_funcs = hashfunctions; + hashtable->tab_collations = collations; + hashtable->tablecxt = tablecxt; + hashtable->tempcxt = tempcxt; + hashtable->entrysize = entrysize; + hashtable->tableslot = NULL; /* will be made on first lookup */ + hashtable->inputslot = NULL; + hashtable->in_hash_funcs = NULL; + hashtable->cur_eq_func = NULL; + + /* + * If parallelism is in use, even if the leader backend is performing the + * scan itself, we don't want to create the hashtable exactly the same way + * in all workers. 
As hashtables are iterated over in keyspace-order, + * doing so in all processes in the same way is likely to lead to + * "unbalanced" hashtables when the table size initially is + * underestimated. + */ + if (use_variable_hash_iv) + hashtable->hash_iv = murmurhash32(ParallelWorkerNumber); + else + hashtable->hash_iv = 0; + + hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable); + + /* + * We copy the input tuple descriptor just for safety --- we assume all + * input tuples will have equivalent descriptors. + */ + hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc), + &TTSOpsMinimalTuple); + + /* + * If the old reset interface is used (i.e. BuildTupleHashTable, rather + * than BuildTupleHashTableExt), allowing JIT would lead to the generated + * functions to a) live longer than the query b) be re-generated each time + * the table is being reset. Therefore prevent JIT from being used in that + * case, by not providing a parent node (which prevents accessing the + * JitContext in the EState). + */ + allow_jit = metacxt != tablecxt; + + /* build comparator for all columns */ + /* XXX: should we support non-minimal tuples for the inputslot? */ + hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc, + &TTSOpsMinimalTuple, &TTSOpsMinimalTuple, + numCols, + keyColIdx, eqfuncoids, collations, + allow_jit ? parent : NULL); + + /* + * While not pretty, it's ok to not shut down this context, but instead + * rely on the containing memory context being reset, as + * ExecBuildGroupingEqual() only builds a very simple expression calling + * functions (i.e. nothing that'd employ RegisterExprContextCallback()). + */ + hashtable->exprcontext = CreateStandaloneExprContext(); + + MemoryContextSwitchTo(oldcontext); + + return hashtable; +} + +/* + * BuildTupleHashTable is a backwards-compatibilty wrapper for + * BuildTupleHashTableExt(), that allocates the hashtable's metadata in + * tablecxt. Note that hashtables created this way cannot be reset leak-free + * with ResetTupleHashTable(). + */ +TupleHashTable +BuildTupleHashTable(PlanState *parent, + TupleDesc inputDesc, + int numCols, AttrNumber *keyColIdx, + const Oid *eqfuncoids, + FmgrInfo *hashfunctions, + Oid *collations, + long nbuckets, Size additionalsize, + MemoryContext tablecxt, + MemoryContext tempcxt, + bool use_variable_hash_iv) +{ + return BuildTupleHashTableExt(parent, + inputDesc, + numCols, keyColIdx, + eqfuncoids, + hashfunctions, + collations, + nbuckets, additionalsize, + tablecxt, + tablecxt, + tempcxt, + use_variable_hash_iv); +} + +/* + * Reset contents of the hashtable to be empty, preserving all the non-content + * state. Note that the tablecxt passed to BuildTupleHashTableExt() should + * also be reset, otherwise there will be leaks. + */ +void +ResetTupleHashTable(TupleHashTable hashtable) +{ + tuplehash_reset(hashtable->hashtab); +} + +/* + * Find or create a hashtable entry for the tuple group containing the + * given tuple. The tuple must be the same type as the hashtable entries. + * + * If isnew is NULL, we do not create new entries; we return NULL if no + * match is found. + * + * If hash is not NULL, we set it to the calculated hash value. This allows + * callers access to the hash value even if no entry is returned. + * + * If isnew isn't NULL, then a new entry is created if no existing entry + * matches. On return, *isnew is true if the entry is newly created, + * false if it existed already. ->additional_data in the new entry has + * been zeroed. 
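+ *
+ * Rough caller sketch; the per-group struct name is invented for the
+ * example, and real callers size it via 'additionalsize' when building
+ * the table:
+ *
+ *		entry = LookupTupleHashEntry(hashtable, slot, &isnew, NULL);
+ *		if (isnew)
+ *			entry->additional =
+ *				MemoryContextAllocZero(hashtable->tablecxt,
+ *									   sizeof(MyPerGroupData));
+ *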
+ */ +TupleHashEntry +LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, + bool *isnew, uint32 *hash) +{ + TupleHashEntry entry; + MemoryContext oldContext; + uint32 local_hash; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + /* set up data needed by hash and match functions */ + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + hashtable->cur_eq_func = hashtable->tab_eq_func; + + local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL); + entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash); + + if (hash != NULL) + *hash = local_hash; + + Assert(entry == NULL || entry->hash == local_hash); + + MemoryContextSwitchTo(oldContext); + + return entry; +} + +/* + * Compute the hash value for a tuple + */ +uint32 +TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot) +{ + MemoryContext oldContext; + uint32 hash; + + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + hash = TupleHashTableHash_internal(hashtable->hashtab, NULL); + + MemoryContextSwitchTo(oldContext); + + return hash; +} + +/* + * A variant of LookupTupleHashEntry for callers that have already computed + * the hash value. + */ +TupleHashEntry +LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot, + bool *isnew, uint32 hash) +{ + TupleHashEntry entry; + MemoryContext oldContext; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + /* set up data needed by hash and match functions */ + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashtable->tab_hash_funcs; + hashtable->cur_eq_func = hashtable->tab_eq_func; + + entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash); + Assert(entry == NULL || entry->hash == hash); + + MemoryContextSwitchTo(oldContext); + + return entry; +} + +/* + * Search for a hashtable entry matching the given tuple. No entry is + * created if there's not a match. This is similar to the non-creating + * case of LookupTupleHashEntry, except that it supports cross-type + * comparisons, in which the given tuple is not of the same type as the + * table entries. The caller must provide the hash functions to use for + * the input tuple, as well as the equality functions, since these may be + * different from the table's internal functions. + */ +TupleHashEntry +FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, + ExprState *eqcomp, + FmgrInfo *hashfunctions) +{ + TupleHashEntry entry; + MemoryContext oldContext; + MinimalTuple key; + + /* Need to run the hash functions in short-lived context */ + oldContext = MemoryContextSwitchTo(hashtable->tempcxt); + + /* Set up data needed by hash and match functions */ + hashtable->inputslot = slot; + hashtable->in_hash_funcs = hashfunctions; + hashtable->cur_eq_func = eqcomp; + + /* Search the hash table */ + key = NULL; /* flag to reference inputslot */ + entry = tuplehash_lookup(hashtable->hashtab, key); + MemoryContextSwitchTo(oldContext); + + return entry; +} + +/* + * If tuple is NULL, use the input slot instead. This convention avoids the + * need to materialize virtual input tuples unless they actually need to get + * copied into the table. 
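+ *
+ * (Usage note for the lookup routines above, as a rough sketch: a caller
+ * that asks LookupTupleHashEntry() for the computed hash can feed it back
+ * to LookupTupleHashEntryHash() later to avoid recomputing it, e.g. when a
+ * tuple is stashed somewhere and looked up again afterwards:
+ *
+ *		entry = LookupTupleHashEntry(hashtable, slot, &isnew, &hash);
+ *		... remember 'hash' alongside the stashed tuple ...
+ *		entry = LookupTupleHashEntryHash(hashtable, slot, &isnew, hash);
+ *
+ * This is the sort of pattern hash aggregation uses for spilled tuples.)
+ *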
+ * + * Also, the caller must select an appropriate memory context for running + * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.) + */ +static uint32 +TupleHashTableHash_internal(struct tuplehash_hash *tb, + const MinimalTuple tuple) +{ + TupleHashTable hashtable = (TupleHashTable) tb->private_data; + int numCols = hashtable->numCols; + AttrNumber *keyColIdx = hashtable->keyColIdx; + uint32 hashkey = hashtable->hash_iv; + TupleTableSlot *slot; + FmgrInfo *hashfunctions; + int i; + + if (tuple == NULL) + { + /* Process the current input tuple for the table */ + slot = hashtable->inputslot; + hashfunctions = hashtable->in_hash_funcs; + } + else + { + /* + * Process a tuple already stored in the table. + * + * (this case never actually occurs due to the way simplehash.h is + * used, as the hash-value is stored in the entries) + */ + slot = hashtable->tableslot; + ExecStoreMinimalTuple(tuple, slot, false); + hashfunctions = hashtable->tab_hash_funcs; + } + + for (i = 0; i < numCols; i++) + { + AttrNumber att = keyColIdx[i]; + Datum attr; + bool isNull; + + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + attr = slot_getattr(slot, att, &isNull); + + if (!isNull) /* treat nulls as having hash key 0 */ + { + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], + hashtable->tab_collations[i], + attr)); + hashkey ^= hkey; + } + } + + /* + * The way hashes are combined above, among each other and with the IV, + * doesn't lead to good bit perturbation. As the IV's goal is to lead to + * achieve that, perform a round of hashing of the combined hash - + * resulting in near perfect perturbation. + */ + return murmurhash32(hashkey); +} + +/* + * Does the work of LookupTupleHashEntry and LookupTupleHashEntryHash. Useful + * so that we can avoid switching the memory context multiple times for + * LookupTupleHashEntry. + * + * NB: This function may or may not change the memory context. Caller is + * expected to change it back. + */ +static inline TupleHashEntry +LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, + bool *isnew, uint32 hash) +{ + TupleHashEntryData *entry; + bool found; + MinimalTuple key; + + key = NULL; /* flag to reference inputslot */ + + if (isnew) + { + entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found); + + if (found) + { + /* found pre-existing entry */ + *isnew = false; + } + else + { + /* created new entry */ + *isnew = true; + /* zero caller data */ + entry->additional = NULL; + MemoryContextSwitchTo(hashtable->tablecxt); + /* Copy the first tuple into the table context */ + entry->firstTuple = ExecCopySlotMinimalTuple(slot); + } + } + else + { + entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash); + } + + return entry; +} + +/* + * See whether two tuples (presumably of the same hash value) match + */ +static int +TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2) +{ + TupleTableSlot *slot1; + TupleTableSlot *slot2; + TupleHashTable hashtable = (TupleHashTable) tb->private_data; + ExprContext *econtext = hashtable->exprcontext; + + /* + * We assume that simplehash.h will only ever call us with the first + * argument being an actual table entry, and the second argument being + * LookupTupleHashEntry's dummy TupleHashEntryData. The other direction + * could be supported too, but is not currently required. 
+ */ + Assert(tuple1 != NULL); + slot1 = hashtable->tableslot; + ExecStoreMinimalTuple(tuple1, slot1, false); + Assert(tuple2 == NULL); + slot2 = hashtable->inputslot; + + /* For crosstype comparisons, the inputslot must be first */ + econtext->ecxt_innertuple = slot2; + econtext->ecxt_outertuple = slot1; + return !ExecQualAndReset(hashtable->cur_eq_func, econtext); +} diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c new file mode 100644 index 0000000..74becdc --- /dev/null +++ b/src/backend/executor/execIndexing.c @@ -0,0 +1,921 @@ +/*------------------------------------------------------------------------- + * + * execIndexing.c + * routines for inserting index tuples and enforcing unique and + * exclusion constraints. + * + * ExecInsertIndexTuples() is the main entry point. It's called after + * inserting a tuple to the heap, and it inserts corresponding index tuples + * into all indexes. At the same time, it enforces any unique and + * exclusion constraints: + * + * Unique Indexes + * -------------- + * + * Enforcing a unique constraint is straightforward. When the index AM + * inserts the tuple to the index, it also checks that there are no + * conflicting tuples in the index already. It does so atomically, so that + * even if two backends try to insert the same key concurrently, only one + * of them will succeed. All the logic to ensure atomicity, and to wait + * for in-progress transactions to finish, is handled by the index AM. + * + * If a unique constraint is deferred, we request the index AM to not + * throw an error if a conflict is found. Instead, we make note that there + * was a conflict and return the list of indexes with conflicts to the + * caller. The caller must re-check them later, by calling index_insert() + * with the UNIQUE_CHECK_EXISTING option. + * + * Exclusion Constraints + * --------------------- + * + * Exclusion constraints are different from unique indexes in that when the + * tuple is inserted to the index, the index AM does not check for + * duplicate keys at the same time. After the insertion, we perform a + * separate scan on the index to check for conflicting tuples, and if one + * is found, we throw an error and the transaction is aborted. If the + * conflicting tuple's inserter or deleter is in-progress, we wait for it + * to finish first. + * + * There is a chance of deadlock, if two backends insert a tuple at the + * same time, and then perform the scan to check for conflicts. They will + * find each other's tuple, and both try to wait for each other. The + * deadlock detector will detect that, and abort one of the transactions. + * That's fairly harmless, as one of them was bound to abort with a + * "duplicate key error" anyway, although you get a different error + * message. + * + * If an exclusion constraint is deferred, we still perform the conflict + * checking scan immediately after inserting the index tuple. But instead + * of throwing an error if a conflict is found, we return that information + * to the caller. The caller must re-check them later by calling + * check_exclusion_constraint(). + * + * Speculative insertion + * --------------------- + * + * Speculative insertion is a two-phase mechanism used to implement + * INSERT ... ON CONFLICT DO UPDATE/NOTHING. The tuple is first inserted + * to the heap and update the indexes as usual, but if a constraint is + * violated, we can still back out the insertion without aborting the whole + * transaction. In an INSERT ... 
ON CONFLICT statement, if a conflict is + * detected, the inserted tuple is backed out and the ON CONFLICT action is + * executed instead. + * + * Insertion to a unique index works as usual: the index AM checks for + * duplicate keys atomically with the insertion. But instead of throwing + * an error on a conflict, the speculatively inserted heap tuple is backed + * out. + * + * Exclusion constraints are slightly more complicated. As mentioned + * earlier, there is a risk of deadlock when two backends insert the same + * key concurrently. That was not a problem for regular insertions, when + * one of the transactions has to be aborted anyway, but with a speculative + * insertion we cannot let a deadlock happen, because we only want to back + * out the speculatively inserted tuple on conflict, not abort the whole + * transaction. + * + * When a backend detects that the speculative insertion conflicts with + * another in-progress tuple, it has two options: + * + * 1. back out the speculatively inserted tuple, then wait for the other + * transaction, and retry. Or, + * 2. wait for the other transaction, with the speculatively inserted tuple + * still in place. + * + * If two backends insert at the same time, and both try to wait for each + * other, they will deadlock. So option 2 is not acceptable. Option 1 + * avoids the deadlock, but it is prone to a livelock instead. Both + * transactions will wake up immediately as the other transaction backs + * out. Then they both retry, and conflict with each other again, lather, + * rinse, repeat. + * + * To avoid the livelock, one of the backends must back out first, and then + * wait, while the other one waits without backing out. It doesn't matter + * which one backs out, so we employ an arbitrary rule that the transaction + * with the higher XID backs out. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execIndexing.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/index.h" +#include "executor/executor.h" +#include "nodes/nodeFuncs.h" +#include "storage/lmgr.h" +#include "utils/snapmgr.h" + +/* waitMode argument to check_exclusion_or_unique_constraint() */ +typedef enum +{ + CEOUC_WAIT, + CEOUC_NOWAIT, + CEOUC_LIVELOCK_PREVENTING_WAIT +} CEOUC_WAIT_MODE; + +static bool check_exclusion_or_unique_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex, + CEOUC_WAIT_MODE waitMode, + bool errorOK, + ItemPointer conflictTid); + +static bool index_recheck_constraint(Relation index, Oid *constr_procs, + Datum *existing_values, bool *existing_isnull, + Datum *new_values); + +/* ---------------------------------------------------------------- + * ExecOpenIndices + * + * Find the indices associated with a result relation, open them, + * and save information about them in the result ResultRelInfo. + * + * At entry, caller has already opened and locked + * resultRelInfo->ri_RelationDesc. 
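+ *
+ *		As a rough caller sketch (names such as 'onconflict' stand in for
+ *		the caller's own state), the usual lifecycle around an insertion is:
+ *
+ *			ExecOpenIndices(resultRelInfo, onconflict != ONCONFLICT_NONE);
+ *			... insert the heap tuple, e.g. via table_tuple_insert() ...
+ *			recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot,
+ *												   estate, false, false,
+ *												   NULL, NIL);
+ *			...
+ *			ExecCloseIndices(resultRelInfo);
+ *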
+ * ---------------------------------------------------------------- + */ +void +ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative) +{ + Relation resultRelation = resultRelInfo->ri_RelationDesc; + List *indexoidlist; + ListCell *l; + int len, + i; + RelationPtr relationDescs; + IndexInfo **indexInfoArray; + + resultRelInfo->ri_NumIndices = 0; + + /* fast path if no indexes */ + if (!RelationGetForm(resultRelation)->relhasindex) + return; + + /* + * Get cached list of index OIDs + */ + indexoidlist = RelationGetIndexList(resultRelation); + len = list_length(indexoidlist); + if (len == 0) + return; + + /* + * allocate space for result arrays + */ + relationDescs = (RelationPtr) palloc(len * sizeof(Relation)); + indexInfoArray = (IndexInfo **) palloc(len * sizeof(IndexInfo *)); + + resultRelInfo->ri_NumIndices = len; + resultRelInfo->ri_IndexRelationDescs = relationDescs; + resultRelInfo->ri_IndexRelationInfo = indexInfoArray; + + /* + * For each index, open the index relation and save pg_index info. We + * acquire RowExclusiveLock, signifying we will update the index. + * + * Note: we do this even if the index is not indisready; it's not worth + * the trouble to optimize for the case where it isn't. + */ + i = 0; + foreach(l, indexoidlist) + { + Oid indexOid = lfirst_oid(l); + Relation indexDesc; + IndexInfo *ii; + + indexDesc = index_open(indexOid, RowExclusiveLock); + + /* extract index key information from the index's pg_index info */ + ii = BuildIndexInfo(indexDesc); + + /* + * If the indexes are to be used for speculative insertion, add extra + * information required by unique index entries. + */ + if (speculative && ii->ii_Unique) + BuildSpeculativeIndexInfo(indexDesc, ii); + + relationDescs[i] = indexDesc; + indexInfoArray[i] = ii; + i++; + } + + list_free(indexoidlist); +} + +/* ---------------------------------------------------------------- + * ExecCloseIndices + * + * Close the index relations stored in resultRelInfo + * ---------------------------------------------------------------- + */ +void +ExecCloseIndices(ResultRelInfo *resultRelInfo) +{ + int i; + int numIndices; + RelationPtr indexDescs; + + numIndices = resultRelInfo->ri_NumIndices; + indexDescs = resultRelInfo->ri_IndexRelationDescs; + + for (i = 0; i < numIndices; i++) + { + if (indexDescs[i] == NULL) + continue; /* shouldn't happen? */ + + /* Drop lock acquired by ExecOpenIndices */ + index_close(indexDescs[i], RowExclusiveLock); + } + + /* + * XXX should free indexInfo array here too? Currently we assume that + * such stuff will be cleaned up automatically in FreeExecutorState. + */ +} + +/* ---------------------------------------------------------------- + * ExecInsertIndexTuples + * + * This routine takes care of inserting index tuples + * into all the relations indexing the result relation + * when a heap tuple is inserted into the result relation. + * + * When 'update' is true, executor is performing an UPDATE + * that could not use an optimization like heapam's HOT (in + * more general terms a call to table_tuple_update() took + * place and set 'update_indexes' to true). Receiving this + * hint makes us consider if we should pass down the + * 'indexUnchanged' hint in turn. That's something that we + * figure out for each index_insert() call iff 'update' is + * true. (When 'update' is false we already know not to pass + * the hint to any index.) + * + * Unique and exclusion constraints are enforced at the same + * time. 
This returns a list of index OIDs for any unique or + * exclusion constraints that are deferred and that had + * potential (unconfirmed) conflicts. (if noDupErr == true, + * the same is done for non-deferred constraints, but report + * if conflict was speculative or deferred conflict to caller) + * + * If 'arbiterIndexes' is nonempty, noDupErr applies only to + * those indexes. NIL means noDupErr applies to all indexes. + * ---------------------------------------------------------------- + */ +List * +ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate, + bool update, + bool noDupErr, + bool *specConflict, + List *arbiterIndexes) +{ + ItemPointer tupleid = &slot->tts_tid; + List *result = NIL; + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + + Assert(ItemPointerIsValid(tupleid)); + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool applyNoDupErr; + IndexUniqueCheck checkUnique; + bool indexUnchanged; + bool satisfiesConstraint; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Check whether to apply noDupErr to this index */ + applyNoDupErr = noDupErr && + (arbiterIndexes == NIL || + list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)); + + /* + * The index AM does the actual insertion, plus uniqueness checking. + * + * For an immediate-mode unique index, we just tell the index AM to + * throw error if not unique. + * + * For a deferrable unique index, we tell the index AM to just detect + * possible non-uniqueness, and we add the index OID to the result + * list if further checking is needed. + * + * For a speculative insertion (used by INSERT ... ON CONFLICT), do + * the same as for a deferrable unique index. 
+ */ + if (!indexRelation->rd_index->indisunique) + checkUnique = UNIQUE_CHECK_NO; + else if (applyNoDupErr) + checkUnique = UNIQUE_CHECK_PARTIAL; + else if (indexRelation->rd_index->indimmediate) + checkUnique = UNIQUE_CHECK_YES; + else + checkUnique = UNIQUE_CHECK_PARTIAL; + + /* + * There's definitely going to be an index_insert() call for this + * index. If we're being called as part of an UPDATE statement, + * consider if the 'indexUnchanged' = true hint should be passed. + * + * XXX We always assume that the hint should be passed for an UPDATE. + * This is a workaround for a bug in PostgreSQL 14. In practice this + * won't make much difference for current users of the hint. + */ + indexUnchanged = update; + + satisfiesConstraint = + index_insert(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexUnchanged, /* UPDATE without logical change? */ + indexInfo); /* index AM may need this */ + + /* + * If the index has an associated exclusion constraint, check that. + * This is simpler than the process for uniqueness checks since we + * always insert first and then check. If the constraint is deferred, + * we check now anyway, but don't throw error on violation or wait for + * a conclusive outcome from a concurrent insertion; instead we'll + * queue a recheck event. Similarly, noDupErr callers (speculative + * inserters) will recheck later, and wait for a conclusive outcome + * then. + * + * An index for an exclusion constraint can't also be UNIQUE (not an + * essential property, we just don't allow it in the grammar), so no + * need to preserve the prior state of satisfiesConstraint. + */ + if (indexInfo->ii_ExclusionOps != NULL) + { + bool violationOK; + CEOUC_WAIT_MODE waitMode; + + if (applyNoDupErr) + { + violationOK = true; + waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT; + } + else if (!indexRelation->rd_index->indimmediate) + { + violationOK = true; + waitMode = CEOUC_NOWAIT; + } + else + { + violationOK = false; + waitMode = CEOUC_WAIT; + } + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, + indexRelation, indexInfo, + tupleid, values, isnull, + estate, false, + waitMode, violationOK, NULL); + } + + if ((checkUnique == UNIQUE_CHECK_PARTIAL || + indexInfo->ii_ExclusionOps != NULL) && + !satisfiesConstraint) + { + /* + * The tuple potentially violates the uniqueness or exclusion + * constraint, so make a note of the index so that we can re-check + * it later. Speculative inserters are told if there was a + * speculative conflict, since that always requires a restart. + */ + result = lappend_oid(result, RelationGetRelid(indexRelation)); + if (indexRelation->rd_index->indimmediate && specConflict) + *specConflict = true; + } + } + + return result; +} + +/* ---------------------------------------------------------------- + * ExecCheckIndexConstraints + * + * This routine checks if a tuple violates any unique or + * exclusion constraints. Returns true if there is no conflict. + * Otherwise returns false, and the TID of the conflicting + * tuple is returned in *conflictTid. + * + * If 'arbiterIndexes' is given, only those indexes are checked. + * NIL means all indexes. + * + * Note that this doesn't lock the values in any way, so it's + * possible that a conflicting tuple is inserted immediately + * after this returns. But this can be used for a pre-check + * before insertion. 
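+ *
+ *		Rough sketch of the ON CONFLICT pre-check as a caller might use it
+ *		(the DO NOTHING / DO UPDATE handling is only indicated in outline):
+ *
+ *			ItemPointerData conflictTid;
+ *
+ *			if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate,
+ *										   &conflictTid, arbiterIndexes))
+ *			{
+ *				... DO NOTHING: skip the insertion entirely ...
+ *				... DO UPDATE: lock and update the tuple at conflictTid ...
+ *			}
+ *			... otherwise proceed with the (speculative) insertion ...
+ *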
+ * ---------------------------------------------------------------- + */ +bool +ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate, ItemPointer conflictTid, + List *arbiterIndexes) +{ + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ItemPointerData invalidItemPtr; + bool checkedIndex = false; + + ItemPointerSetInvalid(conflictTid); + ItemPointerSetInvalid(&invalidItemPtr); + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * For each index, form index tuple and check if it satisfies the + * constraint. + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool satisfiesConstraint; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + if (!indexInfo->ii_Unique && !indexInfo->ii_ExclusionOps) + continue; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* When specific arbiter indexes requested, only examine them */ + if (arbiterIndexes != NIL && + !list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)) + continue; + + if (!indexRelation->rd_index->indimmediate) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("ON CONFLICT does not support deferrable unique constraints/exclusion constraints as arbiters"), + errtableconstraint(heapRelation, + RelationGetRelationName(indexRelation)))); + + checkedIndex = true; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. 
+ */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, indexRelation, + indexInfo, &invalidItemPtr, + values, isnull, estate, false, + CEOUC_WAIT, true, + conflictTid); + if (!satisfiesConstraint) + return false; + } + + if (arbiterIndexes != NIL && !checkedIndex) + elog(ERROR, "unexpected failure to find arbiter index"); + + return true; +} + +/* + * Check for violation of an exclusion or unique constraint + * + * heap: the table containing the new tuple + * index: the index supporting the constraint + * indexInfo: info about the index, including the exclusion properties + * tupleid: heap TID of the new tuple we have just inserted (invalid if we + * haven't inserted a new tuple yet) + * values, isnull: the *index* column values computed for the new tuple + * estate: an EState we can do evaluation in + * newIndex: if true, we are trying to build a new index (this affects + * only the wording of error messages) + * waitMode: whether to wait for concurrent inserters/deleters + * violationOK: if true, don't throw error for violation + * conflictTid: if not-NULL, the TID of the conflicting tuple is returned here + * + * Returns true if OK, false if actual or potential violation + * + * 'waitMode' determines what happens if a conflict is detected with a tuple + * that was inserted or deleted by a transaction that's still running. + * CEOUC_WAIT means that we wait for the transaction to commit, before + * throwing an error or returning. CEOUC_NOWAIT means that we report the + * violation immediately; so the violation is only potential, and the caller + * must recheck sometime later. This behavior is convenient for deferred + * exclusion checks; we need not bother queuing a deferred event if there is + * definitely no conflict at insertion time. + * + * CEOUC_LIVELOCK_PREVENTING_WAIT is like CEOUC_NOWAIT, but we will sometimes + * wait anyway, to prevent livelocking if two transactions try inserting at + * the same time. This is used with speculative insertions, for INSERT ON + * CONFLICT statements. (See notes in file header) + * + * If violationOK is true, we just report the potential or actual violation to + * the caller by returning 'false'. Otherwise we throw a descriptive error + * message here. When violationOK is false, a false result is impossible. + * + * Note: The indexam is normally responsible for checking unique constraints, + * so this normally only needs to be used for exclusion constraints. But this + * function is also called when doing a "pre-check" for conflicts on a unique + * constraint, when doing speculative insertion. Caller may use the returned + * conflict TID to take further steps. 
+ */ +static bool +check_exclusion_or_unique_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex, + CEOUC_WAIT_MODE waitMode, + bool violationOK, + ItemPointer conflictTid) +{ + Oid *constr_procs; + uint16 *constr_strats; + Oid *index_collations = index->rd_indcollation; + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index); + IndexScanDesc index_scan; + ScanKeyData scankeys[INDEX_MAX_KEYS]; + SnapshotData DirtySnapshot; + int i; + bool conflict; + bool found_self; + ExprContext *econtext; + TupleTableSlot *existing_slot; + TupleTableSlot *save_scantuple; + + if (indexInfo->ii_ExclusionOps) + { + constr_procs = indexInfo->ii_ExclusionProcs; + constr_strats = indexInfo->ii_ExclusionStrats; + } + else + { + constr_procs = indexInfo->ii_UniqueProcs; + constr_strats = indexInfo->ii_UniqueStrats; + } + + /* + * If any of the input values are NULL, the constraint check is assumed to + * pass (i.e., we assume the operators are strict). + */ + for (i = 0; i < indnkeyatts; i++) + { + if (isnull[i]) + return true; + } + + /* + * Search the tuples that are in the index for any violations, including + * tuples that aren't visible yet. + */ + InitDirtySnapshot(DirtySnapshot); + + for (i = 0; i < indnkeyatts; i++) + { + ScanKeyEntryInitialize(&scankeys[i], + 0, + i + 1, + constr_strats[i], + InvalidOid, + index_collations[i], + constr_procs[i], + values[i]); + } + + /* + * Need a TupleTableSlot to put existing tuples in. + * + * To use FormIndexDatum, we have to make the econtext's scantuple point + * to this slot. Be sure to save and restore caller's value for + * scantuple. + */ + existing_slot = table_slot_create(heap, NULL); + + econtext = GetPerTupleExprContext(estate); + save_scantuple = econtext->ecxt_scantuple; + econtext->ecxt_scantuple = existing_slot; + + /* + * May have to restart scan from this point if a potential conflict is + * found. + */ +retry: + conflict = false; + found_self = false; + index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0); + index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); + + while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) + { + TransactionId xwait; + XLTW_Oper reason_wait; + Datum existing_values[INDEX_MAX_KEYS]; + bool existing_isnull[INDEX_MAX_KEYS]; + char *error_new; + char *error_existing; + + /* + * Ignore the entry for the tuple we're trying to check. + */ + if (ItemPointerIsValid(tupleid) && + ItemPointerEquals(tupleid, &existing_slot->tts_tid)) + { + if (found_self) /* should not happen */ + elog(ERROR, "found self tuple multiple times in index \"%s\"", + RelationGetRelationName(index)); + found_self = true; + continue; + } + + /* + * Extract the index column values and isnull flags from the existing + * tuple. + */ + FormIndexDatum(indexInfo, existing_slot, estate, + existing_values, existing_isnull); + + /* If lossy indexscan, must recheck the condition */ + if (index_scan->xs_recheck) + { + if (!index_recheck_constraint(index, + constr_procs, + existing_values, + existing_isnull, + values)) + continue; /* tuple doesn't actually match, so no + * conflict */ + } + + /* + * At this point we have either a conflict or a potential conflict. + * + * If an in-progress transaction is affecting the visibility of this + * tuple, we need to wait for it to complete and then recheck (unless + * the caller requested not to). 
For simplicity we do rechecking by + * just restarting the whole scan --- this case probably doesn't + * happen often enough to be worth trying harder, and anyway we don't + * want to hold any index internal locks while waiting. + */ + xwait = TransactionIdIsValid(DirtySnapshot.xmin) ? + DirtySnapshot.xmin : DirtySnapshot.xmax; + + if (TransactionIdIsValid(xwait) && + (waitMode == CEOUC_WAIT || + (waitMode == CEOUC_LIVELOCK_PREVENTING_WAIT && + DirtySnapshot.speculativeToken && + TransactionIdPrecedes(GetCurrentTransactionId(), xwait)))) + { + reason_wait = indexInfo->ii_ExclusionOps ? + XLTW_RecheckExclusionConstr : XLTW_InsertIndex; + index_endscan(index_scan); + if (DirtySnapshot.speculativeToken) + SpeculativeInsertionWait(DirtySnapshot.xmin, + DirtySnapshot.speculativeToken); + else + XactLockTableWait(xwait, heap, + &existing_slot->tts_tid, reason_wait); + goto retry; + } + + /* + * We have a definite conflict (or a potential one, but the caller + * didn't want to wait). Return it to caller, or report it. + */ + if (violationOK) + { + conflict = true; + if (conflictTid) + *conflictTid = existing_slot->tts_tid; + break; + } + + error_new = BuildIndexValueDescription(index, values, isnull); + error_existing = BuildIndexValueDescription(index, existing_values, + existing_isnull); + if (newIndex) + ereport(ERROR, + (errcode(ERRCODE_EXCLUSION_VIOLATION), + errmsg("could not create exclusion constraint \"%s\"", + RelationGetRelationName(index)), + error_new && error_existing ? + errdetail("Key %s conflicts with key %s.", + error_new, error_existing) : + errdetail("Key conflicts exist."), + errtableconstraint(heap, + RelationGetRelationName(index)))); + else + ereport(ERROR, + (errcode(ERRCODE_EXCLUSION_VIOLATION), + errmsg("conflicting key value violates exclusion constraint \"%s\"", + RelationGetRelationName(index)), + error_new && error_existing ? + errdetail("Key %s conflicts with existing key %s.", + error_new, error_existing) : + errdetail("Key conflicts with existing key."), + errtableconstraint(heap, + RelationGetRelationName(index)))); + } + + index_endscan(index_scan); + + /* + * Ordinarily, at this point the search should have found the originally + * inserted tuple (if any), unless we exited the loop early because of + * conflict. However, it is possible to define exclusion constraints for + * which that wouldn't be true --- for instance, if the operator is <>. So + * we no longer complain if found_self is still false. + */ + + econtext->ecxt_scantuple = save_scantuple; + + ExecDropSingleTupleTableSlot(existing_slot); + + return !conflict; +} + +/* + * Check for violation of an exclusion constraint + * + * This is a dumbed down version of check_exclusion_or_unique_constraint + * for external callers. They don't need all the special modes. + */ +void +check_exclusion_constraint(Relation heap, Relation index, + IndexInfo *indexInfo, + ItemPointer tupleid, + Datum *values, bool *isnull, + EState *estate, bool newIndex) +{ + (void) check_exclusion_or_unique_constraint(heap, index, indexInfo, tupleid, + values, isnull, + estate, newIndex, + CEOUC_WAIT, false, NULL); +} + +/* + * Check existing tuple's index values to see if it really matches the + * exclusion condition against the new_values. Returns true if conflict. 
+ */ +static bool +index_recheck_constraint(Relation index, Oid *constr_procs, + Datum *existing_values, bool *existing_isnull, + Datum *new_values) +{ + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index); + int i; + + for (i = 0; i < indnkeyatts; i++) + { + /* Assume the exclusion operators are strict */ + if (existing_isnull[i]) + return false; + + if (!DatumGetBool(OidFunctionCall2Coll(constr_procs[i], + index->rd_indcollation[i], + existing_values[i], + new_values[i]))) + return false; + } + + return true; +} diff --git a/src/backend/executor/execJunk.c b/src/backend/executor/execJunk.c new file mode 100644 index 0000000..9741897 --- /dev/null +++ b/src/backend/executor/execJunk.c @@ -0,0 +1,304 @@ +/*------------------------------------------------------------------------- + * + * execJunk.c + * Junk attribute support stuff.... + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execJunk.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "executor/executor.h" + +/*------------------------------------------------------------------------- + * XXX this stuff should be rewritten to take advantage + * of ExecProject() and the ProjectionInfo node. + * -cim 6/3/91 + * + * An attribute of a tuple living inside the executor, can be + * either a normal attribute or a "junk" attribute. "junk" attributes + * never make it out of the executor, i.e. they are never printed, + * returned or stored on disk. Their only purpose in life is to + * store some information useful only to the executor, mainly the values + * of system attributes like "ctid", or sort key columns that are not to + * be output. + * + * The general idea is the following: A target list consists of a list of + * TargetEntry nodes containing expressions. Each TargetEntry has a field + * called 'resjunk'. If the value of this field is true then the + * corresponding attribute is a "junk" attribute. + * + * When we initialize a plan we call ExecInitJunkFilter to create a filter. + * + * We then execute the plan, treating the resjunk attributes like any others. + * + * Finally, when at the top level we get back a tuple, we can call + * ExecFindJunkAttribute/ExecGetJunkAttribute to retrieve the values of the + * junk attributes we are interested in, and ExecFilterJunk to remove all the + * junk attributes from a tuple. This new "clean" tuple is then printed, + * inserted, or updated. + * + *------------------------------------------------------------------------- + */ + +/* + * ExecInitJunkFilter + * + * Initialize the Junk filter. + * + * The source targetlist is passed in. The output tuple descriptor is + * built from the non-junk tlist entries. + * An optional resultSlot can be passed as well; otherwise, we create one. + */ +JunkFilter * +ExecInitJunkFilter(List *targetList, TupleTableSlot *slot) +{ + JunkFilter *junkfilter; + TupleDesc cleanTupType; + int cleanLength; + AttrNumber *cleanMap; + + /* + * Compute the tuple descriptor for the cleaned tuple. + */ + cleanTupType = ExecCleanTypeFromTL(targetList); + + /* + * Use the given slot, or make a new slot if we weren't given one. 
+ */ + if (slot) + ExecSetSlotDescriptor(slot, cleanTupType); + else + slot = MakeSingleTupleTableSlot(cleanTupType, &TTSOpsVirtual); + + /* + * Now calculate the mapping between the original tuple's attributes and + * the "clean" tuple's attributes. + * + * The "map" is an array of "cleanLength" attribute numbers, i.e. one + * entry for every attribute of the "clean" tuple. The value of this entry + * is the attribute number of the corresponding attribute of the + * "original" tuple. (Zero indicates a NULL output attribute, but we do + * not use that feature in this routine.) + */ + cleanLength = cleanTupType->natts; + if (cleanLength > 0) + { + AttrNumber cleanResno; + ListCell *t; + + cleanMap = (AttrNumber *) palloc(cleanLength * sizeof(AttrNumber)); + cleanResno = 0; + foreach(t, targetList) + { + TargetEntry *tle = lfirst(t); + + if (!tle->resjunk) + { + cleanMap[cleanResno] = tle->resno; + cleanResno++; + } + } + Assert(cleanResno == cleanLength); + } + else + cleanMap = NULL; + + /* + * Finally create and initialize the JunkFilter struct. + */ + junkfilter = makeNode(JunkFilter); + + junkfilter->jf_targetList = targetList; + junkfilter->jf_cleanTupType = cleanTupType; + junkfilter->jf_cleanMap = cleanMap; + junkfilter->jf_resultSlot = slot; + + return junkfilter; +} + +/* + * ExecInitJunkFilterConversion + * + * Initialize a JunkFilter for rowtype conversions. + * + * Here, we are given the target "clean" tuple descriptor rather than + * inferring it from the targetlist. The target descriptor can contain + * deleted columns. It is assumed that the caller has checked that the + * non-deleted columns match up with the non-junk columns of the targetlist. + */ +JunkFilter * +ExecInitJunkFilterConversion(List *targetList, + TupleDesc cleanTupType, + TupleTableSlot *slot) +{ + JunkFilter *junkfilter; + int cleanLength; + AttrNumber *cleanMap; + ListCell *t; + int i; + + /* + * Use the given slot, or make a new slot if we weren't given one. + */ + if (slot) + ExecSetSlotDescriptor(slot, cleanTupType); + else + slot = MakeSingleTupleTableSlot(cleanTupType, &TTSOpsVirtual); + + /* + * Calculate the mapping between the original tuple's attributes and the + * "clean" tuple's attributes. + * + * The "map" is an array of "cleanLength" attribute numbers, i.e. one + * entry for every attribute of the "clean" tuple. The value of this entry + * is the attribute number of the corresponding attribute of the + * "original" tuple. We store zero for any deleted attributes, marking + * that a NULL is needed in the output tuple. + */ + cleanLength = cleanTupType->natts; + if (cleanLength > 0) + { + cleanMap = (AttrNumber *) palloc0(cleanLength * sizeof(AttrNumber)); + t = list_head(targetList); + for (i = 0; i < cleanLength; i++) + { + if (TupleDescAttr(cleanTupType, i)->attisdropped) + continue; /* map entry is already zero */ + for (;;) + { + TargetEntry *tle = lfirst(t); + + t = lnext(targetList, t); + if (!tle->resjunk) + { + cleanMap[i] = tle->resno; + break; + } + } + } + } + else + cleanMap = NULL; + + /* + * Finally create and initialize the JunkFilter struct. + */ + junkfilter = makeNode(JunkFilter); + + junkfilter->jf_targetList = targetList; + junkfilter->jf_cleanTupType = cleanTupType; + junkfilter->jf_cleanMap = cleanMap; + junkfilter->jf_resultSlot = slot; + + return junkfilter; +} + +/* + * ExecFindJunkAttribute + * + * Locate the specified junk attribute in the junk filter's targetlist, + * and return its resno. Returns InvalidAttrNumber if not found. 
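+ *
+ * Rough usage sketch: a caller that fetches a "ctid" junk column and then
+ * projects the clean tuple (ExecGetJunkAttribute is a small helper declared
+ * in executor.h):
+ *
+ *		junkattno = ExecFindJunkAttribute(junkfilter, "ctid");
+ *		...
+ *		datum = ExecGetJunkAttribute(slot, junkattno, &isNull);
+ *		if (!isNull)
+ *			tupleid = (ItemPointer) DatumGetPointer(datum);
+ *		cleanSlot = ExecFilterJunk(junkfilter, slot);
+ *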
+ */ +AttrNumber +ExecFindJunkAttribute(JunkFilter *junkfilter, const char *attrName) +{ + return ExecFindJunkAttributeInTlist(junkfilter->jf_targetList, attrName); +} + +/* + * ExecFindJunkAttributeInTlist + * + * Find a junk attribute given a subplan's targetlist (not necessarily + * part of a JunkFilter). + */ +AttrNumber +ExecFindJunkAttributeInTlist(List *targetlist, const char *attrName) +{ + ListCell *t; + + foreach(t, targetlist) + { + TargetEntry *tle = lfirst(t); + + if (tle->resjunk && tle->resname && + (strcmp(tle->resname, attrName) == 0)) + { + /* We found it ! */ + return tle->resno; + } + } + + return InvalidAttrNumber; +} + +/* + * ExecFilterJunk + * + * Construct and return a slot with all the junk attributes removed. + */ +TupleTableSlot * +ExecFilterJunk(JunkFilter *junkfilter, TupleTableSlot *slot) +{ + TupleTableSlot *resultSlot; + AttrNumber *cleanMap; + TupleDesc cleanTupType; + int cleanLength; + int i; + Datum *values; + bool *isnull; + Datum *old_values; + bool *old_isnull; + + /* + * Extract all the values of the old tuple. + */ + slot_getallattrs(slot); + old_values = slot->tts_values; + old_isnull = slot->tts_isnull; + + /* + * get info from the junk filter + */ + cleanTupType = junkfilter->jf_cleanTupType; + cleanLength = cleanTupType->natts; + cleanMap = junkfilter->jf_cleanMap; + resultSlot = junkfilter->jf_resultSlot; + + /* + * Prepare to build a virtual result tuple. + */ + ExecClearTuple(resultSlot); + values = resultSlot->tts_values; + isnull = resultSlot->tts_isnull; + + /* + * Transpose data into proper fields of the new tuple. + */ + for (i = 0; i < cleanLength; i++) + { + int j = cleanMap[i]; + + if (j == 0) + { + values[i] = (Datum) 0; + isnull[i] = true; + } + else + { + values[i] = old_values[j - 1]; + isnull[i] = old_isnull[j - 1]; + } + } + + /* + * And return the virtual tuple. + */ + return ExecStoreVirtualTuple(resultSlot); +} diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c new file mode 100644 index 0000000..b3ce4ba --- /dev/null +++ b/src/backend/executor/execMain.c @@ -0,0 +1,2886 @@ +/*------------------------------------------------------------------------- + * + * execMain.c + * top level executor interface routines + * + * INTERFACE ROUTINES + * ExecutorStart() + * ExecutorRun() + * ExecutorFinish() + * ExecutorEnd() + * + * These four procedures are the external interface to the executor. + * In each case, the query descriptor is required as an argument. + * + * ExecutorStart must be called at the beginning of execution of any + * query plan and ExecutorEnd must always be called at the end of + * execution of a plan (unless it is aborted due to error). + * + * ExecutorRun accepts direction and count arguments that specify whether + * the plan is to be executed forwards, backwards, and for how many tuples. + * In some cases ExecutorRun may be called multiple times to process all + * the tuples for a plan. It is also acceptable to stop short of executing + * the whole plan (but only if it is a SELECT). + * + * ExecutorFinish must be called after the final ExecutorRun call and + * before ExecutorEnd. This can be omitted only in case of EXPLAIN, + * which should also omit ExecutorRun. 
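+ *
+ * A minimal calling sketch (illustrative only; the eflags, count and
+ * execute_once values shown are assumptions, not requirements): a caller
+ * running a plan to completion in a single pass would issue, in order,
+ *
+ *     ExecutorStart(queryDesc, 0);
+ *     ExecutorRun(queryDesc, ForwardScanDirection, 0, true);
+ *     ExecutorFinish(queryDesc);
+ *     ExecutorEnd(queryDesc);
+ *
+ * where a count of zero asks ExecutorRun to run the plan to completion.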
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execMain.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/namespace.h" +#include "catalog/pg_publication.h" +#include "commands/matview.h" +#include "commands/trigger.h" +#include "executor/execdebug.h" +#include "executor/nodeSubplan.h" +#include "foreign/fdwapi.h" +#include "jit/jit.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "parser/parsetree.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "tcop/utility.h" +#include "utils/acl.h" +#include "utils/backend_status.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/partcache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" +#include "utils/snapmgr.h" + + +/* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */ +ExecutorStart_hook_type ExecutorStart_hook = NULL; +ExecutorRun_hook_type ExecutorRun_hook = NULL; +ExecutorFinish_hook_type ExecutorFinish_hook = NULL; +ExecutorEnd_hook_type ExecutorEnd_hook = NULL; + +/* Hook for plugin to get control in ExecCheckRTPerms() */ +ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; + +/* decls for local routines only used within this module */ +static void InitPlan(QueryDesc *queryDesc, int eflags); +static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecPostprocessPlan(EState *estate); +static void ExecEndPlan(PlanState *planstate, EState *estate); +static void ExecutePlan(EState *estate, PlanState *planstate, + bool use_parallel_mode, + CmdType operation, + bool sendTuples, + uint64 numberTuples, + ScanDirection direction, + DestReceiver *dest, + bool execute_once); +static bool ExecCheckRTEPerms(RangeTblEntry *rte); +static bool ExecCheckRTEPermsModified(Oid relOid, Oid userid, + Bitmapset *modifiedCols, + AclMode requiredPerms); +static void ExecCheckXactReadOnly(PlannedStmt *plannedstmt); +static char *ExecBuildSlotValueDescription(Oid reloid, + TupleTableSlot *slot, + TupleDesc tupdesc, + Bitmapset *modifiedCols, + int maxfieldlen); +static void EvalPlanQualStart(EPQState *epqstate, Plan *planTree); + +/* end of local decls */ + + +/* ---------------------------------------------------------------- + * ExecutorStart + * + * This routine must be called at the beginning of any execution of any + * query plan + * + * Takes a QueryDesc previously created by CreateQueryDesc (which is separate + * only because some places use QueryDescs for utility commands). The tupDesc + * field of the QueryDesc is filled in to describe the tuples that will be + * returned, and the internal fields (estate and planstate) are set up. + * + * eflags contains flag bits as described in executor.h. + * + * NB: the CurrentMemoryContext when this is called will become the parent + * of the per-query context used for this Executor invocation. + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorStart is called. Such a plugin would + * normally call standard_ExecutorStart(). 
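+ *
+ * A hook-chaining sketch (prev_ExecutorStart and my_ExecutorStart are
+ * hypothetical extension symbols, not part of this file): an extension's
+ * _PG_init() would typically save and replace the hook,
+ *
+ *     prev_ExecutorStart = ExecutorStart_hook;
+ *     ExecutorStart_hook = my_ExecutorStart;
+ *
+ * and my_ExecutorStart() would finish by calling prev_ExecutorStart if it
+ * is set, else standard_ExecutorStart().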
+ * + * ---------------------------------------------------------------- + */ +void +ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + /* + * In some cases (e.g. an EXECUTE statement) a query execution will skip + * parse analysis, which means that the query_id won't be reported. Note + * that it's harmless to report the query_id multiple time, as the call + * will be ignored if the top level query_id has already been reported. + */ + pgstat_report_query_id(queryDesc->plannedstmt->queryId, false); + + if (ExecutorStart_hook) + (*ExecutorStart_hook) (queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); +} + +void +standard_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks: queryDesc must not be started already */ + Assert(queryDesc != NULL); + Assert(queryDesc->estate == NULL); + + /* + * If the transaction is read-only, we need to check if any writes are + * planned to non-temporary tables. EXPLAIN is considered read-only. + * + * Don't allow writes in parallel mode. Supporting UPDATE and DELETE + * would require (a) storing the combo CID hash in shared memory, rather + * than synchronizing it just once at the start of parallelism, and (b) an + * alternative to heap_update()'s reliance on xmax for mutual exclusion. + * INSERT may have no such troubles, but we forbid it to simplify the + * checks. + * + * We have lower-level defenses in CommandCounterIncrement and elsewhere + * against performing unsafe operations in parallel mode, but this gives a + * more user-friendly error message. + */ + if ((XactReadOnly || IsInParallelMode()) && + !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + ExecCheckXactReadOnly(queryDesc->plannedstmt); + + /* + * Build EState, switch into per-query memory context for startup. + */ + estate = CreateExecutorState(); + queryDesc->estate = estate; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* + * Fill in external parameters, if any, from queryDesc; and allocate + * workspace for internal parameters + */ + estate->es_param_list_info = queryDesc->params; + + if (queryDesc->plannedstmt->paramExecTypes != NIL) + { + int nParamExec; + + nParamExec = list_length(queryDesc->plannedstmt->paramExecTypes); + estate->es_param_exec_vals = (ParamExecData *) + palloc0(nParamExec * sizeof(ParamExecData)); + } + + /* We now require all callers to provide sourceText */ + Assert(queryDesc->sourceText != NULL); + estate->es_sourceText = queryDesc->sourceText; + + /* + * Fill in the query environment, if any, from queryDesc. + */ + estate->es_queryEnv = queryDesc->queryEnv; + + /* + * If non-read-only query, set the command ID to mark output tuples with + */ + switch (queryDesc->operation) + { + case CMD_SELECT: + + /* + * SELECT FOR [KEY] UPDATE/SHARE and modifying CTEs need to mark + * tuples + */ + if (queryDesc->plannedstmt->rowMarks != NIL || + queryDesc->plannedstmt->hasModifyingCTE) + estate->es_output_cid = GetCurrentCommandId(true); + + /* + * A SELECT without modifying CTEs can't possibly queue triggers, + * so force skip-triggers mode. This is just a marginal efficiency + * hack, since AfterTriggerBeginQuery/AfterTriggerEndQuery aren't + * all that expensive, but we might as well do it. 
+ */ + if (!queryDesc->plannedstmt->hasModifyingCTE) + eflags |= EXEC_FLAG_SKIP_TRIGGERS; + break; + + case CMD_INSERT: + case CMD_DELETE: + case CMD_UPDATE: + estate->es_output_cid = GetCurrentCommandId(true); + break; + + default: + elog(ERROR, "unrecognized operation code: %d", + (int) queryDesc->operation); + break; + } + + /* + * Copy other important information into the EState + */ + estate->es_snapshot = RegisterSnapshot(queryDesc->snapshot); + estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot); + estate->es_top_eflags = eflags; + estate->es_instrument = queryDesc->instrument_options; + estate->es_jit_flags = queryDesc->plannedstmt->jitFlags; + + /* + * Set up an AFTER-trigger statement context, unless told not to, or + * unless it's EXPLAIN-only mode (when ExecutorFinish won't be called). + */ + if (!(eflags & (EXEC_FLAG_SKIP_TRIGGERS | EXEC_FLAG_EXPLAIN_ONLY))) + AfterTriggerBeginQuery(); + + /* + * Initialize the plan state tree + */ + InitPlan(queryDesc, eflags); + + MemoryContextSwitchTo(oldcontext); +} + +/* ---------------------------------------------------------------- + * ExecutorRun + * + * This is the main routine of the executor module. It accepts + * the query descriptor from the traffic cop and executes the + * query plan. + * + * ExecutorStart must have been called already. + * + * If direction is NoMovementScanDirection then nothing is done + * except to start up/shut down the destination. Otherwise, + * we retrieve up to 'count' tuples in the specified direction. + * + * Note: count = 0 is interpreted as no portal limit, i.e., run to + * completion. Also note that the count limit is only applied to + * retrieved tuples, not for instance to those inserted/updated/deleted + * by a ModifyTable plan node. + * + * There is no return value, but output tuples (if any) are sent to + * the destination receiver specified in the QueryDesc; and the number + * of tuples processed at the top level can be found in + * estate->es_processed. + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorRun is called. Such a plugin would + * normally call standard_ExecutorRun(). + * + * ---------------------------------------------------------------- + */ +void +ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, uint64 count, + bool execute_once) +{ + if (ExecutorRun_hook) + (*ExecutorRun_hook) (queryDesc, direction, count, execute_once); + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); +} + +void +standard_ExecutorRun(QueryDesc *queryDesc, + ScanDirection direction, uint64 count, bool execute_once) +{ + EState *estate; + CmdType operation; + DestReceiver *dest; + bool sendTuples; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); + + /* + * Switch into per-query memory context + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* Allow instrumentation of Executor overall runtime */ + if (queryDesc->totaltime) + InstrStartNode(queryDesc->totaltime); + + /* + * extract information from the query descriptor and the query feature. 
+ */ + operation = queryDesc->operation; + dest = queryDesc->dest; + + /* + * startup tuple receiver, if we will be emitting tuples + */ + estate->es_processed = 0; + + sendTuples = (operation == CMD_SELECT || + queryDesc->plannedstmt->hasReturning); + + if (sendTuples) + dest->rStartup(dest, operation, queryDesc->tupDesc); + + /* + * run plan + */ + if (!ScanDirectionIsNoMovement(direction)) + { + if (execute_once && queryDesc->already_executed) + elog(ERROR, "can't re-execute query flagged for single execution"); + queryDesc->already_executed = true; + + ExecutePlan(estate, + queryDesc->planstate, + queryDesc->plannedstmt->parallelModeNeeded, + operation, + sendTuples, + count, + direction, + dest, + execute_once); + } + + /* + * shutdown tuple receiver, if we started it + */ + if (sendTuples) + dest->rShutdown(dest); + + if (queryDesc->totaltime) + InstrStopNode(queryDesc->totaltime, estate->es_processed); + + MemoryContextSwitchTo(oldcontext); +} + +/* ---------------------------------------------------------------- + * ExecutorFinish + * + * This routine must be called after the last ExecutorRun call. + * It performs cleanup such as firing AFTER triggers. It is + * separate from ExecutorEnd because EXPLAIN ANALYZE needs to + * include these actions in the total runtime. + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorFinish is called. Such a plugin would + * normally call standard_ExecutorFinish(). + * + * ---------------------------------------------------------------- + */ +void +ExecutorFinish(QueryDesc *queryDesc) +{ + if (ExecutorFinish_hook) + (*ExecutorFinish_hook) (queryDesc); + else + standard_ExecutorFinish(queryDesc); +} + +void +standard_ExecutorFinish(QueryDesc *queryDesc) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); + + /* This should be run once and only once per Executor instance */ + Assert(!estate->es_finished); + + /* Switch into per-query memory context */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* Allow instrumentation of Executor overall runtime */ + if (queryDesc->totaltime) + InstrStartNode(queryDesc->totaltime); + + /* Run ModifyTable nodes to completion */ + ExecPostprocessPlan(estate); + + /* Execute queued AFTER triggers, unless told not to */ + if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) + AfterTriggerEndQuery(estate); + + if (queryDesc->totaltime) + InstrStopNode(queryDesc->totaltime, 0); + + MemoryContextSwitchTo(oldcontext); + + estate->es_finished = true; +} + +/* ---------------------------------------------------------------- + * ExecutorEnd + * + * This routine must be called at the end of execution of any + * query plan + * + * We provide a function hook variable that lets loadable plugins + * get control when ExecutorEnd is called. Such a plugin would + * normally call standard_ExecutorEnd(). 
+ * + * ---------------------------------------------------------------- + */ +void +ExecutorEnd(QueryDesc *queryDesc) +{ + if (ExecutorEnd_hook) + (*ExecutorEnd_hook) (queryDesc); + else + standard_ExecutorEnd(queryDesc); +} + +void +standard_ExecutorEnd(QueryDesc *queryDesc) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + + /* + * Check that ExecutorFinish was called, unless in EXPLAIN-only mode. This + * Assert is needed because ExecutorFinish is new as of 9.1, and callers + * might forget to call it. + */ + Assert(estate->es_finished || + (estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); + + /* + * Switch into per-query memory context to run ExecEndPlan + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + ExecEndPlan(queryDesc->planstate, estate); + + /* do away with our snapshots */ + UnregisterSnapshot(estate->es_snapshot); + UnregisterSnapshot(estate->es_crosscheck_snapshot); + + /* + * Must switch out of context before destroying it + */ + MemoryContextSwitchTo(oldcontext); + + /* + * Release EState and per-query memory context. This should release + * everything the executor has allocated. + */ + FreeExecutorState(estate); + + /* Reset queryDesc fields that no longer point to anything */ + queryDesc->tupDesc = NULL; + queryDesc->estate = NULL; + queryDesc->planstate = NULL; + queryDesc->totaltime = NULL; +} + +/* ---------------------------------------------------------------- + * ExecutorRewind + * + * This routine may be called on an open queryDesc to rewind it + * to the start. + * ---------------------------------------------------------------- + */ +void +ExecutorRewind(QueryDesc *queryDesc) +{ + EState *estate; + MemoryContext oldcontext; + + /* sanity checks */ + Assert(queryDesc != NULL); + + estate = queryDesc->estate; + + Assert(estate != NULL); + + /* It's probably not sensible to rescan updating queries */ + Assert(queryDesc->operation == CMD_SELECT); + + /* + * Switch into per-query memory context + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* + * rescan plan + */ + ExecReScan(queryDesc->planstate); + + MemoryContextSwitchTo(oldcontext); +} + + +/* + * ExecCheckRTPerms + * Check access permissions for all relations listed in a range table. + * + * Returns true if permissions are adequate. Otherwise, throws an appropriate + * error if ereport_on_violation is true, or simply returns false otherwise. + * + * Note that this does NOT address row-level security policies (aka: RLS). If + * rows will be returned to the user as a result of this permission check + * passing, then RLS also needs to be consulted (and check_enable_rls()). + * + * See rewrite/rowsecurity.c. + */ +bool +ExecCheckRTPerms(List *rangeTable, bool ereport_on_violation) +{ + ListCell *l; + bool result = true; + + foreach(l, rangeTable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + + result = ExecCheckRTEPerms(rte); + if (!result) + { + Assert(rte->rtekind == RTE_RELATION); + if (ereport_on_violation) + aclcheck_error(ACLCHECK_NO_PRIV, get_relkind_objtype(get_rel_relkind(rte->relid)), + get_rel_name(rte->relid)); + return false; + } + } + + if (ExecutorCheckPerms_hook) + result = (*ExecutorCheckPerms_hook) (rangeTable, + ereport_on_violation); + return result; +} + +/* + * ExecCheckRTEPerms + * Check access permissions for a single RTE. 
+ */ +static bool +ExecCheckRTEPerms(RangeTblEntry *rte) +{ + AclMode requiredPerms; + AclMode relPerms; + AclMode remainingPerms; + Oid relOid; + Oid userid; + + /* + * Only plain-relation RTEs need to be checked here. Function RTEs are + * checked when the function is prepared for execution. Join, subquery, + * and special RTEs need no checks. + */ + if (rte->rtekind != RTE_RELATION) + return true; + + /* + * No work if requiredPerms is empty. + */ + requiredPerms = rte->requiredPerms; + if (requiredPerms == 0) + return true; + + relOid = rte->relid; + + /* + * userid to check as: current user unless we have a setuid indication. + * + * Note: GetUserId() is presently fast enough that there's no harm in + * calling it separately for each RTE. If that stops being true, we could + * call it once in ExecCheckRTPerms and pass the userid down from there. + * But for now, no need for the extra clutter. + */ + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + /* + * We must have *all* the requiredPerms bits, but some of the bits can be + * satisfied from column-level rather than relation-level permissions. + * First, remove any bits that are satisfied by relation permissions. + */ + relPerms = pg_class_aclmask(relOid, userid, requiredPerms, ACLMASK_ALL); + remainingPerms = requiredPerms & ~relPerms; + if (remainingPerms != 0) + { + int col = -1; + + /* + * If we lack any permissions that exist only as relation permissions, + * we can fail straight away. + */ + if (remainingPerms & ~(ACL_SELECT | ACL_INSERT | ACL_UPDATE)) + return false; + + /* + * Check to see if we have the needed privileges at column level. + * + * Note: failures just report a table-level error; it would be nicer + * to report a column-level error if we have some but not all of the + * column privileges. + */ + if (remainingPerms & ACL_SELECT) + { + /* + * When the query doesn't explicitly reference any columns (for + * example, SELECT COUNT(*) FROM table), allow the query if we + * have SELECT on any column of the rel, as per SQL spec. + */ + if (bms_is_empty(rte->selectedCols)) + { + if (pg_attribute_aclcheck_all(relOid, userid, ACL_SELECT, + ACLMASK_ANY) != ACLCHECK_OK) + return false; + } + + while ((col = bms_next_member(rte->selectedCols, col)) >= 0) + { + /* bit #s are offset by FirstLowInvalidHeapAttributeNumber */ + AttrNumber attno = col + FirstLowInvalidHeapAttributeNumber; + + if (attno == InvalidAttrNumber) + { + /* Whole-row reference, must have priv on all cols */ + if (pg_attribute_aclcheck_all(relOid, userid, ACL_SELECT, + ACLMASK_ALL) != ACLCHECK_OK) + return false; + } + else + { + if (pg_attribute_aclcheck(relOid, attno, userid, + ACL_SELECT) != ACLCHECK_OK) + return false; + } + } + } + + /* + * Basically the same for the mod columns, for both INSERT and UPDATE + * privilege as specified by remainingPerms. + */ + if (remainingPerms & ACL_INSERT && !ExecCheckRTEPermsModified(relOid, + userid, + rte->insertedCols, + ACL_INSERT)) + return false; + + if (remainingPerms & ACL_UPDATE && !ExecCheckRTEPermsModified(relOid, + userid, + rte->updatedCols, + ACL_UPDATE)) + return false; + } + return true; +} + +/* + * ExecCheckRTEPermsModified + * Check INSERT or UPDATE access permissions for a single RTE (these + * are processed uniformly). 
+ */ +static bool +ExecCheckRTEPermsModified(Oid relOid, Oid userid, Bitmapset *modifiedCols, + AclMode requiredPerms) +{ + int col = -1; + + /* + * When the query doesn't explicitly update any columns, allow the query + * if we have permission on any column of the rel. This is to handle + * SELECT FOR UPDATE as well as possible corner cases in UPDATE. + */ + if (bms_is_empty(modifiedCols)) + { + if (pg_attribute_aclcheck_all(relOid, userid, requiredPerms, + ACLMASK_ANY) != ACLCHECK_OK) + return false; + } + + while ((col = bms_next_member(modifiedCols, col)) >= 0) + { + /* bit #s are offset by FirstLowInvalidHeapAttributeNumber */ + AttrNumber attno = col + FirstLowInvalidHeapAttributeNumber; + + if (attno == InvalidAttrNumber) + { + /* whole-row reference can't happen here */ + elog(ERROR, "whole-row update is not implemented"); + } + else + { + if (pg_attribute_aclcheck(relOid, attno, userid, + requiredPerms) != ACLCHECK_OK) + return false; + } + } + return true; +} + +/* + * Check that the query does not imply any writes to non-temp tables; + * unless we're in parallel mode, in which case don't even allow writes + * to temp tables. + * + * Note: in a Hot Standby this would need to reject writes to temp + * tables just as we do in parallel mode; but an HS standby can't have created + * any temp tables in the first place, so no need to check that. + */ +static void +ExecCheckXactReadOnly(PlannedStmt *plannedstmt) +{ + ListCell *l; + + /* + * Fail if write permissions are requested in parallel mode for table + * (temp or non-temp), otherwise fail for any non-temp table. + */ + foreach(l, plannedstmt->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind != RTE_RELATION) + continue; + + if ((rte->requiredPerms & (~ACL_SELECT)) == 0) + continue; + + if (isTempNamespace(get_rel_namespace(rte->relid))) + continue; + + PreventCommandIfReadOnly(CreateCommandName((Node *) plannedstmt)); + } + + if (plannedstmt->commandType != CMD_SELECT || plannedstmt->hasModifyingCTE) + PreventCommandIfParallelMode(CreateCommandName((Node *) plannedstmt)); +} + + +/* ---------------------------------------------------------------- + * InitPlan + * + * Initializes the query plan: open files, allocate storage + * and start up the rule manager + * ---------------------------------------------------------------- + */ +static void +InitPlan(QueryDesc *queryDesc, int eflags) +{ + CmdType operation = queryDesc->operation; + PlannedStmt *plannedstmt = queryDesc->plannedstmt; + Plan *plan = plannedstmt->planTree; + List *rangeTable = plannedstmt->rtable; + EState *estate = queryDesc->estate; + PlanState *planstate; + TupleDesc tupType; + ListCell *l; + int i; + + /* + * Do permissions checks + */ + ExecCheckRTPerms(rangeTable, true); + + /* + * initialize the node's execution state + */ + ExecInitRangeTable(estate, rangeTable); + + estate->es_plannedstmt = plannedstmt; + + /* + * Next, build the ExecRowMark array from the PlanRowMark(s), if any. 
+ */ + if (plannedstmt->rowMarks) + { + estate->es_rowmarks = (ExecRowMark **) + palloc0(estate->es_range_table_size * sizeof(ExecRowMark *)); + foreach(l, plannedstmt->rowMarks) + { + PlanRowMark *rc = (PlanRowMark *) lfirst(l); + Oid relid; + Relation relation; + ExecRowMark *erm; + + /* ignore "parent" rowmarks; they are irrelevant at runtime */ + if (rc->isParent) + continue; + + /* get relation's OID (will produce InvalidOid if subquery) */ + relid = exec_rt_fetch(rc->rti, estate)->relid; + + /* open relation, if we need to access it for this mark type */ + switch (rc->markType) + { + case ROW_MARK_EXCLUSIVE: + case ROW_MARK_NOKEYEXCLUSIVE: + case ROW_MARK_SHARE: + case ROW_MARK_KEYSHARE: + case ROW_MARK_REFERENCE: + relation = ExecGetRangeTableRelation(estate, rc->rti); + break; + case ROW_MARK_COPY: + /* no physical table access is required */ + relation = NULL; + break; + default: + elog(ERROR, "unrecognized markType: %d", rc->markType); + relation = NULL; /* keep compiler quiet */ + break; + } + + /* Check that relation is a legal target for marking */ + if (relation) + CheckValidRowMarkRel(relation, rc->markType); + + erm = (ExecRowMark *) palloc(sizeof(ExecRowMark)); + erm->relation = relation; + erm->relid = relid; + erm->rti = rc->rti; + erm->prti = rc->prti; + erm->rowmarkId = rc->rowmarkId; + erm->markType = rc->markType; + erm->strength = rc->strength; + erm->waitPolicy = rc->waitPolicy; + erm->ermActive = false; + ItemPointerSetInvalid(&(erm->curCtid)); + erm->ermExtra = NULL; + + Assert(erm->rti > 0 && erm->rti <= estate->es_range_table_size && + estate->es_rowmarks[erm->rti - 1] == NULL); + + estate->es_rowmarks[erm->rti - 1] = erm; + } + } + + /* + * Initialize the executor's tuple table to empty. + */ + estate->es_tupleTable = NIL; + + /* signal that this EState is not used for EPQ */ + estate->es_epq_active = NULL; + + /* + * Initialize private state information for each SubPlan. We must do this + * before running ExecInitNode on the main query tree, since + * ExecInitSubPlan expects to be able to find these entries. + */ + Assert(estate->es_subplanstates == NIL); + i = 1; /* subplan indices count from 1 */ + foreach(l, plannedstmt->subplans) + { + Plan *subplan = (Plan *) lfirst(l); + PlanState *subplanstate; + int sp_eflags; + + /* + * A subplan will never need to do BACKWARD scan nor MARK/RESTORE. If + * it is a parameterless subplan (not initplan), we suggest that it be + * prepared to handle REWIND efficiently; otherwise there is no need. + */ + sp_eflags = eflags + & (EXEC_FLAG_EXPLAIN_ONLY | EXEC_FLAG_WITH_NO_DATA); + if (bms_is_member(i, plannedstmt->rewindPlanIDs)) + sp_eflags |= EXEC_FLAG_REWIND; + + subplanstate = ExecInitNode(subplan, estate, sp_eflags); + + estate->es_subplanstates = lappend(estate->es_subplanstates, + subplanstate); + + i++; + } + + /* + * Initialize the private state information for all the nodes in the query + * tree. This opens files, allocates storage and leaves us ready to start + * processing tuples. + */ + planstate = ExecInitNode(plan, estate, eflags); + + /* + * Get the tuple descriptor describing the type of tuples to return. + */ + tupType = ExecGetResultType(planstate); + + /* + * Initialize the junk filter if needed. SELECT queries need a filter if + * there are any junk attrs in the top-level tlist. 
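+ * (For example, a sort key added for an ORDER BY expression that is not
+ * in the SELECT output list, or the row-identity columns added for
+ * SELECT FOR UPDATE, appear here as resjunk tlist entries.)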
+ */ + if (operation == CMD_SELECT) + { + bool junk_filter_needed = false; + ListCell *tlist; + + foreach(tlist, plan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(tlist); + + if (tle->resjunk) + { + junk_filter_needed = true; + break; + } + } + + if (junk_filter_needed) + { + JunkFilter *j; + TupleTableSlot *slot; + + slot = ExecInitExtraTupleSlot(estate, NULL, &TTSOpsVirtual); + j = ExecInitJunkFilter(planstate->plan->targetlist, + slot); + estate->es_junkFilter = j; + + /* Want to return the cleaned tuple type */ + tupType = j->jf_cleanTupType; + } + } + + queryDesc->tupDesc = tupType; + queryDesc->planstate = planstate; +} + +/* + * Check that a proposed result relation is a legal target for the operation + * + * Generally the parser and/or planner should have noticed any such mistake + * already, but let's make sure. + * + * Note: when changing this function, you probably also need to look at + * CheckValidRowMarkRel. + */ +void +CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation) +{ + Relation resultRel = resultRelInfo->ri_RelationDesc; + TriggerDesc *trigDesc = resultRel->trigdesc; + FdwRoutine *fdwroutine; + + switch (resultRel->rd_rel->relkind) + { + case RELKIND_RELATION: + case RELKIND_PARTITIONED_TABLE: + CheckCmdReplicaIdentity(resultRel, operation); + break; + case RELKIND_SEQUENCE: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change sequence \"%s\"", + RelationGetRelationName(resultRel)))); + break; + case RELKIND_TOASTVALUE: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change TOAST relation \"%s\"", + RelationGetRelationName(resultRel)))); + break; + case RELKIND_VIEW: + + /* + * Okay only if there's a suitable INSTEAD OF trigger. Messages + * here should match rewriteHandler.c's rewriteTargetView and + * RewriteQuery, except that we omit errdetail because we haven't + * got the information handy (and given that we really shouldn't + * get here anyway, it's not worth great exertion to get). 
+ */ + switch (operation) + { + case CMD_INSERT: + if (!trigDesc || !trigDesc->trig_insert_instead_row) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot insert into view \"%s\"", + RelationGetRelationName(resultRel)), + errhint("To enable inserting into the view, provide an INSTEAD OF INSERT trigger or an unconditional ON INSERT DO INSTEAD rule."))); + break; + case CMD_UPDATE: + if (!trigDesc || !trigDesc->trig_update_instead_row) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot update view \"%s\"", + RelationGetRelationName(resultRel)), + errhint("To enable updating the view, provide an INSTEAD OF UPDATE trigger or an unconditional ON UPDATE DO INSTEAD rule."))); + break; + case CMD_DELETE: + if (!trigDesc || !trigDesc->trig_delete_instead_row) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot delete from view \"%s\"", + RelationGetRelationName(resultRel)), + errhint("To enable deleting from the view, provide an INSTEAD OF DELETE trigger or an unconditional ON DELETE DO INSTEAD rule."))); + break; + default: + elog(ERROR, "unrecognized CmdType: %d", (int) operation); + break; + } + break; + case RELKIND_MATVIEW: + if (!MatViewIncrementalMaintenanceIsEnabled()) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change materialized view \"%s\"", + RelationGetRelationName(resultRel)))); + break; + case RELKIND_FOREIGN_TABLE: + /* Okay only if the FDW supports it */ + fdwroutine = resultRelInfo->ri_FdwRoutine; + switch (operation) + { + case CMD_INSERT: + if (fdwroutine->ExecForeignInsert == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot insert into foreign table \"%s\"", + RelationGetRelationName(resultRel)))); + if (fdwroutine->IsForeignRelUpdatable != NULL && + (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_INSERT)) == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign table \"%s\" does not allow inserts", + RelationGetRelationName(resultRel)))); + break; + case CMD_UPDATE: + if (fdwroutine->ExecForeignUpdate == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot update foreign table \"%s\"", + RelationGetRelationName(resultRel)))); + if (fdwroutine->IsForeignRelUpdatable != NULL && + (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_UPDATE)) == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign table \"%s\" does not allow updates", + RelationGetRelationName(resultRel)))); + break; + case CMD_DELETE: + if (fdwroutine->ExecForeignDelete == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot delete from foreign table \"%s\"", + RelationGetRelationName(resultRel)))); + if (fdwroutine->IsForeignRelUpdatable != NULL && + (fdwroutine->IsForeignRelUpdatable(resultRel) & (1 << CMD_DELETE)) == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign table \"%s\" does not allow deletes", + RelationGetRelationName(resultRel)))); + break; + default: + elog(ERROR, "unrecognized CmdType: %d", (int) operation); + break; + } + break; + default: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change relation \"%s\"", + RelationGetRelationName(resultRel)))); + break; + } +} + +/* + * Check that a proposed rowmark target relation is a legal target + * + * In most cases parser and/or planner should have noticed this 
already, but + * they don't cover all cases. + */ +static void +CheckValidRowMarkRel(Relation rel, RowMarkType markType) +{ + FdwRoutine *fdwroutine; + + switch (rel->rd_rel->relkind) + { + case RELKIND_RELATION: + case RELKIND_PARTITIONED_TABLE: + /* OK */ + break; + case RELKIND_SEQUENCE: + /* Must disallow this because we don't vacuum sequences */ + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in sequence \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_TOASTVALUE: + /* We could allow this, but there seems no good reason to */ + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in TOAST relation \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_VIEW: + /* Should not get here; planner should have expanded the view */ + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in view \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_MATVIEW: + /* Allow referencing a matview, but not actual locking clauses */ + if (markType != ROW_MARK_REFERENCE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in materialized view \"%s\"", + RelationGetRelationName(rel)))); + break; + case RELKIND_FOREIGN_TABLE: + /* Okay only if the FDW supports it */ + fdwroutine = GetFdwRoutineForRelation(rel, false); + if (fdwroutine->RefetchForeignRow == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot lock rows in foreign table \"%s\"", + RelationGetRelationName(rel)))); + break; + default: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot lock rows in relation \"%s\"", + RelationGetRelationName(rel)))); + break; + } +} + +/* + * Initialize ResultRelInfo data for one result relation + * + * Caution: before Postgres 9.1, this function included the relkind checking + * that's now in CheckValidResultRel, and it also did ExecOpenIndices if + * appropriate. Be sure callers cover those needs. + */ +void +InitResultRelInfo(ResultRelInfo *resultRelInfo, + Relation resultRelationDesc, + Index resultRelationIndex, + ResultRelInfo *partition_root_rri, + int instrument_options) +{ + MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); + resultRelInfo->type = T_ResultRelInfo; + resultRelInfo->ri_RangeTableIndex = resultRelationIndex; + resultRelInfo->ri_RelationDesc = resultRelationDesc; + resultRelInfo->ri_NumIndices = 0; + resultRelInfo->ri_IndexRelationDescs = NULL; + resultRelInfo->ri_IndexRelationInfo = NULL; + /* make a copy so as not to depend on relcache info not changing... 
*/ + resultRelInfo->ri_TrigDesc = CopyTriggerDesc(resultRelationDesc->trigdesc); + if (resultRelInfo->ri_TrigDesc) + { + int n = resultRelInfo->ri_TrigDesc->numtriggers; + + resultRelInfo->ri_TrigFunctions = (FmgrInfo *) + palloc0(n * sizeof(FmgrInfo)); + resultRelInfo->ri_TrigWhenExprs = (ExprState **) + palloc0(n * sizeof(ExprState *)); + if (instrument_options) + resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + } + else + { + resultRelInfo->ri_TrigFunctions = NULL; + resultRelInfo->ri_TrigWhenExprs = NULL; + resultRelInfo->ri_TrigInstrument = NULL; + } + if (resultRelationDesc->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + resultRelInfo->ri_FdwRoutine = GetFdwRoutineForRelation(resultRelationDesc, true); + else + resultRelInfo->ri_FdwRoutine = NULL; + + /* The following fields are set later if needed */ + resultRelInfo->ri_RowIdAttNo = 0; + resultRelInfo->ri_projectNew = NULL; + resultRelInfo->ri_newTupleSlot = NULL; + resultRelInfo->ri_oldTupleSlot = NULL; + resultRelInfo->ri_projectNewInfoValid = false; + resultRelInfo->ri_FdwState = NULL; + resultRelInfo->ri_usesFdwDirectModify = false; + resultRelInfo->ri_ConstraintExprs = NULL; + resultRelInfo->ri_GeneratedExprs = NULL; + resultRelInfo->ri_projectReturning = NULL; + resultRelInfo->ri_onConflictArbiterIndexes = NIL; + resultRelInfo->ri_onConflict = NULL; + resultRelInfo->ri_ReturningSlot = NULL; + resultRelInfo->ri_TrigOldSlot = NULL; + resultRelInfo->ri_TrigNewSlot = NULL; + + /* + * Only ExecInitPartitionInfo() and ExecInitPartitionDispatchInfo() pass + * non-NULL partition_root_rri. For child relations that are part of the + * initial query rather than being dynamically added by tuple routing, + * this field is filled in ExecInitModifyTable(). + */ + resultRelInfo->ri_RootResultRelInfo = partition_root_rri; + resultRelInfo->ri_RootToPartitionMap = NULL; /* set by + * ExecInitRoutingInfo */ + resultRelInfo->ri_PartitionTupleSlot = NULL; /* ditto */ + resultRelInfo->ri_ChildToRootMap = NULL; + resultRelInfo->ri_ChildToRootMapValid = false; + resultRelInfo->ri_CopyMultiInsertBuffer = NULL; +} + +/* + * ExecGetTriggerResultRel + * Get a ResultRelInfo for a trigger target relation. + * + * Most of the time, triggers are fired on one of the result relations of the + * query, and so we can just return a member of the es_result_relations array, + * or the es_tuple_routing_result_relations list (if any). (Note: in self-join + * situations there might be multiple members with the same OID; if so it + * doesn't matter which one we pick.) + * + * However, it is sometimes necessary to fire triggers on other relations; + * this happens mainly when an RI update trigger queues additional triggers + * on other relations, which will be processed in the context of the outer + * query. For efficiency's sake, we want to have a ResultRelInfo for those + * triggers too; that can avoid repeated re-opening of the relation. (It + * also provides a way for EXPLAIN ANALYZE to report the runtimes of such + * triggers.) So we make additional ResultRelInfo's as needed, and save them + * in es_trig_target_relations. 
+ */ +ResultRelInfo * +ExecGetTriggerResultRel(EState *estate, Oid relid) +{ + ResultRelInfo *rInfo; + ListCell *l; + Relation rel; + MemoryContext oldcontext; + + /* Search through the query result relations */ + foreach(l, estate->es_opened_result_relations) + { + rInfo = lfirst(l); + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + return rInfo; + } + + /* + * Search through the result relations that were created during tuple + * routing, if any. + */ + foreach(l, estate->es_tuple_routing_result_relations) + { + rInfo = (ResultRelInfo *) lfirst(l); + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + return rInfo; + } + + /* Nope, but maybe we already made an extra ResultRelInfo for it */ + foreach(l, estate->es_trig_target_relations) + { + rInfo = (ResultRelInfo *) lfirst(l); + if (RelationGetRelid(rInfo->ri_RelationDesc) == relid) + return rInfo; + } + /* Nope, so we need a new one */ + + /* + * Open the target relation's relcache entry. We assume that an + * appropriate lock is still held by the backend from whenever the trigger + * event got queued, so we need take no new lock here. Also, we need not + * recheck the relkind, so no need for CheckValidResultRel. + */ + rel = table_open(relid, NoLock); + + /* + * Make the new entry in the right context. + */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + rInfo = makeNode(ResultRelInfo); + InitResultRelInfo(rInfo, + rel, + 0, /* dummy rangetable index */ + NULL, + estate->es_instrument); + estate->es_trig_target_relations = + lappend(estate->es_trig_target_relations, rInfo); + MemoryContextSwitchTo(oldcontext); + + /* + * Currently, we don't need any index information in ResultRelInfos used + * only for triggers, so no need to call ExecOpenIndices. + */ + + return rInfo; +} + +/* ---------------------------------------------------------------- + * ExecPostprocessPlan + * + * Give plan nodes a final chance to execute before shutdown + * ---------------------------------------------------------------- + */ +static void +ExecPostprocessPlan(EState *estate) +{ + ListCell *lc; + + /* + * Make sure nodes run forward. + */ + estate->es_direction = ForwardScanDirection; + + /* + * Run any secondary ModifyTable nodes to completion, in case the main + * query did not fetch all rows from them. (We do this to ensure that + * such nodes have predictable results.) + */ + foreach(lc, estate->es_auxmodifytables) + { + PlanState *ps = (PlanState *) lfirst(lc); + + for (;;) + { + TupleTableSlot *slot; + + /* Reset the per-output-tuple exprcontext each time */ + ResetPerTupleExprContext(estate); + + slot = ExecProcNode(ps); + + if (TupIsNull(slot)) + break; + } + } +} + +/* ---------------------------------------------------------------- + * ExecEndPlan + * + * Cleans up the query plan -- closes files and frees up storage + * + * NOTE: we are no longer very worried about freeing storage per se + * in this code; FreeExecutorState should be guaranteed to release all + * memory that needs to be released. What we are worried about doing + * is closing relations and dropping buffer pins. Thus, for example, + * tuple tables must be cleared or dropped to ensure pins are released. 
+ * ---------------------------------------------------------------- + */ +static void +ExecEndPlan(PlanState *planstate, EState *estate) +{ + ListCell *l; + + /* + * shut down the node-type-specific query processing + */ + ExecEndNode(planstate); + + /* + * for subplans too + */ + foreach(l, estate->es_subplanstates) + { + PlanState *subplanstate = (PlanState *) lfirst(l); + + ExecEndNode(subplanstate); + } + + /* + * destroy the executor's tuple table. Actually we only care about + * releasing buffer pins and tupdesc refcounts; there's no need to pfree + * the TupleTableSlots, since the containing memory context is about to go + * away anyway. + */ + ExecResetTupleTable(estate->es_tupleTable, false); + + /* + * Close any Relations that have been opened for range table entries or + * result relations. + */ + ExecCloseResultRelations(estate); + ExecCloseRangeTableRelations(estate); +} + +/* + * Close any relations that have been opened for ResultRelInfos. + */ +void +ExecCloseResultRelations(EState *estate) +{ + ListCell *l; + + /* + * close indexes of result relation(s) if any. (Rels themselves are + * closed in ExecCloseRangeTableRelations()) + */ + foreach(l, estate->es_opened_result_relations) + { + ResultRelInfo *resultRelInfo = lfirst(l); + + ExecCloseIndices(resultRelInfo); + } + + /* Close any relations that have been opened by ExecGetTriggerResultRel(). */ + foreach(l, estate->es_trig_target_relations) + { + ResultRelInfo *resultRelInfo = (ResultRelInfo *) lfirst(l); + + /* + * Assert this is a "dummy" ResultRelInfo, see above. Otherwise we + * might be issuing a duplicate close against a Relation opened by + * ExecGetRangeTableRelation. + */ + Assert(resultRelInfo->ri_RangeTableIndex == 0); + + /* + * Since ExecGetTriggerResultRel doesn't call ExecOpenIndices for + * these rels, we needn't call ExecCloseIndices either. + */ + Assert(resultRelInfo->ri_NumIndices == 0); + + table_close(resultRelInfo->ri_RelationDesc, NoLock); + } +} + +/* + * Close all relations opened by ExecGetRangeTableRelation(). + * + * We do not release any locks we might hold on those rels. + */ +void +ExecCloseRangeTableRelations(EState *estate) +{ + int i; + + for (i = 0; i < estate->es_range_table_size; i++) + { + if (estate->es_relations[i]) + table_close(estate->es_relations[i], NoLock); + } +} + +/* ---------------------------------------------------------------- + * ExecutePlan + * + * Processes the query plan until we have retrieved 'numberTuples' tuples, + * moving in the specified direction. + * + * Runs to completion if numberTuples is 0 + * + * Note: the ctid attribute is a 'junk' attribute that is removed before the + * user can see it + * ---------------------------------------------------------------- + */ +static void +ExecutePlan(EState *estate, + PlanState *planstate, + bool use_parallel_mode, + CmdType operation, + bool sendTuples, + uint64 numberTuples, + ScanDirection direction, + DestReceiver *dest, + bool execute_once) +{ + TupleTableSlot *slot; + uint64 current_tuple_count; + + /* + * initialize local variables + */ + current_tuple_count = 0; + + /* + * Set the direction. + */ + estate->es_direction = direction; + + /* + * If the plan might potentially be executed multiple times, we must force + * it to run without parallelism, because we might exit early. 
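+ * (For instance, a cursor that is fetched from across several
+ * ExecutorRun calls reaches here with execute_once = false and so runs
+ * without parallel workers.)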
+ */ + if (!execute_once) + use_parallel_mode = false; + + estate->es_use_parallel_mode = use_parallel_mode; + if (use_parallel_mode) + EnterParallelMode(); + + /* + * Loop until we've processed the proper number of tuples from the plan. + */ + for (;;) + { + /* Reset the per-output-tuple exprcontext */ + ResetPerTupleExprContext(estate); + + /* + * Execute the plan and obtain a tuple + */ + slot = ExecProcNode(planstate); + + /* + * if the tuple is null, then we assume there is nothing more to + * process so we just end the loop... + */ + if (TupIsNull(slot)) + break; + + /* + * If we have a junk filter, then project a new tuple with the junk + * removed. + * + * Store this new "clean" tuple in the junkfilter's resultSlot. + * (Formerly, we stored it back over the "dirty" tuple, which is WRONG + * because that tuple slot has the wrong descriptor.) + */ + if (estate->es_junkFilter != NULL) + slot = ExecFilterJunk(estate->es_junkFilter, slot); + + /* + * If we are supposed to send the tuple somewhere, do so. (In + * practice, this is probably always the case at this point.) + */ + if (sendTuples) + { + /* + * If we are not able to send the tuple, we assume the destination + * has closed and no more tuples can be sent. If that's the case, + * end the loop. + */ + if (!dest->receiveSlot(slot, dest)) + break; + } + + /* + * Count tuples processed, if this is a SELECT. (For other operation + * types, the ModifyTable plan node must count the appropriate + * events.) + */ + if (operation == CMD_SELECT) + (estate->es_processed)++; + + /* + * check our tuple count.. if we've processed the proper number then + * quit, else loop again and process more tuples. Zero numberTuples + * means no limit. + */ + current_tuple_count++; + if (numberTuples && numberTuples == current_tuple_count) + break; + } + + /* + * If we know we won't need to back up, we can release resources at this + * point. + */ + if (!(estate->es_top_eflags & EXEC_FLAG_BACKWARD)) + (void) ExecShutdownNode(planstate); + + if (use_parallel_mode) + ExitParallelMode(); +} + + +/* + * ExecRelCheck --- check that tuple meets constraints for result relation + * + * Returns NULL if OK, else name of failed check constraint + */ +static const char * +ExecRelCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + int ncheck = rel->rd_att->constr->num_check; + ConstrCheck *check = rel->rd_att->constr->check; + ExprContext *econtext; + MemoryContext oldContext; + int i; + + /* + * CheckConstraintFetch let this pass with only a warning, but now we + * should fail rather than possibly failing to enforce an important + * constraint. + */ + if (ncheck != rel->rd_rel->relchecks) + elog(ERROR, "%d pg_constraint record(s) missing for relation \"%s\"", + rel->rd_rel->relchecks - ncheck, RelationGetRelationName(rel)); + + /* + * If first time through for this result relation, build expression + * nodetrees for rel's constraint expressions. Keep them in the per-query + * memory context so they'll survive throughout the query. 
+ */ + if (resultRelInfo->ri_ConstraintExprs == NULL) + { + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + resultRelInfo->ri_ConstraintExprs = + (ExprState **) palloc(ncheck * sizeof(ExprState *)); + for (i = 0; i < ncheck; i++) + { + Expr *checkconstr; + + checkconstr = stringToNode(check[i].ccbin); + resultRelInfo->ri_ConstraintExprs[i] = + ExecPrepareExpr(checkconstr, estate); + } + MemoryContextSwitchTo(oldContext); + } + + /* + * We will use the EState's per-tuple context for evaluating constraint + * expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* And evaluate the constraints */ + for (i = 0; i < ncheck; i++) + { + ExprState *checkconstr = resultRelInfo->ri_ConstraintExprs[i]; + + /* + * NOTE: SQL specifies that a NULL result from a constraint expression + * is not to be treated as a failure. Therefore, use ExecCheck not + * ExecQual. + */ + if (!ExecCheck(checkconstr, econtext)) + return check[i].ccname; + } + + /* NULL result means no error */ + return NULL; +} + +/* + * ExecPartitionCheck --- check that tuple meets the partition constraint. + * + * Returns true if it meets the partition constraint. If the constraint + * fails and we're asked to emit an error, do so and don't return; otherwise + * return false. + */ +bool +ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate, bool emitError) +{ + ExprContext *econtext; + bool success; + + /* + * If first time through, build expression state tree for the partition + * check expression. (In the corner case where the partition check + * expression is empty, ie there's a default partition and nothing else, + * we'll be fooled into executing this code each time through. But it's + * pretty darn cheap in that case, so we don't worry about it.) + */ + if (resultRelInfo->ri_PartitionCheckExpr == NULL) + { + /* + * Ensure that the qual tree and prepared expression are in the + * query-lifespan context. + */ + MemoryContext oldcxt = MemoryContextSwitchTo(estate->es_query_cxt); + List *qual = RelationGetPartitionQual(resultRelInfo->ri_RelationDesc); + + resultRelInfo->ri_PartitionCheckExpr = ExecPrepareCheck(qual, estate); + MemoryContextSwitchTo(oldcxt); + } + + /* + * We will use the EState's per-tuple context for evaluating constraint + * expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * As in case of the catalogued constraints, we treat a NULL result as + * success here, not a failure. + */ + success = ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); + + /* if asked to emit error, don't actually return on failure */ + if (!success && emitError) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + return success; +} + +/* + * ExecPartitionCheckEmitError - Form and emit an error message after a failed + * partition constraint check. + */ +void +ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate) +{ + Oid root_relid; + TupleDesc tupdesc; + char *val_desc; + Bitmapset *modifiedCols; + + /* + * If the tuple has been routed, it's been converted to the partition's + * rowtype, which might differ from the root table's. 
We must convert it + * back to the root table's rowtype so that val_desc in the error message + * matches the input tuple. + */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + TupleDesc old_tupdesc; + AttrMap *map; + + root_relid = RelationGetRelid(rootrel->ri_RelationDesc); + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + + old_tupdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(old_tupdesc, tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, so allocate a + * new one. + */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + } + else + { + root_relid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + tupdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + } + + val_desc = ExecBuildSlotValueDescription(root_relid, + slot, + tupdesc, + modifiedCols, + 64); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("new row for relation \"%s\" violates partition constraint", + RelationGetRelationName(resultRelInfo->ri_RelationDesc)), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0, + errtable(resultRelInfo->ri_RelationDesc))); +} + +/* + * ExecConstraints - check constraints of the tuple in 'slot' + * + * This checks the traditional NOT NULL and check constraints. + * + * The partition constraint is *NOT* checked. + * + * Note: 'slot' contains the tuple to check the constraints of, which may + * have been converted from the original input tuple after tuple routing. + * 'resultRelInfo' is the final result relation, after tuple routing. + */ +void +ExecConstraints(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + TupleDesc tupdesc = RelationGetDescr(rel); + TupleConstr *constr = tupdesc->constr; + Bitmapset *modifiedCols; + + Assert(constr); /* we should not be called otherwise */ + + if (constr->has_not_null) + { + int natts = tupdesc->natts; + int attrChk; + + for (attrChk = 1; attrChk <= natts; attrChk++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, attrChk - 1); + + if (att->attnotnull && slot_attisnull(slot, attrChk)) + { + char *val_desc; + Relation orig_rel = rel; + TupleDesc orig_tupdesc = RelationGetDescr(rel); + + /* + * If the tuple has been routed, it's been converted to the + * partition's rowtype, which might differ from the root + * table's. We must convert it back to the root table's + * rowtype so that val_desc shown error message matches the + * input tuple. + */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + AttrMap *map; + + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(orig_tupdesc, + tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, so + * allocate a new one. 
+ */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + rel = rootrel->ri_RelationDesc; + } + else + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + + ereport(ERROR, + (errcode(ERRCODE_NOT_NULL_VIOLATION), + errmsg("null value in column \"%s\" of relation \"%s\" violates not-null constraint", + NameStr(att->attname), + RelationGetRelationName(orig_rel)), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0, + errtablecol(orig_rel, attrChk))); + } + } + } + + if (rel->rd_rel->relchecks > 0) + { + const char *failed; + + if ((failed = ExecRelCheck(resultRelInfo, slot, estate)) != NULL) + { + char *val_desc; + Relation orig_rel = rel; + + /* See the comment above. */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + TupleDesc old_tupdesc = RelationGetDescr(rel); + AttrMap *map; + + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(old_tupdesc, + tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, so + * allocate a new one. + */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + rel = rootrel->ri_RelationDesc; + } + else + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("new row for relation \"%s\" violates check constraint \"%s\"", + RelationGetRelationName(orig_rel), failed), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0, + errtableconstraint(orig_rel, failed))); + } + } +} + +/* + * ExecWithCheckOptions -- check that tuple satisfies any WITH CHECK OPTIONs + * of the specified kind. + * + * Note that this needs to be called multiple times to ensure that all kinds of + * WITH CHECK OPTIONs are handled (both those from views which have the WITH + * CHECK OPTION set and from row-level security policies). See ExecInsert() + * and ExecUpdate(). + */ +void +ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + TupleDesc tupdesc = RelationGetDescr(rel); + ExprContext *econtext; + ListCell *l1, + *l2; + + /* + * We will use the EState's per-tuple context for evaluating constraint + * expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check each of the constraints */ + forboth(l1, resultRelInfo->ri_WithCheckOptions, + l2, resultRelInfo->ri_WithCheckOptionExprs) + { + WithCheckOption *wco = (WithCheckOption *) lfirst(l1); + ExprState *wcoExpr = (ExprState *) lfirst(l2); + + /* + * Skip any WCOs which are not the kind we are looking for at this + * time. 
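+ *
+ * For example, an INSERT into an auto-updatable view that also
+ * has row-level security enabled may reach this function twice,
+ * once with WCO_RLS_INSERT_CHECK and once with WCO_VIEW_CHECK;
+ * each call evaluates only the matching subset of the list.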
+ */ + if (wco->kind != kind) + continue; + + /* + * WITH CHECK OPTION checks are intended to ensure that the new tuple + * is visible (in the case of a view) or that it passes the + * 'with-check' policy (in the case of row security). If the qual + * evaluates to NULL or FALSE, then the new tuple won't be included in + * the view or doesn't pass the 'with-check' policy for the table. + */ + if (!ExecQual(wcoExpr, econtext)) + { + char *val_desc; + Bitmapset *modifiedCols; + + switch (wco->kind) + { + /* + * For WITH CHECK OPTIONs coming from views, we might be + * able to provide the details on the row, depending on + * the permissions on the relation (that is, if the user + * could view it directly anyway). For RLS violations, we + * don't include the data since we don't know if the user + * should be able to view the tuple as that depends on the + * USING policy. + */ + case WCO_VIEW_CHECK: + /* See the comment in ExecConstraints(). */ + if (resultRelInfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootrel = resultRelInfo->ri_RootResultRelInfo; + TupleDesc old_tupdesc = RelationGetDescr(rel); + AttrMap *map; + + tupdesc = RelationGetDescr(rootrel->ri_RelationDesc); + /* a reverse map */ + map = build_attrmap_by_name_if_req(old_tupdesc, + tupdesc); + + /* + * Partition-specific slot's tupdesc can't be changed, + * so allocate a new one. + */ + if (map != NULL) + slot = execute_attr_map_slot(map, slot, + MakeTupleTableSlot(tupdesc, &TTSOpsVirtual)); + + modifiedCols = bms_union(ExecGetInsertedCols(rootrel, estate), + ExecGetUpdatedCols(rootrel, estate)); + rel = rootrel->ri_RelationDesc; + } + else + modifiedCols = bms_union(ExecGetInsertedCols(resultRelInfo, estate), + ExecGetUpdatedCols(resultRelInfo, estate)); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + + ereport(ERROR, + (errcode(ERRCODE_WITH_CHECK_OPTION_VIOLATION), + errmsg("new row violates check option for view \"%s\"", + wco->relname), + val_desc ? errdetail("Failing row contains %s.", + val_desc) : 0)); + break; + case WCO_RLS_INSERT_CHECK: + case WCO_RLS_UPDATE_CHECK: + if (wco->polname != NULL) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy \"%s\" for table \"%s\"", + wco->polname, wco->relname))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy for table \"%s\"", + wco->relname))); + break; + case WCO_RLS_CONFLICT_CHECK: + if (wco->polname != NULL) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy \"%s\" (USING expression) for table \"%s\"", + wco->polname, wco->relname))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("new row violates row-level security policy (USING expression) for table \"%s\"", + wco->relname))); + break; + default: + elog(ERROR, "unrecognized WCO kind: %u", wco->kind); + break; + } + } + } +} + +/* + * ExecBuildSlotValueDescription -- construct a string representing a tuple + * + * This is intentionally very similar to BuildIndexValueDescription, but + * unlike that function, we truncate long field values (to at most maxfieldlen + * bytes). That seems necessary here since heap field values could be very + * long, whereas index entries typically aren't so wide. + * + * Also, unlike the case with index entries, we need to be prepared to ignore + * dropped columns. 
We used to use the slot's tuple descriptor to decode the + * data, but the slot's descriptor doesn't identify dropped columns, so we + * now need to be passed the relation's descriptor. + * + * Note that, like BuildIndexValueDescription, if the user does not have + * permission to view any of the columns involved, a NULL is returned. Unlike + * BuildIndexValueDescription, if the user has access to view a subset of the + * column involved, that subset will be returned with a key identifying which + * columns they are. + */ +static char * +ExecBuildSlotValueDescription(Oid reloid, + TupleTableSlot *slot, + TupleDesc tupdesc, + Bitmapset *modifiedCols, + int maxfieldlen) +{ + StringInfoData buf; + StringInfoData collist; + bool write_comma = false; + bool write_comma_collist = false; + int i; + AclResult aclresult; + bool table_perm = false; + bool any_perm = false; + + /* + * Check if RLS is enabled and should be active for the relation; if so, + * then don't return anything. Otherwise, go through normal permission + * checks. + */ + if (check_enable_rls(reloid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + initStringInfo(&buf); + + appendStringInfoChar(&buf, '('); + + /* + * Check if the user has permissions to see the row. Table-level SELECT + * allows access to all columns. If the user does not have table-level + * SELECT then we check each column and include those the user has SELECT + * rights on. Additionally, we always include columns the user provided + * data for. + */ + aclresult = pg_class_aclcheck(reloid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* Set up the buffer for the column list */ + initStringInfo(&collist); + appendStringInfoChar(&collist, '('); + } + else + table_perm = any_perm = true; + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + for (i = 0; i < tupdesc->natts; i++) + { + bool column_perm = false; + char *val; + int vallen; + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + + /* ignore dropped columns */ + if (att->attisdropped) + continue; + + if (!table_perm) + { + /* + * No table-level SELECT, so need to make sure they either have + * SELECT rights on the column or that they have provided the data + * for the column. If not, omit this column from the error + * message. + */ + aclresult = pg_attribute_aclcheck(reloid, att->attnum, + GetUserId(), ACL_SELECT); + if (bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber, + modifiedCols) || aclresult == ACLCHECK_OK) + { + column_perm = any_perm = true; + + if (write_comma_collist) + appendStringInfoString(&collist, ", "); + else + write_comma_collist = true; + + appendStringInfoString(&collist, NameStr(att->attname)); + } + } + + if (table_perm || column_perm) + { + if (slot->tts_isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(att->atttypid, + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, slot->tts_values[i]); + } + + if (write_comma) + appendStringInfoString(&buf, ", "); + else + write_comma = true; + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendBinaryStringInfo(&buf, val, vallen); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + } + + /* If we end up with zero columns being returned, then return NULL. 
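+ *
+ * Otherwise the result has the form "(val1, val2, ...)" when the
+ * user has table-level SELECT, or "(col1, col2) = (val1, val2)"
+ * when only a subset of the columns could be shown.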
*/ + if (!any_perm) + return NULL; + + appendStringInfoChar(&buf, ')'); + + if (!table_perm) + { + appendStringInfoString(&collist, ") = "); + appendBinaryStringInfo(&collist, buf.data, buf.len); + + return collist.data; + } + + return buf.data; +} + + +/* + * ExecUpdateLockMode -- find the appropriate UPDATE tuple lock mode for a + * given ResultRelInfo + */ +LockTupleMode +ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo) +{ + Bitmapset *keyCols; + Bitmapset *updatedCols; + + /* + * Compute lock mode to use. If columns that are part of the key have not + * been modified, then we can use a weaker lock, allowing for better + * concurrency. + */ + updatedCols = ExecGetAllUpdatedCols(relinfo, estate); + keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc, + INDEX_ATTR_BITMAP_KEY); + + if (bms_overlap(keyCols, updatedCols)) + return LockTupleExclusive; + + return LockTupleNoKeyExclusive; +} + +/* + * ExecFindRowMark -- find the ExecRowMark struct for given rangetable index + * + * If no such struct, either return NULL or throw error depending on missing_ok + */ +ExecRowMark * +ExecFindRowMark(EState *estate, Index rti, bool missing_ok) +{ + if (rti > 0 && rti <= estate->es_range_table_size && + estate->es_rowmarks != NULL) + { + ExecRowMark *erm = estate->es_rowmarks[rti - 1]; + + if (erm) + return erm; + } + if (!missing_ok) + elog(ERROR, "failed to find ExecRowMark for rangetable index %u", rti); + return NULL; +} + +/* + * ExecBuildAuxRowMark -- create an ExecAuxRowMark struct + * + * Inputs are the underlying ExecRowMark struct and the targetlist of the + * input plan node (not planstate node!). We need the latter to find out + * the column numbers of the resjunk columns. + */ +ExecAuxRowMark * +ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) +{ + ExecAuxRowMark *aerm = (ExecAuxRowMark *) palloc0(sizeof(ExecAuxRowMark)); + char resname[32]; + + aerm->rowmark = erm; + + /* Look up the resjunk columns associated with this rowmark */ + if (erm->markType != ROW_MARK_COPY) + { + /* need ctid for all methods other than COPY */ + snprintf(resname, sizeof(resname), "ctid%u", erm->rowmarkId); + aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->ctidAttNo)) + elog(ERROR, "could not find junk %s column", resname); + } + else + { + /* need wholerow if COPY */ + snprintf(resname, sizeof(resname), "wholerow%u", erm->rowmarkId); + aerm->wholeAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->wholeAttNo)) + elog(ERROR, "could not find junk %s column", resname); + } + + /* if child rel, need tableoid */ + if (erm->rti != erm->prti) + { + snprintf(resname, sizeof(resname), "tableoid%u", erm->rowmarkId); + aerm->toidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->toidAttNo)) + elog(ERROR, "could not find junk %s column", resname); + } + + return aerm; +} + + +/* + * EvalPlanQual logic --- recheck modified tuple(s) to see if we want to + * process the updated version under READ COMMITTED rules. + * + * See backend/executor/README for some info about how this works. + */ + + +/* + * Check the updated version of a tuple to see if we want to process it under + * READ COMMITTED rules. 
+ * + * epqstate - state for EvalPlanQual rechecking + * relation - table containing tuple + * rti - rangetable index of table containing tuple + * inputslot - tuple for processing - this can be the slot from + * EvalPlanQualSlot(), for the increased efficiency. + * + * This tests whether the tuple in inputslot still matches the relevant + * quals. For that result to be useful, typically the input tuple has to be + * last row version (otherwise the result isn't particularly useful) and + * locked (otherwise the result might be out of date). That's typically + * achieved by using table_tuple_lock() with the + * TUPLE_LOCK_FLAG_FIND_LAST_VERSION flag. + * + * Returns a slot containing the new candidate update/delete tuple, or + * NULL if we determine we shouldn't process the row. + */ +TupleTableSlot * +EvalPlanQual(EPQState *epqstate, Relation relation, + Index rti, TupleTableSlot *inputslot) +{ + TupleTableSlot *slot; + TupleTableSlot *testslot; + + Assert(rti > 0); + + /* + * Need to run a recheck subquery. Initialize or reinitialize EPQ state. + */ + EvalPlanQualBegin(epqstate); + + /* + * Callers will often use the EvalPlanQualSlot to store the tuple to avoid + * an unnecessary copy. + */ + testslot = EvalPlanQualSlot(epqstate, relation, rti); + if (testslot != inputslot) + ExecCopySlot(testslot, inputslot); + + /* + * Run the EPQ query. We assume it will return at most one tuple. + */ + slot = EvalPlanQualNext(epqstate); + + /* + * If we got a tuple, force the slot to materialize the tuple so that it + * is not dependent on any local state in the EPQ query (in particular, + * it's highly likely that the slot contains references to any pass-by-ref + * datums that may be present in copyTuple). As with the next step, this + * is to guard against early re-use of the EPQ query. + */ + if (!TupIsNull(slot)) + ExecMaterializeSlot(slot); + + /* + * Clear out the test tuple. This is needed in case the EPQ query is + * re-used to test a tuple for a different relation. (Not clear that can + * really happen, but let's be safe.) + */ + ExecClearTuple(testslot); + + return slot; +} + +/* + * EvalPlanQualInit -- initialize during creation of a plan state node + * that might need to invoke EPQ processing. + * + * Note: subplan/auxrowmarks can be NULL/NIL if they will be set later + * with EvalPlanQualSetPlan. + */ +void +EvalPlanQualInit(EPQState *epqstate, EState *parentestate, + Plan *subplan, List *auxrowmarks, int epqParam) +{ + Index rtsize = parentestate->es_range_table_size; + + /* initialize data not changing over EPQState's lifetime */ + epqstate->parentestate = parentestate; + epqstate->epqParam = epqParam; + + /* + * Allocate space to reference a slot for each potential rti - do so now + * rather than in EvalPlanQualBegin(), as done for other dynamically + * allocated resources, so EvalPlanQualSlot() can be used to hold tuples + * that *may* need EPQ later, without forcing the overhead of + * EvalPlanQualBegin(). + */ + epqstate->tuple_table = NIL; + epqstate->relsubs_slot = (TupleTableSlot **) + palloc0(rtsize * sizeof(TupleTableSlot *)); + + /* ... and remember data that EvalPlanQualBegin will need */ + epqstate->plan = subplan; + epqstate->arowMarks = auxrowmarks; + + /* ... and mark the EPQ state inactive */ + epqstate->origslot = NULL; + epqstate->recheckestate = NULL; + epqstate->recheckplanstate = NULL; + epqstate->relsubs_rowmark = NULL; + epqstate->relsubs_done = NULL; +} + +/* + * EvalPlanQualSetPlan -- set or change subplan of an EPQState. 
+ * + * We used to need this so that ModifyTable could deal with multiple subplans. + * It could now be refactored out of existence. + */ +void +EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) +{ + /* If we have a live EPQ query, shut it down */ + EvalPlanQualEnd(epqstate); + /* And set/change the plan pointer */ + epqstate->plan = subplan; + /* The rowmarks depend on the plan, too */ + epqstate->arowMarks = auxrowmarks; +} + +/* + * Return, and create if necessary, a slot for an EPQ test tuple. + * + * Note this only requires EvalPlanQualInit() to have been called, + * EvalPlanQualBegin() is not necessary. + */ +TupleTableSlot * +EvalPlanQualSlot(EPQState *epqstate, + Relation relation, Index rti) +{ + TupleTableSlot **slot; + + Assert(relation); + Assert(rti > 0 && rti <= epqstate->parentestate->es_range_table_size); + slot = &epqstate->relsubs_slot[rti - 1]; + + if (*slot == NULL) + { + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(epqstate->parentestate->es_query_cxt); + *slot = table_slot_create(relation, &epqstate->tuple_table); + MemoryContextSwitchTo(oldcontext); + } + + return *slot; +} + +/* + * Fetch the current row value for a non-locked relation, identified by rti, + * that needs to be scanned by an EvalPlanQual operation. origslot must have + * been set to contain the current result row (top-level row) that we need to + * recheck. Returns true if a substitution tuple was found, false if not. + */ +bool +EvalPlanQualFetchRowMark(EPQState *epqstate, Index rti, TupleTableSlot *slot) +{ + ExecAuxRowMark *earm = epqstate->relsubs_rowmark[rti - 1]; + ExecRowMark *erm = earm->rowmark; + Datum datum; + bool isNull; + + Assert(earm != NULL); + Assert(epqstate->origslot != NULL); + + if (RowMarkRequiresRowShareLock(erm->markType)) + elog(ERROR, "EvalPlanQual doesn't support locking rowmarks"); + + /* if child rel, must check whether it produced this row */ + if (erm->rti != erm->prti) + { + Oid tableoid; + + datum = ExecGetJunkAttribute(epqstate->origslot, + earm->toidAttNo, + &isNull); + /* non-locked rels could be on the inside of outer joins */ + if (isNull) + return false; + + tableoid = DatumGetObjectId(datum); + + Assert(OidIsValid(erm->relid)); + if (tableoid != erm->relid) + { + /* this child is inactive right now */ + return false; + } + } + + if (erm->markType == ROW_MARK_REFERENCE) + { + Assert(erm->relation != NULL); + + /* fetch the tuple's ctid */ + datum = ExecGetJunkAttribute(epqstate->origslot, + earm->ctidAttNo, + &isNull); + /* non-locked rels could be on the inside of outer joins */ + if (isNull) + return false; + + /* fetch requests on foreign tables must be passed to their FDW */ + if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + { + FdwRoutine *fdwroutine; + bool updated = false; + + fdwroutine = GetFdwRoutineForRelation(erm->relation, false); + /* this should have been checked already, but let's be safe */ + if (fdwroutine->RefetchForeignRow == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot lock rows in foreign table \"%s\"", + RelationGetRelationName(erm->relation)))); + + fdwroutine->RefetchForeignRow(epqstate->recheckestate, + erm, + datum, + slot, + &updated); + if (TupIsNull(slot)) + elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); + + /* + * Ideally we'd insist on updated == false here, but that assumes + * that FDWs can track that exactly, which they might not be able + * to. So just ignore the flag. 
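+ *
+ * Either way, the slot now holds whatever row version the FDW
+ * handed back, and that is the version the EPQ recheck will be
+ * applied to.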
+ */ + return true; + } + else + { + /* ordinary table, fetch the tuple */ + if (!table_tuple_fetch_row_version(erm->relation, + (ItemPointer) DatumGetPointer(datum), + SnapshotAny, slot)) + elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); + return true; + } + } + else + { + Assert(erm->markType == ROW_MARK_COPY); + + /* fetch the whole-row Var for the relation */ + datum = ExecGetJunkAttribute(epqstate->origslot, + earm->wholeAttNo, + &isNull); + /* non-locked rels could be on the inside of outer joins */ + if (isNull) + return false; + + ExecStoreHeapTupleDatum(datum, slot); + return true; + } +} + +/* + * Fetch the next row (if any) from EvalPlanQual testing + * + * (In practice, there should never be more than one row...) + */ +TupleTableSlot * +EvalPlanQualNext(EPQState *epqstate) +{ + MemoryContext oldcontext; + TupleTableSlot *slot; + + oldcontext = MemoryContextSwitchTo(epqstate->recheckestate->es_query_cxt); + slot = ExecProcNode(epqstate->recheckplanstate); + MemoryContextSwitchTo(oldcontext); + + return slot; +} + +/* + * Initialize or reset an EvalPlanQual state tree + */ +void +EvalPlanQualBegin(EPQState *epqstate) +{ + EState *parentestate = epqstate->parentestate; + EState *recheckestate = epqstate->recheckestate; + + if (recheckestate == NULL) + { + /* First time through, so create a child EState */ + EvalPlanQualStart(epqstate, epqstate->plan); + } + else + { + /* + * We already have a suitable child EPQ tree, so just reset it. + */ + Index rtsize = parentestate->es_range_table_size; + PlanState *rcplanstate = epqstate->recheckplanstate; + + MemSet(epqstate->relsubs_done, 0, rtsize * sizeof(bool)); + + /* Recopy current values of parent parameters */ + if (parentestate->es_plannedstmt->paramExecTypes != NIL) + { + int i; + + /* + * Force evaluation of any InitPlan outputs that could be needed + * by the subplan, just in case they got reset since + * EvalPlanQualStart (see comments therein). + */ + ExecSetParamPlanMulti(rcplanstate->plan->extParam, + GetPerTupleExprContext(parentestate)); + + i = list_length(parentestate->es_plannedstmt->paramExecTypes); + + while (--i >= 0) + { + /* copy value if any, but not execPlan link */ + recheckestate->es_param_exec_vals[i].value = + parentestate->es_param_exec_vals[i].value; + recheckestate->es_param_exec_vals[i].isnull = + parentestate->es_param_exec_vals[i].isnull; + } + } + + /* + * Mark child plan tree as needing rescan at all scan nodes. The + * first ExecProcNode will take care of actually doing the rescan. + */ + rcplanstate->chgParam = bms_add_member(rcplanstate->chgParam, + epqstate->epqParam); + } +} + +/* + * Start execution of an EvalPlanQual plan tree. + * + * This is a cut-down version of ExecutorStart(): we copy some state from + * the top-level estate rather than initializing it fresh. + */ +static void +EvalPlanQualStart(EPQState *epqstate, Plan *planTree) +{ + EState *parentestate = epqstate->parentestate; + Index rtsize = parentestate->es_range_table_size; + EState *rcestate; + MemoryContext oldcontext; + ListCell *l; + + epqstate->recheckestate = rcestate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(rcestate->es_query_cxt); + + /* signal that this is an EState for executing EPQ */ + rcestate->es_epq_active = epqstate; + + /* + * Child EPQ EStates share the parent's copy of unchanging state such as + * the snapshot, rangetable, and external Param info. They need their own + * copies of local state, including a tuple table, es_param_exec_vals, + * result-rel info, etc. 
+ */ + rcestate->es_direction = ForwardScanDirection; + rcestate->es_snapshot = parentestate->es_snapshot; + rcestate->es_crosscheck_snapshot = parentestate->es_crosscheck_snapshot; + rcestate->es_range_table = parentestate->es_range_table; + rcestate->es_range_table_size = parentestate->es_range_table_size; + rcestate->es_relations = parentestate->es_relations; + rcestate->es_queryEnv = parentestate->es_queryEnv; + rcestate->es_rowmarks = parentestate->es_rowmarks; + rcestate->es_plannedstmt = parentestate->es_plannedstmt; + rcestate->es_junkFilter = parentestate->es_junkFilter; + rcestate->es_output_cid = parentestate->es_output_cid; + + /* + * ResultRelInfos needed by subplans are initialized from scratch when the + * subplans themselves are initialized. + */ + rcestate->es_result_relations = NULL; + /* es_trig_target_relations must NOT be copied */ + rcestate->es_top_eflags = parentestate->es_top_eflags; + rcestate->es_instrument = parentestate->es_instrument; + /* es_auxmodifytables must NOT be copied */ + + /* + * The external param list is simply shared from parent. The internal + * param workspace has to be local state, but we copy the initial values + * from the parent, so as to have access to any param values that were + * already set from other parts of the parent's plan tree. + */ + rcestate->es_param_list_info = parentestate->es_param_list_info; + if (parentestate->es_plannedstmt->paramExecTypes != NIL) + { + int i; + + /* + * Force evaluation of any InitPlan outputs that could be needed by + * the subplan. (With more complexity, maybe we could postpone this + * till the subplan actually demands them, but it doesn't seem worth + * the trouble; this is a corner case already, since usually the + * InitPlans would have been evaluated before reaching EvalPlanQual.) + * + * This will not touch output params of InitPlans that occur somewhere + * within the subplan tree, only those that are attached to the + * ModifyTable node or above it and are referenced within the subplan. + * That's OK though, because the planner would only attach such + * InitPlans to a lower-level SubqueryScan node, and EPQ execution + * will not descend into a SubqueryScan. + * + * The EState's per-output-tuple econtext is sufficiently short-lived + * for this, since it should get reset before there is any chance of + * doing EvalPlanQual again. + */ + ExecSetParamPlanMulti(planTree->extParam, + GetPerTupleExprContext(parentestate)); + + /* now make the internal param workspace ... */ + i = list_length(parentestate->es_plannedstmt->paramExecTypes); + rcestate->es_param_exec_vals = (ParamExecData *) + palloc0(i * sizeof(ParamExecData)); + /* ... and copy down all values, whether really needed or not */ + while (--i >= 0) + { + /* copy value if any, but not execPlan link */ + rcestate->es_param_exec_vals[i].value = + parentestate->es_param_exec_vals[i].value; + rcestate->es_param_exec_vals[i].isnull = + parentestate->es_param_exec_vals[i].isnull; + } + } + + /* + * Initialize private state information for each SubPlan. We must do this + * before running ExecInitNode on the main query tree, since + * ExecInitSubPlan expects to be able to find these entries. Some of the + * SubPlans might not be used in the part of the plan tree we intend to + * run, but since it's not easy to tell which, we just initialize them + * all. 
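+ *
+ * (SubPlan expressions look up their plan state by position in
+ * this list, using plan_id as an index, so the list must be
+ * complete and keep the same order as in the parent estate.)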
+ */ + Assert(rcestate->es_subplanstates == NIL); + foreach(l, parentestate->es_plannedstmt->subplans) + { + Plan *subplan = (Plan *) lfirst(l); + PlanState *subplanstate; + + subplanstate = ExecInitNode(subplan, rcestate, 0); + rcestate->es_subplanstates = lappend(rcestate->es_subplanstates, + subplanstate); + } + + /* + * Build an RTI indexed array of rowmarks, so that + * EvalPlanQualFetchRowMark() can efficiently access the to be fetched + * rowmark. + */ + epqstate->relsubs_rowmark = (ExecAuxRowMark **) + palloc0(rtsize * sizeof(ExecAuxRowMark *)); + foreach(l, epqstate->arowMarks) + { + ExecAuxRowMark *earm = (ExecAuxRowMark *) lfirst(l); + + epqstate->relsubs_rowmark[earm->rowmark->rti - 1] = earm; + } + + /* + * Initialize per-relation EPQ tuple states to not-fetched. + */ + epqstate->relsubs_done = (bool *) + palloc0(rtsize * sizeof(bool)); + + /* + * Initialize the private state information for all the nodes in the part + * of the plan tree we need to run. This opens files, allocates storage + * and leaves us ready to start processing tuples. + */ + epqstate->recheckplanstate = ExecInitNode(planTree, rcestate, 0); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * EvalPlanQualEnd -- shut down at termination of parent plan state node, + * or if we are done with the current EPQ child. + * + * This is a cut-down version of ExecutorEnd(); basically we want to do most + * of the normal cleanup, but *not* close result relations (which we are + * just sharing from the outer query). We do, however, have to close any + * result and trigger target relations that got opened, since those are not + * shared. (There probably shouldn't be any of the latter, but just in + * case...) + */ +void +EvalPlanQualEnd(EPQState *epqstate) +{ + EState *estate = epqstate->recheckestate; + Index rtsize; + MemoryContext oldcontext; + ListCell *l; + + rtsize = epqstate->parentestate->es_range_table_size; + + /* + * We may have a tuple table, even if EPQ wasn't started, because we allow + * use of EvalPlanQualSlot() without calling EvalPlanQualBegin(). + */ + if (epqstate->tuple_table != NIL) + { + memset(epqstate->relsubs_slot, 0, + rtsize * sizeof(TupleTableSlot *)); + ExecResetTupleTable(epqstate->tuple_table, true); + epqstate->tuple_table = NIL; + } + + /* EPQ wasn't started, nothing further to do */ + if (estate == NULL) + return; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + ExecEndNode(epqstate->recheckplanstate); + + foreach(l, estate->es_subplanstates) + { + PlanState *subplanstate = (PlanState *) lfirst(l); + + ExecEndNode(subplanstate); + } + + /* throw away the per-estate tuple table, some node may have used it */ + ExecResetTupleTable(estate->es_tupleTable, false); + + /* Close any result and trigger target relations attached to this EState */ + ExecCloseResultRelations(estate); + + MemoryContextSwitchTo(oldcontext); + + FreeExecutorState(estate); + + /* Mark EPQState idle */ + epqstate->origslot = NULL; + epqstate->recheckestate = NULL; + epqstate->recheckplanstate = NULL; + epqstate->relsubs_rowmark = NULL; + epqstate->relsubs_done = NULL; +} diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c new file mode 100644 index 0000000..f8a4a40 --- /dev/null +++ b/src/backend/executor/execParallel.c @@ -0,0 +1,1498 @@ +/*------------------------------------------------------------------------- + * + * execParallel.c + * Support routines for parallel execution. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * This file contains routines that are intended to support setting up, + * using, and tearing down a ParallelContext from within the PostgreSQL + * executor. The ParallelContext machinery will handle starting the + * workers and ensuring that their state generally matches that of the + * leader; see src/backend/access/transam/README.parallel for details. + * However, we must save and restore relevant executor state, such as + * any ParamListInfo associated with the query, buffer/WAL usage info, and + * the actual plan to be passed down to the worker. + * + * IDENTIFICATION + * src/backend/executor/execParallel.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execParallel.h" +#include "executor/executor.h" +#include "executor/nodeAgg.h" +#include "executor/nodeAppend.h" +#include "executor/nodeBitmapHeapscan.h" +#include "executor/nodeCustom.h" +#include "executor/nodeForeignscan.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "executor/nodeMemoize.h" +#include "executor/nodeSeqscan.h" +#include "executor/nodeSort.h" +#include "executor/nodeSubplan.h" +#include "executor/tqueue.h" +#include "jit/jit.h" +#include "nodes/nodeFuncs.h" +#include "pgstat.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/datum.h" +#include "utils/dsa.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + +/* + * Magic numbers for parallel executor communication. We use constants + * greater than any 32-bit integer here so that values < 2^32 can be used + * by individual parallel nodes to store their own state. + */ +#define PARALLEL_KEY_EXECUTOR_FIXED UINT64CONST(0xE000000000000001) +#define PARALLEL_KEY_PLANNEDSTMT UINT64CONST(0xE000000000000002) +#define PARALLEL_KEY_PARAMLISTINFO UINT64CONST(0xE000000000000003) +#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xE000000000000004) +#define PARALLEL_KEY_TUPLE_QUEUE UINT64CONST(0xE000000000000005) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000006) +#define PARALLEL_KEY_DSA UINT64CONST(0xE000000000000007) +#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xE000000000000008) +#define PARALLEL_KEY_JIT_INSTRUMENTATION UINT64CONST(0xE000000000000009) +#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xE00000000000000A) + +#define PARALLEL_TUPLE_QUEUE_SIZE 65536 + +/* + * Fixed-size random stuff that we need to pass to parallel workers. + */ +typedef struct FixedParallelExecutorState +{ + int64 tuples_needed; /* tuple bound, see ExecSetTupleBound */ + dsa_pointer param_exec; + int eflags; + int jit_flags; +} FixedParallelExecutorState; + +/* + * DSM structure for accumulating per-PlanState instrumentation. + * + * instrument_options: Same meaning here as in instrument.c. + * + * instrument_offset: Offset, relative to the start of this structure, + * of the first Instrumentation object. This will depend on the length of + * the plan_node_id array. + * + * num_workers: Number of workers. + * + * num_plan_nodes: Number of plan nodes. + * + * plan_node_id: Array of plan nodes for which we are gathering instrumentation + * from parallel workers. The length of this array is given by num_plan_nodes. 
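+ *
+ * The Instrumentation objects that follow are laid out plan-node-major:
+ * the entry for the plan node at index i of plan_node_id and for worker
+ * w is GetInstrumentationArray(sei)[i * num_workers + w].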
+ */ +struct SharedExecutorInstrumentation +{ + int instrument_options; + int instrument_offset; + int num_workers; + int num_plan_nodes; + int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; + /* array of num_plan_nodes * num_workers Instrumentation objects follows */ +}; +#define GetInstrumentationArray(sei) \ + (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ + (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + +/* Context object for ExecParallelEstimate. */ +typedef struct ExecParallelEstimateContext +{ + ParallelContext *pcxt; + int nnodes; +} ExecParallelEstimateContext; + +/* Context object for ExecParallelInitializeDSM. */ +typedef struct ExecParallelInitializeDSMContext +{ + ParallelContext *pcxt; + SharedExecutorInstrumentation *instrumentation; + int nnodes; +} ExecParallelInitializeDSMContext; + +/* Helper functions that run in the parallel leader. */ +static char *ExecSerializePlan(Plan *plan, EState *estate); +static bool ExecParallelEstimate(PlanState *node, + ExecParallelEstimateContext *e); +static bool ExecParallelInitializeDSM(PlanState *node, + ExecParallelInitializeDSMContext *d); +static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt, + bool reinitialize); +static bool ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt); +static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, + SharedExecutorInstrumentation *instrumentation); + +/* Helper function that runs in the parallel worker. */ +static DestReceiver *ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc); + +/* + * Create a serialized representation of the plan to be sent to each worker. + */ +static char * +ExecSerializePlan(Plan *plan, EState *estate) +{ + PlannedStmt *pstmt; + ListCell *lc; + + /* We can't scribble on the original plan, so make a copy. */ + plan = copyObject(plan); + + /* + * The worker will start its own copy of the executor, and that copy will + * insert a junk filter if the toplevel node has any resjunk entries. We + * don't want that to happen, because while resjunk columns shouldn't be + * sent back to the user, here the tuples are coming back to another + * backend which may very well need them. So mutate the target list + * accordingly. This is sort of a hack; there might be better ways to do + * this... + */ + foreach(lc, plan->targetlist) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + + tle->resjunk = false; + } + + /* + * Create a dummy PlannedStmt. Most of the fields don't need to be valid + * for our purposes, but the worker will need at least a minimal + * PlannedStmt to start the executor. + */ + pstmt = makeNode(PlannedStmt); + pstmt->commandType = CMD_SELECT; + pstmt->queryId = pgstat_get_my_query_id(); + pstmt->hasReturning = false; + pstmt->hasModifyingCTE = false; + pstmt->canSetTag = true; + pstmt->transientPlan = false; + pstmt->dependsOnRole = false; + pstmt->parallelModeNeeded = false; + pstmt->planTree = plan; + pstmt->rtable = estate->es_range_table; + pstmt->resultRelations = NIL; + pstmt->appendRelations = NIL; + + /* + * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list + * for unsafe ones (so that the list indexes of the safe ones are + * preserved). This positively ensures that the worker won't try to run, + * or even do ExecInitNode on, an unsafe subplan. That's important to + * protect, eg, non-parallel-aware FDWs from getting into trouble. 
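+ *
+ * (The worker's InitPlan still walks this list, but ExecInitNode
+ * simply returns NULL for the holes, so they are harmless there.)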
+ */ + pstmt->subplans = NIL; + foreach(lc, estate->es_plannedstmt->subplans) + { + Plan *subplan = (Plan *) lfirst(lc); + + if (subplan && !subplan->parallel_safe) + subplan = NULL; + pstmt->subplans = lappend(pstmt->subplans, subplan); + } + + pstmt->rewindPlanIDs = NULL; + pstmt->rowMarks = NIL; + pstmt->relationOids = NIL; + pstmt->invalItems = NIL; /* workers can't replan anyway... */ + pstmt->paramExecTypes = estate->es_plannedstmt->paramExecTypes; + pstmt->utilityStmt = NULL; + pstmt->stmt_location = -1; + pstmt->stmt_len = -1; + + /* Return serialized copy of our dummy PlannedStmt. */ + return nodeToString(pstmt); +} + +/* + * Parallel-aware plan nodes (and occasionally others) may need some state + * which is shared across all parallel workers. Before we size the DSM, give + * them a chance to call shm_toc_estimate_chunk or shm_toc_estimate_keys on + * &pcxt->estimator. + * + * While we're at it, count the number of PlanState nodes in the tree, so + * we know how many Instrumentation structures we need. + */ +static bool +ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) +{ + if (planstate == NULL) + return false; + + /* Count this node. */ + e->nnodes++; + + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanEstimate((SeqScanState *) planstate, + e->pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanEstimate((IndexScanState *) planstate, + e->pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanEstimate((IndexOnlyScanState *) planstate, + e->pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanEstimate((ForeignScanState *) planstate, + e->pcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendEstimate((AppendState *) planstate, + e->pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanEstimate((CustomScanState *) planstate, + e->pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate, + e->pcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinEstimate((HashJoinState *) planstate, + e->pcxt); + break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashEstimate((HashState *) planstate, e->pcxt); + break; + case T_SortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSortEstimate((SortState *) planstate, e->pcxt); + break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt); + break; + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggEstimate((AggState *) planstate, e->pcxt); + break; + case T_MemoizeState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecMemoizeEstimate((MemoizeState *) planstate, e->pcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelEstimate, e); +} + +/* + * Estimate the amount of space required to serialize the indicated parameters. 
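+ *
+ * The computed size must track the format written by
+ * SerializeParamExecParams below: a leading 4-byte parameter count,
+ * then for each parameter a 4-byte paramid followed by the datum as
+ * laid out by datumSerialize().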
+ */ +static Size +EstimateParamExecSpace(EState *estate, Bitmapset *params) +{ + int paramid; + Size sz = sizeof(int); + + paramid = -1; + while ((paramid = bms_next_member(params, paramid)) >= 0) + { + Oid typeOid; + int16 typLen; + bool typByVal; + ParamExecData *prm; + + prm = &(estate->es_param_exec_vals[paramid]); + typeOid = list_nth_oid(estate->es_plannedstmt->paramExecTypes, + paramid); + + sz = add_size(sz, sizeof(int)); /* space for paramid */ + + /* space for datum/isnull */ + if (OidIsValid(typeOid)) + get_typlenbyval(typeOid, &typLen, &typByVal); + else + { + /* If no type OID, assume by-value, like copyParamList does. */ + typLen = sizeof(Datum); + typByVal = true; + } + sz = add_size(sz, + datumEstimateSpace(prm->value, prm->isnull, + typByVal, typLen)); + } + return sz; +} + +/* + * Serialize specified PARAM_EXEC parameters. + * + * We write the number of parameters first, as a 4-byte integer, and then + * write details for each parameter in turn. The details for each parameter + * consist of a 4-byte paramid (location of param in execution time internal + * parameter array) and then the datum as serialized by datumSerialize(). + */ +static dsa_pointer +SerializeParamExecParams(EState *estate, Bitmapset *params, dsa_area *area) +{ + Size size; + int nparams; + int paramid; + ParamExecData *prm; + dsa_pointer handle; + char *start_address; + + /* Allocate enough space for the current parameter values. */ + size = EstimateParamExecSpace(estate, params); + handle = dsa_allocate(area, size); + start_address = dsa_get_address(area, handle); + + /* First write the number of parameters as a 4-byte integer. */ + nparams = bms_num_members(params); + memcpy(start_address, &nparams, sizeof(int)); + start_address += sizeof(int); + + /* Write details for each parameter in turn. */ + paramid = -1; + while ((paramid = bms_next_member(params, paramid)) >= 0) + { + Oid typeOid; + int16 typLen; + bool typByVal; + + prm = &(estate->es_param_exec_vals[paramid]); + typeOid = list_nth_oid(estate->es_plannedstmt->paramExecTypes, + paramid); + + /* Write paramid. */ + memcpy(start_address, ¶mid, sizeof(int)); + start_address += sizeof(int); + + /* Write datum/isnull */ + if (OidIsValid(typeOid)) + get_typlenbyval(typeOid, &typLen, &typByVal); + else + { + /* If no type OID, assume by-value, like copyParamList does. */ + typLen = sizeof(Datum); + typByVal = true; + } + datumSerialize(prm->value, prm->isnull, typByVal, typLen, + &start_address); + } + + return handle; +} + +/* + * Restore specified PARAM_EXEC parameters. + */ +static void +RestoreParamExecParams(char *start_address, EState *estate) +{ + int nparams; + int i; + int paramid; + + memcpy(&nparams, start_address, sizeof(int)); + start_address += sizeof(int); + + for (i = 0; i < nparams; i++) + { + ParamExecData *prm; + + /* Read paramid */ + memcpy(¶mid, start_address, sizeof(int)); + start_address += sizeof(int); + prm = &(estate->es_param_exec_vals[paramid]); + + /* Read datum/isnull. */ + prm->value = datumRestore(&start_address, &prm->isnull); + prm->execPlan = NULL; + } +} + +/* + * Initialize the dynamic shared memory segment that will be used to control + * parallel execution. + */ +static bool +ExecParallelInitializeDSM(PlanState *planstate, + ExecParallelInitializeDSMContext *d) +{ + if (planstate == NULL) + return false; + + /* If instrumentation is enabled, initialize slot for this node. 
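+ *
+ * The slot index is d->nnodes, i.e. nodes are recorded in the same
+ * depth-first order in which ExecParallelEstimate counted them, which
+ * keeps the two node counts consistent for the cross-check in
+ * ExecInitParallelPlan.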
*/ + if (d->instrumentation != NULL) + d->instrumentation->plan_node_id[d->nnodes] = + planstate->plan->plan_node_id; + + /* Count this node. */ + d->nnodes++; + + /* + * Call initializers for DSM-using plan nodes. + * + * Most plan nodes won't do anything here, but plan nodes that allocated + * DSM may need to initialize shared state in the DSM before parallel + * workers are launched. They can allocate the space they previously + * estimated using shm_toc_allocate, and add the keys they previously + * estimated using shm_toc_insert, in each case targeting pcxt->toc. + */ + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanInitializeDSM((SeqScanState *) planstate, + d->pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanInitializeDSM((IndexScanState *) planstate, + d->pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanInitializeDSM((IndexOnlyScanState *) planstate, + d->pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanInitializeDSM((ForeignScanState *) planstate, + d->pcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendInitializeDSM((AppendState *) planstate, + d->pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanInitializeDSM((CustomScanState *) planstate, + d->pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, + d->pcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinInitializeDSM((HashJoinState *) planstate, + d->pcxt); + break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeDSM((HashState *) planstate, d->pcxt); + break; + case T_SortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSortInitializeDSM((SortState *) planstate, d->pcxt); + break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt); + break; + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeDSM((AggState *) planstate, d->pcxt); + break; + case T_MemoizeState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecMemoizeInitializeDSM((MemoizeState *) planstate, d->pcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); +} + +/* + * It sets up the response queues for backend workers to return tuples + * to the main backend and start the workers. + */ +static shm_mq_handle ** +ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize) +{ + shm_mq_handle **responseq; + char *tqueuespace; + int i; + + /* Skip this if no workers. */ + if (pcxt->nworkers == 0) + return NULL; + + /* Allocate memory for shared memory queue handles. */ + responseq = (shm_mq_handle **) + palloc(pcxt->nworkers * sizeof(shm_mq_handle *)); + + /* + * If not reinitializing, allocate space from the DSM for the queues; + * otherwise, find the already allocated space. 
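+ *
+ * Each worker gets its own PARALLEL_TUPLE_QUEUE_SIZE bytes of this
+ * space for its queue; the leader attaches to every queue as the
+ * receiver.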
+ */ + if (!reinitialize) + tqueuespace = + shm_toc_allocate(pcxt->toc, + mul_size(PARALLEL_TUPLE_QUEUE_SIZE, + pcxt->nworkers)); + else + tqueuespace = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_TUPLE_QUEUE, false); + + /* Create the queues, and become the receiver for each. */ + for (i = 0; i < pcxt->nworkers; ++i) + { + shm_mq *mq; + + mq = shm_mq_create(tqueuespace + + ((Size) i) * PARALLEL_TUPLE_QUEUE_SIZE, + (Size) PARALLEL_TUPLE_QUEUE_SIZE); + + shm_mq_set_receiver(mq, MyProc); + responseq[i] = shm_mq_attach(mq, pcxt->seg, NULL); + } + + /* Add array of queues to shm_toc, so others can find it. */ + if (!reinitialize) + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLE_QUEUE, tqueuespace); + + /* Return array of handles. */ + return responseq; +} + +/* + * Sets up the required infrastructure for backend workers to perform + * execution and return results to the main backend. + */ +ParallelExecutorInfo * +ExecInitParallelPlan(PlanState *planstate, EState *estate, + Bitmapset *sendParams, int nworkers, + int64 tuples_needed) +{ + ParallelExecutorInfo *pei; + ParallelContext *pcxt; + ExecParallelEstimateContext e; + ExecParallelInitializeDSMContext d; + FixedParallelExecutorState *fpes; + char *pstmt_data; + char *pstmt_space; + char *paramlistinfo_space; + BufferUsage *bufusage_space; + WalUsage *walusage_space; + SharedExecutorInstrumentation *instrumentation = NULL; + SharedJitInstrumentation *jit_instrumentation = NULL; + int pstmt_len; + int paramlistinfo_len; + int instrumentation_len = 0; + int jit_instrumentation_len = 0; + int instrument_offset = 0; + Size dsa_minsize = dsa_minimum_size(); + char *query_string; + int query_len; + + /* + * Force any initplan outputs that we're going to pass to workers to be + * evaluated, if they weren't already. + * + * For simplicity, we use the EState's per-output-tuple ExprContext here. + * That risks intra-query memory leakage, since we might pass through here + * many times before that ExprContext gets reset; but ExecSetParamPlan + * doesn't normally leak any memory in the context (see its comments), so + * it doesn't seem worth complicating this function's API to pass it a + * shorter-lived ExprContext. This might need to change someday. + */ + ExecSetParamPlanMulti(sendParams, GetPerTupleExprContext(estate)); + + /* Allocate object for return value. */ + pei = palloc0(sizeof(ParallelExecutorInfo)); + pei->finished = false; + pei->planstate = planstate; + + /* Fix up and serialize plan to be sent to workers. */ + pstmt_data = ExecSerializePlan(planstate->plan, estate); + + /* Create a parallel context. */ + pcxt = CreateParallelContext("postgres", "ParallelQueryMain", nworkers); + pei->pcxt = pcxt; + + /* + * Before telling the parallel context to create a dynamic shared memory + * segment, we need to figure out how big it should be. Estimate space + * for the various things we need to store. + */ + + /* Estimate space for fixed-size state. */ + shm_toc_estimate_chunk(&pcxt->estimator, + sizeof(FixedParallelExecutorState)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for query text. */ + query_len = strlen(estate->es_sourceText); + shm_toc_estimate_chunk(&pcxt->estimator, query_len + 1); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for serialized PlannedStmt. */ + pstmt_len = strlen(pstmt_data) + 1; + shm_toc_estimate_chunk(&pcxt->estimator, pstmt_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for serialized ParamListInfo. 
*/ + paramlistinfo_len = EstimateParamListSpace(estate->es_param_list_info); + shm_toc_estimate_chunk(&pcxt->estimator, paramlistinfo_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Estimate space for BufferUsage. + * + * If EXPLAIN is not in use and there are no extensions loaded that care, + * we could skip this. But we have no way of knowing whether anyone's + * looking at pgBufferUsage, so do it unconditionally. + */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Same thing for WalUsage. + */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for tuple queues. */ + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(PARALLEL_TUPLE_QUEUE_SIZE, pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Give parallel-aware nodes a chance to add to the estimates, and get a + * count of how many PlanState nodes there are. + */ + e.pcxt = pcxt; + e.nnodes = 0; + ExecParallelEstimate(planstate, &e); + + /* Estimate space for instrumentation, if required. */ + if (estate->es_instrument) + { + instrumentation_len = + offsetof(SharedExecutorInstrumentation, plan_node_id) + + sizeof(int) * e.nnodes; + instrumentation_len = MAXALIGN(instrumentation_len); + instrument_offset = instrumentation_len; + instrumentation_len += + mul_size(sizeof(Instrumentation), + mul_size(e.nnodes, nworkers)); + shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate space for JIT instrumentation, if required. */ + if (estate->es_jit_flags != PGJIT_NONE) + { + jit_instrumentation_len = + offsetof(SharedJitInstrumentation, jit_instr) + + sizeof(JitInstrumentation) * nworkers; + shm_toc_estimate_chunk(&pcxt->estimator, jit_instrumentation_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + } + + /* Estimate space for DSA area. */ + shm_toc_estimate_chunk(&pcxt->estimator, dsa_minsize); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Everyone's had a chance to ask for space, so now create the DSM. */ + InitializeParallelDSM(pcxt); + + /* + * OK, now we have a dynamic shared memory segment, and it should be big + * enough to store all of the data we estimated we would want to put into + * it, plus whatever general stuff (not specifically executor-related) the + * ParallelContext itself needs to store there. None of the space we + * asked for has been allocated or initialized yet, though, so do that. + */ + + /* Store fixed-size state. */ + fpes = shm_toc_allocate(pcxt->toc, sizeof(FixedParallelExecutorState)); + fpes->tuples_needed = tuples_needed; + fpes->param_exec = InvalidDsaPointer; + fpes->eflags = estate->es_top_eflags; + fpes->jit_flags = estate->es_jit_flags; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, fpes); + + /* Store query string */ + query_string = shm_toc_allocate(pcxt->toc, query_len + 1); + memcpy(query_string, estate->es_sourceText, query_len + 1); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, query_string); + + /* Store serialized PlannedStmt. */ + pstmt_space = shm_toc_allocate(pcxt->toc, pstmt_len); + memcpy(pstmt_space, pstmt_data, pstmt_len); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_PLANNEDSTMT, pstmt_space); + + /* Store serialized ParamListInfo. 
*/ + paramlistinfo_space = shm_toc_allocate(pcxt->toc, paramlistinfo_len); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_PARAMLISTINFO, paramlistinfo_space); + SerializeParamList(estate->es_param_list_info, ¶mlistinfo_space); + + /* Allocate space for each worker's BufferUsage; no need to initialize. */ + bufusage_space = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufusage_space); + pei->buffer_usage = bufusage_space; + + /* Same for WalUsage. */ + walusage_space = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(WalUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage_space); + pei->wal_usage = walusage_space; + + /* Set up the tuple queues that the workers will write into. */ + pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false); + + /* We don't need the TupleQueueReaders yet, though. */ + pei->reader = NULL; + + /* + * If instrumentation options were supplied, allocate space for the data. + * It only gets partially initialized here; the rest happens during + * ExecParallelInitializeDSM. + */ + if (estate->es_instrument) + { + Instrumentation *instrument; + int i; + + instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); + instrumentation->instrument_options = estate->es_instrument; + instrumentation->instrument_offset = instrument_offset; + instrumentation->num_workers = nworkers; + instrumentation->num_plan_nodes = e.nnodes; + instrument = GetInstrumentationArray(instrumentation); + for (i = 0; i < nworkers * e.nnodes; ++i) + InstrInit(&instrument[i], estate->es_instrument); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, + instrumentation); + pei->instrumentation = instrumentation; + + if (estate->es_jit_flags != PGJIT_NONE) + { + jit_instrumentation = shm_toc_allocate(pcxt->toc, + jit_instrumentation_len); + jit_instrumentation->num_workers = nworkers; + memset(jit_instrumentation->jit_instr, 0, + sizeof(JitInstrumentation) * nworkers); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_JIT_INSTRUMENTATION, + jit_instrumentation); + pei->jit_instrumentation = jit_instrumentation; + } + } + + /* + * Create a DSA area that can be used by the leader and all workers. + * (However, if we failed to create a DSM and are using private memory + * instead, then skip this.) + */ + if (pcxt->seg != NULL) + { + char *area_space; + + area_space = shm_toc_allocate(pcxt->toc, dsa_minsize); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_DSA, area_space); + pei->area = dsa_create_in_place(area_space, dsa_minsize, + LWTRANCHE_PARALLEL_QUERY_DSA, + pcxt->seg); + + /* + * Serialize parameters, if any, using DSA storage. We don't dare use + * the main parallel query DSM for this because we might relaunch + * workers after the values have changed (and thus the amount of + * storage required has changed). + */ + if (!bms_is_empty(sendParams)) + { + pei->param_exec = SerializeParamExecParams(estate, sendParams, + pei->area); + fpes->param_exec = pei->param_exec; + } + } + + /* + * Give parallel-aware nodes a chance to initialize their shared data. + * This also initializes the elements of instrumentation->ps_instrument, + * if it exists. + */ + d.pcxt = pcxt; + d.instrumentation = instrumentation; + d.nnodes = 0; + + /* Install our DSA area while initializing the plan. */ + estate->es_query_dsa = pei->area; + ExecParallelInitializeDSM(planstate, &d); + estate->es_query_dsa = NULL; + + /* + * Make sure that the world hasn't shifted under our feet. 
This could + * probably just be an Assert(), but let's be conservative for now. + */ + if (e.nnodes != d.nnodes) + elog(ERROR, "inconsistent count of PlanState nodes"); + + /* OK, we're ready to rock and roll. */ + return pei; +} + +/* + * Set up tuple queue readers to read the results of a parallel subplan. + * + * This is separate from ExecInitParallelPlan() because we can launch the + * worker processes and let them start doing something before we do this. + */ +void +ExecParallelCreateReaders(ParallelExecutorInfo *pei) +{ + int nworkers = pei->pcxt->nworkers_launched; + int i; + + Assert(pei->reader == NULL); + + if (nworkers > 0) + { + pei->reader = (TupleQueueReader **) + palloc(nworkers * sizeof(TupleQueueReader *)); + + for (i = 0; i < nworkers; i++) + { + shm_mq_set_handle(pei->tqueue[i], + pei->pcxt->worker[i].bgwhandle); + pei->reader[i] = CreateTupleQueueReader(pei->tqueue[i]); + } + } +} + +/* + * Re-initialize the parallel executor shared memory state before launching + * a fresh batch of workers. + */ +void +ExecParallelReinitialize(PlanState *planstate, + ParallelExecutorInfo *pei, + Bitmapset *sendParams) +{ + EState *estate = planstate->state; + FixedParallelExecutorState *fpes; + + /* Old workers must already be shut down */ + Assert(pei->finished); + + /* + * Force any initplan outputs that we're going to pass to workers to be + * evaluated, if they weren't already (see comments in + * ExecInitParallelPlan). + */ + ExecSetParamPlanMulti(sendParams, GetPerTupleExprContext(estate)); + + ReinitializeParallelDSM(pei->pcxt); + pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true); + pei->reader = NULL; + pei->finished = false; + + fpes = shm_toc_lookup(pei->pcxt->toc, PARALLEL_KEY_EXECUTOR_FIXED, false); + + /* Free any serialized parameters from the last round. */ + if (DsaPointerIsValid(fpes->param_exec)) + { + dsa_free(pei->area, fpes->param_exec); + fpes->param_exec = InvalidDsaPointer; + } + + /* Serialize current parameter values if required. */ + if (!bms_is_empty(sendParams)) + { + pei->param_exec = SerializeParamExecParams(estate, sendParams, + pei->area); + fpes->param_exec = pei->param_exec; + } + + /* Traverse plan tree and let each child node reset associated state. */ + estate->es_query_dsa = pei->area; + ExecParallelReInitializeDSM(planstate, pei->pcxt); + estate->es_query_dsa = NULL; +} + +/* + * Traverse plan tree to reinitialize per-node dynamic shared memory state + */ +static bool +ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt) +{ + if (planstate == NULL) + return false; + + /* + * Call reinitializers for DSM-using plan nodes. 
+ */ + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanReInitializeDSM((SeqScanState *) planstate, + pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanReInitializeDSM((IndexScanState *) planstate, + pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanReInitializeDSM((IndexOnlyScanState *) planstate, + pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanReInitializeDSM((ForeignScanState *) planstate, + pcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendReInitializeDSM((AppendState *) planstate, pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanReInitializeDSM((CustomScanState *) planstate, + pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate, + pcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinReInitializeDSM((HashJoinState *) planstate, + pcxt); + break; + case T_HashState: + case T_SortState: + case T_IncrementalSortState: + case T_MemoizeState: + /* these nodes have DSM state, but no reinitialization is required */ + break; + + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelReInitializeDSM, pcxt); +} + +/* + * Copy instrumentation information about this node and its descendants from + * dynamic shared memory. + */ +static bool +ExecParallelRetrieveInstrumentation(PlanState *planstate, + SharedExecutorInstrumentation *instrumentation) +{ + Instrumentation *instrument; + int i; + int n; + int ibytes; + int plan_node_id = planstate->plan->plan_node_id; + MemoryContext oldcontext; + + /* Find the instrumentation for this node. */ + for (i = 0; i < instrumentation->num_plan_nodes; ++i) + if (instrumentation->plan_node_id[i] == plan_node_id) + break; + if (i >= instrumentation->num_plan_nodes) + elog(ERROR, "plan node %d not found", plan_node_id); + + /* Accumulate the statistics from all workers. */ + instrument = GetInstrumentationArray(instrumentation); + instrument += i * instrumentation->num_workers; + for (n = 0; n < instrumentation->num_workers; ++n) + InstrAggNode(planstate->instrument, &instrument[n]); + + /* + * Also store the per-worker detail. + * + * Worker instrumentation should be allocated in the same context as the + * regular instrumentation information, which is the per-query context. + * Switch into per-query memory context. + */ + oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); + ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + planstate->worker_instrument = + palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); + MemoryContextSwitchTo(oldcontext); + + planstate->worker_instrument->num_workers = instrumentation->num_workers; + memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); + + /* Perform any node-type-specific work that needs to be done. 
*/ + switch (nodeTag(planstate)) + { + case T_SortState: + ExecSortRetrieveInstrumentation((SortState *) planstate); + break; + case T_IncrementalSortState: + ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate); + break; + case T_HashState: + ExecHashRetrieveInstrumentation((HashState *) planstate); + break; + case T_AggState: + ExecAggRetrieveInstrumentation((AggState *) planstate); + break; + case T_MemoizeState: + ExecMemoizeRetrieveInstrumentation((MemoizeState *) planstate); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation, + instrumentation); +} + +/* + * Add up the workers' JIT instrumentation from dynamic shared memory. + */ +static void +ExecParallelRetrieveJitInstrumentation(PlanState *planstate, + SharedJitInstrumentation *shared_jit) +{ + JitInstrumentation *combined; + int ibytes; + + int n; + + /* + * Accumulate worker JIT instrumentation into the combined JIT + * instrumentation, allocating it if required. + */ + if (!planstate->state->es_jit_worker_instr) + planstate->state->es_jit_worker_instr = + MemoryContextAllocZero(planstate->state->es_query_cxt, sizeof(JitInstrumentation)); + combined = planstate->state->es_jit_worker_instr; + + /* Accumulate all the workers' instrumentations. */ + for (n = 0; n < shared_jit->num_workers; ++n) + InstrJitAgg(combined, &shared_jit->jit_instr[n]); + + /* + * Store the per-worker detail. + * + * Similar to ExecParallelRetrieveInstrumentation(), allocate the + * instrumentation in per-query context. + */ + ibytes = offsetof(SharedJitInstrumentation, jit_instr) + + mul_size(shared_jit->num_workers, sizeof(JitInstrumentation)); + planstate->worker_jit_instrument = + MemoryContextAlloc(planstate->state->es_query_cxt, ibytes); + + memcpy(planstate->worker_jit_instrument, shared_jit, ibytes); +} + +/* + * Finish parallel execution. We wait for parallel workers to finish, and + * accumulate their buffer/WAL usage. + */ +void +ExecParallelFinish(ParallelExecutorInfo *pei) +{ + int nworkers = pei->pcxt->nworkers_launched; + int i; + + /* Make this be a no-op if called twice in a row. */ + if (pei->finished) + return; + + /* + * Detach from tuple queues ASAP, so that any still-active workers will + * notice that no further results are wanted. + */ + if (pei->tqueue != NULL) + { + for (i = 0; i < nworkers; i++) + shm_mq_detach(pei->tqueue[i]); + pfree(pei->tqueue); + pei->tqueue = NULL; + } + + /* + * While we're waiting for the workers to finish, let's get rid of the + * tuple queue readers. (Any other local cleanup could be done here too.) + */ + if (pei->reader != NULL) + { + for (i = 0; i < nworkers; i++) + DestroyTupleQueueReader(pei->reader[i]); + pfree(pei->reader); + pei->reader = NULL; + } + + /* Now wait for the workers to finish. */ + WaitForParallelWorkersToFinish(pei->pcxt); + + /* + * Next, accumulate buffer/WAL usage. (This must wait for the workers to + * finish, or we might get incomplete data.) + */ + for (i = 0; i < nworkers; i++) + InstrAccumParallelQuery(&pei->buffer_usage[i], &pei->wal_usage[i]); + + pei->finished = true; +} + +/* + * Accumulate instrumentation, and then clean up whatever ParallelExecutorInfo + * resources still exist after ExecParallelFinish. We separate these + * routines because someone might want to examine the contents of the DSM + * after ExecParallelFinish and before calling this routine. + */ +void +ExecParallelCleanup(ParallelExecutorInfo *pei) +{ + /* Accumulate instrumentation, if any. 
*/ + if (pei->instrumentation) + ExecParallelRetrieveInstrumentation(pei->planstate, + pei->instrumentation); + + /* Accumulate JIT instrumentation, if any. */ + if (pei->jit_instrumentation) + ExecParallelRetrieveJitInstrumentation(pei->planstate, + pei->jit_instrumentation); + + /* Free any serialized parameters. */ + if (DsaPointerIsValid(pei->param_exec)) + { + dsa_free(pei->area, pei->param_exec); + pei->param_exec = InvalidDsaPointer; + } + if (pei->area != NULL) + { + dsa_detach(pei->area); + pei->area = NULL; + } + if (pei->pcxt != NULL) + { + DestroyParallelContext(pei->pcxt); + pei->pcxt = NULL; + } + pfree(pei); +} + +/* + * Create a DestReceiver to write tuples we produce to the shm_mq designated + * for that purpose. + */ +static DestReceiver * +ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc) +{ + char *mqspace; + shm_mq *mq; + + mqspace = shm_toc_lookup(toc, PARALLEL_KEY_TUPLE_QUEUE, false); + mqspace += ParallelWorkerNumber * PARALLEL_TUPLE_QUEUE_SIZE; + mq = (shm_mq *) mqspace; + shm_mq_set_sender(mq, MyProc); + return CreateTupleQueueDestReceiver(shm_mq_attach(mq, seg, NULL)); +} + +/* + * Create a QueryDesc for the PlannedStmt we are to execute, and return it. + */ +static QueryDesc * +ExecParallelGetQueryDesc(shm_toc *toc, DestReceiver *receiver, + int instrument_options) +{ + char *pstmtspace; + char *paramspace; + PlannedStmt *pstmt; + ParamListInfo paramLI; + char *queryString; + + /* Get the query string from shared memory */ + queryString = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false); + + /* Reconstruct leader-supplied PlannedStmt. */ + pstmtspace = shm_toc_lookup(toc, PARALLEL_KEY_PLANNEDSTMT, false); + pstmt = (PlannedStmt *) stringToNode(pstmtspace); + + /* Reconstruct ParamListInfo. */ + paramspace = shm_toc_lookup(toc, PARALLEL_KEY_PARAMLISTINFO, false); + paramLI = RestoreParamList(&paramspace); + + /* Create a QueryDesc for the query. */ + return CreateQueryDesc(pstmt, + queryString, + GetActiveSnapshot(), InvalidSnapshot, + receiver, paramLI, NULL, instrument_options); +} + +/* + * Copy instrumentation information from this node and its descendants into + * dynamic shared memory, so that the parallel leader can retrieve it. + */ +static bool +ExecParallelReportInstrumentation(PlanState *planstate, + SharedExecutorInstrumentation *instrumentation) +{ + int i; + int plan_node_id = planstate->plan->plan_node_id; + Instrumentation *instrument; + + InstrEndLoop(planstate->instrument); + + /* + * If we shuffled the plan_node_id values in ps_instrument into sorted + * order, we could use binary search here. This might matter someday if + * we're pushing down sufficiently large plan trees. For now, do it the + * slow, dumb way. + */ + for (i = 0; i < instrumentation->num_plan_nodes; ++i) + if (instrumentation->plan_node_id[i] == plan_node_id) + break; + if (i >= instrumentation->num_plan_nodes) + elog(ERROR, "plan node %d not found", plan_node_id); + + /* + * Add our statistics to the per-node, per-worker totals. It's possible + * that this could happen more than once if we relaunched workers.
+ */ + instrument = GetInstrumentationArray(instrumentation); + instrument += i * instrumentation->num_workers; + Assert(IsParallelWorker()); + Assert(ParallelWorkerNumber < instrumentation->num_workers); + InstrAggNode(&instrument[ParallelWorkerNumber], planstate->instrument); + + return planstate_tree_walker(planstate, ExecParallelReportInstrumentation, + instrumentation); +} + +/* + * Initialize the PlanState and its descendants with the information + * retrieved from shared memory. This has to be done once the PlanState + * is allocated and initialized by executor; that is, after ExecutorStart(). + */ +static bool +ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) +{ + if (planstate == NULL) + return false; + + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanInitializeWorker((IndexScanState *) planstate, + pwcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, + pwcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanInitializeWorker((ForeignScanState *) planstate, + pwcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendInitializeWorker((AppendState *) planstate, pwcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanInitializeWorker((CustomScanState *) planstate, + pwcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, + pwcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinInitializeWorker((HashJoinState *) planstate, + pwcxt); + break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeWorker((HashState *) planstate, pwcxt); + break; + case T_SortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecSortInitializeWorker((SortState *) planstate, pwcxt); + break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate, + pwcxt); + break; + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeWorker((AggState *) planstate, pwcxt); + break; + case T_MemoizeState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecMemoizeInitializeWorker((MemoizeState *) planstate, pwcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelInitializeWorker, + pwcxt); +} + +/* + * Main entrypoint for parallel query worker processes. + * + * We reach this function from ParallelWorkerMain, so the setup necessary to + * create a sensible parallel environment has already been done; + * ParallelWorkerMain worries about stuff like the transaction state, combo + * CID mappings, and GUC values, so we don't need to deal with any of that + * here. + * + * Our job is to deal with concerns specific to the executor. The parallel + * group leader will have stored a serialized PlannedStmt, and it's our job + * to execute that plan and write the resulting tuples to the appropriate + * tuple queue. 
Various bits of supporting information that we need in order + * to do this are also stored in the dsm_segment and can be accessed through + * the shm_toc. + */ +void +ParallelQueryMain(dsm_segment *seg, shm_toc *toc) +{ + FixedParallelExecutorState *fpes; + BufferUsage *buffer_usage; + WalUsage *wal_usage; + DestReceiver *receiver; + QueryDesc *queryDesc; + SharedExecutorInstrumentation *instrumentation; + SharedJitInstrumentation *jit_instrumentation; + int instrument_options = 0; + void *area_space; + dsa_area *area; + ParallelWorkerContext pwcxt; + + /* Get fixed-size state. */ + fpes = shm_toc_lookup(toc, PARALLEL_KEY_EXECUTOR_FIXED, false); + + /* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */ + receiver = ExecParallelGetReceiver(seg, toc); + instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, true); + if (instrumentation != NULL) + instrument_options = instrumentation->instrument_options; + jit_instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_JIT_INSTRUMENTATION, + true); + queryDesc = ExecParallelGetQueryDesc(toc, receiver, instrument_options); + + /* Setting debug_query_string for individual workers */ + debug_query_string = queryDesc->sourceText; + + /* Report workers' query and queryId for monitoring purposes */ + pgstat_report_activity(STATE_RUNNING, debug_query_string); + + /* Attach to the dynamic shared memory area. */ + area_space = shm_toc_lookup(toc, PARALLEL_KEY_DSA, false); + area = dsa_attach_in_place(area_space, seg); + + /* Start up the executor */ + queryDesc->plannedstmt->jitFlags = fpes->jit_flags; + ExecutorStart(queryDesc, fpes->eflags); + + /* Special executor initialization steps for parallel workers */ + queryDesc->planstate->state->es_query_dsa = area; + if (DsaPointerIsValid(fpes->param_exec)) + { + char *paramexec_space; + + paramexec_space = dsa_get_address(area, fpes->param_exec); + RestoreParamExecParams(paramexec_space, queryDesc->estate); + + } + pwcxt.toc = toc; + pwcxt.seg = seg; + ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); + + /* Pass down any tuple bound */ + ExecSetTupleBound(fpes->tuples_needed, queryDesc->planstate); + + /* + * Prepare to track buffer/WAL usage during query execution. + * + * We do this after starting up the executor to match what happens in the + * leader, which also doesn't count buffer accesses and WAL activity that + * occur during executor startup. + */ + InstrStartParallelQuery(); + + /* + * Run the plan. If we specified a tuple bound, be careful not to demand + * more tuples than that. + */ + ExecutorRun(queryDesc, + ForwardScanDirection, + fpes->tuples_needed < 0 ? (int64) 0 : fpes->tuples_needed, + true); + + /* Shut down the executor */ + ExecutorFinish(queryDesc); + + /* Report buffer/WAL usage during parallel execution. */ + buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); + wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); + InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + &wal_usage[ParallelWorkerNumber]); + + /* Report instrumentation data if any instrumentation options are set. 
*/ + if (instrumentation != NULL) + ExecParallelReportInstrumentation(queryDesc->planstate, + instrumentation); + + /* Report JIT instrumentation data if any */ + if (queryDesc->estate->es_jit && jit_instrumentation != NULL) + { + Assert(ParallelWorkerNumber < jit_instrumentation->num_workers); + jit_instrumentation->jit_instr[ParallelWorkerNumber] = + queryDesc->estate->es_jit->instr; + } + + /* Must do this after capturing instrumentation. */ + ExecutorEnd(queryDesc); + + /* Cleanup. */ + dsa_detach(area); + FreeQueryDesc(queryDesc); + receiver->rDestroy(receiver); +} diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c new file mode 100644 index 0000000..606c920 --- /dev/null +++ b/src/backend/executor/execPartition.c @@ -0,0 +1,2107 @@ +/*------------------------------------------------------------------------- + * + * execPartition.c + * Support routines for partitioning. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execPartition.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/table.h" +#include "access/tableam.h" +#include "catalog/partition.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_type.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "foreign/fdwapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "partitioning/partbounds.h" +#include "partitioning/partdesc.h" +#include "partitioning/partprune.h" +#include "rewrite/rewriteManip.h" +#include "utils/acl.h" +#include "utils/lsyscache.h" +#include "utils/partcache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" + + +/*----------------------- + * PartitionTupleRouting - Encapsulates all information required to + * route a tuple inserted into a partitioned table to one of its leaf + * partitions. + * + * partition_root + * The partitioned table that's the target of the command. + * + * partition_dispatch_info + * Array of 'max_dispatch' elements containing a pointer to a + * PartitionDispatch object for every partitioned table touched by tuple + * routing. The entry for the target partitioned table is *always* + * present in the 0th element of this array. See comment for + * PartitionDispatchData->indexes for details on how this array is + * indexed. + * + * nonleaf_partitions + * Array of 'max_dispatch' elements containing pointers to fake + * ResultRelInfo objects for nonleaf partitions, useful for checking + * the partition constraint. + * + * num_dispatch + * The current number of items stored in the 'partition_dispatch_info' + * array. Also serves as the index of the next free array element for + * new PartitionDispatch objects that need to be stored. + * + * max_dispatch + * The current allocated size of the 'partition_dispatch_info' array. + * + * partitions + * Array of 'max_partitions' elements containing a pointer to a + * ResultRelInfo for every leaf partition touched by tuple routing. + * Some of these are pointers to ResultRelInfos which are borrowed out of + * the owning ModifyTableState node. The remainder have been built + * especially for tuple routing. See comment for + * PartitionDispatchData->indexes for details on how this array is + * indexed. 
+ * + * is_borrowed_rel + * Array of 'max_partitions' booleans recording whether a given entry + * in 'partitions' is a ResultRelInfo pointer borrowed from the owning + * ModifyTableState node, rather than being built here. + * + * num_partitions + * The current number of items stored in the 'partitions' array. Also + * serves as the index of the next free array element for new + * ResultRelInfo objects that need to be stored. + * + * max_partitions + * The current allocated size of the 'partitions' array. + * + * memcxt + * Memory context used to allocate subsidiary structs. + *----------------------- + */ +struct PartitionTupleRouting +{ + Relation partition_root; + PartitionDispatch *partition_dispatch_info; + ResultRelInfo **nonleaf_partitions; + int num_dispatch; + int max_dispatch; + ResultRelInfo **partitions; + bool *is_borrowed_rel; + int num_partitions; + int max_partitions; + MemoryContext memcxt; +}; + +/*----------------------- + * PartitionDispatch - information about one partitioned table in a partition + * hierarchy required to route a tuple to any of its partitions. A + * PartitionDispatch is always encapsulated inside a PartitionTupleRouting + * struct and stored inside its 'partition_dispatch_info' array. + * + * reldesc + * Relation descriptor of the table + * + * key + * Partition key information of the table + * + * keystate + * Execution state required for expressions in the partition key + * + * partdesc + * Partition descriptor of the table + * + * tupslot + * A standalone TupleTableSlot initialized with this table's tuple + * descriptor, or NULL if no tuple conversion between the parent is + * required. + * + * tupmap + * TupleConversionMap to convert from the parent's rowtype to this table's + * rowtype (when extracting the partition key of a tuple just before + * routing it through this table). A NULL value is stored if no tuple + * conversion is required. + * + * indexes + * Array of partdesc->nparts elements. For leaf partitions the index + * corresponds to the partition's ResultRelInfo in the encapsulating + * PartitionTupleRouting's partitions array. For partitioned partitions, + * the index corresponds to the PartitionDispatch for it in its + * partition_dispatch_info array. -1 indicates we've not yet allocated + * anything in PartitionTupleRouting for the partition. 
+ *----------------------- + */ +typedef struct PartitionDispatchData +{ + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + AttrMap *tupmap; + int indexes[FLEXIBLE_ARRAY_MEMBER]; +} PartitionDispatchData; + + +static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, + EState *estate, PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *rootResultRelInfo, + int partidx); +static void ExecInitRoutingInfo(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *partRelInfo, + int partidx, + bool is_borrowed_rel); +static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate, + PartitionTupleRouting *proute, + Oid partoid, PartitionDispatch parent_pd, + int partidx, ResultRelInfo *rootResultRelInfo); +static void FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static int get_partition_for_tuple(PartitionDispatch pd, Datum *values, + bool *isnull); +static char *ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen); +static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri); +static void ExecInitPruningContext(PartitionPruneContext *context, + List *pruning_steps, + PartitionDesc partdesc, + PartitionKey partkey, + PlanState *planstate); +static void find_matching_subplans_recurse(PartitionPruningData *prunedata, + PartitionedRelPruningData *pprune, + bool initial_prune, + Bitmapset **validsubplans); + + +/* + * ExecSetupPartitionTupleRouting - sets up information needed during + * tuple routing for partitioned tables, encapsulates it in + * PartitionTupleRouting, and returns it. + * + * Callers must use the returned PartitionTupleRouting during calls to + * ExecFindPartition(). The actual ResultRelInfo for a partition is only + * allocated when the partition is found for the first time. + * + * The current memory context is used to allocate this struct and all + * subsidiary structs that will be allocated from it later on. Typically + * it should be estate->es_query_cxt. + */ +PartitionTupleRouting * +ExecSetupPartitionTupleRouting(EState *estate, Relation rel) +{ + PartitionTupleRouting *proute; + + /* + * Here we attempt to expend as little effort as possible in setting up + * the PartitionTupleRouting. Each partition's ResultRelInfo is built on + * demand, only when we actually need to route a tuple to that partition. + * The reason for this is that a common case is for INSERT to insert a + * single tuple into a partitioned table and this must be fast. + */ + proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); + proute->partition_root = rel; + proute->memcxt = CurrentMemoryContext; + /* Rest of members initialized by zeroing */ + + /* + * Initialize this table's PartitionDispatch object. Here we pass in the + * parent as NULL as we don't need to care about any parent of the target + * partitioned table. + */ + ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel), + NULL, 0, NULL); + + return proute; +} + +/* + * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that + * the tuple contained in *slot should belong to. + * + * If the partition's ResultRelInfo does not yet exist in 'proute' then we set + * one up or reuse one from mtstate's resultRelInfo array. 
When reusing a + * ResultRelInfo from the mtstate we verify that the relation is a valid + * target for INSERTs and initialize tuple routing information. + * + * rootResultRelInfo is the relation named in the query. + * + * estate must be non-NULL; we'll need it to compute any expressions in the + * partition keys. Also, its per-tuple contexts are used as evaluation + * scratch space. + * + * If no leaf partition is found, this routine errors out with the appropriate + * error message. An error may also be raised if the found target partition + * is not a valid target for an INSERT. + */ +ResultRelInfo * +ExecFindPartition(ModifyTableState *mtstate, + ResultRelInfo *rootResultRelInfo, + PartitionTupleRouting *proute, + TupleTableSlot *slot, EState *estate) +{ + PartitionDispatch *pd = proute->partition_dispatch_info; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + Relation rel; + PartitionDispatch dispatch; + PartitionDesc partdesc; + ExprContext *ecxt = GetPerTupleExprContext(estate); + TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple; + TupleTableSlot *rootslot = slot; + TupleTableSlot *myslot = NULL; + MemoryContext oldcxt; + ResultRelInfo *rri = NULL; + + /* use per-tuple context here to avoid leaking memory */ + oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + /* + * First check the root table's partition constraint, if any. No point in + * routing the tuple if it doesn't belong in the root table itself. + */ + if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition) + ExecPartitionCheck(rootResultRelInfo, slot, estate, true); + + /* start with the root partitioned table */ + dispatch = pd[0]; + while (dispatch != NULL) + { + int partidx = -1; + bool is_leaf; + + CHECK_FOR_INTERRUPTS(); + + rel = dispatch->reldesc; + partdesc = dispatch->partdesc; + + /* + * Extract partition key from tuple. Expression evaluation machinery + * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to + * point to the correct tuple slot. The slot might have changed from + * what was used for the parent table if the table of the current + * partitioning level has different tuple descriptor from the parent. + * So update ecxt_scantuple accordingly. + */ + ecxt->ecxt_scantuple = slot; + FormPartitionKeyDatum(dispatch, slot, estate, values, isnull); + + /* + * If this partitioned table has no partitions or no partition for + * these values, error out. + */ + if (partdesc->nparts == 0 || + (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0) + { + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? + errdetail("Partition key of the failing row contains %s.", + val_desc) : 0, + errtable(rel))); + } + + is_leaf = partdesc->is_leaf[partidx]; + if (is_leaf) + { + /* + * We've reached the leaf -- hurray, we're done. Look to see if + * we've already got a ResultRelInfo for this partition. + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* ResultRelInfo already built */ + Assert(dispatch->indexes[partidx] < proute->num_partitions); + rri = proute->partitions[dispatch->indexes[partidx]]; + } + else + { + /* + * If the partition is known in the owning ModifyTableState + * node, we can re-use that ResultRelInfo instead of creating + * a new one with ExecInitPartitionInfo(). 
+ */ + rri = ExecLookupResultRelByOid(mtstate, + partdesc->oids[partidx], + true, false); + if (rri) + { + /* Verify this ResultRelInfo allows INSERTs */ + CheckValidResultRel(rri, CMD_INSERT); + + /* + * Initialize information needed to insert this and + * subsequent tuples routed to this partition. + */ + ExecInitRoutingInfo(mtstate, estate, proute, dispatch, + rri, partidx, true); + } + else + { + /* We need to create a new one. */ + rri = ExecInitPartitionInfo(mtstate, estate, proute, + dispatch, + rootResultRelInfo, partidx); + } + } + Assert(rri != NULL); + + /* Signal to terminate the loop */ + dispatch = NULL; + } + else + { + /* + * Partition is a sub-partitioned table; get the PartitionDispatch + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* Already built. */ + Assert(dispatch->indexes[partidx] < proute->num_dispatch); + + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; + + /* + * Move down to the next partition level and search again + * until we find a leaf partition that matches this tuple + */ + dispatch = pd[dispatch->indexes[partidx]]; + } + else + { + /* Not yet built. Do that now. */ + PartitionDispatch subdispatch; + + /* + * Create the new PartitionDispatch. We pass the current one + * in as the parent PartitionDispatch + */ + subdispatch = ExecInitPartitionDispatchInfo(estate, + proute, + partdesc->oids[partidx], + dispatch, partidx, + mtstate->rootResultRelInfo); + Assert(dispatch->indexes[partidx] >= 0 && + dispatch->indexes[partidx] < proute->num_dispatch); + + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; + dispatch = subdispatch; + } + + /* + * Convert the tuple to the new parent's layout, if different from + * the previous parent. + */ + if (dispatch->tupslot) + { + AttrMap *map = dispatch->tupmap; + TupleTableSlot *tempslot = myslot; + + myslot = dispatch->tupslot; + slot = execute_attr_map_slot(map, slot, myslot); + + if (tempslot != NULL) + ExecClearTuple(tempslot); + } + } + + /* + * If this partition is the default one, we must check its partition + * constraint now, which may have changed concurrently due to + * partitions being added to the parent. + * + * (We do this here, and do not rely on ExecInsert doing it, because + * we don't want to miss doing it for non-leaf partitions.) + */ + if (partidx == partdesc->boundinfo->default_index) + { + /* + * The tuple must match the partition's layout for the constraint + * expression to be evaluated successfully. If the partition is + * sub-partitioned, that would already be the case due to the code + * above, but for a leaf partition the tuple still matches the + * parent's layout. + * + * Note that we have a map to convert from root to current + * partition, but not from immediate parent to current partition. + * So if we have to convert, do it from the root slot; if not, use + * the root slot as-is. + */ + if (is_leaf) + { + TupleConversionMap *map = rri->ri_RootToPartitionMap; + + if (map) + slot = execute_attr_map_slot(map->attrMap, rootslot, + rri->ri_PartitionTupleSlot); + else + slot = rootslot; + } + + ExecPartitionCheck(rri, slot, estate, true); + } + } + + /* Release the tuple in the lowest parent's dedicated slot. */ + if (myslot != NULL) + ExecClearTuple(myslot); + /* and restore ecxt's scantuple */ + ecxt->ecxt_scantuple = ecxt_scantuple_saved; + MemoryContextSwitchTo(oldcxt); + + return rri; +} + +/* + * ExecInitPartitionInfo + * Lock the partition and initialize ResultRelInfo. 
Also setup other + * information for the partition and store it in the next empty slot in + * the proute->partitions array. + * + * Returns the ResultRelInfo + */ +static ResultRelInfo * +ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, + PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *rootResultRelInfo, + int partidx) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Oid partOid = dispatch->partdesc->oids[partidx]; + Relation partrel; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + ResultRelInfo *leaf_part_rri; + MemoryContext oldcxt; + AttrMap *part_attmap = NULL; + bool found_whole_row; + + oldcxt = MemoryContextSwitchTo(proute->memcxt); + + partrel = table_open(partOid, RowExclusiveLock); + + leaf_part_rri = makeNode(ResultRelInfo); + InitResultRelInfo(leaf_part_rri, + partrel, + 0, + rootResultRelInfo, + estate->es_instrument); + + /* + * Verify result relation is a valid target for an INSERT. An UPDATE of a + * partition-key becomes a DELETE+INSERT operation, so this check is still + * required when the operation is CMD_UPDATE. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Open partition indices. The user may have asked to check for conflicts + * within this leaf partition and do "nothing" instead of throwing an + * error. Be prepared in that case by initializing the index information + * needed by ExecInsert() to perform speculative insertions. + */ + if (partrel->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, + (node != NULL && + node->onConflictAction != ONCONFLICT_NONE)); + + /* + * Build WITH CHECK OPTION constraints for the partition. Note that we + * didn't build the withCheckOptionList for partitions within the planner, + * but simple translation of varattnos will suffice. This only occurs for + * the INSERT case or in the case of UPDATE tuple routing where we didn't + * find a result rel to reuse. + */ + if (node && node->withCheckOptionLists != NIL) + { + List *wcoList; + List *wcoExprs = NIL; + ListCell *ll; + + /* + * In the case of INSERT on a partitioned table, there is only one + * plan. Likewise, there is only one WCO list, not one per partition. + * For UPDATE, there are as many WCO lists as there are plans. + */ + Assert((node->operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + list_length(node->resultRelations) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->withCheckOptionLists) == + list_length(node->resultRelations))); + + /* + * Use the WCO list of the first plan as a reference to calculate + * attno's for the WCO list of this partition. In the INSERT case, + * that refers to the root partitioned table, whereas in the UPDATE + * tuple routing case, that refers to the first partition in the + * mtstate->resultRelInfo array. In any case, both that relation and + * this partition should have the same columns, so we should be able + * to map attributes successfully. + */ + wcoList = linitial(node->withCheckOptionLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel)); + wcoList = (List *) + map_variable_attnos((Node *) wcoList, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. 
*/ + + foreach(ll, wcoList) + { + WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + &mtstate->ps); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + leaf_part_rri->ri_WithCheckOptions = wcoList; + leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs; + } + + /* + * Build the RETURNING projection for the partition. Note that we didn't + * build the returningList for partitions within the planner, but simple + * translation of varattnos will suffice. This only occurs for the INSERT + * case or in the case of UPDATE tuple routing where we didn't find a + * result rel to reuse. + */ + if (node && node->returningLists != NIL) + { + TupleTableSlot *slot; + ExprContext *econtext; + List *returningList; + + /* See the comment above for WCO lists. */ + Assert((node->operation == CMD_INSERT && + list_length(node->returningLists) == 1 && + list_length(node->resultRelations) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->returningLists) == + list_length(node->resultRelations))); + + /* + * Use the RETURNING list of the first plan as a reference to + * calculate attno's for the RETURNING list of this partition. See + * the comment above for WCO lists for more details on why this is + * okay. + */ + returningList = linitial(node->returningLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel)); + returningList = (List *) + map_variable_attnos((Node *) returningList, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + + leaf_part_rri->ri_returningList = returningList; + + /* + * Initialize the projection itself. + * + * Use the slot and the expression context that would have been set up + * in ExecInitModifyTable() for projection's output. + */ + Assert(mtstate->ps.ps_ResultTupleSlot != NULL); + slot = mtstate->ps.ps_ResultTupleSlot; + Assert(mtstate->ps.ps_ExprContext != NULL); + econtext = mtstate->ps.ps_ExprContext; + leaf_part_rri->ri_projectReturning = + ExecBuildProjectionInfo(returningList, econtext, slot, + &mtstate->ps, RelationGetDescr(partrel)); + } + + /* Set up information needed for routing tuples to the partition. */ + ExecInitRoutingInfo(mtstate, estate, proute, dispatch, + leaf_part_rri, partidx, false); + + /* + * If there is an ON CONFLICT clause, initialize state for it. + */ + if (node && node->onConflictAction != ONCONFLICT_NONE) + { + TupleDesc partrelDesc = RelationGetDescr(partrel); + ExprContext *econtext = mtstate->ps.ps_ExprContext; + ListCell *lc; + List *arbiterIndexes = NIL; + + /* + * If there is a list of arbiter indexes, map it to a list of indexes + * in the partition. We do that by scanning the partition's index + * list and searching for ancestry relationships to each index in the + * ancestor table. 
+ */ + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0) + { + List *childIdxs; + + childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc); + + foreach(lc, childIdxs) + { + Oid childIdx = lfirst_oid(lc); + List *ancestors; + ListCell *lc2; + + ancestors = get_partition_ancestors(childIdx); + foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes) + { + if (list_member_oid(ancestors, lfirst_oid(lc2))) + arbiterIndexes = lappend_oid(arbiterIndexes, childIdx); + } + list_free(ancestors); + } + } + + /* + * If the resulting lists are of inequal length, something is wrong. + * (This shouldn't happen, since arbiter index selection should not + * pick up an invalid index.) + */ + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != + list_length(arbiterIndexes)) + elog(ERROR, "invalid arbiter index list"); + leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; + + /* + * In the DO UPDATE case, we have some more state to initialize. + */ + if (node->onConflictAction == ONCONFLICT_UPDATE) + { + OnConflictSetState *onconfl = makeNode(OnConflictSetState); + TupleConversionMap *map; + + map = leaf_part_rri->ri_RootToPartitionMap; + + Assert(node->onConflictSet != NIL); + Assert(rootResultRelInfo->ri_onConflict != NULL); + + leaf_part_rri->ri_onConflict = onconfl; + + /* + * Need a separate existing slot for each partition, as the + * partition could be of a different AM, even if the tuple + * descriptors match. + */ + onconfl->oc_Existing = + table_slot_create(leaf_part_rri->ri_RelationDesc, + &mtstate->ps.state->es_tupleTable); + + /* + * If the partition's tuple descriptor matches exactly the root + * parent (the common case), we can re-use most of the parent's ON + * CONFLICT SET state, skipping a bunch of work. Otherwise, we + * need to create state specific to this partition. + */ + if (map == NULL) + { + /* + * It's safe to reuse these from the partition root, as we + * only process one tuple at a time (therefore we won't + * overwrite needed data in slots), and the results of + * projections are independent of the underlying storage. + * Projections and where clauses themselves don't store state + * / are independent of the underlying storage. + */ + onconfl->oc_ProjSlot = + rootResultRelInfo->ri_onConflict->oc_ProjSlot; + onconfl->oc_ProjInfo = + rootResultRelInfo->ri_onConflict->oc_ProjInfo; + onconfl->oc_WhereClause = + rootResultRelInfo->ri_onConflict->oc_WhereClause; + } + else + { + List *onconflset; + List *onconflcols; + bool found_whole_row; + + /* + * Translate expressions in onConflictSet to account for + * different attribute numbers. For that, map partition + * varattnos twice: first to catch the EXCLUDED + * pseudo-relation (INNER_VAR), and second to handle the main + * target relation (firstVarno). + */ + onconflset = copyObject(node->onConflictSet); + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel)); + onconflset = (List *) + map_variable_attnos((Node *) onconflset, + INNER_VAR, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + onconflset = (List *) + map_variable_attnos((Node *) onconflset, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + + /* Finally, adjust the target colnos to match the partition. 
*/ + onconflcols = adjust_partition_colnos(node->onConflictCols, + leaf_part_rri); + + /* create the tuple slot for the UPDATE SET projection */ + onconfl->oc_ProjSlot = + table_slot_create(partrel, + &mtstate->ps.state->es_tupleTable); + + /* build UPDATE SET projection state */ + onconfl->oc_ProjInfo = + ExecBuildUpdateProjection(onconflset, + true, + onconflcols, + partrelDesc, + econtext, + onconfl->oc_ProjSlot, + &mtstate->ps); + + /* + * If there is a WHERE clause, initialize state where it will + * be evaluated, mapping the attribute numbers appropriately. + * As with onConflictSet, we need to map partition varattnos + * to the partition's tupdesc. + */ + if (node->onConflictWhere) + { + List *clause; + + clause = copyObject((List *) node->onConflictWhere); + clause = (List *) + map_variable_attnos((Node *) clause, + INNER_VAR, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + clause = (List *) + map_variable_attnos((Node *) clause, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + onconfl->oc_WhereClause = + ExecInitQual((List *) clause, &mtstate->ps); + } + } + } + } + + /* + * Since we've just initialized this ResultRelInfo, it's not in any list + * attached to the estate as yet. Add it, so that it can be found later. + * + * Note that the entries in this list appear in no predetermined order, + * because partition result rels are initialized as and when they're + * needed. + */ + MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_tuple_routing_result_relations = + lappend(estate->es_tuple_routing_result_relations, + leaf_part_rri); + + MemoryContextSwitchTo(oldcxt); + + return leaf_part_rri; +} + +/* + * ExecInitRoutingInfo + * Set up information needed for translating tuples between root + * partitioned table format and partition format, and keep track of it + * in PartitionTupleRouting. + */ +static void +ExecInitRoutingInfo(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *partRelInfo, + int partidx, + bool is_borrowed_rel) +{ + ResultRelInfo *rootRelInfo = partRelInfo->ri_RootResultRelInfo; + MemoryContext oldcxt; + int rri_index; + + oldcxt = MemoryContextSwitchTo(proute->memcxt); + + /* + * Set up a tuple conversion map to convert a tuple routed to the + * partition from the parent's type to the partition's. + */ + partRelInfo->ri_RootToPartitionMap = + convert_tuples_by_name(RelationGetDescr(rootRelInfo->ri_RelationDesc), + RelationGetDescr(partRelInfo->ri_RelationDesc)); + + /* + * If a partition has a different rowtype than the root parent, initialize + * a slot dedicated to storing this partition's tuples. The slot is used + * for various operations that are applied to tuples after routing, such + * as checking constraints. + */ + if (partRelInfo->ri_RootToPartitionMap != NULL) + { + Relation partrel = partRelInfo->ri_RelationDesc; + + /* + * Initialize the slot itself setting its descriptor to this + * partition's TupleDesc; TupleDesc reference will be released at the + * end of the command. + */ + partRelInfo->ri_PartitionTupleSlot = + table_slot_create(partrel, &estate->es_tupleTable); + } + else + partRelInfo->ri_PartitionTupleSlot = NULL; + + /* + * If the partition is a foreign table, let the FDW init itself for + * routing tuples to the partition. 
+ */ + if (partRelInfo->ri_FdwRoutine != NULL && + partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) + partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo); + + /* + * Determine if the FDW supports batch insert and determine the batch size + * (a FDW may support batching, but it may be disabled for the + * server/table or for this particular query). + * + * If the FDW does not support batching, we set the batch size to 1. + */ + if (mtstate->operation == CMD_INSERT && + partRelInfo->ri_FdwRoutine != NULL && + partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize && + partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert) + partRelInfo->ri_BatchSize = + partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo); + else + partRelInfo->ri_BatchSize = 1; + + Assert(partRelInfo->ri_BatchSize >= 1); + + partRelInfo->ri_CopyMultiInsertBuffer = NULL; + + /* + * Keep track of it in the PartitionTupleRouting->partitions array. + */ + Assert(dispatch->indexes[partidx] == -1); + + rri_index = proute->num_partitions++; + + /* Allocate or enlarge the array, as needed */ + if (proute->num_partitions >= proute->max_partitions) + { + if (proute->max_partitions == 0) + { + proute->max_partitions = 8; + proute->partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * proute->max_partitions); + proute->is_borrowed_rel = (bool *) + palloc(sizeof(bool) * proute->max_partitions); + } + else + { + proute->max_partitions *= 2; + proute->partitions = (ResultRelInfo **) + repalloc(proute->partitions, sizeof(ResultRelInfo *) * + proute->max_partitions); + proute->is_borrowed_rel = (bool *) + repalloc(proute->is_borrowed_rel, sizeof(bool) * + proute->max_partitions); + } + } + + proute->partitions[rri_index] = partRelInfo; + proute->is_borrowed_rel[rri_index] = is_borrowed_rel; + dispatch->indexes[partidx] = rri_index; + + MemoryContextSwitchTo(oldcxt); +} + +/* + * ExecInitPartitionDispatchInfo + * Lock the partitioned table (if not locked already) and initialize + * PartitionDispatch for a partitioned table and store it in the next + * available slot in the proute->partition_dispatch_info array. Also, + * record the index into this array in the parent_pd->indexes[] array in + * the partidx element so that we can properly retrieve the newly created + * PartitionDispatch later. + */ +static PartitionDispatch +ExecInitPartitionDispatchInfo(EState *estate, + PartitionTupleRouting *proute, Oid partoid, + PartitionDispatch parent_pd, int partidx, + ResultRelInfo *rootResultRelInfo) +{ + Relation rel; + PartitionDesc partdesc; + PartitionDispatch pd; + int dispatchidx; + MemoryContext oldcxt; + + /* + * For data modification, it is better that executor does not include + * partitions being detached, except when running in snapshot-isolation + * mode. This means that a read-committed transaction immediately gets a + * "no partition for tuple" error when a tuple is inserted into a + * partition that's being detached concurrently, but a transaction in + * repeatable-read mode can still use such a partition. + */ + if (estate->es_partition_directory == NULL) + estate->es_partition_directory = + CreatePartitionDirectory(estate->es_query_cxt, + !IsolationUsesXactSnapshot()); + + oldcxt = MemoryContextSwitchTo(proute->memcxt); + + /* + * Only sub-partitioned tables need to be locked here. The root + * partitioned table will already have been locked as it's referenced in + * the query's rtable. 
+ */ + if (partoid != RelationGetRelid(proute->partition_root)) + rel = table_open(partoid, RowExclusiveLock); + else + rel = proute->partition_root; + partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel); + + pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) + + partdesc->nparts * sizeof(int)); + pd->reldesc = rel; + pd->key = RelationGetPartitionKey(rel); + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent_pd != NULL) + { + TupleDesc tupdesc = RelationGetDescr(rel); + + /* + * For sub-partitioned tables where the column order differs from its + * direct parent partitioned table, we must store a tuple table slot + * initialized with its tuple descriptor and a tuple conversion map to + * convert a tuple from its parent's rowtype to its own. This is to + * make sure that we are looking at the correct row using the correct + * tuple descriptor when computing its partition key for tuple + * routing. + */ + pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc), + tupdesc); + pd->tupslot = pd->tupmap ? + MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL; + } + else + { + /* Not required for the root partitioned table */ + pd->tupmap = NULL; + pd->tupslot = NULL; + } + + /* + * Initialize with -1 to signify that the corresponding partition's + * ResultRelInfo or PartitionDispatch has not been created yet. + */ + memset(pd->indexes, -1, sizeof(int) * partdesc->nparts); + + /* Track in PartitionTupleRouting for later use */ + dispatchidx = proute->num_dispatch++; + + /* Allocate or enlarge the array, as needed */ + if (proute->num_dispatch >= proute->max_dispatch) + { + if (proute->max_dispatch == 0) + { + proute->max_dispatch = 4; + proute->partition_dispatch_info = (PartitionDispatch *) + palloc(sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * proute->max_dispatch); + } + else + { + proute->max_dispatch *= 2; + proute->partition_dispatch_info = (PartitionDispatch *) + repalloc(proute->partition_dispatch_info, + sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + repalloc(proute->nonleaf_partitions, + sizeof(ResultRelInfo *) * proute->max_dispatch); + } + } + proute->partition_dispatch_info[dispatchidx] = pd; + + /* + * If setting up a PartitionDispatch for a sub-partitioned table, we may + * also need a minimally valid ResultRelInfo for checking the partition + * constraint later; set that up now. + */ + if (parent_pd) + { + ResultRelInfo *rri = makeNode(ResultRelInfo); + + InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0); + proute->nonleaf_partitions[dispatchidx] = rri; + } + else + proute->nonleaf_partitions[dispatchidx] = NULL; + + /* + * Finally, if setting up a PartitionDispatch for a sub-partitioned table, + * install a downlink in the parent to allow quick descent. + */ + if (parent_pd) + { + Assert(parent_pd->indexes[partidx] == -1); + parent_pd->indexes[partidx] = dispatchidx; + } + + MemoryContextSwitchTo(oldcxt); + + return pd; +} + +/* + * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple + * routing. + * + * Close all the partitioned tables, leaf partitions, and their indices. 
+ */ +void +ExecCleanupTupleRouting(ModifyTableState *mtstate, + PartitionTupleRouting *proute) +{ + int i; + + /* + * Remember, proute->partition_dispatch_info[0] corresponds to the root + * partitioned table, which we must not try to close, because it is the + * main target table of the query that will be closed by callers such as + * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root + * partitioned table. + */ + for (i = 1; i < proute->num_dispatch; i++) + { + PartitionDispatch pd = proute->partition_dispatch_info[i]; + + table_close(pd->reldesc, NoLock); + + if (pd->tupslot) + ExecDropSingleTupleTableSlot(pd->tupslot); + } + + for (i = 0; i < proute->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = proute->partitions[i]; + + /* Allow any FDWs to shut down */ + if (resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state, + resultRelInfo); + + /* + * Close it if it's not one of the result relations borrowed from the + * owning ModifyTableState; those will be closed by ExecEndPlan(). + */ + if (proute->is_borrowed_rel[i]) + continue; + + ExecCloseIndices(resultRelInfo); + table_close(resultRelInfo->ri_RelationDesc, NoLock); + } +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. + * + * pd Partition dispatch object of the partitioned table + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * the ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in. + * ---------------- + */ +static void +FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pd->key->partexprs != NIL && pd->keystate == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + } + + partexpr_item = list_head(pd->keystate); + for (i = 0; i < pd->key->partnatts; i++) + { + AttrNumber keycol = pd->key->partattrs[i]; + Datum datum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + datum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull); + partexpr_item = lnext(pd->keystate, partexpr_item); + } + values[i] = datum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * get_partition_for_tuple + * Finds partition of relation which accepts the partition key specified + * in values and isnull + * + * Return value is index of the partition (>= 0 and < partdesc->nparts) if one + * found or -1 if none found. 
+ */ +static int +get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) +{ + int bound_offset; + int part_index = -1; + PartitionKey key = pd->key; + PartitionDesc partdesc = pd->partdesc; + PartitionBoundInfo boundinfo = partdesc->boundinfo; + + /* Route as appropriate based on partitioning strategy. */ + switch (key->strategy) + { + case PARTITION_STRATEGY_HASH: + { + uint64 rowHash; + + rowHash = compute_partition_hash_value(key->partnatts, + key->partsupfunc, + key->partcollation, + values, isnull); + + part_index = boundinfo->indexes[rowHash % boundinfo->nindexes]; + } + break; + + case PARTITION_STRATEGY_LIST: + if (isnull[0]) + { + if (partition_bound_accepts_nulls(boundinfo)) + part_index = boundinfo->null_index; + } + else + { + bool equal = false; + + bound_offset = partition_list_bsearch(key->partsupfunc, + key->partcollation, + boundinfo, + values[0], &equal); + if (bound_offset >= 0 && equal) + part_index = boundinfo->indexes[bound_offset]; + } + break; + + case PARTITION_STRATEGY_RANGE: + { + bool equal = false, + range_partkey_has_null = false; + int i; + + /* + * No range includes NULL, so this will be accepted by the + * default partition if there is one, and otherwise rejected. + */ + for (i = 0; i < key->partnatts; i++) + { + if (isnull[i]) + { + range_partkey_has_null = true; + break; + } + } + + if (!range_partkey_has_null) + { + bound_offset = partition_range_datum_bsearch(key->partsupfunc, + key->partcollation, + boundinfo, + key->partnatts, + values, + &equal); + + /* + * The bound at bound_offset is less than or equal to the + * tuple value, so the bound at offset+1 is the upper + * bound of the partition we're looking for, if there + * actually exists one. + */ + part_index = boundinfo->indexes[bound_offset + 1]; + } + } + break; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); + } + + /* + * part_index < 0 means we failed to find a partition of this parent. Use + * the default partition, if there is one. + */ + if (part_index < 0) + part_index = boundinfo->default_index; + + return part_index; +} + +/* + * ExecBuildSlotPartitionKeyDescription + * + * This works very much like BuildIndexValueDescription() and is currently + * used for building error messages when ExecFindPartition() fails to find + * partition for a row. + */ +static char * +ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen) +{ + StringInfoData buf; + PartitionKey key = RelationGetPartitionKey(rel); + int partnatts = get_partition_natts(key); + int i; + Oid relid = RelationGetRelid(rel); + AclResult aclresult; + + if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + /* If the user has table-level access, just go build the description. */ + aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* + * Step through the columns of the partition key and make sure the + * user has SELECT rights on all of them. + */ + for (i = 0; i < partnatts; i++) + { + AttrNumber attnum = get_partition_col_attnum(key, i); + + /* + * If this partition key column is an expression, we return no + * detail rather than try to figure out what column(s) the + * expression includes and if the user has SELECT rights on them. 
+ */ + if (attnum == InvalidAttrNumber || + pg_attribute_aclcheck(relid, attnum, GetUserId(), + ACL_SELECT) != ACLCHECK_OK) + return NULL; + } + } + + initStringInfo(&buf); + appendStringInfo(&buf, "(%s) = (", + pg_get_partkeydef_columns(relid, true)); + + for (i = 0; i < partnatts; i++) + { + char *val; + int vallen; + + if (isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(get_partition_col_typid(key, i), + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, values[i]); + } + + if (i > 0) + appendStringInfoString(&buf, ", "); + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendBinaryStringInfo(&buf, val, vallen); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + + appendStringInfoChar(&buf, ')'); + + return buf.data; +} + +/* + * adjust_partition_colnos + * Adjust the list of UPDATE target column numbers to account for + * attribute differences between the parent and the partition. + */ +static List * +adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri) +{ + List *new_colnos = NIL; + TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri); + AttrMap *attrMap; + ListCell *lc; + + Assert(map != NULL); /* else we shouldn't be here */ + attrMap = map->attrMap; + + foreach(lc, colnos) + { + AttrNumber parentattrno = lfirst_int(lc); + + if (parentattrno <= 0 || + parentattrno > attrMap->maplen || + attrMap->attnums[parentattrno - 1] == 0) + elog(ERROR, "unexpected attno %d in target column list", + parentattrno); + new_colnos = lappend_int(new_colnos, + attrMap->attnums[parentattrno - 1]); + } + + return new_colnos; +} + +/*------------------------------------------------------------------------- + * Run-Time Partition Pruning Support. + * + * The following series of functions exist to support the removal of unneeded + * subplans for queries against partitioned tables. The supporting functions + * here are designed to work with any plan type which supports an arbitrary + * number of subplans, e.g. Append, MergeAppend. + * + * When pruning involves comparison of a partition key to a constant, it's + * done by the planner. However, if we have a comparison to a non-constant + * but not volatile expression, that presents an opportunity for run-time + * pruning by the executor, allowing irrelevant partitions to be skipped + * dynamically. + * + * We must distinguish expressions containing PARAM_EXEC Params from + * expressions that don't contain those. Even though a PARAM_EXEC Param is + * considered to be a stable expression, it can change value from one plan + * node scan to the next during query execution. Stable comparison + * expressions that don't involve such Params allow partition pruning to be + * done once during executor startup. Expressions that do involve such Params + * require us to prune separately for each scan of the parent plan node. + * + * Note that pruning away unneeded subplans during executor startup has the + * added benefit of not having to initialize the unneeded subplans at all. + * + * + * Functions: + * + * ExecCreatePartitionPruneState: + * Creates the PartitionPruneState required by each of the two pruning + * functions. Details stored include how to map the partition index + * returned by the partition pruning code into subplan indexes. + * + * ExecFindInitialMatchingSubPlans: + * Returns indexes of matching subplans. 
Partition pruning is attempted + * without any evaluation of expressions containing PARAM_EXEC Params. + * This function must be called during executor startup for the parent + * plan before the subplans themselves are initialized. Subplans which + * are found not to match by this function must be removed from the + * plan's list of subplans during execution, as this function performs a + * remap of the partition index to subplan index map and the newly + * created map provides indexes only for subplans which remain after + * calling this function. + * + * ExecFindMatchingSubPlans: + * Returns indexes of matching subplans after evaluating all available + * expressions. This function can only be called during execution and + * must be called again each time the value of a Param listed in + * PartitionPruneState's 'execparamids' changes. + *------------------------------------------------------------------------- + */ + +/* + * ExecCreatePartitionPruneState + * Build the data structure required for calling + * ExecFindInitialMatchingSubPlans and ExecFindMatchingSubPlans. + * + * 'planstate' is the parent plan node's execution state. + * + * 'partitionpruneinfo' is a PartitionPruneInfo as generated by + * make_partition_pruneinfo. Here we build a PartitionPruneState containing a + * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of + * partitionpruneinfo->prune_infos), each of which contains a + * PartitionedRelPruningData for each PartitionedRelPruneInfo appearing in + * that sublist. This two-level system is needed to keep from confusing the + * different hierarchies when a UNION ALL contains multiple partitioned tables + * as children. The data stored in each PartitionedRelPruningData can be + * re-used each time we re-evaluate which partitions match the pruning steps + * provided in each PartitionedRelPruneInfo. + */ +PartitionPruneState * +ExecCreatePartitionPruneState(PlanState *planstate, + PartitionPruneInfo *partitionpruneinfo) +{ + EState *estate = planstate->state; + PartitionPruneState *prunestate; + int n_part_hierarchies; + ListCell *lc; + int i; + + /* For data reading, executor always omits detached partitions */ + if (estate->es_partition_directory == NULL) + estate->es_partition_directory = + CreatePartitionDirectory(estate->es_query_cxt, false); + + n_part_hierarchies = list_length(partitionpruneinfo->prune_infos); + Assert(n_part_hierarchies > 0); + + /* + * Allocate the data structure + */ + prunestate = (PartitionPruneState *) + palloc(offsetof(PartitionPruneState, partprunedata) + + sizeof(PartitionPruningData *) * n_part_hierarchies); + + prunestate->execparamids = NULL; + /* other_subplans can change at runtime, so we need our own copy */ + prunestate->other_subplans = bms_copy(partitionpruneinfo->other_subplans); + prunestate->do_initial_prune = false; /* may be set below */ + prunestate->do_exec_prune = false; /* may be set below */ + prunestate->num_partprunedata = n_part_hierarchies; + + /* + * Create a short-term memory context which we'll use when making calls to + * the partition pruning functions. This avoids possible memory leaks, + * since the pruning functions call comparison functions that aren't under + * our control. 
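+ *
+ * (Both ExecFindInitialMatchingSubPlans() and ExecFindMatchingSubPlans()
+ * below switch into this context around the pruning calls, bms_copy()
+ * their result back out, and then reset the context.)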
+ */ + prunestate->prune_context = + AllocSetContextCreate(CurrentMemoryContext, + "Partition Prune", + ALLOCSET_DEFAULT_SIZES); + + i = 0; + foreach(lc, partitionpruneinfo->prune_infos) + { + List *partrelpruneinfos = lfirst_node(List, lc); + int npartrelpruneinfos = list_length(partrelpruneinfos); + PartitionPruningData *prunedata; + ListCell *lc2; + int j; + + prunedata = (PartitionPruningData *) + palloc(offsetof(PartitionPruningData, partrelprunedata) + + npartrelpruneinfos * sizeof(PartitionedRelPruningData)); + prunestate->partprunedata[i] = prunedata; + prunedata->num_partrelprunedata = npartrelpruneinfos; + + j = 0; + foreach(lc2, partrelpruneinfos) + { + PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2); + PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j]; + Relation partrel; + PartitionDesc partdesc; + PartitionKey partkey; + + /* + * We can rely on the copies of the partitioned table's partition + * key and partition descriptor appearing in its relcache entry, + * because that entry will be held open and locked for the + * duration of this executor run. + */ + partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex); + partkey = RelationGetPartitionKey(partrel); + partdesc = PartitionDirectoryLookup(estate->es_partition_directory, + partrel); + + /* + * Initialize the subplan_map and subpart_map. + * + * Because we request detached partitions to be included, and + * detaching waits for old transactions, it is safe to assume that + * no partitions have disappeared since this query was planned. + * + * However, new partitions may have been added. + */ + Assert(partdesc->nparts >= pinfo->nparts); + pprune->nparts = partdesc->nparts; + pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts); + if (partdesc->nparts == pinfo->nparts) + { + /* + * There are no new partitions, so this is simple. We can + * simply point to the subpart_map from the plan, but we must + * copy the subplan_map since we may change it later. + */ + pprune->subpart_map = pinfo->subpart_map; + memcpy(pprune->subplan_map, pinfo->subplan_map, + sizeof(int) * pinfo->nparts); + + /* + * Double-check that the list of unpruned relations has not + * changed. (Pruned partitions are not in relid_map[].) + */ +#ifdef USE_ASSERT_CHECKING + for (int k = 0; k < pinfo->nparts; k++) + { + Assert(partdesc->oids[k] == pinfo->relid_map[k] || + pinfo->subplan_map[k] == -1); + } +#endif + } + else + { + int pd_idx = 0; + int pp_idx; + + /* + * Some new partitions have appeared since plan time, and + * those are reflected in our PartitionDesc but were not + * present in the one used to construct subplan_map and + * subpart_map. So we must construct new and longer arrays + * where the partitions that were originally present map to + * the same sub-structures, and any added partitions map to + * -1, as if the new partitions had been pruned. + * + * Note: pinfo->relid_map[] may contain InvalidOid entries for + * partitions pruned by the planner. We cannot tell exactly + * which of the partdesc entries these correspond to, but we + * don't have to; just skip over them. The non-pruned + * relid_map entries, however, had better be a subset of the + * partdesc entries and in the same order. 
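+ *
+ * A hypothetical example (OIDs invented purely for illustration): say
+ * partdesc->oids is now {1001, 1002, 1003, 1004}, but at plan time only
+ * 1001 and 1003 existed and the planner pruned 1003, so that
+ * pinfo->relid_map is {1001, InvalidOid}.  The loop below then keeps the
+ * original subplan_map and subpart_map entries for 1001, skips over the
+ * InvalidOid entry without knowing which partdesc entry it stood for,
+ * and maps 1002, 1003 and 1004 all to -1, exactly as if they had been
+ * pruned.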
+ */ + pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts); + for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++) + { + /* Skip any InvalidOid relid_map entries */ + while (pd_idx < pinfo->nparts && + !OidIsValid(pinfo->relid_map[pd_idx])) + pd_idx++; + + if (pd_idx < pinfo->nparts && + pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx]) + { + /* match... */ + pprune->subplan_map[pp_idx] = + pinfo->subplan_map[pd_idx]; + pprune->subpart_map[pp_idx] = + pinfo->subpart_map[pd_idx]; + pd_idx++; + } + else + { + /* this partdesc entry is not in the plan */ + pprune->subplan_map[pp_idx] = -1; + pprune->subpart_map[pp_idx] = -1; + } + } + + /* + * It might seem that we need to skip any trailing InvalidOid + * entries in pinfo->relid_map before checking that we scanned + * all of the relid_map. But we will have skipped them above, + * because they must correspond to some partdesc->oids + * entries; we just couldn't tell which. + */ + if (pd_idx != pinfo->nparts) + elog(ERROR, "could not match partition child tables to plan elements"); + } + + /* present_parts is also subject to later modification */ + pprune->present_parts = bms_copy(pinfo->present_parts); + + /* + * Initialize pruning contexts as needed. + */ + pprune->initial_pruning_steps = pinfo->initial_pruning_steps; + if (pinfo->initial_pruning_steps) + { + ExecInitPruningContext(&pprune->initial_context, + pinfo->initial_pruning_steps, + partdesc, partkey, planstate); + /* Record whether initial pruning is needed at any level */ + prunestate->do_initial_prune = true; + } + pprune->exec_pruning_steps = pinfo->exec_pruning_steps; + if (pinfo->exec_pruning_steps) + { + ExecInitPruningContext(&pprune->exec_context, + pinfo->exec_pruning_steps, + partdesc, partkey, planstate); + /* Record whether exec pruning is needed at any level */ + prunestate->do_exec_prune = true; + } + + /* + * Accumulate the IDs of all PARAM_EXEC Params affecting the + * partitioning decisions at this plan node. + */ + prunestate->execparamids = bms_add_members(prunestate->execparamids, + pinfo->execparamids); + + j++; + } + i++; + } + + return prunestate; +} + +/* + * Initialize a PartitionPruneContext for the given list of pruning steps. 
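+ *
+ * The stepcmpfuncs and exprstates arrays built here are laid out with one
+ * slot per (pruning step, partition key column) pair, addressed via
+ * PruneCxtStateIdx(); so, for example, three pruning steps over a
+ * two-column partition key yield six slots in each array.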
+ */ +static void +ExecInitPruningContext(PartitionPruneContext *context, + List *pruning_steps, + PartitionDesc partdesc, + PartitionKey partkey, + PlanState *planstate) +{ + int n_steps; + int partnatts; + ListCell *lc; + + n_steps = list_length(pruning_steps); + + context->strategy = partkey->strategy; + context->partnatts = partnatts = partkey->partnatts; + context->nparts = partdesc->nparts; + context->boundinfo = partdesc->boundinfo; + context->partcollation = partkey->partcollation; + context->partsupfunc = partkey->partsupfunc; + + /* We'll look up type-specific support functions as needed */ + context->stepcmpfuncs = (FmgrInfo *) + palloc0(sizeof(FmgrInfo) * n_steps * partnatts); + + context->ppccontext = CurrentMemoryContext; + context->planstate = planstate; + + /* Initialize expression state for each expression we need */ + context->exprstates = (ExprState **) + palloc0(sizeof(ExprState *) * n_steps * partnatts); + foreach(lc, pruning_steps) + { + PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc); + ListCell *lc2; + int keyno; + + /* not needed for other step kinds */ + if (!IsA(step, PartitionPruneStepOp)) + continue; + + Assert(list_length(step->exprs) <= partnatts); + + keyno = 0; + foreach(lc2, step->exprs) + { + Expr *expr = (Expr *) lfirst(lc2); + + /* not needed for Consts */ + if (!IsA(expr, Const)) + { + int stateidx = PruneCxtStateIdx(partnatts, + step->step.step_id, + keyno); + + context->exprstates[stateidx] = + ExecInitExpr(expr, context->planstate); + } + keyno++; + } + } +} + +/* + * ExecFindInitialMatchingSubPlans + * Identify the set of subplans that cannot be eliminated by initial + * pruning, disregarding any pruning constraints involving PARAM_EXEC + * Params. + * + * If additional pruning passes will be required (because of PARAM_EXEC + * Params), we must also update the translation data that allows conversion + * of partition indexes into subplan indexes to account for the unneeded + * subplans having been removed. + * + * Must only be called once per 'prunestate', and only if initial pruning + * is required. + * + * 'nsubplans' must be passed as the total number of unpruned subplans. + */ +Bitmapset * +ExecFindInitialMatchingSubPlans(PartitionPruneState *prunestate, int nsubplans) +{ + Bitmapset *result = NULL; + MemoryContext oldcontext; + int i; + + /* Caller error if we get here without do_initial_prune */ + Assert(prunestate->do_initial_prune); + + /* + * Switch to a temp context to avoid leaking memory in the executor's + * query-lifespan memory context. + */ + oldcontext = MemoryContextSwitchTo(prunestate->prune_context); + + /* + * For each hierarchy, do the pruning tests, and add nondeletable + * subplans' indexes to "result". 
+ */ + for (i = 0; i < prunestate->num_partprunedata; i++) + { + PartitionPruningData *prunedata; + PartitionedRelPruningData *pprune; + + prunedata = prunestate->partprunedata[i]; + pprune = &prunedata->partrelprunedata[0]; + + /* Perform pruning without using PARAM_EXEC Params */ + find_matching_subplans_recurse(prunedata, pprune, true, &result); + + /* Expression eval may have used space in node's ps_ExprContext too */ + if (pprune->initial_pruning_steps) + ResetExprContext(pprune->initial_context.planstate->ps_ExprContext); + } + + /* Add in any subplans that partition pruning didn't account for */ + result = bms_add_members(result, prunestate->other_subplans); + + MemoryContextSwitchTo(oldcontext); + + /* Copy result out of the temp context before we reset it */ + result = bms_copy(result); + + MemoryContextReset(prunestate->prune_context); + + /* + * If exec-time pruning is required and we pruned subplans above, then we + * must re-sequence the subplan indexes so that ExecFindMatchingSubPlans + * properly returns the indexes from the subplans which will remain after + * execution of this function. + * + * We can safely skip this when !do_exec_prune, even though that leaves + * invalid data in prunestate, because that data won't be consulted again + * (cf initial Assert in ExecFindMatchingSubPlans). + */ + if (prunestate->do_exec_prune && bms_num_members(result) < nsubplans) + { + int *new_subplan_indexes; + Bitmapset *new_other_subplans; + int i; + int newidx; + + /* + * First we must build a temporary array which maps old subplan + * indexes to new ones. For convenience of initialization, we use + * 1-based indexes in this array and leave pruned items as 0. + */ + new_subplan_indexes = (int *) palloc0(sizeof(int) * nsubplans); + newidx = 1; + i = -1; + while ((i = bms_next_member(result, i)) >= 0) + { + Assert(i < nsubplans); + new_subplan_indexes[i] = newidx++; + } + + /* + * Now we can update each PartitionedRelPruneInfo's subplan_map with + * new subplan indexes. We must also recompute its present_parts + * bitmap. + */ + for (i = 0; i < prunestate->num_partprunedata; i++) + { + PartitionPruningData *prunedata = prunestate->partprunedata[i]; + int j; + + /* + * Within each hierarchy, we perform this loop in back-to-front + * order so that we determine present_parts for the lowest-level + * partitioned tables first. This way we can tell whether a + * sub-partitioned table's partitions were entirely pruned so we + * can exclude it from the current level's present_parts. + */ + for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--) + { + PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j]; + int nparts = pprune->nparts; + int k; + + /* We just rebuild present_parts from scratch */ + bms_free(pprune->present_parts); + pprune->present_parts = NULL; + + for (k = 0; k < nparts; k++) + { + int oldidx = pprune->subplan_map[k]; + int subidx; + + /* + * If this partition existed as a subplan then change the + * old subplan index to the new subplan index. The new + * index may become -1 if the partition was pruned above, + * or it may just come earlier in the subplan list due to + * some subplans being removed earlier in the list. If + * it's a subpartition, add it to present_parts unless + * it's entirely pruned. 
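+ *
+ * A small invented example: with nsubplans = 4 and only subplans 0 and 2
+ * surviving, new_subplan_indexes is {1, 0, 2, 0} (1-based, 0 meaning
+ * pruned), so a subplan_map entry of 0 stays 0, an entry of 2 becomes 1,
+ * and entries 1 and 3 become -1.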
+ */ + if (oldidx >= 0) + { + Assert(oldidx < nsubplans); + pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1; + + if (new_subplan_indexes[oldidx] > 0) + pprune->present_parts = + bms_add_member(pprune->present_parts, k); + } + else if ((subidx = pprune->subpart_map[k]) >= 0) + { + PartitionedRelPruningData *subprune; + + subprune = &prunedata->partrelprunedata[subidx]; + + if (!bms_is_empty(subprune->present_parts)) + pprune->present_parts = + bms_add_member(pprune->present_parts, k); + } + } + } + } + + /* + * We must also recompute the other_subplans set, since indexes in it + * may change. + */ + new_other_subplans = NULL; + i = -1; + while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0) + new_other_subplans = bms_add_member(new_other_subplans, + new_subplan_indexes[i] - 1); + + bms_free(prunestate->other_subplans); + prunestate->other_subplans = new_other_subplans; + + pfree(new_subplan_indexes); + } + + return result; +} + +/* + * ExecFindMatchingSubPlans + * Determine which subplans match the pruning steps detailed in + * 'prunestate' for the current comparison expression values. + * + * Here we assume we may evaluate PARAM_EXEC Params. + */ +Bitmapset * +ExecFindMatchingSubPlans(PartitionPruneState *prunestate) +{ + Bitmapset *result = NULL; + MemoryContext oldcontext; + int i; + + /* + * If !do_exec_prune, we've got problems because + * ExecFindInitialMatchingSubPlans will not have bothered to update + * prunestate for whatever pruning it did. + */ + Assert(prunestate->do_exec_prune); + + /* + * Switch to a temp context to avoid leaking memory in the executor's + * query-lifespan memory context. + */ + oldcontext = MemoryContextSwitchTo(prunestate->prune_context); + + /* + * For each hierarchy, do the pruning tests, and add nondeletable + * subplans' indexes to "result". + */ + for (i = 0; i < prunestate->num_partprunedata; i++) + { + PartitionPruningData *prunedata; + PartitionedRelPruningData *pprune; + + prunedata = prunestate->partprunedata[i]; + pprune = &prunedata->partrelprunedata[0]; + + find_matching_subplans_recurse(prunedata, pprune, false, &result); + + /* Expression eval may have used space in node's ps_ExprContext too */ + if (pprune->exec_pruning_steps) + ResetExprContext(pprune->exec_context.planstate->ps_ExprContext); + } + + /* Add in any subplans that partition pruning didn't account for */ + result = bms_add_members(result, prunestate->other_subplans); + + MemoryContextSwitchTo(oldcontext); + + /* Copy result out of the temp context before we reset it */ + result = bms_copy(result); + + MemoryContextReset(prunestate->prune_context); + + return result; +} + +/* + * find_matching_subplans_recurse + * Recursive worker function for ExecFindMatchingSubPlans and + * ExecFindInitialMatchingSubPlans + * + * Adds valid (non-prunable) subplan IDs to *validsubplans + */ +static void +find_matching_subplans_recurse(PartitionPruningData *prunedata, + PartitionedRelPruningData *pprune, + bool initial_prune, + Bitmapset **validsubplans) +{ + Bitmapset *partset; + int i; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + /* Only prune if pruning would be useful at this level. 
*/ + if (initial_prune && pprune->initial_pruning_steps) + { + partset = get_matching_partitions(&pprune->initial_context, + pprune->initial_pruning_steps); + } + else if (!initial_prune && pprune->exec_pruning_steps) + { + partset = get_matching_partitions(&pprune->exec_context, + pprune->exec_pruning_steps); + } + else + { + /* + * If no pruning is to be done, just include all partitions at this + * level. + */ + partset = pprune->present_parts; + } + + /* Translate partset into subplan indexes */ + i = -1; + while ((i = bms_next_member(partset, i)) >= 0) + { + if (pprune->subplan_map[i] >= 0) + *validsubplans = bms_add_member(*validsubplans, + pprune->subplan_map[i]); + else + { + int partidx = pprune->subpart_map[i]; + + if (partidx >= 0) + find_matching_subplans_recurse(prunedata, + &prunedata->partrelprunedata[partidx], + initial_prune, validsubplans); + else + { + /* + * We get here if the planner already pruned all the sub- + * partitions for this partition. Silently ignore this + * partition in this case. The end result is the same: we + * would have pruned all partitions just the same, but we + * don't have any pruning steps to execute to verify this. + */ + } + } + } +} diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c new file mode 100644 index 0000000..1752b9b --- /dev/null +++ b/src/backend/executor/execProcnode.c @@ -0,0 +1,981 @@ +/*------------------------------------------------------------------------- + * + * execProcnode.c + * contains dispatch functions which call the appropriate "initialize", + * "get a tuple", and "cleanup" routines for the given node type. + * If the node has children, then it will presumably call ExecInitNode, + * ExecProcNode, or ExecEndNode on its subnodes and do the appropriate + * processing. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execProcnode.c + * + *------------------------------------------------------------------------- + */ +/* + * NOTES + * This used to be three files. It is now all combined into + * one file so that it is easier to keep the dispatch routines + * in sync when new nodes are added. + * + * EXAMPLE + * Suppose we want the age of the manager of the shoe department and + * the number of employees in that department. So we have the query: + * + * select DEPT.no_emps, EMP.age + * from DEPT, EMP + * where EMP.name = DEPT.mgr and + * DEPT.name = "shoe" + * + * Suppose the planner gives us the following plan: + * + * Nest Loop (DEPT.mgr = EMP.name) + * / \ + * / \ + * Seq Scan Seq Scan + * DEPT EMP + * (name = "shoe") + * + * ExecutorStart() is called first. + * It calls InitPlan() which calls ExecInitNode() on + * the root of the plan -- the nest loop node. + * + * * ExecInitNode() notices that it is looking at a nest loop and + * as the code below demonstrates, it calls ExecInitNestLoop(). + * Eventually this calls ExecInitNode() on the right and left subplans + * and so forth until the entire plan is initialized. The result + * of ExecInitNode() is a plan state tree built with the same structure + * as the underlying plan tree. + * + * * Then when ExecutorRun() is called, it calls ExecutePlan() which calls + * ExecProcNode() repeatedly on the top node of the plan state tree. + * Each time this happens, ExecProcNode() will end up calling + * ExecNestLoop(), which calls ExecProcNode() on its subplans. 
+ * Each of these subplans is a sequential scan so ExecSeqScan() is + * called. The slots returned by ExecSeqScan() may contain + * tuples which contain the attributes ExecNestLoop() uses to + * form the tuples it returns. + * + * * Eventually ExecSeqScan() stops returning tuples and the nest + * loop join ends. Lastly, ExecutorEnd() calls ExecEndNode() which + * calls ExecEndNestLoop() which in turn calls ExecEndNode() on + * its subplans which result in ExecEndSeqScan(). + * + * This should show how the executor works by having + * ExecInitNode(), ExecProcNode() and ExecEndNode() dispatch + * their work to the appropriate node support routines which may + * in turn call these routines themselves on their subplans. + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeAgg.h" +#include "executor/nodeAppend.h" +#include "executor/nodeBitmapAnd.h" +#include "executor/nodeBitmapHeapscan.h" +#include "executor/nodeBitmapIndexscan.h" +#include "executor/nodeBitmapOr.h" +#include "executor/nodeCtescan.h" +#include "executor/nodeCustom.h" +#include "executor/nodeForeignscan.h" +#include "executor/nodeFunctionscan.h" +#include "executor/nodeGather.h" +#include "executor/nodeGatherMerge.h" +#include "executor/nodeGroup.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "executor/nodeLimit.h" +#include "executor/nodeLockRows.h" +#include "executor/nodeMaterial.h" +#include "executor/nodeMemoize.h" +#include "executor/nodeMergeAppend.h" +#include "executor/nodeMergejoin.h" +#include "executor/nodeModifyTable.h" +#include "executor/nodeNamedtuplestorescan.h" +#include "executor/nodeNestloop.h" +#include "executor/nodeProjectSet.h" +#include "executor/nodeRecursiveunion.h" +#include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" +#include "executor/nodeSeqscan.h" +#include "executor/nodeSetOp.h" +#include "executor/nodeSort.h" +#include "executor/nodeSubplan.h" +#include "executor/nodeSubqueryscan.h" +#include "executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" +#include "executor/nodeTidscan.h" +#include "executor/nodeUnique.h" +#include "executor/nodeValuesscan.h" +#include "executor/nodeWindowAgg.h" +#include "executor/nodeWorktablescan.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" + +static TupleTableSlot *ExecProcNodeFirst(PlanState *node); +static TupleTableSlot *ExecProcNodeInstr(PlanState *node); + + +/* ------------------------------------------------------------------------ + * ExecInitNode + * + * Recursively initializes all the nodes in the plan tree rooted + * at 'node'. + * + * Inputs: + * 'node' is the current node of the plan produced by the query planner + * 'estate' is the shared execution state for the plan tree + * 'eflags' is a bitwise OR of flag bits described in executor.h + * + * Returns a PlanState node corresponding to the given Plan node. + * ------------------------------------------------------------------------ + */ +PlanState * +ExecInitNode(Plan *node, EState *estate, int eflags) +{ + PlanState *result; + List *subps; + ListCell *l; + + /* + * do nothing when we get to the end of a leaf on tree. + */ + if (node == NULL) + return NULL; + + /* + * Make sure there's enough stack available. Need to check here, in + * addition to ExecProcNode() (via ExecProcNodeFirst()), to ensure the + * stack isn't overrun while initializing the node tree. 
+ */ + check_stack_depth(); + + switch (nodeTag(node)) + { + /* + * control nodes + */ + case T_Result: + result = (PlanState *) ExecInitResult((Result *) node, + estate, eflags); + break; + + case T_ProjectSet: + result = (PlanState *) ExecInitProjectSet((ProjectSet *) node, + estate, eflags); + break; + + case T_ModifyTable: + result = (PlanState *) ExecInitModifyTable((ModifyTable *) node, + estate, eflags); + break; + + case T_Append: + result = (PlanState *) ExecInitAppend((Append *) node, + estate, eflags); + break; + + case T_MergeAppend: + result = (PlanState *) ExecInitMergeAppend((MergeAppend *) node, + estate, eflags); + break; + + case T_RecursiveUnion: + result = (PlanState *) ExecInitRecursiveUnion((RecursiveUnion *) node, + estate, eflags); + break; + + case T_BitmapAnd: + result = (PlanState *) ExecInitBitmapAnd((BitmapAnd *) node, + estate, eflags); + break; + + case T_BitmapOr: + result = (PlanState *) ExecInitBitmapOr((BitmapOr *) node, + estate, eflags); + break; + + /* + * scan nodes + */ + case T_SeqScan: + result = (PlanState *) ExecInitSeqScan((SeqScan *) node, + estate, eflags); + break; + + case T_SampleScan: + result = (PlanState *) ExecInitSampleScan((SampleScan *) node, + estate, eflags); + break; + + case T_IndexScan: + result = (PlanState *) ExecInitIndexScan((IndexScan *) node, + estate, eflags); + break; + + case T_IndexOnlyScan: + result = (PlanState *) ExecInitIndexOnlyScan((IndexOnlyScan *) node, + estate, eflags); + break; + + case T_BitmapIndexScan: + result = (PlanState *) ExecInitBitmapIndexScan((BitmapIndexScan *) node, + estate, eflags); + break; + + case T_BitmapHeapScan: + result = (PlanState *) ExecInitBitmapHeapScan((BitmapHeapScan *) node, + estate, eflags); + break; + + case T_TidScan: + result = (PlanState *) ExecInitTidScan((TidScan *) node, + estate, eflags); + break; + + case T_TidRangeScan: + result = (PlanState *) ExecInitTidRangeScan((TidRangeScan *) node, + estate, eflags); + break; + + case T_SubqueryScan: + result = (PlanState *) ExecInitSubqueryScan((SubqueryScan *) node, + estate, eflags); + break; + + case T_FunctionScan: + result = (PlanState *) ExecInitFunctionScan((FunctionScan *) node, + estate, eflags); + break; + + case T_TableFuncScan: + result = (PlanState *) ExecInitTableFuncScan((TableFuncScan *) node, + estate, eflags); + break; + + case T_ValuesScan: + result = (PlanState *) ExecInitValuesScan((ValuesScan *) node, + estate, eflags); + break; + + case T_CteScan: + result = (PlanState *) ExecInitCteScan((CteScan *) node, + estate, eflags); + break; + + case T_NamedTuplestoreScan: + result = (PlanState *) ExecInitNamedTuplestoreScan((NamedTuplestoreScan *) node, + estate, eflags); + break; + + case T_WorkTableScan: + result = (PlanState *) ExecInitWorkTableScan((WorkTableScan *) node, + estate, eflags); + break; + + case T_ForeignScan: + result = (PlanState *) ExecInitForeignScan((ForeignScan *) node, + estate, eflags); + break; + + case T_CustomScan: + result = (PlanState *) ExecInitCustomScan((CustomScan *) node, + estate, eflags); + break; + + /* + * join nodes + */ + case T_NestLoop: + result = (PlanState *) ExecInitNestLoop((NestLoop *) node, + estate, eflags); + break; + + case T_MergeJoin: + result = (PlanState *) ExecInitMergeJoin((MergeJoin *) node, + estate, eflags); + break; + + case T_HashJoin: + result = (PlanState *) ExecInitHashJoin((HashJoin *) node, + estate, eflags); + break; + + /* + * materialization nodes + */ + case T_Material: + result = (PlanState *) ExecInitMaterial((Material *) node, + estate, 
eflags); + break; + + case T_Sort: + result = (PlanState *) ExecInitSort((Sort *) node, + estate, eflags); + break; + + case T_IncrementalSort: + result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node, + estate, eflags); + break; + + case T_Memoize: + result = (PlanState *) ExecInitMemoize((Memoize *) node, estate, + eflags); + break; + + case T_Group: + result = (PlanState *) ExecInitGroup((Group *) node, + estate, eflags); + break; + + case T_Agg: + result = (PlanState *) ExecInitAgg((Agg *) node, + estate, eflags); + break; + + case T_WindowAgg: + result = (PlanState *) ExecInitWindowAgg((WindowAgg *) node, + estate, eflags); + break; + + case T_Unique: + result = (PlanState *) ExecInitUnique((Unique *) node, + estate, eflags); + break; + + case T_Gather: + result = (PlanState *) ExecInitGather((Gather *) node, + estate, eflags); + break; + + case T_GatherMerge: + result = (PlanState *) ExecInitGatherMerge((GatherMerge *) node, + estate, eflags); + break; + + case T_Hash: + result = (PlanState *) ExecInitHash((Hash *) node, + estate, eflags); + break; + + case T_SetOp: + result = (PlanState *) ExecInitSetOp((SetOp *) node, + estate, eflags); + break; + + case T_LockRows: + result = (PlanState *) ExecInitLockRows((LockRows *) node, + estate, eflags); + break; + + case T_Limit: + result = (PlanState *) ExecInitLimit((Limit *) node, + estate, eflags); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + result = NULL; /* keep compiler quiet */ + break; + } + + ExecSetExecProcNode(result, result->ExecProcNode); + + /* + * Initialize any initPlans present in this node. The planner put them in + * a separate list for us. + */ + subps = NIL; + foreach(l, node->initPlan) + { + SubPlan *subplan = (SubPlan *) lfirst(l); + SubPlanState *sstate; + + Assert(IsA(subplan, SubPlan)); + sstate = ExecInitSubPlan(subplan, result); + subps = lappend(subps, sstate); + } + result->initPlan = subps; + + /* Set up instrumentation for this node if requested */ + if (estate->es_instrument) + result->instrument = InstrAlloc(1, estate->es_instrument, + result->async_capable); + + return result; +} + + +/* + * If a node wants to change its ExecProcNode function after ExecInitNode() + * has finished, it should do so with this function. That way any wrapper + * functions can be reinstalled, without the node having to know how that + * works. + */ +void +ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function) +{ + /* + * Add a wrapper around the ExecProcNode callback that checks stack depth + * during the first execution and maybe adds an instrumentation wrapper. + * When the callback is changed after execution has already begun that + * means we'll superfluously execute ExecProcNodeFirst, but that seems ok. + */ + node->ExecProcNodeReal = function; + node->ExecProcNode = ExecProcNodeFirst; +} + + +/* + * ExecProcNode wrapper that performs some one-time checks, before calling + * the relevant node method (possibly via an instrumentation wrapper). + */ +static TupleTableSlot * +ExecProcNodeFirst(PlanState *node) +{ + /* + * Perform stack depth check during the first execution of the node. We + * only do so the first time round because it turns out to not be cheap on + * some common architectures (eg. x86). This relies on the assumption + * that ExecProcNode calls for a given plan node will always be made at + * roughly the same stack depth. 
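+ *
+ * (The typical top-level driver is a plain loop along the lines of
+ * "slot = ExecProcNode(planstate); if (TupIsNull(slot)) break;", as in
+ * ExecutePlan(), so after this first call every further iteration goes
+ * straight to the swapped-in callback.)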
+ */ + check_stack_depth(); + + /* + * If instrumentation is required, change the wrapper to one that just + * does instrumentation. Otherwise we can dispense with all wrappers and + * have ExecProcNode() directly call the relevant function from now on. + */ + if (node->instrument) + node->ExecProcNode = ExecProcNodeInstr; + else + node->ExecProcNode = node->ExecProcNodeReal; + + return node->ExecProcNode(node); +} + + +/* + * ExecProcNode wrapper that performs instrumentation calls. By keeping + * this a separate function, we avoid overhead in the normal case where + * no instrumentation is wanted. + */ +static TupleTableSlot * +ExecProcNodeInstr(PlanState *node) +{ + TupleTableSlot *result; + + InstrStartNode(node->instrument); + + result = node->ExecProcNodeReal(node); + + InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0); + + return result; +} + + +/* ---------------------------------------------------------------- + * MultiExecProcNode + * + * Execute a node that doesn't return individual tuples + * (it might return a hashtable, bitmap, etc). Caller should + * check it got back the expected kind of Node. + * + * This has essentially the same responsibilities as ExecProcNode, + * but it does not do InstrStartNode/InstrStopNode (mainly because + * it can't tell how many returned tuples to count). Each per-node + * function must provide its own instrumentation support. + * ---------------------------------------------------------------- + */ +Node * +MultiExecProcNode(PlanState *node) +{ + Node *result; + + check_stack_depth(); + + CHECK_FOR_INTERRUPTS(); + + if (node->chgParam != NULL) /* something changed */ + ExecReScan(node); /* let ReScan handle this */ + + switch (nodeTag(node)) + { + /* + * Only node types that actually support multiexec will be listed + */ + + case T_HashState: + result = MultiExecHash((HashState *) node); + break; + + case T_BitmapIndexScanState: + result = MultiExecBitmapIndexScan((BitmapIndexScanState *) node); + break; + + case T_BitmapAndState: + result = MultiExecBitmapAnd((BitmapAndState *) node); + break; + + case T_BitmapOrState: + result = MultiExecBitmapOr((BitmapOrState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + result = NULL; + break; + } + + return result; +} + + +/* ---------------------------------------------------------------- + * ExecEndNode + * + * Recursively cleans up all the nodes in the plan rooted + * at 'node'. + * + * After this operation, the query plan will not be able to be + * processed any further. This should be called only after + * the query plan has been fully executed. + * ---------------------------------------------------------------- + */ +void +ExecEndNode(PlanState *node) +{ + /* + * do nothing when we get to the end of a leaf on tree. + */ + if (node == NULL) + return; + + /* + * Make sure there's enough stack available. Need to check here, in + * addition to ExecProcNode() (via ExecProcNodeFirst()), because it's not + * guaranteed that ExecProcNode() is reached for all nodes. 
+ */ + check_stack_depth(); + + if (node->chgParam != NULL) + { + bms_free(node->chgParam); + node->chgParam = NULL; + } + + switch (nodeTag(node)) + { + /* + * control nodes + */ + case T_ResultState: + ExecEndResult((ResultState *) node); + break; + + case T_ProjectSetState: + ExecEndProjectSet((ProjectSetState *) node); + break; + + case T_ModifyTableState: + ExecEndModifyTable((ModifyTableState *) node); + break; + + case T_AppendState: + ExecEndAppend((AppendState *) node); + break; + + case T_MergeAppendState: + ExecEndMergeAppend((MergeAppendState *) node); + break; + + case T_RecursiveUnionState: + ExecEndRecursiveUnion((RecursiveUnionState *) node); + break; + + case T_BitmapAndState: + ExecEndBitmapAnd((BitmapAndState *) node); + break; + + case T_BitmapOrState: + ExecEndBitmapOr((BitmapOrState *) node); + break; + + /* + * scan nodes + */ + case T_SeqScanState: + ExecEndSeqScan((SeqScanState *) node); + break; + + case T_SampleScanState: + ExecEndSampleScan((SampleScanState *) node); + break; + + case T_GatherState: + ExecEndGather((GatherState *) node); + break; + + case T_GatherMergeState: + ExecEndGatherMerge((GatherMergeState *) node); + break; + + case T_IndexScanState: + ExecEndIndexScan((IndexScanState *) node); + break; + + case T_IndexOnlyScanState: + ExecEndIndexOnlyScan((IndexOnlyScanState *) node); + break; + + case T_BitmapIndexScanState: + ExecEndBitmapIndexScan((BitmapIndexScanState *) node); + break; + + case T_BitmapHeapScanState: + ExecEndBitmapHeapScan((BitmapHeapScanState *) node); + break; + + case T_TidScanState: + ExecEndTidScan((TidScanState *) node); + break; + + case T_TidRangeScanState: + ExecEndTidRangeScan((TidRangeScanState *) node); + break; + + case T_SubqueryScanState: + ExecEndSubqueryScan((SubqueryScanState *) node); + break; + + case T_FunctionScanState: + ExecEndFunctionScan((FunctionScanState *) node); + break; + + case T_TableFuncScanState: + ExecEndTableFuncScan((TableFuncScanState *) node); + break; + + case T_ValuesScanState: + ExecEndValuesScan((ValuesScanState *) node); + break; + + case T_CteScanState: + ExecEndCteScan((CteScanState *) node); + break; + + case T_NamedTuplestoreScanState: + ExecEndNamedTuplestoreScan((NamedTuplestoreScanState *) node); + break; + + case T_WorkTableScanState: + ExecEndWorkTableScan((WorkTableScanState *) node); + break; + + case T_ForeignScanState: + ExecEndForeignScan((ForeignScanState *) node); + break; + + case T_CustomScanState: + ExecEndCustomScan((CustomScanState *) node); + break; + + /* + * join nodes + */ + case T_NestLoopState: + ExecEndNestLoop((NestLoopState *) node); + break; + + case T_MergeJoinState: + ExecEndMergeJoin((MergeJoinState *) node); + break; + + case T_HashJoinState: + ExecEndHashJoin((HashJoinState *) node); + break; + + /* + * materialization nodes + */ + case T_MaterialState: + ExecEndMaterial((MaterialState *) node); + break; + + case T_SortState: + ExecEndSort((SortState *) node); + break; + + case T_IncrementalSortState: + ExecEndIncrementalSort((IncrementalSortState *) node); + break; + + case T_MemoizeState: + ExecEndMemoize((MemoizeState *) node); + break; + + case T_GroupState: + ExecEndGroup((GroupState *) node); + break; + + case T_AggState: + ExecEndAgg((AggState *) node); + break; + + case T_WindowAggState: + ExecEndWindowAgg((WindowAggState *) node); + break; + + case T_UniqueState: + ExecEndUnique((UniqueState *) node); + break; + + case T_HashState: + ExecEndHash((HashState *) node); + break; + + case T_SetOpState: + ExecEndSetOp((SetOpState *) node); + break; 
+ + case T_LockRowsState: + ExecEndLockRows((LockRowsState *) node); + break; + + case T_LimitState: + ExecEndLimit((LimitState *) node); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * ExecShutdownNode + * + * Give execution nodes a chance to stop asynchronous resource consumption + * and release any resources still held. + */ +bool +ExecShutdownNode(PlanState *node) +{ + if (node == NULL) + return false; + + check_stack_depth(); + + /* + * Treat the node as running while we shut it down, but only if it's run + * at least once already. We don't expect much CPU consumption during + * node shutdown, but in the case of Gather or Gather Merge, we may shut + * down workers at this stage. If so, their buffer usage will get + * propagated into pgBufferUsage at this point, and we want to make sure + * that it gets associated with the Gather node. We skip this if the node + * has never been executed, so as to avoid incorrectly making it appear + * that it has. + */ + if (node->instrument && node->instrument->running) + InstrStartNode(node->instrument); + + planstate_tree_walker(node, ExecShutdownNode, NULL); + + switch (nodeTag(node)) + { + case T_GatherState: + ExecShutdownGather((GatherState *) node); + break; + case T_ForeignScanState: + ExecShutdownForeignScan((ForeignScanState *) node); + break; + case T_CustomScanState: + ExecShutdownCustomScan((CustomScanState *) node); + break; + case T_GatherMergeState: + ExecShutdownGatherMerge((GatherMergeState *) node); + break; + case T_HashState: + ExecShutdownHash((HashState *) node); + break; + case T_HashJoinState: + ExecShutdownHashJoin((HashJoinState *) node); + break; + default: + break; + } + + /* Stop the node if we started it above, reporting 0 tuples. */ + if (node->instrument && node->instrument->running) + InstrStopNode(node->instrument, 0); + + return false; +} + +/* + * ExecSetTupleBound + * + * Set a tuple bound for a planstate node. This lets child plan nodes + * optimize based on the knowledge that the maximum number of tuples that + * their parent will demand is limited. The tuple bound for a node may + * only be changed between scans (i.e., after node initialization or just + * before an ExecReScan call). + * + * Any negative tuples_needed value means "no limit", which should be the + * default assumption when this is not called at all for a particular node. + * + * Note: if this is called repeatedly on a plan tree, the exact same set + * of nodes must be updated with the new limit each time; be careful that + * only unchanging conditions are tested here. + */ +void +ExecSetTupleBound(int64 tuples_needed, PlanState *child_node) +{ + /* + * Since this function recurses, in principle we should check stack depth + * here. In practice, it's probably pointless since the earlier node + * initialization tree traversal would surely have consumed more stack. + */ + + if (IsA(child_node, SortState)) + { + /* + * If it is a Sort node, notify it that it can use bounded sort. + * + * Note: it is the responsibility of nodeSort.c to react properly to + * changes of these parameters. If we ever redesign this, it'd be a + * good idea to integrate this signaling with the parameter-change + * mechanism. 
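+ *
+ * (To illustrate the payoff: once "bounded" is set, ExecSort() hands the
+ * bound to tuplesort_set_bound(), letting tuplesort keep just the top N
+ * tuples in a bounded heap rather than sorting the entire input.)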
+ */ + SortState *sortState = (SortState *) child_node; + + if (tuples_needed < 0) + { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } + else + { + sortState->bounded = true; + sortState->bound = tuples_needed; + } + } + else if (IsA(child_node, IncrementalSortState)) + { + /* + * If it is an IncrementalSort node, notify it that it can use bounded + * sort. + * + * Note: it is the responsibility of nodeIncrementalSort.c to react + * properly to changes of these parameters. If we ever redesign this, + * it'd be a good idea to integrate this signaling with the + * parameter-change mechanism. + */ + IncrementalSortState *sortState = (IncrementalSortState *) child_node; + + if (tuples_needed < 0) + { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } + else + { + sortState->bounded = true; + sortState->bound = tuples_needed; + } + } + else if (IsA(child_node, AppendState)) + { + /* + * If it is an Append, we can apply the bound to any nodes that are + * children of the Append, since the Append surely need read no more + * than that many tuples from any one input. + */ + AppendState *aState = (AppendState *) child_node; + int i; + + for (i = 0; i < aState->as_nplans; i++) + ExecSetTupleBound(tuples_needed, aState->appendplans[i]); + } + else if (IsA(child_node, MergeAppendState)) + { + /* + * If it is a MergeAppend, we can apply the bound to any nodes that + * are children of the MergeAppend, since the MergeAppend surely need + * read no more than that many tuples from any one input. + */ + MergeAppendState *maState = (MergeAppendState *) child_node; + int i; + + for (i = 0; i < maState->ms_nplans; i++) + ExecSetTupleBound(tuples_needed, maState->mergeplans[i]); + } + else if (IsA(child_node, ResultState)) + { + /* + * Similarly, for a projecting Result, we can apply the bound to its + * child node. + * + * If Result supported qual checking, we'd have to punt on seeing a + * qual. Note that having a resconstantqual is not a showstopper: if + * that condition succeeds it affects nothing, while if it fails, no + * rows will be demanded from the Result child anyway. + */ + if (outerPlanState(child_node)) + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + else if (IsA(child_node, SubqueryScanState)) + { + /* + * We can also descend through SubqueryScan, but only if it has no + * qual (otherwise it might discard rows). + */ + SubqueryScanState *subqueryState = (SubqueryScanState *) child_node; + + if (subqueryState->ss.ps.qual == NULL) + ExecSetTupleBound(tuples_needed, subqueryState->subplan); + } + else if (IsA(child_node, GatherState)) + { + /* + * A Gather node can propagate the bound to its workers. As with + * MergeAppend, no one worker could possibly need to return more + * tuples than the Gather itself needs to. + * + * Note: As with Sort, the Gather node is responsible for reacting + * properly to changes to this parameter. 
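+ *
+ * (Roughly speaking, the stored tuples_needed is later handed to the
+ * parallel-query machinery via ExecInitParallelPlan(), so that worker
+ * processes can apply the same bound to their copies of the plan.)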
+ */ + GatherState *gstate = (GatherState *) child_node; + + gstate->tuples_needed = tuples_needed; + + /* Also pass down the bound to our own copy of the child plan */ + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + else if (IsA(child_node, GatherMergeState)) + { + /* Same comments as for Gather */ + GatherMergeState *gstate = (GatherMergeState *) child_node; + + gstate->tuples_needed = tuples_needed; + + ExecSetTupleBound(tuples_needed, outerPlanState(child_node)); + } + + /* + * In principle we could descend through any plan node type that is + * certain not to discard or combine input rows; but on seeing a node that + * can do that, we can't propagate the bound any further. For the moment + * it's unclear that any other cases are worth checking here. + */ +} diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c new file mode 100644 index 0000000..1e285e0 --- /dev/null +++ b/src/backend/executor/execReplication.c @@ -0,0 +1,629 @@ +/*------------------------------------------------------------------------- + * + * execReplication.c + * miscellaneous executor routines for logical replication + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execReplication.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "commands/trigger.h" +#include "executor/executor.h" +#include "executor/nodeModifyTable.h" +#include "nodes/nodeFuncs.h" +#include "parser/parse_relation.h" +#include "parser/parsetree.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + + +/* + * Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that + * is setup to match 'rel' (*NOT* idxrel!). + * + * Returns whether any column contains NULLs. + * + * This is not generic routine, it expects the idxrel to be replication + * identity of a rel and meet all limitations associated with that. + */ +static bool +build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel, + TupleTableSlot *searchslot) +{ + int attoff; + bool isnull; + Datum indclassDatum; + oidvector *opclass; + int2vector *indkey = &idxrel->rd_index->indkey; + bool hasnulls = false; + + Assert(RelationGetReplicaIndex(rel) == RelationGetRelid(idxrel) || + RelationGetPrimaryKeyIndex(rel) == RelationGetRelid(idxrel)); + + indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple, + Anum_pg_index_indclass, &isnull); + Assert(!isnull); + opclass = (oidvector *) DatumGetPointer(indclassDatum); + + /* Build scankey for every attribute in the index. */ + for (attoff = 0; attoff < IndexRelationGetNumberOfKeyAttributes(idxrel); attoff++) + { + Oid operator; + Oid opfamily; + RegProcedure regop; + int pkattno = attoff + 1; + int mainattno = indkey->values[attoff]; + Oid optype = get_opclass_input_type(opclass->values[attoff]); + + /* + * Load the operator info. We need this to get the equality operator + * function for the scan key. 
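+ *
+ * For example (purely illustrative), for an int4 key column with the
+ * default btree opclass this resolves BTEqualStrategyNumber within the
+ * integer_ops opfamily to the "=" operator, and get_opcode() then yields
+ * the underlying equality function (int4eq) for ScanKeyInit() to use.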
+ */ + opfamily = get_opclass_family(opclass->values[attoff]); + + operator = get_opfamily_member(opfamily, optype, + optype, + BTEqualStrategyNumber); + if (!OidIsValid(operator)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + BTEqualStrategyNumber, optype, optype, opfamily); + + regop = get_opcode(operator); + + /* Initialize the scankey. */ + ScanKeyInit(&skey[attoff], + pkattno, + BTEqualStrategyNumber, + regop, + searchslot->tts_values[mainattno - 1]); + + skey[attoff].sk_collation = idxrel->rd_indcollation[attoff]; + + /* Check for null value. */ + if (searchslot->tts_isnull[mainattno - 1]) + { + hasnulls = true; + skey[attoff].sk_flags |= SK_ISNULL; + } + } + + return hasnulls; +} + +/* + * Search the relation 'rel' for tuple using the index. + * + * If a matching tuple is found, lock it with lockmode, fill the slot with its + * contents, and return true. Return false otherwise. + */ +bool +RelationFindReplTupleByIndex(Relation rel, Oid idxoid, + LockTupleMode lockmode, + TupleTableSlot *searchslot, + TupleTableSlot *outslot) +{ + ScanKeyData skey[INDEX_MAX_KEYS]; + IndexScanDesc scan; + SnapshotData snap; + TransactionId xwait; + Relation idxrel; + bool found; + + /* Open the index. */ + idxrel = index_open(idxoid, RowExclusiveLock); + + /* Start an index scan. */ + InitDirtySnapshot(snap); + scan = index_beginscan(rel, idxrel, &snap, + IndexRelationGetNumberOfKeyAttributes(idxrel), + 0); + + /* Build scan key. */ + build_replindex_scan_key(skey, rel, idxrel, searchslot); + +retry: + found = false; + + index_rescan(scan, skey, IndexRelationGetNumberOfKeyAttributes(idxrel), NULL, 0); + + /* Try to find the tuple */ + if (index_getnext_slot(scan, ForwardScanDirection, outslot)) + { + found = true; + ExecMaterializeSlot(outslot); + + xwait = TransactionIdIsValid(snap.xmin) ? + snap.xmin : snap.xmax; + + /* + * If the tuple is locked, wait for locking transaction to finish and + * retry. + */ + if (TransactionIdIsValid(xwait)) + { + XactLockTableWait(xwait, NULL, NULL, XLTW_None); + goto retry; + } + } + + /* Found tuple, try to lock it in the lockmode. */ + if (found) + { + TM_FailureData tmfd; + TM_Result res; + + PushActiveSnapshot(GetLatestSnapshot()); + + res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + outslot, + GetCurrentCommandId(false), + lockmode, + LockWaitBlock, + 0 /* don't follow updates */ , + &tmfd); + + PopActiveSnapshot(); + + switch (res) + { + case TM_Ok: + break; + case TM_Updated: + /* XXX: Improve handling here */ + if (ItemPointerIndicatesMovedPartitions(&tmfd.ctid)) + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying"))); + else + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent update, retrying"))); + goto retry; + case TM_Deleted: + /* XXX: Improve handling here */ + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent delete, retrying"))); + goto retry; + case TM_Invisible: + elog(ERROR, "attempted to lock invisible tuple"); + break; + default: + elog(ERROR, "unexpected table_tuple_lock status: %u", res); + break; + } + } + + index_endscan(scan); + + /* Don't release lock until commit. */ + index_close(idxrel, NoLock); + + return found; +} + +/* + * Compare the tuples in the slots by checking if they have equal values. 
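+ *
+ * (RelationFindReplTupleSeq() below relies on this to match rows column
+ * by column when no usable replica identity index exists, e.g. for
+ * tables using REPLICA IDENTITY FULL.)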
+ */ +static bool +tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, + TypeCacheEntry **eq) +{ + int attrnum; + + Assert(slot1->tts_tupleDescriptor->natts == + slot2->tts_tupleDescriptor->natts); + + slot_getallattrs(slot1); + slot_getallattrs(slot2); + + /* Check equality of the attributes. */ + for (attrnum = 0; attrnum < slot1->tts_tupleDescriptor->natts; attrnum++) + { + Form_pg_attribute att; + TypeCacheEntry *typentry; + + /* + * If one value is NULL and other is not, then they are certainly not + * equal + */ + if (slot1->tts_isnull[attrnum] != slot2->tts_isnull[attrnum]) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (slot1->tts_isnull[attrnum] || slot2->tts_isnull[attrnum]) + continue; + + att = TupleDescAttr(slot1->tts_tupleDescriptor, attrnum); + + typentry = eq[attrnum]; + if (typentry == NULL) + { + typentry = lookup_type_cache(att->atttypid, + TYPECACHE_EQ_OPR_FINFO); + if (!OidIsValid(typentry->eq_opr_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify an equality operator for type %s", + format_type_be(att->atttypid)))); + eq[attrnum] = typentry; + } + + if (!DatumGetBool(FunctionCall2Coll(&typentry->eq_opr_finfo, + att->attcollation, + slot1->tts_values[attrnum], + slot2->tts_values[attrnum]))) + return false; + } + + return true; +} + +/* + * Search the relation 'rel' for tuple using the sequential scan. + * + * If a matching tuple is found, lock it with lockmode, fill the slot with its + * contents, and return true. Return false otherwise. + * + * Note that this stops on the first matching tuple. + * + * This can obviously be quite slow on tables that have more than few rows. + */ +bool +RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, + TupleTableSlot *searchslot, TupleTableSlot *outslot) +{ + TupleTableSlot *scanslot; + TableScanDesc scan; + SnapshotData snap; + TypeCacheEntry **eq; + TransactionId xwait; + bool found; + TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel); + + Assert(equalTupleDescs(desc, outslot->tts_tupleDescriptor)); + + eq = palloc0(sizeof(*eq) * outslot->tts_tupleDescriptor->natts); + + /* Start a heap scan. */ + InitDirtySnapshot(snap); + scan = table_beginscan(rel, &snap, 0, NULL); + scanslot = table_slot_create(rel, NULL); + +retry: + found = false; + + table_rescan(scan, NULL); + + /* Try to find the tuple */ + while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot)) + { + if (!tuples_equal(scanslot, searchslot, eq)) + continue; + + found = true; + ExecCopySlot(outslot, scanslot); + + xwait = TransactionIdIsValid(snap.xmin) ? + snap.xmin : snap.xmax; + + /* + * If the tuple is locked, wait for locking transaction to finish and + * retry. + */ + if (TransactionIdIsValid(xwait)) + { + XactLockTableWait(xwait, NULL, NULL, XLTW_None); + goto retry; + } + + /* Found our tuple and it's not locked */ + break; + } + + /* Found tuple, try to lock it in the lockmode. 
*/ + if (found) + { + TM_FailureData tmfd; + TM_Result res; + + PushActiveSnapshot(GetLatestSnapshot()); + + res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + outslot, + GetCurrentCommandId(false), + lockmode, + LockWaitBlock, + 0 /* don't follow updates */ , + &tmfd); + + PopActiveSnapshot(); + + switch (res) + { + case TM_Ok: + break; + case TM_Updated: + /* XXX: Improve handling here */ + if (ItemPointerIndicatesMovedPartitions(&tmfd.ctid)) + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying"))); + else + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent update, retrying"))); + goto retry; + case TM_Deleted: + /* XXX: Improve handling here */ + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent delete, retrying"))); + goto retry; + case TM_Invisible: + elog(ERROR, "attempted to lock invisible tuple"); + break; + default: + elog(ERROR, "unexpected table_tuple_lock status: %u", res); + break; + } + } + + table_endscan(scan); + ExecDropSingleTupleTableSlot(scanslot); + + return found; +} + +/* + * Insert tuple represented in the slot to the relation, update the indexes, + * and execute any constraints and per-row triggers. + * + * Caller is responsible for opening the indexes. + */ +void +ExecSimpleRelationInsert(ResultRelInfo *resultRelInfo, + EState *estate, TupleTableSlot *slot) +{ + bool skip_tuple = false; + Relation rel = resultRelInfo->ri_RelationDesc; + + /* For now we support only tables. */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION); + + CheckCmdReplicaIdentity(rel, CMD_INSERT); + + /* BEFORE ROW INSERT Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row) + { + if (!ExecBRInsertTriggers(estate, resultRelInfo, slot)) + skip_tuple = true; /* "do nothing" */ + } + + if (!skip_tuple) + { + List *recheckIndexes = NIL; + + /* Compute stored generated columns */ + if (rel->rd_att->constr && + rel->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_INSERT); + + /* Check the constraints of the tuple */ + if (rel->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + if (rel->rd_rel->relispartition) + ExecPartitionCheck(resultRelInfo, slot, estate, true); + + /* OK, store the tuple and create index entries for it */ + simple_table_tuple_insert(resultRelInfo->ri_RelationDesc, slot); + + if (resultRelInfo->ri_NumIndices > 0) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, false, + NULL, NIL); + + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, slot, + recheckIndexes, NULL); + + /* + * XXX we should in theory pass a TransitionCaptureState object to the + * above to capture transition tuples, but after statement triggers + * don't actually get fired by replication yet anyway + */ + + list_free(recheckIndexes); + } +} + +/* + * Find the searchslot tuple and update it with data in the slot, + * update the indexes, and execute any constraints and per-row triggers. + * + * Caller is responsible for opening the indexes. 
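+ * As a rough illustration (hypothetical slot names; error handling and
+ * EPQ setup omitted), a caller is expected to do something like:
+ *
+ *		ExecOpenIndices(resultRelInfo, false);
+ *		ExecSimpleRelationUpdate(resultRelInfo, estate, &epqstate,
+ *								 searchslot, newslot);
+ *		ExecCloseIndices(resultRelInfo);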
+ */ +void +ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, + EState *estate, EPQState *epqstate, + TupleTableSlot *searchslot, TupleTableSlot *slot) +{ + bool skip_tuple = false; + Relation rel = resultRelInfo->ri_RelationDesc; + ItemPointer tid = &(searchslot->tts_tid); + + /* For now we support only tables. */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION); + + CheckCmdReplicaIdentity(rel, CMD_UPDATE); + + /* BEFORE ROW UPDATE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row) + { + if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, + tid, NULL, slot)) + skip_tuple = true; /* "do nothing" */ + } + + if (!skip_tuple) + { + List *recheckIndexes = NIL; + bool update_indexes; + + /* Compute stored generated columns */ + if (rel->rd_att->constr && + rel->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_UPDATE); + + /* Check the constraints of the tuple */ + if (rel->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + if (rel->rd_rel->relispartition) + ExecPartitionCheck(resultRelInfo, slot, estate, true); + + simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, + &update_indexes); + + if (resultRelInfo->ri_NumIndices > 0 && update_indexes) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, true, false, + NULL, NIL); + + /* AFTER ROW UPDATE Triggers */ + ExecARUpdateTriggers(estate, resultRelInfo, + tid, NULL, slot, + recheckIndexes, NULL); + + list_free(recheckIndexes); + } +} + +/* + * Find the searchslot tuple and delete it, and execute any constraints + * and per-row triggers. + * + * Caller is responsible for opening the indexes. + */ +void +ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, + EState *estate, EPQState *epqstate, + TupleTableSlot *searchslot) +{ + bool skip_tuple = false; + Relation rel = resultRelInfo->ri_RelationDesc; + ItemPointer tid = &searchslot->tts_tid; + + CheckCmdReplicaIdentity(rel, CMD_DELETE); + + /* BEFORE ROW DELETE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_before_row) + { + skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, + tid, NULL, NULL); + + } + + if (!skip_tuple) + { + /* OK, delete the tuple */ + simple_table_tuple_delete(rel, tid, estate->es_snapshot); + + /* AFTER ROW DELETE Triggers */ + ExecARDeleteTriggers(estate, resultRelInfo, + tid, NULL, NULL); + } +} + +/* + * Check if command can be executed with current replica identity. + */ +void +CheckCmdReplicaIdentity(Relation rel, CmdType cmd) +{ + PublicationActions *pubactions; + + /* We only need to do checks for UPDATE and DELETE. */ + if (cmd != CMD_UPDATE && cmd != CMD_DELETE) + return; + + /* If relation has replica identity we are always good. */ + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + OidIsValid(RelationGetReplicaIndex(rel))) + return; + + /* + * This is either UPDATE OR DELETE and there is no replica identity. + * + * Check if the table publishes UPDATES or DELETES. 
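+ * If it does, we must reject the command; the errhint below points the
+ * user at the fix, e.g. (illustrative table name)
+ *		ALTER TABLE mytab REPLICA IDENTITY FULL;
+ * or REPLICA IDENTITY USING INDEX with a suitable unique index.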
+ */ + pubactions = GetRelationPublicationActions(rel); + if (cmd == CMD_UPDATE && pubactions->pubupdate) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot update table \"%s\" because it does not have a replica identity and publishes updates", + RelationGetRelationName(rel)), + errhint("To enable updating the table, set REPLICA IDENTITY using ALTER TABLE."))); + else if (cmd == CMD_DELETE && pubactions->pubdelete) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot delete from table \"%s\" because it does not have a replica identity and publishes deletes", + RelationGetRelationName(rel)), + errhint("To enable deleting from the table, set REPLICA IDENTITY using ALTER TABLE."))); +} + + +/* + * Check if we support writing into specific relkind. + * + * The nspname and relname are only needed for error reporting. + */ +void +CheckSubscriptionRelkind(char relkind, const char *nspname, + const char *relname) +{ + /* + * Give a more specific error for foreign tables. + */ + if (relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot use relation \"%s.%s\" as logical replication target", + nspname, relname), + errdetail("\"%s.%s\" is a foreign table.", + nspname, relname))); + + if (relkind != RELKIND_RELATION && relkind != RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot use relation \"%s.%s\" as logical replication target", + nspname, relname), + errdetail("\"%s.%s\" is not a table.", + nspname, relname))); +} diff --git a/src/backend/executor/execSRF.c b/src/backend/executor/execSRF.c new file mode 100644 index 0000000..545b6c1 --- /dev/null +++ b/src/backend/executor/execSRF.c @@ -0,0 +1,980 @@ +/*------------------------------------------------------------------------- + * + * execSRF.c + * Routines implementing the API for set-returning functions + * + * This file serves nodeFunctionscan.c and nodeProjectSet.c, providing + * common code for calling set-returning functions according to the + * ReturnSetInfo API. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execSRF.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/objectaccess.h" +#include "executor/execdebug.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "parser/parse_coerce.h" +#include "pgstat.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/typcache.h" + + +/* static function decls */ +static void init_sexpr(Oid foid, Oid input_collation, Expr *node, + SetExprState *sexpr, PlanState *parent, + MemoryContext sexprCxt, bool allowSRF, bool needDescForSRF); +static void ShutdownSetExpr(Datum arg); +static void ExecEvalFuncArgs(FunctionCallInfo fcinfo, + List *argList, ExprContext *econtext); +static void ExecPrepareTuplestoreResult(SetExprState *sexpr, + ExprContext *econtext, + Tuplestorestate *resultStore, + TupleDesc resultDesc); +static void tupledesc_match(TupleDesc dst_tupdesc, TupleDesc src_tupdesc); + + +/* + * Prepare function call in FROM (ROWS FROM) for execution. + * + * This is used by nodeFunctionscan.c. 
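+ * Such expressions arise from set-returning functions placed in FROM, as
+ * in these illustrative queries:
+ *
+ *		SELECT * FROM generate_series(1, 3) AS g(i);
+ *		SELECT * FROM ROWS FROM (generate_series(1, 3),
+ *								 unnest('{a,b}'::text[])) AS t(i, v);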
+ */ +SetExprState * +ExecInitTableFunctionResult(Expr *expr, + ExprContext *econtext, PlanState *parent) +{ + SetExprState *state = makeNode(SetExprState); + + state->funcReturnsSet = false; + state->expr = expr; + state->func.fn_oid = InvalidOid; + + /* + * Normally the passed expression tree will be a FuncExpr, since the + * grammar only allows a function call at the top level of a table + * function reference. However, if the function doesn't return set then + * the planner might have replaced the function call via constant-folding + * or inlining. So if we see any other kind of expression node, execute + * it via the general ExecEvalExpr() code. That code path will not + * support set-returning functions buried in the expression, though. + */ + if (IsA(expr, FuncExpr)) + { + FuncExpr *func = (FuncExpr *) expr; + + state->funcReturnsSet = func->funcretset; + state->args = ExecInitExprList(func->args, parent); + + init_sexpr(func->funcid, func->inputcollid, expr, state, parent, + econtext->ecxt_per_query_memory, func->funcretset, false); + } + else + { + state->elidedFuncState = ExecInitExpr(expr, parent); + } + + return state; +} + +/* + * ExecMakeTableFunctionResult + * + * Evaluate a table function, producing a materialized result in a Tuplestore + * object. + * + * This is used by nodeFunctionscan.c. + */ +Tuplestorestate * +ExecMakeTableFunctionResult(SetExprState *setexpr, + ExprContext *econtext, + MemoryContext argContext, + TupleDesc expectedDesc, + bool randomAccess) +{ + Tuplestorestate *tupstore = NULL; + TupleDesc tupdesc = NULL; + Oid funcrettype; + bool returnsTuple; + bool returnsSet = false; + FunctionCallInfo fcinfo; + PgStat_FunctionCallUsage fcusage; + ReturnSetInfo rsinfo; + HeapTupleData tmptup; + MemoryContext callerContext; + bool first_time = true; + + /* + * Execute per-tablefunc actions in appropriate context. + * + * The FunctionCallInfo needs to live across all the calls to a + * ValuePerCall function, so it can't be allocated in the per-tuple + * context. Similarly, the function arguments need to be evaluated in a + * context that is longer lived than the per-tuple context: The argument + * values would otherwise disappear when we reset that context in the + * inner loop. As the caller's CurrentMemoryContext is typically a + * query-lifespan context, we don't want to leak memory there. We require + * the caller to pass a separate memory context that can be used for this, + * and can be reset each time through to avoid bloat. + */ + MemoryContextReset(argContext); + callerContext = MemoryContextSwitchTo(argContext); + + funcrettype = exprType((Node *) setexpr->expr); + + returnsTuple = type_is_rowtype(funcrettype); + + /* + * Prepare a resultinfo node for communication. We always do this even if + * not expecting a set result, so that we can pass expectedDesc. In the + * generic-expression case, the expression doesn't actually get to see the + * resultinfo, but set it up anyway because we use some of the fields as + * our own state variables. 
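+ * (The fields used that way are returnMode, isDone, setResult and setDesc,
+ * which the loop below consults even when no function ever sees rsinfo.)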
+ */ + rsinfo.type = T_ReturnSetInfo; + rsinfo.econtext = econtext; + rsinfo.expectedDesc = expectedDesc; + rsinfo.allowedModes = (int) (SFRM_ValuePerCall | SFRM_Materialize | SFRM_Materialize_Preferred); + if (randomAccess) + rsinfo.allowedModes |= (int) SFRM_Materialize_Random; + rsinfo.returnMode = SFRM_ValuePerCall; + /* isDone is filled below */ + rsinfo.setResult = NULL; + rsinfo.setDesc = NULL; + + fcinfo = palloc(SizeForFunctionCallInfo(list_length(setexpr->args))); + + /* + * Normally the passed expression tree will be a SetExprState, since the + * grammar only allows a function call at the top level of a table + * function reference. However, if the function doesn't return set then + * the planner might have replaced the function call via constant-folding + * or inlining. So if we see any other kind of expression node, execute + * it via the general ExecEvalExpr() code; the only difference is that we + * don't get a chance to pass a special ReturnSetInfo to any functions + * buried in the expression. + */ + if (!setexpr->elidedFuncState) + { + /* + * This path is similar to ExecMakeFunctionResultSet. + */ + returnsSet = setexpr->funcReturnsSet; + InitFunctionCallInfoData(*fcinfo, &(setexpr->func), + list_length(setexpr->args), + setexpr->fcinfo->fncollation, + NULL, (Node *) &rsinfo); + /* evaluate the function's argument list */ + Assert(CurrentMemoryContext == argContext); + ExecEvalFuncArgs(fcinfo, setexpr->args, econtext); + + /* + * If function is strict, and there are any NULL arguments, skip + * calling the function and act like it returned NULL (or an empty + * set, in the returns-set case). + */ + if (setexpr->func.fn_strict) + { + int i; + + for (i = 0; i < fcinfo->nargs; i++) + { + if (fcinfo->args[i].isnull) + goto no_function_result; + } + } + } + else + { + /* Treat setexpr as a generic expression */ + InitFunctionCallInfoData(*fcinfo, NULL, 0, InvalidOid, NULL, NULL); + } + + /* + * Switch to short-lived context for calling the function or expression. + */ + MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Loop to handle the ValuePerCall protocol (which is also the same + * behavior needed in the generic ExecEvalExpr path). + */ + for (;;) + { + Datum result; + + CHECK_FOR_INTERRUPTS(); + + /* + * Reset per-tuple memory context before each call of the function or + * expression. This cleans up any local memory the function may leak + * when called. + */ + ResetExprContext(econtext); + + /* Call the function or expression one time */ + if (!setexpr->elidedFuncState) + { + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + rsinfo.isDone = ExprSingleResult; + result = FunctionCallInvoke(fcinfo); + + pgstat_end_function_usage(&fcusage, + rsinfo.isDone != ExprMultipleResult); + } + else + { + result = + ExecEvalExpr(setexpr->elidedFuncState, econtext, &fcinfo->isnull); + rsinfo.isDone = ExprSingleResult; + } + + /* Which protocol does function want to use? */ + if (rsinfo.returnMode == SFRM_ValuePerCall) + { + /* + * Check for end of result set. + */ + if (rsinfo.isDone == ExprEndResult) + break; + + /* + * If first time through, build tuplestore for result. For a + * scalar function result type, also make a suitable tupdesc. 
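+ * (For a composite result type we leave tupdesc NULL here and derive it
+ * from the first non-NULL row the function returns, further below.)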
+ */ + if (first_time) + { + MemoryContext oldcontext = + MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + tupstore = tuplestore_begin_heap(randomAccess, false, work_mem); + rsinfo.setResult = tupstore; + if (!returnsTuple) + { + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, + (AttrNumber) 1, + "column", + funcrettype, + -1, + 0); + rsinfo.setDesc = tupdesc; + } + MemoryContextSwitchTo(oldcontext); + } + + /* + * Store current resultset item. + */ + if (returnsTuple) + { + if (!fcinfo->isnull) + { + HeapTupleHeader td = DatumGetHeapTupleHeader(result); + + if (tupdesc == NULL) + { + MemoryContext oldcontext = + MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * This is the first non-NULL result from the + * function. Use the type info embedded in the + * rowtype Datum to look up the needed tupdesc. Make + * a copy for the query. + */ + tupdesc = lookup_rowtype_tupdesc_copy(HeapTupleHeaderGetTypeId(td), + HeapTupleHeaderGetTypMod(td)); + rsinfo.setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + } + else + { + /* + * Verify all later returned rows have same subtype; + * necessary in case the type is RECORD. + */ + if (HeapTupleHeaderGetTypeId(td) != tupdesc->tdtypeid || + HeapTupleHeaderGetTypMod(td) != tupdesc->tdtypmod) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("rows returned by function are not all of the same row type"))); + } + + /* + * tuplestore_puttuple needs a HeapTuple not a bare + * HeapTupleHeader, but it doesn't need all the fields. + */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(td); + tmptup.t_data = td; + + tuplestore_puttuple(tupstore, &tmptup); + } + else + { + /* + * NULL result from a tuple-returning function; expand it + * to a row of all nulls. We rely on the expectedDesc to + * form such rows. (Note: this would be problematic if + * tuplestore_putvalues saved the tdtypeid/tdtypmod from + * the provided descriptor, since that might not match + * what we get from the function itself. But it doesn't.) + */ + int natts = expectedDesc->natts; + bool *nullflags; + + nullflags = (bool *) palloc(natts * sizeof(bool)); + memset(nullflags, true, natts * sizeof(bool)); + tuplestore_putvalues(tupstore, expectedDesc, NULL, nullflags); + } + } + else + { + /* Scalar-type case: just store the function result */ + tuplestore_putvalues(tupstore, tupdesc, &result, &fcinfo->isnull); + } + + /* + * Are we done? + */ + if (rsinfo.isDone != ExprMultipleResult) + break; + + /* + * Check that set-returning functions were properly declared. + * (Note: for historical reasons, we don't complain if a non-SRF + * returns ExprEndResult; that's treated as returning NULL.) 
+ */ + if (!returnsSet) + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("table-function protocol for value-per-call mode was not followed"))); + } + else if (rsinfo.returnMode == SFRM_Materialize) + { + /* check we're on the same page as the function author */ + if (!first_time || rsinfo.isDone != ExprSingleResult || !returnsSet) + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("table-function protocol for materialize mode was not followed"))); + /* Done evaluating the set result */ + break; + } + else + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("unrecognized table-function returnMode: %d", + (int) rsinfo.returnMode))); + + first_time = false; + } + +no_function_result: + + /* + * If we got nothing from the function (ie, an empty-set or NULL result), + * we have to create the tuplestore to return, and if it's a + * non-set-returning function then insert a single all-nulls row. As + * above, we depend on the expectedDesc to manufacture the dummy row. + */ + if (rsinfo.setResult == NULL) + { + MemoryContext oldcontext = + MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + tupstore = tuplestore_begin_heap(randomAccess, false, work_mem); + rsinfo.setResult = tupstore; + MemoryContextSwitchTo(oldcontext); + + if (!returnsSet) + { + int natts = expectedDesc->natts; + bool *nullflags; + + nullflags = (bool *) palloc(natts * sizeof(bool)); + memset(nullflags, true, natts * sizeof(bool)); + tuplestore_putvalues(tupstore, expectedDesc, NULL, nullflags); + } + } + + /* + * If function provided a tupdesc, cross-check it. We only really need to + * do this for functions returning RECORD, but might as well do it always. + */ + if (rsinfo.setDesc) + { + tupledesc_match(expectedDesc, rsinfo.setDesc); + + /* + * If it is a dynamically-allocated TupleDesc, free it: it is + * typically allocated in a per-query context, so we must avoid + * leaking it across multiple usages. + */ + if (rsinfo.setDesc->tdrefcount == -1) + FreeTupleDesc(rsinfo.setDesc); + } + + MemoryContextSwitchTo(callerContext); + + /* All done, pass back the tuplestore */ + return rsinfo.setResult; +} + + +/* + * Prepare targetlist SRF function call for execution. + * + * This is used by nodeProjectSet.c. + */ +SetExprState * +ExecInitFunctionResultSet(Expr *expr, + ExprContext *econtext, PlanState *parent) +{ + SetExprState *state = makeNode(SetExprState); + + state->funcReturnsSet = true; + state->expr = expr; + state->func.fn_oid = InvalidOid; + + /* + * Initialize metadata. The expression node could be either a FuncExpr or + * an OpExpr. + */ + if (IsA(expr, FuncExpr)) + { + FuncExpr *func = (FuncExpr *) expr; + + state->args = ExecInitExprList(func->args, parent); + init_sexpr(func->funcid, func->inputcollid, expr, state, parent, + econtext->ecxt_per_query_memory, true, true); + } + else if (IsA(expr, OpExpr)) + { + OpExpr *op = (OpExpr *) expr; + + state->args = ExecInitExprList(op->args, parent); + init_sexpr(op->opfuncid, op->inputcollid, expr, state, parent, + econtext->ecxt_per_query_memory, true, true); + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(expr)); + + /* shouldn't get here unless the selected function returns set */ + Assert(state->func.fn_retset); + + return state; +} + +/* + * ExecMakeFunctionResultSet + * + * Evaluate the arguments to a set-returning function and then call the + * function itself. 
The argument expressions may not contain set-returning + * functions (the planner is supposed to have separated evaluation for those). + * + * This should be called in a short-lived (per-tuple) context, argContext + * needs to live until all rows have been returned (i.e. *isDone set to + * ExprEndResult or ExprSingleResult). + * + * This is used by nodeProjectSet.c. + */ +Datum +ExecMakeFunctionResultSet(SetExprState *fcache, + ExprContext *econtext, + MemoryContext argContext, + bool *isNull, + ExprDoneCond *isDone) +{ + List *arguments; + Datum result; + FunctionCallInfo fcinfo; + PgStat_FunctionCallUsage fcusage; + ReturnSetInfo rsinfo; + bool callit; + int i; + +restart: + + /* Guard against stack overflow due to overly complex expressions */ + check_stack_depth(); + + /* + * If a previous call of the function returned a set result in the form of + * a tuplestore, continue reading rows from the tuplestore until it's + * empty. + */ + if (fcache->funcResultStore) + { + TupleTableSlot *slot = fcache->funcResultSlot; + MemoryContext oldContext; + bool foundTup; + + /* + * Have to make sure tuple in slot lives long enough, otherwise + * clearing the slot could end up trying to free something already + * freed. + */ + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + foundTup = tuplestore_gettupleslot(fcache->funcResultStore, true, false, + fcache->funcResultSlot); + MemoryContextSwitchTo(oldContext); + + if (foundTup) + { + *isDone = ExprMultipleResult; + if (fcache->funcReturnsTuple) + { + /* We must return the whole tuple as a Datum. */ + *isNull = false; + return ExecFetchSlotHeapTupleDatum(fcache->funcResultSlot); + } + else + { + /* Extract the first column and return it as a scalar. */ + return slot_getattr(fcache->funcResultSlot, 1, isNull); + } + } + /* Exhausted the tuplestore, so clean up */ + tuplestore_end(fcache->funcResultStore); + fcache->funcResultStore = NULL; + *isDone = ExprEndResult; + *isNull = true; + return (Datum) 0; + } + + /* + * arguments is a list of expressions to evaluate before passing to the + * function manager. We skip the evaluation if it was already done in the + * previous call (ie, we are continuing the evaluation of a set-valued + * function). Otherwise, collect the current argument values into fcinfo. + * + * The arguments have to live in a context that lives at least until all + * rows from this SRF have been returned, otherwise ValuePerCall SRFs + * would reference freed memory after the first returned row. + */ + fcinfo = fcache->fcinfo; + arguments = fcache->args; + if (!fcache->setArgsValid) + { + MemoryContext oldContext = MemoryContextSwitchTo(argContext); + + ExecEvalFuncArgs(fcinfo, arguments, econtext); + MemoryContextSwitchTo(oldContext); + } + else + { + /* Reset flag (we may set it again below) */ + fcache->setArgsValid = false; + } + + /* + * Now call the function, passing the evaluated parameter values. + */ + + /* Prepare a resultinfo node for communication. */ + fcinfo->resultinfo = (Node *) &rsinfo; + rsinfo.type = T_ReturnSetInfo; + rsinfo.econtext = econtext; + rsinfo.expectedDesc = fcache->funcResultDesc; + rsinfo.allowedModes = (int) (SFRM_ValuePerCall | SFRM_Materialize); + /* note we do not set SFRM_Materialize_Random or _Preferred */ + rsinfo.returnMode = SFRM_ValuePerCall; + /* isDone is filled below */ + rsinfo.setResult = NULL; + rsinfo.setDesc = NULL; + + /* + * If function is strict, and there are any NULL arguments, skip calling + * the function. 
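+ * The result is then an empty set rather than an error; for example
+ * (illustrative, assuming the function is declared strict)
+ *		SELECT generate_series(1, NULL::int4);
+ * returns no rows.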
+ */ + callit = true; + if (fcache->func.fn_strict) + { + for (i = 0; i < fcinfo->nargs; i++) + { + if (fcinfo->args[i].isnull) + { + callit = false; + break; + } + } + } + + if (callit) + { + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + rsinfo.isDone = ExprSingleResult; + result = FunctionCallInvoke(fcinfo); + *isNull = fcinfo->isnull; + *isDone = rsinfo.isDone; + + pgstat_end_function_usage(&fcusage, + rsinfo.isDone != ExprMultipleResult); + } + else + { + /* for a strict SRF, result for NULL is an empty set */ + result = (Datum) 0; + *isNull = true; + *isDone = ExprEndResult; + } + + /* Which protocol does function want to use? */ + if (rsinfo.returnMode == SFRM_ValuePerCall) + { + if (*isDone != ExprEndResult) + { + /* + * Save the current argument values to re-use on the next call. + */ + if (*isDone == ExprMultipleResult) + { + fcache->setArgsValid = true; + /* Register cleanup callback if we didn't already */ + if (!fcache->shutdown_reg) + { + RegisterExprContextCallback(econtext, + ShutdownSetExpr, + PointerGetDatum(fcache)); + fcache->shutdown_reg = true; + } + } + } + } + else if (rsinfo.returnMode == SFRM_Materialize) + { + /* check we're on the same page as the function author */ + if (rsinfo.isDone != ExprSingleResult) + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("table-function protocol for materialize mode was not followed"))); + if (rsinfo.setResult != NULL) + { + /* prepare to return values from the tuplestore */ + ExecPrepareTuplestoreResult(fcache, econtext, + rsinfo.setResult, + rsinfo.setDesc); + /* loop back to top to start returning from tuplestore */ + goto restart; + } + /* if setResult was left null, treat it as empty set */ + *isDone = ExprEndResult; + *isNull = true; + result = (Datum) 0; + } + else + ereport(ERROR, + (errcode(ERRCODE_E_R_I_E_SRF_PROTOCOL_VIOLATED), + errmsg("unrecognized table-function returnMode: %d", + (int) rsinfo.returnMode))); + + return result; +} + + +/* + * init_sexpr - initialize a SetExprState node during first use + */ +static void +init_sexpr(Oid foid, Oid input_collation, Expr *node, + SetExprState *sexpr, PlanState *parent, + MemoryContext sexprCxt, bool allowSRF, bool needDescForSRF) +{ + AclResult aclresult; + size_t numargs = list_length(sexpr->args); + + /* Check permission to call function */ + aclresult = pg_proc_aclcheck(foid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(foid)); + InvokeFunctionExecuteHook(foid); + + /* + * Safety check on nargs. Under normal circumstances this should never + * fail, as parser should check sooner. But possibly it might fail if + * server has been compiled with FUNC_MAX_ARGS smaller than some functions + * declared in pg_proc? 
+ */ + if (list_length(sexpr->args) > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("cannot pass more than %d argument to a function", + "cannot pass more than %d arguments to a function", + FUNC_MAX_ARGS, + FUNC_MAX_ARGS))); + + /* Set up the primary fmgr lookup information */ + fmgr_info_cxt(foid, &(sexpr->func), sexprCxt); + fmgr_info_set_expr((Node *) sexpr->expr, &(sexpr->func)); + + /* Initialize the function call parameter struct as well */ + sexpr->fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(numargs)); + InitFunctionCallInfoData(*sexpr->fcinfo, &(sexpr->func), + numargs, + input_collation, NULL, NULL); + + /* If function returns set, check if that's allowed by caller */ + if (sexpr->func.fn_retset && !allowSRF) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"), + parent ? executor_errposition(parent->state, + exprLocation((Node *) node)) : 0)); + + /* Otherwise, caller should have marked the sexpr correctly */ + Assert(sexpr->func.fn_retset == sexpr->funcReturnsSet); + + /* If function returns set, prepare expected tuple descriptor */ + if (sexpr->func.fn_retset && needDescForSRF) + { + TypeFuncClass functypclass; + Oid funcrettype; + TupleDesc tupdesc; + MemoryContext oldcontext; + + functypclass = get_expr_result_type(sexpr->func.fn_expr, + &funcrettype, + &tupdesc); + + /* Must save tupdesc in sexpr's context */ + oldcontext = MemoryContextSwitchTo(sexprCxt); + + if (functypclass == TYPEFUNC_COMPOSITE || + functypclass == TYPEFUNC_COMPOSITE_DOMAIN) + { + /* Composite data type, e.g. a table's row type */ + Assert(tupdesc); + /* Must copy it out of typcache for safety */ + sexpr->funcResultDesc = CreateTupleDescCopy(tupdesc); + sexpr->funcReturnsTuple = true; + } + else if (functypclass == TYPEFUNC_SCALAR) + { + /* Base data type, i.e. scalar */ + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, + (AttrNumber) 1, + NULL, + funcrettype, + -1, + 0); + sexpr->funcResultDesc = tupdesc; + sexpr->funcReturnsTuple = false; + } + else if (functypclass == TYPEFUNC_RECORD) + { + /* This will work if function doesn't need an expectedDesc */ + sexpr->funcResultDesc = NULL; + sexpr->funcReturnsTuple = true; + } + else + { + /* Else, we will fail if function needs an expectedDesc */ + sexpr->funcResultDesc = NULL; + } + + MemoryContextSwitchTo(oldcontext); + } + else + sexpr->funcResultDesc = NULL; + + /* Initialize additional state */ + sexpr->funcResultStore = NULL; + sexpr->funcResultSlot = NULL; + sexpr->shutdown_reg = false; +} + +/* + * callback function in case a SetExprState needs to be shut down before it + * has been run to completion + */ +static void +ShutdownSetExpr(Datum arg) +{ + SetExprState *sexpr = castNode(SetExprState, DatumGetPointer(arg)); + + /* If we have a slot, make sure it's let go of any tuplestore pointer */ + if (sexpr->funcResultSlot) + ExecClearTuple(sexpr->funcResultSlot); + + /* Release any open tuplestore */ + if (sexpr->funcResultStore) + tuplestore_end(sexpr->funcResultStore); + sexpr->funcResultStore = NULL; + + /* Clear any active set-argument state */ + sexpr->setArgsValid = false; + + /* execUtils will deregister the callback... */ + sexpr->shutdown_reg = false; +} + +/* + * Evaluate arguments for a function. 
+ */ +static void +ExecEvalFuncArgs(FunctionCallInfo fcinfo, + List *argList, + ExprContext *econtext) +{ + int i; + ListCell *arg; + + i = 0; + foreach(arg, argList) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + fcinfo->args[i].value = ExecEvalExpr(argstate, + econtext, + &fcinfo->args[i].isnull); + i++; + } + + Assert(i == fcinfo->nargs); +} + +/* + * ExecPrepareTuplestoreResult + * + * Subroutine for ExecMakeFunctionResultSet: prepare to extract rows from a + * tuplestore function result. We must set up a funcResultSlot (unless + * already done in a previous call cycle) and verify that the function + * returned the expected tuple descriptor. + */ +static void +ExecPrepareTuplestoreResult(SetExprState *sexpr, + ExprContext *econtext, + Tuplestorestate *resultStore, + TupleDesc resultDesc) +{ + sexpr->funcResultStore = resultStore; + + if (sexpr->funcResultSlot == NULL) + { + /* Create a slot so we can read data out of the tuplestore */ + TupleDesc slotDesc; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(sexpr->func.fn_mcxt); + + /* + * If we were not able to determine the result rowtype from context, + * and the function didn't return a tupdesc, we have to fail. + */ + if (sexpr->funcResultDesc) + slotDesc = sexpr->funcResultDesc; + else if (resultDesc) + { + /* don't assume resultDesc is long-lived */ + slotDesc = CreateTupleDescCopy(resultDesc); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning setof record called in " + "context that cannot accept type record"))); + slotDesc = NULL; /* keep compiler quiet */ + } + + sexpr->funcResultSlot = MakeSingleTupleTableSlot(slotDesc, + &TTSOpsMinimalTuple); + MemoryContextSwitchTo(oldcontext); + } + + /* + * If function provided a tupdesc, cross-check it. We only really need to + * do this for functions returning RECORD, but might as well do it always. + */ + if (resultDesc) + { + if (sexpr->funcResultDesc) + tupledesc_match(sexpr->funcResultDesc, resultDesc); + + /* + * If it is a dynamically-allocated TupleDesc, free it: it is + * typically allocated in a per-query context, so we must avoid + * leaking it across multiple usages. + */ + if (resultDesc->tdrefcount == -1) + FreeTupleDesc(resultDesc); + } + + /* Register cleanup callback if we didn't already */ + if (!sexpr->shutdown_reg) + { + RegisterExprContextCallback(econtext, + ShutdownSetExpr, + PointerGetDatum(sexpr)); + sexpr->shutdown_reg = true; + } +} + +/* + * Check that function result tuple type (src_tupdesc) matches or can + * be considered to match what the query expects (dst_tupdesc). If + * they don't match, ereport. + * + * We really only care about number of attributes and data type. + * Also, we can ignore type mismatch on columns that are dropped in the + * destination type, so long as the physical storage matches. This is + * helpful in some cases involving out-of-date cached plans. 
+ */ +static void +tupledesc_match(TupleDesc dst_tupdesc, TupleDesc src_tupdesc) +{ + int i; + + if (dst_tupdesc->natts != src_tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("function return row and query-specified return row do not match"), + errdetail_plural("Returned row contains %d attribute, but query expects %d.", + "Returned row contains %d attributes, but query expects %d.", + src_tupdesc->natts, + src_tupdesc->natts, dst_tupdesc->natts))); + + for (i = 0; i < dst_tupdesc->natts; i++) + { + Form_pg_attribute dattr = TupleDescAttr(dst_tupdesc, i); + Form_pg_attribute sattr = TupleDescAttr(src_tupdesc, i); + + if (IsBinaryCoercible(sattr->atttypid, dattr->atttypid)) + continue; /* no worries */ + if (!dattr->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("function return row and query-specified return row do not match"), + errdetail("Returned type %s at ordinal position %d, but query expects %s.", + format_type_be(sattr->atttypid), + i + 1, + format_type_be(dattr->atttypid)))); + + if (dattr->attlen != sattr->attlen || + dattr->attalign != sattr->attalign) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("function return row and query-specified return row do not match"), + errdetail("Physical storage mismatch on dropped attribute at ordinal position %d.", + i + 1))); + } +} diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c new file mode 100644 index 0000000..69ab345 --- /dev/null +++ b/src/backend/executor/execScan.c @@ -0,0 +1,342 @@ +/*------------------------------------------------------------------------- + * + * execScan.c + * This code provides support for generalized relation scans. ExecScan + * is passed a node and a pointer to a function to "do the right thing" + * and return a tuple from the relation. ExecScan then does the tedious + * stuff - checking the qualification and projecting the tuple + * appropriately. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execScan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + + +/* + * ExecScanFetch -- check interrupts & fetch next potential tuple + * + * This routine is concerned with substituting a test tuple if we are + * inside an EvalPlanQual recheck. If we aren't, just execute + * the access method's next-tuple routine. + */ +static inline TupleTableSlot * +ExecScanFetch(ScanState *node, + ExecScanAccessMtd accessMtd, + ExecScanRecheckMtd recheckMtd) +{ + EState *estate = node->ps.state; + + CHECK_FOR_INTERRUPTS(); + + if (estate->es_epq_active != NULL) + { + EPQState *epqstate = estate->es_epq_active; + + /* + * We are inside an EvalPlanQual recheck. Return the test tuple if + * one is available, after rechecking any access-method-specific + * conditions. + */ + Index scanrelid = ((Scan *) node->ps.plan)->scanrelid; + + if (scanrelid == 0) + { + /* + * This is a ForeignScan or CustomScan which has pushed down a + * join to the remote side. The recheck method is responsible not + * only for rechecking the scan/join quals but also for storing + * the correct tuple in the slot. 
+ */ + + TupleTableSlot *slot = node->ss_ScanTupleSlot; + + if (!(*recheckMtd) (node, slot)) + ExecClearTuple(slot); /* would not be returned by scan */ + return slot; + } + else if (epqstate->relsubs_done[scanrelid - 1]) + { + /* + * Return empty slot, as we already performed an EPQ substitution + * for this relation. + */ + + TupleTableSlot *slot = node->ss_ScanTupleSlot; + + /* Return empty slot, as we already returned a tuple */ + return ExecClearTuple(slot); + } + else if (epqstate->relsubs_slot[scanrelid - 1] != NULL) + { + /* + * Return replacement tuple provided by the EPQ caller. + */ + + TupleTableSlot *slot = epqstate->relsubs_slot[scanrelid - 1]; + + Assert(epqstate->relsubs_rowmark[scanrelid - 1] == NULL); + + /* Mark to remember that we shouldn't return more */ + epqstate->relsubs_done[scanrelid - 1] = true; + + /* Return empty slot if we haven't got a test tuple */ + if (TupIsNull(slot)) + return NULL; + + /* Check if it meets the access-method conditions */ + if (!(*recheckMtd) (node, slot)) + return ExecClearTuple(slot); /* would not be returned by + * scan */ + return slot; + } + else if (epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* + * Fetch and return replacement tuple using a non-locking rowmark. + */ + + TupleTableSlot *slot = node->ss_ScanTupleSlot; + + /* Mark to remember that we shouldn't return more */ + epqstate->relsubs_done[scanrelid - 1] = true; + + if (!EvalPlanQualFetchRowMark(epqstate, scanrelid, slot)) + return NULL; + + /* Return empty slot if we haven't got a test tuple */ + if (TupIsNull(slot)) + return NULL; + + /* Check if it meets the access-method conditions */ + if (!(*recheckMtd) (node, slot)) + return ExecClearTuple(slot); /* would not be returned by + * scan */ + return slot; + } + } + + /* + * Run the node-type-specific access method function to get the next tuple + */ + return (*accessMtd) (node); +} + +/* ---------------------------------------------------------------- + * ExecScan + * + * Scans the relation using the 'access method' indicated and + * returns the next qualifying tuple. + * The access method returns the next tuple and ExecScan() is + * responsible for checking the tuple returned against the qual-clause. + * + * A 'recheck method' must also be provided that can check an + * arbitrary tuple of the relation against any qual conditions + * that are implemented internal to the access method. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for scanning so that the + * "cursor" is positioned before the first qualifying tuple. + * ---------------------------------------------------------------- + */ +TupleTableSlot * +ExecScan(ScanState *node, + ExecScanAccessMtd accessMtd, /* function returning a tuple */ + ExecScanRecheckMtd recheckMtd) +{ + ExprContext *econtext; + ExprState *qual; + ProjectionInfo *projInfo; + + /* + * Fetch data from node + */ + qual = node->ps.qual; + projInfo = node->ps.ps_ProjInfo; + econtext = node->ps.ps_ExprContext; + + /* interrupt checks are in ExecScanFetch */ + + /* + * If we have neither a qual to check nor a projection to do, just skip + * all the overhead and return the raw scan tuple. + */ + if (!qual && !projInfo) + { + ResetExprContext(econtext); + return ExecScanFetch(node, accessMtd, recheckMtd); + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. 
+ */ + ResetExprContext(econtext); + + /* + * get a tuple from the access method. Loop until we obtain a tuple that + * passes the qualification. + */ + for (;;) + { + TupleTableSlot *slot; + + slot = ExecScanFetch(node, accessMtd, recheckMtd); + + /* + * if the slot returned by the accessMtd contains NULL, then it means + * there is nothing more to scan so we just return an empty slot, + * being careful to use the projection result slot so it has correct + * tupleDesc. + */ + if (TupIsNull(slot)) + { + if (projInfo) + return ExecClearTuple(projInfo->pi_state.resultslot); + else + return slot; + } + + /* + * place the current tuple into the expr context + */ + econtext->ecxt_scantuple = slot; + + /* + * check that the current tuple satisfies the qual-clause + * + * check for non-null qual here to avoid a function call to ExecQual() + * when the qual is null ... saves only a few cycles, but they add up + * ... + */ + if (qual == NULL || ExecQual(qual, econtext)) + { + /* + * Found a satisfactory scan tuple. + */ + if (projInfo) + { + /* + * Form a projection tuple, store it in the result tuple slot + * and return it. + */ + return ExecProject(projInfo); + } + else + { + /* + * Here, we aren't projecting, so just return scan tuple. + */ + return slot; + } + } + else + InstrCountFiltered1(node, 1); + + /* + * Tuple fails qual, so free per-tuple memory and try again. + */ + ResetExprContext(econtext); + } +} + +/* + * ExecAssignScanProjectionInfo + * Set up projection info for a scan node, if necessary. + * + * We can avoid a projection step if the requested tlist exactly matches + * the underlying tuple type. If so, we just set ps_ProjInfo to NULL. + * Note that this case occurs not only for simple "SELECT * FROM ...", but + * also in most cases where there are joins or other processing nodes above + * the scan node, because the planner will preferentially generate a matching + * tlist. + * + * The scan slot's descriptor must have been set already. + */ +void +ExecAssignScanProjectionInfo(ScanState *node) +{ + Scan *scan = (Scan *) node->ps.plan; + TupleDesc tupdesc = node->ss_ScanTupleSlot->tts_tupleDescriptor; + + ExecConditionalAssignProjectionInfo(&node->ps, tupdesc, scan->scanrelid); +} + +/* + * ExecAssignScanProjectionInfoWithVarno + * As above, but caller can specify varno expected in Vars in the tlist. + */ +void +ExecAssignScanProjectionInfoWithVarno(ScanState *node, Index varno) +{ + TupleDesc tupdesc = node->ss_ScanTupleSlot->tts_tupleDescriptor; + + ExecConditionalAssignProjectionInfo(&node->ps, tupdesc, varno); +} + +/* + * ExecScanReScan + * + * This must be called within the ReScan function of any plan node type + * that uses ExecScan(). + */ +void +ExecScanReScan(ScanState *node) +{ + EState *estate = node->ps.state; + + /* + * We must clear the scan tuple so that observers (e.g., execCurrent.c) + * can tell that this plan node is not positioned on a tuple. + */ + ExecClearTuple(node->ss_ScanTupleSlot); + + /* Rescan EvalPlanQual tuple if we're inside an EvalPlanQual recheck */ + if (estate->es_epq_active != NULL) + { + EPQState *epqstate = estate->es_epq_active; + Index scanrelid = ((Scan *) node->ps.plan)->scanrelid; + + if (scanrelid > 0) + epqstate->relsubs_done[scanrelid - 1] = false; + else + { + Bitmapset *relids; + int rtindex = -1; + + /* + * If an FDW or custom scan provider has replaced the join with a + * scan, there are multiple RTIs; reset the epqScanDone flag for + * all of them. 
+ */ + if (IsA(node->ps.plan, ForeignScan)) + relids = ((ForeignScan *) node->ps.plan)->fs_relids; + else if (IsA(node->ps.plan, CustomScan)) + relids = ((CustomScan *) node->ps.plan)->custom_relids; + else + elog(ERROR, "unexpected scan node: %d", + (int) nodeTag(node->ps.plan)); + + while ((rtindex = bms_next_member(relids, rtindex)) >= 0) + { + Assert(rtindex > 0); + epqstate->relsubs_done[rtindex - 1] = false; + } + } + } +} diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c new file mode 100644 index 0000000..5004b3b --- /dev/null +++ b/src/backend/executor/execTuples.c @@ -0,0 +1,2339 @@ +/*------------------------------------------------------------------------- + * + * execTuples.c + * Routines dealing with TupleTableSlots. These are used for resource + * management associated with tuples (eg, releasing buffer pins for + * tuples in disk buffers, or freeing the memory occupied by transient + * tuples). Slots also provide access abstraction that lets us implement + * "virtual" tuples to reduce data-copying overhead. + * + * Routines dealing with the type information for tuples. Currently, + * the type information for a tuple is an array of FormData_pg_attribute. + * This information is needed by routines manipulating tuples + * (getattribute, formtuple, etc.). + * + * + * EXAMPLE OF HOW TABLE ROUTINES WORK + * Suppose we have a query such as SELECT emp.name FROM emp and we have + * a single SeqScan node in the query plan. + * + * At ExecutorStart() + * ---------------- + * + * - ExecInitSeqScan() calls ExecInitScanTupleSlot() to construct a + * TupleTableSlots for the tuples returned by the access method, and + * ExecInitResultTypeTL() to define the node's return + * type. ExecAssignScanProjectionInfo() will, if necessary, create + * another TupleTableSlot for the tuples resulting from performing + * target list projections. + * + * During ExecutorRun() + * ---------------- + * - SeqNext() calls ExecStoreBufferHeapTuple() to place the tuple + * returned by the access method into the scan tuple slot. + * + * - ExecSeqScan() (via ExecScan), if necessary, calls ExecProject(), + * putting the result of the projection in the result tuple slot. If + * not necessary, it directly returns the slot returned by SeqNext(). + * + * - ExecutePlan() calls the output function. + * + * The important thing to watch in the executor code is how pointers + * to the slots containing tuples are passed instead of the tuples + * themselves. This facilitates the communication of related information + * (such as whether or not a tuple should be pfreed, what buffer contains + * this tuple, the tuple's tuple descriptor, etc). It also allows us + * to avoid physically constructing projection tuples in many cases. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execTuples.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/tupdesc_details.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/expandeddatum.h" +#include "utils/lsyscache.h" +#include "utils/typcache.h" + +static TupleDesc ExecTypeFromTLInternal(List *targetList, + bool skipjunk); +static pg_attribute_always_inline void slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp, + int natts); +static inline void tts_buffer_heap_store_tuple(TupleTableSlot *slot, + HeapTuple tuple, + Buffer buffer, + bool transfer_pin); +static void tts_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, bool shouldFree); + + +const TupleTableSlotOps TTSOpsVirtual; +const TupleTableSlotOps TTSOpsHeapTuple; +const TupleTableSlotOps TTSOpsMinimalTuple; +const TupleTableSlotOps TTSOpsBufferHeapTuple; + + +/* + * TupleTableSlotOps implementations. + */ + +/* + * TupleTableSlotOps implementation for VirtualTupleTableSlot. + */ +static void +tts_virtual_init(TupleTableSlot *slot) +{ +} + +static void +tts_virtual_release(TupleTableSlot *slot) +{ +} + +static void +tts_virtual_clear(TupleTableSlot *slot) +{ + if (unlikely(TTS_SHOULDFREE(slot))) + { + VirtualTupleTableSlot *vslot = (VirtualTupleTableSlot *) slot; + + pfree(vslot->data); + vslot->data = NULL; + + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); +} + +/* + * VirtualTupleTableSlots always have fully populated tts_values and + * tts_isnull arrays. So this function should never be called. + */ +static void +tts_virtual_getsomeattrs(TupleTableSlot *slot, int natts) +{ + elog(ERROR, "getsomeattrs is not required to be called on a virtual tuple table slot"); +} + +/* + * VirtualTupleTableSlots never provide system attributes (except those + * handled generically, such as tableoid). We generally shouldn't get + * here, but provide a user-friendly message if we do. + */ +static Datum +tts_virtual_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + Assert(!TTS_EMPTY(slot)); + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return 0; /* silence compiler warnings */ +} + +/* + * To materialize a virtual slot all the datums that aren't passed by value + * have to be copied into the slot's memory context. To do so, compute the + * required size, and allocate enough memory to store all attributes. That's + * good for cache hit ratio, but more importantly requires only memory + * allocation/deallocation. 
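+ * (Before materialization a virtual slot's tts_values/tts_isnull may point
+ * at data owned by other slots or by per-tuple memory; afterwards the slot
+ * is self-contained, which is why tts_virtual_copyslot() below finishes
+ * with a materialize step.)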
+ */ +static void +tts_virtual_materialize(TupleTableSlot *slot) +{ + VirtualTupleTableSlot *vslot = (VirtualTupleTableSlot *) slot; + TupleDesc desc = slot->tts_tupleDescriptor; + Size sz = 0; + char *data; + + /* already materialized */ + if (TTS_SHOULDFREE(slot)) + return; + + /* compute size of memory required */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. + */ + sz = att_align_nominal(sz, att->attalign); + sz += EOH_get_flat_size(DatumGetEOHP(val)); + } + else + { + sz = att_align_nominal(sz, att->attalign); + sz = att_addlength_datum(sz, att->attlen, val); + } + } + + /* all data is byval */ + if (sz == 0) + return; + + /* allocate memory */ + vslot->data = data = MemoryContextAlloc(slot->tts_mcxt, sz); + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + /* and copy all attributes into the pre-allocated space */ + for (int natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute att = TupleDescAttr(desc, natt); + Datum val; + + if (att->attbyval || slot->tts_isnull[natt]) + continue; + + val = slot->tts_values[natt]; + + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val))) + { + Size data_length; + + /* + * We want to flatten the expanded value so that the materialized + * slot doesn't depend on it. + */ + ExpandedObjectHeader *eoh = DatumGetEOHP(val); + + data = (char *) att_align_nominal(data, + att->attalign); + data_length = EOH_get_flat_size(eoh); + EOH_flatten_into(eoh, data, data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + else + { + Size data_length = 0; + + data = (char *) att_align_nominal(data, att->attalign); + data_length = att_addlength_datum(data_length, att->attlen, val); + + memcpy(data, DatumGetPointer(val), data_length); + + slot->tts_values[natt] = PointerGetDatum(data); + data += data_length; + } + } +} + +static void +tts_virtual_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + TupleDesc srcdesc = srcslot->tts_tupleDescriptor; + + Assert(srcdesc->natts <= dstslot->tts_tupleDescriptor->natts); + + tts_virtual_clear(dstslot); + + slot_getallattrs(srcslot); + + for (int natt = 0; natt < srcdesc->natts; natt++) + { + dstslot->tts_values[natt] = srcslot->tts_values[natt]; + dstslot->tts_isnull[natt] = srcslot->tts_isnull[natt]; + } + + dstslot->tts_nvalid = srcdesc->natts; + dstslot->tts_flags &= ~TTS_FLAG_EMPTY; + + /* make sure storage doesn't depend on external memory */ + tts_virtual_materialize(dstslot); +} + +static HeapTuple +tts_virtual_copy_heap_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); +} + +static MinimalTuple +tts_virtual_copy_minimal_tuple(TupleTableSlot *slot) +{ + Assert(!TTS_EMPTY(slot)); + + return heap_form_minimal_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); +} + + +/* + * TupleTableSlotOps implementation for HeapTupleTableSlot. 
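+ * A HeapTupleTableSlot holds a palloc'd HeapTuple that is not associated
+ * with any shared buffer; tuples still residing in a buffer use the
+ * BufferHeapTupleTableSlot variant (TTSOpsBufferHeapTuple) instead.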
+ */ + +static void +tts_heap_init(TupleTableSlot *slot) +{ +} + +static void +tts_heap_release(TupleTableSlot *slot) +{ +} + +static void +tts_heap_clear(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + /* Free the memory for the heap tuple if it's allowed. */ + if (TTS_SHOULDFREE(slot)) + { + heap_freetuple(hslot->tuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); + hslot->off = 0; + hslot->tuple = NULL; +} + +static void +tts_heap_getsomeattrs(TupleTableSlot *slot, int natts) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + slot_deform_heap_tuple(slot, hslot->tuple, &hslot->off, natts); +} + +static Datum +tts_heap_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + /* + * In some code paths it's possible to get here with a non-materialized + * slot, in which case we can't retrieve system columns. + */ + if (!hslot->tuple) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return heap_getsysattr(hslot->tuple, attnum, + slot->tts_tupleDescriptor, isnull); +} + +static void +tts_heap_materialize(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + MemoryContext oldContext; + + Assert(!TTS_EMPTY(slot)); + + /* If slot has its tuple already materialized, nothing to do. */ + if (TTS_SHOULDFREE(slot)) + return; + + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + + /* + * Have to deform from scratch, otherwise tts_values[] entries could point + * into the non-materialized tuple (which might be gone when accessed). + */ + slot->tts_nvalid = 0; + hslot->off = 0; + + if (!hslot->tuple) + hslot->tuple = heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + else + { + /* + * The tuple contained in this slot is not allocated in the memory + * context of the given slot (else it would have TTS_SHOULDFREE set). + * Copy the tuple into the given slot's memory context. 
+ */ + hslot->tuple = heap_copytuple(hslot->tuple); + } + + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + MemoryContextSwitchTo(oldContext); +} + +static void +tts_heap_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + HeapTuple tuple; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(dstslot->tts_mcxt); + tuple = ExecCopySlotHeapTuple(srcslot); + MemoryContextSwitchTo(oldcontext); + + ExecStoreHeapTuple(tuple, dstslot, true); +} + +static HeapTuple +tts_heap_get_heap_tuple(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + if (!hslot->tuple) + tts_heap_materialize(slot); + + return hslot->tuple; +} + +static HeapTuple +tts_heap_copy_heap_tuple(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + if (!hslot->tuple) + tts_heap_materialize(slot); + + return heap_copytuple(hslot->tuple); +} + +static MinimalTuple +tts_heap_copy_minimal_tuple(TupleTableSlot *slot) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + if (!hslot->tuple) + tts_heap_materialize(slot); + + return minimal_tuple_from_heap_tuple(hslot->tuple); +} + +static void +tts_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, bool shouldFree) +{ + HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot; + + tts_heap_clear(slot); + + slot->tts_nvalid = 0; + hslot->tuple = tuple; + hslot->off = 0; + slot->tts_flags &= ~(TTS_FLAG_EMPTY | TTS_FLAG_SHOULDFREE); + slot->tts_tid = tuple->t_self; + + if (shouldFree) + slot->tts_flags |= TTS_FLAG_SHOULDFREE; +} + + +/* + * TupleTableSlotOps implementation for MinimalTupleTableSlot. + */ + +static void +tts_minimal_init(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + /* + * Initialize the heap tuple pointer to access attributes of the minimal + * tuple contained in the slot as if its a heap tuple. + */ + mslot->tuple = &mslot->minhdr; +} + +static void +tts_minimal_release(TupleTableSlot *slot) +{ +} + +static void +tts_minimal_clear(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (TTS_SHOULDFREE(slot)) + { + heap_free_minimal_tuple(mslot->mintuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); + mslot->off = 0; + mslot->mintuple = NULL; +} + +static void +tts_minimal_getsomeattrs(TupleTableSlot *slot, int natts) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + slot_deform_heap_tuple(slot, mslot->tuple, &mslot->off, natts); +} + +static Datum +tts_minimal_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + Assert(!TTS_EMPTY(slot)); + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return 0; /* silence compiler warnings */ +} + +static void +tts_minimal_materialize(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + MemoryContext oldContext; + + Assert(!TTS_EMPTY(slot)); + + /* If slot has its tuple already materialized, nothing to do. */ + if (TTS_SHOULDFREE(slot)) + return; + + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + + /* + * Have to deform from scratch, otherwise tts_values[] entries could point + * into the non-materialized tuple (which might be gone when accessed). 
+ */ + slot->tts_nvalid = 0; + mslot->off = 0; + + if (!mslot->mintuple) + { + mslot->mintuple = heap_form_minimal_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + } + else + { + /* + * The minimal tuple contained in this slot is not allocated in the + * memory context of the given slot (else it would have TTS_SHOULDFREE + * set). Copy the minimal tuple into the given slot's memory context. + */ + mslot->mintuple = heap_copy_minimal_tuple(mslot->mintuple); + } + + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + Assert(mslot->tuple == &mslot->minhdr); + + mslot->minhdr.t_len = mslot->mintuple->t_len + MINIMAL_TUPLE_OFFSET; + mslot->minhdr.t_data = (HeapTupleHeader) ((char *) mslot->mintuple - MINIMAL_TUPLE_OFFSET); + + MemoryContextSwitchTo(oldContext); +} + +static void +tts_minimal_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + MemoryContext oldcontext; + MinimalTuple mintuple; + + oldcontext = MemoryContextSwitchTo(dstslot->tts_mcxt); + mintuple = ExecCopySlotMinimalTuple(srcslot); + MemoryContextSwitchTo(oldcontext); + + ExecStoreMinimalTuple(mintuple, dstslot, true); +} + +static MinimalTuple +tts_minimal_get_minimal_tuple(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (!mslot->mintuple) + tts_minimal_materialize(slot); + + return mslot->mintuple; +} + +static HeapTuple +tts_minimal_copy_heap_tuple(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (!mslot->mintuple) + tts_minimal_materialize(slot); + + return heap_tuple_from_minimal_tuple(mslot->mintuple); +} + +static MinimalTuple +tts_minimal_copy_minimal_tuple(TupleTableSlot *slot) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + if (!mslot->mintuple) + tts_minimal_materialize(slot); + + return heap_copy_minimal_tuple(mslot->mintuple); +} + +static void +tts_minimal_store_tuple(TupleTableSlot *slot, MinimalTuple mtup, bool shouldFree) +{ + MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot; + + tts_minimal_clear(slot); + + Assert(!TTS_SHOULDFREE(slot)); + Assert(TTS_EMPTY(slot)); + + slot->tts_flags &= ~TTS_FLAG_EMPTY; + slot->tts_nvalid = 0; + mslot->off = 0; + + mslot->mintuple = mtup; + Assert(mslot->tuple == &mslot->minhdr); + mslot->minhdr.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET; + mslot->minhdr.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET); + /* no need to set t_self or t_tableOid since we won't allow access */ + + if (shouldFree) + slot->tts_flags |= TTS_FLAG_SHOULDFREE; +} + + +/* + * TupleTableSlotOps implementation for BufferHeapTupleTableSlot. + */ + +static void +tts_buffer_heap_init(TupleTableSlot *slot) +{ +} + +static void +tts_buffer_heap_release(TupleTableSlot *slot) +{ +} + +static void +tts_buffer_heap_clear(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + /* + * Free the memory for heap tuple if allowed. A tuple coming from buffer + * can never be freed. But we may have materialized a tuple from buffer. + * Such a tuple can be freed. + */ + if (TTS_SHOULDFREE(slot)) + { + /* We should have unpinned the buffer while materializing the tuple. 
*/ + Assert(!BufferIsValid(bslot->buffer)); + + heap_freetuple(bslot->base.tuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + if (BufferIsValid(bslot->buffer)) + ReleaseBuffer(bslot->buffer); + + slot->tts_nvalid = 0; + slot->tts_flags |= TTS_FLAG_EMPTY; + ItemPointerSetInvalid(&slot->tts_tid); + bslot->base.tuple = NULL; + bslot->base.off = 0; + bslot->buffer = InvalidBuffer; +} + +static void +tts_buffer_heap_getsomeattrs(TupleTableSlot *slot, int natts) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + slot_deform_heap_tuple(slot, bslot->base.tuple, &bslot->base.off, natts); +} + +static Datum +tts_buffer_heap_getsysattr(TupleTableSlot *slot, int attnum, bool *isnull) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + /* + * In some code paths it's possible to get here with a non-materialized + * slot, in which case we can't retrieve system columns. + */ + if (!bslot->base.tuple) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot retrieve a system column in this context"))); + + return heap_getsysattr(bslot->base.tuple, attnum, + slot->tts_tupleDescriptor, isnull); +} + +static void +tts_buffer_heap_materialize(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + MemoryContext oldContext; + + Assert(!TTS_EMPTY(slot)); + + /* If slot has its tuple already materialized, nothing to do. */ + if (TTS_SHOULDFREE(slot)) + return; + + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + + /* + * Have to deform from scratch, otherwise tts_values[] entries could point + * into the non-materialized tuple (which might be gone when accessed). + */ + bslot->base.off = 0; + slot->tts_nvalid = 0; + + if (!bslot->base.tuple) + { + /* + * Normally BufferHeapTupleTableSlot should have a tuple + buffer + * associated with it, unless it's materialized (which would've + * returned above). But when it's useful to allow storing virtual + * tuples in a buffer slot, which then also needs to be + * materializable. + */ + bslot->base.tuple = heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + } + else + { + bslot->base.tuple = heap_copytuple(bslot->base.tuple); + + /* + * A heap tuple stored in a BufferHeapTupleTableSlot should have a + * buffer associated with it, unless it's materialized or virtual. + */ + if (likely(BufferIsValid(bslot->buffer))) + ReleaseBuffer(bslot->buffer); + bslot->buffer = InvalidBuffer; + } + + /* + * We don't set TTS_FLAG_SHOULDFREE until after releasing the buffer, if + * any. This avoids having a transient state that would fall foul of our + * assertions that a slot with TTS_FLAG_SHOULDFREE doesn't own a buffer. + * In the unlikely event that ReleaseBuffer() above errors out, we'd + * effectively leak the copied tuple, but that seems fairly harmless. + */ + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + + MemoryContextSwitchTo(oldContext); +} + +static void +tts_buffer_heap_copyslot(TupleTableSlot *dstslot, TupleTableSlot *srcslot) +{ + BufferHeapTupleTableSlot *bsrcslot = (BufferHeapTupleTableSlot *) srcslot; + BufferHeapTupleTableSlot *bdstslot = (BufferHeapTupleTableSlot *) dstslot; + + /* + * If the source slot is of a different kind, or is a buffer slot that has + * been materialized / is virtual, make a new copy of the tuple. Otherwise + * make a new reference to the in-buffer tuple. 
+ */ + if (dstslot->tts_ops != srcslot->tts_ops || + TTS_SHOULDFREE(srcslot) || + !bsrcslot->base.tuple) + { + MemoryContext oldContext; + + ExecClearTuple(dstslot); + dstslot->tts_flags &= ~TTS_FLAG_EMPTY; + oldContext = MemoryContextSwitchTo(dstslot->tts_mcxt); + bdstslot->base.tuple = ExecCopySlotHeapTuple(srcslot); + dstslot->tts_flags |= TTS_FLAG_SHOULDFREE; + MemoryContextSwitchTo(oldContext); + } + else + { + Assert(BufferIsValid(bsrcslot->buffer)); + + tts_buffer_heap_store_tuple(dstslot, bsrcslot->base.tuple, + bsrcslot->buffer, false); + + /* + * The HeapTupleData portion of the source tuple might be shorter + * lived than the destination slot. Therefore copy the HeapTuple into + * our slot's tupdata, which is guaranteed to live long enough (but + * will still point into the buffer). + */ + memcpy(&bdstslot->base.tupdata, bdstslot->base.tuple, sizeof(HeapTupleData)); + bdstslot->base.tuple = &bdstslot->base.tupdata; + } +} + +static HeapTuple +tts_buffer_heap_get_heap_tuple(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + if (!bslot->base.tuple) + tts_buffer_heap_materialize(slot); + + return bslot->base.tuple; +} + +static HeapTuple +tts_buffer_heap_copy_heap_tuple(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + if (!bslot->base.tuple) + tts_buffer_heap_materialize(slot); + + return heap_copytuple(bslot->base.tuple); +} + +static MinimalTuple +tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + Assert(!TTS_EMPTY(slot)); + + if (!bslot->base.tuple) + tts_buffer_heap_materialize(slot); + + return minimal_tuple_from_heap_tuple(bslot->base.tuple); +} + +static inline void +tts_buffer_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, + Buffer buffer, bool transfer_pin) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + if (TTS_SHOULDFREE(slot)) + { + /* materialized slot shouldn't have a buffer to release */ + Assert(!BufferIsValid(bslot->buffer)); + + heap_freetuple(bslot->base.tuple); + slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; + } + + slot->tts_flags &= ~TTS_FLAG_EMPTY; + slot->tts_nvalid = 0; + bslot->base.tuple = tuple; + bslot->base.off = 0; + slot->tts_tid = tuple->t_self; + + /* + * If tuple is on a disk page, keep the page pinned as long as we hold a + * pointer into it. We assume the caller already has such a pin. If + * transfer_pin is true, we'll transfer that pin to this slot, if not + * we'll pin it again ourselves. + * + * This is coded to optimize the case where the slot previously held a + * tuple on the same disk page: in that case releasing and re-acquiring + * the pin is a waste of cycles. This is a common situation during + * seqscans, so it's worth troubling over. + */ + if (bslot->buffer != buffer) + { + if (BufferIsValid(bslot->buffer)) + ReleaseBuffer(bslot->buffer); + + bslot->buffer = buffer; + + if (!transfer_pin && BufferIsValid(buffer)) + IncrBufferRefCount(buffer); + } + else if (transfer_pin && BufferIsValid(buffer)) + { + /* + * In transfer_pin mode the caller won't know about the same-page + * optimization, so we gotta release its pin. + */ + ReleaseBuffer(buffer); + } +} + +/* + * slot_deform_heap_tuple + * Given a TupleTableSlot, extract data from the slot's physical tuple + * into its Datum/isnull arrays. 
Data is extracted up through the + * natts'th column (caller must ensure this is a legal column number). + * + * This is essentially an incremental version of heap_deform_tuple: + * on each call we extract attributes up to the one needed, without + * re-computing information about previously extracted attributes. + * slot->tts_nvalid is the number of attributes already extracted. + * + * This is marked as always inline, so the different offp for different types + * of slots gets optimized away. + */ +static pg_attribute_always_inline void +slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp, + int natts) +{ + TupleDesc tupleDesc = slot->tts_tupleDescriptor; + Datum *values = slot->tts_values; + bool *isnull = slot->tts_isnull; + HeapTupleHeader tup = tuple->t_data; + bool hasnulls = HeapTupleHasNulls(tuple); + int attnum; + char *tp; /* ptr to tuple data */ + uint32 off; /* offset in tuple data */ + bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */ + bool slow; /* can we use/set attcacheoff? */ + + /* We can only fetch as many attributes as the tuple has. */ + natts = Min(HeapTupleHeaderGetNatts(tuple->t_data), natts); + + /* + * Check whether the first call for this tuple, and initialize or restore + * loop state. + */ + attnum = slot->tts_nvalid; + if (attnum == 0) + { + /* Start from the first attribute */ + off = 0; + slow = false; + } + else + { + /* Restore state from previous execution */ + off = *offp; + slow = TTS_SLOW(slot); + } + + tp = (char *) tup + tup->t_hoff; + + for (; attnum < natts; attnum++) + { + Form_pg_attribute thisatt = TupleDescAttr(tupleDesc, attnum); + + if (hasnulls && att_isnull(attnum, bp)) + { + values[attnum] = (Datum) 0; + isnull[attnum] = true; + slow = true; /* can't use attcacheoff anymore */ + continue; + } + + isnull[attnum] = false; + + if (!slow && thisatt->attcacheoff >= 0) + off = thisatt->attcacheoff; + else if (thisatt->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be no + * pad bytes in any case: then the offset will be valid for either + * an aligned or unaligned value. + */ + if (!slow && + off == att_align_nominal(off, thisatt->attalign)) + thisatt->attcacheoff = off; + else + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + slow = true; + } + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + + if (!slow) + thisatt->attcacheoff = off; + } + + values[attnum] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + + if (thisatt->attlen <= 0) + slow = true; /* can't use attcacheoff anymore */ + } + + /* + * Save state for next execution + */ + slot->tts_nvalid = attnum; + *offp = off; + if (slow) + slot->tts_flags |= TTS_FLAG_SLOW; + else + slot->tts_flags &= ~TTS_FLAG_SLOW; +} + + +const TupleTableSlotOps TTSOpsVirtual = { + .base_slot_size = sizeof(VirtualTupleTableSlot), + .init = tts_virtual_init, + .release = tts_virtual_release, + .clear = tts_virtual_clear, + .getsomeattrs = tts_virtual_getsomeattrs, + .getsysattr = tts_virtual_getsysattr, + .materialize = tts_virtual_materialize, + .copyslot = tts_virtual_copyslot, + + /* + * A virtual tuple table slot can not "own" a heap tuple or a minimal + * tuple. 
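+	 * Fetching either kind therefore always goes through the copy
+	 * callbacks below, which form a fresh tuple from the slot's
+	 * tts_values/tts_isnull arrays.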
+ */ + .get_heap_tuple = NULL, + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_virtual_copy_heap_tuple, + .copy_minimal_tuple = tts_virtual_copy_minimal_tuple +}; + +const TupleTableSlotOps TTSOpsHeapTuple = { + .base_slot_size = sizeof(HeapTupleTableSlot), + .init = tts_heap_init, + .release = tts_heap_release, + .clear = tts_heap_clear, + .getsomeattrs = tts_heap_getsomeattrs, + .getsysattr = tts_heap_getsysattr, + .materialize = tts_heap_materialize, + .copyslot = tts_heap_copyslot, + .get_heap_tuple = tts_heap_get_heap_tuple, + + /* A heap tuple table slot can not "own" a minimal tuple. */ + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_heap_copy_heap_tuple, + .copy_minimal_tuple = tts_heap_copy_minimal_tuple +}; + +const TupleTableSlotOps TTSOpsMinimalTuple = { + .base_slot_size = sizeof(MinimalTupleTableSlot), + .init = tts_minimal_init, + .release = tts_minimal_release, + .clear = tts_minimal_clear, + .getsomeattrs = tts_minimal_getsomeattrs, + .getsysattr = tts_minimal_getsysattr, + .materialize = tts_minimal_materialize, + .copyslot = tts_minimal_copyslot, + + /* A minimal tuple table slot can not "own" a heap tuple. */ + .get_heap_tuple = NULL, + .get_minimal_tuple = tts_minimal_get_minimal_tuple, + .copy_heap_tuple = tts_minimal_copy_heap_tuple, + .copy_minimal_tuple = tts_minimal_copy_minimal_tuple +}; + +const TupleTableSlotOps TTSOpsBufferHeapTuple = { + .base_slot_size = sizeof(BufferHeapTupleTableSlot), + .init = tts_buffer_heap_init, + .release = tts_buffer_heap_release, + .clear = tts_buffer_heap_clear, + .getsomeattrs = tts_buffer_heap_getsomeattrs, + .getsysattr = tts_buffer_heap_getsysattr, + .materialize = tts_buffer_heap_materialize, + .copyslot = tts_buffer_heap_copyslot, + .get_heap_tuple = tts_buffer_heap_get_heap_tuple, + + /* A buffer heap tuple table slot can not "own" a minimal tuple. */ + .get_minimal_tuple = NULL, + .copy_heap_tuple = tts_buffer_heap_copy_heap_tuple, + .copy_minimal_tuple = tts_buffer_heap_copy_minimal_tuple +}; + + +/* ---------------------------------------------------------------- + * tuple table create/delete functions + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * MakeTupleTableSlot + * + * Basic routine to make an empty TupleTableSlot of given + * TupleTableSlotType. If tupleDesc is specified the slot's descriptor is + * fixed for its lifetime, gaining some efficiency. If that's + * undesirable, pass NULL. + * -------------------------------- + */ +TupleTableSlot * +MakeTupleTableSlot(TupleDesc tupleDesc, + const TupleTableSlotOps *tts_ops) +{ + Size basesz, + allocsz; + TupleTableSlot *slot; + + basesz = tts_ops->base_slot_size; + + /* + * When a fixed descriptor is specified, we can reduce overhead by + * allocating the entire slot in one go. 
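+	 * The single palloc then holds, in order, MAXALIGN(base_slot_size)
+	 * bytes for the slot struct itself, MAXALIGN(natts * sizeof(Datum))
+	 * bytes for tts_values, and MAXALIGN(natts * sizeof(bool)) bytes for
+	 * tts_isnull; the array pointers are set up below accordingly.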
+ */ + if (tupleDesc) + allocsz = MAXALIGN(basesz) + + MAXALIGN(tupleDesc->natts * sizeof(Datum)) + + MAXALIGN(tupleDesc->natts * sizeof(bool)); + else + allocsz = basesz; + + slot = palloc0(allocsz); + /* const for optimization purposes, OK to modify at allocation time */ + *((const TupleTableSlotOps **) &slot->tts_ops) = tts_ops; + slot->type = T_TupleTableSlot; + slot->tts_flags |= TTS_FLAG_EMPTY; + if (tupleDesc != NULL) + slot->tts_flags |= TTS_FLAG_FIXED; + slot->tts_tupleDescriptor = tupleDesc; + slot->tts_mcxt = CurrentMemoryContext; + slot->tts_nvalid = 0; + + if (tupleDesc != NULL) + { + slot->tts_values = (Datum *) + (((char *) slot) + + MAXALIGN(basesz)); + slot->tts_isnull = (bool *) + (((char *) slot) + + MAXALIGN(basesz) + + MAXALIGN(tupleDesc->natts * sizeof(Datum))); + + PinTupleDesc(tupleDesc); + } + + /* + * And allow slot type specific initialization. + */ + slot->tts_ops->init(slot); + + return slot; +} + +/* -------------------------------- + * ExecAllocTableSlot + * + * Create a tuple table slot within a tuple table (which is just a List). + * -------------------------------- + */ +TupleTableSlot * +ExecAllocTableSlot(List **tupleTable, TupleDesc desc, + const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot = MakeTupleTableSlot(desc, tts_ops); + + *tupleTable = lappend(*tupleTable, slot); + + return slot; +} + +/* -------------------------------- + * ExecResetTupleTable + * + * This releases any resources (buffer pins, tupdesc refcounts) + * held by the tuple table, and optionally releases the memory + * occupied by the tuple table data structure. + * It is expected that this routine be called by ExecEndPlan(). + * -------------------------------- + */ +void +ExecResetTupleTable(List *tupleTable, /* tuple table */ + bool shouldFree) /* true if we should free memory */ +{ + ListCell *lc; + + foreach(lc, tupleTable) + { + TupleTableSlot *slot = lfirst_node(TupleTableSlot, lc); + + /* Always release resources and reset the slot to empty */ + ExecClearTuple(slot); + slot->tts_ops->release(slot); + if (slot->tts_tupleDescriptor) + { + ReleaseTupleDesc(slot->tts_tupleDescriptor); + slot->tts_tupleDescriptor = NULL; + } + + /* If shouldFree, release memory occupied by the slot itself */ + if (shouldFree) + { + if (!TTS_FIXED(slot)) + { + if (slot->tts_values) + pfree(slot->tts_values); + if (slot->tts_isnull) + pfree(slot->tts_isnull); + } + pfree(slot); + } + } + + /* If shouldFree, release the list structure */ + if (shouldFree) + list_free(tupleTable); +} + +/* -------------------------------- + * MakeSingleTupleTableSlot + * + * This is a convenience routine for operations that need a standalone + * TupleTableSlot not gotten from the main executor tuple table. It makes + * a single slot of given TupleTableSlotType and initializes it to use the + * given tuple descriptor. + * -------------------------------- + */ +TupleTableSlot * +MakeSingleTupleTableSlot(TupleDesc tupdesc, + const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot = MakeTupleTableSlot(tupdesc, tts_ops); + + return slot; +} + +/* -------------------------------- + * ExecDropSingleTupleTableSlot + * + * Release a TupleTableSlot made with MakeSingleTupleTableSlot. + * DON'T use this on a slot that's part of a tuple table list! 
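+ *
+ * A typical standalone slot lifecycle looks like this (an illustrative
+ * sketch only; "tupdesc" is assumed to be a valid TupleDesc supplied by
+ * the caller):
+ *
+ *		TupleTableSlot *slot;
+ *
+ *		slot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual);
+ *		... store tuples into and read attributes from the slot ...
+ *		ExecDropSingleTupleTableSlot(slot);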
+ * -------------------------------- + */ +void +ExecDropSingleTupleTableSlot(TupleTableSlot *slot) +{ + /* This should match ExecResetTupleTable's processing of one slot */ + Assert(IsA(slot, TupleTableSlot)); + ExecClearTuple(slot); + slot->tts_ops->release(slot); + if (slot->tts_tupleDescriptor) + ReleaseTupleDesc(slot->tts_tupleDescriptor); + if (!TTS_FIXED(slot)) + { + if (slot->tts_values) + pfree(slot->tts_values); + if (slot->tts_isnull) + pfree(slot->tts_isnull); + } + pfree(slot); +} + + +/* ---------------------------------------------------------------- + * tuple table slot accessor functions + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * ExecSetSlotDescriptor + * + * This function is used to set the tuple descriptor associated + * with the slot's tuple. The passed descriptor must have lifespan + * at least equal to the slot's. If it is a reference-counted descriptor + * then the reference count is incremented for as long as the slot holds + * a reference. + * -------------------------------- + */ +void +ExecSetSlotDescriptor(TupleTableSlot *slot, /* slot to change */ + TupleDesc tupdesc) /* new tuple descriptor */ +{ + Assert(!TTS_FIXED(slot)); + + /* For safety, make sure slot is empty before changing it */ + ExecClearTuple(slot); + + /* + * Release any old descriptor. Also release old Datum/isnull arrays if + * present (we don't bother to check if they could be re-used). + */ + if (slot->tts_tupleDescriptor) + ReleaseTupleDesc(slot->tts_tupleDescriptor); + + if (slot->tts_values) + pfree(slot->tts_values); + if (slot->tts_isnull) + pfree(slot->tts_isnull); + + /* + * Install the new descriptor; if it's refcounted, bump its refcount. + */ + slot->tts_tupleDescriptor = tupdesc; + PinTupleDesc(tupdesc); + + /* + * Allocate Datum/isnull arrays of the appropriate size. These must have + * the same lifetime as the slot, so allocate in the slot's own context. + */ + slot->tts_values = (Datum *) + MemoryContextAlloc(slot->tts_mcxt, tupdesc->natts * sizeof(Datum)); + slot->tts_isnull = (bool *) + MemoryContextAlloc(slot->tts_mcxt, tupdesc->natts * sizeof(bool)); +} + +/* -------------------------------- + * ExecStoreHeapTuple + * + * This function is used to store an on-the-fly physical tuple into a specified + * slot in the tuple table. + * + * tuple: tuple to store + * slot: TTSOpsHeapTuple type slot to store it in + * shouldFree: true if ExecClearTuple should pfree() the tuple + * when done with it + * + * shouldFree is normally set 'true' for tuples constructed on-the-fly. But it + * can be 'false' when the referenced tuple is held in a tuple table slot + * belonging to a lower-level executor Proc node. In this case the lower-level + * slot retains ownership and responsibility for eventually releasing the + * tuple. When this method is used, we must be certain that the upper-level + * Proc node will lose interest in the tuple sooner than the lower-level one + * does! If you're not certain, copy the lower-level tuple with heap_copytuple + * and let the upper-level table slot assume ownership of the copy! + * + * Return value is just the passed-in slot pointer. + * + * If the target slot is not guaranteed to be TTSOpsHeapTuple type slot, use + * the, more expensive, ExecForceStoreHeapTuple(). 
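+ *
+ * For example, a caller that builds its own tuple might do the following
+ * (an illustrative sketch; "tupdesc", "values" and "isnull" are assumed
+ * to be supplied by the caller):
+ *
+ *		slot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple);
+ *		tuple = heap_form_tuple(tupdesc, values, isnull);
+ *		ExecStoreHeapTuple(tuple, slot, true);
+ *
+ * after which clearing the slot will also free the tuple.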
+ * -------------------------------- + */ +TupleTableSlot * +ExecStoreHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + bool shouldFree) +{ + /* + * sanity checks + */ + Assert(tuple != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + if (unlikely(!TTS_IS_HEAPTUPLE(slot))) + elog(ERROR, "trying to store a heap tuple into wrong type of slot"); + tts_heap_store_tuple(slot, tuple, shouldFree); + + slot->tts_tableOid = tuple->t_tableOid; + + return slot; +} + +/* -------------------------------- + * ExecStoreBufferHeapTuple + * + * This function is used to store an on-disk physical tuple from a buffer + * into a specified slot in the tuple table. + * + * tuple: tuple to store + * slot: TTSOpsBufferHeapTuple type slot to store it in + * buffer: disk buffer if tuple is in a disk page, else InvalidBuffer + * + * The tuple table code acquires a pin on the buffer which is held until the + * slot is cleared, so that the tuple won't go away on us. + * + * Return value is just the passed-in slot pointer. + * + * If the target slot is not guaranteed to be TTSOpsBufferHeapTuple type slot, + * use the, more expensive, ExecForceStoreHeapTuple(). + * -------------------------------- + */ +TupleTableSlot * +ExecStoreBufferHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + Buffer buffer) +{ + /* + * sanity checks + */ + Assert(tuple != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + Assert(BufferIsValid(buffer)); + + if (unlikely(!TTS_IS_BUFFERTUPLE(slot))) + elog(ERROR, "trying to store an on-disk heap tuple into wrong type of slot"); + tts_buffer_heap_store_tuple(slot, tuple, buffer, false); + + slot->tts_tableOid = tuple->t_tableOid; + + return slot; +} + +/* + * Like ExecStoreBufferHeapTuple, but transfer an existing pin from the caller + * to the slot, i.e. the caller doesn't need to, and may not, release the pin. + */ +TupleTableSlot * +ExecStorePinnedBufferHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + Buffer buffer) +{ + /* + * sanity checks + */ + Assert(tuple != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + Assert(BufferIsValid(buffer)); + + if (unlikely(!TTS_IS_BUFFERTUPLE(slot))) + elog(ERROR, "trying to store an on-disk heap tuple into wrong type of slot"); + tts_buffer_heap_store_tuple(slot, tuple, buffer, true); + + slot->tts_tableOid = tuple->t_tableOid; + + return slot; +} + +/* + * Store a minimal tuple into TTSOpsMinimalTuple type slot. + * + * If the target slot is not guaranteed to be TTSOpsMinimalTuple type slot, + * use the, more expensive, ExecForceStoreMinimalTuple(). + */ +TupleTableSlot * +ExecStoreMinimalTuple(MinimalTuple mtup, + TupleTableSlot *slot, + bool shouldFree) +{ + /* + * sanity checks + */ + Assert(mtup != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + if (unlikely(!TTS_IS_MINIMALTUPLE(slot))) + elog(ERROR, "trying to store a minimal tuple into wrong type of slot"); + tts_minimal_store_tuple(slot, mtup, shouldFree); + + return slot; +} + +/* + * Store a HeapTuple into any kind of slot, performing conversion if + * necessary. 
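+ *
+ * Heap slots take the tuple directly, buffer slots get a copy
+ * materialized into the slot's memory context, and any other slot type
+ * receives the deformed (virtual) representation.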
+ */ +void +ExecForceStoreHeapTuple(HeapTuple tuple, + TupleTableSlot *slot, + bool shouldFree) +{ + if (TTS_IS_HEAPTUPLE(slot)) + { + ExecStoreHeapTuple(tuple, slot, shouldFree); + } + else if (TTS_IS_BUFFERTUPLE(slot)) + { + MemoryContext oldContext; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + + ExecClearTuple(slot); + slot->tts_flags &= ~TTS_FLAG_EMPTY; + oldContext = MemoryContextSwitchTo(slot->tts_mcxt); + bslot->base.tuple = heap_copytuple(tuple); + slot->tts_flags |= TTS_FLAG_SHOULDFREE; + MemoryContextSwitchTo(oldContext); + + if (shouldFree) + pfree(tuple); + } + else + { + ExecClearTuple(slot); + heap_deform_tuple(tuple, slot->tts_tupleDescriptor, + slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); + + if (shouldFree) + { + ExecMaterializeSlot(slot); + pfree(tuple); + } + } +} + +/* + * Store a MinimalTuple into any kind of slot, performing conversion if + * necessary. + */ +void +ExecForceStoreMinimalTuple(MinimalTuple mtup, + TupleTableSlot *slot, + bool shouldFree) +{ + if (TTS_IS_MINIMALTUPLE(slot)) + { + tts_minimal_store_tuple(slot, mtup, shouldFree); + } + else + { + HeapTupleData htup; + + ExecClearTuple(slot); + + htup.t_len = mtup->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup - MINIMAL_TUPLE_OFFSET); + heap_deform_tuple(&htup, slot->tts_tupleDescriptor, + slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); + + if (shouldFree) + { + ExecMaterializeSlot(slot); + pfree(mtup); + } + } +} + +/* -------------------------------- + * ExecStoreVirtualTuple + * Mark a slot as containing a virtual tuple. + * + * The protocol for loading a slot with virtual tuple data is: + * * Call ExecClearTuple to mark the slot empty. + * * Store data into the Datum/isnull arrays. + * * Call ExecStoreVirtualTuple to mark the slot valid. + * This is a bit unclean but it avoids one round of data copying. + * -------------------------------- + */ +TupleTableSlot * +ExecStoreVirtualTuple(TupleTableSlot *slot) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + Assert(TTS_EMPTY(slot)); + + slot->tts_flags &= ~TTS_FLAG_EMPTY; + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + + return slot; +} + +/* -------------------------------- + * ExecStoreAllNullTuple + * Set up the slot to contain a null in every column. + * + * At first glance this might sound just like ExecClearTuple, but it's + * entirely different: the slot ends up full, not empty. + * -------------------------------- + */ +TupleTableSlot * +ExecStoreAllNullTuple(TupleTableSlot *slot) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + /* Clear any old contents */ + ExecClearTuple(slot); + + /* + * Fill all the columns of the virtual tuple with nulls + */ + MemSet(slot->tts_values, 0, + slot->tts_tupleDescriptor->natts * sizeof(Datum)); + memset(slot->tts_isnull, true, + slot->tts_tupleDescriptor->natts * sizeof(bool)); + + return ExecStoreVirtualTuple(slot); +} + +/* + * Store a HeapTuple in datum form, into a slot. That always requires + * deforming it and storing it in virtual form. + * + * Until the slot is materialized, the contents of the slot depend on the + * datum. 
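+ *
+ * A caller that needs the slot's contents to outlive the datum should
+ * therefore materialize it afterwards, roughly (a sketch; "d" is assumed
+ * to be a composite Datum matching the slot's descriptor):
+ *
+ *		ExecStoreHeapTupleDatum(d, slot);
+ *		ExecMaterializeSlot(slot);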
+ */ +void +ExecStoreHeapTupleDatum(Datum data, TupleTableSlot *slot) +{ + HeapTupleData tuple = {0}; + HeapTupleHeader td; + + td = DatumGetHeapTupleHeader(data); + + tuple.t_len = HeapTupleHeaderGetDatumLength(td); + tuple.t_self = td->t_ctid; + tuple.t_data = td; + + ExecClearTuple(slot); + + heap_deform_tuple(&tuple, slot->tts_tupleDescriptor, + slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); +} + +/* + * ExecFetchSlotHeapTuple - fetch HeapTuple representing the slot's content + * + * The returned HeapTuple represents the slot's content as closely as + * possible. + * + * If materialize is true, the contents of the slots will be made independent + * from the underlying storage (i.e. all buffer pins are released, memory is + * allocated in the slot's context). + * + * If shouldFree is not-NULL it'll be set to true if the returned tuple has + * been allocated in the calling memory context, and must be freed by the + * caller (via explicit pfree() or a memory context reset). + * + * NB: If materialize is true, modifications of the returned tuple are + * allowed. But it depends on the type of the slot whether such modifications + * will also affect the slot's contents. While that is not the nicest + * behaviour, all such modifications are in the process of being removed. + */ +HeapTuple +ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(!TTS_EMPTY(slot)); + + /* Materialize the tuple so that the slot "owns" it, if requested. */ + if (materialize) + slot->tts_ops->materialize(slot); + + if (slot->tts_ops->get_heap_tuple == NULL) + { + if (shouldFree) + *shouldFree = true; + return slot->tts_ops->copy_heap_tuple(slot); + } + else + { + if (shouldFree) + *shouldFree = false; + return slot->tts_ops->get_heap_tuple(slot); + } +} + +/* -------------------------------- + * ExecFetchSlotMinimalTuple + * Fetch the slot's minimal physical tuple. + * + * If the given tuple table slot can hold a minimal tuple, indicated by a + * non-NULL get_minimal_tuple callback, the function returns the minimal + * tuple returned by that callback. It assumes that the minimal tuple + * returned by the callback is "owned" by the slot i.e. the slot is + * responsible for freeing the memory consumed by the tuple. Hence it sets + * *shouldFree to false, indicating that the caller should not free the + * memory consumed by the minimal tuple. In this case the returned minimal + * tuple should be considered as read-only. + * + * If that callback is not supported, it calls copy_minimal_tuple callback + * which is expected to return a copy of minimal tuple representing the + * contents of the slot. In this case *shouldFree is set to true, + * indicating the caller that it should free the memory consumed by the + * minimal tuple. In this case the returned minimal tuple may be written + * up. + * -------------------------------- + */ +MinimalTuple +ExecFetchSlotMinimalTuple(TupleTableSlot *slot, + bool *shouldFree) +{ + /* + * sanity checks + */ + Assert(slot != NULL); + Assert(!TTS_EMPTY(slot)); + + if (slot->tts_ops->get_minimal_tuple) + { + if (shouldFree) + *shouldFree = false; + return slot->tts_ops->get_minimal_tuple(slot); + } + else + { + if (shouldFree) + *shouldFree = true; + return slot->tts_ops->copy_minimal_tuple(slot); + } +} + +/* -------------------------------- + * ExecFetchSlotHeapTupleDatum + * Fetch the slot's tuple as a composite-type Datum. 
+ * + * The result is always freshly palloc'd in the caller's memory context. + * -------------------------------- + */ +Datum +ExecFetchSlotHeapTupleDatum(TupleTableSlot *slot) +{ + HeapTuple tup; + TupleDesc tupdesc; + bool shouldFree; + Datum ret; + + /* Fetch slot's contents in regular-physical-tuple form */ + tup = ExecFetchSlotHeapTuple(slot, false, &shouldFree); + tupdesc = slot->tts_tupleDescriptor; + + /* Convert to Datum form */ + ret = heap_copy_tuple_as_datum(tup, tupdesc); + + if (shouldFree) + pfree(tup); + + return ret; +} + +/* ---------------------------------------------------------------- + * convenience initialization routines + * ---------------------------------------------------------------- + */ + +/* ---------------- + * ExecInitResultTypeTL + * + * Initialize result type, using the plan node's targetlist. + * ---------------- + */ +void +ExecInitResultTypeTL(PlanState *planstate) +{ + TupleDesc tupDesc = ExecTypeFromTL(planstate->plan->targetlist); + + planstate->ps_ResultTupleDesc = tupDesc; +} + +/* -------------------------------- + * ExecInit{Result,Scan,Extra}TupleSlot[TL] + * + * These are convenience routines to initialize the specified slot + * in nodes inheriting the appropriate state. ExecInitExtraTupleSlot + * is used for initializing special-purpose slots. + * -------------------------------- + */ + +/* ---------------- + * ExecInitResultTupleSlotTL + * + * Initialize result tuple slot, using the tuple descriptor previously + * computed with ExecInitResultTypeTL(). + * ---------------- + */ +void +ExecInitResultSlot(PlanState *planstate, const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot; + + slot = ExecAllocTableSlot(&planstate->state->es_tupleTable, + planstate->ps_ResultTupleDesc, tts_ops); + planstate->ps_ResultTupleSlot = slot; + + planstate->resultopsfixed = planstate->ps_ResultTupleDesc != NULL; + planstate->resultops = tts_ops; + planstate->resultopsset = true; +} + +/* ---------------- + * ExecInitResultTupleSlotTL + * + * Initialize result tuple slot, using the plan node's targetlist. + * ---------------- + */ +void +ExecInitResultTupleSlotTL(PlanState *planstate, + const TupleTableSlotOps *tts_ops) +{ + ExecInitResultTypeTL(planstate); + ExecInitResultSlot(planstate, tts_ops); +} + +/* ---------------- + * ExecInitScanTupleSlot + * ---------------- + */ +void +ExecInitScanTupleSlot(EState *estate, ScanState *scanstate, + TupleDesc tupledesc, const TupleTableSlotOps *tts_ops) +{ + scanstate->ss_ScanTupleSlot = ExecAllocTableSlot(&estate->es_tupleTable, + tupledesc, tts_ops); + scanstate->ps.scandesc = tupledesc; + scanstate->ps.scanopsfixed = tupledesc != NULL; + scanstate->ps.scanops = tts_ops; + scanstate->ps.scanopsset = true; +} + +/* ---------------- + * ExecInitExtraTupleSlot + * + * Return a newly created slot. If tupledesc is non-NULL the slot will have + * that as its fixed tupledesc. Otherwise the caller needs to use + * ExecSetSlotDescriptor() to set the descriptor before use. + * ---------------- + */ +TupleTableSlot * +ExecInitExtraTupleSlot(EState *estate, + TupleDesc tupledesc, + const TupleTableSlotOps *tts_ops) +{ + return ExecAllocTableSlot(&estate->es_tupleTable, tupledesc, tts_ops); +} + +/* ---------------- + * ExecInitNullTupleSlot + * + * Build a slot containing an all-nulls tuple of the given type. + * This is used as a substitute for an input tuple when performing an + * outer join. 
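+ *
+ * For instance, a join node can set up a null-substitute for its inner
+ * side roughly like this (a sketch; "innerDesc" stands for the inner
+ * plan's result descriptor):
+ *
+ *		nullslot = ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual);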
+ * ---------------- + */ +TupleTableSlot * +ExecInitNullTupleSlot(EState *estate, TupleDesc tupType, + const TupleTableSlotOps *tts_ops) +{ + TupleTableSlot *slot = ExecInitExtraTupleSlot(estate, tupType, tts_ops); + + return ExecStoreAllNullTuple(slot); +} + +/* --------------------------------------------------------------- + * Routines for setting/accessing attributes in a slot. + * --------------------------------------------------------------- + */ + +/* + * Fill in missing values for a TupleTableSlot. + * + * This is only exposed because it's needed for JIT compiled tuple + * deforming. That exception aside, there should be no callers outside of this + * file. + */ +void +slot_getmissingattrs(TupleTableSlot *slot, int startAttNum, int lastAttNum) +{ + AttrMissing *attrmiss = NULL; + + if (slot->tts_tupleDescriptor->constr) + attrmiss = slot->tts_tupleDescriptor->constr->missing; + + if (!attrmiss) + { + /* no missing values array at all, so just fill everything in as NULL */ + memset(slot->tts_values + startAttNum, 0, + (lastAttNum - startAttNum) * sizeof(Datum)); + memset(slot->tts_isnull + startAttNum, 1, + (lastAttNum - startAttNum) * sizeof(bool)); + } + else + { + int missattnum; + + /* if there is a missing values array we must process them one by one */ + for (missattnum = startAttNum; + missattnum < lastAttNum; + missattnum++) + { + slot->tts_values[missattnum] = attrmiss[missattnum].am_value; + slot->tts_isnull[missattnum] = !attrmiss[missattnum].am_present; + } + } +} + +/* + * slot_getsomeattrs_int - workhorse for slot_getsomeattrs() + */ +void +slot_getsomeattrs_int(TupleTableSlot *slot, int attnum) +{ + /* Check for caller errors */ + Assert(slot->tts_nvalid < attnum); /* checked in slot_getsomeattrs */ + Assert(attnum > 0); + + if (unlikely(attnum > slot->tts_tupleDescriptor->natts)) + elog(ERROR, "invalid attribute number %d", attnum); + + /* Fetch as many attributes as possible from the underlying tuple. */ + slot->tts_ops->getsomeattrs(slot, attnum); + + /* + * If the underlying tuple doesn't have enough attributes, tuple + * descriptor must have the missing attributes. + */ + if (unlikely(slot->tts_nvalid < attnum)) + { + slot_getmissingattrs(slot, slot->tts_nvalid, attnum); + slot->tts_nvalid = attnum; + } +} + +/* ---------------------------------------------------------------- + * ExecTypeFromTL + * + * Generate a tuple descriptor for the result tuple of a targetlist. + * (A parse/plan tlist must be passed, not an ExprState tlist.) + * Note that resjunk columns, if any, are included in the result. + * + * Currently there are about 4 different places where we create + * TupleDescriptors. They should all be merged, or perhaps + * be rewritten to call BuildDesc(). + * ---------------------------------------------------------------- + */ +TupleDesc +ExecTypeFromTL(List *targetList) +{ + return ExecTypeFromTLInternal(targetList, false); +} + +/* ---------------------------------------------------------------- + * ExecCleanTypeFromTL + * + * Same as above, but resjunk columns are omitted from the result. 
+ * ---------------------------------------------------------------- + */ +TupleDesc +ExecCleanTypeFromTL(List *targetList) +{ + return ExecTypeFromTLInternal(targetList, true); +} + +static TupleDesc +ExecTypeFromTLInternal(List *targetList, bool skipjunk) +{ + TupleDesc typeInfo; + ListCell *l; + int len; + int cur_resno = 1; + + if (skipjunk) + len = ExecCleanTargetListLength(targetList); + else + len = ExecTargetListLength(targetList); + typeInfo = CreateTemplateTupleDesc(len); + + foreach(l, targetList) + { + TargetEntry *tle = lfirst(l); + + if (skipjunk && tle->resjunk) + continue; + TupleDescInitEntry(typeInfo, + cur_resno, + tle->resname, + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), + 0); + TupleDescInitEntryCollation(typeInfo, + cur_resno, + exprCollation((Node *) tle->expr)); + cur_resno++; + } + + return typeInfo; +} + +/* + * ExecTypeFromExprList - build a tuple descriptor from a list of Exprs + * + * This is roughly like ExecTypeFromTL, but we work from bare expressions + * not TargetEntrys. No names are attached to the tupledesc's columns. + */ +TupleDesc +ExecTypeFromExprList(List *exprList) +{ + TupleDesc typeInfo; + ListCell *lc; + int cur_resno = 1; + + typeInfo = CreateTemplateTupleDesc(list_length(exprList)); + + foreach(lc, exprList) + { + Node *e = lfirst(lc); + + TupleDescInitEntry(typeInfo, + cur_resno, + NULL, + exprType(e), + exprTypmod(e), + 0); + TupleDescInitEntryCollation(typeInfo, + cur_resno, + exprCollation(e)); + cur_resno++; + } + + return typeInfo; +} + +/* + * ExecTypeSetColNames - set column names in a RECORD TupleDesc + * + * Column names must be provided as an alias list (list of String nodes). + */ +void +ExecTypeSetColNames(TupleDesc typeInfo, List *namesList) +{ + int colno = 0; + ListCell *lc; + + /* It's only OK to change col names in a not-yet-blessed RECORD type */ + Assert(typeInfo->tdtypeid == RECORDOID); + Assert(typeInfo->tdtypmod < 0); + + foreach(lc, namesList) + { + char *cname = strVal(lfirst(lc)); + Form_pg_attribute attr; + + /* Guard against too-long names list (probably can't happen) */ + if (colno >= typeInfo->natts) + break; + attr = TupleDescAttr(typeInfo, colno); + colno++; + + /* + * Do nothing for empty aliases or dropped columns (these cases + * probably can't arise in RECORD types, either) + */ + if (cname[0] == '\0' || attr->attisdropped) + continue; + + /* OK, assign the column name */ + namestrcpy(&(attr->attname), cname); + } +} + +/* + * BlessTupleDesc - make a completed tuple descriptor useful for SRFs + * + * Rowtype Datums returned by a function must contain valid type information. + * This happens "for free" if the tupdesc came from a relcache entry, but + * not if we have manufactured a tupdesc for a transient RECORD datatype. + * In that case we have to notify typcache.c of the existence of the type. + */ +TupleDesc +BlessTupleDesc(TupleDesc tupdesc) +{ + if (tupdesc->tdtypeid == RECORDOID && + tupdesc->tdtypmod < 0) + assign_record_type_typmod(tupdesc); + + return tupdesc; /* just for notational convenience */ +} + +/* + * TupleDescGetAttInMetadata - Build an AttInMetadata structure based on the + * supplied TupleDesc. AttInMetadata can be used in conjunction with C strings + * to produce a properly formed tuple. 
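+ *
+ * The usual pattern (an illustrative sketch; "tupdesc" and the array of
+ * C strings "values" are assumed to be prepared by the caller) is:
+ *
+ *		attinmeta = TupleDescGetAttInMetadata(tupdesc);
+ *		tuple = BuildTupleFromCStrings(attinmeta, values);
+ *		result = HeapTupleGetDatum(tuple);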
+ */ +AttInMetadata * +TupleDescGetAttInMetadata(TupleDesc tupdesc) +{ + int natts = tupdesc->natts; + int i; + Oid atttypeid; + Oid attinfuncid; + FmgrInfo *attinfuncinfo; + Oid *attioparams; + int32 *atttypmods; + AttInMetadata *attinmeta; + + attinmeta = (AttInMetadata *) palloc(sizeof(AttInMetadata)); + + /* "Bless" the tupledesc so that we can make rowtype datums with it */ + attinmeta->tupdesc = BlessTupleDesc(tupdesc); + + /* + * Gather info needed later to call the "in" function for each attribute + */ + attinfuncinfo = (FmgrInfo *) palloc0(natts * sizeof(FmgrInfo)); + attioparams = (Oid *) palloc0(natts * sizeof(Oid)); + atttypmods = (int32 *) palloc0(natts * sizeof(int32)); + + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, i); + + /* Ignore dropped attributes */ + if (!att->attisdropped) + { + atttypeid = att->atttypid; + getTypeInputInfo(atttypeid, &attinfuncid, &attioparams[i]); + fmgr_info(attinfuncid, &attinfuncinfo[i]); + atttypmods[i] = att->atttypmod; + } + } + attinmeta->attinfuncs = attinfuncinfo; + attinmeta->attioparams = attioparams; + attinmeta->atttypmods = atttypmods; + + return attinmeta; +} + +/* + * BuildTupleFromCStrings - build a HeapTuple given user data in C string form. + * values is an array of C strings, one for each attribute of the return tuple. + * A NULL string pointer indicates we want to create a NULL field. + */ +HeapTuple +BuildTupleFromCStrings(AttInMetadata *attinmeta, char **values) +{ + TupleDesc tupdesc = attinmeta->tupdesc; + int natts = tupdesc->natts; + Datum *dvalues; + bool *nulls; + int i; + HeapTuple tuple; + + dvalues = (Datum *) palloc(natts * sizeof(Datum)); + nulls = (bool *) palloc(natts * sizeof(bool)); + + /* + * Call the "in" function for each non-dropped attribute, even for nulls, + * to support domains. + */ + for (i = 0; i < natts; i++) + { + if (!TupleDescAttr(tupdesc, i)->attisdropped) + { + /* Non-dropped attributes */ + dvalues[i] = InputFunctionCall(&attinmeta->attinfuncs[i], + values[i], + attinmeta->attioparams[i], + attinmeta->atttypmods[i]); + if (values[i] != NULL) + nulls[i] = false; + else + nulls[i] = true; + } + else + { + /* Handle dropped attributes by setting to NULL */ + dvalues[i] = (Datum) 0; + nulls[i] = true; + } + } + + /* + * Form a tuple + */ + tuple = heap_form_tuple(tupdesc, dvalues, nulls); + + /* + * Release locally palloc'd space. XXX would probably be good to pfree + * values of pass-by-reference datums, as well. + */ + pfree(dvalues); + pfree(nulls); + + return tuple; +} + +/* + * HeapTupleHeaderGetDatum - convert a HeapTupleHeader pointer to a Datum. + * + * This must *not* get applied to an on-disk tuple; the tuple should be + * freshly made by heap_form_tuple or some wrapper routine for it (such as + * BuildTupleFromCStrings). Be sure also that the tupledesc used to build + * the tuple has a properly "blessed" rowtype. + * + * Formerly this was a macro equivalent to PointerGetDatum, relying on the + * fact that heap_form_tuple fills in the appropriate tuple header fields + * for a composite Datum. However, we now require that composite Datums not + * contain any external TOAST pointers. We do not want heap_form_tuple itself + * to enforce that; more specifically, the rule applies only to actual Datums + * and not to HeapTuple structures. Therefore, HeapTupleHeaderGetDatum is + * now a function that detects whether there are externally-toasted fields + * and constructs a new tuple with inlined fields if so. 
We still need + * heap_form_tuple to insert the Datum header fields, because otherwise this + * code would have no way to obtain a tupledesc for the tuple. + * + * Note that if we do build a new tuple, it's palloc'd in the current + * memory context. Beware of code that changes context between the initial + * heap_form_tuple/etc call and calling HeapTuple(Header)GetDatum. + * + * For performance-critical callers, it could be worthwhile to take extra + * steps to ensure that there aren't TOAST pointers in the output of + * heap_form_tuple to begin with. It's likely however that the costs of the + * typcache lookup and tuple disassembly/reassembly are swamped by TOAST + * dereference costs, so that the benefits of such extra effort would be + * minimal. + * + * XXX it would likely be better to create wrapper functions that produce + * a composite Datum from the field values in one step. However, there's + * enough code using the existing APIs that we couldn't get rid of this + * hack anytime soon. + */ +Datum +HeapTupleHeaderGetDatum(HeapTupleHeader tuple) +{ + Datum result; + TupleDesc tupDesc; + + /* No work if there are no external TOAST pointers in the tuple */ + if (!HeapTupleHeaderHasExternal(tuple)) + return PointerGetDatum(tuple); + + /* Use the type data saved by heap_form_tuple to look up the rowtype */ + tupDesc = lookup_rowtype_tupdesc(HeapTupleHeaderGetTypeId(tuple), + HeapTupleHeaderGetTypMod(tuple)); + + /* And do the flattening */ + result = toast_flatten_tuple_to_datum(tuple, + HeapTupleHeaderGetDatumLength(tuple), + tupDesc); + + ReleaseTupleDesc(tupDesc); + + return result; +} + + +/* + * Functions for sending tuples to the frontend (or other specified destination) + * as though it is a SELECT result. These are used by utility commands that + * need to project directly to the destination and don't need or want full + * table function capability. Currently used by EXPLAIN and SHOW ALL. + */ +TupOutputState * +begin_tup_output_tupdesc(DestReceiver *dest, + TupleDesc tupdesc, + const TupleTableSlotOps *tts_ops) +{ + TupOutputState *tstate; + + tstate = (TupOutputState *) palloc(sizeof(TupOutputState)); + + tstate->slot = MakeSingleTupleTableSlot(tupdesc, tts_ops); + tstate->dest = dest; + + tstate->dest->rStartup(tstate->dest, (int) CMD_SELECT, tupdesc); + + return tstate; +} + +/* + * write a single tuple + */ +void +do_tup_output(TupOutputState *tstate, Datum *values, bool *isnull) +{ + TupleTableSlot *slot = tstate->slot; + int natts = slot->tts_tupleDescriptor->natts; + + /* make sure the slot is clear */ + ExecClearTuple(slot); + + /* insert data */ + memcpy(slot->tts_values, values, natts * sizeof(Datum)); + memcpy(slot->tts_isnull, isnull, natts * sizeof(bool)); + + /* mark slot as containing a virtual tuple */ + ExecStoreVirtualTuple(slot); + + /* send the tuple to the receiver */ + (void) tstate->dest->receiveSlot(slot, tstate->dest); + + /* clean up */ + ExecClearTuple(slot); +} + +/* + * write a chunk of text, breaking at newline characters + * + * Should only be used with a single-TEXT-attribute tupdesc. 
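+ *
+ * Typical use by a utility command looks roughly like this (a sketch;
+ * "dest", the single-TEXT-column "tupdesc" and the text "txt" are
+ * assumed to be prepared by the caller):
+ *
+ *		tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
+ *		do_text_output_multiline(tstate, txt);
+ *		end_tup_output(tstate);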
+ */ +void +do_text_output_multiline(TupOutputState *tstate, const char *txt) +{ + Datum values[1]; + bool isnull[1] = {false}; + + while (*txt) + { + const char *eol; + int len; + + eol = strchr(txt, '\n'); + if (eol) + { + len = eol - txt; + eol++; + } + else + { + len = strlen(txt); + eol = txt + len; + } + + values[0] = PointerGetDatum(cstring_to_text_with_len(txt, len)); + do_tup_output(tstate, values, isnull); + pfree(DatumGetPointer(values[0])); + txt = eol; + } +} + +void +end_tup_output(TupOutputState *tstate) +{ + tstate->dest->rShutdown(tstate->dest); + /* note that destroying the dest is not ours to do */ + ExecDropSingleTupleTableSlot(tstate->slot); + pfree(tstate); +} diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c new file mode 100644 index 0000000..ad11392 --- /dev/null +++ b/src/backend/executor/execUtils.c @@ -0,0 +1,1351 @@ +/*------------------------------------------------------------------------- + * + * execUtils.c + * miscellaneous executor utility routines + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execUtils.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * CreateExecutorState Create/delete executor working state + * FreeExecutorState + * CreateExprContext + * CreateStandaloneExprContext + * FreeExprContext + * ReScanExprContext + * + * ExecAssignExprContext Common code for plan node init routines. + * etc + * + * ExecOpenScanRelation Common code for scan node init routines. + * + * ExecInitRangeTable Set up executor's range-table-related data. + * + * ExecGetRangeTableRelation Fetch Relation for a rangetable entry. + * + * executor_errposition Report syntactic position of an error. + * + * RegisterExprContextCallback Register function shutdown callback + * UnregisterExprContextCallback Deregister function shutdown callback + * + * GetAttributeByName Runtime extraction of columns from tuples. + * GetAttributeByNum + * + * NOTES + * This file has traditionally been the place to stick misc. + * executor support stuff that doesn't really go anyplace else. + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "access/relscan.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "executor/executor.h" +#include "executor/execPartition.h" +#include "jit/jit.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "parser/parsetree.h" +#include "partitioning/partdesc.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/typcache.h" + + +static bool tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc); +static void ShutdownExprContext(ExprContext *econtext, bool isCommit); + + +/* ---------------------------------------------------------------- + * Executor state and memory management functions + * ---------------------------------------------------------------- + */ + +/* ---------------- + * CreateExecutorState + * + * Create and initialize an EState node, which is the root of + * working storage for an entire Executor invocation. + * + * Principally, this creates the per-query memory context that will be + * used to hold all working data that lives till the end of the query. 
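+ *
+ * A minimal standalone lifecycle for expression evaluation looks roughly
+ * like this (an illustrative sketch; error handling omitted):
+ *
+ *		EState	   *estate = CreateExecutorState();
+ *		ExprContext *econtext = CreateExprContext(estate);
+ *
+ *		... evaluate expressions using econtext ...
+ *
+ *		FreeExecutorState(estate);
+ *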
+ * Note that the per-query context will become a child of the caller's + * CurrentMemoryContext. + * ---------------- + */ +EState * +CreateExecutorState(void) +{ + EState *estate; + MemoryContext qcontext; + MemoryContext oldcontext; + + /* + * Create the per-query context for this Executor run. + */ + qcontext = AllocSetContextCreate(CurrentMemoryContext, + "ExecutorState", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the EState node within the per-query context. This way, we don't + * need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(qcontext); + + estate = makeNode(EState); + + /* + * Initialize all fields of the Executor State structure + */ + estate->es_direction = ForwardScanDirection; + estate->es_snapshot = InvalidSnapshot; /* caller must initialize this */ + estate->es_crosscheck_snapshot = InvalidSnapshot; /* no crosscheck */ + estate->es_range_table = NIL; + estate->es_range_table_size = 0; + estate->es_relations = NULL; + estate->es_rowmarks = NULL; + estate->es_plannedstmt = NULL; + + estate->es_junkFilter = NULL; + + estate->es_output_cid = (CommandId) 0; + + estate->es_result_relations = NULL; + estate->es_opened_result_relations = NIL; + estate->es_tuple_routing_result_relations = NIL; + estate->es_trig_target_relations = NIL; + + estate->es_param_list_info = NULL; + estate->es_param_exec_vals = NULL; + + estate->es_queryEnv = NULL; + + estate->es_query_cxt = qcontext; + + estate->es_tupleTable = NIL; + + estate->es_processed = 0; + + estate->es_top_eflags = 0; + estate->es_instrument = 0; + estate->es_finished = false; + + estate->es_exprcontexts = NIL; + + estate->es_subplanstates = NIL; + + estate->es_auxmodifytables = NIL; + + estate->es_per_tuple_exprcontext = NULL; + + estate->es_sourceText = NULL; + + estate->es_use_parallel_mode = false; + + estate->es_jit_flags = 0; + estate->es_jit = NULL; + + /* + * Return the executor state structure + */ + MemoryContextSwitchTo(oldcontext); + + return estate; +} + +/* ---------------- + * FreeExecutorState + * + * Release an EState along with all remaining working storage. + * + * Note: this is not responsible for releasing non-memory resources, such as + * open relations or buffer pins. But it will shut down any still-active + * ExprContexts within the EState and deallocate associated JITed expressions. + * That is sufficient cleanup for situations where the EState has only been + * used for expression evaluation, and not to run a complete Plan. + * + * This can be called in any memory context ... so long as it's not one + * of the ones to be freed. + * ---------------- + */ +void +FreeExecutorState(EState *estate) +{ + /* + * Shut down and free any remaining ExprContexts. We do this explicitly + * to ensure that any remaining shutdown callbacks get called (since they + * might need to release resources that aren't simply memory within the + * per-query memory context). + */ + while (estate->es_exprcontexts) + { + /* + * XXX: seems there ought to be a faster way to implement this than + * repeated list_delete(), no? 
+ */ + FreeExprContext((ExprContext *) linitial(estate->es_exprcontexts), + true); + /* FreeExprContext removed the list link for us */ + } + + /* release JIT context, if allocated */ + if (estate->es_jit) + { + jit_release_context(estate->es_jit); + estate->es_jit = NULL; + } + + /* release partition directory, if allocated */ + if (estate->es_partition_directory) + { + DestroyPartitionDirectory(estate->es_partition_directory); + estate->es_partition_directory = NULL; + } + + /* + * Free the per-query memory context, thereby releasing all working + * memory, including the EState node itself. + */ + MemoryContextDelete(estate->es_query_cxt); +} + +/* + * Internal implementation for CreateExprContext() and CreateWorkExprContext() + * that allows control over the AllocSet parameters. + */ +static ExprContext * +CreateExprContextInternal(EState *estate, Size minContextSize, + Size initBlockSize, Size maxBlockSize) +{ + ExprContext *econtext; + MemoryContext oldcontext; + + /* Create the ExprContext node within the per-query memory context */ + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + econtext = makeNode(ExprContext); + + /* Initialize fields of ExprContext */ + econtext->ecxt_scantuple = NULL; + econtext->ecxt_innertuple = NULL; + econtext->ecxt_outertuple = NULL; + + econtext->ecxt_per_query_memory = estate->es_query_cxt; + + /* + * Create working memory for expression evaluation in this context. + */ + econtext->ecxt_per_tuple_memory = + AllocSetContextCreate(estate->es_query_cxt, + "ExprContext", + minContextSize, + initBlockSize, + maxBlockSize); + + econtext->ecxt_param_exec_vals = estate->es_param_exec_vals; + econtext->ecxt_param_list_info = estate->es_param_list_info; + + econtext->ecxt_aggvalues = NULL; + econtext->ecxt_aggnulls = NULL; + + econtext->caseValue_datum = (Datum) 0; + econtext->caseValue_isNull = true; + + econtext->domainValue_datum = (Datum) 0; + econtext->domainValue_isNull = true; + + econtext->ecxt_estate = estate; + + econtext->ecxt_callbacks = NULL; + + /* + * Link the ExprContext into the EState to ensure it is shut down when the + * EState is freed. Because we use lcons(), shutdowns will occur in + * reverse order of creation, which may not be essential but can't hurt. + */ + estate->es_exprcontexts = lcons(econtext, estate->es_exprcontexts); + + MemoryContextSwitchTo(oldcontext); + + return econtext; +} + +/* ---------------- + * CreateExprContext + * + * Create a context for expression evaluation within an EState. + * + * An executor run may require multiple ExprContexts (we usually make one + * for each Plan node, and a separate one for per-output-tuple processing + * such as constraint checking). Each ExprContext has its own "per-tuple" + * memory context. + * + * Note we make no assumption about the caller's memory context. + * ---------------- + */ +ExprContext * +CreateExprContext(EState *estate) +{ + return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_SIZES); +} + + +/* ---------------- + * CreateWorkExprContext + * + * Like CreateExprContext, but specifies the AllocSet sizes to be reasonable + * in proportion to work_mem. If the maximum block allocation size is too + * large, it's easy to skip right past work_mem with a single allocation. 
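+ *
+ * A worked example (illustration only): with work_mem = 4096kB, the loop
+ * below starts maxBlockSize at ALLOCSET_DEFAULT_MAXSIZE (8MB) and halves
+ * it until 16 * maxBlockSize <= 4096 * 1024 bytes, stopping at 256kB; so
+ * no single allocated block can exceed 1/16 of work_mem.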
+ * ---------------- + */ +ExprContext * +CreateWorkExprContext(EState *estate) +{ + Size minContextSize = ALLOCSET_DEFAULT_MINSIZE; + Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE; + Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; + + /* choose the maxBlockSize to be no larger than 1/16 of work_mem */ + while (16 * maxBlockSize > work_mem * 1024L) + maxBlockSize >>= 1; + + if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE) + maxBlockSize = ALLOCSET_DEFAULT_INITSIZE; + + return CreateExprContextInternal(estate, minContextSize, + initBlockSize, maxBlockSize); +} + +/* ---------------- + * CreateStandaloneExprContext + * + * Create a context for standalone expression evaluation. + * + * An ExprContext made this way can be used for evaluation of expressions + * that contain no Params, subplans, or Var references (it might work to + * put tuple references into the scantuple field, but it seems unwise). + * + * The ExprContext struct is allocated in the caller's current memory + * context, which also becomes its "per query" context. + * + * It is caller's responsibility to free the ExprContext when done, + * or at least ensure that any shutdown callbacks have been called + * (ReScanExprContext() is suitable). Otherwise, non-memory resources + * might be leaked. + * ---------------- + */ +ExprContext * +CreateStandaloneExprContext(void) +{ + ExprContext *econtext; + + /* Create the ExprContext node within the caller's memory context */ + econtext = makeNode(ExprContext); + + /* Initialize fields of ExprContext */ + econtext->ecxt_scantuple = NULL; + econtext->ecxt_innertuple = NULL; + econtext->ecxt_outertuple = NULL; + + econtext->ecxt_per_query_memory = CurrentMemoryContext; + + /* + * Create working memory for expression evaluation in this context. + */ + econtext->ecxt_per_tuple_memory = + AllocSetContextCreate(CurrentMemoryContext, + "ExprContext", + ALLOCSET_DEFAULT_SIZES); + + econtext->ecxt_param_exec_vals = NULL; + econtext->ecxt_param_list_info = NULL; + + econtext->ecxt_aggvalues = NULL; + econtext->ecxt_aggnulls = NULL; + + econtext->caseValue_datum = (Datum) 0; + econtext->caseValue_isNull = true; + + econtext->domainValue_datum = (Datum) 0; + econtext->domainValue_isNull = true; + + econtext->ecxt_estate = NULL; + + econtext->ecxt_callbacks = NULL; + + return econtext; +} + +/* ---------------- + * FreeExprContext + * + * Free an expression context, including calling any remaining + * shutdown callbacks. + * + * Since we free the temporary context used for expression evaluation, + * any previously computed pass-by-reference expression result will go away! + * + * If isCommit is false, we are being called in error cleanup, and should + * not call callbacks but only release memory. (It might be better to call + * the callbacks and pass the isCommit flag to them, but that would require + * more invasive code changes than currently seems justified.) + * + * Note we make no assumption about the caller's memory context. 
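+ *
+ * Hedged pairing sketch (names invented): a context made with
+ * CreateStandaloneExprContext() above can simply be handed to this
+ * function when done, since its ecxt_estate field is NULL:
+ *
+ *		ExprContext *econtext = CreateStandaloneExprContext();
+ *		... evaluate expressions ...
+ *		FreeExprContext(econtext, true);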
+ * ---------------- + */ +void +FreeExprContext(ExprContext *econtext, bool isCommit) +{ + EState *estate; + + /* Call any registered callbacks */ + ShutdownExprContext(econtext, isCommit); + /* And clean up the memory used */ + MemoryContextDelete(econtext->ecxt_per_tuple_memory); + /* Unlink self from owning EState, if any */ + estate = econtext->ecxt_estate; + if (estate) + estate->es_exprcontexts = list_delete_ptr(estate->es_exprcontexts, + econtext); + /* And delete the ExprContext node */ + pfree(econtext); +} + +/* + * ReScanExprContext + * + * Reset an expression context in preparation for a rescan of its + * plan node. This requires calling any registered shutdown callbacks, + * since any partially complete set-returning-functions must be canceled. + * + * Note we make no assumption about the caller's memory context. + */ +void +ReScanExprContext(ExprContext *econtext) +{ + /* Call any registered callbacks */ + ShutdownExprContext(econtext, true); + /* And clean up the memory used */ + MemoryContextReset(econtext->ecxt_per_tuple_memory); +} + +/* + * Build a per-output-tuple ExprContext for an EState. + * + * This is normally invoked via GetPerTupleExprContext() macro, + * not directly. + */ +ExprContext * +MakePerTupleExprContext(EState *estate) +{ + if (estate->es_per_tuple_exprcontext == NULL) + estate->es_per_tuple_exprcontext = CreateExprContext(estate); + + return estate->es_per_tuple_exprcontext; +} + + +/* ---------------------------------------------------------------- + * miscellaneous node-init support functions + * + * Note: all of these are expected to be called with CurrentMemoryContext + * equal to the per-query memory context. + * ---------------------------------------------------------------- + */ + +/* ---------------- + * ExecAssignExprContext + * + * This initializes the ps_ExprContext field. It is only necessary + * to do this for nodes which use ExecQual or ExecProject + * because those routines require an econtext. Other nodes that + * don't have to evaluate expressions don't need to do this. 
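+ *
+ * Sketch of the typical call pattern in a node's ExecInitFoo routine
+ * (a hypothetical node, shown only to illustrate the convention):
+ *
+ *		fooState->ps.plan = (Plan *) node;
+ *		fooState->ps.state = estate;
+ *		ExecAssignExprContext(estate, &fooState->ps);
+ *		fooState->ps.qual =
+ *			ExecInitQual(node->plan.qual, (PlanState *) fooState);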
+ * ---------------- + */ +void +ExecAssignExprContext(EState *estate, PlanState *planstate) +{ + planstate->ps_ExprContext = CreateExprContext(estate); +} + +/* ---------------- + * ExecGetResultType + * ---------------- + */ +TupleDesc +ExecGetResultType(PlanState *planstate) +{ + return planstate->ps_ResultTupleDesc; +} + +/* + * ExecGetResultSlotOps - information about node's type of result slot + */ +const TupleTableSlotOps * +ExecGetResultSlotOps(PlanState *planstate, bool *isfixed) +{ + if (planstate->resultopsset && planstate->resultops) + { + if (isfixed) + *isfixed = planstate->resultopsfixed; + return planstate->resultops; + } + + if (isfixed) + { + if (planstate->resultopsset) + *isfixed = planstate->resultopsfixed; + else if (planstate->ps_ResultTupleSlot) + *isfixed = TTS_FIXED(planstate->ps_ResultTupleSlot); + else + *isfixed = false; + } + + if (!planstate->ps_ResultTupleSlot) + return &TTSOpsVirtual; + + return planstate->ps_ResultTupleSlot->tts_ops; +} + + +/* ---------------- + * ExecAssignProjectionInfo + * + * forms the projection information from the node's targetlist + * + * Notes for inputDesc are same as for ExecBuildProjectionInfo: supply it + * for a relation-scan node, can pass NULL for upper-level nodes + * ---------------- + */ +void +ExecAssignProjectionInfo(PlanState *planstate, + TupleDesc inputDesc) +{ + planstate->ps_ProjInfo = + ExecBuildProjectionInfo(planstate->plan->targetlist, + planstate->ps_ExprContext, + planstate->ps_ResultTupleSlot, + planstate, + inputDesc); +} + + +/* ---------------- + * ExecConditionalAssignProjectionInfo + * + * as ExecAssignProjectionInfo, but store NULL rather than building projection + * info if no projection is required + * ---------------- + */ +void +ExecConditionalAssignProjectionInfo(PlanState *planstate, TupleDesc inputDesc, + Index varno) +{ + if (tlist_matches_tupdesc(planstate, + planstate->plan->targetlist, + varno, + inputDesc)) + { + planstate->ps_ProjInfo = NULL; + planstate->resultopsset = planstate->scanopsset; + planstate->resultopsfixed = planstate->scanopsfixed; + planstate->resultops = planstate->scanops; + } + else + { + if (!planstate->ps_ResultTupleSlot) + { + ExecInitResultSlot(planstate, &TTSOpsVirtual); + planstate->resultops = &TTSOpsVirtual; + planstate->resultopsfixed = true; + planstate->resultopsset = true; + } + ExecAssignProjectionInfo(planstate, inputDesc); + } +} + +static bool +tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc) +{ + int numattrs = tupdesc->natts; + int attrno; + ListCell *tlist_item = list_head(tlist); + + /* Check the tlist attributes */ + for (attrno = 1; attrno <= numattrs; attrno++) + { + Form_pg_attribute att_tup = TupleDescAttr(tupdesc, attrno - 1); + Var *var; + + if (tlist_item == NULL) + return false; /* tlist too short */ + var = (Var *) ((TargetEntry *) lfirst(tlist_item))->expr; + if (!var || !IsA(var, Var)) + return false; /* tlist item not a Var */ + /* if these Asserts fail, planner messed up */ + Assert(var->varno == varno); + Assert(var->varlevelsup == 0); + if (var->varattno != attrno) + return false; /* out of order */ + if (att_tup->attisdropped) + return false; /* table contains dropped columns */ + if (att_tup->atthasmissing) + return false; /* table contains cols with missing values */ + + /* + * Note: usually the Var's type should match the tupdesc exactly, but + * in situations involving unions of columns that have different + * typmods, the Var may have come from above the union and hence have + * typmod -1. 
This is a legitimate situation since the Var still + * describes the column, just not as exactly as the tupdesc does. We + * could change the planner to prevent it, but it'd then insert + * projection steps just to convert from specific typmod to typmod -1, + * which is pretty silly. + */ + if (var->vartype != att_tup->atttypid || + (var->vartypmod != att_tup->atttypmod && + var->vartypmod != -1)) + return false; /* type mismatch */ + + tlist_item = lnext(tlist, tlist_item); + } + + if (tlist_item) + return false; /* tlist too long */ + + return true; +} + +/* ---------------- + * ExecFreeExprContext + * + * A plan node's ExprContext should be freed explicitly during executor + * shutdown because there may be shutdown callbacks to call. (Other resources + * made by the above routines, such as projection info, don't need to be freed + * explicitly because they're just memory in the per-query memory context.) + * + * However ... there is no particular need to do it during ExecEndNode, + * because FreeExecutorState will free any remaining ExprContexts within + * the EState. Letting FreeExecutorState do it allows the ExprContexts to + * be freed in reverse order of creation, rather than order of creation as + * will happen if we delete them here, which saves O(N^2) work in the list + * cleanup inside FreeExprContext. + * ---------------- + */ +void +ExecFreeExprContext(PlanState *planstate) +{ + /* + * Per above discussion, don't actually delete the ExprContext. We do + * unlink it from the plan node, though. + */ + planstate->ps_ExprContext = NULL; +} + + +/* ---------------------------------------------------------------- + * Scan node support + * ---------------------------------------------------------------- + */ + +/* ---------------- + * ExecAssignScanType + * ---------------- + */ +void +ExecAssignScanType(ScanState *scanstate, TupleDesc tupDesc) +{ + TupleTableSlot *slot = scanstate->ss_ScanTupleSlot; + + ExecSetSlotDescriptor(slot, tupDesc); +} + +/* ---------------- + * ExecCreateScanSlotFromOuterPlan + * ---------------- + */ +void +ExecCreateScanSlotFromOuterPlan(EState *estate, + ScanState *scanstate, + const TupleTableSlotOps *tts_ops) +{ + PlanState *outerPlan; + TupleDesc tupDesc; + + outerPlan = outerPlanState(scanstate); + tupDesc = ExecGetResultType(outerPlan); + + ExecInitScanTupleSlot(estate, scanstate, tupDesc, tts_ops); +} + +/* ---------------------------------------------------------------- + * ExecRelationIsTargetRelation + * + * Detect whether a relation (identified by rangetable index) + * is one of the target relations of the query. + * + * Note: This is currently no longer used in core. We keep it around + * because FDWs may wish to use it to determine if their foreign table + * is a target relation. + * ---------------------------------------------------------------- + */ +bool +ExecRelationIsTargetRelation(EState *estate, Index scanrelid) +{ + return list_member_int(estate->es_plannedstmt->resultRelations, scanrelid); +} + +/* ---------------------------------------------------------------- + * ExecOpenScanRelation + * + * Open the heap relation to be scanned by a base-level scan plan node. + * This should be called during the node's ExecInit routine. + * ---------------------------------------------------------------- + */ +Relation +ExecOpenScanRelation(EState *estate, Index scanrelid, int eflags) +{ + Relation rel; + + /* Open the relation. 
*/ + rel = ExecGetRangeTableRelation(estate, scanrelid); + + /* + * Complain if we're attempting a scan of an unscannable relation, except + * when the query won't actually be run. This is a slightly klugy place + * to do this, perhaps, but there is no better place. + */ + if ((eflags & (EXEC_FLAG_EXPLAIN_ONLY | EXEC_FLAG_WITH_NO_DATA)) == 0 && + !RelationIsScannable(rel)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("materialized view \"%s\" has not been populated", + RelationGetRelationName(rel)), + errhint("Use the REFRESH MATERIALIZED VIEW command."))); + + return rel; +} + +/* + * ExecInitRangeTable + * Set up executor's range-table-related data + * + * In addition to the range table proper, initialize arrays that are + * indexed by rangetable index. + */ +void +ExecInitRangeTable(EState *estate, List *rangeTable) +{ + /* Remember the range table List as-is */ + estate->es_range_table = rangeTable; + + /* Set size of associated arrays */ + estate->es_range_table_size = list_length(rangeTable); + + /* + * Allocate an array to store an open Relation corresponding to each + * rangetable entry, and initialize entries to NULL. Relations are opened + * and stored here as needed. + */ + estate->es_relations = (Relation *) + palloc0(estate->es_range_table_size * sizeof(Relation)); + + /* + * es_result_relations and es_rowmarks are also parallel to + * es_range_table, but are allocated only if needed. + */ + estate->es_result_relations = NULL; + estate->es_rowmarks = NULL; +} + +/* + * ExecGetRangeTableRelation + * Open the Relation for a range table entry, if not already done + * + * The Relations will be closed again in ExecEndPlan(). + */ +Relation +ExecGetRangeTableRelation(EState *estate, Index rti) +{ + Relation rel; + + Assert(rti > 0 && rti <= estate->es_range_table_size); + + rel = estate->es_relations[rti - 1]; + if (rel == NULL) + { + /* First time through, so open the relation */ + RangeTblEntry *rte = exec_rt_fetch(rti, estate); + + Assert(rte->rtekind == RTE_RELATION); + + if (!IsParallelWorker()) + { + /* + * In a normal query, we should already have the appropriate lock, + * but verify that through an Assert. Since there's already an + * Assert inside table_open that insists on holding some lock, it + * seems sufficient to check this only when rellockmode is higher + * than the minimum. + */ + rel = table_open(rte->relid, NoLock); + Assert(rte->rellockmode == AccessShareLock || + CheckRelationLockedByMe(rel, rte->rellockmode, false)); + } + else + { + /* + * If we are a parallel worker, we need to obtain our own local + * lock on the relation. This ensures sane behavior in case the + * parent process exits before we do. + */ + rel = table_open(rte->relid, rte->rellockmode); + } + + estate->es_relations[rti - 1] = rel; + } + + return rel; +} + +/* + * ExecInitResultRelation + * Open relation given by the passed-in RT index and fill its + * ResultRelInfo node + * + * Here, we also save the ResultRelInfo in estate->es_result_relations array + * such that it can be accessed later using the RT index. 
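+ *
+ * Hedged illustration of the expected call pattern (the loop variables
+ * are invented, not taken from any particular caller):
+ *
+ *		foreach(lc, node->resultRelations)
+ *		{
+ *			Index		rti = lfirst_int(lc);
+ *
+ *			ExecInitResultRelation(estate, resultRelInfo, rti);
+ *			resultRelInfo++;
+ *		}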
+ */ +void +ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, + Index rti) +{ + Relation resultRelationDesc; + + resultRelationDesc = ExecGetRangeTableRelation(estate, rti); + InitResultRelInfo(resultRelInfo, + resultRelationDesc, + rti, + NULL, + estate->es_instrument); + + if (estate->es_result_relations == NULL) + estate->es_result_relations = (ResultRelInfo **) + palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *)); + estate->es_result_relations[rti - 1] = resultRelInfo; + + /* + * Saving in the list allows to avoid needlessly traversing the whole + * array when only a few of its entries are possibly non-NULL. + */ + estate->es_opened_result_relations = + lappend(estate->es_opened_result_relations, resultRelInfo); +} + +/* + * UpdateChangedParamSet + * Add changed parameters to a plan node's chgParam set + */ +void +UpdateChangedParamSet(PlanState *node, Bitmapset *newchg) +{ + Bitmapset *parmset; + + /* + * The plan node only depends on params listed in its allParam set. Don't + * include anything else into its chgParam set. + */ + parmset = bms_intersect(node->plan->allParam, newchg); + + /* + * Keep node->chgParam == NULL if there's not actually any members; this + * allows the simplest possible tests in executor node files. + */ + if (!bms_is_empty(parmset)) + node->chgParam = bms_join(node->chgParam, parmset); + else + bms_free(parmset); +} + +/* + * executor_errposition + * Report an execution-time cursor position, if possible. + * + * This is expected to be used within an ereport() call. The return value + * is a dummy (always 0, in fact). + * + * The locations stored in parsetrees are byte offsets into the source string. + * We have to convert them to 1-based character indexes for reporting to + * clients. (We do things this way to avoid unnecessary overhead in the + * normal non-error case: computing character indexes would be much more + * expensive than storing token offsets.) + */ +int +executor_errposition(EState *estate, int location) +{ + int pos; + + /* No-op if location was not provided */ + if (location < 0) + return 0; + /* Can't do anything if source text is not available */ + if (estate == NULL || estate->es_sourceText == NULL) + return 0; + /* Convert offset to character number */ + pos = pg_mbstrlen_with_len(estate->es_sourceText, location) + 1; + /* And pass it to the ereport mechanism */ + return errposition(pos); +} + +/* + * Register a shutdown callback in an ExprContext. + * + * Shutdown callbacks will be called (in reverse order of registration) + * when the ExprContext is deleted or rescanned. This provides a hook + * for functions called in the context to do any cleanup needed --- it's + * particularly useful for functions returning sets. Note that the + * callback will *not* be called in the event that execution is aborted + * by an error. + */ +void +RegisterExprContextCallback(ExprContext *econtext, + ExprContextCallbackFunction function, + Datum arg) +{ + ExprContext_CB *ecxt_callback; + + /* Save the info in appropriate memory context */ + ecxt_callback = (ExprContext_CB *) + MemoryContextAlloc(econtext->ecxt_per_query_memory, + sizeof(ExprContext_CB)); + + ecxt_callback->function = function; + ecxt_callback->arg = arg; + + /* link to front of list for appropriate execution order */ + ecxt_callback->next = econtext->ecxt_callbacks; + econtext->ecxt_callbacks = ecxt_callback; +} + +/* + * Deregister a shutdown callback in an ExprContext. + * + * Any list entries matching the function and arg will be removed. 
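+ *
+ * A hedged pairing sketch (the state struct, tuplestore, and function
+ * names are invented for illustration): a set-returning C function
+ * typically registers a cleanup callback and, if it finishes early,
+ * removes it again with the function below:
+ *
+ *		static void
+ *		my_srf_cleanup(Datum arg)
+ *		{
+ *			MyState    *state = (MyState *) DatumGetPointer(arg);
+ *
+ *			tuplestore_end(state->tstore);
+ *		}
+ *
+ *		RegisterExprContextCallback(econtext, my_srf_cleanup,
+ *									PointerGetDatum(state));
+ *		...
+ *		UnregisterExprContextCallback(econtext, my_srf_cleanup,
+ *									  PointerGetDatum(state));
+ *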
+ * This can be used if it's no longer necessary to call the callback. + */ +void +UnregisterExprContextCallback(ExprContext *econtext, + ExprContextCallbackFunction function, + Datum arg) +{ + ExprContext_CB **prev_callback; + ExprContext_CB *ecxt_callback; + + prev_callback = &econtext->ecxt_callbacks; + + while ((ecxt_callback = *prev_callback) != NULL) + { + if (ecxt_callback->function == function && ecxt_callback->arg == arg) + { + *prev_callback = ecxt_callback->next; + pfree(ecxt_callback); + } + else + prev_callback = &ecxt_callback->next; + } +} + +/* + * Call all the shutdown callbacks registered in an ExprContext. + * + * The callback list is emptied (important in case this is only a rescan + * reset, and not deletion of the ExprContext). + * + * If isCommit is false, just clean the callback list but don't call 'em. + * (See comment for FreeExprContext.) + */ +static void +ShutdownExprContext(ExprContext *econtext, bool isCommit) +{ + ExprContext_CB *ecxt_callback; + MemoryContext oldcontext; + + /* Fast path in normal case where there's nothing to do. */ + if (econtext->ecxt_callbacks == NULL) + return; + + /* + * Call the callbacks in econtext's per-tuple context. This ensures that + * any memory they might leak will get cleaned up. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Call each callback function in reverse registration order. + */ + while ((ecxt_callback = econtext->ecxt_callbacks) != NULL) + { + econtext->ecxt_callbacks = ecxt_callback->next; + if (isCommit) + ecxt_callback->function(ecxt_callback->arg); + pfree(ecxt_callback); + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * GetAttributeByName + * GetAttributeByNum + * + * These functions return the value of the requested attribute + * out of the given tuple Datum. + * C functions which take a tuple as an argument are expected + * to use these. Ex: overpaid(EMP) might call GetAttributeByNum(). + * Note: these are actually rather slow because they do a typcache + * lookup on each call. + */ +Datum +GetAttributeByName(HeapTupleHeader tuple, const char *attname, bool *isNull) +{ + AttrNumber attrno; + Datum result; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + HeapTupleData tmptup; + int i; + + if (attname == NULL) + elog(ERROR, "invalid attribute name"); + + if (isNull == NULL) + elog(ERROR, "a NULL isNull pointer was passed"); + + if (tuple == NULL) + { + /* Kinda bogus but compatible with old behavior... */ + *isNull = true; + return (Datum) 0; + } + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + tupDesc = lookup_rowtype_tupdesc(tupType, tupTypmod); + + attrno = InvalidAttrNumber; + for (i = 0; i < tupDesc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(tupDesc, i); + + if (namestrcmp(&(att->attname), attname) == 0) + { + attrno = att->attnum; + break; + } + } + + if (attrno == InvalidAttrNumber) + elog(ERROR, "attribute \"%s\" does not exist", attname); + + /* + * heap_getattr needs a HeapTuple not a bare HeapTupleHeader. We set all + * the fields in the struct just in case user tries to inspect system + * columns. 
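+ *
+ * Caller-side sketch, echoing the overpaid() example mentioned above
+ * (adapted from the documentation's pattern, not code in this file; the
+ * column name and threshold are hypothetical):
+ *
+ *		Datum
+ *		c_overpaid(PG_FUNCTION_ARGS)
+ *		{
+ *			HeapTupleHeader t = PG_GETARG_HEAPTUPLEHEADER(0);
+ *			int32		limit = PG_GETARG_INT32(1);
+ *			bool		isnull;
+ *			Datum		salary;
+ *
+ *			salary = GetAttributeByName(t, "salary", &isnull);
+ *			if (isnull)
+ *				PG_RETURN_BOOL(false);
+ *			PG_RETURN_BOOL(DatumGetInt32(salary) > limit);
+ *		}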
+ */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tuple; + + result = heap_getattr(&tmptup, + attrno, + tupDesc, + isNull); + + ReleaseTupleDesc(tupDesc); + + return result; +} + +Datum +GetAttributeByNum(HeapTupleHeader tuple, + AttrNumber attrno, + bool *isNull) +{ + Datum result; + Oid tupType; + int32 tupTypmod; + TupleDesc tupDesc; + HeapTupleData tmptup; + + if (!AttributeNumberIsValid(attrno)) + elog(ERROR, "invalid attribute number %d", attrno); + + if (isNull == NULL) + elog(ERROR, "a NULL isNull pointer was passed"); + + if (tuple == NULL) + { + /* Kinda bogus but compatible with old behavior... */ + *isNull = true; + return (Datum) 0; + } + + tupType = HeapTupleHeaderGetTypeId(tuple); + tupTypmod = HeapTupleHeaderGetTypMod(tuple); + tupDesc = lookup_rowtype_tupdesc(tupType, tupTypmod); + + /* + * heap_getattr needs a HeapTuple not a bare HeapTupleHeader. We set all + * the fields in the struct just in case user tries to inspect system + * columns. + */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); + ItemPointerSetInvalid(&(tmptup.t_self)); + tmptup.t_tableOid = InvalidOid; + tmptup.t_data = tuple; + + result = heap_getattr(&tmptup, + attrno, + tupDesc, + isNull); + + ReleaseTupleDesc(tupDesc); + + return result; +} + +/* + * Number of items in a tlist (including any resjunk items!) + */ +int +ExecTargetListLength(List *targetlist) +{ + /* This used to be more complex, but fjoins are dead */ + return list_length(targetlist); +} + +/* + * Number of items in a tlist, not including any resjunk items + */ +int +ExecCleanTargetListLength(List *targetlist) +{ + int len = 0; + ListCell *tl; + + foreach(tl, targetlist) + { + TargetEntry *curTle = lfirst_node(TargetEntry, tl); + + if (!curTle->resjunk) + len++; + } + return len; +} + +/* + * Return a relInfo's tuple slot for a trigger's OLD tuples. + */ +TupleTableSlot * +ExecGetTriggerOldSlot(EState *estate, ResultRelInfo *relInfo) +{ + if (relInfo->ri_TrigOldSlot == NULL) + { + Relation rel = relInfo->ri_RelationDesc; + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + relInfo->ri_TrigOldSlot = + ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel), + table_slot_callbacks(rel)); + + MemoryContextSwitchTo(oldcontext); + } + + return relInfo->ri_TrigOldSlot; +} + +/* + * Return a relInfo's tuple slot for a trigger's NEW tuples. + */ +TupleTableSlot * +ExecGetTriggerNewSlot(EState *estate, ResultRelInfo *relInfo) +{ + if (relInfo->ri_TrigNewSlot == NULL) + { + Relation rel = relInfo->ri_RelationDesc; + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + relInfo->ri_TrigNewSlot = + ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel), + table_slot_callbacks(rel)); + + MemoryContextSwitchTo(oldcontext); + } + + return relInfo->ri_TrigNewSlot; +} + +/* + * Return a relInfo's tuple slot for processing returning tuples. 
+ */ +TupleTableSlot * +ExecGetReturningSlot(EState *estate, ResultRelInfo *relInfo) +{ + if (relInfo->ri_ReturningSlot == NULL) + { + Relation rel = relInfo->ri_RelationDesc; + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + relInfo->ri_ReturningSlot = + ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel), + table_slot_callbacks(rel)); + + MemoryContextSwitchTo(oldcontext); + } + + return relInfo->ri_ReturningSlot; +} + +/* + * Return the map needed to convert given child result relation's tuples to + * the rowtype of the query's main target ("root") relation. Note that a + * NULL result is valid and means that no conversion is needed. + */ +TupleConversionMap * +ExecGetChildToRootMap(ResultRelInfo *resultRelInfo) +{ + /* If we didn't already do so, compute the map for this child. */ + if (!resultRelInfo->ri_ChildToRootMapValid) + { + ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo; + + if (rootRelInfo) + resultRelInfo->ri_ChildToRootMap = + convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc), + RelationGetDescr(rootRelInfo->ri_RelationDesc)); + else /* this isn't a child result rel */ + resultRelInfo->ri_ChildToRootMap = NULL; + + resultRelInfo->ri_ChildToRootMapValid = true; + } + + return resultRelInfo->ri_ChildToRootMap; +} + +/* Return a bitmap representing columns being inserted */ +Bitmapset * +ExecGetInsertedCols(ResultRelInfo *relinfo, EState *estate) +{ + /* + * The columns are stored in the range table entry. If this ResultRelInfo + * represents a partition routing target, and doesn't have an entry of its + * own in the range table, fetch the parent's RTE and map the columns to + * the order they are in the partition. + */ + if (relinfo->ri_RangeTableIndex != 0) + { + RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate); + + return rte->insertedCols; + } + else if (relinfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo; + RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate); + + if (relinfo->ri_RootToPartitionMap != NULL) + return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap, + rte->insertedCols); + else + return rte->insertedCols; + } + else + { + /* + * The relation isn't in the range table and it isn't a partition + * routing target. This ResultRelInfo must've been created only for + * firing triggers and the relation is not being inserted into. (See + * ExecGetTriggerResultRel.) 
+ */ + return NULL; + } +} + +/* Return a bitmap representing columns being updated */ +Bitmapset * +ExecGetUpdatedCols(ResultRelInfo *relinfo, EState *estate) +{ + /* see ExecGetInsertedCols() */ + if (relinfo->ri_RangeTableIndex != 0) + { + RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate); + + return rte->updatedCols; + } + else if (relinfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo; + RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate); + + if (relinfo->ri_RootToPartitionMap != NULL) + return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap, + rte->updatedCols); + else + return rte->updatedCols; + } + else + return NULL; +} + +/* Return a bitmap representing generated columns being updated */ +Bitmapset * +ExecGetExtraUpdatedCols(ResultRelInfo *relinfo, EState *estate) +{ + /* see ExecGetInsertedCols() */ + if (relinfo->ri_RangeTableIndex != 0) + { + RangeTblEntry *rte = exec_rt_fetch(relinfo->ri_RangeTableIndex, estate); + + return rte->extraUpdatedCols; + } + else if (relinfo->ri_RootResultRelInfo) + { + ResultRelInfo *rootRelInfo = relinfo->ri_RootResultRelInfo; + RangeTblEntry *rte = exec_rt_fetch(rootRelInfo->ri_RangeTableIndex, estate); + + if (relinfo->ri_RootToPartitionMap != NULL) + return execute_attr_map_cols(relinfo->ri_RootToPartitionMap->attrMap, + rte->extraUpdatedCols); + else + return rte->extraUpdatedCols; + } + else + return NULL; +} + +/* Return columns being updated, including generated columns */ +Bitmapset * +ExecGetAllUpdatedCols(ResultRelInfo *relinfo, EState *estate) +{ + return bms_union(ExecGetUpdatedCols(relinfo, estate), + ExecGetExtraUpdatedCols(relinfo, estate)); +} diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c new file mode 100644 index 0000000..296e54e --- /dev/null +++ b/src/backend/executor/functions.c @@ -0,0 +1,2103 @@ +/*------------------------------------------------------------------------- + * + * functions.c + * Execution of SQL-language functions + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/functions.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "executor/functions.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "parser/parse_coerce.h" +#include "parser/parse_collate.h" +#include "parser/parse_func.h" +#include "rewrite/rewriteHandler.h" +#include "storage/proc.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + + +/* + * Specialized DestReceiver for collecting query output in a SQL function + */ +typedef struct +{ + DestReceiver pub; /* publicly-known function pointers */ + Tuplestorestate *tstore; /* where to put result tuples */ + MemoryContext cxt; /* context containing tstore */ + JunkFilter *filter; /* filter to convert tuple type */ +} DR_sqlfunction; + +/* + * We have an execution_state record for each query in a function. Each + * record contains a plantree for its query. 
If the query is currently in + * F_EXEC_RUN state then there's a QueryDesc too. + * + * The "next" fields chain together all the execution_state records generated + * from a single original parsetree. (There will only be more than one in + * case of rule expansion of the original parsetree.) + */ +typedef enum +{ + F_EXEC_START, F_EXEC_RUN, F_EXEC_DONE +} ExecStatus; + +typedef struct execution_state +{ + struct execution_state *next; + ExecStatus status; + bool setsResult; /* true if this query produces func's result */ + bool lazyEval; /* true if should fetch one row at a time */ + PlannedStmt *stmt; /* plan for this query */ + QueryDesc *qd; /* null unless status == RUN */ +} execution_state; + + +/* + * An SQLFunctionCache record is built during the first call, + * and linked to from the fn_extra field of the FmgrInfo struct. + * + * Note that currently this has only the lifespan of the calling query. + * Someday we should rewrite this code to use plancache.c to save parse/plan + * results for longer than that. + * + * Physically, though, the data has the lifespan of the FmgrInfo that's used + * to call the function, and there are cases (particularly with indexes) + * where the FmgrInfo might survive across transactions. We cannot assume + * that the parse/plan trees are good for longer than the (sub)transaction in + * which parsing was done, so we must mark the record with the LXID/subxid of + * its creation time, and regenerate everything if that's obsolete. To avoid + * memory leakage when we do have to regenerate things, all the data is kept + * in a sub-context of the FmgrInfo's fn_mcxt. + */ +typedef struct +{ + char *fname; /* function name (for error msgs) */ + char *src; /* function body text (for error msgs) */ + + SQLFunctionParseInfoPtr pinfo; /* data for parser callback hooks */ + + Oid rettype; /* actual return type */ + int16 typlen; /* length of the return type */ + bool typbyval; /* true if return type is pass by value */ + bool returnsSet; /* true if returning multiple rows */ + bool returnsTuple; /* true if returning whole tuple result */ + bool shutdown_reg; /* true if registered shutdown callback */ + bool readonly_func; /* true to run in "read only" mode */ + bool lazyEval; /* true if using lazyEval for result query */ + + ParamListInfo paramLI; /* Param list representing current args */ + + Tuplestorestate *tstore; /* where we accumulate result tuples */ + + JunkFilter *junkFilter; /* will be NULL if function returns VOID */ + + /* + * func_state is a List of execution_state records, each of which is the + * first for its original parsetree, with any additional records chained + * to it via the "next" fields. This sublist structure is needed to keep + * track of where the original query boundaries are. 
+ */ + List *func_state; + + MemoryContext fcontext; /* memory context holding this struct and all + * subsidiary data */ + + LocalTransactionId lxid; /* lxid in which cache was made */ + SubTransactionId subxid; /* subxid in which cache was made */ +} SQLFunctionCache; + +typedef SQLFunctionCache *SQLFunctionCachePtr; + + +/* non-export function prototypes */ +static Node *sql_fn_param_ref(ParseState *pstate, ParamRef *pref); +static Node *sql_fn_post_column_ref(ParseState *pstate, + ColumnRef *cref, Node *var); +static Node *sql_fn_make_param(SQLFunctionParseInfoPtr pinfo, + int paramno, int location); +static Node *sql_fn_resolve_param_name(SQLFunctionParseInfoPtr pinfo, + const char *paramname, int location); +static List *init_execution_state(List *queryTree_list, + SQLFunctionCachePtr fcache, + bool lazyEvalOK); +static void init_sql_fcache(FunctionCallInfo fcinfo, Oid collation, bool lazyEvalOK); +static void postquel_start(execution_state *es, SQLFunctionCachePtr fcache); +static bool postquel_getnext(execution_state *es, SQLFunctionCachePtr fcache); +static void postquel_end(execution_state *es); +static void postquel_sub_params(SQLFunctionCachePtr fcache, + FunctionCallInfo fcinfo); +static Datum postquel_get_single_result(TupleTableSlot *slot, + FunctionCallInfo fcinfo, + SQLFunctionCachePtr fcache, + MemoryContext resultcontext); +static void sql_exec_error_callback(void *arg); +static void ShutdownSQLFunction(Datum arg); +static bool coerce_fn_result_column(TargetEntry *src_tle, + Oid res_type, int32 res_typmod, + bool tlist_is_modifiable, + List **upper_tlist, + bool *upper_tlist_nontrivial); +static void sqlfunction_startup(DestReceiver *self, int operation, TupleDesc typeinfo); +static bool sqlfunction_receive(TupleTableSlot *slot, DestReceiver *self); +static void sqlfunction_shutdown(DestReceiver *self); +static void sqlfunction_destroy(DestReceiver *self); + + +/* + * Prepare the SQLFunctionParseInfo struct for parsing a SQL function body + * + * This includes resolving actual types of polymorphic arguments. + * + * call_expr can be passed as NULL, but then we will fail if there are any + * polymorphic arguments. + */ +SQLFunctionParseInfoPtr +prepare_sql_fn_parse_info(HeapTuple procedureTuple, + Node *call_expr, + Oid inputCollation) +{ + SQLFunctionParseInfoPtr pinfo; + Form_pg_proc procedureStruct = (Form_pg_proc) GETSTRUCT(procedureTuple); + int nargs; + + pinfo = (SQLFunctionParseInfoPtr) palloc0(sizeof(SQLFunctionParseInfo)); + + /* Function's name (only) can be used to qualify argument names */ + pinfo->fname = pstrdup(NameStr(procedureStruct->proname)); + + /* Save the function's input collation */ + pinfo->collation = inputCollation; + + /* + * Copy input argument types from the pg_proc entry, then resolve any + * polymorphic types. 
+ */ + pinfo->nargs = nargs = procedureStruct->pronargs; + if (nargs > 0) + { + Oid *argOidVect; + int argnum; + + argOidVect = (Oid *) palloc(nargs * sizeof(Oid)); + memcpy(argOidVect, + procedureStruct->proargtypes.values, + nargs * sizeof(Oid)); + + for (argnum = 0; argnum < nargs; argnum++) + { + Oid argtype = argOidVect[argnum]; + + if (IsPolymorphicType(argtype)) + { + argtype = get_call_expr_argtype(call_expr, argnum); + if (argtype == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("could not determine actual type of argument declared %s", + format_type_be(argOidVect[argnum])))); + argOidVect[argnum] = argtype; + } + } + + pinfo->argtypes = argOidVect; + } + + /* + * Collect names of arguments, too, if any + */ + if (nargs > 0) + { + Datum proargnames; + Datum proargmodes; + int n_arg_names; + bool isNull; + + proargnames = SysCacheGetAttr(PROCNAMEARGSNSP, procedureTuple, + Anum_pg_proc_proargnames, + &isNull); + if (isNull) + proargnames = PointerGetDatum(NULL); /* just to be sure */ + + proargmodes = SysCacheGetAttr(PROCNAMEARGSNSP, procedureTuple, + Anum_pg_proc_proargmodes, + &isNull); + if (isNull) + proargmodes = PointerGetDatum(NULL); /* just to be sure */ + + n_arg_names = get_func_input_arg_names(proargnames, proargmodes, + &pinfo->argnames); + + /* Paranoia: ignore the result if too few array entries */ + if (n_arg_names < nargs) + pinfo->argnames = NULL; + } + else + pinfo->argnames = NULL; + + return pinfo; +} + +/* + * Parser setup hook for parsing a SQL function body. + */ +void +sql_fn_parser_setup(struct ParseState *pstate, SQLFunctionParseInfoPtr pinfo) +{ + pstate->p_pre_columnref_hook = NULL; + pstate->p_post_columnref_hook = sql_fn_post_column_ref; + pstate->p_paramref_hook = sql_fn_param_ref; + /* no need to use p_coerce_param_hook */ + pstate->p_ref_hook_state = (void *) pinfo; +} + +/* + * sql_fn_post_column_ref parser callback for ColumnRefs + */ +static Node * +sql_fn_post_column_ref(ParseState *pstate, ColumnRef *cref, Node *var) +{ + SQLFunctionParseInfoPtr pinfo = (SQLFunctionParseInfoPtr) pstate->p_ref_hook_state; + int nnames; + Node *field1; + Node *subfield = NULL; + const char *name1; + const char *name2 = NULL; + Node *param; + + /* + * Never override a table-column reference. This corresponds to + * considering the parameter names to appear in a scope outside the + * individual SQL commands, which is what we want. + */ + if (var != NULL) + return NULL; + + /*---------- + * The allowed syntaxes are: + * + * A A = parameter name + * A.B A = function name, B = parameter name + * OR: A = record-typed parameter name, B = field name + * (the first possibility takes precedence) + * A.B.C A = function name, B = record-typed parameter name, + * C = field name + * A.* Whole-row reference to composite parameter A. + * A.B.* Same, with A = function name, B = parameter name + * + * Here, it's sufficient to ignore the "*" in the last two cases --- the + * main parser will take care of expanding the whole-row reference. + *---------- + */ + nnames = list_length(cref->fields); + + if (nnames > 3) + return NULL; + + if (IsA(llast(cref->fields), A_Star)) + nnames--; + + field1 = (Node *) linitial(cref->fields); + Assert(IsA(field1, String)); + name1 = strVal(field1); + if (nnames > 1) + { + subfield = (Node *) lsecond(cref->fields); + Assert(IsA(subfield, String)); + name2 = strVal(subfield); + } + + if (nnames == 3) + { + /* + * Three-part name: if the first part doesn't match the function name, + * we can fail immediately. 
Otherwise, look up the second part, and + * take the third part to be a field reference. + */ + if (strcmp(name1, pinfo->fname) != 0) + return NULL; + + param = sql_fn_resolve_param_name(pinfo, name2, cref->location); + + subfield = (Node *) lthird(cref->fields); + Assert(IsA(subfield, String)); + } + else if (nnames == 2 && strcmp(name1, pinfo->fname) == 0) + { + /* + * Two-part name with first part matching function name: first see if + * second part matches any parameter name. + */ + param = sql_fn_resolve_param_name(pinfo, name2, cref->location); + + if (param) + { + /* Yes, so this is a parameter reference, no subfield */ + subfield = NULL; + } + else + { + /* No, so try to match as parameter name and subfield */ + param = sql_fn_resolve_param_name(pinfo, name1, cref->location); + } + } + else + { + /* Single name, or parameter name followed by subfield */ + param = sql_fn_resolve_param_name(pinfo, name1, cref->location); + } + + if (!param) + return NULL; /* No match */ + + if (subfield) + { + /* + * Must be a reference to a field of a composite parameter; otherwise + * ParseFuncOrColumn will return NULL, and we'll fail back at the + * caller. + */ + param = ParseFuncOrColumn(pstate, + list_make1(subfield), + list_make1(param), + pstate->p_last_srf, + NULL, + false, + cref->location); + } + + return param; +} + +/* + * sql_fn_param_ref parser callback for ParamRefs ($n symbols) + */ +static Node * +sql_fn_param_ref(ParseState *pstate, ParamRef *pref) +{ + SQLFunctionParseInfoPtr pinfo = (SQLFunctionParseInfoPtr) pstate->p_ref_hook_state; + int paramno = pref->number; + + /* Check parameter number is valid */ + if (paramno <= 0 || paramno > pinfo->nargs) + return NULL; /* unknown parameter number */ + + return sql_fn_make_param(pinfo, paramno, pref->location); +} + +/* + * sql_fn_make_param construct a Param node for the given paramno + */ +static Node * +sql_fn_make_param(SQLFunctionParseInfoPtr pinfo, + int paramno, int location) +{ + Param *param; + + param = makeNode(Param); + param->paramkind = PARAM_EXTERN; + param->paramid = paramno; + param->paramtype = pinfo->argtypes[paramno - 1]; + param->paramtypmod = -1; + param->paramcollid = get_typcollation(param->paramtype); + param->location = location; + + /* + * If we have a function input collation, allow it to override the + * type-derived collation for parameter symbols. (XXX perhaps this should + * not happen if the type collation is not default?) + */ + if (OidIsValid(pinfo->collation) && OidIsValid(param->paramcollid)) + param->paramcollid = pinfo->collation; + + return (Node *) param; +} + +/* + * Search for a function parameter of the given name; if there is one, + * construct and return a Param node for it. If not, return NULL. + * Helper function for sql_fn_post_column_ref. + */ +static Node * +sql_fn_resolve_param_name(SQLFunctionParseInfoPtr pinfo, + const char *paramname, int location) +{ + int i; + + if (pinfo->argnames == NULL) + return NULL; + + for (i = 0; i < pinfo->nargs; i++) + { + if (pinfo->argnames[i] && strcmp(pinfo->argnames[i], paramname) == 0) + return sql_fn_make_param(pinfo, i + 1, location); + } + + return NULL; +} + +/* + * Set up the per-query execution_state records for a SQL function. + * + * The input is a List of Lists of parsed and rewritten, but not planned, + * querytrees. The sublist structure denotes the original query boundaries. 
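+ *
+ * For example (purely illustrative): a two-statement function body whose
+ * second statement is expanded by a rewrite rule into two queries arrives
+ * here as list_make2(list_make1(q1), list_make2(q2a, q2b)) and yields two
+ * execution_state chains, the second one two records long.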
+ */ +static List * +init_execution_state(List *queryTree_list, + SQLFunctionCachePtr fcache, + bool lazyEvalOK) +{ + List *eslist = NIL; + execution_state *lasttages = NULL; + ListCell *lc1; + + foreach(lc1, queryTree_list) + { + List *qtlist = lfirst_node(List, lc1); + execution_state *firstes = NULL; + execution_state *preves = NULL; + ListCell *lc2; + + foreach(lc2, qtlist) + { + Query *queryTree = lfirst_node(Query, lc2); + PlannedStmt *stmt; + execution_state *newes; + + /* Plan the query if needed */ + if (queryTree->commandType == CMD_UTILITY) + { + /* Utility commands require no planning. */ + stmt = makeNode(PlannedStmt); + stmt->commandType = CMD_UTILITY; + stmt->canSetTag = queryTree->canSetTag; + stmt->utilityStmt = queryTree->utilityStmt; + stmt->stmt_location = queryTree->stmt_location; + stmt->stmt_len = queryTree->stmt_len; + } + else + stmt = pg_plan_query(queryTree, + fcache->src, + CURSOR_OPT_PARALLEL_OK, + NULL); + + /* + * Precheck all commands for validity in a function. This should + * generally match the restrictions spi.c applies. + */ + if (stmt->commandType == CMD_UTILITY) + { + if (IsA(stmt->utilityStmt, CopyStmt) && + ((CopyStmt *) stmt->utilityStmt)->filename == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot COPY to/from client in an SQL function"))); + + if (IsA(stmt->utilityStmt, TransactionStmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in an SQL function", + CreateCommandName(stmt->utilityStmt)))); + } + + if (fcache->readonly_func && !CommandIsReadOnly(stmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in a non-volatile function", + CreateCommandName((Node *) stmt)))); + + /* OK, build the execution_state for this query */ + newes = (execution_state *) palloc(sizeof(execution_state)); + if (preves) + preves->next = newes; + else + firstes = newes; + + newes->next = NULL; + newes->status = F_EXEC_START; + newes->setsResult = false; /* might change below */ + newes->lazyEval = false; /* might change below */ + newes->stmt = stmt; + newes->qd = NULL; + + if (queryTree->canSetTag) + lasttages = newes; + + preves = newes; + } + + eslist = lappend(eslist, firstes); + } + + /* + * Mark the last canSetTag query as delivering the function result; then, + * if it is a plain SELECT, mark it for lazy evaluation. If it's not a + * SELECT we must always run it to completion. + * + * Note: at some point we might add additional criteria for whether to use + * lazy eval. However, we should prefer to use it whenever the function + * doesn't return set, since fetching more than one row is useless in that + * case. + * + * Note: don't set setsResult if the function returns VOID, as evidenced + * by not having made a junkfilter. This ensures we'll throw away any + * output from the last statement in such a function. 
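+ *
+ * For example, a function whose final statement is INSERT ... RETURNING
+ * is always run to completion, with its RETURNING rows collected in the
+ * tuplestore, whereas a final plain SELECT (without a modifying CTE) can
+ * be fetched one row at a time whenever the caller permits lazy eval.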
+ */ + if (lasttages && fcache->junkFilter) + { + lasttages->setsResult = true; + if (lazyEvalOK && + lasttages->stmt->commandType == CMD_SELECT && + !lasttages->stmt->hasModifyingCTE) + fcache->lazyEval = lasttages->lazyEval = true; + } + + return eslist; +} + +/* + * Initialize the SQLFunctionCache for a SQL function + */ +static void +init_sql_fcache(FunctionCallInfo fcinfo, Oid collation, bool lazyEvalOK) +{ + FmgrInfo *finfo = fcinfo->flinfo; + Oid foid = finfo->fn_oid; + MemoryContext fcontext; + MemoryContext oldcontext; + Oid rettype; + TupleDesc rettupdesc; + HeapTuple procedureTuple; + Form_pg_proc procedureStruct; + SQLFunctionCachePtr fcache; + List *queryTree_list; + List *resulttlist; + ListCell *lc; + Datum tmp; + bool isNull; + + /* + * Create memory context that holds all the SQLFunctionCache data. It + * must be a child of whatever context holds the FmgrInfo. + */ + fcontext = AllocSetContextCreate(finfo->fn_mcxt, + "SQL function", + ALLOCSET_DEFAULT_SIZES); + + oldcontext = MemoryContextSwitchTo(fcontext); + + /* + * Create the struct proper, link it to fcontext and fn_extra. Once this + * is done, we'll be able to recover the memory after failure, even if the + * FmgrInfo is long-lived. + */ + fcache = (SQLFunctionCachePtr) palloc0(sizeof(SQLFunctionCache)); + fcache->fcontext = fcontext; + finfo->fn_extra = (void *) fcache; + + /* + * get the procedure tuple corresponding to the given function Oid + */ + procedureTuple = SearchSysCache1(PROCOID, ObjectIdGetDatum(foid)); + if (!HeapTupleIsValid(procedureTuple)) + elog(ERROR, "cache lookup failed for function %u", foid); + procedureStruct = (Form_pg_proc) GETSTRUCT(procedureTuple); + + /* + * copy function name immediately for use by error reporting callback, and + * for use as memory context identifier + */ + fcache->fname = pstrdup(NameStr(procedureStruct->proname)); + MemoryContextSetIdentifier(fcontext, fcache->fname); + + /* + * Resolve any polymorphism, obtaining the actual result type, and the + * corresponding tupdesc if it's a rowtype. + */ + (void) get_call_result_type(fcinfo, &rettype, &rettupdesc); + + fcache->rettype = rettype; + + /* Fetch the typlen and byval info for the result type */ + get_typlenbyval(rettype, &fcache->typlen, &fcache->typbyval); + + /* Remember whether we're returning setof something */ + fcache->returnsSet = procedureStruct->proretset; + + /* Remember if function is STABLE/IMMUTABLE */ + fcache->readonly_func = + (procedureStruct->provolatile != PROVOLATILE_VOLATILE); + + /* + * We need the actual argument types to pass to the parser. Also make + * sure that parameter symbols are considered to have the function's + * resolved input collation. + */ + fcache->pinfo = prepare_sql_fn_parse_info(procedureTuple, + finfo->fn_expr, + collation); + + /* + * And of course we need the function body text. + */ + tmp = SysCacheGetAttr(PROCOID, + procedureTuple, + Anum_pg_proc_prosrc, + &isNull); + if (isNull) + elog(ERROR, "null prosrc for function %u", foid); + fcache->src = TextDatumGetCString(tmp); + + /* If we have prosqlbody, pay attention to that not prosrc. */ + tmp = SysCacheGetAttr(PROCOID, + procedureTuple, + Anum_pg_proc_prosqlbody, + &isNull); + + /* + * Parse and rewrite the queries in the function text. Use sublists to + * keep track of the original query boundaries. + * + * Note: since parsing and planning is done in fcontext, we will generate + * a lot of cruft that lives as long as the fcache does. 
This is annoying + * but we'll not worry about it until the module is rewritten to use + * plancache.c. + */ + queryTree_list = NIL; + if (!isNull) + { + Node *n; + List *stored_query_list; + + n = stringToNode(TextDatumGetCString(tmp)); + if (IsA(n, List)) + stored_query_list = linitial_node(List, castNode(List, n)); + else + stored_query_list = list_make1(n); + + foreach(lc, stored_query_list) + { + Query *parsetree = lfirst_node(Query, lc); + List *queryTree_sublist; + + AcquireRewriteLocks(parsetree, true, false); + queryTree_sublist = pg_rewrite_query(parsetree); + queryTree_list = lappend(queryTree_list, queryTree_sublist); + } + } + else + { + List *raw_parsetree_list; + + raw_parsetree_list = pg_parse_query(fcache->src); + + foreach(lc, raw_parsetree_list) + { + RawStmt *parsetree = lfirst_node(RawStmt, lc); + List *queryTree_sublist; + + queryTree_sublist = pg_analyze_and_rewrite_params(parsetree, + fcache->src, + (ParserSetupHook) sql_fn_parser_setup, + fcache->pinfo, + NULL); + queryTree_list = lappend(queryTree_list, queryTree_sublist); + } + } + + /* + * Check that there are no statements we don't want to allow. + */ + check_sql_fn_statements(queryTree_list); + + /* + * Check that the function returns the type it claims to. Although in + * simple cases this was already done when the function was defined, we + * have to recheck because database objects used in the function's queries + * might have changed type. We'd have to recheck anyway if the function + * had any polymorphic arguments. Moreover, check_sql_fn_retval takes + * care of injecting any required column type coercions. (But we don't + * ask it to insert nulls for dropped columns; the junkfilter handles + * that.) + * + * Note: we set fcache->returnsTuple according to whether we are returning + * the whole tuple result or just a single column. In the latter case we + * clear returnsTuple because we need not act different from the scalar + * result case, even if it's a rowtype column. (However, we have to force + * lazy eval mode in that case; otherwise we'd need extra code to expand + * the rowtype column into multiple columns, since we have no way to + * notify the caller that it should do that.) + */ + fcache->returnsTuple = check_sql_fn_retval(queryTree_list, + rettype, + rettupdesc, + false, + &resulttlist); + + /* + * Construct a JunkFilter we can use to coerce the returned rowtype to the + * desired form, unless the result type is VOID, in which case there's + * nothing to coerce to. (XXX Frequently, the JunkFilter isn't doing + * anything very interesting, but much of this module expects it to be + * there anyway.) + */ + if (rettype != VOIDOID) + { + TupleTableSlot *slot = MakeSingleTupleTableSlot(NULL, + &TTSOpsMinimalTuple); + + /* + * If the result is composite, *and* we are returning the whole tuple + * result, we need to insert nulls for any dropped columns. In the + * single-column-result case, there might be dropped columns within + * the composite column value, but it's not our problem here. There + * should be no resjunk entries in resulttlist, so in the second case + * the JunkFilter is certainly a no-op. 
+ */ + if (rettupdesc && fcache->returnsTuple) + fcache->junkFilter = ExecInitJunkFilterConversion(resulttlist, + rettupdesc, + slot); + else + fcache->junkFilter = ExecInitJunkFilter(resulttlist, slot); + } + + if (fcache->returnsTuple) + { + /* Make sure output rowtype is properly blessed */ + BlessTupleDesc(fcache->junkFilter->jf_resultSlot->tts_tupleDescriptor); + } + else if (fcache->returnsSet && type_is_rowtype(fcache->rettype)) + { + /* + * Returning rowtype as if it were scalar --- materialize won't work. + * Right now it's sufficient to override any caller preference for + * materialize mode, but to add more smarts in init_execution_state + * about this, we'd probably need a three-way flag instead of bool. + */ + lazyEvalOK = true; + } + + /* Finally, plan the queries */ + fcache->func_state = init_execution_state(queryTree_list, + fcache, + lazyEvalOK); + + /* Mark fcache with time of creation to show it's valid */ + fcache->lxid = MyProc->lxid; + fcache->subxid = GetCurrentSubTransactionId(); + + ReleaseSysCache(procedureTuple); + + MemoryContextSwitchTo(oldcontext); +} + +/* Start up execution of one execution_state node */ +static void +postquel_start(execution_state *es, SQLFunctionCachePtr fcache) +{ + DestReceiver *dest; + + Assert(es->qd == NULL); + + /* Caller should have ensured a suitable snapshot is active */ + Assert(ActiveSnapshotSet()); + + /* + * If this query produces the function result, send its output to the + * tuplestore; else discard any output. + */ + if (es->setsResult) + { + DR_sqlfunction *myState; + + dest = CreateDestReceiver(DestSQLFunction); + /* pass down the needed info to the dest receiver routines */ + myState = (DR_sqlfunction *) dest; + Assert(myState->pub.mydest == DestSQLFunction); + myState->tstore = fcache->tstore; + myState->cxt = CurrentMemoryContext; + myState->filter = fcache->junkFilter; + } + else + dest = None_Receiver; + + es->qd = CreateQueryDesc(es->stmt, + fcache->src, + GetActiveSnapshot(), + InvalidSnapshot, + dest, + fcache->paramLI, + es->qd ? es->qd->queryEnv : NULL, + 0); + + /* Utility commands don't need Executor. */ + if (es->qd->operation != CMD_UTILITY) + { + /* + * In lazyEval mode, do not let the executor set up an AfterTrigger + * context. This is necessary not just an optimization, because we + * mustn't exit from the function execution with a stacked + * AfterTrigger level still active. We are careful not to select + * lazyEval mode for any statement that could possibly queue triggers. + */ + int eflags; + + if (es->lazyEval) + eflags = EXEC_FLAG_SKIP_TRIGGERS; + else + eflags = 0; /* default run-to-completion flags */ + ExecutorStart(es->qd, eflags); + } + + es->status = F_EXEC_RUN; +} + +/* Run one execution_state; either to completion or to first result row */ +/* Returns true if we ran to completion */ +static bool +postquel_getnext(execution_state *es, SQLFunctionCachePtr fcache) +{ + bool result; + + if (es->qd->operation == CMD_UTILITY) + { + ProcessUtility(es->qd->plannedstmt, + fcache->src, + false, + PROCESS_UTILITY_QUERY, + es->qd->params, + es->qd->queryEnv, + es->qd->dest, + NULL); + result = true; /* never stops early */ + } + else + { + /* Run regular commands to completion unless lazyEval */ + uint64 count = (es->lazyEval) ? 1 : 0; + + ExecutorRun(es->qd, ForwardScanDirection, count, !fcache->returnsSet || !es->lazyEval); + + /* + * If we requested run to completion OR there was no tuple returned, + * command must be complete. 
+ */ + result = (count == 0 || es->qd->estate->es_processed == 0); + } + + return result; +} + +/* Shut down execution of one execution_state node */ +static void +postquel_end(execution_state *es) +{ + /* mark status done to ensure we don't do ExecutorEnd twice */ + es->status = F_EXEC_DONE; + + /* Utility commands don't need Executor. */ + if (es->qd->operation != CMD_UTILITY) + { + ExecutorFinish(es->qd); + ExecutorEnd(es->qd); + } + + es->qd->dest->rDestroy(es->qd->dest); + + FreeQueryDesc(es->qd); + es->qd = NULL; +} + +/* Build ParamListInfo array representing current arguments */ +static void +postquel_sub_params(SQLFunctionCachePtr fcache, + FunctionCallInfo fcinfo) +{ + int nargs = fcinfo->nargs; + + if (nargs > 0) + { + ParamListInfo paramLI; + + if (fcache->paramLI == NULL) + { + paramLI = makeParamList(nargs); + fcache->paramLI = paramLI; + } + else + { + paramLI = fcache->paramLI; + Assert(paramLI->numParams == nargs); + } + + for (int i = 0; i < nargs; i++) + { + ParamExternData *prm = ¶mLI->params[i]; + + prm->value = fcinfo->args[i].value; + prm->isnull = fcinfo->args[i].isnull; + prm->pflags = 0; + prm->ptype = fcache->pinfo->argtypes[i]; + } + } + else + fcache->paramLI = NULL; +} + +/* + * Extract the SQL function's value from a single result row. This is used + * both for scalar (non-set) functions and for each row of a lazy-eval set + * result. + */ +static Datum +postquel_get_single_result(TupleTableSlot *slot, + FunctionCallInfo fcinfo, + SQLFunctionCachePtr fcache, + MemoryContext resultcontext) +{ + Datum value; + MemoryContext oldcontext; + + /* + * Set up to return the function value. For pass-by-reference datatypes, + * be sure to allocate the result in resultcontext, not the current memory + * context (which has query lifespan). We can't leave the data in the + * TupleTableSlot because we intend to clear the slot before returning. + */ + oldcontext = MemoryContextSwitchTo(resultcontext); + + if (fcache->returnsTuple) + { + /* We must return the whole tuple as a Datum. */ + fcinfo->isnull = false; + value = ExecFetchSlotHeapTupleDatum(slot); + } + else + { + /* + * Returning a scalar, which we have to extract from the first column + * of the SELECT result, and then copy into result context if needed. + */ + value = slot_getattr(slot, 1, &(fcinfo->isnull)); + + if (!fcinfo->isnull) + value = datumCopy(value, fcache->typbyval, fcache->typlen); + } + + MemoryContextSwitchTo(oldcontext); + + return value; +} + +/* + * fmgr_sql: function call manager for SQL functions + */ +Datum +fmgr_sql(PG_FUNCTION_ARGS) +{ + SQLFunctionCachePtr fcache; + ErrorContextCallback sqlerrcontext; + MemoryContext oldcontext; + bool randomAccess; + bool lazyEvalOK; + bool is_first; + bool pushed_snapshot; + execution_state *es; + TupleTableSlot *slot; + Datum result; + List *eslist; + ListCell *eslc; + + /* + * Setup error traceback support for ereport() + */ + sqlerrcontext.callback = sql_exec_error_callback; + sqlerrcontext.arg = fcinfo->flinfo; + sqlerrcontext.previous = error_context_stack; + error_context_stack = &sqlerrcontext; + + /* Check call context */ + if (fcinfo->flinfo->fn_retset) + { + ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo; + + /* + * For simplicity, we require callers to support both set eval modes. + * There are cases where we must use one or must use the other, and + * it's not really worthwhile to postpone the check till we know. But + * note we do not require caller to provide an expectedDesc. 
+ */ + if (!rsi || !IsA(rsi, ReturnSetInfo) || + (rsi->allowedModes & SFRM_ValuePerCall) == 0 || + (rsi->allowedModes & SFRM_Materialize) == 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + randomAccess = rsi->allowedModes & SFRM_Materialize_Random; + lazyEvalOK = !(rsi->allowedModes & SFRM_Materialize_Preferred); + } + else + { + randomAccess = false; + lazyEvalOK = true; + } + + /* + * Initialize fcache (build plans) if first time through; or re-initialize + * if the cache is stale. + */ + fcache = (SQLFunctionCachePtr) fcinfo->flinfo->fn_extra; + + if (fcache != NULL) + { + if (fcache->lxid != MyProc->lxid || + !SubTransactionIsActive(fcache->subxid)) + { + /* It's stale; unlink and delete */ + fcinfo->flinfo->fn_extra = NULL; + MemoryContextDelete(fcache->fcontext); + fcache = NULL; + } + } + + if (fcache == NULL) + { + init_sql_fcache(fcinfo, PG_GET_COLLATION(), lazyEvalOK); + fcache = (SQLFunctionCachePtr) fcinfo->flinfo->fn_extra; + } + + /* + * Switch to context in which the fcache lives. This ensures that our + * tuplestore etc will have sufficient lifetime. The sub-executor is + * responsible for deleting per-tuple information. (XXX in the case of a + * long-lived FmgrInfo, this policy represents more memory leakage, but + * it's not entirely clear where to keep stuff instead.) + */ + oldcontext = MemoryContextSwitchTo(fcache->fcontext); + + /* + * Find first unfinished query in function, and note whether it's the + * first query. + */ + eslist = fcache->func_state; + es = NULL; + is_first = true; + foreach(eslc, eslist) + { + es = (execution_state *) lfirst(eslc); + + while (es && es->status == F_EXEC_DONE) + { + is_first = false; + es = es->next; + } + + if (es) + break; + } + + /* + * Convert params to appropriate format if starting a fresh execution. (If + * continuing execution, we can re-use prior params.) + */ + if (is_first && es && es->status == F_EXEC_START) + postquel_sub_params(fcache, fcinfo); + + /* + * Build tuplestore to hold results, if we don't have one already. Note + * it's in the query-lifespan context. + */ + if (!fcache->tstore) + fcache->tstore = tuplestore_begin_heap(randomAccess, false, work_mem); + + /* + * Execute each command in the function one after another until we either + * run out of commands or get a result row from a lazily-evaluated SELECT. + * + * Notes about snapshot management: + * + * In a read-only function, we just use the surrounding query's snapshot. + * + * In a non-read-only function, we rely on the fact that we'll never + * suspend execution between queries of the function: the only reason to + * suspend execution before completion is if we are returning a row from a + * lazily-evaluated SELECT. So, when first entering this loop, we'll + * either start a new query (and push a fresh snapshot) or re-establish + * the active snapshot from the existing query descriptor. If we need to + * start a new query in a subsequent execution of the loop, either we need + * a fresh snapshot (and pushed_snapshot is false) or the existing + * snapshot is on the active stack and we can just bump its command ID. + */ + pushed_snapshot = false; + while (es) + { + bool completed; + + if (es->status == F_EXEC_START) + { + /* + * If not read-only, be sure to advance the command counter for + * each command, so that all work to date in this transaction is + * visible. 
Take a new snapshot if we don't have one yet, + * otherwise just bump the command ID in the existing snapshot. + */ + if (!fcache->readonly_func) + { + CommandCounterIncrement(); + if (!pushed_snapshot) + { + PushActiveSnapshot(GetTransactionSnapshot()); + pushed_snapshot = true; + } + else + UpdateActiveSnapshotCommandId(); + } + + postquel_start(es, fcache); + } + else if (!fcache->readonly_func && !pushed_snapshot) + { + /* Re-establish active snapshot when re-entering function */ + PushActiveSnapshot(es->qd->snapshot); + pushed_snapshot = true; + } + + completed = postquel_getnext(es, fcache); + + /* + * If we ran the command to completion, we can shut it down now. Any + * row(s) we need to return are safely stashed in the tuplestore, and + * we want to be sure that, for example, AFTER triggers get fired + * before we return anything. Also, if the function doesn't return + * set, we can shut it down anyway because it must be a SELECT and we + * don't care about fetching any more result rows. + */ + if (completed || !fcache->returnsSet) + postquel_end(es); + + /* + * Break from loop if we didn't shut down (implying we got a + * lazily-evaluated row). Otherwise we'll press on till the whole + * function is done, relying on the tuplestore to keep hold of the + * data to eventually be returned. This is necessary since an + * INSERT/UPDATE/DELETE RETURNING that sets the result might be + * followed by additional rule-inserted commands, and we want to + * finish doing all those commands before we return anything. + */ + if (es->status != F_EXEC_DONE) + break; + + /* + * Advance to next execution_state, which might be in the next list. + */ + es = es->next; + while (!es) + { + eslc = lnext(eslist, eslc); + if (!eslc) + break; /* end of function */ + + es = (execution_state *) lfirst(eslc); + + /* + * Flush the current snapshot so that we will take a new one for + * the new query list. This ensures that new snaps are taken at + * original-query boundaries, matching the behavior of interactive + * execution. + */ + if (pushed_snapshot) + { + PopActiveSnapshot(); + pushed_snapshot = false; + } + } + } + + /* + * The tuplestore now contains whatever row(s) we are supposed to return. + */ + if (fcache->returnsSet) + { + ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo; + + if (es) + { + /* + * If we stopped short of being done, we must have a lazy-eval + * row. + */ + Assert(es->lazyEval); + /* Re-use the junkfilter's output slot to fetch back the tuple */ + Assert(fcache->junkFilter); + slot = fcache->junkFilter->jf_resultSlot; + if (!tuplestore_gettupleslot(fcache->tstore, true, false, slot)) + elog(ERROR, "failed to fetch lazy-eval tuple"); + /* Extract the result as a datum, and copy out from the slot */ + result = postquel_get_single_result(slot, fcinfo, + fcache, oldcontext); + /* Clear the tuplestore, but keep it for next time */ + /* NB: this might delete the slot's content, but we don't care */ + tuplestore_clear(fcache->tstore); + + /* + * Let caller know we're not finished. + */ + rsi->isDone = ExprMultipleResult; + + /* + * Ensure we will get shut down cleanly if the exprcontext is not + * run to completion. + */ + if (!fcache->shutdown_reg) + { + RegisterExprContextCallback(rsi->econtext, + ShutdownSQLFunction, + PointerGetDatum(fcache)); + fcache->shutdown_reg = true; + } + } + else if (fcache->lazyEval) + { + /* + * We are done with a lazy evaluation. Clean up. + */ + tuplestore_clear(fcache->tstore); + + /* + * Let caller know we're finished. 
+ */ + rsi->isDone = ExprEndResult; + + fcinfo->isnull = true; + result = (Datum) 0; + + /* Deregister shutdown callback, if we made one */ + if (fcache->shutdown_reg) + { + UnregisterExprContextCallback(rsi->econtext, + ShutdownSQLFunction, + PointerGetDatum(fcache)); + fcache->shutdown_reg = false; + } + } + else + { + /* + * We are done with a non-lazy evaluation. Return whatever is in + * the tuplestore. (It is now caller's responsibility to free the + * tuplestore when done.) + */ + rsi->returnMode = SFRM_Materialize; + rsi->setResult = fcache->tstore; + fcache->tstore = NULL; + /* must copy desc because execSRF.c will free it */ + if (fcache->junkFilter) + rsi->setDesc = CreateTupleDescCopy(fcache->junkFilter->jf_cleanTupType); + + fcinfo->isnull = true; + result = (Datum) 0; + + /* Deregister shutdown callback, if we made one */ + if (fcache->shutdown_reg) + { + UnregisterExprContextCallback(rsi->econtext, + ShutdownSQLFunction, + PointerGetDatum(fcache)); + fcache->shutdown_reg = false; + } + } + } + else + { + /* + * Non-set function. If we got a row, return it; else return NULL. + */ + if (fcache->junkFilter) + { + /* Re-use the junkfilter's output slot to fetch back the tuple */ + slot = fcache->junkFilter->jf_resultSlot; + if (tuplestore_gettupleslot(fcache->tstore, true, false, slot)) + result = postquel_get_single_result(slot, fcinfo, + fcache, oldcontext); + else + { + fcinfo->isnull = true; + result = (Datum) 0; + } + } + else + { + /* Should only get here for VOID functions and procedures */ + Assert(fcache->rettype == VOIDOID); + fcinfo->isnull = true; + result = (Datum) 0; + } + + /* Clear the tuplestore, but keep it for next time */ + tuplestore_clear(fcache->tstore); + } + + /* Pop snapshot if we have pushed one */ + if (pushed_snapshot) + PopActiveSnapshot(); + + /* + * If we've gone through every command in the function, we are done. Reset + * the execution states to start over again on next call. + */ + if (es == NULL) + { + foreach(eslc, fcache->func_state) + { + es = (execution_state *) lfirst(eslc); + while (es) + { + es->status = F_EXEC_START; + es = es->next; + } + } + } + + error_context_stack = sqlerrcontext.previous; + + MemoryContextSwitchTo(oldcontext); + + return result; +} + + +/* + * error context callback to let us supply a call-stack traceback + */ +static void +sql_exec_error_callback(void *arg) +{ + FmgrInfo *flinfo = (FmgrInfo *) arg; + SQLFunctionCachePtr fcache = (SQLFunctionCachePtr) flinfo->fn_extra; + int syntaxerrposition; + + /* + * We can do nothing useful if init_sql_fcache() didn't get as far as + * saving the function name + */ + if (fcache == NULL || fcache->fname == NULL) + return; + + /* + * If there is a syntax error position, convert to internal syntax error + */ + syntaxerrposition = geterrposition(); + if (syntaxerrposition > 0 && fcache->src != NULL) + { + errposition(0); + internalerrposition(syntaxerrposition); + internalerrquery(fcache->src); + } + + /* + * Try to determine where in the function we failed. If there is a query + * with non-null QueryDesc, finger it. (We check this rather than looking + * for F_EXEC_RUN state, so that errors during ExecutorStart or + * ExecutorEnd are blamed on the appropriate query; see postquel_start and + * postquel_end.) 
+ */ + if (fcache->func_state) + { + execution_state *es; + int query_num; + ListCell *lc; + + es = NULL; + query_num = 1; + foreach(lc, fcache->func_state) + { + es = (execution_state *) lfirst(lc); + while (es) + { + if (es->qd) + { + errcontext("SQL function \"%s\" statement %d", + fcache->fname, query_num); + break; + } + es = es->next; + } + if (es) + break; + query_num++; + } + if (es == NULL) + { + /* + * couldn't identify a running query; might be function entry, + * function exit, or between queries. + */ + errcontext("SQL function \"%s\"", fcache->fname); + } + } + else + { + /* + * Assume we failed during init_sql_fcache(). (It's possible that the + * function actually has an empty body, but in that case we may as + * well report all errors as being "during startup".) + */ + errcontext("SQL function \"%s\" during startup", fcache->fname); + } +} + + +/* + * callback function in case a function-returning-set needs to be shut down + * before it has been run to completion + */ +static void +ShutdownSQLFunction(Datum arg) +{ + SQLFunctionCachePtr fcache = (SQLFunctionCachePtr) DatumGetPointer(arg); + execution_state *es; + ListCell *lc; + + foreach(lc, fcache->func_state) + { + es = (execution_state *) lfirst(lc); + while (es) + { + /* Shut down anything still running */ + if (es->status == F_EXEC_RUN) + { + /* Re-establish active snapshot for any called functions */ + if (!fcache->readonly_func) + PushActiveSnapshot(es->qd->snapshot); + + postquel_end(es); + + if (!fcache->readonly_func) + PopActiveSnapshot(); + } + + /* Reset states to START in case we're called again */ + es->status = F_EXEC_START; + es = es->next; + } + } + + /* Release tuplestore if we have one */ + if (fcache->tstore) + tuplestore_end(fcache->tstore); + fcache->tstore = NULL; + + /* execUtils will deregister the callback... */ + fcache->shutdown_reg = false; +} + +/* + * check_sql_fn_statements + * + * Check statements in an SQL function. Error out if there is anything that + * is not acceptable. + */ +void +check_sql_fn_statements(List *queryTreeLists) +{ + ListCell *lc; + + /* We are given a list of sublists of Queries */ + foreach(lc, queryTreeLists) + { + List *sublist = lfirst_node(List, lc); + ListCell *lc2; + + foreach(lc2, sublist) + { + Query *query = lfirst_node(Query, lc2); + + /* + * Disallow calling procedures with output arguments. The current + * implementation would just throw the output values away, unless + * the statement is the last one. Per SQL standard, we should + * assign the output values by name. By disallowing this here, we + * preserve an opportunity for future improvement. + */ + if (query->commandType == CMD_UTILITY && + IsA(query->utilityStmt, CallStmt)) + { + CallStmt *stmt = (CallStmt *) query->utilityStmt; + + if (stmt->outargs != NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("calling procedures with output arguments is not supported in SQL functions"))); + } + } + } +} + +/* + * check_sql_fn_retval() + * Check return value of a list of lists of sql parse trees. + * + * The return value of a sql function is the value returned by the last + * canSetTag query in the function. We do some ad-hoc type checking and + * coercion here to ensure that the function returns what it's supposed to. + * Note that we may actually modify the last query to make it match! + * + * This function returns true if the sql function returns the entire tuple + * result of its final statement, or false if it returns just the first column + * result of that statement. 
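A hypothetical caller-side sketch of check_sql_fn_retval() (names other than the function itself are invented; this assumes the declaration exported through executor/functions.h):

#include "postgres.h"
#include "executor/functions.h"

static void
classify_sql_fn_result(List *queryTree_list, Oid rettype, TupleDesc rettupdesc)
{
    List       *resulttlist;
    bool        returnsTuple;

    /* false: let the caller's junkfilter pad out dropped columns instead */
    returnsTuple = check_sql_fn_retval(queryTree_list, rettype, rettupdesc,
                                       false, &resulttlist);
    (void) resulttlist;     /* would typically feed a JunkFilter */

    if (returnsTuple)
    {
        /* consume the final statement's whole row as the function result */
    }
    else
    {
        /* consume only the first output column, even if it is composite */
    }
}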
It throws an error if the final statement doesn't + * return the right type at all. + * + * Note that because we allow "SELECT rowtype_expression", the result can be + * false even when the declared function return type is a rowtype. + * + * For a polymorphic function the passed rettype must be the actual resolved + * output type of the function. (This means we can't check the type during + * function definition of a polymorphic function.) If we do see a polymorphic + * rettype we'll throw an error, saying it is not a supported rettype. + * + * If the function returns composite, the passed rettupdesc should describe + * the expected output. If rettupdesc is NULL, we can't verify that the + * output matches; that should only happen in fmgr_sql_validator(), or when + * the function returns RECORD and the caller doesn't actually care which + * composite type it is. + * + * (Typically, rettype and rettupdesc are computed by get_call_result_type + * or a sibling function.) + * + * In addition to coercing individual output columns, we can modify the + * output to include dummy NULL columns for any dropped columns appearing + * in rettupdesc. This is done only if the caller asks for it. + * + * If resultTargetList isn't NULL, then *resultTargetList is set to the + * targetlist that defines the final statement's result. Exception: if the + * function is defined to return VOID then *resultTargetList is set to NIL. + */ +bool +check_sql_fn_retval(List *queryTreeLists, + Oid rettype, TupleDesc rettupdesc, + bool insertDroppedCols, + List **resultTargetList) +{ + bool is_tuple_result = false; + Query *parse; + ListCell *parse_cell; + List *tlist; + int tlistlen; + bool tlist_is_modifiable; + char fn_typtype; + List *upper_tlist = NIL; + bool upper_tlist_nontrivial = false; + ListCell *lc; + + if (resultTargetList) + *resultTargetList = NIL; /* initialize in case of VOID result */ + + /* + * If it's declared to return VOID, we don't care what's in the function. + * (This takes care of the procedure case, as well.) + */ + if (rettype == VOIDOID) + return false; + + /* + * Find the last canSetTag query in the function body (which is presented + * to us as a list of sublists of Query nodes). This isn't necessarily + * the last parsetree, because rule rewriting can insert queries after + * what the user wrote. Note that it might not even be in the last + * sublist, for example if the last query rewrites to DO INSTEAD NOTHING. + * (It might not be unreasonable to throw an error in such a case, but + * this is the historical behavior and it doesn't seem worth changing.) + */ + parse = NULL; + parse_cell = NULL; + foreach(lc, queryTreeLists) + { + List *sublist = lfirst_node(List, lc); + ListCell *lc2; + + foreach(lc2, sublist) + { + Query *q = lfirst_node(Query, lc2); + + if (q->canSetTag) + { + parse = q; + parse_cell = lc2; + } + } + } + + /* + * If it's a plain SELECT, it returns whatever the targetlist says. + * Otherwise, if it's INSERT/UPDATE/DELETE with RETURNING, it returns + * that. Otherwise, the function return type must be VOID. + * + * Note: eventually replace this test with QueryReturnsTuples? We'd need + * a more general method of determining the output type, though. Also, it + * seems too dangerous to consider FETCH or EXECUTE as returning a + * determinable rowtype, since they depend on relatively short-lived + * entities. 
+ */ + if (parse && + parse->commandType == CMD_SELECT) + { + tlist = parse->targetList; + /* tlist is modifiable unless it's a dummy in a setop query */ + tlist_is_modifiable = (parse->setOperations == NULL); + } + else if (parse && + (parse->commandType == CMD_INSERT || + parse->commandType == CMD_UPDATE || + parse->commandType == CMD_DELETE) && + parse->returningList) + { + tlist = parse->returningList; + /* returningList can always be modified */ + tlist_is_modifiable = true; + } + else + { + /* Empty function body, or last statement is a utility command */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Function's final statement must be SELECT or INSERT/UPDATE/DELETE RETURNING."))); + return false; /* keep compiler quiet */ + } + + /* + * OK, check that the targetlist returns something matching the declared + * type, and modify it if necessary. If possible, we insert any coercion + * steps right into the final statement's targetlist. However, that might + * risk changes in the statement's semantics --- we can't safely change + * the output type of a grouping column, for instance. In such cases we + * handle coercions by inserting an extra level of Query that effectively + * just does a projection. + */ + + /* + * Count the non-junk entries in the result targetlist. + */ + tlistlen = ExecCleanTargetListLength(tlist); + + fn_typtype = get_typtype(rettype); + + if (fn_typtype == TYPTYPE_BASE || + fn_typtype == TYPTYPE_DOMAIN || + fn_typtype == TYPTYPE_ENUM || + fn_typtype == TYPTYPE_RANGE || + fn_typtype == TYPTYPE_MULTIRANGE) + { + /* + * For scalar-type returns, the target list must have exactly one + * non-junk entry, and its type must be coercible to rettype. + */ + TargetEntry *tle; + + if (tlistlen != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement must return exactly one column."))); + + /* We assume here that non-junk TLEs must come first in tlists */ + tle = (TargetEntry *) linitial(tlist); + Assert(!tle->resjunk); + + if (!coerce_fn_result_column(tle, rettype, -1, + tlist_is_modifiable, + &upper_tlist, + &upper_tlist_nontrivial)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Actual return type is %s.", + format_type_be(exprType((Node *) tle->expr))))); + } + else if (fn_typtype == TYPTYPE_COMPOSITE || rettype == RECORDOID) + { + /* + * Returns a rowtype. + * + * Note that we will not consider a domain over composite to be a + * "rowtype" return type; it goes through the scalar case above. This + * is because we only provide column-by-column implicit casting, and + * will not cast the complete record result. So the only way to + * produce a domain-over-composite result is to compute it as an + * explicit single-column result. The single-composite-column code + * path just below could handle such cases, but it won't be reached. + */ + int tupnatts; /* physical number of columns in tuple */ + int tuplogcols; /* # of nondeleted columns in tuple */ + int colindex; /* physical column index */ + + /* + * If the target list has one non-junk entry, and that expression has + * or can be coerced to the declared return type, take it as the + * result. 
This allows, for example, 'SELECT func2()', where func2 + * has the same composite return type as the function that's calling + * it. This provision creates some ambiguity --- maybe the expression + * was meant to be the lone field of the composite result --- but it + * works well enough as long as we don't get too enthusiastic about + * inventing coercions from scalar to composite types. + * + * XXX Note that if rettype is RECORD and the expression is of a named + * composite type, or vice versa, this coercion will succeed, whether + * or not the record type really matches. For the moment we rely on + * runtime type checking to catch any discrepancy, but it'd be nice to + * do better at parse time. + */ + if (tlistlen == 1) + { + TargetEntry *tle = (TargetEntry *) linitial(tlist); + + Assert(!tle->resjunk); + if (coerce_fn_result_column(tle, rettype, -1, + tlist_is_modifiable, + &upper_tlist, + &upper_tlist_nontrivial)) + { + /* Note that we're NOT setting is_tuple_result */ + goto tlist_coercion_finished; + } + } + + /* + * If the caller didn't provide an expected tupdesc, we can't do any + * further checking. Assume we're returning the whole tuple. + */ + if (rettupdesc == NULL) + { + /* Return tlist if requested */ + if (resultTargetList) + *resultTargetList = tlist; + return true; + } + + /* + * Verify that the targetlist matches the return tuple type. We scan + * the non-resjunk columns, and coerce them if necessary to match the + * datatypes of the non-deleted attributes. For deleted attributes, + * insert NULL result columns if the caller asked for that. + */ + tupnatts = rettupdesc->natts; + tuplogcols = 0; /* we'll count nondeleted cols as we go */ + colindex = 0; + + foreach(lc, tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + Form_pg_attribute attr; + + /* resjunk columns can simply be ignored */ + if (tle->resjunk) + continue; + + do + { + colindex++; + if (colindex > tupnatts) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement returns too many columns."))); + attr = TupleDescAttr(rettupdesc, colindex - 1); + if (attr->attisdropped && insertDroppedCols) + { + Expr *null_expr; + + /* The type of the null we insert isn't important */ + null_expr = (Expr *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + (Datum) 0, + true, /* isnull */ + true /* byval */ ); + upper_tlist = lappend(upper_tlist, + makeTargetEntry(null_expr, + list_length(upper_tlist) + 1, + NULL, + false)); + upper_tlist_nontrivial = true; + } + } while (attr->attisdropped); + tuplogcols++; + + if (!coerce_fn_result_column(tle, + attr->atttypid, attr->atttypmod, + tlist_is_modifiable, + &upper_tlist, + &upper_tlist_nontrivial)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement returns %s instead of %s at column %d.", + format_type_be(exprType((Node *) tle->expr)), + format_type_be(attr->atttypid), + tuplogcols))); + } + + /* remaining columns in rettupdesc had better all be dropped */ + for (colindex++; colindex <= tupnatts; colindex++) + { + if (!TupleDescAttr(rettupdesc, colindex - 1)->attisdropped) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type mismatch in function declared to return %s", + format_type_be(rettype)), + errdetail("Final statement returns too few columns."))); + 
if (insertDroppedCols) + { + Expr *null_expr; + + /* The type of the null we insert isn't important */ + null_expr = (Expr *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + (Datum) 0, + true, /* isnull */ + true /* byval */ ); + upper_tlist = lappend(upper_tlist, + makeTargetEntry(null_expr, + list_length(upper_tlist) + 1, + NULL, + false)); + upper_tlist_nontrivial = true; + } + } + + /* Report that we are returning entire tuple result */ + is_tuple_result = true; + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("return type %s is not supported for SQL functions", + format_type_be(rettype)))); + +tlist_coercion_finished: + + /* + * If necessary, modify the final Query by injecting an extra Query level + * that just performs a projection. (It'd be dubious to do this to a + * non-SELECT query, but we never have to; RETURNING lists can always be + * modified in-place.) + */ + if (upper_tlist_nontrivial) + { + Query *newquery; + List *colnames; + RangeTblEntry *rte; + RangeTblRef *rtr; + + Assert(parse->commandType == CMD_SELECT); + + /* Most of the upper Query struct can be left as zeroes/nulls */ + newquery = makeNode(Query); + newquery->commandType = CMD_SELECT; + newquery->querySource = parse->querySource; + newquery->canSetTag = true; + newquery->targetList = upper_tlist; + + /* We need a moderately realistic colnames list for the subquery RTE */ + colnames = NIL; + foreach(lc, parse->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; + colnames = lappend(colnames, + makeString(tle->resname ? tle->resname : "")); + } + + /* Build a suitable RTE for the subquery */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_SUBQUERY; + rte->subquery = parse; + rte->eref = rte->alias = makeAlias("*SELECT*", colnames); + rte->lateral = false; + rte->inh = false; + rte->inFromCl = true; + newquery->rtable = list_make1(rte); + + rtr = makeNode(RangeTblRef); + rtr->rtindex = 1; + newquery->jointree = makeFromExpr(list_make1(rtr), NULL); + + /* Replace original query in the correct element of the query list */ + lfirst(parse_cell) = newquery; + } + + /* Return tlist (possibly modified) if requested */ + if (resultTargetList) + *resultTargetList = upper_tlist; + + return is_tuple_result; +} + +/* + * Process one function result column for check_sql_fn_retval + * + * Coerce the output value to the required type/typmod, and add a column + * to *upper_tlist for it. Set *upper_tlist_nontrivial to true if we + * add an upper tlist item that's not just a Var. + * + * Returns true if OK, false if could not coerce to required type + * (in which case, no changes have been made) + */ +static bool +coerce_fn_result_column(TargetEntry *src_tle, + Oid res_type, + int32 res_typmod, + bool tlist_is_modifiable, + List **upper_tlist, + bool *upper_tlist_nontrivial) +{ + TargetEntry *new_tle; + Expr *new_tle_expr; + Node *cast_result; + + /* + * If the TLE has a sortgroupref marking, don't change it, as it probably + * is referenced by ORDER BY, DISTINCT, etc, and changing its type would + * break query semantics. Otherwise, it's safe to modify in-place unless + * the query as a whole has issues with that. 
+ */ + if (tlist_is_modifiable && src_tle->ressortgroupref == 0) + { + /* OK to modify src_tle in place, if necessary */ + cast_result = coerce_to_target_type(NULL, + (Node *) src_tle->expr, + exprType((Node *) src_tle->expr), + res_type, res_typmod, + COERCION_ASSIGNMENT, + COERCE_IMPLICIT_CAST, + -1); + if (cast_result == NULL) + return false; + assign_expr_collations(NULL, cast_result); + src_tle->expr = (Expr *) cast_result; + /* Make a Var referencing the possibly-modified TLE */ + new_tle_expr = (Expr *) makeVarFromTargetEntry(1, src_tle); + } + else + { + /* Any casting must happen in the upper tlist */ + Var *var = makeVarFromTargetEntry(1, src_tle); + + cast_result = coerce_to_target_type(NULL, + (Node *) var, + var->vartype, + res_type, res_typmod, + COERCION_ASSIGNMENT, + COERCE_IMPLICIT_CAST, + -1); + if (cast_result == NULL) + return false; + assign_expr_collations(NULL, cast_result); + /* Did the coercion actually do anything? */ + if (cast_result != (Node *) var) + *upper_tlist_nontrivial = true; + new_tle_expr = (Expr *) cast_result; + } + new_tle = makeTargetEntry(new_tle_expr, + list_length(*upper_tlist) + 1, + src_tle->resname, false); + *upper_tlist = lappend(*upper_tlist, new_tle); + return true; +} + + +/* + * CreateSQLFunctionDestReceiver -- create a suitable DestReceiver object + */ +DestReceiver * +CreateSQLFunctionDestReceiver(void) +{ + DR_sqlfunction *self = (DR_sqlfunction *) palloc0(sizeof(DR_sqlfunction)); + + self->pub.receiveSlot = sqlfunction_receive; + self->pub.rStartup = sqlfunction_startup; + self->pub.rShutdown = sqlfunction_shutdown; + self->pub.rDestroy = sqlfunction_destroy; + self->pub.mydest = DestSQLFunction; + + /* private fields will be set by postquel_start */ + + return (DestReceiver *) self; +} + +/* + * sqlfunction_startup --- executor startup + */ +static void +sqlfunction_startup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + /* no-op */ +} + +/* + * sqlfunction_receive --- receive one tuple + */ +static bool +sqlfunction_receive(TupleTableSlot *slot, DestReceiver *self) +{ + DR_sqlfunction *myState = (DR_sqlfunction *) self; + + /* Filter tuple as needed */ + slot = ExecFilterJunk(myState->filter, slot); + + /* Store the filtered tuple into the tuplestore */ + tuplestore_puttupleslot(myState->tstore, slot); + + return true; +} + +/* + * sqlfunction_shutdown --- executor end + */ +static void +sqlfunction_shutdown(DestReceiver *self) +{ + /* no-op */ +} + +/* + * sqlfunction_destroy --- release DestReceiver object + */ +static void +sqlfunction_destroy(DestReceiver *self) +{ + pfree(self); +} diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c new file mode 100644 index 0000000..2b106d8 --- /dev/null +++ b/src/backend/executor/instrument.c @@ -0,0 +1,279 @@ +/*------------------------------------------------------------------------- + * + * instrument.c + * functions for instrumentation of plan execution + * + * + * Copyright (c) 2001-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/executor/instrument.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "executor/instrument.h" + +BufferUsage pgBufferUsage; +static BufferUsage save_pgBufferUsage; +WalUsage pgWalUsage; +static WalUsage save_pgWalUsage; + +static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); +static void WalUsageAdd(WalUsage *dst, WalUsage *add); + + +/* Allocate new instrumentation structure(s) */ 
+Instrumentation * +InstrAlloc(int n, int instrument_options, bool async_mode) +{ + Instrumentation *instr; + + /* initialize all fields to zeroes, then modify as needed */ + instr = palloc0(n * sizeof(Instrumentation)); + if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + { + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; + + for (i = 0; i < n; i++) + { + instr[i].need_bufusage = need_buffers; + instr[i].need_walusage = need_wal; + instr[i].need_timer = need_timer; + instr[i].async_mode = async_mode; + } + } + + return instr; +} + +/* Initialize a pre-allocated instrumentation structure. */ +void +InstrInit(Instrumentation *instr, int instrument_options) +{ + memset(instr, 0, sizeof(Instrumentation)); + instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; + instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; + instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; +} + +/* Entry to a plan node */ +void +InstrStartNode(Instrumentation *instr) +{ + if (instr->need_timer && + !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) + elog(ERROR, "InstrStartNode called twice in a row"); + + /* save buffer usage totals at node entry, if needed */ + if (instr->need_bufusage) + instr->bufusage_start = pgBufferUsage; + + if (instr->need_walusage) + instr->walusage_start = pgWalUsage; +} + +/* Exit from a plan node */ +void +InstrStopNode(Instrumentation *instr, double nTuples) +{ + double save_tuplecount = instr->tuplecount; + instr_time endtime; + + /* count the returned tuples */ + instr->tuplecount += nTuples; + + /* let's update the time only if the timer was requested */ + if (instr->need_timer) + { + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStopNode called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); + } + + /* Add delta of buffer usage since entry to node's totals */ + if (instr->need_bufusage) + BufferUsageAccumDiff(&instr->bufusage, + &pgBufferUsage, &instr->bufusage_start); + + if (instr->need_walusage) + WalUsageAccumDiff(&instr->walusage, + &pgWalUsage, &instr->walusage_start); + + /* Is this the first tuple of this cycle? 
*/ + if (!instr->running) + { + instr->running = true; + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + } + else + { + /* + * In async mode, if the plan node hadn't emitted any tuples before, + * this might be the first tuple + */ + if (instr->async_mode && save_tuplecount < 1.0) + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + } +} + +/* Update tuple count */ +void +InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +{ + /* count the returned tuples */ + instr->tuplecount += nTuples; +} + +/* Finish a run cycle for a plan node */ +void +InstrEndLoop(Instrumentation *instr) +{ + double totaltime; + + /* Skip if nothing has happened, or already shut down */ + if (!instr->running) + return; + + if (!INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrEndLoop called on running node"); + + /* Accumulate per-cycle statistics into totals */ + totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); + + instr->startup += instr->firsttuple; + instr->total += totaltime; + instr->ntuples += instr->tuplecount; + instr->nloops += 1; + + /* Reset for next cycle (if any) */ + instr->running = false; + INSTR_TIME_SET_ZERO(instr->starttime); + INSTR_TIME_SET_ZERO(instr->counter); + instr->firsttuple = 0; + instr->tuplecount = 0; +} + +/* aggregate instrumentation information */ +void +InstrAggNode(Instrumentation *dst, Instrumentation *add) +{ + if (!dst->running && add->running) + { + dst->running = true; + dst->firsttuple = add->firsttuple; + } + else if (dst->running && add->running && dst->firsttuple > add->firsttuple) + dst->firsttuple = add->firsttuple; + + INSTR_TIME_ADD(dst->counter, add->counter); + + dst->tuplecount += add->tuplecount; + dst->startup += add->startup; + dst->total += add->total; + dst->ntuples += add->ntuples; + dst->ntuples2 += add->ntuples2; + dst->nloops += add->nloops; + dst->nfiltered1 += add->nfiltered1; + dst->nfiltered2 += add->nfiltered2; + + /* Add delta of buffer usage since entry to node's totals */ + if (dst->need_bufusage) + BufferUsageAdd(&dst->bufusage, &add->bufusage); + + if (dst->need_walusage) + WalUsageAdd(&dst->walusage, &add->walusage); +} + +/* note current values during parallel executor startup */ +void +InstrStartParallelQuery(void) +{ + save_pgBufferUsage = pgBufferUsage; + save_pgWalUsage = pgWalUsage; +} + +/* report usage after parallel executor shutdown */ +void +InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + memset(bufusage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); + memset(walusage, 0, sizeof(WalUsage)); + WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); +} + +/* accumulate work done by workers in leader's stats */ +void +InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + BufferUsageAdd(&pgBufferUsage, bufusage); + WalUsageAdd(&pgWalUsage, walusage); +} + +/* dst += add */ +static void +BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) +{ + dst->shared_blks_hit += add->shared_blks_hit; + dst->shared_blks_read += add->shared_blks_read; + dst->shared_blks_dirtied += add->shared_blks_dirtied; + dst->shared_blks_written += add->shared_blks_written; + dst->local_blks_hit += add->local_blks_hit; + dst->local_blks_read += add->local_blks_read; + dst->local_blks_dirtied += add->local_blks_dirtied; + dst->local_blks_written += add->local_blks_written; + dst->temp_blks_read += add->temp_blks_read; + dst->temp_blks_written += add->temp_blks_written; + INSTR_TIME_ADD(dst->blk_read_time, 
add->blk_read_time); + INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); +} + +/* dst += add - sub */ +void +BufferUsageAccumDiff(BufferUsage *dst, + const BufferUsage *add, + const BufferUsage *sub) +{ + dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; + dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; + dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; + dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; + dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; + dst->local_blks_read += add->local_blks_read - sub->local_blks_read; + dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; + dst->local_blks_written += add->local_blks_written - sub->local_blks_written; + dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; + dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, + add->blk_read_time, sub->blk_read_time); + INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, + add->blk_write_time, sub->blk_write_time); +} + +/* helper functions for WAL usage accumulation */ +static void +WalUsageAdd(WalUsage *dst, WalUsage *add) +{ + dst->wal_bytes += add->wal_bytes; + dst->wal_records += add->wal_records; + dst->wal_fpi += add->wal_fpi; +} + +void +WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) +{ + dst->wal_bytes += add->wal_bytes - sub->wal_bytes; + dst->wal_records += add->wal_records - sub->wal_records; + dst->wal_fpi += add->wal_fpi - sub->wal_fpi; +} diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c new file mode 100644 index 0000000..31609c6 --- /dev/null +++ b/src/backend/executor/nodeAgg.c @@ -0,0 +1,4829 @@ +/*------------------------------------------------------------------------- + * + * nodeAgg.c + * Routines to handle aggregate nodes. + * + * ExecAgg normally evaluates each aggregate in the following steps: + * + * transvalue = initcond + * foreach input_tuple do + * transvalue = transfunc(transvalue, input_value(s)) + * result = finalfunc(transvalue, direct_argument(s)) + * + * If a finalfunc is not supplied then the result is just the ending + * value of transvalue. + * + * Other behaviors can be selected by the "aggsplit" mode, which exists + * to support partial aggregation. It is possible to: + * * Skip running the finalfunc, so that the output is always the + * final transvalue state. + * * Substitute the combinefunc for the transfunc, so that transvalue + * states (propagated up from a child partial-aggregation step) are merged + * rather than processing raw input rows. (The statements below about + * the transfunc apply equally to the combinefunc, when it's selected.) + * * Apply the serializefunc to the output values (this only makes sense + * when skipping the finalfunc, since the serializefunc works on the + * transvalue data type). + * * Apply the deserializefunc to the input values (this only makes sense + * when using the combinefunc, for similar reasons). + * It is the planner's responsibility to connect up Agg nodes using these + * alternate behaviors in a way that makes sense, with partial aggregation + * results being fed to nodes that expect them. + * + * If a normal aggregate call specifies DISTINCT or ORDER BY, we sort the + * input tuples and eliminate duplicates (if required) before performing + * the above-depicted process. 
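Read as ordinary C, the scheme depicted above is just a fold over the input followed by a projection. A minimal standalone sketch (illustrative toy code, not PostgreSQL source) using a toy "avg" aggregate:

#include <stdio.h>

/* Toy transition state for an "avg" aggregate: running sum and row count. */
typedef struct AvgTrans
{
    double  sum;
    long    count;
} AvgTrans;

/* transfunc: fold one input value into the transition state */
static AvgTrans
avg_transfunc(AvgTrans transvalue, double input)
{
    transvalue.sum += input;
    transvalue.count++;
    return transvalue;
}

/* finalfunc: turn the ending transition state into the aggregate result */
static double
avg_finalfunc(AvgTrans transvalue)
{
    return (transvalue.count == 0) ? 0.0 : transvalue.sum / transvalue.count;
}

int
main(void)
{
    double      inputs[] = {1.0, 2.0, 4.0};
    AvgTrans    transvalue = {0.0, 0};      /* initcond */

    for (int i = 0; i < 3; i++)             /* foreach input_tuple */
        transvalue = avg_transfunc(transvalue, inputs[i]);

    printf("avg = %g\n", avg_finalfunc(transvalue));    /* prints 2.33333 */
    return 0;
}

Under partial aggregation (the aggsplit modes described above), a combinefunc would merge two AvgTrans states, adding their sums and counts, in place of avg_transfunc.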
(However, we don't do that for ordered-set + * aggregates; their "ORDER BY" inputs are ordinary aggregate arguments + * so far as this module is concerned.) Note that partial aggregation + * is not supported in these cases, since we couldn't ensure global + * ordering or distinctness of the inputs. + * + * If transfunc is marked "strict" in pg_proc and initcond is NULL, + * then the first non-NULL input_value is assigned directly to transvalue, + * and transfunc isn't applied until the second non-NULL input_value. + * The agg's first input type and transtype must be the same in this case! + * + * If transfunc is marked "strict" then NULL input_values are skipped, + * keeping the previous transvalue. If transfunc is not strict then it + * is called for every input tuple and must deal with NULL initcond + * or NULL input_values for itself. + * + * If finalfunc is marked "strict" then it is not called when the + * ending transvalue is NULL, instead a NULL result is created + * automatically (this is just the usual handling of strict functions, + * of course). A non-strict finalfunc can make its own choice of + * what to return for a NULL ending transvalue. + * + * Ordered-set aggregates are treated specially in one other way: we + * evaluate any "direct" arguments and pass them to the finalfunc along + * with the transition value. + * + * A finalfunc can have additional arguments beyond the transvalue and + * any "direct" arguments, corresponding to the input arguments of the + * aggregate. These are always just passed as NULL. Such arguments may be + * needed to allow resolution of a polymorphic aggregate's result type. + * + * We compute aggregate input expressions and run the transition functions + * in a temporary econtext (aggstate->tmpcontext). This is reset at least + * once per input tuple, so when the transvalue datatype is + * pass-by-reference, we have to be careful to copy it into a longer-lived + * memory context, and free the prior value to avoid memory leakage. We + * store transvalues in another set of econtexts, aggstate->aggcontexts + * (one per grouping set, see below), which are also used for the hashtable + * structures in AGG_HASHED mode. These econtexts are rescanned, not just + * reset, at group boundaries so that aggregate transition functions can + * register shutdown callbacks via AggRegisterCallback. + * + * The node's regular econtext (aggstate->ss.ps.ps_ExprContext) is used to + * run finalize functions and compute the output tuple; this context can be + * reset once per output tuple. + * + * The executor's AggState node is passed as the fmgr "context" value in + * all transfunc and finalfunc calls. It is not recommended that the + * transition functions look at the AggState node directly, but they can + * use AggCheckCallContext() to verify that they are being called by + * nodeAgg.c (and not as ordinary SQL functions). The main reason a + * transition function might want to know this is so that it can avoid + * palloc'ing a fixed-size pass-by-ref transition value on every call: + * it can instead just scribble on and return its left input. Ordinarily + * it is completely forbidden for functions to modify pass-by-ref inputs, + * but in the aggregate case we know the left input is either the initial + * transition value or a previous function result, and in either case its + * value need not be preserved. See int8inc() for an example. 
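A minimal sketch of a transition function written in that style (illustrative only: the aggregate would have to be declared with an internal-typed transition state, and every name except the fmgr/nodeAgg APIs — AggCheckCallContext(), MemoryContextAllocZero(), the PG_* macros — is invented):

#include "postgres.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

/* Invented transition state: running sum and count for a float8 average. */
typedef struct MyAvgState
{
    int64   count;
    float8  sum;
} MyAvgState;

PG_FUNCTION_INFO_V1(my_avg_transfn);

Datum
my_avg_transfn(PG_FUNCTION_ARGS)
{
    MemoryContext aggcontext;
    MyAvgState *state;

    /* Only legal when called as an aggregate (or window) transition step */
    if (!AggCheckCallContext(fcinfo, &aggcontext))
        elog(ERROR, "my_avg_transfn called in non-aggregate context");

    if (PG_ARGISNULL(0))
    {
        /* First row of the group: allocate state in the aggregate context */
        state = (MyAvgState *) MemoryContextAllocZero(aggcontext,
                                                      sizeof(MyAvgState));
    }
    else
        state = (MyAvgState *) PG_GETARG_POINTER(0);

    if (!PG_ARGISNULL(1))
    {
        /* Scribbling on and returning the passed-in state is OK here */
        state->sum += PG_GETARG_FLOAT8(1);
        state->count++;
    }

    PG_RETURN_POINTER(state);
}

Because the state lives in the context reported by AggCheckCallContext(), it persists across calls for the current group and goes away when that context is reset at the group boundary, as described above.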
Notice that + * the EEOP_AGG_PLAIN_TRANS step is coded to avoid a data copy step when + * the previous transition value pointer is returned. It is also possible + * to avoid repeated data copying when the transition value is an expanded + * object: to do that, the transition function must take care to return + * an expanded object that is in a child context of the memory context + * returned by AggCheckCallContext(). Also, some transition functions want + * to store working state in addition to the nominal transition value; they + * can use the memory context returned by AggCheckCallContext() to do that. + * + * Note: AggCheckCallContext() is available as of PostgreSQL 9.0. The + * AggState is available as context in earlier releases (back to 8.1), + * but direct examination of the node is needed to use it before 9.0. + * + * As of 9.4, aggregate transition functions can also use AggGetAggref() + * to get hold of the Aggref expression node for their aggregate call. + * This is mainly intended for ordered-set aggregates, which are not + * supported as window functions. (A regular aggregate function would + * need some fallback logic to use this, since there's no Aggref node + * for a window function.) + * + * Grouping sets: + * + * A list of grouping sets which is structurally equivalent to a ROLLUP + * clause (e.g. (a,b,c), (a,b), (a)) can be processed in a single pass over + * ordered data. We do this by keeping a separate set of transition values + * for each grouping set being concurrently processed; for each input tuple + * we update them all, and on group boundaries we reset those states + * (starting at the front of the list) whose grouping values have changed + * (the list of grouping sets is ordered from most specific to least + * specific). + * + * Where more complex grouping sets are used, we break them down into + * "phases", where each phase has a different sort order (except phase 0 + * which is reserved for hashing). During each phase but the last, the + * input tuples are additionally stored in a tuplesort which is keyed to the + * next phase's sort order; during each phase but the first, the input + * tuples are drawn from the previously sorted data. (The sorting of the + * data for the first phase is handled by the planner, as it might be + * satisfied by underlying nodes.) + * + * Hashing can be mixed with sorted grouping. To do this, we have an + * AGG_MIXED strategy that populates the hashtables during the first sorted + * phase, and switches to reading them out after completing all sort phases. + * We can also support AGG_HASHED with multiple hash tables and no sorting + * at all. + * + * From the perspective of aggregate transition and final functions, the + * only issue regarding grouping sets is this: a single call site (flinfo) + * of an aggregate function may be used for updating several different + * transition values in turn. So the function must not cache in the flinfo + * anything which logically belongs as part of the transition value (most + * importantly, the memory context in which the transition value exists). + * The support API functions (AggCheckCallContext, AggRegisterCallback) are + * sensitive to the grouping set for which the aggregate function is + * currently being called. + * + * Plan structure: + * + * What we get from the planner is actually one "real" Agg node which is + * part of the plan tree proper, but which optionally has an additional list + * of Agg nodes hung off the side via the "chain" field. 
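To make the reset rule under "Grouping sets" above concrete, here is a standalone sketch (toy code, not PostgreSQL source) of which prefix grouping sets of a ROLLUP see a group boundary when a new row arrives in the sorted input:

#include <stdio.h>

/* Return the 0-based index of the first differing column, or ncols if none. */
static int
first_changed_column(const int *prev, const int *cur, int ncols)
{
    for (int i = 0; i < ncols; i++)
    {
        if (prev[i] != cur[i])
            return i;
    }
    return ncols;
}

int
main(void)
{
    int     prev[3] = {1, 7, 7};    /* previous row's (a,b,c) */
    int     cur[3] = {1, 8, 2};     /* current row's (a,b,c): b and c changed */
    int     changed = first_changed_column(prev, cur, 3);

    /*
     * Grouping sets are ordered most- to least-specific; set i groups by the
     * first (3 - i) columns.  A set is reset exactly when one of its own
     * grouping columns changed.
     */
    for (int i = 0; i < 3; i++)
    {
        int     setwidth = 3 - i;

        printf("set (%s): %s\n",
               setwidth == 3 ? "a,b,c" : setwidth == 2 ? "a,b" : "a",
               changed < setwidth ? "finalize group, reset state"
                                  : "keep accumulating");
    }
    return 0;
}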
This is because an + * Agg node happens to be a convenient representation of all the data we + * need for grouping sets. + * + * For many purposes, we treat the "real" node as if it were just the first + * node in the chain. The chain must be ordered such that hashed entries + * come before sorted/plain entries; the real node is marked AGG_MIXED if + * there are both types present (in which case the real node describes one + * of the hashed groupings, other AGG_HASHED nodes may optionally follow in + * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If + * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained + * nodes must be of the same type; if it is AGG_PLAIN, there can be no + * chained nodes. + * + * We collect all hashed nodes into a single "phase", numbered 0, and create + * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node. + * Phase 0 is allocated even if there are no hashes, but remains unused in + * that case. + * + * AGG_HASHED nodes actually refer to only a single grouping set each, + * because for each hashed grouping we need a separate grpColIdx and + * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of + * grouping sets that share a sort order. Each AGG_SORTED node other than + * the first one has an associated Sort node which describes the sort order + * to be used; the first sorted node takes its input from the outer subtree, + * which the planner has already arranged to provide ordered data. + * + * Memory and ExprContext usage: + * + * Because we're accumulating aggregate values across input rows, we need to + * use more memory contexts than just simple input/output tuple contexts. + * In fact, for a rollup, we need a separate context for each grouping set + * so that we can reset the inner (finer-grained) aggregates on their group + * boundaries while continuing to accumulate values for outer + * (coarser-grained) groupings. On top of this, we might be simultaneously + * populating hashtables; however, we only need one context for all the + * hashtables. + * + * So we create an array, aggcontexts, with an ExprContext for each grouping + * set in the largest rollup that we're going to process, and use the + * per-tuple memory context of those ExprContexts to store the aggregate + * transition values. hashcontext is the single context created to support + * all hash tables. + * + * Spilling To Disk + * + * When performing hash aggregation, if the hash table memory exceeds the + * limit (see hash_agg_check_limits()), we enter "spill mode". In spill + * mode, we advance the transition states only for groups already in the + * hash table. For tuples that would need to create a new hash table + * entries (and initialize new transition states), we instead spill them to + * disk to be processed later. The tuples are spilled in a partitioned + * manner, so that subsequent batches are smaller and less likely to exceed + * hash_mem (if a batch does exceed hash_mem, it must be spilled + * recursively). + * + * Spilled data is written to logical tapes. These provide better control + * over memory usage, disk space, and the number of files than if we were + * to use a BufFile for each spill. + * + * Note that it's possible for transition states to start small but then + * grow very large; for instance in the case of ARRAY_AGG. In such cases, + * it's still possible to significantly exceed hash_mem. 
We try to avoid + * this situation by estimating what will fit in the available memory, and + * imposing a limit on the number of groups separately from the amount of + * memory consumed. + * + * Transition / Combine function invocation: + * + * For performance reasons transition functions, including combine + * functions, aren't invoked one-by-one from nodeAgg.c after computing + * arguments using the expression evaluation engine. Instead + * ExecBuildAggTrans() builds one large expression that does both argument + * evaluation and transition function invocation. That avoids performance + * issues due to repeated uses of expression evaluation, complications due + * to filter expressions having to be evaluated early, and allows to JIT + * the entire expression into one native function. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeAgg.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/parallel.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "common/hashfn.h" +#include "executor/execExpr.h" +#include "executor/executor.h" +#include "executor/nodeAgg.h" +#include "lib/hyperloglog.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" +#include "parser/parse_agg.h" +#include "parser/parse_coerce.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/dynahash.h" +#include "utils/expandeddatum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" +#include "utils/tuplesort.h" + +/* + * Control how many partitions are created when spilling HashAgg to + * disk. + * + * HASHAGG_PARTITION_FACTOR is multiplied by the estimated number of + * partitions needed such that each partition will fit in memory. The factor + * is set higher than one because there's not a high cost to having a few too + * many partitions, and it makes it less likely that a partition will need to + * be spilled recursively. Another benefit of having more, smaller partitions + * is that small hash tables may perform better than large ones due to memory + * caching effects. + * + * We also specify a min and max number of partitions per spill. Too few might + * mean a lot of wasted I/O from repeated spilling of the same tuples. Too + * many will result in lots of memory wasted buffering the spill files (which + * could instead be spent on a larger hash table). + */ +#define HASHAGG_PARTITION_FACTOR 1.50 +#define HASHAGG_MIN_PARTITIONS 4 +#define HASHAGG_MAX_PARTITIONS 1024 + +/* + * For reading from tapes, the buffer size must be a multiple of + * BLCKSZ. Larger values help when reading from multiple tapes concurrently, + * but that doesn't happen in HashAgg, so we simply use BLCKSZ. Writing to a + * tape always uses a buffer of size BLCKSZ. + */ +#define HASHAGG_READ_BUFFER_SIZE BLCKSZ +#define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ + +/* + * HyperLogLog is used for estimating the cardinality of the spilled tuples in + * a given partition. 5 bits corresponds to a size of about 32 bytes and a + * worst-case error of around 18%. 
That's effective enough to choose a + * reasonable number of partitions when recursing. + */ +#define HASHAGG_HLL_BIT_WIDTH 5 + +/* + * Estimate chunk overhead as a constant 16 bytes. XXX: should this be + * improved? + */ +#define CHUNKHDRSZ 16 + +/* + * Track all tapes needed for a HashAgg that spills. We don't know the maximum + * number of tapes needed at the start of the algorithm (because it can + * recurse), so one tape set is allocated and extended as needed for new + * tapes. When a particular tape is already read, rewind it for write mode and + * put it in the free list. + * + * Tapes' buffers can take up substantial memory when many tapes are open at + * once. We only need one tape open at a time in read mode (using a buffer + * that's a multiple of BLCKSZ); but we need one tape open in write mode (each + * requiring a buffer of size BLCKSZ) for each partition. + */ +typedef struct HashTapeInfo +{ + LogicalTapeSet *tapeset; + int ntapes; + int *freetapes; + int nfreetapes; + int freetapes_alloc; +} HashTapeInfo; + +/* + * Represents partitioned spill data for a single hashtable. Contains the + * necessary information to route tuples to the correct partition, and to + * transform the spilled data into new batches. + * + * The high bits are used for partition selection (when recursing, we ignore + * the bits that have already been used for partition selection at an earlier + * level). + */ +typedef struct HashAggSpill +{ + LogicalTapeSet *tapeset; /* borrowed reference to tape set */ + int npartitions; /* number of partitions */ + int *partitions; /* spill partition tape numbers */ + int64 *ntuples; /* number of tuples in each partition */ + uint32 mask; /* mask to find partition from hash value */ + int shift; /* after masking, shift by this amount */ + hyperLogLogState *hll_card; /* cardinality estimate for contents */ +} HashAggSpill; + +/* + * Represents work to be done for one pass of hash aggregation (with only one + * grouping set). + * + * Also tracks the bits of the hash already used for partition selection by + * earlier iterations, so that this batch can use new bits. If all bits have + * already been used, no partitioning will be done (any spilled data will go + * to a single output tape). 
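+ *
+ * As a rough illustration (a sketch, not code quoted from this file): with
+ * used_bits hash bits consumed by earlier spill levels and
+ * npartitions = 1 << partition_bits chosen for this level, the partition
+ * for a 32-bit hash value can be picked from the next-highest bits along
+ * these lines:
+ *
+ *     shift = 32 - used_bits - partition_bits;
+ *     mask = (npartitions - 1) << shift;
+ *     partition = (hash & mask) >> shift;
+ *
+ * so each recursion level consumes a fresh slice of the hash.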
+ */ +typedef struct HashAggBatch +{ + int setno; /* grouping set */ + int used_bits; /* number of bits of hash already used */ + LogicalTapeSet *tapeset; /* borrowed reference to tape set */ + int input_tapenum; /* input partition tape */ + int64 input_tuples; /* number of tuples in this batch */ + double input_card; /* estimated group cardinality */ +} HashAggBatch; + +/* used to find referenced colnos */ +typedef struct FindColsContext +{ + bool is_aggref; /* is under an aggref */ + Bitmapset *aggregated; /* column references under an aggref */ + Bitmapset *unaggregated; /* other column references */ +} FindColsContext; + +static void select_current_set(AggState *aggstate, int setno, bool is_hash); +static void initialize_phase(AggState *aggstate, int newphase); +static TupleTableSlot *fetch_input_tuple(AggState *aggstate); +static void initialize_aggregates(AggState *aggstate, + AggStatePerGroup *pergroups, + int numReset); +static void advance_transition_function(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate); +static void advance_aggregates(AggState *aggstate); +static void process_ordered_aggregate_single(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate); +static void process_ordered_aggregate_multi(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate); +static void finalize_aggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull); +static void finalize_partialaggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull); +static inline void prepare_hash_slot(AggStatePerHash perhash, + TupleTableSlot *inputslot, + TupleTableSlot *hashslot); +static void prepare_projection_slot(AggState *aggstate, + TupleTableSlot *slot, + int currentSet); +static void finalize_aggregates(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroup); +static TupleTableSlot *project_aggregates(AggState *aggstate); +static void find_cols(AggState *aggstate, Bitmapset **aggregated, + Bitmapset **unaggregated); +static bool find_cols_walker(Node *node, FindColsContext *context); +static void build_hash_tables(AggState *aggstate); +static void build_hash_table(AggState *aggstate, int setno, long nbuckets); +static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, + bool nullcheck); +static long hash_choose_num_buckets(double hashentrysize, + long estimated_nbuckets, + Size memory); +static int hash_choose_num_partitions(double input_groups, + double hashentrysize, + int used_bits, + int *log2_npartittions); +static void initialize_hash_entry(AggState *aggstate, + TupleHashTable hashtable, + TupleHashEntry entry); +static void lookup_hash_entries(AggState *aggstate); +static TupleTableSlot *agg_retrieve_direct(AggState *aggstate); +static void agg_fill_hash_table(AggState *aggstate); +static bool agg_refill_hash_table(AggState *aggstate); +static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate); +static TupleTableSlot *agg_retrieve_hash_table_in_memory(AggState *aggstate); +static void hash_agg_check_limits(AggState *aggstate); +static void hash_agg_enter_spill_mode(AggState *aggstate); +static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, + int npartitions); +static void hashagg_finish_initial_spills(AggState *aggstate); +static void hashagg_reset_spill_state(AggState *aggstate); +static HashAggBatch 
*hashagg_batch_new(LogicalTapeSet *tapeset, + int input_tapenum, int setno, + int64 input_tuples, double input_card, + int used_bits); +static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp); +static void hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, + int used_bits, double input_groups, + double hashentrysize); +static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, + TupleTableSlot *slot, uint32 hash); +static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, + int setno); +static void hashagg_tapeinfo_init(AggState *aggstate); +static void hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *dest, + int ndest); +static void hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum); +static Datum GetAggInitVal(Datum textInitVal, Oid transtype); +static void build_pertrans_for_aggref(AggStatePerTrans pertrans, + AggState *aggstate, EState *estate, + Aggref *aggref, Oid aggtransfn, Oid aggtranstype, + Oid aggserialfn, Oid aggdeserialfn, + Datum initValue, bool initValueIsNull, + Oid *inputTypes, int numArguments); + + +/* + * Select the current grouping set; affects current_set and + * curaggcontext. + */ +static void +select_current_set(AggState *aggstate, int setno, bool is_hash) +{ + /* + * When changing this, also adapt ExecAggPlainTransByVal() and + * ExecAggPlainTransByRef(). + */ + if (is_hash) + aggstate->curaggcontext = aggstate->hashcontext; + else + aggstate->curaggcontext = aggstate->aggcontexts[setno]; + + aggstate->current_set = setno; +} + +/* + * Switch to phase "newphase", which must either be 0 or 1 (to reset) or + * current_phase + 1. Juggle the tuplesorts accordingly. + * + * Phase 0 is for hashing, which we currently handle last in the AGG_MIXED + * case, so when entering phase 0, all we need to do is drop open sorts. + */ +static void +initialize_phase(AggState *aggstate, int newphase) +{ + Assert(newphase <= 1 || newphase == aggstate->current_phase + 1); + + /* + * Whatever the previous state, we're now done with whatever input + * tuplesort was in use. + */ + if (aggstate->sort_in) + { + tuplesort_end(aggstate->sort_in); + aggstate->sort_in = NULL; + } + + if (newphase <= 1) + { + /* + * Discard any existing output tuplesort. + */ + if (aggstate->sort_out) + { + tuplesort_end(aggstate->sort_out); + aggstate->sort_out = NULL; + } + } + else + { + /* + * The old output tuplesort becomes the new input one, and this is the + * right time to actually sort it. + */ + aggstate->sort_in = aggstate->sort_out; + aggstate->sort_out = NULL; + Assert(aggstate->sort_in); + tuplesort_performsort(aggstate->sort_in); + } + + /* + * If this isn't the last phase, we need to sort appropriately for the + * next phase in sequence. + */ + if (newphase > 0 && newphase < aggstate->numphases - 1) + { + Sort *sortnode = aggstate->phases[newphase + 1].sortnode; + PlanState *outerNode = outerPlanState(aggstate); + TupleDesc tupDesc = ExecGetResultType(outerNode); + + aggstate->sort_out = tuplesort_begin_heap(tupDesc, + sortnode->numCols, + sortnode->sortColIdx, + sortnode->sortOperators, + sortnode->collations, + sortnode->nullsFirst, + work_mem, + NULL, false); + } + + aggstate->current_phase = newphase; + aggstate->phase = &aggstate->phases[newphase]; +} + +/* + * Fetch a tuple from either the outer plan (for phase 1) or from the sorter + * populated by the previous phase. Copy it to the sorter for the next phase + * if any. 
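+ *
+ * For illustration only (hypothetical query, and assuming the planner
+ * chooses sorting for both sets): GROUP BY GROUPING SETS ((a, b), (c))
+ * would give phase 1 sorted on (a, b), reading directly from the outer
+ * plan, and phase 2 sorted on (c), reading from the tuplesort that phase 1
+ * filled through this function; phase 0 would come into play only if some
+ * sets were hashed.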
+ * + * Callers cannot rely on memory for tuple in returned slot remaining valid + * past any subsequently fetched tuple. + */ +static TupleTableSlot * +fetch_input_tuple(AggState *aggstate) +{ + TupleTableSlot *slot; + + if (aggstate->sort_in) + { + /* make sure we check for interrupts in either path through here */ + CHECK_FOR_INTERRUPTS(); + if (!tuplesort_gettupleslot(aggstate->sort_in, true, false, + aggstate->sort_slot, NULL)) + return NULL; + slot = aggstate->sort_slot; + } + else + slot = ExecProcNode(outerPlanState(aggstate)); + + if (!TupIsNull(slot) && aggstate->sort_out) + tuplesort_puttupleslot(aggstate->sort_out, slot); + + return slot; +} + +/* + * (Re)Initialize an individual aggregate. + * + * This function handles only one grouping set, already set in + * aggstate->current_set. + * + * When called, CurrentMemoryContext should be the per-query context. + */ +static void +initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + /* + * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate. + */ + if (pertrans->numSortCols > 0) + { + /* + * In case of rescan, maybe there could be an uncompleted sort + * operation? Clean it up if so. + */ + if (pertrans->sortstates[aggstate->current_set]) + tuplesort_end(pertrans->sortstates[aggstate->current_set]); + + + /* + * We use a plain Datum sorter when there's a single input column; + * otherwise sort the full tuple. (See comments for + * process_ordered_aggregate_single.) + */ + if (pertrans->numInputs == 1) + { + Form_pg_attribute attr = TupleDescAttr(pertrans->sortdesc, 0); + + pertrans->sortstates[aggstate->current_set] = + tuplesort_begin_datum(attr->atttypid, + pertrans->sortOperators[0], + pertrans->sortCollations[0], + pertrans->sortNullsFirst[0], + work_mem, NULL, false); + } + else + pertrans->sortstates[aggstate->current_set] = + tuplesort_begin_heap(pertrans->sortdesc, + pertrans->numSortCols, + pertrans->sortColIdx, + pertrans->sortOperators, + pertrans->sortCollations, + pertrans->sortNullsFirst, + work_mem, NULL, false); + } + + /* + * (Re)set transValue to the initial value. + * + * Note that when the initial value is pass-by-ref, we must copy it (into + * the aggcontext) since we will pfree the transValue later. + */ + if (pertrans->initValueIsNull) + pergroupstate->transValue = pertrans->initValue; + else + { + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory); + pergroupstate->transValue = datumCopy(pertrans->initValue, + pertrans->transtypeByVal, + pertrans->transtypeLen); + MemoryContextSwitchTo(oldContext); + } + pergroupstate->transValueIsNull = pertrans->initValueIsNull; + + /* + * If the initial value for the transition state doesn't exist in the + * pg_aggregate table then we will let the first non-NULL value returned + * from the outer procNode become the initial value. (This is useful for + * aggregates like max() and min().) The noTransValue flag signals that we + * still need to do this. + */ + pergroupstate->noTransValue = pertrans->initValueIsNull; +} + +/* + * Initialize all aggregate transition states for a new group of input values. + * + * If there are multiple grouping sets, we initialize only the first numReset + * of them (the grouping sets are ordered so that the most specific one, which + * is reset most often, is first). As a convenience, if numReset is 0, we + * reinitialize all sets. 
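+ *
+ * Illustrative example (not tied to any particular caller): for
+ * ROLLUP (a, b) the sets are ordered ((a, b), (a), ()); at a group
+ * boundary where only column b changed, numReset would be 1, so only the
+ * (a, b) transition states are reinitialized while the (a) and grand-total
+ * states keep accumulating.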
+ * + * NB: This cannot be used for hash aggregates, as for those the grouping set + * number has to be specified from further up. + * + * When called, CurrentMemoryContext should be the per-query context. + */ +static void +initialize_aggregates(AggState *aggstate, + AggStatePerGroup *pergroups, + int numReset) +{ + int transno; + int numGroupingSets = Max(aggstate->phase->numsets, 1); + int setno = 0; + int numTrans = aggstate->numtrans; + AggStatePerTrans transstates = aggstate->pertrans; + + if (numReset == 0) + numReset = numGroupingSets; + + for (setno = 0; setno < numReset; setno++) + { + AggStatePerGroup pergroup = pergroups[setno]; + + select_current_set(aggstate, setno, false); + + for (transno = 0; transno < numTrans; transno++) + { + AggStatePerTrans pertrans = &transstates[transno]; + AggStatePerGroup pergroupstate = &pergroup[transno]; + + initialize_aggregate(aggstate, pertrans, pergroupstate); + } + } +} + +/* + * Given new input value(s), advance the transition function of one aggregate + * state within one grouping set only (already set in aggstate->current_set) + * + * The new values (and null flags) have been preloaded into argument positions + * 1 and up in pertrans->transfn_fcinfo, so that we needn't copy them again to + * pass to the transition function. We also expect that the static fields of + * the fcinfo are already initialized; that was done by ExecInitAgg(). + * + * It doesn't matter which memory context this is called in. + */ +static void +advance_transition_function(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + MemoryContext oldContext; + Datum newVal; + + if (pertrans->transfn.fn_strict) + { + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. + */ + int numTransInputs = pertrans->numTransInputs; + int i; + + for (i = 1; i <= numTransInputs; i++) + { + if (fcinfo->args[i].isnull) + return; + } + if (pergroupstate->noTransValue) + { + /* + * transValue has not been initialized. This is the first non-NULL + * input value. We use it as the initial value for transValue. (We + * already checked that the agg's input type is binary-compatible + * with its transtype, so straight copy here is OK.) + * + * We must copy the datum into aggcontext if it is pass-by-ref. We + * do not need to pfree the old transValue, since it's NULL. + */ + oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory); + pergroupstate->transValue = datumCopy(fcinfo->args[1].value, + pertrans->transtypeByVal, + pertrans->transtypeLen); + pergroupstate->transValueIsNull = false; + pergroupstate->noTransValue = false; + MemoryContextSwitchTo(oldContext); + return; + } + if (pergroupstate->transValueIsNull) + { + /* + * Don't call a strict function with NULL inputs. Note it is + * possible to get here despite the above tests, if the transfn is + * strict *and* returned a NULL on a prior cycle. If that happens + * we will propagate the NULL all the way to the end. 
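+ *
+ * (As an illustration: if a strict transfn ever returns NULL, say for a
+ * hypothetical aggregate that reports overflow that way, every later
+ * input value of the group ends up here and is ignored, so the group's
+ * final result is NULL.)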
+ */ + return; + } + } + + /* We run the transition functions in per-input-tuple memory context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + /* set up aggstate->curpertrans for AggGetAggref() */ + aggstate->curpertrans = pertrans; + + /* + * OK to call the transition function + */ + fcinfo->args[0].value = pergroupstate->transValue; + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + newVal = FunctionCallInvoke(fcinfo); + + aggstate->curpertrans = NULL; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. Also, if transfn returned a + * pointer to a R/W expanded object that is already a child of the + * aggcontext, assume we can adopt that value without copying it. + * + * It's safe to compare newVal with pergroup->transValue without regard + * for either being NULL, because ExecAggTransReparent() takes care to set + * transValue to 0 when NULL. Otherwise we could end up accidentally not + * reparenting, when the transValue has the same numerical value as + * newValue, despite being NULL. This is a somewhat hot path, making it + * undesirable to instead solve this with another branch for the common + * case of the transition function returning its (modified) input + * argument. + */ + if (!pertrans->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue)) + newVal = ExecAggTransReparent(aggstate, pertrans, + newVal, fcinfo->isnull, + pergroupstate->transValue, + pergroupstate->transValueIsNull); + + pergroupstate->transValue = newVal; + pergroupstate->transValueIsNull = fcinfo->isnull; + + MemoryContextSwitchTo(oldContext); +} + +/* + * Advance each aggregate transition state for one input tuple. The input + * tuple has been stored in tmpcontext->ecxt_outertuple, so that it is + * accessible to ExecEvalExpr. + * + * We have two sets of transition states to handle: one for sorted aggregation + * and one for hashed; we do them both here, to avoid multiple evaluation of + * the inputs. + * + * When called, CurrentMemoryContext should be the per-query context. + */ +static void +advance_aggregates(AggState *aggstate) +{ + bool dummynull; + + ExecEvalExprSwitchContext(aggstate->phase->evaltrans, + aggstate->tmpcontext, + &dummynull); +} + +/* + * Run the transition function for a DISTINCT or ORDER BY aggregate + * with only one input. This is called after we have completed + * entering all the input values into the sort object. We complete the + * sort, read out the values in sorted order, and run the transition + * function on each value (applying DISTINCT if appropriate). + * + * Note that the strictness of the transition function was checked when + * entering the values into the sort, so we don't check it again here; + * we just apply standard SQL DISTINCT logic. + * + * The one-input case is handled separately from the multi-input case + * for performance reasons: for single by-value inputs, such as the + * common case of count(distinct id), the tuplesort_getdatum code path + * is around 300% faster. (The speedup for by-reference types is less + * but still noticeable.) + * + * This function handles only one grouping set (already set in + * aggstate->current_set). + * + * When called, CurrentMemoryContext should be the per-query context. 
+ */ +static void +process_ordered_aggregate_single(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + Datum oldVal = (Datum) 0; + bool oldIsNull = true; + bool haveOldVal = false; + MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory; + MemoryContext oldContext; + bool isDistinct = (pertrans->numDistinctCols > 0); + Datum newAbbrevVal = (Datum) 0; + Datum oldAbbrevVal = (Datum) 0; + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + Datum *newVal; + bool *isNull; + + Assert(pertrans->numDistinctCols < 2); + + tuplesort_performsort(pertrans->sortstates[aggstate->current_set]); + + /* Load the column into argument 1 (arg 0 will be transition value) */ + newVal = &fcinfo->args[1].value; + isNull = &fcinfo->args[1].isnull; + + /* + * Note: if input type is pass-by-ref, the datums returned by the sort are + * freshly palloc'd in the per-query context, so we must be careful to + * pfree them when they are no longer needed. + */ + + while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set], + true, newVal, isNull, &newAbbrevVal)) + { + /* + * Clear and select the working context for evaluation of the equality + * function and transition function. + */ + MemoryContextReset(workcontext); + oldContext = MemoryContextSwitchTo(workcontext); + + /* + * If DISTINCT mode, and not distinct from prior, skip it. + */ + if (isDistinct && + haveOldVal && + ((oldIsNull && *isNull) || + (!oldIsNull && !*isNull && + oldAbbrevVal == newAbbrevVal && + DatumGetBool(FunctionCall2Coll(&pertrans->equalfnOne, + pertrans->aggCollation, + oldVal, *newVal))))) + { + /* equal to prior, so forget this one */ + if (!pertrans->inputtypeByVal && !*isNull) + pfree(DatumGetPointer(*newVal)); + } + else + { + advance_transition_function(aggstate, pertrans, pergroupstate); + /* forget the old value, if any */ + if (!oldIsNull && !pertrans->inputtypeByVal) + pfree(DatumGetPointer(oldVal)); + /* and remember the new one for subsequent equality checks */ + oldVal = *newVal; + oldAbbrevVal = newAbbrevVal; + oldIsNull = *isNull; + haveOldVal = true; + } + + MemoryContextSwitchTo(oldContext); + } + + if (!oldIsNull && !pertrans->inputtypeByVal) + pfree(DatumGetPointer(oldVal)); + + tuplesort_end(pertrans->sortstates[aggstate->current_set]); + pertrans->sortstates[aggstate->current_set] = NULL; +} + +/* + * Run the transition function for a DISTINCT or ORDER BY aggregate + * with more than one input. This is called after we have completed + * entering all the input values into the sort object. We complete the + * sort, read out the values in sorted order, and run the transition + * function on each value (applying DISTINCT if appropriate). + * + * This function handles only one grouping set (already set in + * aggstate->current_set). + * + * When called, CurrentMemoryContext should be the per-query context. 
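+ *
+ * For illustration (hypothetical usage): array_agg(x ORDER BY y) has two
+ * input columns (x and y) and therefore comes through here rather than
+ * through the single-datum path, even though only the first
+ * numTransInputs columns (just x in this case) are passed on to the
+ * transition function; the trailing sort column is used only for ordering.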
+ */ +static void +process_ordered_aggregate_multi(AggState *aggstate, + AggStatePerTrans pertrans, + AggStatePerGroup pergroupstate) +{ + ExprContext *tmpcontext = aggstate->tmpcontext; + FunctionCallInfo fcinfo = pertrans->transfn_fcinfo; + TupleTableSlot *slot1 = pertrans->sortslot; + TupleTableSlot *slot2 = pertrans->uniqslot; + int numTransInputs = pertrans->numTransInputs; + int numDistinctCols = pertrans->numDistinctCols; + Datum newAbbrevVal = (Datum) 0; + Datum oldAbbrevVal = (Datum) 0; + bool haveOldValue = false; + TupleTableSlot *save = aggstate->tmpcontext->ecxt_outertuple; + int i; + + tuplesort_performsort(pertrans->sortstates[aggstate->current_set]); + + ExecClearTuple(slot1); + if (slot2) + ExecClearTuple(slot2); + + while (tuplesort_gettupleslot(pertrans->sortstates[aggstate->current_set], + true, true, slot1, &newAbbrevVal)) + { + CHECK_FOR_INTERRUPTS(); + + tmpcontext->ecxt_outertuple = slot1; + tmpcontext->ecxt_innertuple = slot2; + + if (numDistinctCols == 0 || + !haveOldValue || + newAbbrevVal != oldAbbrevVal || + !ExecQual(pertrans->equalfnMulti, tmpcontext)) + { + /* + * Extract the first numTransInputs columns as datums to pass to + * the transfn. + */ + slot_getsomeattrs(slot1, numTransInputs); + + /* Load values into fcinfo */ + /* Start from 1, since the 0th arg will be the transition value */ + for (i = 0; i < numTransInputs; i++) + { + fcinfo->args[i + 1].value = slot1->tts_values[i]; + fcinfo->args[i + 1].isnull = slot1->tts_isnull[i]; + } + + advance_transition_function(aggstate, pertrans, pergroupstate); + + if (numDistinctCols > 0) + { + /* swap the slot pointers to retain the current tuple */ + TupleTableSlot *tmpslot = slot2; + + slot2 = slot1; + slot1 = tmpslot; + /* avoid ExecQual() calls by reusing abbreviated keys */ + oldAbbrevVal = newAbbrevVal; + haveOldValue = true; + } + } + + /* Reset context each time */ + ResetExprContext(tmpcontext); + + ExecClearTuple(slot1); + } + + if (slot2) + ExecClearTuple(slot2); + + tuplesort_end(pertrans->sortstates[aggstate->current_set]); + pertrans->sortstates[aggstate->current_set] = NULL; + + /* restore previous slot, potentially in use for grouping sets */ + tmpcontext->ecxt_outertuple = save; +} + +/* + * Compute the final value of one aggregate. + * + * This function handles only one grouping set (already set in + * aggstate->current_set). + * + * The finalfn will be run, and the result delivered, in the + * output-tuple context; caller's CurrentMemoryContext does not matter. + * + * The finalfn uses the state as set in the transno. This also might be + * being used by another aggregate function, so it's important that we do + * nothing destructive here. + */ +static void +finalize_aggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + bool anynull = false; + MemoryContext oldContext; + int i; + ListCell *lc; + AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno]; + + oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * Evaluate any direct arguments. We do this even if there's no finalfn + * (which is unlikely anyway), so that side-effects happen as expected. + * The direct arguments go into arg positions 1 and up, leaving position 0 + * for the transition state value. 
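+ *
+ * (Illustrative example: for percentile_cont(0.5) WITHIN GROUP (ORDER BY x)
+ * the constant 0.5 is a direct argument, so the finalfn ends up seeing the
+ * transition state in args[0] and 0.5 in args[1].)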
+ */ + i = 1; + foreach(lc, peragg->aggdirectargs) + { + ExprState *expr = (ExprState *) lfirst(lc); + + fcinfo->args[i].value = ExecEvalExpr(expr, + aggstate->ss.ps.ps_ExprContext, + &fcinfo->args[i].isnull); + anynull |= fcinfo->args[i].isnull; + i++; + } + + /* + * Apply the agg's finalfn if one is provided, else return transValue. + */ + if (OidIsValid(peragg->finalfn_oid)) + { + int numFinalArgs = peragg->numFinalArgs; + + /* set up aggstate->curperagg for AggGetAggref() */ + aggstate->curperagg = peragg; + + InitFunctionCallInfoData(*fcinfo, &peragg->finalfn, + numFinalArgs, + pertrans->aggCollation, + (void *) aggstate, NULL); + + /* Fill in the transition state value */ + fcinfo->args[0].value = + MakeExpandedObjectReadOnly(pergroupstate->transValue, + pergroupstate->transValueIsNull, + pertrans->transtypeLen); + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + anynull |= pergroupstate->transValueIsNull; + + /* Fill any remaining argument positions with nulls */ + for (; i < numFinalArgs; i++) + { + fcinfo->args[i].value = (Datum) 0; + fcinfo->args[i].isnull = true; + anynull = true; + } + + if (fcinfo->flinfo->fn_strict && anynull) + { + /* don't call a strict function with NULL inputs */ + *resultVal = (Datum) 0; + *resultIsNull = true; + } + else + { + *resultVal = FunctionCallInvoke(fcinfo); + *resultIsNull = fcinfo->isnull; + } + aggstate->curperagg = NULL; + } + else + { + /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */ + *resultVal = pergroupstate->transValue; + *resultIsNull = pergroupstate->transValueIsNull; + } + + /* + * If result is pass-by-ref, make sure it is in the right context. + */ + if (!peragg->resulttypeByVal && !*resultIsNull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*resultVal))) + *resultVal = datumCopy(*resultVal, + peragg->resulttypeByVal, + peragg->resulttypeLen); + + MemoryContextSwitchTo(oldContext); +} + +/* + * Compute the output value of one partial aggregate. + * + * The serialization function will be run, and the result delivered, in the + * output-tuple context; caller's CurrentMemoryContext does not matter. + */ +static void +finalize_partialaggregate(AggState *aggstate, + AggStatePerAgg peragg, + AggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull) +{ + AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno]; + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * serialfn_oid will be set if we must serialize the transvalue before + * returning it + */ + if (OidIsValid(pertrans->serialfn_oid)) + { + /* Don't call a strict serialization function with NULL input. */ + if (pertrans->serialfn.fn_strict && pergroupstate->transValueIsNull) + { + *resultVal = (Datum) 0; + *resultIsNull = true; + } + else + { + FunctionCallInfo fcinfo = pertrans->serialfn_fcinfo; + + fcinfo->args[0].value = + MakeExpandedObjectReadOnly(pergroupstate->transValue, + pergroupstate->transValueIsNull, + pertrans->transtypeLen); + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + fcinfo->isnull = false; + + *resultVal = FunctionCallInvoke(fcinfo); + *resultIsNull = fcinfo->isnull; + } + } + else + { + /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */ + *resultVal = pergroupstate->transValue; + *resultIsNull = pergroupstate->transValueIsNull; + } + + /* If result is pass-by-ref, make sure it is in the right context. 
*/ + if (!peragg->resulttypeByVal && !*resultIsNull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*resultVal))) + *resultVal = datumCopy(*resultVal, + peragg->resulttypeByVal, + peragg->resulttypeLen); + + MemoryContextSwitchTo(oldContext); +} + +/* + * Extract the attributes that make up the grouping key into the + * hashslot. This is necessary to compute the hash or perform a lookup. + */ +static inline void +prepare_hash_slot(AggStatePerHash perhash, + TupleTableSlot *inputslot, + TupleTableSlot *hashslot) +{ + int i; + + /* transfer just the needed columns into hashslot */ + slot_getsomeattrs(inputslot, perhash->largestGrpColIdx); + ExecClearTuple(hashslot); + + for (i = 0; i < perhash->numhashGrpCols; i++) + { + int varNumber = perhash->hashGrpColIdxInput[i] - 1; + + hashslot->tts_values[i] = inputslot->tts_values[varNumber]; + hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber]; + } + ExecStoreVirtualTuple(hashslot); +} + +/* + * Prepare to finalize and project based on the specified representative tuple + * slot and grouping set. + * + * In the specified tuple slot, force to null all attributes that should be + * read as null in the context of the current grouping set. Also stash the + * current group bitmap where GroupingExpr can get at it. + * + * This relies on three conditions: + * + * 1) Nothing is ever going to try and extract the whole tuple from this slot, + * only reference it in evaluations, which will only access individual + * attributes. + * + * 2) No system columns are going to need to be nulled. (If a system column is + * referenced in a group clause, it is actually projected in the outer plan + * tlist.) + * + * 3) Within a given phase, we never need to recover the value of an attribute + * once it has been set to null. + * + * Poking into the slot this way is a bit ugly, but the consensus is that the + * alternative was worse. + */ +static void +prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet) +{ + if (aggstate->phase->grouped_cols) + { + Bitmapset *grouped_cols = aggstate->phase->grouped_cols[currentSet]; + + aggstate->grouped_cols = grouped_cols; + + if (TTS_EMPTY(slot)) + { + /* + * Force all values to be NULL if working on an empty input tuple + * (i.e. an empty grouping set for which no input rows were + * supplied). + */ + ExecStoreAllNullTuple(slot); + } + else if (aggstate->all_grouped_cols) + { + ListCell *lc; + + /* all_grouped_cols is arranged in desc order */ + slot_getsomeattrs(slot, linitial_int(aggstate->all_grouped_cols)); + + foreach(lc, aggstate->all_grouped_cols) + { + int attnum = lfirst_int(lc); + + if (!bms_is_member(attnum, grouped_cols)) + slot->tts_isnull[attnum - 1] = true; + } + } + } +} + +/* + * Compute the final value of all aggregates for one group. + * + * This function handles only one grouping set at a time, which the caller must + * have selected. It's also the caller's responsibility to adjust the supplied + * pergroup parameter to point to the current set's transvalues. + * + * Results are stored in the output econtext aggvalues/aggnulls. + */ +static void +finalize_aggregates(AggState *aggstate, + AggStatePerAgg peraggs, + AggStatePerGroup pergroup) +{ + ExprContext *econtext = aggstate->ss.ps.ps_ExprContext; + Datum *aggvalues = econtext->ecxt_aggvalues; + bool *aggnulls = econtext->ecxt_aggnulls; + int aggno; + int transno; + + /* + * If there were any DISTINCT and/or ORDER BY aggregates, sort their + * inputs and run the transition functions. 
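+ *
+ * (Note on the two loops below, offered as an illustration: several
+ * aggregates can share one transition state when they agree on the
+ * transition function and inputs, e.g. avg(x) and sum(x) on numeric
+ * typically share a state; that is why transition-level work is keyed by
+ * transno while the final functions run once per aggno.)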
+ */ + for (transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + AggStatePerGroup pergroupstate; + + pergroupstate = &pergroup[transno]; + + if (pertrans->numSortCols > 0) + { + Assert(aggstate->aggstrategy != AGG_HASHED && + aggstate->aggstrategy != AGG_MIXED); + + if (pertrans->numInputs == 1) + process_ordered_aggregate_single(aggstate, + pertrans, + pergroupstate); + else + process_ordered_aggregate_multi(aggstate, + pertrans, + pergroupstate); + } + } + + /* + * Run the final functions. + */ + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + AggStatePerAgg peragg = &peraggs[aggno]; + int transno = peragg->transno; + AggStatePerGroup pergroupstate; + + pergroupstate = &pergroup[transno]; + + if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)) + finalize_partialaggregate(aggstate, peragg, pergroupstate, + &aggvalues[aggno], &aggnulls[aggno]); + else + finalize_aggregate(aggstate, peragg, pergroupstate, + &aggvalues[aggno], &aggnulls[aggno]); + } +} + +/* + * Project the result of a group (whose aggs have already been calculated by + * finalize_aggregates). Returns the result slot, or NULL if no row is + * projected (suppressed by qual). + */ +static TupleTableSlot * +project_aggregates(AggState *aggstate) +{ + ExprContext *econtext = aggstate->ss.ps.ps_ExprContext; + + /* + * Check the qual (HAVING clause); if the group does not match, ignore it. + */ + if (ExecQual(aggstate->ss.ps.qual, econtext)) + { + /* + * Form and return projection tuple using the aggregate results and + * the representative input tuple. + */ + return ExecProject(aggstate->ss.ps.ps_ProjInfo); + } + else + InstrCountFiltered1(aggstate, 1); + + return NULL; +} + +/* + * Find input-tuple columns that are needed, dividing them into + * aggregated and unaggregated sets. + */ +static void +find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated) +{ + Agg *agg = (Agg *) aggstate->ss.ps.plan; + FindColsContext context; + + context.is_aggref = false; + context.aggregated = NULL; + context.unaggregated = NULL; + + /* Examine tlist and quals */ + (void) find_cols_walker((Node *) agg->plan.targetlist, &context); + (void) find_cols_walker((Node *) agg->plan.qual, &context); + + /* In some cases, grouping columns will not appear in the tlist */ + for (int i = 0; i < agg->numCols; i++) + context.unaggregated = bms_add_member(context.unaggregated, + agg->grpColIdx[i]); + + *aggregated = context.aggregated; + *unaggregated = context.unaggregated; +} + +static bool +find_cols_walker(Node *node, FindColsContext *context) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + /* setrefs.c should have set the varno to OUTER_VAR */ + Assert(var->varno == OUTER_VAR); + Assert(var->varlevelsup == 0); + if (context->is_aggref) + context->aggregated = bms_add_member(context->aggregated, + var->varattno); + else + context->unaggregated = bms_add_member(context->unaggregated, + var->varattno); + return false; + } + if (IsA(node, Aggref)) + { + Assert(!context->is_aggref); + context->is_aggref = true; + expression_tree_walker(node, find_cols_walker, (void *) context); + context->is_aggref = false; + return false; + } + return expression_tree_walker(node, find_cols_walker, + (void *) context); +} + +/* + * (Re-)initialize the hash table(s) to empty. 
+ * + * To implement hashed aggregation, we need a hashtable that stores a + * representative tuple and an array of AggStatePerGroup structs for each + * distinct set of GROUP BY column values. We compute the hash key from the + * GROUP BY columns. The per-group data is allocated in lookup_hash_entry(), + * for each entry. + * + * We have a separate hashtable and associated perhash data structure for each + * grouping set for which we're doing hashing. + * + * The contents of the hash tables always live in the hashcontext's per-tuple + * memory context (there is only one of these for all tables together, since + * they are all reset at the same time). + */ +static void +build_hash_tables(AggState *aggstate) +{ + int setno; + + for (setno = 0; setno < aggstate->num_hashes; ++setno) + { + AggStatePerHash perhash = &aggstate->perhash[setno]; + long nbuckets; + Size memory; + + if (perhash->hashtable != NULL) + { + ResetTupleHashTable(perhash->hashtable); + continue; + } + + Assert(perhash->aggnode->numGroups > 0); + + memory = aggstate->hash_mem_limit / aggstate->num_hashes; + + /* choose reasonable number of buckets per hashtable */ + nbuckets = hash_choose_num_buckets(aggstate->hashentrysize, + perhash->aggnode->numGroups, + memory); + + build_hash_table(aggstate, setno, nbuckets); + } + + aggstate->hash_ngroups_current = 0; +} + +/* + * Build a single hashtable for this grouping set. + */ +static void +build_hash_table(AggState *aggstate, int setno, long nbuckets) +{ + AggStatePerHash perhash = &aggstate->perhash[setno]; + MemoryContext metacxt = aggstate->hash_metacxt; + MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory; + MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory; + Size additionalsize; + + Assert(aggstate->aggstrategy == AGG_HASHED || + aggstate->aggstrategy == AGG_MIXED); + + /* + * Used to make sure initial hash table allocation does not exceed + * hash_mem. Note that the estimate does not include space for + * pass-by-reference transition data values, nor for the representative + * tuple of each group. + */ + additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData); + + perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps, + perhash->hashslot->tts_tupleDescriptor, + perhash->numCols, + perhash->hashGrpColIdxHash, + perhash->eqfuncoids, + perhash->hashfunctions, + perhash->aggnode->grpCollations, + nbuckets, + additionalsize, + metacxt, + hashcxt, + tmpcxt, + DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); +} + +/* + * Compute columns that actually need to be stored in hashtable entries. The + * incoming tuples from the child plan node will contain grouping columns, + * other columns referenced in our targetlist and qual, columns used to + * compute the aggregate functions, and perhaps just junk columns we don't use + * at all. Only columns of the first two types need to be stored in the + * hashtable, and getting rid of the others can make the table entries + * significantly smaller. The hashtable only contains the relevant columns, + * and is packed/unpacked in lookup_hash_entry() / agg_retrieve_hash_table() + * into the format of the normal input descriptor. + * + * Additional columns, in addition to the columns grouped by, come from two + * sources: Firstly functionally dependent columns that we don't need to group + * by themselves, and secondly ctids for row-marks. + * + * To eliminate duplicates, we build a bitmapset of the needed columns, and + * then build an array of the columns included in the hashtable. 
We might + * still have duplicates if the passed-in grpColIdx has them, which can happen + * in edge cases from semijoins/distinct; these can't always be removed, + * because it's not certain that the duplicate cols will be using the same + * hash function. + * + * Note that the array is preserved over ExecReScanAgg, so we allocate it in + * the per-query context (unlike the hash table itself). + */ +static void +find_hash_columns(AggState *aggstate) +{ + Bitmapset *base_colnos; + Bitmapset *aggregated_colnos; + TupleDesc scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + List *outerTlist = outerPlanState(aggstate)->plan->targetlist; + int numHashes = aggstate->num_hashes; + EState *estate = aggstate->ss.ps.state; + int j; + + /* Find Vars that will be needed in tlist and qual */ + find_cols(aggstate, &aggregated_colnos, &base_colnos); + aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos); + aggstate->max_colno_needed = 0; + aggstate->all_cols_needed = true; + + for (int i = 0; i < scanDesc->natts; i++) + { + int colno = i + 1; + + if (bms_is_member(colno, aggstate->colnos_needed)) + aggstate->max_colno_needed = colno; + else + aggstate->all_cols_needed = false; + } + + for (j = 0; j < numHashes; ++j) + { + AggStatePerHash perhash = &aggstate->perhash[j]; + Bitmapset *colnos = bms_copy(base_colnos); + AttrNumber *grpColIdx = perhash->aggnode->grpColIdx; + List *hashTlist = NIL; + TupleDesc hashDesc; + int maxCols; + int i; + + perhash->largestGrpColIdx = 0; + + /* + * If we're doing grouping sets, then some Vars might be referenced in + * tlist/qual for the benefit of other grouping sets, but not needed + * when hashing; i.e. prepare_projection_slot will null them out, so + * there'd be no point storing them. Use prepare_projection_slot's + * logic to determine which. + */ + if (aggstate->phases[0].grouped_cols) + { + Bitmapset *grouped_cols = aggstate->phases[0].grouped_cols[j]; + ListCell *lc; + + foreach(lc, aggstate->all_grouped_cols) + { + int attnum = lfirst_int(lc); + + if (!bms_is_member(attnum, grouped_cols)) + colnos = bms_del_member(colnos, attnum); + } + } + + /* + * Compute maximum number of input columns accounting for possible + * duplications in the grpColIdx array, which can happen in some edge + * cases where HashAggregate was generated as part of a semijoin or a + * DISTINCT. + */ + maxCols = bms_num_members(colnos) + perhash->numCols; + + perhash->hashGrpColIdxInput = + palloc(maxCols * sizeof(AttrNumber)); + perhash->hashGrpColIdxHash = + palloc(perhash->numCols * sizeof(AttrNumber)); + + /* Add all the grouping columns to colnos */ + for (i = 0; i < perhash->numCols; i++) + colnos = bms_add_member(colnos, grpColIdx[i]); + + /* + * First build mapping for columns directly hashed. These are the + * first, because they'll be accessed when computing hash values and + * comparing tuples for exact matches. We also build simple mapping + * for execGrouping, so it knows where to find the to-be-hashed / + * compared columns in the input. 
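+ *
+ * Worked example (illustrative numbers only): with grpColIdx = {3, 1} and
+ * one extra needed column 5 left in colnos, the loops below produce
+ * hashGrpColIdxInput = {3, 1, 5} (positions in the input tuple) and
+ * hashGrpColIdxHash = {1, 2} (positions of the grouping columns within the
+ * narrower hashtable tuple).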
+ */ + for (i = 0; i < perhash->numCols; i++) + { + perhash->hashGrpColIdxInput[i] = grpColIdx[i]; + perhash->hashGrpColIdxHash[i] = i + 1; + perhash->numhashGrpCols++; + /* delete already mapped columns */ + bms_del_member(colnos, grpColIdx[i]); + } + + /* and add the remaining columns */ + while ((i = bms_first_member(colnos)) >= 0) + { + perhash->hashGrpColIdxInput[perhash->numhashGrpCols] = i; + perhash->numhashGrpCols++; + } + + /* and build a tuple descriptor for the hashtable */ + for (i = 0; i < perhash->numhashGrpCols; i++) + { + int varNumber = perhash->hashGrpColIdxInput[i] - 1; + + hashTlist = lappend(hashTlist, list_nth(outerTlist, varNumber)); + perhash->largestGrpColIdx = + Max(varNumber + 1, perhash->largestGrpColIdx); + } + + hashDesc = ExecTypeFromTL(hashTlist); + + execTuplesHashPrepare(perhash->numCols, + perhash->aggnode->grpOperators, + &perhash->eqfuncoids, + &perhash->hashfunctions); + perhash->hashslot = + ExecAllocTableSlot(&estate->es_tupleTable, hashDesc, + &TTSOpsMinimalTuple); + + list_free(hashTlist); + bms_free(colnos); + } + + bms_free(base_colnos); +} + +/* + * Estimate per-hash-table-entry overhead. + */ +Size +hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) +{ + Size tupleChunkSize; + Size pergroupChunkSize; + Size transitionChunkSize; + Size tupleSize = (MAXALIGN(SizeofMinimalTupleHeader) + + tupleWidth); + Size pergroupSize = numTrans * sizeof(AggStatePerGroupData); + + tupleChunkSize = CHUNKHDRSZ + tupleSize; + + if (pergroupSize > 0) + pergroupChunkSize = CHUNKHDRSZ + pergroupSize; + else + pergroupChunkSize = 0; + + if (transitionSpace > 0) + transitionChunkSize = CHUNKHDRSZ + transitionSpace; + else + transitionChunkSize = 0; + + return + sizeof(TupleHashEntryData) + + tupleChunkSize + + pergroupChunkSize + + transitionChunkSize; +} + +/* + * hashagg_recompile_expressions() + * + * Identifies the right phase, compiles the right expression given the + * arguments, and then sets phase->evalfunc to that expression. + * + * Different versions of the compiled expression are needed depending on + * whether hash aggregation has spilled or not, and whether it's reading from + * the outer plan or a tape. Before spilling to disk, the expression reads + * from the outer plan and does not need to perform a NULL check. After + * HashAgg begins to spill, new groups will not be created in the hash table, + * and the AggStatePerGroup array may be NULL; therefore we need to add a null + * pointer check to the expression. Then, when reading spilled data from a + * tape, we change the outer slot type to be a fixed minimal tuple slot. + * + * It would be wasteful to recompile every time, so cache the compiled + * expressions in the AggStatePerPhase, and reuse when appropriate. + */ +static void +hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck) +{ + AggStatePerPhase phase; + int i = minslot ? 1 : 0; + int j = nullcheck ? 
1 : 0; + + Assert(aggstate->aggstrategy == AGG_HASHED || + aggstate->aggstrategy == AGG_MIXED); + + if (aggstate->aggstrategy == AGG_HASHED) + phase = &aggstate->phases[0]; + else /* AGG_MIXED */ + phase = &aggstate->phases[1]; + + if (phase->evaltrans_cache[i][j] == NULL) + { + const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops; + bool outerfixed = aggstate->ss.ps.outeropsfixed; + bool dohash = true; + bool dosort = false; + + /* + * If minslot is true, that means we are processing a spilled batch + * (inside agg_refill_hash_table()), and we must not advance the + * sorted grouping sets. + */ + if (aggstate->aggstrategy == AGG_MIXED && !minslot) + dosort = true; + + /* temporarily change the outerops while compiling the expression */ + if (minslot) + { + aggstate->ss.ps.outerops = &TTSOpsMinimalTuple; + aggstate->ss.ps.outeropsfixed = true; + } + + phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase, + dosort, dohash, + nullcheck); + + /* change back */ + aggstate->ss.ps.outerops = outerops; + aggstate->ss.ps.outeropsfixed = outerfixed; + } + + phase->evaltrans = phase->evaltrans_cache[i][j]; +} + +/* + * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the + * number of partitions we expect to create (if we do spill). + * + * There are two limits: a memory limit, and also an ngroups limit. The + * ngroups limit becomes important when we expect transition values to grow + * substantially larger than the initial value. + */ +void +hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits, + Size *mem_limit, uint64 *ngroups_limit, + int *num_partitions) +{ + int npartitions; + Size partition_mem; + Size hash_mem_limit = get_hash_memory_limit(); + + /* if not expected to spill, use all of hash_mem */ + if (input_groups * hashentrysize <= hash_mem_limit) + { + if (num_partitions != NULL) + *num_partitions = 0; + *mem_limit = hash_mem_limit; + *ngroups_limit = hash_mem_limit / hashentrysize; + return; + } + + /* + * Calculate expected memory requirements for spilling, which is the size + * of the buffers needed for all the tapes that need to be open at once. + * Then, subtract that from the memory available for holding hash tables. + */ + npartitions = hash_choose_num_partitions(input_groups, + hashentrysize, + used_bits, + NULL); + if (num_partitions != NULL) + *num_partitions = npartitions; + + partition_mem = + HASHAGG_READ_BUFFER_SIZE + + HASHAGG_WRITE_BUFFER_SIZE * npartitions; + + /* + * Don't set the limit below 3/4 of hash_mem. In that case, we are at the + * minimum number of partitions, so we aren't going to dramatically exceed + * work mem anyway. + */ + if (hash_mem_limit > 4 * partition_mem) + *mem_limit = hash_mem_limit - partition_mem; + else + *mem_limit = hash_mem_limit * 0.75; + + if (*mem_limit > hashentrysize) + *ngroups_limit = *mem_limit / hashentrysize; + else + *ngroups_limit = 1; +} + +/* + * hash_agg_check_limits + * + * After adding a new group to the hash table, check whether we need to enter + * spill mode. Allocations may happen without adding new groups (for instance, + * if the transition state size grows), so this check is imperfect. 
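+ *
+ * Rough illustration (made-up numbers, ignoring the partition-buffer
+ * adjustment): with about 4MB of hash_mem and a 200-byte hashentrysize
+ * estimate, hash_agg_set_limits() above yields an ngroups limit on the
+ * order of 20000, and we flip into spill mode as soon as either that group
+ * count or the memory limit is crossed.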
+ */ +static void +hash_agg_check_limits(AggState *aggstate) +{ + uint64 ngroups = aggstate->hash_ngroups_current; + Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, + true); + Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, + true); + + /* + * Don't spill unless there's at least one group in the hash table so we + * can be sure to make progress even in edge cases. + */ + if (aggstate->hash_ngroups_current > 0 && + (meta_mem + hashkey_mem > aggstate->hash_mem_limit || + ngroups > aggstate->hash_ngroups_limit)) + { + hash_agg_enter_spill_mode(aggstate); + } +} + +/* + * Enter "spill mode", meaning that no new groups are added to any of the hash + * tables. Tuples that would create a new group are instead spilled, and + * processed later. + */ +static void +hash_agg_enter_spill_mode(AggState *aggstate) +{ + aggstate->hash_spill_mode = true; + hashagg_recompile_expressions(aggstate, aggstate->table_filled, true); + + if (!aggstate->hash_ever_spilled) + { + Assert(aggstate->hash_tapeinfo == NULL); + Assert(aggstate->hash_spills == NULL); + + aggstate->hash_ever_spilled = true; + + hashagg_tapeinfo_init(aggstate); + + aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes); + + for (int setno = 0; setno < aggstate->num_hashes; setno++) + { + AggStatePerHash perhash = &aggstate->perhash[setno]; + HashAggSpill *spill = &aggstate->hash_spills[setno]; + + hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0, + perhash->aggnode->numGroups, + aggstate->hashentrysize); + } + } +} + +/* + * Update metrics after filling the hash table. + * + * If reading from the outer plan, from_tape should be false; if reading from + * another tape, from_tape should be true. + */ +static void +hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) +{ + Size meta_mem; + Size hashkey_mem; + Size buffer_mem; + Size total_mem; + + if (aggstate->aggstrategy != AGG_MIXED && + aggstate->aggstrategy != AGG_HASHED) + return; + + /* memory for the hash table itself */ + meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); + + /* memory for the group keys and transition states */ + hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); + + /* memory for read/write tape buffers, if spilled */ + buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE; + if (from_tape) + buffer_mem += HASHAGG_READ_BUFFER_SIZE; + + /* update peak mem */ + total_mem = meta_mem + hashkey_mem + buffer_mem; + if (total_mem > aggstate->hash_mem_peak) + aggstate->hash_mem_peak = total_mem; + + /* update disk usage */ + if (aggstate->hash_tapeinfo != NULL) + { + uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeinfo->tapeset) * (BLCKSZ / 1024); + + if (aggstate->hash_disk_used < disk_used) + aggstate->hash_disk_used = disk_used; + } + + /* update hashentrysize estimate based on contents */ + if (aggstate->hash_ngroups_current > 0) + { + aggstate->hashentrysize = + sizeof(TupleHashEntryData) + + (hashkey_mem / (double) aggstate->hash_ngroups_current); + } +} + +/* + * Choose a reasonable number of buckets for the initial hash table size. + */ +static long +hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory) +{ + long max_nbuckets; + long nbuckets = ngroups; + + max_nbuckets = memory / hashentrysize; + + /* + * Underestimating is better than overestimating. Too many buckets crowd + * out space for group keys and transition state values. 
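+ *
+ * (Example with made-up numbers: memory = 1MB and hashentrysize = 256
+ * gives max_nbuckets = 4096, halved just below to 2048, so an ngroups
+ * estimate larger than that is clamped to 2048.)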
+ */ + max_nbuckets >>= 1; + + if (nbuckets > max_nbuckets) + nbuckets = max_nbuckets; + + return Max(nbuckets, 1); +} + +/* + * Determine the number of partitions to create when spilling, which will + * always be a power of two. If log2_npartitions is non-NULL, set + * *log2_npartitions to the log2() of the number of partitions. + */ +static int +hash_choose_num_partitions(double input_groups, double hashentrysize, + int used_bits, int *log2_npartitions) +{ + Size hash_mem_limit = get_hash_memory_limit(); + double partition_limit; + double mem_wanted; + double dpartitions; + int npartitions; + int partition_bits; + + /* + * Avoid creating so many partitions that the memory requirements of the + * open partition files are greater than 1/4 of hash_mem. + */ + partition_limit = + (hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) / + HASHAGG_WRITE_BUFFER_SIZE; + + mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize; + + /* make enough partitions so that each one is likely to fit in memory */ + dpartitions = 1 + (mem_wanted / hash_mem_limit); + + if (dpartitions > partition_limit) + dpartitions = partition_limit; + + if (dpartitions < HASHAGG_MIN_PARTITIONS) + dpartitions = HASHAGG_MIN_PARTITIONS; + if (dpartitions > HASHAGG_MAX_PARTITIONS) + dpartitions = HASHAGG_MAX_PARTITIONS; + + /* HASHAGG_MAX_PARTITIONS limit makes this safe */ + npartitions = (int) dpartitions; + + /* ceil(log2(npartitions)) */ + partition_bits = my_log2(npartitions); + + /* make sure that we don't exhaust the hash bits */ + if (partition_bits + used_bits >= 32) + partition_bits = 32 - used_bits; + + if (log2_npartitions != NULL) + *log2_npartitions = partition_bits; + + /* number of partitions will be a power of two */ + npartitions = 1 << partition_bits; + + return npartitions; +} + +/* + * Initialize a freshly-created TupleHashEntry. + */ +static void +initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable, + TupleHashEntry entry) +{ + AggStatePerGroup pergroup; + int transno; + + aggstate->hash_ngroups_current++; + hash_agg_check_limits(aggstate); + + /* no need to allocate or initialize per-group state */ + if (aggstate->numtrans == 0) + return; + + pergroup = (AggStatePerGroup) + MemoryContextAlloc(hashtable->tablecxt, + sizeof(AggStatePerGroupData) * aggstate->numtrans); + + entry->additional = pergroup; + + /* + * Initialize aggregates for new tuple group, lookup_hash_entries() + * already has selected the relevant grouping set. + */ + for (transno = 0; transno < aggstate->numtrans; transno++) + { + AggStatePerTrans pertrans = &aggstate->pertrans[transno]; + AggStatePerGroup pergroupstate = &pergroup[transno]; + + initialize_aggregate(aggstate, pertrans, pergroupstate); + } +} + +/* + * Look up hash entries for the current tuple in all hashed grouping sets. + * + * Be aware that lookup_hash_entry can reset the tmpcontext. + * + * Some entries may be left NULL if we are in "spill mode". The same tuple + * will belong to different groups for each grouping set, so may match a group + * already in memory for one set and match a group not in memory for another + * set. When in "spill mode", the tuple will be spilled for each grouping set + * where it doesn't match a group in memory. + * + * NB: It's possible to spill the same tuple for several different grouping + * sets. 
This may seem wasteful, but it's actually a trade-off: if we spill + * the tuple multiple times for multiple grouping sets, it can be partitioned + * for each grouping set, making the refilling of the hash table very + * efficient. + */ +static void +lookup_hash_entries(AggState *aggstate) +{ + AggStatePerGroup *pergroup = aggstate->hash_pergroup; + TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple; + int setno; + + for (setno = 0; setno < aggstate->num_hashes; setno++) + { + AggStatePerHash perhash = &aggstate->perhash[setno]; + TupleHashTable hashtable = perhash->hashtable; + TupleTableSlot *hashslot = perhash->hashslot; + TupleHashEntry entry; + uint32 hash; + bool isnew = false; + bool *p_isnew; + + /* if hash table already spilled, don't create new entries */ + p_isnew = aggstate->hash_spill_mode ? NULL : &isnew; + + select_current_set(aggstate, setno, true); + prepare_hash_slot(perhash, + outerslot, + hashslot); + + entry = LookupTupleHashEntry(hashtable, hashslot, + p_isnew, &hash); + + if (entry != NULL) + { + if (isnew) + initialize_hash_entry(aggstate, hashtable, entry); + pergroup[setno] = entry->additional; + } + else + { + HashAggSpill *spill = &aggstate->hash_spills[setno]; + TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple; + + if (spill->partitions == NULL) + hashagg_spill_init(spill, aggstate->hash_tapeinfo, 0, + perhash->aggnode->numGroups, + aggstate->hashentrysize); + + hashagg_spill_tuple(aggstate, spill, slot, hash); + pergroup[setno] = NULL; + } + } +} + +/* + * ExecAgg - + * + * ExecAgg receives tuples from its outer subplan and aggregates over + * the appropriate attribute for each aggregate function use (Aggref + * node) appearing in the targetlist or qual of the node. The number + * of tuples to aggregate over depends on whether grouped or plain + * aggregation is selected. In grouped aggregation, we produce a result + * row for each group; in plain aggregation there's a single result row + * for the whole query. In either case, the value of each aggregate is + * stored in the expression context to be used when ExecProject evaluates + * the result tuple. 
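The p_isnew pointer is how spill mode is communicated to the hash table: passing NULL turns LookupTupleHashEntry() into a pure lookup, so a tuple whose group is not already in memory comes back as NULL and is written to a spill partition instead. A toy illustration of that calling convention, assuming a made-up toy_lookup() rather than the executor's real API:

    /* Sketch of the "p_isnew == NULL means lookup-only" convention. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define NSLOTS 8

    static int  table[NSLOTS];      /* toy "hash table" of group keys */
    static bool used[NSLOTS];

    static int *
    toy_lookup(int key, bool *p_isnew)
    {
        int slot = key % NSLOTS;    /* toy hash, no collision handling */

        if (used[slot] && table[slot] == key)
            return &table[slot];
        if (p_isnew == NULL)
            return NULL;            /* lookup-only: caller must spill the tuple */
        used[slot] = true;
        table[slot] = key;
        *p_isnew = true;
        return &table[slot];
    }

    int
    main(void)
    {
        bool isnew = false;
        bool spill_mode = false;

        toy_lookup(42, spill_mode ? NULL : &isnew);     /* creates group 42 */
        spill_mode = true;
        if (toy_lookup(7, spill_mode ? NULL : &isnew) == NULL)
            printf("group 7 not in memory: spill the tuple\n");
        return 0;
    }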
+ */ +static TupleTableSlot * +ExecAgg(PlanState *pstate) +{ + AggState *node = castNode(AggState, pstate); + TupleTableSlot *result = NULL; + + CHECK_FOR_INTERRUPTS(); + + if (!node->agg_done) + { + /* Dispatch based on strategy */ + switch (node->phase->aggstrategy) + { + case AGG_HASHED: + if (!node->table_filled) + agg_fill_hash_table(node); + /* FALLTHROUGH */ + case AGG_MIXED: + result = agg_retrieve_hash_table(node); + break; + case AGG_PLAIN: + case AGG_SORTED: + result = agg_retrieve_direct(node); + break; + } + + if (!TupIsNull(result)) + return result; + } + + return NULL; +} + +/* + * ExecAgg for non-hashed case + */ +static TupleTableSlot * +agg_retrieve_direct(AggState *aggstate) +{ + Agg *node = aggstate->phase->aggnode; + ExprContext *econtext; + ExprContext *tmpcontext; + AggStatePerAgg peragg; + AggStatePerGroup *pergroups; + TupleTableSlot *outerslot; + TupleTableSlot *firstSlot; + TupleTableSlot *result; + bool hasGroupingSets = aggstate->phase->numsets > 0; + int numGroupingSets = Max(aggstate->phase->numsets, 1); + int currentSet; + int nextSetSize; + int numReset; + int i; + + /* + * get state info from node + * + * econtext is the per-output-tuple expression context + * + * tmpcontext is the per-input-tuple expression context + */ + econtext = aggstate->ss.ps.ps_ExprContext; + tmpcontext = aggstate->tmpcontext; + + peragg = aggstate->peragg; + pergroups = aggstate->pergroups; + firstSlot = aggstate->ss.ss_ScanTupleSlot; + + /* + * We loop retrieving groups until we find one matching + * aggstate->ss.ps.qual + * + * For grouping sets, we have the invariant that aggstate->projected_set + * is either -1 (initial call) or the index (starting from 0) in + * gset_lengths for the group we just completed (either by projecting a + * row or by discarding it in the qual). + */ + while (!aggstate->agg_done) + { + /* + * Clear the per-output-tuple context for each group, as well as + * aggcontext (which contains any pass-by-ref transvalues of the old + * group). Some aggregate functions store working state in child + * contexts; those now get reset automatically without us needing to + * do anything special. + * + * We use ReScanExprContext not just ResetExprContext because we want + * any registered shutdown callbacks to be called. That allows + * aggregate functions to ensure they've cleaned up any non-memory + * resources. + */ + ReScanExprContext(econtext); + + /* + * Determine how many grouping sets need to be reset at this boundary. + */ + if (aggstate->projected_set >= 0 && + aggstate->projected_set < numGroupingSets) + numReset = aggstate->projected_set + 1; + else + numReset = numGroupingSets; + + /* + * numReset can change on a phase boundary, but that's OK; we want to + * reset the contexts used in _this_ phase, and later, after possibly + * changing phase, initialize the right number of aggregates for the + * _new_ phase. + */ + + for (i = 0; i < numReset; i++) + { + ReScanExprContext(aggstate->aggcontexts[i]); + } + + /* + * Check if input is complete and there are no more groups to project + * in this phase; move to next phase or mark as done. 
+ */ + if (aggstate->input_done == true && + aggstate->projected_set >= (numGroupingSets - 1)) + { + if (aggstate->current_phase < aggstate->numphases - 1) + { + initialize_phase(aggstate, aggstate->current_phase + 1); + aggstate->input_done = false; + aggstate->projected_set = -1; + numGroupingSets = Max(aggstate->phase->numsets, 1); + node = aggstate->phase->aggnode; + numReset = numGroupingSets; + } + else if (aggstate->aggstrategy == AGG_MIXED) + { + /* + * Mixed mode; we've output all the grouped stuff and have + * full hashtables, so switch to outputting those. + */ + initialize_phase(aggstate, 0); + aggstate->table_filled = true; + ResetTupleHashIterator(aggstate->perhash[0].hashtable, + &aggstate->perhash[0].hashiter); + select_current_set(aggstate, 0, true); + return agg_retrieve_hash_table(aggstate); + } + else + { + aggstate->agg_done = true; + break; + } + } + + /* + * Get the number of columns in the next grouping set after the last + * projected one (if any). This is the number of columns to compare to + * see if we reached the boundary of that set too. + */ + if (aggstate->projected_set >= 0 && + aggstate->projected_set < (numGroupingSets - 1)) + nextSetSize = aggstate->phase->gset_lengths[aggstate->projected_set + 1]; + else + nextSetSize = 0; + + /*---------- + * If a subgroup for the current grouping set is present, project it. + * + * We have a new group if: + * - we're out of input but haven't projected all grouping sets + * (checked above) + * OR + * - we already projected a row that wasn't from the last grouping + * set + * AND + * - the next grouping set has at least one grouping column (since + * empty grouping sets project only once input is exhausted) + * AND + * - the previous and pending rows differ on the grouping columns + * of the next grouping set + *---------- + */ + tmpcontext->ecxt_innertuple = econtext->ecxt_outertuple; + if (aggstate->input_done || + (node->aggstrategy != AGG_PLAIN && + aggstate->projected_set != -1 && + aggstate->projected_set < (numGroupingSets - 1) && + nextSetSize > 0 && + !ExecQualAndReset(aggstate->phase->eqfunctions[nextSetSize - 1], + tmpcontext))) + { + aggstate->projected_set += 1; + + Assert(aggstate->projected_set < numGroupingSets); + Assert(nextSetSize > 0 || aggstate->input_done); + } + else + { + /* + * We no longer care what group we just projected, the next + * projection will always be the first (or only) grouping set + * (unless the input proves to be empty). + */ + aggstate->projected_set = 0; + + /* + * If we don't already have the first tuple of the new group, + * fetch it from the outer plan. + */ + if (aggstate->grp_firstTuple == NULL) + { + outerslot = fetch_input_tuple(aggstate); + if (!TupIsNull(outerslot)) + { + /* + * Make a copy of the first input tuple; we will use this + * for comparisons (in group mode) and for projection. + */ + aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + } + else + { + /* outer plan produced no tuples at all */ + if (hasGroupingSets) + { + /* + * If there was no input at all, we need to project + * rows only if there are grouping sets of size 0. + * Note that this implies that there can't be any + * references to ungrouped Vars, which would otherwise + * cause issues with the empty output slot. + * + * XXX: This is no longer true, we currently deal with + * this in finalize_aggregates(). 
+ */ + aggstate->input_done = true; + + while (aggstate->phase->gset_lengths[aggstate->projected_set] > 0) + { + aggstate->projected_set += 1; + if (aggstate->projected_set >= numGroupingSets) + { + /* + * We can't set agg_done here because we might + * have more phases to do, even though the + * input is empty. So we need to restart the + * whole outer loop. + */ + break; + } + } + + if (aggstate->projected_set >= numGroupingSets) + continue; + } + else + { + aggstate->agg_done = true; + /* If we are grouping, we should produce no tuples too */ + if (node->aggstrategy != AGG_PLAIN) + return NULL; + } + } + } + + /* + * Initialize working state for a new input tuple group. + */ + initialize_aggregates(aggstate, pergroups, numReset); + + if (aggstate->grp_firstTuple != NULL) + { + /* + * Store the copied first input tuple in the tuple table slot + * reserved for it. The tuple will be deleted when it is + * cleared from the slot. + */ + ExecForceStoreHeapTuple(aggstate->grp_firstTuple, + firstSlot, true); + aggstate->grp_firstTuple = NULL; /* don't keep two pointers */ + + /* set up for first advance_aggregates call */ + tmpcontext->ecxt_outertuple = firstSlot; + + /* + * Process each outer-plan tuple, and then fetch the next one, + * until we exhaust the outer plan or cross a group boundary. + */ + for (;;) + { + /* + * During phase 1 only of a mixed agg, we need to update + * hashtables as well in advance_aggregates. + */ + if (aggstate->aggstrategy == AGG_MIXED && + aggstate->current_phase == 1) + { + lookup_hash_entries(aggstate); + } + + /* Advance the aggregates (or combine functions) */ + advance_aggregates(aggstate); + + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(tmpcontext); + + outerslot = fetch_input_tuple(aggstate); + if (TupIsNull(outerslot)) + { + /* no more outer-plan tuples available */ + + /* if we built hash tables, finalize any spills */ + if (aggstate->aggstrategy == AGG_MIXED && + aggstate->current_phase == 1) + hashagg_finish_initial_spills(aggstate); + + if (hasGroupingSets) + { + aggstate->input_done = true; + break; + } + else + { + aggstate->agg_done = true; + break; + } + } + /* set up for next advance_aggregates call */ + tmpcontext->ecxt_outertuple = outerslot; + + /* + * If we are grouping, check whether we've crossed a group + * boundary. + */ + if (node->aggstrategy != AGG_PLAIN) + { + tmpcontext->ecxt_innertuple = firstSlot; + if (!ExecQual(aggstate->phase->eqfunctions[node->numCols - 1], + tmpcontext)) + { + aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + break; + } + } + } + } + + /* + * Use the representative input tuple for any references to + * non-aggregated input columns in aggregate direct args, the node + * qual, and the tlist. (If we are not grouping, and there are no + * input rows at all, we will come here with an empty firstSlot + * ... but if not grouping, there can't be any references to + * non-aggregated input columns, so no problem.) + */ + econtext->ecxt_outertuple = firstSlot; + } + + Assert(aggstate->projected_set >= 0); + + currentSet = aggstate->projected_set; + + prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet); + + select_current_set(aggstate, currentSet, false); + + finalize_aggregates(aggstate, + peragg, + pergroups[currentSet]); + + /* + * If there's no row to project right now, we must continue rather + * than returning a null since there might be more groups. 
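The inner loop above is the classic sorted-aggregation pattern: pin the first tuple of a group, advance the transition state while rows still match on the grouping columns, and finalize when the key changes or the input ends. A self-contained sketch of the same control flow over a plain array, with integer keys and a sum() stand-in for the transition machinery:

    /* Sketch of grouped aggregation over a pre-sorted input stream. */
    #include <stdio.h>

    typedef struct { int key; int val; } Row;

    int
    main(void)
    {
        Row rows[] = {{1, 10}, {1, 5}, {2, 7}, {2, 1}, {2, 2}};
        int nrows = sizeof(rows) / sizeof(rows[0]);
        int i = 0;

        while (i < nrows)
        {
            int  group_key = rows[i].key;   /* the "first tuple" of the group */
            long state = 0;                 /* transition state, reset per group */

            /* advance the aggregate until the grouping key changes */
            while (i < nrows && rows[i].key == group_key)
                state += rows[i++].val;

            printf("key=%d sum=%ld\n", group_key, state);   /* project the group */
        }
        return 0;
    }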
+ */ + result = project_aggregates(aggstate); + if (result) + return result; + } + + /* No more groups */ + return NULL; +} + +/* + * ExecAgg for hashed case: read input and build hash table + */ +static void +agg_fill_hash_table(AggState *aggstate) +{ + TupleTableSlot *outerslot; + ExprContext *tmpcontext = aggstate->tmpcontext; + + /* + * Process each outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. + */ + for (;;) + { + outerslot = fetch_input_tuple(aggstate); + if (TupIsNull(outerslot)) + break; + + /* set up for lookup_hash_entries and advance_aggregates */ + tmpcontext->ecxt_outertuple = outerslot; + + /* Find or build hashtable entries */ + lookup_hash_entries(aggstate); + + /* Advance the aggregates (or combine functions) */ + advance_aggregates(aggstate); + + /* + * Reset per-input-tuple context after each tuple, but note that the + * hash lookups do this too + */ + ResetExprContext(aggstate->tmpcontext); + } + + /* finalize spills, if any */ + hashagg_finish_initial_spills(aggstate); + + aggstate->table_filled = true; + /* Initialize to walk the first hash table */ + select_current_set(aggstate, 0, true); + ResetTupleHashIterator(aggstate->perhash[0].hashtable, + &aggstate->perhash[0].hashiter); +} + +/* + * If any data was spilled during hash aggregation, reset the hash table and + * reprocess one batch of spilled data. After reprocessing a batch, the hash + * table will again contain data, ready to be consumed by + * agg_retrieve_hash_table_in_memory(). + * + * Should only be called after all in memory hash table entries have been + * finalized and emitted. + * + * Return false when input is exhausted and there's no more work to be done; + * otherwise return true. + */ +static bool +agg_refill_hash_table(AggState *aggstate) +{ + HashAggBatch *batch; + AggStatePerHash perhash; + HashAggSpill spill; + HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo; + bool spill_initialized = false; + + if (aggstate->hash_batches == NIL) + return false; + + /* hash_batches is a stack, with the top item at the end of the list */ + batch = llast(aggstate->hash_batches); + aggstate->hash_batches = list_delete_last(aggstate->hash_batches); + + hash_agg_set_limits(aggstate->hashentrysize, batch->input_card, + batch->used_bits, &aggstate->hash_mem_limit, + &aggstate->hash_ngroups_limit, NULL); + + /* + * Each batch only processes one grouping set; set the rest to NULL so + * that advance_aggregates() knows to ignore them. We don't touch + * pergroups for sorted grouping sets here, because they will be needed if + * we rescan later. The expressions for sorted grouping sets will not be + * evaluated after we recompile anyway. + */ + MemSet(aggstate->hash_pergroup, 0, + sizeof(AggStatePerGroup) * aggstate->num_hashes); + + /* free memory and reset hash tables */ + ReScanExprContext(aggstate->hashcontext); + for (int setno = 0; setno < aggstate->num_hashes; setno++) + ResetTupleHashTable(aggstate->perhash[setno].hashtable); + + aggstate->hash_ngroups_current = 0; + + /* + * In AGG_MIXED mode, hash aggregation happens in phase 1 and the output + * happens in phase 0. So, we switch to phase 1 when processing a batch, + * and back to phase 0 after the batch is done. 
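hash_batches is consumed from the tail (llast() plus list_delete_last()), so batches created while refilling, including re-spills of the batch just processed, are handled before older ones; the effect is a depth-first walk of the recursive partitioning. The sketch below only demonstrates that ordering; the fixed-size stack is an illustration, not the backend's List API.

    /* Sketch of the LIFO batch order produced by the list-as-stack usage. */
    #include <stdio.h>

    #define MAXBATCH 16

    static int batches[MAXBATCH];
    static int nbatches = 0;

    static void push_batch(int id) { batches[nbatches++] = id; }
    static int  pop_batch(void)    { return batches[--nbatches]; }

    int
    main(void)
    {
        /* the initial pass spilled two partitions */
        push_batch(1);
        push_batch(2);

        while (nbatches > 0)
        {
            int id = pop_batch();

            printf("processing batch %d\n", id);
            if (id == 2)            /* pretend batch 2 had to re-spill */
            {
                push_batch(21);
                push_batch(22);
            }
        }
        return 0;                   /* order printed: 2, 22, 21, 1 */
    }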
+ */ + Assert(aggstate->current_phase == 0); + if (aggstate->phase->aggstrategy == AGG_MIXED) + { + aggstate->current_phase = 1; + aggstate->phase = &aggstate->phases[aggstate->current_phase]; + } + + select_current_set(aggstate, batch->setno, true); + + perhash = &aggstate->perhash[aggstate->current_set]; + + /* + * Spilled tuples are always read back as MinimalTuples, which may be + * different from the outer plan, so recompile the aggregate expressions. + * + * We still need the NULL check, because we are only processing one + * grouping set at a time and the rest will be NULL. + */ + hashagg_recompile_expressions(aggstate, true, true); + + for (;;) + { + TupleTableSlot *spillslot = aggstate->hash_spill_rslot; + TupleTableSlot *hashslot = perhash->hashslot; + TupleHashEntry entry; + MinimalTuple tuple; + uint32 hash; + bool isnew = false; + bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew; + + CHECK_FOR_INTERRUPTS(); + + tuple = hashagg_batch_read(batch, &hash); + if (tuple == NULL) + break; + + ExecStoreMinimalTuple(tuple, spillslot, true); + aggstate->tmpcontext->ecxt_outertuple = spillslot; + + prepare_hash_slot(perhash, + aggstate->tmpcontext->ecxt_outertuple, + hashslot); + entry = LookupTupleHashEntryHash( + perhash->hashtable, hashslot, p_isnew, hash); + + if (entry != NULL) + { + if (isnew) + initialize_hash_entry(aggstate, perhash->hashtable, entry); + aggstate->hash_pergroup[batch->setno] = entry->additional; + advance_aggregates(aggstate); + } + else + { + if (!spill_initialized) + { + /* + * Avoid initializing the spill until we actually need it so + * that we don't assign tapes that will never be used. + */ + spill_initialized = true; + hashagg_spill_init(&spill, tapeinfo, batch->used_bits, + batch->input_card, aggstate->hashentrysize); + } + /* no memory for a new group, spill */ + hashagg_spill_tuple(aggstate, &spill, spillslot, hash); + + aggstate->hash_pergroup[batch->setno] = NULL; + } + + /* + * Reset per-input-tuple context after each tuple, but note that the + * hash lookups do this too + */ + ResetExprContext(aggstate->tmpcontext); + } + + hashagg_tapeinfo_release(tapeinfo, batch->input_tapenum); + + /* change back to phase 0 */ + aggstate->current_phase = 0; + aggstate->phase = &aggstate->phases[aggstate->current_phase]; + + if (spill_initialized) + { + hashagg_spill_finish(aggstate, &spill, batch->setno); + hash_agg_update_metrics(aggstate, true, spill.npartitions); + } + else + hash_agg_update_metrics(aggstate, true, 0); + + aggstate->hash_spill_mode = false; + + /* prepare to walk the first hash table */ + select_current_set(aggstate, batch->setno, true); + ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable, + &aggstate->perhash[batch->setno].hashiter); + + pfree(batch); + + return true; +} + +/* + * ExecAgg for hashed case: retrieving groups from hash table + * + * After exhausting in-memory tuples, also try refilling the hash table using + * previously-spilled tuples. Only returns NULL after all in-memory and + * spilled tuples are exhausted. + */ +static TupleTableSlot * +agg_retrieve_hash_table(AggState *aggstate) +{ + TupleTableSlot *result = NULL; + + while (result == NULL) + { + result = agg_retrieve_hash_table_in_memory(aggstate); + if (result == NULL) + { + if (!agg_refill_hash_table(aggstate)) + { + aggstate->agg_done = true; + break; + } + } + } + + return result; +} + +/* + * Retrieve the groups from the in-memory hash tables without considering any + * spilled tuples. 
+ */ +static TupleTableSlot * +agg_retrieve_hash_table_in_memory(AggState *aggstate) +{ + ExprContext *econtext; + AggStatePerAgg peragg; + AggStatePerGroup pergroup; + TupleHashEntryData *entry; + TupleTableSlot *firstSlot; + TupleTableSlot *result; + AggStatePerHash perhash; + + /* + * get state info from node. + * + * econtext is the per-output-tuple expression context. + */ + econtext = aggstate->ss.ps.ps_ExprContext; + peragg = aggstate->peragg; + firstSlot = aggstate->ss.ss_ScanTupleSlot; + + /* + * Note that perhash (and therefore anything accessed through it) can + * change inside the loop, as we change between grouping sets. + */ + perhash = &aggstate->perhash[aggstate->current_set]; + + /* + * We loop retrieving groups until we find one satisfying + * aggstate->ss.ps.qual + */ + for (;;) + { + TupleTableSlot *hashslot = perhash->hashslot; + int i; + + CHECK_FOR_INTERRUPTS(); + + /* + * Find the next entry in the hash table + */ + entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter); + if (entry == NULL) + { + int nextset = aggstate->current_set + 1; + + if (nextset < aggstate->num_hashes) + { + /* + * Switch to next grouping set, reinitialize, and restart the + * loop. + */ + select_current_set(aggstate, nextset, true); + + perhash = &aggstate->perhash[aggstate->current_set]; + + ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter); + + continue; + } + else + { + return NULL; + } + } + + /* + * Clear the per-output-tuple context for each group + * + * We intentionally don't use ReScanExprContext here; if any aggs have + * registered shutdown callbacks, they mustn't be called yet, since we + * might not be done with that agg. + */ + ResetExprContext(econtext); + + /* + * Transform representative tuple back into one with the right + * columns. + */ + ExecStoreMinimalTuple(entry->firstTuple, hashslot, false); + slot_getallattrs(hashslot); + + ExecClearTuple(firstSlot); + memset(firstSlot->tts_isnull, true, + firstSlot->tts_tupleDescriptor->natts * sizeof(bool)); + + for (i = 0; i < perhash->numhashGrpCols; i++) + { + int varNumber = perhash->hashGrpColIdxInput[i] - 1; + + firstSlot->tts_values[varNumber] = hashslot->tts_values[i]; + firstSlot->tts_isnull[varNumber] = hashslot->tts_isnull[i]; + } + ExecStoreVirtualTuple(firstSlot); + + pergroup = (AggStatePerGroup) entry->additional; + + /* + * Use the representative input tuple for any references to + * non-aggregated input columns in the qual and tlist. + */ + econtext->ecxt_outertuple = firstSlot; + + prepare_projection_slot(aggstate, + econtext->ecxt_outertuple, + aggstate->current_set); + + finalize_aggregates(aggstate, peragg, pergroup); + + result = project_aggregates(aggstate); + if (result) + return result; + } + + /* No more groups */ + return NULL; +} + +/* + * Initialize HashTapeInfo + */ +static void +hashagg_tapeinfo_init(AggState *aggstate) +{ + HashTapeInfo *tapeinfo = palloc(sizeof(HashTapeInfo)); + int init_tapes = 16; /* expanded dynamically */ + + tapeinfo->tapeset = LogicalTapeSetCreate(init_tapes, true, NULL, NULL, -1); + tapeinfo->ntapes = init_tapes; + tapeinfo->nfreetapes = init_tapes; + tapeinfo->freetapes_alloc = init_tapes; + tapeinfo->freetapes = palloc(init_tapes * sizeof(int)); + for (int i = 0; i < init_tapes; i++) + tapeinfo->freetapes[i] = i; + + aggstate->hash_tapeinfo = tapeinfo; +} + +/* + * Assign unused tapes to spill partitions, extending the tape set if + * necessary. 
+ */ +static void +hashagg_tapeinfo_assign(HashTapeInfo *tapeinfo, int *partitions, + int npartitions) +{ + int partidx = 0; + + /* use free tapes if available */ + while (partidx < npartitions && tapeinfo->nfreetapes > 0) + partitions[partidx++] = tapeinfo->freetapes[--tapeinfo->nfreetapes]; + + if (partidx < npartitions) + { + LogicalTapeSetExtend(tapeinfo->tapeset, npartitions - partidx); + + while (partidx < npartitions) + partitions[partidx++] = tapeinfo->ntapes++; + } +} + +/* + * After a tape has already been written to and then read, this function + * rewinds it for writing and adds it to the free list. + */ +static void +hashagg_tapeinfo_release(HashTapeInfo *tapeinfo, int tapenum) +{ + /* rewinding frees the buffer while not in use */ + LogicalTapeRewindForWrite(tapeinfo->tapeset, tapenum); + if (tapeinfo->freetapes_alloc == tapeinfo->nfreetapes) + { + tapeinfo->freetapes_alloc <<= 1; + tapeinfo->freetapes = repalloc(tapeinfo->freetapes, + tapeinfo->freetapes_alloc * sizeof(int)); + } + tapeinfo->freetapes[tapeinfo->nfreetapes++] = tapenum; +} + +/* + * hashagg_spill_init + * + * Called after we determined that spilling is necessary. Chooses the number + * of partitions to create, and initializes them. + */ +static void +hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits, + double input_groups, double hashentrysize) +{ + int npartitions; + int partition_bits; + + npartitions = hash_choose_num_partitions(input_groups, hashentrysize, + used_bits, &partition_bits); + + spill->partitions = palloc0(sizeof(int) * npartitions); + spill->ntuples = palloc0(sizeof(int64) * npartitions); + spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions); + + hashagg_tapeinfo_assign(tapeinfo, spill->partitions, npartitions); + + spill->tapeset = tapeinfo->tapeset; + spill->shift = 32 - used_bits - partition_bits; + spill->mask = (npartitions - 1) << spill->shift; + spill->npartitions = npartitions; + + for (int i = 0; i < npartitions; i++) + initHyperLogLog(&spill->hll_card[i], HASHAGG_HLL_BIT_WIDTH); +} + +/* + * hashagg_spill_tuple + * + * No room for new groups in the hash table. Save for later in the appropriate + * partition. + */ +static Size +hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill, + TupleTableSlot *inputslot, uint32 hash) +{ + LogicalTapeSet *tapeset = spill->tapeset; + TupleTableSlot *spillslot; + int partition; + MinimalTuple tuple; + int tapenum; + int total_written = 0; + bool shouldFree; + + Assert(spill->partitions != NULL); + + /* spill only attributes that we actually need */ + if (!aggstate->all_cols_needed) + { + spillslot = aggstate->hash_spill_wslot; + slot_getsomeattrs(inputslot, aggstate->max_colno_needed); + ExecClearTuple(spillslot); + for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++) + { + if (bms_is_member(i + 1, aggstate->colnos_needed)) + { + spillslot->tts_values[i] = inputslot->tts_values[i]; + spillslot->tts_isnull[i] = inputslot->tts_isnull[i]; + } + else + spillslot->tts_isnull[i] = true; + } + ExecStoreVirtualTuple(spillslot); + } + else + spillslot = inputslot; + + tuple = ExecFetchSlotMinimalTuple(spillslot, &shouldFree); + + partition = (hash & spill->mask) >> spill->shift; + spill->ntuples[partition]++; + + /* + * All hash values destined for a given partition have some bits in + * common, which causes bad HLL cardinality estimates. Hash the hash to + * get a more uniform distribution. 
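The mask and shift computed in hashagg_spill_init() slice a partition selector out of the 32-bit hash just below the bits that parent batches have already consumed. A short worked example of that bit arithmetic, with arbitrary used_bits, partition_bits and hash values:

    /* Sketch of the hash-bit slicing used to route a spilled tuple to a partition. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        int      used_bits = 4;         /* high bits consumed by the parent batch */
        int      partition_bits = 3;    /* 8 partitions at this level */
        int      shift = 32 - used_bits - partition_bits;
        uint32_t mask = ((1u << partition_bits) - 1) << shift;
        uint32_t hash = 0xDEADBEEF;
        int      partition = (hash & mask) >> shift;

        printf("shift=%d mask=0x%08x partition=%d\n", shift, mask, partition);
        return 0;
    }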
+ */ + addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash)); + + tapenum = spill->partitions[partition]; + + LogicalTapeWrite(tapeset, tapenum, (void *) &hash, sizeof(uint32)); + total_written += sizeof(uint32); + + LogicalTapeWrite(tapeset, tapenum, (void *) tuple, tuple->t_len); + total_written += tuple->t_len; + + if (shouldFree) + pfree(tuple); + + return total_written; +} + +/* + * hashagg_batch_new + * + * Construct a HashAggBatch item, which represents one iteration of HashAgg to + * be done. + */ +static HashAggBatch * +hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno, + int64 input_tuples, double input_card, int used_bits) +{ + HashAggBatch *batch = palloc0(sizeof(HashAggBatch)); + + batch->setno = setno; + batch->used_bits = used_bits; + batch->tapeset = tapeset; + batch->input_tapenum = tapenum; + batch->input_tuples = input_tuples; + batch->input_card = input_card; + + return batch; +} + +/* + * read_spilled_tuple + * read the next tuple from a batch's tape. Return NULL if no more. + */ +static MinimalTuple +hashagg_batch_read(HashAggBatch *batch, uint32 *hashp) +{ + LogicalTapeSet *tapeset = batch->tapeset; + int tapenum = batch->input_tapenum; + MinimalTuple tuple; + uint32 t_len; + size_t nread; + uint32 hash; + + nread = LogicalTapeRead(tapeset, tapenum, &hash, sizeof(uint32)); + if (nread == 0) + return NULL; + if (nread != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes", + tapenum, sizeof(uint32), nread))); + if (hashp != NULL) + *hashp = hash; + + nread = LogicalTapeRead(tapeset, tapenum, &t_len, sizeof(t_len)); + if (nread != sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes", + tapenum, sizeof(uint32), nread))); + + tuple = (MinimalTuple) palloc(t_len); + tuple->t_len = t_len; + + nread = LogicalTapeRead(tapeset, tapenum, + (void *) ((char *) tuple + sizeof(uint32)), + t_len - sizeof(uint32)); + if (nread != t_len - sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected EOF for tape %d: requested %zu bytes, read %zu bytes", + tapenum, t_len - sizeof(uint32), nread))); + + return tuple; +} + +/* + * hashagg_finish_initial_spills + * + * After a HashAggBatch has been processed, it may have spilled tuples to + * disk. If so, turn the spilled partitions into new batches that must later + * be executed. + */ +static void +hashagg_finish_initial_spills(AggState *aggstate) +{ + int setno; + int total_npartitions = 0; + + if (aggstate->hash_spills != NULL) + { + for (setno = 0; setno < aggstate->num_hashes; setno++) + { + HashAggSpill *spill = &aggstate->hash_spills[setno]; + + total_npartitions += spill->npartitions; + hashagg_spill_finish(aggstate, spill, setno); + } + + /* + * We're not processing tuples from outer plan any more; only + * processing batches of spilled tuples. The initial spill structures + * are no longer needed. + */ + pfree(aggstate->hash_spills); + aggstate->hash_spills = NULL; + } + + hash_agg_update_metrics(aggstate, false, total_npartitions); + aggstate->hash_spill_mode = false; +} + +/* + * hashagg_spill_finish + * + * Transform spill partitions into new batches. 
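hashagg_spill_tuple() and hashagg_batch_read() agree on a simple record layout: a uint32 hash followed by the MinimalTuple image, whose own leading uint32 is its total length, which is why the reader pulls the length first and then the remaining t_len - sizeof(uint32) bytes. A standalone sketch of that symmetry, with a flat byte buffer standing in for the logical tape:

    /* Sketch of the spill record layout: [hash][t_len][rest of tuple image]. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
        const char    payload[] = "tuple body";
        uint32_t      t_len = sizeof(uint32_t) + sizeof(payload); /* length word + body */
        uint32_t      hash = 0xC0FFEEu;
        unsigned char tape[128];
        size_t        pos = 0;

        /* "write": hash, then the tuple image (length word followed by body) */
        memcpy(tape + pos, &hash, sizeof(hash));      pos += sizeof(hash);
        memcpy(tape + pos, &t_len, sizeof(t_len));    pos += sizeof(t_len);
        memcpy(tape + pos, payload, sizeof(payload)); pos += sizeof(payload);

        /* "read": mirror the same three steps */
        pos = 0;
        uint32_t rhash, rlen;
        memcpy(&rhash, tape + pos, sizeof(rhash));    pos += sizeof(rhash);
        memcpy(&rlen, tape + pos, sizeof(rlen));      pos += sizeof(rlen);

        char *body = malloc(rlen - sizeof(uint32_t));
        memcpy(body, tape + pos, rlen - sizeof(uint32_t));

        printf("hash=0x%x len=%u body=%s\n", rhash, rlen, body);
        free(body);
        return 0;
    }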
+ */ +static void +hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno) +{ + int i; + int used_bits = 32 - spill->shift; + + if (spill->npartitions == 0) + return; /* didn't spill */ + + for (i = 0; i < spill->npartitions; i++) + { + LogicalTapeSet *tapeset = aggstate->hash_tapeinfo->tapeset; + int tapenum = spill->partitions[i]; + HashAggBatch *new_batch; + double cardinality; + + /* if the partition is empty, don't create a new batch of work */ + if (spill->ntuples[i] == 0) + continue; + + cardinality = estimateHyperLogLog(&spill->hll_card[i]); + freeHyperLogLog(&spill->hll_card[i]); + + /* rewinding frees the buffer while not in use */ + LogicalTapeRewindForRead(tapeset, tapenum, + HASHAGG_READ_BUFFER_SIZE); + + new_batch = hashagg_batch_new(tapeset, tapenum, setno, + spill->ntuples[i], cardinality, + used_bits); + aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch); + aggstate->hash_batches_used++; + } + + pfree(spill->ntuples); + pfree(spill->hll_card); + pfree(spill->partitions); +} + +/* + * Free resources related to a spilled HashAgg. + */ +static void +hashagg_reset_spill_state(AggState *aggstate) +{ + /* free spills from initial pass */ + if (aggstate->hash_spills != NULL) + { + int setno; + + for (setno = 0; setno < aggstate->num_hashes; setno++) + { + HashAggSpill *spill = &aggstate->hash_spills[setno]; + + pfree(spill->ntuples); + pfree(spill->partitions); + } + pfree(aggstate->hash_spills); + aggstate->hash_spills = NULL; + } + + /* free batches */ + list_free_deep(aggstate->hash_batches); + aggstate->hash_batches = NIL; + + /* close tape set */ + if (aggstate->hash_tapeinfo != NULL) + { + HashTapeInfo *tapeinfo = aggstate->hash_tapeinfo; + + LogicalTapeSetClose(tapeinfo->tapeset); + pfree(tapeinfo->freetapes); + pfree(tapeinfo); + aggstate->hash_tapeinfo = NULL; + } +} + + +/* ----------------- + * ExecInitAgg + * + * Creates the run-time information for the agg node produced by the + * planner and initializes its outer subtree. 
+ * + * ----------------- + */ +AggState * +ExecInitAgg(Agg *node, EState *estate, int eflags) +{ + AggState *aggstate; + AggStatePerAgg peraggs; + AggStatePerTrans pertransstates; + AggStatePerGroup *pergroups; + Plan *outerPlan; + ExprContext *econtext; + TupleDesc scanDesc; + int max_aggno; + int max_transno; + int numaggrefs; + int numaggs; + int numtrans; + int phase; + int phaseidx; + ListCell *l; + Bitmapset *all_grouped_cols = NULL; + int numGroupingSets = 1; + int numPhases; + int numHashes; + int i = 0; + int j = 0; + bool use_hashing = (node->aggstrategy == AGG_HASHED || + node->aggstrategy == AGG_MIXED); + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + aggstate = makeNode(AggState); + aggstate->ss.ps.plan = (Plan *) node; + aggstate->ss.ps.state = estate; + aggstate->ss.ps.ExecProcNode = ExecAgg; + + aggstate->aggs = NIL; + aggstate->numaggs = 0; + aggstate->numtrans = 0; + aggstate->aggstrategy = node->aggstrategy; + aggstate->aggsplit = node->aggsplit; + aggstate->maxsets = 0; + aggstate->projected_set = -1; + aggstate->current_set = 0; + aggstate->peragg = NULL; + aggstate->pertrans = NULL; + aggstate->curperagg = NULL; + aggstate->curpertrans = NULL; + aggstate->input_done = false; + aggstate->agg_done = false; + aggstate->pergroups = NULL; + aggstate->grp_firstTuple = NULL; + aggstate->sort_in = NULL; + aggstate->sort_out = NULL; + + /* + * phases[0] always exists, but is dummy in sorted/plain mode + */ + numPhases = (use_hashing ? 1 : 2); + numHashes = (use_hashing ? 1 : 0); + + /* + * Calculate the maximum number of grouping sets in any phase; this + * determines the size of some allocations. Also calculate the number of + * phases, since all hashed/mixed nodes contribute to only a single phase. + */ + if (node->groupingSets) + { + numGroupingSets = list_length(node->groupingSets); + + foreach(l, node->chain) + { + Agg *agg = lfirst(l); + + numGroupingSets = Max(numGroupingSets, + list_length(agg->groupingSets)); + + /* + * additional AGG_HASHED aggs become part of phase 0, but all + * others add an extra phase. + */ + if (agg->aggstrategy != AGG_HASHED) + ++numPhases; + else + ++numHashes; + } + } + + aggstate->maxsets = numGroupingSets; + aggstate->numphases = numPhases; + + aggstate->aggcontexts = (ExprContext **) + palloc0(sizeof(ExprContext *) * numGroupingSets); + + /* + * Create expression contexts. We need three or more, one for + * per-input-tuple processing, one for per-output-tuple processing, one + * for all the hashtables, and one for each grouping set. The per-tuple + * memory context of the per-grouping-set ExprContexts (aggcontexts) + * replaces the standalone memory context formerly used to hold transition + * values. We cheat a little by using ExecAssignExprContext() to build + * all of them. + * + * NOTE: the details of what is stored in aggcontexts and what is stored + * in the regular per-query memory context are driven by a simple + * decision: we want to reset the aggcontext at group boundaries (if not + * hashing) and in ExecReScanAgg to recover no-longer-wanted space. 
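The phase/hash bookkeeping reduces to one rule: every AGG_HASHED member of the chain joins phase 0, everything else adds a sort phase. A small sketch of that counting, using a local strategy enum purely for illustration:

    /* Sketch of the numPhases/numHashes counting rule in ExecInitAgg(). */
    #include <stdio.h>

    typedef enum { SK_PLAIN, SK_SORTED, SK_HASHED, SK_MIXED } SketchStrategy;

    int
    main(void)
    {
        /* head node is AGG_MIXED; the chain holds two hashed and one sorted Agg */
        SketchStrategy head = SK_MIXED;
        SketchStrategy chain[] = {SK_HASHED, SK_HASHED, SK_SORTED};
        int nchain = sizeof(chain) / sizeof(chain[0]);
        int use_hashing = (head == SK_HASHED || head == SK_MIXED);
        int numPhases = use_hashing ? 1 : 2;    /* phase 0 is dummy when not hashing */
        int numHashes = use_hashing ? 1 : 0;

        for (int i = 0; i < nchain; i++)
        {
            if (chain[i] != SK_HASHED)
                numPhases++;        /* each non-hashed chain member adds a phase */
            else
                numHashes++;        /* hashed members all fold into phase 0 */
        }
        printf("numPhases=%d numHashes=%d\n", numPhases, numHashes);   /* 2 and 3 */
        return 0;
    }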
+ */ + ExecAssignExprContext(estate, &aggstate->ss.ps); + aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext; + + for (i = 0; i < numGroupingSets; ++i) + { + ExecAssignExprContext(estate, &aggstate->ss.ps); + aggstate->aggcontexts[i] = aggstate->ss.ps.ps_ExprContext; + } + + if (use_hashing) + aggstate->hashcontext = CreateWorkExprContext(estate); + + ExecAssignExprContext(estate, &aggstate->ss.ps); + + /* + * Initialize child nodes. + * + * If we are doing a hashed aggregation then the child plan does not need + * to handle REWIND efficiently; see ExecReScanAgg. + */ + if (node->aggstrategy == AGG_HASHED) + eflags &= ~EXEC_FLAG_REWIND; + outerPlan = outerPlan(node); + outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * initialize source tuple type. + */ + aggstate->ss.ps.outerops = + ExecGetResultSlotOps(outerPlanState(&aggstate->ss), + &aggstate->ss.ps.outeropsfixed); + aggstate->ss.ps.outeropsset = true; + + ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss, + aggstate->ss.ps.outerops); + scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + + /* + * If there are more than two phases (including a potential dummy phase + * 0), input will be resorted using tuplesort. Need a slot for that. + */ + if (numPhases > 2) + { + aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + + /* + * The output of the tuplesort, and the output from the outer child + * might not use the same type of slot. In most cases the child will + * be a Sort, and thus return a TTSOpsMinimalTuple type slot - but the + * input can also be presorted due an index, in which case it could be + * a different type of slot. + * + * XXX: For efficiency it would be good to instead/additionally + * generate expressions with corresponding settings of outerops* for + * the individual phases - deforming is often a bottleneck for + * aggregations with lots of rows per group. If there's multiple + * sorts, we know that all but the first use TTSOpsMinimalTuple (via + * the nodeAgg.c internal tuplesort). + */ + if (aggstate->ss.ps.outeropsfixed && + aggstate->ss.ps.outerops != &TTSOpsMinimalTuple) + aggstate->ss.ps.outeropsfixed = false; + } + + /* + * Initialize result type, slot and projection. + */ + ExecInitResultTupleSlotTL(&aggstate->ss.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&aggstate->ss.ps, NULL); + + /* + * initialize child expressions + * + * We expect the parser to have checked that no aggs contain other agg + * calls in their arguments (and just to be sure, we verify it again while + * initializing the plan node). This would make no sense under SQL + * semantics, and it's forbidden by the spec. Because it is true, we + * don't need to worry about evaluating the aggs in any particular order. + * + * Note: execExpr.c finds Aggrefs for us, and adds them to aggstate->aggs. + * Aggrefs in the qual are found here; Aggrefs in the targetlist are found + * during ExecAssignProjectionInfo, above. + */ + aggstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) aggstate); + + /* + * We should now have found all Aggrefs in the targetlist and quals. 
+ */ + numaggrefs = list_length(aggstate->aggs); + max_aggno = -1; + max_transno = -1; + foreach(l, aggstate->aggs) + { + Aggref *aggref = (Aggref *) lfirst(l); + + max_aggno = Max(max_aggno, aggref->aggno); + max_transno = Max(max_transno, aggref->aggtransno); + } + numaggs = max_aggno + 1; + numtrans = max_transno + 1; + + /* + * For each phase, prepare grouping set data and fmgr lookup data for + * compare functions. Accumulate all_grouped_cols in passing. + */ + aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData)); + + aggstate->num_hashes = numHashes; + if (numHashes) + { + aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes); + aggstate->phases[0].numsets = 0; + aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int)); + aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *)); + } + + phase = 0; + for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx) + { + Agg *aggnode; + Sort *sortnode; + + if (phaseidx > 0) + { + aggnode = list_nth_node(Agg, node->chain, phaseidx - 1); + sortnode = castNode(Sort, aggnode->plan.lefttree); + } + else + { + aggnode = node; + sortnode = NULL; + } + + Assert(phase <= 1 || sortnode); + + if (aggnode->aggstrategy == AGG_HASHED + || aggnode->aggstrategy == AGG_MIXED) + { + AggStatePerPhase phasedata = &aggstate->phases[0]; + AggStatePerHash perhash; + Bitmapset *cols = NULL; + + Assert(phase == 0); + i = phasedata->numsets++; + perhash = &aggstate->perhash[i]; + + /* phase 0 always points to the "real" Agg in the hash case */ + phasedata->aggnode = node; + phasedata->aggstrategy = node->aggstrategy; + + /* but the actual Agg node representing this hash is saved here */ + perhash->aggnode = aggnode; + + phasedata->gset_lengths[i] = perhash->numCols = aggnode->numCols; + + for (j = 0; j < aggnode->numCols; ++j) + cols = bms_add_member(cols, aggnode->grpColIdx[j]); + + phasedata->grouped_cols[i] = cols; + + all_grouped_cols = bms_add_members(all_grouped_cols, cols); + continue; + } + else + { + AggStatePerPhase phasedata = &aggstate->phases[++phase]; + int num_sets; + + phasedata->numsets = num_sets = list_length(aggnode->groupingSets); + + if (num_sets) + { + phasedata->gset_lengths = palloc(num_sets * sizeof(int)); + phasedata->grouped_cols = palloc(num_sets * sizeof(Bitmapset *)); + + i = 0; + foreach(l, aggnode->groupingSets) + { + int current_length = list_length(lfirst(l)); + Bitmapset *cols = NULL; + + /* planner forces this to be correct */ + for (j = 0; j < current_length; ++j) + cols = bms_add_member(cols, aggnode->grpColIdx[j]); + + phasedata->grouped_cols[i] = cols; + phasedata->gset_lengths[i] = current_length; + + ++i; + } + + all_grouped_cols = bms_add_members(all_grouped_cols, + phasedata->grouped_cols[0]); + } + else + { + Assert(phaseidx == 0); + + phasedata->gset_lengths = NULL; + phasedata->grouped_cols = NULL; + } + + /* + * If we are grouping, precompute fmgr lookup data for inner loop. + */ + if (aggnode->aggstrategy == AGG_SORTED) + { + int i = 0; + + Assert(aggnode->numCols > 0); + + /* + * Build a separate function for each subset of columns that + * need to be compared. 
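For AGG_SORTED phases only one equality expression is built per distinct grouping-set prefix length, stored at eqfunctions[length - 1], so sets that group on the same number of leading columns share a comparator. A toy version of that bookkeeping, with plain function pointers standing in for ExprStates:

    /* Sketch: one comparator per distinct grouping-set prefix length. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define NUMCOLS 3

    typedef bool (*eqfn) (const int *a, const int *b);

    static bool eq_prefix1(const int *a, const int *b) { return a[0] == b[0]; }
    static bool eq_prefix2(const int *a, const int *b) { return a[0] == b[0] && a[1] == b[1]; }
    static bool eq_prefix3(const int *a, const int *b) { return a[0] == b[0] && a[1] == b[1] && a[2] == b[2]; }

    int
    main(void)
    {
        /* grouping sets of lengths 3, 2 and 2: only slots for lengths 2 and 3 get built */
        int  gset_lengths[] = {3, 2, 2};
        eqfn eqfunctions[NUMCOLS] = {NULL, NULL, NULL};
        eqfn builders[NUMCOLS] = {eq_prefix1, eq_prefix2, eq_prefix3};

        for (size_t i = 0; i < sizeof(gset_lengths) / sizeof(gset_lengths[0]); i++)
        {
            int len = gset_lengths[i];

            if (eqfunctions[len - 1] == NULL)
                eqfunctions[len - 1] = builders[len - 1];
        }
        /* the all-columns comparator is always wanted as well */
        if (eqfunctions[NUMCOLS - 1] == NULL)
            eqfunctions[NUMCOLS - 1] = builders[NUMCOLS - 1];

        for (int len = 1; len <= NUMCOLS; len++)
            printf("prefix length %d: %s\n", len,
                   eqfunctions[len - 1] ? "built" : "not needed");
        return 0;
    }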
+ */ + phasedata->eqfunctions = + (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *)); + + /* for each grouping set */ + for (i = 0; i < phasedata->numsets; i++) + { + int length = phasedata->gset_lengths[i]; + + if (phasedata->eqfunctions[length - 1] != NULL) + continue; + + phasedata->eqfunctions[length - 1] = + execTuplesMatchPrepare(scanDesc, + length, + aggnode->grpColIdx, + aggnode->grpOperators, + aggnode->grpCollations, + (PlanState *) aggstate); + } + + /* and for all grouped columns, unless already computed */ + if (phasedata->eqfunctions[aggnode->numCols - 1] == NULL) + { + phasedata->eqfunctions[aggnode->numCols - 1] = + execTuplesMatchPrepare(scanDesc, + aggnode->numCols, + aggnode->grpColIdx, + aggnode->grpOperators, + aggnode->grpCollations, + (PlanState *) aggstate); + } + } + + phasedata->aggnode = aggnode; + phasedata->aggstrategy = aggnode->aggstrategy; + phasedata->sortnode = sortnode; + } + } + + /* + * Convert all_grouped_cols to a descending-order list. + */ + i = -1; + while ((i = bms_next_member(all_grouped_cols, i)) >= 0) + aggstate->all_grouped_cols = lcons_int(i, aggstate->all_grouped_cols); + + /* + * Set up aggregate-result storage in the output expr context, and also + * allocate my private per-agg working storage + */ + econtext = aggstate->ss.ps.ps_ExprContext; + econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs); + econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs); + + peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs); + pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans); + + aggstate->peragg = peraggs; + aggstate->pertrans = pertransstates; + + + aggstate->all_pergroups = + (AggStatePerGroup *) palloc0(sizeof(AggStatePerGroup) + * (numGroupingSets + numHashes)); + pergroups = aggstate->all_pergroups; + + if (node->aggstrategy != AGG_HASHED) + { + for (i = 0; i < numGroupingSets; i++) + { + pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData) + * numaggs); + } + + aggstate->pergroups = pergroups; + pergroups += numGroupingSets; + } + + /* + * Hashing can only appear in the initial phase. + */ + if (use_hashing) + { + Plan *outerplan = outerPlan(node); + uint64 totalGroups = 0; + int i; + + aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg meta context", + ALLOCSET_DEFAULT_SIZES); + aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsVirtual); + + /* this is an array of pointers, not structures */ + aggstate->hash_pergroup = pergroups; + + aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans, + outerplan->plan_width, + node->transitionSpace); + + /* + * Consider all of the grouping sets together when setting the limits + * and estimating the number of partitions. This can be inaccurate + * when there is more than one grouping set, but should still be + * reasonable. 
+ */ + for (i = 0; i < aggstate->num_hashes; i++) + totalGroups += aggstate->perhash[i].aggnode->numGroups; + + hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0, + &aggstate->hash_mem_limit, + &aggstate->hash_ngroups_limit, + &aggstate->hash_planned_partitions); + find_hash_columns(aggstate); + + /* Skip massive memory allocation if we are just doing EXPLAIN */ + if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + build_hash_tables(aggstate); + + aggstate->table_filled = false; + + /* Initialize this to 1, meaning nothing spilled, yet */ + aggstate->hash_batches_used = 1; + } + + /* + * Initialize current phase-dependent values to initial phase. The initial + * phase is 1 (first sort pass) for all strategies that use sorting (if + * hashing is being done too, then phase 0 is processed last); but if only + * hashing is being done, then phase 0 is all there is. + */ + if (node->aggstrategy == AGG_HASHED) + { + aggstate->current_phase = 0; + initialize_phase(aggstate, 0); + select_current_set(aggstate, 0, true); + } + else + { + aggstate->current_phase = 1; + initialize_phase(aggstate, 1); + select_current_set(aggstate, 0, false); + } + + /* + * Perform lookups of aggregate function info, and initialize the + * unchanging fields of the per-agg and per-trans data. + */ + foreach(l, aggstate->aggs) + { + Aggref *aggref = lfirst(l); + AggStatePerAgg peragg; + AggStatePerTrans pertrans; + Oid inputTypes[FUNC_MAX_ARGS]; + int numArguments; + int numDirectArgs; + HeapTuple aggTuple; + Form_pg_aggregate aggform; + AclResult aclresult; + Oid finalfn_oid; + Oid serialfn_oid, + deserialfn_oid; + Oid aggOwner; + Expr *finalfnexpr; + Oid aggtranstype; + + /* Planner should have assigned aggregate to correct level */ + Assert(aggref->agglevelsup == 0); + /* ... and the split mode should match */ + Assert(aggref->aggsplit == aggstate->aggsplit); + + peragg = &peraggs[aggref->aggno]; + + /* Check if we initialized the state for this aggregate already. */ + if (peragg->aggref != NULL) + continue; + + peragg->aggref = aggref; + peragg->transno = aggref->aggtransno; + + /* Fetch the pg_aggregate row */ + aggTuple = SearchSysCache1(AGGFNOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", + aggref->aggfnoid); + aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); + + /* Check permission to call aggregate function */ + aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_AGGREGATE, + get_func_name(aggref->aggfnoid)); + InvokeFunctionExecuteHook(aggref->aggfnoid); + + /* planner recorded transition state type in the Aggref itself */ + aggtranstype = aggref->aggtranstype; + Assert(OidIsValid(aggtranstype)); + + /* Final function only required if we're finalizing the aggregates */ + if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)) + peragg->finalfn_oid = finalfn_oid = InvalidOid; + else + peragg->finalfn_oid = finalfn_oid = aggform->aggfinalfn; + + serialfn_oid = InvalidOid; + deserialfn_oid = InvalidOid; + + /* + * Check if serialization/deserialization is required. We only do it + * for aggregates that have transtype INTERNAL. + */ + if (aggtranstype == INTERNALOID) + { + /* + * The planner should only have generated a serialize agg node if + * every aggregate with an INTERNAL state has a serialization + * function. Verify that. 
+ */ + if (DO_AGGSPLIT_SERIALIZE(aggstate->aggsplit)) + { + /* serialization only valid when not running finalfn */ + Assert(DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); + + if (!OidIsValid(aggform->aggserialfn)) + elog(ERROR, "serialfunc not provided for serialization aggregation"); + serialfn_oid = aggform->aggserialfn; + } + + /* Likewise for deserialization functions */ + if (DO_AGGSPLIT_DESERIALIZE(aggstate->aggsplit)) + { + /* deserialization only valid when combining states */ + Assert(DO_AGGSPLIT_COMBINE(aggstate->aggsplit)); + + if (!OidIsValid(aggform->aggdeserialfn)) + elog(ERROR, "deserialfunc not provided for deserialization aggregation"); + deserialfn_oid = aggform->aggdeserialfn; + } + } + + /* Check that aggregate owner has permission to call component fns */ + { + HeapTuple procTuple; + + procTuple = SearchSysCache1(PROCOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(procTuple)) + elog(ERROR, "cache lookup failed for function %u", + aggref->aggfnoid); + aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner; + ReleaseSysCache(procTuple); + + if (OidIsValid(finalfn_oid)) + { + aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(finalfn_oid)); + InvokeFunctionExecuteHook(finalfn_oid); + } + if (OidIsValid(serialfn_oid)) + { + aclresult = pg_proc_aclcheck(serialfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(serialfn_oid)); + InvokeFunctionExecuteHook(serialfn_oid); + } + if (OidIsValid(deserialfn_oid)) + { + aclresult = pg_proc_aclcheck(deserialfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(deserialfn_oid)); + InvokeFunctionExecuteHook(deserialfn_oid); + } + } + + /* + * Get actual datatypes of the (nominal) aggregate inputs. These + * could be different from the agg's declared input types, when the + * agg accepts ANY or a polymorphic type. + */ + numArguments = get_aggregate_argtypes(aggref, inputTypes); + + /* Count the "direct" arguments, if any */ + numDirectArgs = list_length(aggref->aggdirectargs); + + /* Detect how many arguments to pass to the finalfn */ + if (aggform->aggfinalextra) + peragg->numFinalArgs = numArguments + 1; + else + peragg->numFinalArgs = numDirectArgs + 1; + + /* Initialize any direct-argument expressions */ + peragg->aggdirectargs = ExecInitExprList(aggref->aggdirectargs, + (PlanState *) aggstate); + + /* + * build expression trees using actual argument & result types for the + * finalfn, if it exists and is required. + */ + if (OidIsValid(finalfn_oid)) + { + build_aggregate_finalfn_expr(inputTypes, + peragg->numFinalArgs, + aggtranstype, + aggref->aggtype, + aggref->inputcollid, + finalfn_oid, + &finalfnexpr); + fmgr_info(finalfn_oid, &peragg->finalfn); + fmgr_info_set_expr((Node *) finalfnexpr, &peragg->finalfn); + } + + /* get info about the output value's datatype */ + get_typlenbyval(aggref->aggtype, + &peragg->resulttypeLen, + &peragg->resulttypeByVal); + + /* + * Build working state for invoking the transition function, if we + * haven't done it already. 
+ */ + pertrans = &pertransstates[aggref->aggtransno]; + if (pertrans->aggref == NULL) + { + Datum textInitVal; + Datum initValue; + bool initValueIsNull; + Oid transfn_oid; + + /* + * If this aggregation is performing state combines, then instead + * of using the transition function, we'll use the combine + * function + */ + if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit)) + { + transfn_oid = aggform->aggcombinefn; + + /* If not set then the planner messed up */ + if (!OidIsValid(transfn_oid)) + elog(ERROR, "combinefn not set for aggregate function"); + } + else + transfn_oid = aggform->aggtransfn; + + aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(transfn_oid)); + InvokeFunctionExecuteHook(transfn_oid); + + /* + * initval is potentially null, so don't try to access it as a + * struct field. Must do it the hard way with SysCacheGetAttr. + */ + textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, + Anum_pg_aggregate_agginitval, + &initValueIsNull); + if (initValueIsNull) + initValue = (Datum) 0; + else + initValue = GetAggInitVal(textInitVal, aggtranstype); + + build_pertrans_for_aggref(pertrans, aggstate, estate, + aggref, transfn_oid, aggtranstype, + serialfn_oid, deserialfn_oid, + initValue, initValueIsNull, + inputTypes, numArguments); + } + else + pertrans->aggshared = true; + ReleaseSysCache(aggTuple); + } + + /* + * Update aggstate->numaggs to be the number of unique aggregates found. + * Also set numstates to the number of unique transition states found. + */ + aggstate->numaggs = numaggs; + aggstate->numtrans = numtrans; + + /* + * Last, check whether any more aggregates got added onto the node while + * we processed the expressions for the aggregate arguments (including not + * only the regular arguments and FILTER expressions handled immediately + * above, but any direct arguments we might've handled earlier). If so, + * we have nested aggregate functions, which is semantically nonsensical, + * so complain. (This should have been caught by the parser, so we don't + * need to work hard on a helpful error message; but we defend against it + * here anyway, just to be sure.) + */ + if (numaggrefs != list_length(aggstate->aggs)) + ereport(ERROR, + (errcode(ERRCODE_GROUPING_ERROR), + errmsg("aggregate function calls cannot be nested"))); + + /* + * Build expressions doing all the transition work at once. We build a + * different one for each phase, as the number of transition function + * invocation can differ between phases. Note this'll work both for + * transition and combination functions (although there'll only be one + * phase in the latter case). + */ + for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++) + { + AggStatePerPhase phase = &aggstate->phases[phaseidx]; + bool dohash = false; + bool dosort = false; + + /* phase 0 doesn't necessarily exist */ + if (!phase->aggnode) + continue; + + if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 1) + { + /* + * Phase one, and only phase one, in a mixed agg performs both + * sorting and aggregation. + */ + dohash = true; + dosort = true; + } + else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0) + { + /* + * No need to compute a transition function for an AGG_MIXED phase + * 0 - the contents of the hashtables will have been computed + * during phase 1. 
+ */ + continue; + } + else if (phase->aggstrategy == AGG_PLAIN || + phase->aggstrategy == AGG_SORTED) + { + dohash = false; + dosort = true; + } + else if (phase->aggstrategy == AGG_HASHED) + { + dohash = true; + dosort = false; + } + else + Assert(false); + + phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash, + false); + + /* cache compiled expression for outer slot without NULL check */ + phase->evaltrans_cache[0][0] = phase->evaltrans; + } + + return aggstate; +} + +/* + * Build the state needed to calculate a state value for an aggregate. + * + * This initializes all the fields in 'pertrans'. 'aggref' is the aggregate + * to initialize the state for. 'aggtransfn', 'aggtranstype', and the rest + * of the arguments could be calculated from 'aggref', but the caller has + * calculated them already, so might as well pass them. + */ +static void +build_pertrans_for_aggref(AggStatePerTrans pertrans, + AggState *aggstate, EState *estate, + Aggref *aggref, + Oid aggtransfn, Oid aggtranstype, + Oid aggserialfn, Oid aggdeserialfn, + Datum initValue, bool initValueIsNull, + Oid *inputTypes, int numArguments) +{ + int numGroupingSets = Max(aggstate->maxsets, 1); + Expr *serialfnexpr = NULL; + Expr *deserialfnexpr = NULL; + ListCell *lc; + int numInputs; + int numDirectArgs; + List *sortlist; + int numSortCols; + int numDistinctCols; + int i; + + /* Begin filling in the pertrans data */ + pertrans->aggref = aggref; + pertrans->aggshared = false; + pertrans->aggCollation = aggref->inputcollid; + pertrans->transfn_oid = aggtransfn; + pertrans->serialfn_oid = aggserialfn; + pertrans->deserialfn_oid = aggdeserialfn; + pertrans->initValue = initValue; + pertrans->initValueIsNull = initValueIsNull; + + /* Count the "direct" arguments, if any */ + numDirectArgs = list_length(aggref->aggdirectargs); + + /* Count the number of aggregated input columns */ + pertrans->numInputs = numInputs = list_length(aggref->args); + + pertrans->aggtranstype = aggtranstype; + + /* + * When combining states, we have no use at all for the aggregate + * function's transfn. Instead we use the combinefn. In this case, the + * transfn and transfn_oid fields of pertrans refer to the combine + * function rather than the transition function. + */ + if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit)) + { + Expr *combinefnexpr; + size_t numTransArgs; + + /* + * When combining there's only one input, the to-be-combined added + * transition value from below (this node's transition value is + * counted separately). + */ + pertrans->numTransInputs = 1; + + /* account for the current transition state */ + numTransArgs = pertrans->numTransInputs + 1; + + build_aggregate_combinefn_expr(aggtranstype, + aggref->inputcollid, + aggtransfn, + &combinefnexpr); + fmgr_info(aggtransfn, &pertrans->transfn); + fmgr_info_set_expr((Node *) combinefnexpr, &pertrans->transfn); + + pertrans->transfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(2)); + InitFunctionCallInfoData(*pertrans->transfn_fcinfo, + &pertrans->transfn, + numTransArgs, + pertrans->aggCollation, + (void *) aggstate, NULL); + + /* + * Ensure that a combine function to combine INTERNAL states is not + * strict. This should have been checked during CREATE AGGREGATE, but + * the strict property could have been changed since then. 
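Whether combining or transitioning, the stored function is always called with the current transition value as argument 0 followed by numTransInputs inputs, so nargs = numTransInputs + 1 (2 in the combine case). A toy fold with a sum()-like transition function, standing in for the fmgr call sequence:

    /* Sketch of the transition-call shape: state comes first, then the inputs. */
    #include <stdio.h>

    /* transition function: arg0 = current state, arg1 = aggregated input */
    static long
    sum_transfn(long state, long input)
    {
        return state + input;
    }

    int
    main(void)
    {
        long inputs[] = {3, 5, 7};
        long state = 0;             /* initval for this toy aggregate */

        for (int i = 0; i < 3; i++)
            state = sum_transfn(state, inputs[i]);
        printf("final state: %ld\n", state);    /* 15 */
        return 0;
    }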
+ */ + if (pertrans->transfn.fn_strict && aggtranstype == INTERNALOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("combine function with transition type %s must not be declared STRICT", + format_type_be(aggtranstype)))); + } + else + { + Expr *transfnexpr; + size_t numTransArgs; + + /* Detect how many arguments to pass to the transfn */ + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + pertrans->numTransInputs = numInputs; + else + pertrans->numTransInputs = numArguments; + + /* account for the current transition state */ + numTransArgs = pertrans->numTransInputs + 1; + + /* + * Set up infrastructure for calling the transfn. Note that + * invtransfn is not needed here. + */ + build_aggregate_transfn_expr(inputTypes, + numArguments, + numDirectArgs, + aggref->aggvariadic, + aggtranstype, + aggref->inputcollid, + aggtransfn, + InvalidOid, + &transfnexpr, + NULL); + fmgr_info(aggtransfn, &pertrans->transfn); + fmgr_info_set_expr((Node *) transfnexpr, &pertrans->transfn); + + pertrans->transfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(numTransArgs)); + InitFunctionCallInfoData(*pertrans->transfn_fcinfo, + &pertrans->transfn, + numTransArgs, + pertrans->aggCollation, + (void *) aggstate, NULL); + + /* + * If the transfn is strict and the initval is NULL, make sure input + * type and transtype are the same (or at least binary-compatible), so + * that it's OK to use the first aggregated input value as the initial + * transValue. This should have been checked at agg definition time, + * but we must check again in case the transfn's strictness property + * has been changed. + */ + if (pertrans->transfn.fn_strict && pertrans->initValueIsNull) + { + if (numArguments <= numDirectArgs || + !IsBinaryCoercible(inputTypes[numDirectArgs], + aggtranstype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate %u needs to have compatible input type and transition type", + aggref->aggfnoid))); + } + } + + /* get info about the state value's datatype */ + get_typlenbyval(aggtranstype, + &pertrans->transtypeLen, + &pertrans->transtypeByVal); + + if (OidIsValid(aggserialfn)) + { + build_aggregate_serialfn_expr(aggserialfn, + &serialfnexpr); + fmgr_info(aggserialfn, &pertrans->serialfn); + fmgr_info_set_expr((Node *) serialfnexpr, &pertrans->serialfn); + + pertrans->serialfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(1)); + InitFunctionCallInfoData(*pertrans->serialfn_fcinfo, + &pertrans->serialfn, + 1, + InvalidOid, + (void *) aggstate, NULL); + } + + if (OidIsValid(aggdeserialfn)) + { + build_aggregate_deserialfn_expr(aggdeserialfn, + &deserialfnexpr); + fmgr_info(aggdeserialfn, &pertrans->deserialfn); + fmgr_info_set_expr((Node *) deserialfnexpr, &pertrans->deserialfn); + + pertrans->deserialfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(2)); + InitFunctionCallInfoData(*pertrans->deserialfn_fcinfo, + &pertrans->deserialfn, + 2, + InvalidOid, + (void *) aggstate, NULL); + + } + + /* + * If we're doing either DISTINCT or ORDER BY for a plain agg, then we + * have a list of SortGroupClause nodes; fish out the data in them and + * stick them into arrays. We ignore ORDER BY for an ordered-set agg, + * however; the agg's transfn and finalfn are responsible for that. + * + * Note that by construction, if there is a DISTINCT clause then the ORDER + * BY clause is a prefix of it (see transformDistinctClause). 
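+ *
+ * For illustration: count(DISTINCT x) has a one-entry aggdistinct and an
+ * empty aggorder, so sortlist is the DISTINCT list and
+ * numSortCols = numDistinctCols = 1; array_agg(x ORDER BY x) has only
+ * aggorder, giving numSortCols = 1 and numDistinctCols = 0; and
+ * array_agg(DISTINCT x ORDER BY x) names x in both lists, so the prefix
+ * property above holds trivially.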
+ */ + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + { + sortlist = NIL; + numSortCols = numDistinctCols = 0; + } + else if (aggref->aggdistinct) + { + sortlist = aggref->aggdistinct; + numSortCols = numDistinctCols = list_length(sortlist); + Assert(numSortCols >= list_length(aggref->aggorder)); + } + else + { + sortlist = aggref->aggorder; + numSortCols = list_length(sortlist); + numDistinctCols = 0; + } + + pertrans->numSortCols = numSortCols; + pertrans->numDistinctCols = numDistinctCols; + + /* + * If we have either sorting or filtering to do, create a tupledesc and + * slot corresponding to the aggregated inputs (including sort + * expressions) of the agg. + */ + if (numSortCols > 0 || aggref->aggfilter) + { + pertrans->sortdesc = ExecTypeFromTL(aggref->args); + pertrans->sortslot = + ExecInitExtraTupleSlot(estate, pertrans->sortdesc, + &TTSOpsMinimalTuple); + } + + if (numSortCols > 0) + { + /* + * We don't implement DISTINCT or ORDER BY aggs in the HASHED case + * (yet) + */ + Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED); + + /* If we have only one input, we need its len/byval info. */ + if (numInputs == 1) + { + get_typlenbyval(inputTypes[numDirectArgs], + &pertrans->inputtypeLen, + &pertrans->inputtypeByVal); + } + else if (numDistinctCols > 0) + { + /* we will need an extra slot to store prior values */ + pertrans->uniqslot = + ExecInitExtraTupleSlot(estate, pertrans->sortdesc, + &TTSOpsMinimalTuple); + } + + /* Extract the sort information for use later */ + pertrans->sortColIdx = + (AttrNumber *) palloc(numSortCols * sizeof(AttrNumber)); + pertrans->sortOperators = + (Oid *) palloc(numSortCols * sizeof(Oid)); + pertrans->sortCollations = + (Oid *) palloc(numSortCols * sizeof(Oid)); + pertrans->sortNullsFirst = + (bool *) palloc(numSortCols * sizeof(bool)); + + i = 0; + foreach(lc, sortlist) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc); + TargetEntry *tle = get_sortgroupclause_tle(sortcl, aggref->args); + + /* the parser should have made sure of this */ + Assert(OidIsValid(sortcl->sortop)); + + pertrans->sortColIdx[i] = tle->resno; + pertrans->sortOperators[i] = sortcl->sortop; + pertrans->sortCollations[i] = exprCollation((Node *) tle->expr); + pertrans->sortNullsFirst[i] = sortcl->nulls_first; + i++; + } + Assert(i == numSortCols); + } + + if (aggref->aggdistinct) + { + Oid *ops; + + Assert(numArguments > 0); + Assert(list_length(aggref->aggdistinct) == numDistinctCols); + + ops = palloc(numDistinctCols * sizeof(Oid)); + + i = 0; + foreach(lc, aggref->aggdistinct) + ops[i++] = ((SortGroupClause *) lfirst(lc))->eqop; + + /* lookup / build the necessary comparators */ + if (numDistinctCols == 1) + fmgr_info(get_opcode(ops[0]), &pertrans->equalfnOne); + else + pertrans->equalfnMulti = + execTuplesMatchPrepare(pertrans->sortdesc, + numDistinctCols, + pertrans->sortColIdx, + ops, + pertrans->sortCollations, + &aggstate->ss.ps); + pfree(ops); + } + + pertrans->sortstates = (Tuplesortstate **) + palloc0(sizeof(Tuplesortstate *) * numGroupingSets); +} + + +static Datum +GetAggInitVal(Datum textInitVal, Oid transtype) +{ + Oid typinput, + typioparam; + char *strInitVal; + Datum initVal; + + getTypeInputInfo(transtype, &typinput, &typioparam); + strInitVal = TextDatumGetCString(textInitVal); + initVal = OidInputFunctionCall(typinput, strInitVal, + typioparam, -1); + pfree(strInitVal); + return initVal; +} + +void +ExecEndAgg(AggState *node) +{ + PlanState *outerPlan; + int transno; + int numGroupingSets = Max(node->maxsets, 1); + 
int setno; + + /* + * When ending a parallel worker, copy the statistics gathered by the + * worker back into shared memory so that it can be picked up by the main + * process to report in EXPLAIN ANALYZE. + */ + if (node->shared_info && IsParallelWorker()) + { + AggregateInstrumentation *si; + + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + si->hash_batches_used = node->hash_batches_used; + si->hash_disk_used = node->hash_disk_used; + si->hash_mem_peak = node->hash_mem_peak; + } + + /* Make sure we have closed any open tuplesorts */ + + if (node->sort_in) + tuplesort_end(node->sort_in); + if (node->sort_out) + tuplesort_end(node->sort_out); + + hashagg_reset_spill_state(node); + + if (node->hash_metacxt != NULL) + { + MemoryContextDelete(node->hash_metacxt); + node->hash_metacxt = NULL; + } + + for (transno = 0; transno < node->numtrans; transno++) + { + AggStatePerTrans pertrans = &node->pertrans[transno]; + + for (setno = 0; setno < numGroupingSets; setno++) + { + if (pertrans->sortstates[setno]) + tuplesort_end(pertrans->sortstates[setno]); + } + } + + /* And ensure any agg shutdown callbacks have been called */ + for (setno = 0; setno < numGroupingSets; setno++) + ReScanExprContext(node->aggcontexts[setno]); + if (node->hashcontext) + ReScanExprContext(node->hashcontext); + + /* + * We don't actually free any ExprContexts here (see comment in + * ExecFreeExprContext), just unlinking the output one from the plan node + * suffices. + */ + ExecFreeExprContext(&node->ss.ps); + + /* clean up tuple table */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + +void +ExecReScanAgg(AggState *node) +{ + ExprContext *econtext = node->ss.ps.ps_ExprContext; + PlanState *outerPlan = outerPlanState(node); + Agg *aggnode = (Agg *) node->ss.ps.plan; + int transno; + int numGroupingSets = Max(node->maxsets, 1); + int setno; + + node->agg_done = false; + + if (node->aggstrategy == AGG_HASHED) + { + /* + * In the hashed case, if we haven't yet built the hash table then we + * can just return; nothing done yet, so nothing to undo. If subnode's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. + */ + if (!node->table_filled) + return; + + /* + * If we do have the hash table, and it never spilled, and the subplan + * does not have any parameter changes, and none of our own parameter + * changes affect input expressions of the aggregated functions, then + * we can just rescan the existing hash table; no need to build it + * again. + */ + if (outerPlan->chgParam == NULL && !node->hash_ever_spilled && + !bms_overlap(node->ss.ps.chgParam, aggnode->aggParams)) + { + ResetTupleHashIterator(node->perhash[0].hashtable, + &node->perhash[0].hashiter); + select_current_set(node, 0, true); + return; + } + } + + /* Make sure we have closed any open tuplesorts */ + for (transno = 0; transno < node->numtrans; transno++) + { + for (setno = 0; setno < numGroupingSets; setno++) + { + AggStatePerTrans pertrans = &node->pertrans[transno]; + + if (pertrans->sortstates[setno]) + { + tuplesort_end(pertrans->sortstates[setno]); + pertrans->sortstates[setno] = NULL; + } + } + } + + /* + * We don't need to ReScanExprContext the output tuple context here; + * ExecReScan already did it. But we do need to reset our per-grouping-set + * contexts, which may have transvalues stored in them. 
(We use rescan + * rather than just reset because transfns may have registered callbacks + * that need to be run now.) For the AGG_HASHED case, see below. + */ + + for (setno = 0; setno < numGroupingSets; setno++) + { + ReScanExprContext(node->aggcontexts[setno]); + } + + /* Release first tuple of group, if we have made a copy */ + if (node->grp_firstTuple != NULL) + { + heap_freetuple(node->grp_firstTuple); + node->grp_firstTuple = NULL; + } + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* Forget current agg values */ + MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs); + MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs); + + /* + * With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of + * the hashcontext. This used to be an issue, but now, resetting a context + * automatically deletes sub-contexts too. + */ + if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED) + { + hashagg_reset_spill_state(node); + + node->hash_ever_spilled = false; + node->hash_spill_mode = false; + node->hash_ngroups_current = 0; + + ReScanExprContext(node->hashcontext); + /* Rebuild an empty hash table */ + build_hash_tables(node); + node->table_filled = false; + /* iterator will be reset when the table is filled */ + + hashagg_recompile_expressions(node, false, false); + } + + if (node->aggstrategy != AGG_HASHED) + { + /* + * Reset the per-group state (in particular, mark transvalues null) + */ + for (setno = 0; setno < numGroupingSets; setno++) + { + MemSet(node->pergroups[setno], 0, + sizeof(AggStatePerGroupData) * node->numaggs); + } + + /* reset to phase 1 */ + initialize_phase(node, 1); + + node->input_done = false; + node->projected_set = -1; + } + + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + + +/*********************************************************************** + * API exposed to aggregate functions + ***********************************************************************/ + + +/* + * AggCheckCallContext - test if a SQL function is being called as an aggregate + * + * The transition and/or final functions of an aggregate may want to verify + * that they are being called as aggregates, rather than as plain SQL + * functions. They should use this function to do so. The return value + * is nonzero if being called as an aggregate, or zero if not. (Specific + * nonzero values are AGG_CONTEXT_AGGREGATE or AGG_CONTEXT_WINDOW, but more + * values could conceivably appear in future.) + * + * If aggcontext isn't NULL, the function also stores at *aggcontext the + * identity of the memory context that aggregate transition values are being + * stored in. Note that the same aggregate call site (flinfo) may be called + * interleaved on different transition values in different contexts, so it's + * not kosher to cache aggcontext under fn_extra. It is, however, kosher to + * cache it in the transvalue itself (for internal-type transvalues). 
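+ *
+ * As a rough usage sketch (not taken from any particular aggregate), a
+ * transition function working with an internal-type transvalue would
+ * typically begin with
+ *
+ *     MemoryContext aggcontext;
+ *
+ *     if (!AggCheckCallContext(fcinfo, &aggcontext))
+ *         elog(ERROR, "transition function called in non-aggregate context");
+ *
+ * and then allocate any long-lived state in aggcontext rather than in
+ * the per-call memory context.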
+ */ +int +AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + if (aggcontext) + { + AggState *aggstate = ((AggState *) fcinfo->context); + ExprContext *cxt = aggstate->curaggcontext; + + *aggcontext = cxt->ecxt_per_tuple_memory; + } + return AGG_CONTEXT_AGGREGATE; + } + if (fcinfo->context && IsA(fcinfo->context, WindowAggState)) + { + if (aggcontext) + *aggcontext = ((WindowAggState *) fcinfo->context)->curaggcontext; + return AGG_CONTEXT_WINDOW; + } + + /* this is just to prevent "uninitialized variable" warnings */ + if (aggcontext) + *aggcontext = NULL; + return 0; +} + +/* + * AggGetAggref - allow an aggregate support function to get its Aggref + * + * If the function is being called as an aggregate support function, + * return the Aggref node for the aggregate call. Otherwise, return NULL. + * + * Aggregates sharing the same inputs and transition functions can get + * merged into a single transition calculation. If the transition function + * calls AggGetAggref, it will get some one of the Aggrefs for which it is + * executing. It must therefore not pay attention to the Aggref fields that + * relate to the final function, as those are indeterminate. But if a final + * function calls AggGetAggref, it will get a precise result. + * + * Note that if an aggregate is being used as a window function, this will + * return NULL. We could provide a similar function to return the relevant + * WindowFunc node in such cases, but it's not needed yet. + */ +Aggref * +AggGetAggref(FunctionCallInfo fcinfo) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + AggStatePerAgg curperagg; + AggStatePerTrans curpertrans; + + /* check curperagg (valid when in a final function) */ + curperagg = aggstate->curperagg; + + if (curperagg) + return curperagg->aggref; + + /* check curpertrans (valid when in a transition function) */ + curpertrans = aggstate->curpertrans; + + if (curpertrans) + return curpertrans->aggref; + } + return NULL; +} + +/* + * AggGetTempMemoryContext - fetch short-term memory context for aggregates + * + * This is useful in agg final functions; the context returned is one that + * the final function can safely reset as desired. This isn't useful for + * transition functions, since the context returned MAY (we don't promise) + * be the same as the context those are called in. + * + * As above, this is currently not useful for aggs called as window functions. + */ +MemoryContext +AggGetTempMemoryContext(FunctionCallInfo fcinfo) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + + return aggstate->tmpcontext->ecxt_per_tuple_memory; + } + return NULL; +} + +/* + * AggStateIsShared - find out whether transition state is shared + * + * If the function is being called as an aggregate support function, + * return true if the aggregate's transition state is shared across + * multiple aggregates, false if it is not. + * + * Returns true if not called as an aggregate support function. + * This is intended as a conservative answer, ie "no you'd better not + * scribble on your input". In particular, will return true if the + * aggregate is being used as a window function, which is a scenario + * in which changing the transition state is a bad idea. We might + * want to refine the behavior for the window case in future. 
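+ *
+ * So a final function that would like to modify the transition value in
+ * place might check, roughly (sketch only):
+ *
+ *     if (AggStateIsShared(fcinfo))
+ *         ... copy the state before modifying it ...
+ *     else
+ *         ... it is OK to scribble on the existing state ...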
+ */ +bool +AggStateIsShared(FunctionCallInfo fcinfo) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + AggStatePerAgg curperagg; + AggStatePerTrans curpertrans; + + /* check curperagg (valid when in a final function) */ + curperagg = aggstate->curperagg; + + if (curperagg) + return aggstate->pertrans[curperagg->transno].aggshared; + + /* check curpertrans (valid when in a transition function) */ + curpertrans = aggstate->curpertrans; + + if (curpertrans) + return curpertrans->aggshared; + } + return true; +} + +/* + * AggRegisterCallback - register a cleanup callback for an aggregate + * + * This is useful for aggs to register shutdown callbacks, which will ensure + * that non-memory resources are freed. The callback will occur just before + * the associated aggcontext (as returned by AggCheckCallContext) is reset, + * either between groups or as a result of rescanning the query. The callback + * will NOT be called on error paths. The typical use-case is for freeing of + * tuplestores or tuplesorts maintained in aggcontext, or pins held by slots + * created by the agg functions. (The callback will not be called until after + * the result of the finalfn is no longer needed, so it's safe for the finalfn + * to return data that will be freed by the callback.) + * + * As above, this is currently not useful for aggs called as window functions. + */ +void +AggRegisterCallback(FunctionCallInfo fcinfo, + ExprContextCallbackFunction func, + Datum arg) +{ + if (fcinfo->context && IsA(fcinfo->context, AggState)) + { + AggState *aggstate = (AggState *) fcinfo->context; + ExprContext *cxt = aggstate->curaggcontext; + + RegisterExprContextCallback(cxt, func, arg); + + return; + } + elog(ERROR, "aggregate function cannot register a callback in this context"); +} + + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + + /* ---------------------------------------------------------------- + * ExecAggEstimate + * + * Estimate space required to propagate aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggEstimate(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation)); + size = add_size(size, offsetof(SharedAggInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeDSM + * + * Initialize DSM space for aggregate statistics. 
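+ *
+ * The shared area is laid out as a SharedAggInfo header followed by one
+ * AggregateInstrumentation slot per worker, i.e. the same
+ * offsetof(SharedAggInfo, sinstrument) +
+ * nworkers * sizeof(AggregateInstrumentation) bytes that
+ * ExecAggEstimate sized above.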
+ * ---------------------------------------------------------------- + */ +void +ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + pcxt->nworkers * sizeof(AggregateInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeWorker + * + * Attach worker to DSM space for aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); +} + +/* ---------------------------------------------------------------- + * ExecAggRetrieveInstrumentation + * + * Transfer aggregate statistics from DSM to private memory. + * ---------------------------------------------------------------- + */ +void +ExecAggRetrieveInstrumentation(AggState *node) +{ + Size size; + SharedAggInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + node->shared_info->num_workers * sizeof(AggregateInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c new file mode 100644 index 0000000..6a2daa6 --- /dev/null +++ b/src/backend/executor/nodeAppend.c @@ -0,0 +1,1186 @@ +/*------------------------------------------------------------------------- + * + * nodeAppend.c + * routines to handle append nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeAppend.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitAppend - initialize the append node + * ExecAppend - retrieve the next tuple from the node + * ExecEndAppend - shut down the append node + * ExecReScanAppend - rescan the append node + * + * NOTES + * Each append node contains a list of one or more subplans which + * must be iteratively processed (forwards or backwards). + * Tuples are retrieved by executing the 'whichplan'th subplan + * until the subplan stops returning tuples, at which point that + * plan is shut down and the next started up. + * + * Append nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans so + * a typical append node looks like this in the plan tree: + * + * ... + * / + * Append -------+------+------+--- nil + * / \ | | | + * nil nil ... ... ... + * subplans + * + * Append nodes are currently used for unions, and to support + * inheritance queries, where several relations need to be scanned. 
+ * For example, in our standard person/student/employee/student-emp + * example, where student and employee inherit from person + * and student-emp inherits from student and employee, the + * query: + * + * select name from person + * + * generates the plan: + * + * | + * Append -------+-------+--------+--------+ + * / \ | | | | + * nil nil Scan Scan Scan Scan + * | | | | + * person employee student student-emp + */ + +#include "postgres.h" + +#include "executor/execAsync.h" +#include "executor/execdebug.h" +#include "executor/execPartition.h" +#include "executor/nodeAppend.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/latch.h" + +/* Shared state for parallel-aware Append. */ +struct ParallelAppendState +{ + LWLock pa_lock; /* mutual exclusion to choose next subplan */ + int pa_next_plan; /* next plan to choose by any worker */ + + /* + * pa_finished[i] should be true if no more workers should select subplan + * i. for a non-partial plan, this should be set to true as soon as a + * worker selects the plan; for a partial plan, it remains false until + * some worker executes the plan to completion. + */ + bool pa_finished[FLEXIBLE_ARRAY_MEMBER]; +}; + +#define INVALID_SUBPLAN_INDEX -1 +#define EVENT_BUFFER_SIZE 16 + +static TupleTableSlot *ExecAppend(PlanState *pstate); +static bool choose_next_subplan_locally(AppendState *node); +static bool choose_next_subplan_for_leader(AppendState *node); +static bool choose_next_subplan_for_worker(AppendState *node); +static void mark_invalid_subplans_as_finished(AppendState *node); +static void ExecAppendAsyncBegin(AppendState *node); +static bool ExecAppendAsyncGetNext(AppendState *node, TupleTableSlot **result); +static bool ExecAppendAsyncRequest(AppendState *node, TupleTableSlot **result); +static void ExecAppendAsyncEventWait(AppendState *node); +static void classify_matching_subplans(AppendState *node); + +/* ---------------------------------------------------------------- + * ExecInitAppend + * + * Begin all of the subscans of the append node. + * + * (This is potentially wasteful, since the entire result of the + * append node may not be scanned, but this way all of the + * structures get allocated in the executor's top level memory + * block instead of that of the call to ExecAppend.) + * ---------------------------------------------------------------- + */ +AppendState * +ExecInitAppend(Append *node, EState *estate, int eflags) +{ + AppendState *appendstate = makeNode(AppendState); + PlanState **appendplanstates; + Bitmapset *validsubplans; + Bitmapset *asyncplans; + int nplans; + int nasyncplans; + int firstvalid; + int i, + j; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * create new AppendState for our append node + */ + appendstate->ps.plan = (Plan *) node; + appendstate->ps.state = estate; + appendstate->ps.ExecProcNode = ExecAppend; + + /* Let choose_next_subplan_* function handle setting the first subplan */ + appendstate->as_whichplan = INVALID_SUBPLAN_INDEX; + appendstate->as_syncdone = false; + appendstate->as_begun = false; + + /* If run-time partition pruning is enabled, then set that up now */ + if (node->part_prune_info != NULL) + { + PartitionPruneState *prunestate; + + /* We may need an expression context to evaluate partition exprs */ + ExecAssignExprContext(estate, &appendstate->ps); + + /* Create the working data structure for pruning. 
*/ + prunestate = ExecCreatePartitionPruneState(&appendstate->ps, + node->part_prune_info); + appendstate->as_prune_state = prunestate; + + /* Perform an initial partition prune, if required. */ + if (prunestate->do_initial_prune) + { + /* Determine which subplans survive initial pruning */ + validsubplans = ExecFindInitialMatchingSubPlans(prunestate, + list_length(node->appendplans)); + + nplans = bms_num_members(validsubplans); + } + else + { + /* We'll need to initialize all subplans */ + nplans = list_length(node->appendplans); + Assert(nplans > 0); + validsubplans = bms_add_range(NULL, 0, nplans - 1); + } + + /* + * When no run-time pruning is required and there's at least one + * subplan, we can fill as_valid_subplans immediately, preventing + * later calls to ExecFindMatchingSubPlans. + */ + if (!prunestate->do_exec_prune && nplans > 0) + appendstate->as_valid_subplans = bms_add_range(NULL, 0, nplans - 1); + } + else + { + nplans = list_length(node->appendplans); + + /* + * When run-time partition pruning is not enabled we can just mark all + * subplans as valid; they must also all be initialized. + */ + Assert(nplans > 0); + appendstate->as_valid_subplans = validsubplans = + bms_add_range(NULL, 0, nplans - 1); + appendstate->as_prune_state = NULL; + } + + /* + * Initialize result tuple type and slot. + */ + ExecInitResultTupleSlotTL(&appendstate->ps, &TTSOpsVirtual); + + /* node returns slots from each of its subnodes, therefore not fixed */ + appendstate->ps.resultopsset = true; + appendstate->ps.resultopsfixed = false; + + appendplanstates = (PlanState **) palloc(nplans * + sizeof(PlanState *)); + + /* + * call ExecInitNode on each of the valid plans to be executed and save + * the results into the appendplanstates array. + * + * While at it, find out the first valid partial plan. + */ + j = 0; + asyncplans = NULL; + nasyncplans = 0; + firstvalid = nplans; + i = -1; + while ((i = bms_next_member(validsubplans, i)) >= 0) + { + Plan *initNode = (Plan *) list_nth(node->appendplans, i); + + /* + * Record async subplans. When executing EvalPlanQual, we treat them + * as sync ones; don't do this when initializing an EvalPlanQual plan + * tree. + */ + if (initNode->async_capable && estate->es_epq_active == NULL) + { + asyncplans = bms_add_member(asyncplans, j); + nasyncplans++; + } + + /* + * Record the lowest appendplans index which is a valid partial plan. 
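+ * (For example, with four appendplans, first_partial_plan = 2, and
+ * pruning leaving validsubplans = {1, 3}, appendplanstates[] receives
+ * only plans 1 and 3, and as_first_partial_plan ends up as 1, the
+ * compacted index of plan 3.)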
+ */ + if (i >= node->first_partial_plan && j < firstvalid) + firstvalid = j; + + appendplanstates[j++] = ExecInitNode(initNode, estate, eflags); + } + + appendstate->as_first_partial_plan = firstvalid; + appendstate->appendplans = appendplanstates; + appendstate->as_nplans = nplans; + + /* Initialize async state */ + appendstate->as_asyncplans = asyncplans; + appendstate->as_nasyncplans = nasyncplans; + appendstate->as_asyncrequests = NULL; + appendstate->as_asyncresults = NULL; + appendstate->as_nasyncresults = 0; + appendstate->as_nasyncremain = 0; + appendstate->as_needrequest = NULL; + appendstate->as_eventset = NULL; + appendstate->as_valid_asyncplans = NULL; + + if (nasyncplans > 0) + { + appendstate->as_asyncrequests = (AsyncRequest **) + palloc0(nplans * sizeof(AsyncRequest *)); + + i = -1; + while ((i = bms_next_member(asyncplans, i)) >= 0) + { + AsyncRequest *areq; + + areq = palloc(sizeof(AsyncRequest)); + areq->requestor = (PlanState *) appendstate; + areq->requestee = appendplanstates[i]; + areq->request_index = i; + areq->callback_pending = false; + areq->request_complete = false; + areq->result = NULL; + + appendstate->as_asyncrequests[i] = areq; + } + + appendstate->as_asyncresults = (TupleTableSlot **) + palloc0(nasyncplans * sizeof(TupleTableSlot *)); + + if (appendstate->as_valid_subplans != NULL) + classify_matching_subplans(appendstate); + } + + /* + * Miscellaneous initialization + */ + + appendstate->ps.ps_ProjInfo = NULL; + + /* For parallel query, this will be overridden later. */ + appendstate->choose_next_subplan = choose_next_subplan_locally; + + return appendstate; +} + +/* ---------------------------------------------------------------- + * ExecAppend + * + * Handles iteration over multiple subplans. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecAppend(PlanState *pstate) +{ + AppendState *node = castNode(AppendState, pstate); + TupleTableSlot *result; + + /* + * If this is the first call after Init or ReScan, we need to do the + * initialization work. + */ + if (!node->as_begun) + { + Assert(node->as_whichplan == INVALID_SUBPLAN_INDEX); + Assert(!node->as_syncdone); + + /* Nothing to do if there are no subplans */ + if (node->as_nplans == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* If there are any async subplans, begin executing them. */ + if (node->as_nasyncplans > 0) + ExecAppendAsyncBegin(node); + + /* + * If no sync subplan has been chosen, we must choose one before + * proceeding. + */ + if (!node->choose_next_subplan(node) && node->as_nasyncremain == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + + Assert(node->as_syncdone || + (node->as_whichplan >= 0 && + node->as_whichplan < node->as_nplans)); + + /* And we're initialized. */ + node->as_begun = true; + } + + for (;;) + { + PlanState *subnode; + + CHECK_FOR_INTERRUPTS(); + + /* + * try to get a tuple from an async subplan if any + */ + if (node->as_syncdone || !bms_is_empty(node->as_needrequest)) + { + if (ExecAppendAsyncGetNext(node, &result)) + return result; + Assert(!node->as_syncdone); + Assert(bms_is_empty(node->as_needrequest)); + } + + /* + * figure out which sync subplan we are currently processing + */ + Assert(node->as_whichplan >= 0 && node->as_whichplan < node->as_nplans); + subnode = node->appendplans[node->as_whichplan]; + + /* + * get a tuple from the subplan + */ + result = ExecProcNode(subnode); + + if (!TupIsNull(result)) + { + /* + * If the subplan gave us something then return it as-is. 
We do + * NOT make use of the result slot that was set up in + * ExecInitAppend; there's no need for it. + */ + return result; + } + + /* + * wait or poll for async events if any. We do this before checking + * for the end of iteration, because it might drain the remaining + * async subplans. + */ + if (node->as_nasyncremain > 0) + ExecAppendAsyncEventWait(node); + + /* choose new sync subplan; if no sync/async subplans, we're done */ + if (!node->choose_next_subplan(node) && node->as_nasyncremain == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + } +} + +/* ---------------------------------------------------------------- + * ExecEndAppend + * + * Shuts down the subscans of the append node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndAppend(AppendState *node) +{ + PlanState **appendplans; + int nplans; + int i; + + /* + * get information from the node + */ + appendplans = node->appendplans; + nplans = node->as_nplans; + + /* + * shut down each of the subscans + */ + for (i = 0; i < nplans; i++) + ExecEndNode(appendplans[i]); +} + +void +ExecReScanAppend(AppendState *node) +{ + int nasyncplans = node->as_nasyncplans; + int i; + + /* + * If any PARAM_EXEC Params used in pruning expressions have changed, then + * we'd better unset the valid subplans so that they are reselected for + * the new parameter values. + */ + if (node->as_prune_state && + bms_overlap(node->ps.chgParam, + node->as_prune_state->execparamids)) + { + bms_free(node->as_valid_subplans); + node->as_valid_subplans = NULL; + if (nasyncplans > 0) + { + bms_free(node->as_valid_asyncplans); + node->as_valid_asyncplans = NULL; + } + } + + for (i = 0; i < node->as_nplans; i++) + { + PlanState *subnode = node->appendplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode or by first ExecAsyncRequest. + */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } + + /* Reset async state */ + if (nasyncplans > 0) + { + i = -1; + while ((i = bms_next_member(node->as_asyncplans, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + areq->callback_pending = false; + areq->request_complete = false; + areq->result = NULL; + } + + node->as_nasyncresults = 0; + node->as_nasyncremain = 0; + bms_free(node->as_needrequest); + node->as_needrequest = NULL; + } + + /* Let choose_next_subplan_* function handle setting the first subplan */ + node->as_whichplan = INVALID_SUBPLAN_INDEX; + node->as_syncdone = false; + node->as_begun = false; +} + +/* ---------------------------------------------------------------- + * Parallel Append Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecAppendEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. 
+ * ---------------------------------------------------------------- + */ +void +ExecAppendEstimate(AppendState *node, + ParallelContext *pcxt) +{ + node->pstate_len = + add_size(offsetof(ParallelAppendState, pa_finished), + sizeof(bool) * node->as_nplans); + + shm_toc_estimate_chunk(&pcxt->estimator, node->pstate_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + + +/* ---------------------------------------------------------------- + * ExecAppendInitializeDSM + * + * Set up shared state for Parallel Append. + * ---------------------------------------------------------------- + */ +void +ExecAppendInitializeDSM(AppendState *node, + ParallelContext *pcxt) +{ + ParallelAppendState *pstate; + + pstate = shm_toc_allocate(pcxt->toc, node->pstate_len); + memset(pstate, 0, node->pstate_len); + LWLockInitialize(&pstate->pa_lock, LWTRANCHE_PARALLEL_APPEND); + shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, pstate); + + node->as_pstate = pstate; + node->choose_next_subplan = choose_next_subplan_for_leader; +} + +/* ---------------------------------------------------------------- + * ExecAppendReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecAppendReInitializeDSM(AppendState *node, ParallelContext *pcxt) +{ + ParallelAppendState *pstate = node->as_pstate; + + pstate->pa_next_plan = 0; + memset(pstate->pa_finished, 0, sizeof(bool) * node->as_nplans); +} + +/* ---------------------------------------------------------------- + * ExecAppendInitializeWorker + * + * Copy relevant information from TOC into planstate, and initialize + * whatever is required to choose and execute the optimal subplan. + * ---------------------------------------------------------------- + */ +void +ExecAppendInitializeWorker(AppendState *node, ParallelWorkerContext *pwcxt) +{ + node->as_pstate = shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false); + node->choose_next_subplan = choose_next_subplan_for_worker; +} + +/* ---------------------------------------------------------------- + * choose_next_subplan_locally + * + * Choose next sync subplan for a non-parallel-aware Append, + * returning false if there are no more. + * ---------------------------------------------------------------- + */ +static bool +choose_next_subplan_locally(AppendState *node) +{ + int whichplan = node->as_whichplan; + int nextplan; + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + /* Nothing to do if syncdone */ + if (node->as_syncdone) + return false; + + /* + * If first call then have the bms member function choose the first valid + * sync subplan by initializing whichplan to -1. If there happen to be no + * valid sync subplans then the bms member function will handle that by + * returning a negative number which will allow us to exit returning a + * false value. 
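+ *
+ * (For example, with as_valid_subplans = {0, 2}, a forward scan gets
+ * bms_next_member({0, 2}, -1) = 0 on the first call, then 2, and then a
+ * negative result, which ends the scan of the sync subplans.)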
+ */ + if (whichplan == INVALID_SUBPLAN_INDEX) + { + if (node->as_nasyncplans > 0) + { + /* We'd have filled as_valid_subplans already */ + Assert(node->as_valid_subplans); + } + else if (node->as_valid_subplans == NULL) + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + + whichplan = -1; + } + + /* Ensure whichplan is within the expected range */ + Assert(whichplan >= -1 && whichplan <= node->as_nplans); + + if (ScanDirectionIsForward(node->ps.state->es_direction)) + nextplan = bms_next_member(node->as_valid_subplans, whichplan); + else + nextplan = bms_prev_member(node->as_valid_subplans, whichplan); + + if (nextplan < 0) + { + /* Set as_syncdone if in async mode */ + if (node->as_nasyncplans > 0) + node->as_syncdone = true; + return false; + } + + node->as_whichplan = nextplan; + + return true; +} + +/* ---------------------------------------------------------------- + * choose_next_subplan_for_leader + * + * Try to pick a plan which doesn't commit us to doing much + * work locally, so that as much work as possible is done in + * the workers. Cheapest subplans are at the end. + * ---------------------------------------------------------------- + */ +static bool +choose_next_subplan_for_leader(AppendState *node) +{ + ParallelAppendState *pstate = node->as_pstate; + + /* Backward scan is not supported by parallel-aware plans */ + Assert(ScanDirectionIsForward(node->ps.state->es_direction)); + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE); + + if (node->as_whichplan != INVALID_SUBPLAN_INDEX) + { + /* Mark just-completed subplan as finished. */ + node->as_pstate->pa_finished[node->as_whichplan] = true; + } + else + { + /* Start with last subplan. */ + node->as_whichplan = node->as_nplans - 1; + + /* + * If we've yet to determine the valid subplans then do so now. If + * run-time pruning is disabled then the valid subplans will always be + * set to all subplans. + */ + if (node->as_valid_subplans == NULL) + { + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + + /* + * Mark each invalid plan as finished to allow the loop below to + * select the first valid subplan. + */ + mark_invalid_subplans_as_finished(node); + } + } + + /* Loop until we find a subplan to execute. */ + while (pstate->pa_finished[node->as_whichplan]) + { + if (node->as_whichplan == 0) + { + pstate->pa_next_plan = INVALID_SUBPLAN_INDEX; + node->as_whichplan = INVALID_SUBPLAN_INDEX; + LWLockRelease(&pstate->pa_lock); + return false; + } + + /* + * We needn't pay attention to as_valid_subplans here as all invalid + * plans have been marked as finished. + */ + node->as_whichplan--; + } + + /* If non-partial, immediately mark as finished. */ + if (node->as_whichplan < node->as_first_partial_plan) + node->as_pstate->pa_finished[node->as_whichplan] = true; + + LWLockRelease(&pstate->pa_lock); + + return true; +} + +/* ---------------------------------------------------------------- + * choose_next_subplan_for_worker + * + * Choose next subplan for a parallel-aware Append, returning + * false if there are no more. + * + * We start from the first plan and advance through the list; + * when we get back to the end, we loop back to the first + * partial plan. This assigns the non-partial plans first in + * order of descending cost and then spreads out the workers + * as evenly as possible across the remaining partial plans. 
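+ *
+ * For example, with non-partial subplans 0 and 1 and partial subplans
+ * 2 and 3 (first_partial_plan = 2), the first two calls here hand out
+ * plans 0 and 1, each marked finished as soon as it is chosen (and
+ * assuming the leader has not already claimed them); later calls then
+ * cycle over 2 and 3 until both have been executed to completion.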
+ * ---------------------------------------------------------------- + */ +static bool +choose_next_subplan_for_worker(AppendState *node) +{ + ParallelAppendState *pstate = node->as_pstate; + + /* Backward scan is not supported by parallel-aware plans */ + Assert(ScanDirectionIsForward(node->ps.state->es_direction)); + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE); + + /* Mark just-completed subplan as finished. */ + if (node->as_whichplan != INVALID_SUBPLAN_INDEX) + node->as_pstate->pa_finished[node->as_whichplan] = true; + + /* + * If we've yet to determine the valid subplans then do so now. If + * run-time pruning is disabled then the valid subplans will always be set + * to all subplans. + */ + else if (node->as_valid_subplans == NULL) + { + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + mark_invalid_subplans_as_finished(node); + } + + /* If all the plans are already done, we have nothing to do */ + if (pstate->pa_next_plan == INVALID_SUBPLAN_INDEX) + { + LWLockRelease(&pstate->pa_lock); + return false; + } + + /* Save the plan from which we are starting the search. */ + node->as_whichplan = pstate->pa_next_plan; + + /* Loop until we find a valid subplan to execute. */ + while (pstate->pa_finished[pstate->pa_next_plan]) + { + int nextplan; + + nextplan = bms_next_member(node->as_valid_subplans, + pstate->pa_next_plan); + if (nextplan >= 0) + { + /* Advance to the next valid plan. */ + pstate->pa_next_plan = nextplan; + } + else if (node->as_whichplan > node->as_first_partial_plan) + { + /* + * Try looping back to the first valid partial plan, if there is + * one. If there isn't, arrange to bail out below. + */ + nextplan = bms_next_member(node->as_valid_subplans, + node->as_first_partial_plan - 1); + pstate->pa_next_plan = + nextplan < 0 ? node->as_whichplan : nextplan; + } + else + { + /* + * At last plan, and either there are no partial plans or we've + * tried them all. Arrange to bail out. + */ + pstate->pa_next_plan = node->as_whichplan; + } + + if (pstate->pa_next_plan == node->as_whichplan) + { + /* We've tried everything! */ + pstate->pa_next_plan = INVALID_SUBPLAN_INDEX; + LWLockRelease(&pstate->pa_lock); + return false; + } + } + + /* Pick the plan we found, and advance pa_next_plan one more time. */ + node->as_whichplan = pstate->pa_next_plan; + pstate->pa_next_plan = bms_next_member(node->as_valid_subplans, + pstate->pa_next_plan); + + /* + * If there are no more valid plans then try setting the next plan to the + * first valid partial plan. + */ + if (pstate->pa_next_plan < 0) + { + int nextplan = bms_next_member(node->as_valid_subplans, + node->as_first_partial_plan - 1); + + if (nextplan >= 0) + pstate->pa_next_plan = nextplan; + else + { + /* + * There are no valid partial plans, and we already chose the last + * non-partial plan; so flag that there's nothing more for our + * fellow workers to do. + */ + pstate->pa_next_plan = INVALID_SUBPLAN_INDEX; + } + } + + /* If non-partial, immediately mark as finished. */ + if (node->as_whichplan < node->as_first_partial_plan) + node->as_pstate->pa_finished[node->as_whichplan] = true; + + LWLockRelease(&pstate->pa_lock); + + return true; +} + +/* + * mark_invalid_subplans_as_finished + * Marks the ParallelAppendState's pa_finished as true for each invalid + * subplan. + * + * This function should only be called for parallel Append with run-time + * pruning enabled. 
+ */ +static void +mark_invalid_subplans_as_finished(AppendState *node) +{ + int i; + + /* Only valid to call this while in parallel Append mode */ + Assert(node->as_pstate); + + /* Shouldn't have been called when run-time pruning is not enabled */ + Assert(node->as_prune_state); + + /* Nothing to do if all plans are valid */ + if (bms_num_members(node->as_valid_subplans) == node->as_nplans) + return; + + /* Mark all non-valid plans as finished */ + for (i = 0; i < node->as_nplans; i++) + { + if (!bms_is_member(i, node->as_valid_subplans)) + node->as_pstate->pa_finished[i] = true; + } +} + +/* ---------------------------------------------------------------- + * Asynchronous Append Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecAppendAsyncBegin + * + * Begin executing designed async-capable subplans. + * ---------------------------------------------------------------- + */ +static void +ExecAppendAsyncBegin(AppendState *node) +{ + int i; + + /* Backward scan is not supported by async-aware Appends. */ + Assert(ScanDirectionIsForward(node->ps.state->es_direction)); + + /* We should never be called when there are no subplans */ + Assert(node->as_nplans > 0); + + /* We should never be called when there are no async subplans. */ + Assert(node->as_nasyncplans > 0); + + /* If we've yet to determine the valid subplans then do so now. */ + if (node->as_valid_subplans == NULL) + { + node->as_valid_subplans = + ExecFindMatchingSubPlans(node->as_prune_state); + + classify_matching_subplans(node); + } + + /* Initialize state variables. */ + node->as_syncdone = bms_is_empty(node->as_valid_subplans); + node->as_nasyncremain = bms_num_members(node->as_valid_asyncplans); + + /* Nothing to do if there are no valid async subplans. */ + if (node->as_nasyncremain == 0) + return; + + /* Make a request for each of the valid async subplans. */ + i = -1; + while ((i = bms_next_member(node->as_valid_asyncplans, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + Assert(areq->request_index == i); + Assert(!areq->callback_pending); + + /* Do the actual work. */ + ExecAsyncRequest(areq); + } +} + +/* ---------------------------------------------------------------- + * ExecAppendAsyncGetNext + * + * Get the next tuple from any of the asynchronous subplans. + * ---------------------------------------------------------------- + */ +static bool +ExecAppendAsyncGetNext(AppendState *node, TupleTableSlot **result) +{ + *result = NULL; + + /* We should never be called when there are no valid async subplans. */ + Assert(node->as_nasyncremain > 0); + + /* Request a tuple asynchronously. */ + if (ExecAppendAsyncRequest(node, result)) + return true; + + while (node->as_nasyncremain > 0) + { + CHECK_FOR_INTERRUPTS(); + + /* Wait or poll for async events. */ + ExecAppendAsyncEventWait(node); + + /* Request a tuple asynchronously. */ + if (ExecAppendAsyncRequest(node, result)) + return true; + + /* Break from loop if there's any sync subplan that isn't complete. */ + if (!node->as_syncdone) + break; + } + + /* + * If all sync subplans are complete, we're totally done scanning the + * given node. Otherwise, we're done with the asynchronous stuff but must + * continue scanning the sync subplans. 
+ */ + if (node->as_syncdone) + { + Assert(node->as_nasyncremain == 0); + *result = ExecClearTuple(node->ps.ps_ResultTupleSlot); + return true; + } + + return false; +} + +/* ---------------------------------------------------------------- + * ExecAppendAsyncRequest + * + * Request a tuple asynchronously. + * ---------------------------------------------------------------- + */ +static bool +ExecAppendAsyncRequest(AppendState *node, TupleTableSlot **result) +{ + Bitmapset *needrequest; + int i; + + /* Nothing to do if there are no async subplans needing a new request. */ + if (bms_is_empty(node->as_needrequest)) + { + Assert(node->as_nasyncresults == 0); + return false; + } + + /* + * If there are any asynchronously-generated results that have not yet + * been returned, we have nothing to do; just return one of them. + */ + if (node->as_nasyncresults > 0) + { + --node->as_nasyncresults; + *result = node->as_asyncresults[node->as_nasyncresults]; + return true; + } + + /* Make a new request for each of the async subplans that need it. */ + needrequest = node->as_needrequest; + node->as_needrequest = NULL; + i = -1; + while ((i = bms_next_member(needrequest, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + /* Do the actual work. */ + ExecAsyncRequest(areq); + } + bms_free(needrequest); + + /* Return one of the asynchronously-generated results if any. */ + if (node->as_nasyncresults > 0) + { + --node->as_nasyncresults; + *result = node->as_asyncresults[node->as_nasyncresults]; + return true; + } + + return false; +} + +/* ---------------------------------------------------------------- + * ExecAppendAsyncEventWait + * + * Wait or poll for file descriptor events and fire callbacks. + * ---------------------------------------------------------------- + */ +static void +ExecAppendAsyncEventWait(AppendState *node) +{ + int nevents = node->as_nasyncplans + 1; + long timeout = node->as_syncdone ? -1 : 0; + WaitEvent occurred_event[EVENT_BUFFER_SIZE]; + int noccurred; + int i; + + /* We should never be called when there are no valid async subplans. */ + Assert(node->as_nasyncremain > 0); + + node->as_eventset = CreateWaitEventSet(CurrentMemoryContext, nevents); + AddWaitEventToSet(node->as_eventset, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + /* Give each waiting subplan a chance to add an event. */ + i = -1; + while ((i = bms_next_member(node->as_asyncplans, i)) >= 0) + { + AsyncRequest *areq = node->as_asyncrequests[i]; + + if (areq->callback_pending) + ExecAsyncConfigureWait(areq); + } + + /* + * No need for further processing if there are no configured events other + * than the postmaster death event. + */ + if (GetNumRegisteredWaitEvents(node->as_eventset) == 1) + { + FreeWaitEventSet(node->as_eventset); + node->as_eventset = NULL; + return; + } + + /* We wait on at most EVENT_BUFFER_SIZE events. */ + if (nevents > EVENT_BUFFER_SIZE) + nevents = EVENT_BUFFER_SIZE; + + /* + * If the timeout is -1, wait until at least one event occurs. If the + * timeout is 0, poll for events, but do not wait at all. + */ + noccurred = WaitEventSetWait(node->as_eventset, timeout, occurred_event, + nevents, WAIT_EVENT_APPEND_READY); + FreeWaitEventSet(node->as_eventset); + node->as_eventset = NULL; + if (noccurred == 0) + return; + + /* Deliver notifications. */ + for (i = 0; i < noccurred; i++) + { + WaitEvent *w = &occurred_event[i]; + + /* + * Each waiting subplan should have registered its wait event with + * user_data pointing back to its AsyncRequest. 
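+ * For example, an async-capable ForeignScan's configure-wait callback
+ * would typically have registered its remote connection's socket
+ * roughly as
+ *
+ *     AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, areq);
+ *
+ * (a sketch only; 'set' stands for this node's as_eventset and 'sock'
+ * for the connection's socket), which is how w->user_data gets us back
+ * to the AsyncRequest here.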
+ */ + if ((w->events & WL_SOCKET_READABLE) != 0) + { + AsyncRequest *areq = (AsyncRequest *) w->user_data; + + if (areq->callback_pending) + { + /* + * Mark it as no longer needing a callback. We must do this + * before dispatching the callback in case the callback resets + * the flag. + */ + areq->callback_pending = false; + + /* Do the actual work. */ + ExecAsyncNotify(areq); + } + } + } +} + +/* ---------------------------------------------------------------- + * ExecAsyncAppendResponse + * + * Receive a response from an asynchronous request we made. + * ---------------------------------------------------------------- + */ +void +ExecAsyncAppendResponse(AsyncRequest *areq) +{ + AppendState *node = (AppendState *) areq->requestor; + TupleTableSlot *slot = areq->result; + + /* The result should be a TupleTableSlot or NULL. */ + Assert(slot == NULL || IsA(slot, TupleTableSlot)); + + /* Nothing to do if the request is pending. */ + if (!areq->request_complete) + { + /* The request would have been pending for a callback. */ + Assert(areq->callback_pending); + return; + } + + /* If the result is NULL or an empty slot, there's nothing more to do. */ + if (TupIsNull(slot)) + { + /* The ending subplan wouldn't have been pending for a callback. */ + Assert(!areq->callback_pending); + --node->as_nasyncremain; + return; + } + + /* Save result so we can return it. */ + Assert(node->as_nasyncresults < node->as_nasyncplans); + node->as_asyncresults[node->as_nasyncresults++] = slot; + + /* + * Mark the subplan that returned a result as ready for a new request. We + * don't launch another one here immediately because it might complete. + */ + node->as_needrequest = bms_add_member(node->as_needrequest, + areq->request_index); +} + +/* ---------------------------------------------------------------- + * classify_matching_subplans + * + * Classify the node's as_valid_subplans into sync ones and + * async ones, adjust it to contain sync ones only, and save + * async ones in the node's as_valid_asyncplans. + * ---------------------------------------------------------------- + */ +static void +classify_matching_subplans(AppendState *node) +{ + Bitmapset *valid_asyncplans; + + Assert(node->as_valid_asyncplans == NULL); + + /* Nothing to do if there are no valid subplans. */ + if (bms_is_empty(node->as_valid_subplans)) + { + node->as_syncdone = true; + node->as_nasyncremain = 0; + return; + } + + /* Nothing to do if there are no valid async subplans. */ + if (!bms_overlap(node->as_valid_subplans, node->as_asyncplans)) + { + node->as_nasyncremain = 0; + return; + } + + /* Get valid async subplans. */ + valid_asyncplans = bms_copy(node->as_asyncplans); + valid_asyncplans = bms_int_members(valid_asyncplans, + node->as_valid_subplans); + + /* Adjust the valid subplans to contain sync subplans only. */ + node->as_valid_subplans = bms_del_members(node->as_valid_subplans, + valid_asyncplans); + + /* Save valid async subplans. */ + node->as_valid_asyncplans = valid_asyncplans; +} diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c new file mode 100644 index 0000000..a8d7b1e --- /dev/null +++ b/src/backend/executor/nodeBitmapAnd.c @@ -0,0 +1,223 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapAnd.c + * routines to handle BitmapAnd nodes. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapAnd.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitBitmapAnd - initialize the BitmapAnd node + * MultiExecBitmapAnd - retrieve the result bitmap from the node + * ExecEndBitmapAnd - shut down the BitmapAnd node + * ExecReScanBitmapAnd - rescan the BitmapAnd node + * + * NOTES + * BitmapAnd nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans, + * much like Append nodes. The logic is much simpler than + * Append, however, since we needn't cope with forward/backward + * execution. + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeBitmapAnd.h" + + +/* ---------------------------------------------------------------- + * ExecBitmapAnd + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapAnd(PlanState *pstate) +{ + elog(ERROR, "BitmapAnd node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapAnd + * + * Begin all of the subscans of the BitmapAnd node. + * ---------------------------------------------------------------- + */ +BitmapAndState * +ExecInitBitmapAnd(BitmapAnd *node, EState *estate, int eflags) +{ + BitmapAndState *bitmapandstate = makeNode(BitmapAndState); + PlanState **bitmapplanstates; + int nplans; + int i; + ListCell *l; + Plan *initNode; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * Set up empty vector of subplan states + */ + nplans = list_length(node->bitmapplans); + + bitmapplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *)); + + /* + * create new BitmapAndState for our BitmapAnd node + */ + bitmapandstate->ps.plan = (Plan *) node; + bitmapandstate->ps.state = estate; + bitmapandstate->ps.ExecProcNode = ExecBitmapAnd; + bitmapandstate->bitmapplans = bitmapplanstates; + bitmapandstate->nplans = nplans; + + /* + * call ExecInitNode on each of the plans to be executed and save the + * results into the array "bitmapplanstates". + */ + i = 0; + foreach(l, node->bitmapplans) + { + initNode = (Plan *) lfirst(l); + bitmapplanstates[i] = ExecInitNode(initNode, estate, eflags); + i++; + } + + /* + * Miscellaneous initialization + * + * BitmapAnd plans don't have expression contexts because they never call + * ExecQual or ExecProject. They don't need any tuple slots either. 
+ */ + + return bitmapandstate; +} + +/* ---------------------------------------------------------------- + * MultiExecBitmapAnd + * ---------------------------------------------------------------- + */ +Node * +MultiExecBitmapAnd(BitmapAndState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + TIDBitmap *result = NULL; + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStartNode(node->ps.instrument); + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * Scan all the subplans and AND their result bitmaps + */ + for (i = 0; i < nplans; i++) + { + PlanState *subnode = bitmapplans[i]; + TIDBitmap *subresult; + + subresult = (TIDBitmap *) MultiExecProcNode(subnode); + + if (!subresult || !IsA(subresult, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + if (result == NULL) + result = subresult; /* first subplan */ + else + { + tbm_intersect(result, subresult); + tbm_free(subresult); + } + + /* + * If at any stage we have a completely empty bitmap, we can fall out + * without evaluating the remaining subplans, since ANDing them can no + * longer change the result. (Note: the fact that indxpath.c orders + * the subplans by selectivity should make this case more likely to + * occur.) + */ + if (tbm_is_empty(result)) + break; + } + + if (result == NULL) + elog(ERROR, "BitmapAnd doesn't support zero inputs"); + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStopNode(node->ps.instrument, 0 /* XXX */ ); + + return (Node *) result; +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapAnd + * + * Shuts down the subscans of the BitmapAnd node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapAnd(BitmapAndState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * shut down each of the subscans (that we've initialized) + */ + for (i = 0; i < nplans; i++) + { + if (bitmapplans[i]) + ExecEndNode(bitmapplans[i]); + } +} + +void +ExecReScanBitmapAnd(BitmapAndState *node) +{ + int i; + + for (i = 0; i < node->nplans; i++) + { + PlanState *subnode = node->bitmapplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } +} diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c new file mode 100644 index 0000000..2db1914 --- /dev/null +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -0,0 +1,954 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapHeapscan.c + * Routines to support bitmapped scans of relations + * + * NOTE: it is critical that this plan type only be used with MVCC-compliant + * snapshots (ie, regular snapshots, not SnapshotAny or one of the other + * special snapshots). The reason is that since index and heap scans are + * decoupled, there can be no assurance that the index tuple prompting a + * visit to a particular heap TID still exists when the visit is made. 
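MultiExecBitmapAnd() above adopts the first child's bitmap, intersects each further child's bitmap into it, and stops as soon as the running intersection is empty, since ANDing more inputs cannot change an empty result. A standalone sketch of that shape, using ordinary 64-bit masks rather than TIDBitmap; function names and sample values are illustrative only.

#include <stdio.h>
#include <stdint.h>

/* AND together the "bitmaps" produced by nsubs children, stopping as soon
 * as the running intersection is empty, as MultiExecBitmapAnd does. */
static uint64_t
bitmap_and(const uint64_t *subresults, int nsubs)
{
    uint64_t result = 0;

    for (int i = 0; i < nsubs; i++)
    {
        if (i == 0)
            result = subresults[i];      /* first subplan: adopt its bitmap */
        else
            result &= subresults[i];     /* tbm_intersect() analogue */

        if (result == 0)
            break;                       /* ANDing further inputs can't help */
    }
    return result;
}

int
main(void)
{
    uint64_t subs[] = {0xF0F0, 0x00F8, 0x0018};

    printf("intersection: 0x%llX\n",
           (unsigned long long) bitmap_and(subs, 3));   /* prints 0x10 */
    return 0;
}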
+ * Therefore the tuple might not exist anymore either (which is OK because + * heap_fetch will cope) --- but worse, the tuple slot could have been + * re-used for a newer tuple. With an MVCC snapshot the newer tuple is + * certain to fail the time qual and so it will not be mistakenly returned, + * but with anything else we might return a tuple that doesn't meet the + * required index qual conditions. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapHeapscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecBitmapHeapScan scans a relation using bitmap info + * ExecBitmapHeapNext workhorse for above + * ExecInitBitmapHeapScan creates and initializes state info. + * ExecReScanBitmapHeapScan prepares to rescan the plan. + * ExecEndBitmapHeapScan releases all storage. + */ +#include "postgres.h" + +#include + +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/visibilitymap.h" +#include "executor/execdebug.h" +#include "executor/nodeBitmapHeapscan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/spccache.h" + +static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node); +static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate); +static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, + TBMIterateResult *tbmres); +static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node); +static inline void BitmapPrefetch(BitmapHeapScanState *node, + TableScanDesc scan); +static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); + + +/* ---------------------------------------------------------------- + * BitmapHeapNext + * + * Retrieve next tuple from the BitmapHeapScan node's currentRelation + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +BitmapHeapNext(BitmapHeapScanState *node) +{ + ExprContext *econtext; + TableScanDesc scan; + TIDBitmap *tbm; + TBMIterator *tbmiterator = NULL; + TBMSharedIterator *shared_tbmiterator = NULL; + TBMIterateResult *tbmres; + TupleTableSlot *slot; + ParallelBitmapHeapState *pstate = node->pstate; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + /* + * extract necessary information from index scan node + */ + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + scan = node->ss.ss_currentScanDesc; + tbm = node->tbm; + if (pstate == NULL) + tbmiterator = node->tbmiterator; + else + shared_tbmiterator = node->shared_tbmiterator; + tbmres = node->tbmres; + + /* + * If we haven't yet performed the underlying index scan, do it, and begin + * the iteration over the bitmap. + * + * For prefetching, we use *two* iterators, one for the pages we are + * actually scanning and another that runs ahead of the first for + * prefetching. node->prefetch_pages tracks exactly how many pages ahead + * the prefetch iterator is. Also, node->prefetch_target tracks the + * desired prefetch distance, which starts small and increases up to the + * node->prefetch_maximum. This is to avoid doing a lot of prefetching in + * a scan that stops after a few tuples because of a LIMIT. 
+ */ + if (!node->initialized) + { + if (!pstate) + { + tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); + + if (!tbm || !IsA(tbm, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + node->tbm = tbm; + node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); + node->tbmres = tbmres = NULL; + +#ifdef USE_PREFETCH + if (node->prefetch_maximum > 0) + { + node->prefetch_iterator = tbm_begin_iterate(tbm); + node->prefetch_pages = 0; + node->prefetch_target = -1; + } +#endif /* USE_PREFETCH */ + } + else + { + /* + * The leader will immediately come out of the function, but + * others will be blocked until leader populates the TBM and wakes + * them up. + */ + if (BitmapShouldInitializeSharedState(pstate)) + { + tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); + if (!tbm || !IsA(tbm, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + node->tbm = tbm; + + /* + * Prepare to iterate over the TBM. This will return the + * dsa_pointer of the iterator state which will be used by + * multiple processes to iterate jointly. + */ + pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); +#ifdef USE_PREFETCH + if (node->prefetch_maximum > 0) + { + pstate->prefetch_iterator = + tbm_prepare_shared_iterate(tbm); + + /* + * We don't need the mutex here as we haven't yet woke up + * others. + */ + pstate->prefetch_pages = 0; + pstate->prefetch_target = -1; + } +#endif + + /* We have initialized the shared state so wake up others. */ + BitmapDoneInitializingSharedState(pstate); + } + + /* Allocate a private iterator and attach the shared state to it */ + node->shared_tbmiterator = shared_tbmiterator = + tbm_attach_shared_iterate(dsa, pstate->tbmiterator); + node->tbmres = tbmres = NULL; + +#ifdef USE_PREFETCH + if (node->prefetch_maximum > 0) + { + node->shared_prefetch_iterator = + tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator); + } +#endif /* USE_PREFETCH */ + } + node->initialized = true; + } + + for (;;) + { + bool skip_fetch; + + CHECK_FOR_INTERRUPTS(); + + /* + * Get next page of results if needed + */ + if (tbmres == NULL) + { + if (!pstate) + node->tbmres = tbmres = tbm_iterate(tbmiterator); + else + node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + if (tbmres == NULL) + { + /* no more entries in the bitmap */ + break; + } + + BitmapAdjustPrefetchIterator(node, tbmres); + + /* + * We can skip fetching the heap page if we don't need any fields + * from the heap, and the bitmap entries don't need rechecking, + * and all tuples on the page are visible to our transaction. + * + * XXX: It's a layering violation that we do these checks above + * tableam, they should probably moved below it at some point. + */ + skip_fetch = (node->can_skip_fetch && + !tbmres->recheck && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmres->blockno, + &node->vmbuffer)); + + if (skip_fetch) + { + /* can't be lossy in the skip_fetch case */ + Assert(tbmres->ntuples >= 0); + + /* + * The number of tuples on this page is put into + * node->return_empty_tuples. + */ + node->return_empty_tuples = tbmres->ntuples; + } + else if (!table_scan_bitmap_next_block(scan, tbmres)) + { + /* AM doesn't think this block is valid, skip */ + continue; + } + + if (tbmres->ntuples >= 0) + node->exact_pages++; + else + node->lossy_pages++; + + /* Adjust the prefetch target */ + BitmapAdjustPrefetchTarget(node); + } + else + { + /* + * Continuing in previously obtained page. 
+ */ + +#ifdef USE_PREFETCH + + /* + * Try to prefetch at least a few pages even before we get to the + * second page if we don't stop reading after the first tuple. + */ + if (!pstate) + { + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; + } + else if (pstate->prefetch_target < node->prefetch_maximum) + { + /* take spinlock while updating shared state */ + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_target < node->prefetch_maximum) + pstate->prefetch_target++; + SpinLockRelease(&pstate->mutex); + } +#endif /* USE_PREFETCH */ + } + + /* + * We issue prefetch requests *after* fetching the current page to try + * to avoid having prefetching interfere with the main I/O. Also, this + * should happen only when we have determined there is still something + * to do on the current page, else we may uselessly prefetch the same + * page we are just about to request for real. + * + * XXX: It's a layering violation that we do these checks above + * tableam, they should probably moved below it at some point. + */ + BitmapPrefetch(node, scan); + + if (node->return_empty_tuples > 0) + { + /* + * If we don't have to fetch the tuple, just return nulls. + */ + ExecStoreAllNullTuple(slot); + + if (--node->return_empty_tuples == 0) + { + /* no more tuples to return in the next round */ + node->tbmres = tbmres = NULL; + } + } + else + { + /* + * Attempt to fetch tuple from AM. + */ + if (!table_scan_bitmap_next_tuple(scan, tbmres, slot)) + { + /* nothing more to look at on this page */ + node->tbmres = tbmres = NULL; + continue; + } + + /* + * If we are using lossy info, we have to recheck the qual + * conditions at every tuple. + */ + if (tbmres->recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->bitmapqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + ExecClearTuple(slot); + continue; + } + } + } + + /* OK to return this tuple */ + return slot; + } + + /* + * if we get here it means we are at the end of the scan.. + */ + return ExecClearTuple(slot); +} + +/* + * BitmapDoneInitializingSharedState - Shared state is initialized + * + * By this time the leader has already populated the TBM and initialized the + * shared state so wake up other processes. 
+ */ +static inline void +BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate) +{ + SpinLockAcquire(&pstate->mutex); + pstate->state = BM_FINISHED; + SpinLockRelease(&pstate->mutex); + ConditionVariableBroadcast(&pstate->cv); +} + +/* + * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator + */ +static inline void +BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, + TBMIterateResult *tbmres) +{ +#ifdef USE_PREFETCH + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) + { + TBMIterator *prefetch_iterator = node->prefetch_iterator; + + if (node->prefetch_pages > 0) + { + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; + } + else if (prefetch_iterator) + { + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); + } + return; + } + + if (node->prefetch_maximum > 0) + { + TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; + + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_pages > 0) + { + pstate->prefetch_pages--; + SpinLockRelease(&pstate->mutex); + } + else + { + /* Release the mutex before iterating */ + SpinLockRelease(&pstate->mutex); + + /* + * In case of shared mode, we can not ensure that the current + * blockno of the main iterator and that of the prefetch iterator + * are same. It's possible that whatever blockno we are + * prefetching will be processed by another process. Therefore, + * we don't validate the blockno here as we do in non-parallel + * case. + */ + if (prefetch_iterator) + tbm_shared_iterate(prefetch_iterator); + } + } +#endif /* USE_PREFETCH */ +} + +/* + * BitmapAdjustPrefetchTarget - Adjust the prefetch target + * + * Increase prefetch target if it's not yet at the max. Note that + * we will increase it to zero after fetching the very first + * page/tuple, then to one after the second tuple is fetched, then + * it doubles as later pages are fetched. + */ +static inline void +BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) +{ +#ifdef USE_PREFETCH + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) + { + if (node->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; + return; + } + + /* Do an unlocked check first to save spinlock acquisitions. 
*/ + if (pstate->prefetch_target < node->prefetch_maximum) + { + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (pstate->prefetch_target >= node->prefetch_maximum / 2) + pstate->prefetch_target = node->prefetch_maximum; + else if (pstate->prefetch_target > 0) + pstate->prefetch_target *= 2; + else + pstate->prefetch_target++; + SpinLockRelease(&pstate->mutex); + } +#endif /* USE_PREFETCH */ +} + +/* + * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target + */ +static inline void +BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) +{ +#ifdef USE_PREFETCH + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) + { + TBMIterator *prefetch_iterator = node->prefetch_iterator; + + if (prefetch_iterator) + { + while (node->prefetch_pages < node->prefetch_target) + { + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + bool skip_fetch; + + if (tbmpre == NULL) + { + /* No more pages to prefetch */ + tbm_end_iterate(prefetch_iterator); + node->prefetch_iterator = NULL; + break; + } + node->prefetch_pages++; + + /* + * If we expect not to have to actually read this heap page, + * skip this prefetch call, but continue to run the prefetch + * logic normally. (Would it be better not to increment + * prefetch_pages?) + * + * This depends on the assumption that the index AM will + * report the same recheck flag for this future heap page as + * it did for the current heap page; which is not a certainty + * but is true in many cases. + */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? !node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); + + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + } + } + + return; + } + + if (pstate->prefetch_pages < pstate->prefetch_target) + { + TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; + + if (prefetch_iterator) + { + while (1) + { + TBMIterateResult *tbmpre; + bool do_prefetch = false; + bool skip_fetch; + + /* + * Recheck under the mutex. If some other process has already + * done enough prefetching then we need not to do anything. + */ + SpinLockAcquire(&pstate->mutex); + if (pstate->prefetch_pages < pstate->prefetch_target) + { + pstate->prefetch_pages++; + do_prefetch = true; + } + SpinLockRelease(&pstate->mutex); + + if (!do_prefetch) + return; + + tbmpre = tbm_shared_iterate(prefetch_iterator); + if (tbmpre == NULL) + { + /* No more pages to prefetch */ + tbm_end_shared_iterate(prefetch_iterator); + node->shared_prefetch_iterator = NULL; + break; + } + + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? !node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); + + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + } + } + } +#endif /* USE_PREFETCH */ +} + +/* + * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot) +{ + ExprContext *econtext; + + /* + * extract necessary information from index scan node + */ + econtext = node->ss.ps.ps_ExprContext; + + /* Does the tuple meet the original qual conditions? 
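The ramp-up performed by BitmapAdjustPrefetchTarget() above (increase to 0, then 1, then double, then jump straight to the maximum once past half of it) is easiest to see in isolation. A standalone sketch of just that arithmetic, with 64 standing in for prefetch_maximum; it is illustrative only and does not touch any executor state.

#include <stdio.h>

/* One step of the prefetch-target ramp used by BitmapAdjustPrefetchTarget. */
static int
adjust_prefetch_target(int target, int maximum)
{
    if (target >= maximum)
        return target;              /* don't increase any further */
    if (target >= maximum / 2)
        return maximum;             /* close enough: jump to the max */
    if (target > 0)
        return target * 2;          /* double while ramping up */
    return target + 1;              /* -1 -> 0 -> 1 to start gently */
}

int
main(void)
{
    int         target = -1;        /* initial value set in BitmapHeapNext */
    const int   maximum = 64;       /* illustrative prefetch_maximum */

    /* prints 0, 1, 2, 4, 8, 16, 32, 64, 64, ... */
    for (int page = 0; page < 12; page++)
    {
        target = adjust_prefetch_target(target, maximum);
        printf("after page %2d: target = %d\n", page, target);
    }
    return 0;
}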
*/ + econtext->ecxt_scantuple = slot; + return ExecQualAndReset(node->bitmapqualorig, econtext); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapScan(node) + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapHeapScan(PlanState *pstate) +{ + BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) BitmapHeapNext, + (ExecScanRecheckMtd) BitmapHeapRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanBitmapHeapScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanBitmapHeapScan(BitmapHeapScanState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* rescan to release any page pin */ + table_rescan(node->ss.ss_currentScanDesc, NULL); + + /* release bitmaps and buffers if any */ + if (node->tbmiterator) + tbm_end_iterate(node->tbmiterator); + if (node->prefetch_iterator) + tbm_end_iterate(node->prefetch_iterator); + if (node->shared_tbmiterator) + tbm_end_shared_iterate(node->shared_tbmiterator); + if (node->shared_prefetch_iterator) + tbm_end_shared_iterate(node->shared_prefetch_iterator); + if (node->tbm) + tbm_free(node->tbm); + if (node->vmbuffer != InvalidBuffer) + ReleaseBuffer(node->vmbuffer); + if (node->pvmbuffer != InvalidBuffer) + ReleaseBuffer(node->pvmbuffer); + node->tbm = NULL; + node->tbmiterator = NULL; + node->tbmres = NULL; + node->prefetch_iterator = NULL; + node->initialized = false; + node->shared_tbmiterator = NULL; + node->shared_prefetch_iterator = NULL; + node->vmbuffer = InvalidBuffer; + node->pvmbuffer = InvalidBuffer; + + ExecScanReScan(&node->ss); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapHeapScan + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapHeapScan(BitmapHeapScanState *node) +{ + TableScanDesc scanDesc; + + /* + * extract information from the node + */ + scanDesc = node->ss.ss_currentScanDesc; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close down subplans + */ + ExecEndNode(outerPlanState(node)); + + /* + * release bitmaps and buffers if any + */ + if (node->tbmiterator) + tbm_end_iterate(node->tbmiterator); + if (node->prefetch_iterator) + tbm_end_iterate(node->prefetch_iterator); + if (node->tbm) + tbm_free(node->tbm); + if (node->shared_tbmiterator) + tbm_end_shared_iterate(node->shared_tbmiterator); + if (node->shared_prefetch_iterator) + tbm_end_shared_iterate(node->shared_prefetch_iterator); + if (node->vmbuffer != InvalidBuffer) + ReleaseBuffer(node->vmbuffer); + if (node->pvmbuffer != InvalidBuffer) + ReleaseBuffer(node->pvmbuffer); + + /* + * close heap scan + */ + table_endscan(scanDesc); +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapHeapScan + * + * Initializes the scan's state information. 
+ * ---------------------------------------------------------------- + */ +BitmapHeapScanState * +ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) +{ + BitmapHeapScanState *scanstate; + Relation currentRelation; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * Assert caller didn't ask for an unsafe snapshot --- see comments at + * head of file. + */ + Assert(IsMVCCSnapshot(estate->es_snapshot)); + + /* + * create state structure + */ + scanstate = makeNode(BitmapHeapScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan; + + scanstate->tbm = NULL; + scanstate->tbmiterator = NULL; + scanstate->tbmres = NULL; + scanstate->return_empty_tuples = 0; + scanstate->vmbuffer = InvalidBuffer; + scanstate->pvmbuffer = InvalidBuffer; + scanstate->exact_pages = 0; + scanstate->lossy_pages = 0; + scanstate->prefetch_iterator = NULL; + scanstate->prefetch_pages = 0; + scanstate->prefetch_target = 0; + scanstate->pscan_len = 0; + scanstate->initialized = false; + scanstate->shared_tbmiterator = NULL; + scanstate->shared_prefetch_iterator = NULL; + scanstate->pstate = NULL; + + /* + * We can potentially skip fetching heap pages if we do not need any + * columns of the table, either for checking non-indexable quals or for + * returning data. This test is a bit simplistic, as it checks the + * stronger condition that there's no qual or return tlist at all. But in + * most cases it's probably not worth working harder than that. + */ + scanstate->can_skip_fetch = (node->scan.plan.qual == NIL && + node->scan.plan.targetlist == NIL); + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + /* + * initialize child nodes + */ + outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + scanstate->bitmapqualorig = + ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate); + + /* + * Maximum number of prefetches for the tablespace if configured, + * otherwise the current value of the effective_io_concurrency GUC. + */ + scanstate->prefetch_maximum = + get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + + scanstate->ss.ss_currentRelation = currentRelation; + + scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation, + estate->es_snapshot, + 0, + NULL); + + /* + * all done. + */ + return scanstate; +} + +/*---------------- + * BitmapShouldInitializeSharedState + * + * The first process to come here and see the state to the BM_INITIAL + * will become the leader for the parallel bitmap scan and will be + * responsible for populating the TIDBitmap. The other processes will + * be blocked by the condition variable until the leader wakes them up. 
+ * --------------- + */ +static bool +BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate) +{ + SharedBitmapState state; + + while (1) + { + SpinLockAcquire(&pstate->mutex); + state = pstate->state; + if (pstate->state == BM_INITIAL) + pstate->state = BM_INPROGRESS; + SpinLockRelease(&pstate->mutex); + + /* Exit if bitmap is done, or if we're the leader. */ + if (state != BM_INPROGRESS) + break; + + /* Wait for the leader to wake us up. */ + ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN); + } + + ConditionVariableCancelSleep(); + + return (state == BM_INITIAL); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapEstimate(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->pscan_len = add_size(offsetof(ParallelBitmapHeapState, + phs_snapshot_data), + EstimateSnapshotSpace(estate->es_snapshot)); + + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeDSM + * + * Set up a parallel bitmap heap scan descriptor. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + ParallelBitmapHeapState *pstate; + EState *estate = node->ss.ps.state; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + /* If there's no DSA, there are no workers; initialize nothing. */ + if (dsa == NULL) + return; + + pstate = shm_toc_allocate(pcxt->toc, node->pscan_len); + + pstate->tbmiterator = 0; + pstate->prefetch_iterator = 0; + + /* Initialize the mutex */ + SpinLockInit(&pstate->mutex); + pstate->prefetch_pages = 0; + pstate->prefetch_target = 0; + pstate->state = BM_INITIAL; + + ConditionVariableInit(&pstate->cv); + SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data); + + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate); + node->pstate = pstate; +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + ParallelBitmapHeapState *pstate = node->pstate; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + /* If there's no DSA, there are no workers; do nothing. */ + if (dsa == NULL) + return; + + pstate->state = BM_INITIAL; + + if (DsaPointerIsValid(pstate->tbmiterator)) + tbm_free_shared_area(dsa, pstate->tbmiterator); + + if (DsaPointerIsValid(pstate->prefetch_iterator)) + tbm_free_shared_area(dsa, pstate->prefetch_iterator); + + pstate->tbmiterator = InvalidDsaPointer; + pstate->prefetch_iterator = InvalidDsaPointer; +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeWorker + * + * Copy relevant information from TOC into planstate. 
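BitmapShouldInitializeSharedState() above is a small leader-election state machine: the first process to observe BM_INITIAL claims the build by flipping the state to BM_INPROGRESS, and everyone else waits until the leader marks it BM_FINISHED and broadcasts. A standalone sketch of the same handshake using POSIX threads in place of PostgreSQL's spinlocks and condition variables (compile with -pthread); all names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef enum { BM_INITIAL, BM_INPROGRESS, BM_FINISHED } bm_state;

/* Shared state, standing in for ParallelBitmapHeapState's mutex/state/cv. */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bm_state state = BM_INITIAL;

/* First caller to see BM_INITIAL becomes the leader; everyone else waits
 * until the leader flips the state to BM_FINISHED and broadcasts. */
static bool
should_initialize(void)
{
    bool leader;

    pthread_mutex_lock(&mutex);
    for (;;)
    {
        if (state == BM_INITIAL)
        {
            state = BM_INPROGRESS;
            leader = true;
            break;
        }
        if (state == BM_FINISHED)
        {
            leader = false;
            break;
        }
        pthread_cond_wait(&cv, &mutex);   /* leader is still building */
    }
    pthread_mutex_unlock(&mutex);
    return leader;
}

static void *
worker(void *arg)
{
    long id = (long) arg;

    if (should_initialize())
    {
        printf("worker %ld: building the shared bitmap\n", id);
        pthread_mutex_lock(&mutex);
        state = BM_FINISHED;
        pthread_mutex_unlock(&mutex);
        pthread_cond_broadcast(&cv);
    }
    else
        printf("worker %ld: attaching to the finished bitmap\n", id);
    return NULL;
}

int
main(void)
{
    pthread_t tids[4];

    for (long i = 0; i < 4; i++)
        pthread_create(&tids[i], NULL, worker, (void *) i);
    for (int i = 0; i < 4; i++)
        pthread_join(tids[i], NULL);
    return 0;
}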
+ * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelBitmapHeapState *pstate; + Snapshot snapshot; + + Assert(node->ss.ps.state->es_query_dsa != NULL); + + pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->pstate = pstate; + + snapshot = RestoreSnapshot(pstate->phs_snapshot_data); + table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot); +} diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c new file mode 100644 index 0000000..48c2036 --- /dev/null +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -0,0 +1,330 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapIndexscan.c + * Routines to support bitmapped index scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapIndexscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * MultiExecBitmapIndexScan scans a relation using index. + * ExecInitBitmapIndexScan creates and initializes state info. + * ExecReScanBitmapIndexScan prepares to rescan the plan. + * ExecEndBitmapIndexScan releases all storage. + */ +#include "postgres.h" + +#include "access/genam.h" +#include "executor/execdebug.h" +#include "executor/nodeBitmapIndexscan.h" +#include "executor/nodeIndexscan.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecBitmapIndexScan + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapIndexScan(PlanState *pstate) +{ + elog(ERROR, "BitmapIndexScan node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * MultiExecBitmapIndexScan(node) + * ---------------------------------------------------------------- + */ +Node * +MultiExecBitmapIndexScan(BitmapIndexScanState *node) +{ + TIDBitmap *tbm; + IndexScanDesc scandesc; + double nTuples = 0; + bool doscan; + + /* must provide our own instrumentation support */ + if (node->ss.ps.instrument) + InstrStartNode(node->ss.ps.instrument); + + /* + * extract necessary information from index scan node + */ + scandesc = node->biss_ScanDesc; + + /* + * If we have runtime keys and they've not already been set up, do it now. + * Array keys are also treated as runtime keys; note that if ExecReScan + * returns with biss_RuntimeKeysReady still false, then there is an empty + * array key so we should do nothing. + */ + if (!node->biss_RuntimeKeysReady && + (node->biss_NumRuntimeKeys != 0 || node->biss_NumArrayKeys != 0)) + { + ExecReScan((PlanState *) node); + doscan = node->biss_RuntimeKeysReady; + } + else + doscan = true; + + /* + * Prepare the result bitmap. Normally we just create a new one to pass + * back; however, our parent node is allowed to store a pre-made one into + * node->biss_result, in which case we just OR our tuple IDs into the + * existing bitmap. (This saves needing explicit UNION steps.) 
+ */ + if (node->biss_result) + { + tbm = node->biss_result; + node->biss_result = NULL; /* reset for next time */ + } + else + { + /* XXX should we use less than work_mem for this? */ + tbm = tbm_create(work_mem * 1024L, + ((BitmapIndexScan *) node->ss.ps.plan)->isshared ? + node->ss.ps.state->es_query_dsa : NULL); + } + + /* + * Get TIDs from index and insert into bitmap + */ + while (doscan) + { + nTuples += (double) index_getbitmap(scandesc, tbm); + + CHECK_FOR_INTERRUPTS(); + + doscan = ExecIndexAdvanceArrayKeys(node->biss_ArrayKeys, + node->biss_NumArrayKeys); + if (doscan) /* reset index scan */ + index_rescan(node->biss_ScanDesc, + node->biss_ScanKeys, node->biss_NumScanKeys, + NULL, 0); + } + + /* must provide our own instrumentation support */ + if (node->ss.ps.instrument) + InstrStopNode(node->ss.ps.instrument, nTuples); + + return (Node *) tbm; +} + +/* ---------------------------------------------------------------- + * ExecReScanBitmapIndexScan(node) + * + * Recalculates the values of any scan keys whose value depends on + * information known at runtime, then rescans the indexed relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanBitmapIndexScan(BitmapIndexScanState *node) +{ + ExprContext *econtext = node->biss_RuntimeContext; + + /* + * Reset the runtime-key context so we don't leak memory as each outer + * tuple is scanned. Note this assumes that we will recalculate *all* + * runtime keys on each call. + */ + if (econtext) + ResetExprContext(econtext); + + /* + * If we are doing runtime key calculations (ie, any of the index key + * values weren't simple Consts), compute the new key values. + * + * Array keys are also treated as runtime keys; note that if we return + * with biss_RuntimeKeysReady still false, then there is an empty array + * key so no index scan is needed. + */ + if (node->biss_NumRuntimeKeys != 0) + ExecIndexEvalRuntimeKeys(econtext, + node->biss_RuntimeKeys, + node->biss_NumRuntimeKeys); + if (node->biss_NumArrayKeys != 0) + node->biss_RuntimeKeysReady = + ExecIndexEvalArrayKeys(econtext, + node->biss_ArrayKeys, + node->biss_NumArrayKeys); + else + node->biss_RuntimeKeysReady = true; + + /* reset index scan */ + if (node->biss_RuntimeKeysReady) + index_rescan(node->biss_ScanDesc, + node->biss_ScanKeys, node->biss_NumScanKeys, + NULL, 0); +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapIndexScan + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapIndexScan(BitmapIndexScanState *node) +{ + Relation indexRelationDesc; + IndexScanDesc indexScanDesc; + + /* + * extract information from the node + */ + indexRelationDesc = node->biss_RelationDesc; + indexScanDesc = node->biss_ScanDesc; + + /* + * Free the exprcontext ... now dead code, see ExecFreeExprContext + */ +#ifdef NOT_USED + if (node->biss_RuntimeContext) + FreeExprContext(node->biss_RuntimeContext, true); +#endif + + /* + * close the index relation (no-op if we didn't open it) + */ + if (indexScanDesc) + index_endscan(indexScanDesc); + if (indexRelationDesc) + index_close(indexRelationDesc, NoLock); +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapIndexScan + * + * Initializes the index scan's state information. 
+ * ---------------------------------------------------------------- + */ +BitmapIndexScanState * +ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) +{ + BitmapIndexScanState *indexstate; + LOCKMODE lockmode; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + indexstate = makeNode(BitmapIndexScanState); + indexstate->ss.ps.plan = (Plan *) node; + indexstate->ss.ps.state = estate; + indexstate->ss.ps.ExecProcNode = ExecBitmapIndexScan; + + /* normally we don't make the result bitmap till runtime */ + indexstate->biss_result = NULL; + + /* + * We do not open or lock the base relation here. We assume that an + * ancestor BitmapHeapScan node is holding AccessShareLock (or better) on + * the heap relation throughout the execution of the plan tree. + */ + + indexstate->ss.ss_currentRelation = NULL; + indexstate->ss.ss_currentScanDesc = NULL; + + /* + * Miscellaneous initialization + * + * We do not need a standard exprcontext for this node, though we may + * decide below to create a runtime-key exprcontext + */ + + /* + * initialize child expressions + * + * We don't need to initialize targetlist or qual since neither are used. + * + * Note: we don't initialize all of the indexqual expression, only the + * sub-parts corresponding to runtime keys (see below). + */ + + /* + * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop + * here. This allows an index-advisor plugin to EXPLAIN a plan containing + * references to nonexistent indexes. + */ + if (eflags & EXEC_FLAG_EXPLAIN_ONLY) + return indexstate; + + /* Open the index relation. */ + lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; + indexstate->biss_RelationDesc = index_open(node->indexid, lockmode); + + /* + * Initialize index-specific scan state + */ + indexstate->biss_RuntimeKeysReady = false; + indexstate->biss_RuntimeKeys = NULL; + indexstate->biss_NumRuntimeKeys = 0; + + /* + * build the index scan keys from the index qualification + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->biss_RelationDesc, + node->indexqual, + false, + &indexstate->biss_ScanKeys, + &indexstate->biss_NumScanKeys, + &indexstate->biss_RuntimeKeys, + &indexstate->biss_NumRuntimeKeys, + &indexstate->biss_ArrayKeys, + &indexstate->biss_NumArrayKeys); + + /* + * If we have runtime keys or array keys, we need an ExprContext to + * evaluate them. We could just create a "standard" plan node exprcontext, + * but to keep the code looking similar to nodeIndexscan.c, it seems + * better to stick with the approach of using a separate ExprContext. + */ + if (indexstate->biss_NumRuntimeKeys != 0 || + indexstate->biss_NumArrayKeys != 0) + { + ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext; + + ExecAssignExprContext(estate, &indexstate->ss.ps); + indexstate->biss_RuntimeContext = indexstate->ss.ps.ps_ExprContext; + indexstate->ss.ps.ps_ExprContext = stdecontext; + } + else + { + indexstate->biss_RuntimeContext = NULL; + } + + /* + * Initialize scan descriptor. + */ + indexstate->biss_ScanDesc = + index_beginscan_bitmap(indexstate->biss_RelationDesc, + estate->es_snapshot, + indexstate->biss_NumScanKeys); + + /* + * If no run-time keys to calculate, go ahead and pass the scankeys to the + * index AM. 
+ */ + if (indexstate->biss_NumRuntimeKeys == 0 && + indexstate->biss_NumArrayKeys == 0) + index_rescan(indexstate->biss_ScanDesc, + indexstate->biss_ScanKeys, indexstate->biss_NumScanKeys, + NULL, 0); + + /* + * all done. + */ + return indexstate; +} diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c new file mode 100644 index 0000000..4a8c01d --- /dev/null +++ b/src/backend/executor/nodeBitmapOr.c @@ -0,0 +1,241 @@ +/*------------------------------------------------------------------------- + * + * nodeBitmapOr.c + * routines to handle BitmapOr nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeBitmapOr.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitBitmapOr - initialize the BitmapOr node + * MultiExecBitmapOr - retrieve the result bitmap from the node + * ExecEndBitmapOr - shut down the BitmapOr node + * ExecReScanBitmapOr - rescan the BitmapOr node + * + * NOTES + * BitmapOr nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans, + * much like Append nodes. The logic is much simpler than + * Append, however, since we needn't cope with forward/backward + * execution. + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeBitmapOr.h" +#include "miscadmin.h" + + +/* ---------------------------------------------------------------- + * ExecBitmapOr + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecBitmapOr(PlanState *pstate) +{ + elog(ERROR, "BitmapOr node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitBitmapOr + * + * Begin all of the subscans of the BitmapOr node. + * ---------------------------------------------------------------- + */ +BitmapOrState * +ExecInitBitmapOr(BitmapOr *node, EState *estate, int eflags) +{ + BitmapOrState *bitmaporstate = makeNode(BitmapOrState); + PlanState **bitmapplanstates; + int nplans; + int i; + ListCell *l; + Plan *initNode; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * Set up empty vector of subplan states + */ + nplans = list_length(node->bitmapplans); + + bitmapplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *)); + + /* + * create new BitmapOrState for our BitmapOr node + */ + bitmaporstate->ps.plan = (Plan *) node; + bitmaporstate->ps.state = estate; + bitmaporstate->ps.ExecProcNode = ExecBitmapOr; + bitmaporstate->bitmapplans = bitmapplanstates; + bitmaporstate->nplans = nplans; + + /* + * call ExecInitNode on each of the plans to be executed and save the + * results into the array "bitmapplanstates". + */ + i = 0; + foreach(l, node->bitmapplans) + { + initNode = (Plan *) lfirst(l); + bitmapplanstates[i] = ExecInitNode(initNode, estate, eflags); + i++; + } + + /* + * Miscellaneous initialization + * + * BitmapOr plans don't have expression contexts because they never call + * ExecQual or ExecProject. They don't need any tuple slots either. 
+ */ + + return bitmaporstate; +} + +/* ---------------------------------------------------------------- + * MultiExecBitmapOr + * ---------------------------------------------------------------- + */ +Node * +MultiExecBitmapOr(BitmapOrState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + TIDBitmap *result = NULL; + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStartNode(node->ps.instrument); + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * Scan all the subplans and OR their result bitmaps + */ + for (i = 0; i < nplans; i++) + { + PlanState *subnode = bitmapplans[i]; + TIDBitmap *subresult; + + /* + * We can special-case BitmapIndexScan children to avoid an explicit + * tbm_union step for each child: just pass down the current result + * bitmap and let the child OR directly into it. + */ + if (IsA(subnode, BitmapIndexScanState)) + { + if (result == NULL) /* first subplan */ + { + /* XXX should we use less than work_mem for this? */ + result = tbm_create(work_mem * 1024L, + ((BitmapOr *) node->ps.plan)->isshared ? + node->ps.state->es_query_dsa : NULL); + } + + ((BitmapIndexScanState *) subnode)->biss_result = result; + + subresult = (TIDBitmap *) MultiExecProcNode(subnode); + + if (subresult != result) + elog(ERROR, "unrecognized result from subplan"); + } + else + { + /* standard implementation */ + subresult = (TIDBitmap *) MultiExecProcNode(subnode); + + if (!subresult || !IsA(subresult, TIDBitmap)) + elog(ERROR, "unrecognized result from subplan"); + + if (result == NULL) + result = subresult; /* first subplan */ + else + { + tbm_union(result, subresult); + tbm_free(subresult); + } + } + } + + /* We could return an empty result set here? */ + if (result == NULL) + elog(ERROR, "BitmapOr doesn't support zero inputs"); + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStopNode(node->ps.instrument, 0 /* XXX */ ); + + return (Node *) result; +} + +/* ---------------------------------------------------------------- + * ExecEndBitmapOr + * + * Shuts down the subscans of the BitmapOr node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndBitmapOr(BitmapOrState *node) +{ + PlanState **bitmapplans; + int nplans; + int i; + + /* + * get information from the node + */ + bitmapplans = node->bitmapplans; + nplans = node->nplans; + + /* + * shut down each of the subscans (that we've initialized) + */ + for (i = 0; i < nplans; i++) + { + if (bitmapplans[i]) + ExecEndNode(bitmapplans[i]); + } +} + +void +ExecReScanBitmapOr(BitmapOrState *node) +{ + int i; + + for (i = 0; i < node->nplans; i++) + { + PlanState *subnode = node->bitmapplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } +} diff --git a/src/backend/executor/nodeCtescan.c b/src/backend/executor/nodeCtescan.c new file mode 100644 index 0000000..9c2b08d --- /dev/null +++ b/src/backend/executor/nodeCtescan.c @@ -0,0 +1,351 @@ +/*------------------------------------------------------------------------- + * + * nodeCtescan.c + * routines to handle CteScan nodes. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeCtescan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeCtescan.h" +#include "miscadmin.h" + +static TupleTableSlot *CteScanNext(CteScanState *node); + +/* ---------------------------------------------------------------- + * CteScanNext + * + * This is a workhorse for ExecCteScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +CteScanNext(CteScanState *node) +{ + EState *estate; + ScanDirection dir; + bool forward; + Tuplestorestate *tuplestorestate; + bool eof_tuplestore; + TupleTableSlot *slot; + + /* + * get state info from node + */ + estate = node->ss.ps.state; + dir = estate->es_direction; + forward = ScanDirectionIsForward(dir); + tuplestorestate = node->leader->cte_table; + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + slot = node->ss.ss_ScanTupleSlot; + + /* + * If we are not at the end of the tuplestore, or are going backwards, try + * to fetch a tuple from tuplestore. + */ + eof_tuplestore = tuplestore_ateof(tuplestorestate); + + if (!forward && eof_tuplestore) + { + if (!node->leader->eof_cte) + { + /* + * When reversing direction at tuplestore EOF, the first + * gettupleslot call will fetch the last-added tuple; but we want + * to return the one before that, if possible. So do an extra + * fetch. + */ + if (!tuplestore_advance(tuplestorestate, forward)) + return NULL; /* the tuplestore must be empty */ + } + eof_tuplestore = false; + } + + /* + * If we can fetch another tuple from the tuplestore, return it. + * + * Note: we have to use copy=true in the tuplestore_gettupleslot call, + * because we are sharing the tuplestore with other nodes that might write + * into the tuplestore before we get called again. + */ + if (!eof_tuplestore) + { + if (tuplestore_gettupleslot(tuplestorestate, forward, true, slot)) + return slot; + if (forward) + eof_tuplestore = true; + } + + /* + * If necessary, try to fetch another row from the CTE query. + * + * Note: the eof_cte state variable exists to short-circuit further calls + * of the CTE plan. It's not optional, unfortunately, because some plan + * node types are not robust about being called again when they've already + * returned NULL. + */ + if (eof_tuplestore && !node->leader->eof_cte) + { + TupleTableSlot *cteslot; + + /* + * We can only get here with forward==true, so no need to worry about + * which direction the subplan will go. + */ + cteslot = ExecProcNode(node->cteplanstate); + if (TupIsNull(cteslot)) + { + node->leader->eof_cte = true; + return NULL; + } + + /* + * There are corner cases where the subplan could change which + * tuplestore read pointer is active, so be sure to reselect ours + * before storing the tuple we got. + */ + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + + /* + * Append a copy of the returned tuple to tuplestore. NOTE: because + * our read pointer is certainly in EOF state, its read position will + * move forward over the added tuple. This is what we want. Also, + * any other readers will *not* move past the new tuple, which is what + * they want. + */ + tuplestore_puttupleslot(tuplestorestate, cteslot); + + /* + * We MUST copy the CTE query's output tuple into our own slot. 
This + * is because other CteScan nodes might advance the CTE query before + * we are called again, and our output tuple must stay stable over + * that. + */ + return ExecCopySlot(slot, cteslot); + } + + /* + * Nothing left ... + */ + return ExecClearTuple(slot); +} + +/* + * CteScanRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +CteScanRecheck(CteScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecCteScan(node) + * + * Scans the CTE sequentially and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecCteScan(PlanState *pstate) +{ + CteScanState *node = castNode(CteScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) CteScanNext, + (ExecScanRecheckMtd) CteScanRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitCteScan + * ---------------------------------------------------------------- + */ +CteScanState * +ExecInitCteScan(CteScan *node, EState *estate, int eflags) +{ + CteScanState *scanstate; + ParamExecData *prmdata; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * For the moment we have to force the tuplestore to allow REWIND, because + * we might be asked to rescan the CTE even though upper levels didn't + * tell us to be prepared to do it efficiently. Annoying, since this + * prevents truncation of the tuplestore. XXX FIXME + * + * Note: if we are in an EPQ recheck plan tree, it's likely that no access + * to the tuplestore is needed at all, making this even more annoying. + * It's not worth improving that as long as all the read pointers would + * have REWIND anyway, but if we ever improve this logic then that aspect + * should be considered too. + */ + eflags |= EXEC_FLAG_REWIND; + + /* + * CteScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new CteScanState for node + */ + scanstate = makeNode(CteScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecCteScan; + scanstate->eflags = eflags; + scanstate->cte_table = NULL; + scanstate->eof_cte = false; + + /* + * Find the already-initialized plan for the CTE query. + */ + scanstate->cteplanstate = (PlanState *) list_nth(estate->es_subplanstates, + node->ctePlanId - 1); + + /* + * The Param slot associated with the CTE query is used to hold a pointer + * to the CteState of the first CteScan node that initializes for this + * CTE. This node will be the one that holds the shared state for all the + * CTEs, particularly the shared tuplestore. 
+ */ + prmdata = &(estate->es_param_exec_vals[node->cteParam]); + Assert(prmdata->execPlan == NULL); + Assert(!prmdata->isnull); + scanstate->leader = castNode(CteScanState, DatumGetPointer(prmdata->value)); + if (scanstate->leader == NULL) + { + /* I am the leader */ + prmdata->value = PointerGetDatum(scanstate); + scanstate->leader = scanstate; + scanstate->cte_table = tuplestore_begin_heap(true, false, work_mem); + tuplestore_set_eflags(scanstate->cte_table, scanstate->eflags); + scanstate->readptr = 0; + } + else + { + /* Not the leader */ + /* Create my own read pointer, and ensure it is at start */ + scanstate->readptr = + tuplestore_alloc_read_pointer(scanstate->leader->cte_table, + scanstate->eflags); + tuplestore_select_read_pointer(scanstate->leader->cte_table, + scanstate->readptr); + tuplestore_rescan(scanstate->leader->cte_table); + } + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * The scan tuple type (ie, the rowtype we expect to find in the work + * table) is the same as the result rowtype of the CTE query. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + ExecGetResultType(scanstate->cteplanstate), + &TTSOpsMinimalTuple); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndCteScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndCteScan(CteScanState *node) +{ + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * If I am the leader, free the tuplestore. + */ + if (node->leader == node) + { + tuplestore_end(node->cte_table); + node->cte_table = NULL; + } +} + +/* ---------------------------------------------------------------- + * ExecReScanCteScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanCteScan(CteScanState *node) +{ + Tuplestorestate *tuplestorestate = node->leader->cte_table; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + /* + * Clear the tuplestore if a new scan of the underlying CTE is required. + * This implicitly resets all the tuplestore's read pointers. Note that + * multiple CTE nodes might redundantly clear the tuplestore; that's OK, + * and not unduly expensive. We'll stop taking this path as soon as + * somebody has attempted to read something from the underlying CTE + * (thereby causing its chgParam to be cleared). + */ + if (node->leader->cteplanstate->chgParam != NULL) + { + tuplestore_clear(tuplestorestate); + node->leader->eof_cte = false; + } + else + { + /* + * Else, just rewind my own pointer. Either the underlying CTE + * doesn't need a rescan (and we can re-read what's in the tuplestore + * now), or somebody else already took care of it. 
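The leader/follower arrangement set up in ExecInitCteScan() and consumed by CteScanNext() boils down to one shared store of already-produced rows, a per-scan read position, and on-demand materialization of one more row whenever a reader runs past what has been stored. A standalone sketch of that pattern, with a plain array standing in for the shared tuplestore and a constant array standing in for the CTE query; all names and values are illustrative only.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ROWS 16

/* Shared state held by the "leader", standing in for the shared tuplestore
 * plus the eof_cte flag. */
typedef struct SharedCte
{
    int     rows[MAX_ROWS];   /* materialized CTE output so far */
    int     nrows;
    int     next_src;         /* how far the underlying "CTE query" has run */
    bool    eof_cte;
} SharedCte;

/* Each scan keeps only its own read position, like a tuplestore read pointer. */
typedef struct CteReader
{
    SharedCte  *leader;
    int         readpos;
} CteReader;

static const int cte_source[] = {10, 20, 30, 40};   /* the "CTE query" */

/* Return the next row for this reader, materializing more of the CTE on
 * demand; -1 means no more rows (cf. CteScanNext). */
static int
cte_next(CteReader *r)
{
    SharedCte  *s = r->leader;

    if (r->readpos < s->nrows)
        return s->rows[r->readpos++];        /* already materialized */

    if (s->eof_cte)
        return -1;                           /* nothing left anywhere */

    if (s->next_src >= (int) (sizeof(cte_source) / sizeof(cte_source[0])))
    {
        s->eof_cte = true;                   /* underlying query exhausted */
        return -1;
    }

    /* Run the underlying query one step and append to the shared store. */
    s->rows[s->nrows++] = cte_source[s->next_src++];
    return s->rows[r->readpos++];
}

int
main(void)
{
    SharedCte   shared = {{0}, 0, 0, false};
    CteReader   a = {&shared, 0};
    CteReader   b = {&shared, 0};

    printf("a: %d\n", cte_next(&a));   /* 10 (materializes row 1) */
    printf("a: %d\n", cte_next(&a));   /* 20 (materializes row 2) */
    printf("b: %d\n", cte_next(&b));   /* 10 (re-reads, no new work) */
    printf("a: %d\n", cte_next(&a));   /* 30 */
    printf("a: %d\n", cte_next(&a));   /* 40 */
    printf("a: %d\n", cte_next(&a));   /* -1 (CTE exhausted) */
    return 0;
}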
+ */ + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + tuplestore_rescan(tuplestorestate); + } +} diff --git a/src/backend/executor/nodeCustom.c b/src/backend/executor/nodeCustom.c new file mode 100644 index 0000000..c82060e --- /dev/null +++ b/src/backend/executor/nodeCustom.c @@ -0,0 +1,228 @@ +/* ------------------------------------------------------------------------ + * + * nodeCustom.c + * Routines to handle execution of custom scan node + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * ------------------------------------------------------------------------ + */ +#include "postgres.h" + +#include "access/parallel.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/extensible.h" +#include "nodes/plannodes.h" +#include "parser/parsetree.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static TupleTableSlot *ExecCustomScan(PlanState *pstate); + + +CustomScanState * +ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags) +{ + CustomScanState *css; + Relation scan_rel = NULL; + Index scanrelid = cscan->scan.scanrelid; + Index tlistvarno; + + /* + * Allocate the CustomScanState object. We let the custom scan provider + * do the palloc, in case it wants to make a larger object that embeds + * CustomScanState as the first field. It must set the node tag and the + * methods field correctly at this time. Other standard fields should be + * set to zero. + */ + css = castNode(CustomScanState, + cscan->methods->CreateCustomScanState(cscan)); + + /* ensure flags is filled correctly */ + css->flags = cscan->flags; + + /* fill up fields of ScanState */ + css->ss.ps.plan = &cscan->scan.plan; + css->ss.ps.state = estate; + css->ss.ps.ExecProcNode = ExecCustomScan; + + /* create expression context for node */ + ExecAssignExprContext(estate, &css->ss.ps); + + /* + * open the scan relation, if any + */ + if (scanrelid > 0) + { + scan_rel = ExecOpenScanRelation(estate, scanrelid, eflags); + css->ss.ss_currentRelation = scan_rel; + } + + /* + * Determine the scan tuple type. If the custom scan provider provided a + * targetlist describing the scan tuples, use that; else use base + * relation's rowtype. + */ + if (cscan->custom_scan_tlist != NIL || scan_rel == NULL) + { + TupleDesc scan_tupdesc; + + scan_tupdesc = ExecTypeFromTL(cscan->custom_scan_tlist); + ExecInitScanTupleSlot(estate, &css->ss, scan_tupdesc, &TTSOpsVirtual); + /* Node's targetlist will contain Vars with varno = INDEX_VAR */ + tlistvarno = INDEX_VAR; + } + else + { + ExecInitScanTupleSlot(estate, &css->ss, RelationGetDescr(scan_rel), + &TTSOpsVirtual); + /* Node's targetlist will contain Vars with varno = scanrelid */ + tlistvarno = scanrelid; + } + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&css->ss.ps, &TTSOpsVirtual); + ExecAssignScanProjectionInfoWithVarno(&css->ss, tlistvarno); + + /* initialize child expressions */ + css->ss.ps.qual = + ExecInitQual(cscan->scan.plan.qual, (PlanState *) css); + + /* + * The callback of custom-scan provider applies the final initialization + * of the custom-scan-state node according to its logic. 
+ */ + css->methods->BeginCustomScan(css, estate, eflags); + + return css; +} + +static TupleTableSlot * +ExecCustomScan(PlanState *pstate) +{ + CustomScanState *node = castNode(CustomScanState, pstate); + + CHECK_FOR_INTERRUPTS(); + + Assert(node->methods->ExecCustomScan != NULL); + return node->methods->ExecCustomScan(node); +} + +void +ExecEndCustomScan(CustomScanState *node) +{ + Assert(node->methods->EndCustomScan != NULL); + node->methods->EndCustomScan(node); + + /* Free the exprcontext */ + ExecFreeExprContext(&node->ss.ps); + + /* Clean out the tuple table */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +void +ExecReScanCustomScan(CustomScanState *node) +{ + Assert(node->methods->ReScanCustomScan != NULL); + node->methods->ReScanCustomScan(node); +} + +void +ExecCustomMarkPos(CustomScanState *node) +{ + if (!node->methods->MarkPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->MarkPosCustomScan(node); +} + +void +ExecCustomRestrPos(CustomScanState *node) +{ + if (!node->methods->RestrPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->RestrPosCustomScan(node); +} + +void +ExecCustomScanEstimate(CustomScanState *node, ParallelContext *pcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->EstimateDSMCustomScan) + { + node->pscan_len = methods->EstimateDSMCustomScan(node, pcxt); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } +} + +void +ExecCustomScanInitializeDSM(CustomScanState *node, ParallelContext *pcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->InitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); + methods->InitializeDSMCustomScan(node, pcxt, coordinate); + shm_toc_insert(pcxt->toc, plan_node_id, coordinate); + } +} + +void +ExecCustomScanReInitializeDSM(CustomScanState *node, ParallelContext *pcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->ReInitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + methods->ReInitializeDSMCustomScan(node, pcxt, coordinate); + } +} + +void +ExecCustomScanInitializeWorker(CustomScanState *node, + ParallelWorkerContext *pwcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->InitializeWorkerCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + methods->InitializeWorkerCustomScan(node, pwcxt->toc, coordinate); + } +} + +void +ExecShutdownCustomScan(CustomScanState *node) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->ShutdownCustomScan) + methods->ShutdownCustomScan(node); +} diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c new file mode 100644 index 0000000..d27849a --- /dev/null +++ b/src/backend/executor/nodeForeignscan.c @@ -0,0 +1,504 @@ +/*------------------------------------------------------------------------- + * + * nodeForeignscan.c + * Routines to support scans of foreign 
tables + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeForeignscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * + * ExecForeignScan scans a foreign table. + * ExecInitForeignScan creates and initializes state info. + * ExecReScanForeignScan rescans the foreign relation. + * ExecEndForeignScan releases any resources allocated. + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeForeignscan.h" +#include "foreign/fdwapi.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +static TupleTableSlot *ForeignNext(ForeignScanState *node); +static bool ForeignRecheck(ForeignScanState *node, TupleTableSlot *slot); + + +/* ---------------------------------------------------------------- + * ForeignNext + * + * This is a workhorse for ExecForeignScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ForeignNext(ForeignScanState *node) +{ + TupleTableSlot *slot; + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + ExprContext *econtext = node->ss.ps.ps_ExprContext; + MemoryContext oldcontext; + + /* Call the Iterate function in short-lived context */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + if (plan->operation != CMD_SELECT) + { + /* + * direct modifications cannot be re-evaluated, so shouldn't get here + * during EvalPlanQual processing + */ + Assert(node->ss.ps.state->es_epq_active == NULL); + + slot = node->fdwroutine->IterateDirectModify(node); + } + else + slot = node->fdwroutine->IterateForeignScan(node); + MemoryContextSwitchTo(oldcontext); + + /* + * Insert valid value into tableoid, the only actually-useful system + * column. + */ + if (plan->fsSystemCol && !TupIsNull(slot)) + slot->tts_tableOid = RelationGetRelid(node->ss.ss_currentRelation); + + return slot; +} + +/* + * ForeignRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +ForeignRecheck(ForeignScanState *node, TupleTableSlot *slot) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + ExprContext *econtext; + + /* + * extract necessary information from foreign scan node + */ + econtext = node->ss.ps.ps_ExprContext; + + /* Does the tuple meet the remote qual condition? */ + econtext->ecxt_scantuple = slot; + + ResetExprContext(econtext); + + /* + * If an outer join is pushed down, RecheckForeignScan may need to store a + * different tuple in the slot, because a different set of columns may go + * to NULL upon recheck. Otherwise, it shouldn't need to change the slot + * contents, just return true or false to indicate whether the quals still + * pass. For simple cases, setting fdw_recheck_quals may be easier than + * providing this callback. + */ + if (fdwroutine->RecheckForeignScan && + !fdwroutine->RecheckForeignScan(node, slot)) + return false; + + return ExecQual(node->fdw_recheck_quals, econtext); +} + +/* ---------------------------------------------------------------- + * ExecForeignScan(node) + * + * Fetches the next tuple from the FDW, checks local quals, and + * returns it. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecForeignScan(PlanState *pstate) +{ + ForeignScanState *node = castNode(ForeignScanState, pstate); + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + + /* + * Ignore direct modifications when EvalPlanQual is active --- they are + * irrelevant for EvalPlanQual rechecking + */ + if (estate->es_epq_active != NULL && plan->operation != CMD_SELECT) + return NULL; + + return ExecScan(&node->ss, + (ExecScanAccessMtd) ForeignNext, + (ExecScanRecheckMtd) ForeignRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitForeignScan + * ---------------------------------------------------------------- + */ +ForeignScanState * +ExecInitForeignScan(ForeignScan *node, EState *estate, int eflags) +{ + ForeignScanState *scanstate; + Relation currentRelation = NULL; + Index scanrelid = node->scan.scanrelid; + Index tlistvarno; + FdwRoutine *fdwroutine; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + scanstate = makeNode(ForeignScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecForeignScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation, if any; also acquire function pointers from the + * FDW's handler + */ + if (scanrelid > 0) + { + currentRelation = ExecOpenScanRelation(estate, scanrelid, eflags); + scanstate->ss.ss_currentRelation = currentRelation; + fdwroutine = GetFdwRoutineForRelation(currentRelation, true); + } + else + { + /* We can't use the relcache, so get fdwroutine the hard way */ + fdwroutine = GetFdwRoutineByServerId(node->fs_server); + } + + /* + * Determine the scan tuple type. If the FDW provided a targetlist + * describing the scan tuples, use that; else use base relation's rowtype. + */ + if (node->fdw_scan_tlist != NIL || currentRelation == NULL) + { + TupleDesc scan_tupdesc; + + scan_tupdesc = ExecTypeFromTL(node->fdw_scan_tlist); + ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc, + &TTSOpsHeapTuple); + /* Node's targetlist will contain Vars with varno = INDEX_VAR */ + tlistvarno = INDEX_VAR; + } + else + { + TupleDesc scan_tupdesc; + + /* don't trust FDWs to return tuples fulfilling NOT NULL constraints */ + scan_tupdesc = CreateTupleDescCopy(RelationGetDescr(currentRelation)); + ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc, + &TTSOpsHeapTuple); + /* Node's targetlist will contain Vars with varno = scanrelid */ + tlistvarno = scanrelid; + } + + /* Don't know what an FDW might return */ + scanstate->ss.ps.scanopsfixed = false; + scanstate->ss.ps.scanopsset = true; + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfoWithVarno(&scanstate->ss, tlistvarno); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + scanstate->fdw_recheck_quals = + ExecInitQual(node->fdw_recheck_quals, (PlanState *) scanstate); + + /* + * Determine whether to scan the foreign relation asynchronously or not; + * this has to be kept in sync with the code in ExecInitAppend(). 
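+ *
+ * (The planner marks a ForeignScan async-capable only if the FDW supplies
+ * an IsForeignPathAsyncCapable callback that returned true for the chosen
+ * path; on top of that, we refuse async execution while EvalPlanQual is
+ * active.)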
+ */ + scanstate->ss.ps.async_capable = (((Plan *) node)->async_capable && + estate->es_epq_active == NULL); + + /* + * Initialize FDW-related state. + */ + scanstate->fdwroutine = fdwroutine; + scanstate->fdw_state = NULL; + + /* + * For the FDW's convenience, look up the modification target relation's + * ResultRelInfo. The ModifyTable node should have initialized it for us, + * see ExecInitModifyTable. + * + * Don't try to look up the ResultRelInfo when EvalPlanQual is active, + * though. Direct modifications cannot be re-evaluated as part of + * EvalPlanQual. The lookup wouldn't work anyway because during + * EvalPlanQual processing, EvalPlanQual only initializes the subtree + * under the ModifyTable, and doesn't run ExecInitModifyTable. + */ + if (node->resultRelation > 0 && estate->es_epq_active == NULL) + { + if (estate->es_result_relations == NULL || + estate->es_result_relations[node->resultRelation - 1] == NULL) + { + elog(ERROR, "result relation not initialized"); + } + scanstate->resultRelInfo = estate->es_result_relations[node->resultRelation - 1]; + } + + /* Initialize any outer plan. */ + if (outerPlan(node)) + outerPlanState(scanstate) = + ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Tell the FDW to initialize the scan. + */ + if (node->operation != CMD_SELECT) + { + /* + * Direct modifications cannot be re-evaluated by EvalPlanQual, so + * don't bother preparing the FDW. + * + * In case of an inherited UPDATE/DELETE with foreign targets there + * can be direct-modify ForeignScan nodes in the EvalPlanQual subtree, + * so we need to ignore such ForeignScan nodes during EvalPlanQual + * processing. See also ExecForeignScan/ExecReScanForeignScan. + */ + if (estate->es_epq_active == NULL) + fdwroutine->BeginDirectModify(scanstate, eflags); + } + else + fdwroutine->BeginForeignScan(scanstate, eflags); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndForeignScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndForeignScan(ForeignScanState *node) +{ + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + + /* Let the FDW shut down */ + if (plan->operation != CMD_SELECT) + { + if (estate->es_epq_active == NULL) + node->fdwroutine->EndDirectModify(node); + } + else + node->fdwroutine->EndForeignScan(node); + + /* Shut down any outer plan. */ + if (outerPlanState(node)) + ExecEndNode(outerPlanState(node)); + + /* Free the exprcontext */ + ExecFreeExprContext(&node->ss.ps); + + /* clean out the tuple table */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanForeignScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanForeignScan(ForeignScanState *node) +{ + ForeignScan *plan = (ForeignScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + PlanState *outerPlan = outerPlanState(node); + + /* + * Ignore direct modifications when EvalPlanQual is active --- they are + * irrelevant for EvalPlanQual rechecking + */ + if (estate->es_epq_active != NULL && plan->operation != CMD_SELECT) + return; + + node->fdwroutine->ReScanForeignScan(node); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
outerPlan may also be NULL, in which case there is + * nothing to rescan at all. + */ + if (outerPlan != NULL && outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecForeignScanEstimate + * + * Informs size of the parallel coordination information, if any + * ---------------------------------------------------------------- + */ +void +ExecForeignScanEstimate(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->EstimateDSMForeignScan) + { + node->pscan_len = fdwroutine->EstimateDSMForeignScan(node, pcxt); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanInitializeDSM + * + * Initialize the parallel coordination information + * ---------------------------------------------------------------- + */ +void +ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->InitializeDSMForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); + fdwroutine->InitializeDSMForeignScan(node, pcxt, coordinate); + shm_toc_insert(pcxt->toc, plan_node_id, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecForeignScanReInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->ReInitializeDSMForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + fdwroutine->ReInitializeDSMForeignScan(node, pcxt, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanInitializeWorker + * + * Initialization according to the parallel coordination information + * ---------------------------------------------------------------- + */ +void +ExecForeignScanInitializeWorker(ForeignScanState *node, + ParallelWorkerContext *pwcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->InitializeWorkerForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + fdwroutine->InitializeWorkerForeignScan(node, pwcxt->toc, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecShutdownForeignScan + * + * Gives FDW chance to stop asynchronous resource consumption + * and release any resources still held. 
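+ *
+ * This is called when the node is not expected to run to completion,
+ * e.g. when a LIMIT above it has been satisfied or before the parallel
+ * DSM segment is torn down; EndForeignScan may still be called later.
+ *
+ * Illustrative sketch only (MyFdwScanState, request_in_flight and
+ * cancel_pending_request are hypothetical): an async-capable FDW might
+ * cancel an outstanding remote request here:
+ *
+ *     static void
+ *     my_shutdown_foreign_scan(ForeignScanState *node)
+ *     {
+ *         MyFdwScanState *fsstate = (MyFdwScanState *) node->fdw_state;
+ *
+ *         if (fsstate != NULL && fsstate->request_in_flight)
+ *             cancel_pending_request(fsstate);
+ *     }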
+ * ---------------------------------------------------------------- + */ +void +ExecShutdownForeignScan(ForeignScanState *node) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->ShutdownForeignScan) + fdwroutine->ShutdownForeignScan(node); +} + +/* ---------------------------------------------------------------- + * ExecAsyncForeignScanRequest + * + * Asynchronously request a tuple from a designed async-capable node + * ---------------------------------------------------------------- + */ +void +ExecAsyncForeignScanRequest(AsyncRequest *areq) +{ + ForeignScanState *node = (ForeignScanState *) areq->requestee; + FdwRoutine *fdwroutine = node->fdwroutine; + + Assert(fdwroutine->ForeignAsyncRequest != NULL); + fdwroutine->ForeignAsyncRequest(areq); +} + +/* ---------------------------------------------------------------- + * ExecAsyncForeignScanConfigureWait + * + * In async mode, configure for a wait + * ---------------------------------------------------------------- + */ +void +ExecAsyncForeignScanConfigureWait(AsyncRequest *areq) +{ + ForeignScanState *node = (ForeignScanState *) areq->requestee; + FdwRoutine *fdwroutine = node->fdwroutine; + + Assert(fdwroutine->ForeignAsyncConfigureWait != NULL); + fdwroutine->ForeignAsyncConfigureWait(areq); +} + +/* ---------------------------------------------------------------- + * ExecAsyncForeignScanNotify + * + * Callback invoked when a relevant event has occurred + * ---------------------------------------------------------------- + */ +void +ExecAsyncForeignScanNotify(AsyncRequest *areq) +{ + ForeignScanState *node = (ForeignScanState *) areq->requestee; + FdwRoutine *fdwroutine = node->fdwroutine; + + Assert(fdwroutine->ForeignAsyncNotify != NULL); + fdwroutine->ForeignAsyncNotify(areq); +} diff --git a/src/backend/executor/nodeFunctionscan.c b/src/backend/executor/nodeFunctionscan.c new file mode 100644 index 0000000..b31b2b2 --- /dev/null +++ b/src/backend/executor/nodeFunctionscan.c @@ -0,0 +1,620 @@ +/*------------------------------------------------------------------------- + * + * nodeFunctionscan.c + * Support routines for scanning RangeFunctions (functions in rangetable). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeFunctionscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecFunctionScan scans a function. + * ExecFunctionNext retrieve next tuple in sequential order. + * ExecInitFunctionScan creates and initializes a functionscan node. + * ExecEndFunctionScan releases any storage allocated. + * ExecReScanFunctionScan rescans the function + */ +#include "postgres.h" + +#include "catalog/pg_type.h" +#include "executor/nodeFunctionscan.h" +#include "funcapi.h" +#include "nodes/nodeFuncs.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + + +/* + * Runtime data for each function being scanned. 
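+ *
+ * There is one entry per function in the FROM-clause item; more than one
+ * occurs only for the ROWS FROM (f1(...), f2(...)) syntax.  For example,
+ * SELECT * FROM ROWS FROM (generate_series(1, 3), unnest(ARRAY['a', 'b']))
+ * WITH ORDINALITY scans two functions, pads the shorter result with nulls,
+ * and appends an int8 ordinality column, so the "simple" fast path used
+ * for a single function without ordinality does not apply.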
+ */ +typedef struct FunctionScanPerFuncState +{ + SetExprState *setexpr; /* state of the expression being evaluated */ + TupleDesc tupdesc; /* desc of the function result type */ + int colcount; /* expected number of result columns */ + Tuplestorestate *tstore; /* holds the function result set */ + int64 rowcount; /* # of rows in result set, -1 if not known */ + TupleTableSlot *func_slot; /* function result slot (or NULL) */ +} FunctionScanPerFuncState; + +static TupleTableSlot *FunctionNext(FunctionScanState *node); + + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ +/* ---------------------------------------------------------------- + * FunctionNext + * + * This is a workhorse for ExecFunctionScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +FunctionNext(FunctionScanState *node) +{ + EState *estate; + ScanDirection direction; + TupleTableSlot *scanslot; + bool alldone; + int64 oldpos; + int funcno; + int att; + + /* + * get information from the estate and scan state + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + scanslot = node->ss.ss_ScanTupleSlot; + + if (node->simple) + { + /* + * Fast path for the trivial case: the function return type and scan + * result type are the same, so we fetch the function result straight + * into the scan result slot. No need to update ordinality or + * rowcounts either. + */ + Tuplestorestate *tstore = node->funcstates[0].tstore; + + /* + * If first time through, read all tuples from function and put them + * in a tuplestore. Subsequent calls just fetch tuples from + * tuplestore. + */ + if (tstore == NULL) + { + node->funcstates[0].tstore = tstore = + ExecMakeTableFunctionResult(node->funcstates[0].setexpr, + node->ss.ps.ps_ExprContext, + node->argcontext, + node->funcstates[0].tupdesc, + node->eflags & EXEC_FLAG_BACKWARD); + + /* + * paranoia - cope if the function, which may have constructed the + * tuplestore itself, didn't leave it pointing at the start. This + * call is fast, so the overhead shouldn't be an issue. + */ + tuplestore_rescan(tstore); + } + + /* + * Get the next tuple from tuplestore. + */ + (void) tuplestore_gettupleslot(tstore, + ScanDirectionIsForward(direction), + false, + scanslot); + return scanslot; + } + + /* + * Increment or decrement ordinal counter before checking for end-of-data, + * so that we can move off either end of the result by 1 (and no more than + * 1) without losing correct count. See PortalRunSelect for why we can + * assume that we won't be called repeatedly in the end-of-data state. + */ + oldpos = node->ordinal; + if (ScanDirectionIsForward(direction)) + node->ordinal++; + else + node->ordinal--; + + /* + * Main loop over functions. + * + * We fetch the function results into func_slots (which match the function + * return types), and then copy the values to scanslot (which matches the + * scan result type), setting the ordinal column (if any) as well. + */ + ExecClearTuple(scanslot); + att = 0; + alldone = true; + for (funcno = 0; funcno < node->nfuncs; funcno++) + { + FunctionScanPerFuncState *fs = &node->funcstates[funcno]; + int i; + + /* + * If first time through, read all tuples from function and put them + * in a tuplestore. Subsequent calls just fetch tuples from + * tuplestore. 
+ */ + if (fs->tstore == NULL) + { + fs->tstore = + ExecMakeTableFunctionResult(fs->setexpr, + node->ss.ps.ps_ExprContext, + node->argcontext, + fs->tupdesc, + node->eflags & EXEC_FLAG_BACKWARD); + + /* + * paranoia - cope if the function, which may have constructed the + * tuplestore itself, didn't leave it pointing at the start. This + * call is fast, so the overhead shouldn't be an issue. + */ + tuplestore_rescan(fs->tstore); + } + + /* + * Get the next tuple from tuplestore. + * + * If we have a rowcount for the function, and we know the previous + * read position was out of bounds, don't try the read. This allows + * backward scan to work when there are mixed row counts present. + */ + if (fs->rowcount != -1 && fs->rowcount < oldpos) + ExecClearTuple(fs->func_slot); + else + (void) tuplestore_gettupleslot(fs->tstore, + ScanDirectionIsForward(direction), + false, + fs->func_slot); + + if (TupIsNull(fs->func_slot)) + { + /* + * If we ran out of data for this function in the forward + * direction then we now know how many rows it returned. We need + * to know this in order to handle backwards scans. The row count + * we store is actually 1+ the actual number, because we have to + * position the tuplestore 1 off its end sometimes. + */ + if (ScanDirectionIsForward(direction) && fs->rowcount == -1) + fs->rowcount = node->ordinal; + + /* + * populate the result cols with nulls + */ + for (i = 0; i < fs->colcount; i++) + { + scanslot->tts_values[att] = (Datum) 0; + scanslot->tts_isnull[att] = true; + att++; + } + } + else + { + /* + * we have a result, so just copy it to the result cols. + */ + slot_getallattrs(fs->func_slot); + + for (i = 0; i < fs->colcount; i++) + { + scanslot->tts_values[att] = fs->func_slot->tts_values[i]; + scanslot->tts_isnull[att] = fs->func_slot->tts_isnull[i]; + att++; + } + + /* + * We're not done until every function result is exhausted; we pad + * the shorter results with nulls until then. + */ + alldone = false; + } + } + + /* + * ordinal col is always last, per spec. + */ + if (node->ordinality) + { + scanslot->tts_values[att] = Int64GetDatumFast(node->ordinal); + scanslot->tts_isnull[att] = false; + } + + /* + * If alldone, we just return the previously-cleared scanslot. Otherwise, + * finish creating the virtual tuple. + */ + if (!alldone) + ExecStoreVirtualTuple(scanslot); + + return scanslot; +} + +/* + * FunctionRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +FunctionRecheck(FunctionScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecFunctionScan(node) + * + * Scans the function sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecFunctionScan(PlanState *pstate) +{ + FunctionScanState *node = castNode(FunctionScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) FunctionNext, + (ExecScanRecheckMtd) FunctionRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitFunctionScan + * ---------------------------------------------------------------- + */ +FunctionScanState * +ExecInitFunctionScan(FunctionScan *node, EState *estate, int eflags) +{ + FunctionScanState *scanstate; + int nfuncs = list_length(node->functions); + TupleDesc scan_tupdesc; + int i, + natts; + ListCell *lc; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * FunctionScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new ScanState for node + */ + scanstate = makeNode(FunctionScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecFunctionScan; + scanstate->eflags = eflags; + + /* + * are we adding an ordinality column? + */ + scanstate->ordinality = node->funcordinality; + + scanstate->nfuncs = nfuncs; + if (nfuncs == 1 && !node->funcordinality) + scanstate->simple = true; + else + scanstate->simple = false; + + /* + * Ordinal 0 represents the "before the first row" position. + * + * We need to track ordinal position even when not adding an ordinality + * column to the result, in order to handle backwards scanning properly + * with multiple functions with different result sizes. (We can't position + * any individual function's tuplestore any more than 1 place beyond its + * end, so when scanning backwards, we need to know when to start + * including the function in the scan again.) + */ + scanstate->ordinal = 0; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + scanstate->funcstates = palloc(nfuncs * sizeof(FunctionScanPerFuncState)); + + natts = 0; + i = 0; + foreach(lc, node->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + Node *funcexpr = rtfunc->funcexpr; + int colcount = rtfunc->funccolcount; + FunctionScanPerFuncState *fs = &scanstate->funcstates[i]; + TypeFuncClass functypclass; + Oid funcrettype; + TupleDesc tupdesc; + + fs->setexpr = + ExecInitTableFunctionResult((Expr *) funcexpr, + scanstate->ss.ps.ps_ExprContext, + &scanstate->ss.ps); + + /* + * Don't allocate the tuplestores; the actual calls to the functions + * do that. NULL means that we have not called the function yet (or + * need to call it again after a rescan). + */ + fs->tstore = NULL; + fs->rowcount = -1; + + /* + * Now determine if the function returns a simple or composite type, + * and build an appropriate tupdesc. Note that in the composite case, + * the function may now return more columns than it did when the plan + * was made; we have to ignore any columns beyond "colcount". + */ + functypclass = get_expr_result_type(funcexpr, + &funcrettype, + &tupdesc); + + if (functypclass == TYPEFUNC_COMPOSITE || + functypclass == TYPEFUNC_COMPOSITE_DOMAIN) + { + /* Composite data type, e.g. 
a table's row type */ + Assert(tupdesc); + Assert(tupdesc->natts >= colcount); + /* Must copy it out of typcache for safety */ + tupdesc = CreateTupleDescCopy(tupdesc); + } + else if (functypclass == TYPEFUNC_SCALAR) + { + /* Base data type, i.e. scalar */ + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, + (AttrNumber) 1, + NULL, /* don't care about the name here */ + funcrettype, + -1, + 0); + TupleDescInitEntryCollation(tupdesc, + (AttrNumber) 1, + exprCollation(funcexpr)); + } + else if (functypclass == TYPEFUNC_RECORD) + { + tupdesc = BuildDescFromLists(rtfunc->funccolnames, + rtfunc->funccoltypes, + rtfunc->funccoltypmods, + rtfunc->funccolcollations); + + /* + * For RECORD results, make sure a typmod has been assigned. (The + * function should do this for itself, but let's cover things in + * case it doesn't.) + */ + BlessTupleDesc(tupdesc); + } + else + { + /* crummy error message, but parser should have caught this */ + elog(ERROR, "function in FROM has unsupported return type"); + } + + fs->tupdesc = tupdesc; + fs->colcount = colcount; + + /* + * We only need separate slots for the function results if we are + * doing ordinality or multiple functions; otherwise, we'll fetch + * function results directly into the scan slot. + */ + if (!scanstate->simple) + { + fs->func_slot = ExecInitExtraTupleSlot(estate, fs->tupdesc, + &TTSOpsMinimalTuple); + } + else + fs->func_slot = NULL; + + natts += colcount; + i++; + } + + /* + * Create the combined TupleDesc + * + * If there is just one function without ordinality, the scan result + * tupdesc is the same as the function result tupdesc --- except that we + * may stuff new names into it below, so drop any rowtype label. + */ + if (scanstate->simple) + { + scan_tupdesc = CreateTupleDescCopy(scanstate->funcstates[0].tupdesc); + scan_tupdesc->tdtypeid = RECORDOID; + scan_tupdesc->tdtypmod = -1; + } + else + { + AttrNumber attno = 0; + + if (node->funcordinality) + natts++; + + scan_tupdesc = CreateTemplateTupleDesc(natts); + + for (i = 0; i < nfuncs; i++) + { + TupleDesc tupdesc = scanstate->funcstates[i].tupdesc; + int colcount = scanstate->funcstates[i].colcount; + int j; + + for (j = 1; j <= colcount; j++) + TupleDescCopyEntry(scan_tupdesc, ++attno, tupdesc, j); + } + + /* If doing ordinality, add a column of type "bigint" at the end */ + if (node->funcordinality) + { + TupleDescInitEntry(scan_tupdesc, + ++attno, + NULL, /* don't care about the name here */ + INT8OID, + -1, + 0); + } + + Assert(attno == natts); + } + + /* + * Initialize scan slot and type. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, scan_tupdesc, + &TTSOpsMinimalTuple); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + /* + * Create a memory context that ExecMakeTableFunctionResult can use to + * evaluate function arguments in. We can't use the per-tuple context for + * this because it gets reset too often; but we don't want to leak + * evaluation results into the query-lifespan context either. We just + * need one context, because we evaluate each function separately. 
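+ *
+ * The context is created as a child of the per-query context (that's what
+ * CurrentMemoryContext is here), so it is released automatically at
+ * executor shutdown and needs no explicit cleanup in ExecEndFunctionScan.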
+ */ + scanstate->argcontext = AllocSetContextCreate(CurrentMemoryContext, + "Table function arguments", + ALLOCSET_DEFAULT_SIZES); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndFunctionScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndFunctionScan(FunctionScanState *node) +{ + int i; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * Release slots and tuplestore resources + */ + for (i = 0; i < node->nfuncs; i++) + { + FunctionScanPerFuncState *fs = &node->funcstates[i]; + + if (fs->func_slot) + ExecClearTuple(fs->func_slot); + + if (fs->tstore != NULL) + { + tuplestore_end(node->funcstates[i].tstore); + fs->tstore = NULL; + } + } +} + +/* ---------------------------------------------------------------- + * ExecReScanFunctionScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanFunctionScan(FunctionScanState *node) +{ + FunctionScan *scan = (FunctionScan *) node->ss.ps.plan; + int i; + Bitmapset *chgparam = node->ss.ps.chgParam; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + for (i = 0; i < node->nfuncs; i++) + { + FunctionScanPerFuncState *fs = &node->funcstates[i]; + + if (fs->func_slot) + ExecClearTuple(fs->func_slot); + } + + ExecScanReScan(&node->ss); + + /* + * Here we have a choice whether to drop the tuplestores (and recompute + * the function outputs) or just rescan them. We must recompute if an + * expression contains changed parameters, else we rescan. + * + * XXX maybe we should recompute if the function is volatile? But in + * general the executor doesn't conditionalize its actions on that. + */ + if (chgparam) + { + ListCell *lc; + + i = 0; + foreach(lc, scan->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + + if (bms_overlap(chgparam, rtfunc->funcparams)) + { + if (node->funcstates[i].tstore != NULL) + { + tuplestore_end(node->funcstates[i].tstore); + node->funcstates[i].tstore = NULL; + } + node->funcstates[i].rowcount = -1; + } + i++; + } + } + + /* Reset ordinality counter */ + node->ordinal = 0; + + /* Make sure we rewind any remaining tuplestores */ + for (i = 0; i < node->nfuncs; i++) + { + if (node->funcstates[i].tstore != NULL) + tuplestore_rescan(node->funcstates[i].tstore); + } +} diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c new file mode 100644 index 0000000..734142b --- /dev/null +++ b/src/backend/executor/nodeGather.c @@ -0,0 +1,477 @@ +/*------------------------------------------------------------------------- + * + * nodeGather.c + * Support routines for scanning a plan via multiple workers. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * A Gather executor launches parallel workers to run multiple copies of a + * plan. It can also run the plan itself, if the workers are not available + * or have not started up yet. It then merges all of the results it produces + * and the results from the workers into a single output stream. 
Therefore, + * it will normally be used with a plan where running multiple copies of the + * same plan does not produce duplicate output, such as parallel-aware + * SeqScan. + * + * Alternatively, a Gather node can be configured to use just one worker + * and the single-copy flag can be set. In this case, the Gather node will + * run the plan in one worker and will not execute the plan itself. In + * this case, it simply returns whatever tuples were returned by the worker. + * If a worker cannot be obtained, then it will run the plan itself and + * return the results. Therefore, a plan used with a single-copy Gather + * node need not be parallel-aware. + * + * IDENTIFICATION + * src/backend/executor/nodeGather.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relscan.h" +#include "access/xact.h" +#include "executor/execdebug.h" +#include "executor/execParallel.h" +#include "executor/nodeGather.h" +#include "executor/nodeSubplan.h" +#include "executor/tqueue.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +static TupleTableSlot *ExecGather(PlanState *pstate); +static TupleTableSlot *gather_getnext(GatherState *gatherstate); +static MinimalTuple gather_readnext(GatherState *gatherstate); +static void ExecShutdownGatherWorkers(GatherState *node); + + +/* ---------------------------------------------------------------- + * ExecInitGather + * ---------------------------------------------------------------- + */ +GatherState * +ExecInitGather(Gather *node, EState *estate, int eflags) +{ + GatherState *gatherstate; + Plan *outerNode; + TupleDesc tupDesc; + + /* Gather node doesn't have innerPlan node. */ + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + gatherstate = makeNode(GatherState); + gatherstate->ps.plan = (Plan *) node; + gatherstate->ps.state = estate; + gatherstate->ps.ExecProcNode = ExecGather; + + gatherstate->initialized = false; + gatherstate->need_to_scan_locally = + !node->single_copy && parallel_leader_participation; + gatherstate->tuples_needed = -1; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &gatherstate->ps); + + /* + * now initialize outer plan + */ + outerNode = outerPlan(node); + outerPlanState(gatherstate) = ExecInitNode(outerNode, estate, eflags); + tupDesc = ExecGetResultType(outerPlanState(gatherstate)); + + /* + * Leader may access ExecProcNode result directly (if + * need_to_scan_locally), or from workers via tuple queue. So we can't + * trivially rely on the slot type being fixed for expressions evaluated + * within this node. + */ + gatherstate->ps.outeropsset = true; + gatherstate->ps.outeropsfixed = false; + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&gatherstate->ps); + ExecConditionalAssignProjectionInfo(&gatherstate->ps, tupDesc, OUTER_VAR); + + /* + * Without projections result slot type is not trivially known, see + * comment above. + */ + if (gatherstate->ps.ps_ProjInfo == NULL) + { + gatherstate->ps.resultopsset = true; + gatherstate->ps.resultopsfixed = false; + } + + /* + * Initialize funnel slot to same tuple descriptor as outer plan. + */ + gatherstate->funnel_slot = ExecInitExtraTupleSlot(estate, tupDesc, + &TTSOpsMinimalTuple); + + /* + * Gather doesn't support checking a qual (it's always more efficient to + * do it in the child node). 
+ */ + Assert(!node->plan.qual); + + return gatherstate; +} + +/* ---------------------------------------------------------------- + * ExecGather(node) + * + * Scans the relation via multiple workers and returns + * the next qualifying tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecGather(PlanState *pstate) +{ + GatherState *node = castNode(GatherState, pstate); + TupleTableSlot *slot; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + /* + * Initialize the parallel context and workers on first execution. We do + * this on first execution rather than during node initialization, as it + * needs to allocate a large dynamic segment, so it is better to do it + * only if it is really needed. + */ + if (!node->initialized) + { + EState *estate = node->ps.state; + Gather *gather = (Gather *) node->ps.plan; + + /* + * Sometimes we might have to run without parallelism; but if parallel + * mode is active then we can try to fire up some workers. + */ + if (gather->num_workers > 0 && estate->es_use_parallel_mode) + { + ParallelContext *pcxt; + + /* Initialize, or re-initialize, shared state needed by workers. */ + if (!node->pei) + node->pei = ExecInitParallelPlan(node->ps.lefttree, + estate, + gather->initParam, + gather->num_workers, + node->tuples_needed); + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei, + gather->initParam); + + /* + * Register backend workers. We might not get as many as we + * requested, or indeed any at all. + */ + pcxt = node->pei->pcxt; + LaunchParallelWorkers(pcxt); + /* We save # workers launched for the benefit of EXPLAIN */ + node->nworkers_launched = pcxt->nworkers_launched; + + /* Set up tuple queue readers to read the results. */ + if (pcxt->nworkers_launched > 0) + { + ExecParallelCreateReaders(node->pei); + /* Make a working array showing the active readers */ + node->nreaders = pcxt->nworkers_launched; + node->reader = (TupleQueueReader **) + palloc(node->nreaders * sizeof(TupleQueueReader *)); + memcpy(node->reader, node->pei->reader, + node->nreaders * sizeof(TupleQueueReader *)); + } + else + { + /* No workers? Then never mind. */ + node->nreaders = 0; + node->reader = NULL; + } + node->nextreader = 0; + } + + /* Run plan locally if no workers or enabled and not single-copy. */ + node->need_to_scan_locally = (node->nreaders == 0) + || (!gather->single_copy && parallel_leader_participation); + node->initialized = true; + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); + + /* + * Get next tuple, either from one of our workers, or by running the plan + * ourselves. + */ + slot = gather_getnext(node); + if (TupIsNull(slot)) + return NULL; + + /* If no projection is required, we're done. */ + if (node->ps.ps_ProjInfo == NULL) + return slot; + + /* + * Form the result tuple using ExecProject(), and return it. + */ + econtext->ecxt_outertuple = slot; + return ExecProject(node->ps.ps_ProjInfo); +} + +/* ---------------------------------------------------------------- + * ExecEndGather + * + * frees any storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndGather(GatherState *node) +{ + ExecEndNode(outerPlanState(node)); /* let children clean up first */ + ExecShutdownGather(node); + ExecFreeExprContext(&node->ps); + if (node->ps.ps_ResultTupleSlot) + ExecClearTuple(node->ps.ps_ResultTupleSlot); +} + +/* + * Read the next tuple. We might fetch a tuple from one of the tuple queues + * using gather_readnext, or if no tuple queue contains a tuple and the + * single_copy flag is not set, we might generate one locally instead. + */ +static TupleTableSlot * +gather_getnext(GatherState *gatherstate) +{ + PlanState *outerPlan = outerPlanState(gatherstate); + TupleTableSlot *outerTupleSlot; + TupleTableSlot *fslot = gatherstate->funnel_slot; + MinimalTuple tup; + + while (gatherstate->nreaders > 0 || gatherstate->need_to_scan_locally) + { + CHECK_FOR_INTERRUPTS(); + + if (gatherstate->nreaders > 0) + { + tup = gather_readnext(gatherstate); + + if (HeapTupleIsValid(tup)) + { + ExecStoreMinimalTuple(tup, /* tuple to store */ + fslot, /* slot to store the tuple */ + false); /* don't pfree tuple */ + return fslot; + } + } + + if (gatherstate->need_to_scan_locally) + { + EState *estate = gatherstate->ps.state; + + /* Install our DSA area while executing the plan. */ + estate->es_query_dsa = + gatherstate->pei ? gatherstate->pei->area : NULL; + outerTupleSlot = ExecProcNode(outerPlan); + estate->es_query_dsa = NULL; + + if (!TupIsNull(outerTupleSlot)) + return outerTupleSlot; + + gatherstate->need_to_scan_locally = false; + } + } + + return ExecClearTuple(fslot); +} + +/* + * Attempt to read a tuple from one of our parallel workers. + */ +static MinimalTuple +gather_readnext(GatherState *gatherstate) +{ + int nvisited = 0; + + for (;;) + { + TupleQueueReader *reader; + MinimalTuple tup; + bool readerdone; + + /* Check for async events, particularly messages from workers. */ + CHECK_FOR_INTERRUPTS(); + + /* + * Attempt to read a tuple, but don't block if none is available. + * + * Note that TupleQueueReaderNext will just return NULL for a worker + * which fails to initialize. We'll treat that worker as having + * produced no tuples; WaitForParallelWorkersToFinish will error out + * when we get there. + */ + Assert(gatherstate->nextreader < gatherstate->nreaders); + reader = gatherstate->reader[gatherstate->nextreader]; + tup = TupleQueueReaderNext(reader, true, &readerdone); + + /* + * If this reader is done, remove it from our working array of active + * readers. If all readers are done, we're outta here. + */ + if (readerdone) + { + Assert(!tup); + --gatherstate->nreaders; + if (gatherstate->nreaders == 0) + { + ExecShutdownGatherWorkers(gatherstate); + return NULL; + } + memmove(&gatherstate->reader[gatherstate->nextreader], + &gatherstate->reader[gatherstate->nextreader + 1], + sizeof(TupleQueueReader *) + * (gatherstate->nreaders - gatherstate->nextreader)); + if (gatherstate->nextreader >= gatherstate->nreaders) + gatherstate->nextreader = 0; + continue; + } + + /* If we got a tuple, return it. */ + if (tup) + return tup; + + /* + * Advance nextreader pointer in round-robin fashion. Note that we + * only reach this code if we weren't able to get a tuple from the + * current worker. We used to advance the nextreader pointer after + * every tuple, but it turns out to be much more efficient to keep + * reading from the same queue until that would require blocking. 
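+ *
+ * For example, with three active readers and nextreader == 2, a read that
+ * would block advances us to reader 0.  Once nvisited reaches nreaders
+ * without obtaining a tuple, we either return NULL so the leader can
+ * produce a tuple from its local copy of the plan, or sleep on the process
+ * latch until some worker signals that its queue has data again.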
+ */ + gatherstate->nextreader++; + if (gatherstate->nextreader >= gatherstate->nreaders) + gatherstate->nextreader = 0; + + /* Have we visited every (surviving) TupleQueueReader? */ + nvisited++; + if (nvisited >= gatherstate->nreaders) + { + /* + * If (still) running plan locally, return NULL so caller can + * generate another tuple from the local copy of the plan. + */ + if (gatherstate->need_to_scan_locally) + return NULL; + + /* Nothing to do except wait for developments. */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_EXECUTE_GATHER); + ResetLatch(MyLatch); + nvisited = 0; + } + } +} + +/* ---------------------------------------------------------------- + * ExecShutdownGatherWorkers + * + * Stop all the parallel workers. + * ---------------------------------------------------------------- + */ +static void +ExecShutdownGatherWorkers(GatherState *node) +{ + if (node->pei != NULL) + ExecParallelFinish(node->pei); + + /* Flush local copy of reader array */ + if (node->reader) + pfree(node->reader); + node->reader = NULL; +} + +/* ---------------------------------------------------------------- + * ExecShutdownGather + * + * Destroy the setup for parallel workers including parallel context. + * ---------------------------------------------------------------- + */ +void +ExecShutdownGather(GatherState *node) +{ + ExecShutdownGatherWorkers(node); + + /* Now destroy the parallel context. */ + if (node->pei != NULL) + { + ExecParallelCleanup(node->pei); + node->pei = NULL; + } +} + +/* ---------------------------------------------------------------- + * Join Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecReScanGather + * + * Prepare to re-scan the result of a Gather. + * ---------------------------------------------------------------- + */ +void +ExecReScanGather(GatherState *node) +{ + Gather *gather = (Gather *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ + ExecShutdownGatherWorkers(node); + + /* Mark node so that shared state will be rebuilt at next call */ + node->initialized = false; + + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gather->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gather->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. 
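+ *
+ * As a concrete illustration, a parallel-aware SeqScan keeps its block
+ * allocation state in the DSM segment: its ReInitializeDSM callback resets
+ * that shared state (via table_parallelscan_reinitialize), whereas its
+ * ReScan callback resets only backend-local scan state.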
+ */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c new file mode 100644 index 0000000..03f02a1 --- /dev/null +++ b/src/backend/executor/nodeGatherMerge.c @@ -0,0 +1,789 @@ +/*------------------------------------------------------------------------- + * + * nodeGatherMerge.c + * Scan a plan in multiple workers, and do order-preserving merge. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeGatherMerge.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relscan.h" +#include "access/xact.h" +#include "executor/execdebug.h" +#include "executor/execParallel.h" +#include "executor/nodeGatherMerge.h" +#include "executor/nodeSubplan.h" +#include "executor/tqueue.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * When we read tuples from workers, it's a good idea to read several at once + * for efficiency when possible: this minimizes context-switching overhead. + * But reading too many at a time wastes memory without improving performance. + * We'll read up to MAX_TUPLE_STORE tuples (in addition to the first one). + */ +#define MAX_TUPLE_STORE 10 + +/* + * Pending-tuple array for each worker. This holds additional tuples that + * we were able to fetch from the worker, but can't process yet. In addition, + * this struct holds the "done" flag indicating the worker is known to have + * no more tuples. (We do not use this struct for the leader; we don't keep + * any pending tuples for the leader, and the need_to_scan_locally flag serves + * as its "done" indicator.) + */ +typedef struct GMReaderTupleBuffer +{ + MinimalTuple *tuple; /* array of length MAX_TUPLE_STORE */ + int nTuples; /* number of tuples currently stored */ + int readCounter; /* index of next tuple to extract */ + bool done; /* true if reader is known exhausted */ +} GMReaderTupleBuffer; + +static TupleTableSlot *ExecGatherMerge(PlanState *pstate); +static int32 heap_compare_slots(Datum a, Datum b, void *arg); +static TupleTableSlot *gather_merge_getnext(GatherMergeState *gm_state); +static MinimalTuple gm_readnext_tuple(GatherMergeState *gm_state, int nreader, + bool nowait, bool *done); +static void ExecShutdownGatherMergeWorkers(GatherMergeState *node); +static void gather_merge_setup(GatherMergeState *gm_state); +static void gather_merge_init(GatherMergeState *gm_state); +static void gather_merge_clear_tuples(GatherMergeState *gm_state); +static bool gather_merge_readnext(GatherMergeState *gm_state, int reader, + bool nowait); +static void load_tuple_array(GatherMergeState *gm_state, int reader); + +/* ---------------------------------------------------------------- + * ExecInitGather + * ---------------------------------------------------------------- + */ +GatherMergeState * +ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags) +{ + GatherMergeState *gm_state; + Plan *outerNode; + TupleDesc tupDesc; + + /* Gather merge node doesn't have innerPlan node. 
*/ + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + gm_state = makeNode(GatherMergeState); + gm_state->ps.plan = (Plan *) node; + gm_state->ps.state = estate; + gm_state->ps.ExecProcNode = ExecGatherMerge; + + gm_state->initialized = false; + gm_state->gm_initialized = false; + gm_state->tuples_needed = -1; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &gm_state->ps); + + /* + * GatherMerge doesn't support checking a qual (it's always more efficient + * to do it in the child node). + */ + Assert(!node->plan.qual); + + /* + * now initialize outer plan + */ + outerNode = outerPlan(node); + outerPlanState(gm_state) = ExecInitNode(outerNode, estate, eflags); + + /* + * Leader may access ExecProcNode result directly (if + * need_to_scan_locally), or from workers via tuple queue. So we can't + * trivially rely on the slot type being fixed for expressions evaluated + * within this node. + */ + gm_state->ps.outeropsset = true; + gm_state->ps.outeropsfixed = false; + + /* + * Store the tuple descriptor into gather merge state, so we can use it + * while initializing the gather merge slots. + */ + tupDesc = ExecGetResultType(outerPlanState(gm_state)); + gm_state->tupDesc = tupDesc; + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&gm_state->ps); + ExecConditionalAssignProjectionInfo(&gm_state->ps, tupDesc, OUTER_VAR); + + /* + * Without projections result slot type is not trivially known, see + * comment above. + */ + if (gm_state->ps.ps_ProjInfo == NULL) + { + gm_state->ps.resultopsset = true; + gm_state->ps.resultopsfixed = false; + } + + /* + * initialize sort-key information + */ + if (node->numCols) + { + int i; + + gm_state->gm_nkeys = node->numCols; + gm_state->gm_sortkeys = + palloc0(sizeof(SortSupportData) * node->numCols); + + for (i = 0; i < node->numCols; i++) + { + SortSupport sortKey = gm_state->gm_sortkeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = node->collations[i]; + sortKey->ssup_nulls_first = node->nullsFirst[i]; + sortKey->ssup_attno = node->sortColIdx[i]; + + /* + * We don't perform abbreviated key conversion here, for the same + * reasons that it isn't used in MergeAppend + */ + sortKey->abbreviate = false; + + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); + } + } + + /* Now allocate the workspace for gather merge */ + gather_merge_setup(gm_state); + + return gm_state; +} + +/* ---------------------------------------------------------------- + * ExecGatherMerge(node) + * + * Scans the relation via multiple workers and returns + * the next qualifying tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecGatherMerge(PlanState *pstate) +{ + GatherMergeState *node = castNode(GatherMergeState, pstate); + TupleTableSlot *slot; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + /* + * As with Gather, we don't launch workers until this node is actually + * executed. + */ + if (!node->initialized) + { + EState *estate = node->ps.state; + GatherMerge *gm = castNode(GatherMerge, node->ps.plan); + + /* + * Sometimes we might have to run without parallelism; but if parallel + * mode is active then we can try to fire up some workers. + */ + if (gm->num_workers > 0 && estate->es_use_parallel_mode) + { + ParallelContext *pcxt; + + /* Initialize, or re-initialize, shared state needed by workers. 
*/ + if (!node->pei) + node->pei = ExecInitParallelPlan(node->ps.lefttree, + estate, + gm->initParam, + gm->num_workers, + node->tuples_needed); + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei, + gm->initParam); + + /* Try to launch workers. */ + pcxt = node->pei->pcxt; + LaunchParallelWorkers(pcxt); + /* We save # workers launched for the benefit of EXPLAIN */ + node->nworkers_launched = pcxt->nworkers_launched; + + /* Set up tuple queue readers to read the results. */ + if (pcxt->nworkers_launched > 0) + { + ExecParallelCreateReaders(node->pei); + /* Make a working array showing the active readers */ + node->nreaders = pcxt->nworkers_launched; + node->reader = (TupleQueueReader **) + palloc(node->nreaders * sizeof(TupleQueueReader *)); + memcpy(node->reader, node->pei->reader, + node->nreaders * sizeof(TupleQueueReader *)); + } + else + { + /* No workers? Then never mind. */ + node->nreaders = 0; + node->reader = NULL; + } + } + + /* allow leader to participate if enabled or no choice */ + if (parallel_leader_participation || node->nreaders == 0) + node->need_to_scan_locally = true; + node->initialized = true; + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); + + /* + * Get next tuple, either from one of our workers, or by running the plan + * ourselves. + */ + slot = gather_merge_getnext(node); + if (TupIsNull(slot)) + return NULL; + + /* If no projection is required, we're done. */ + if (node->ps.ps_ProjInfo == NULL) + return slot; + + /* + * Form the result tuple using ExecProject(), and return it. + */ + econtext->ecxt_outertuple = slot; + return ExecProject(node->ps.ps_ProjInfo); +} + +/* ---------------------------------------------------------------- + * ExecEndGatherMerge + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndGatherMerge(GatherMergeState *node) +{ + ExecEndNode(outerPlanState(node)); /* let children clean up first */ + ExecShutdownGatherMerge(node); + ExecFreeExprContext(&node->ps); + if (node->ps.ps_ResultTupleSlot) + ExecClearTuple(node->ps.ps_ResultTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecShutdownGatherMerge + * + * Destroy the setup for parallel workers including parallel context. + * ---------------------------------------------------------------- + */ +void +ExecShutdownGatherMerge(GatherMergeState *node) +{ + ExecShutdownGatherMergeWorkers(node); + + /* Now destroy the parallel context. */ + if (node->pei != NULL) + { + ExecParallelCleanup(node->pei); + node->pei = NULL; + } +} + +/* ---------------------------------------------------------------- + * ExecShutdownGatherMergeWorkers + * + * Stop all the parallel workers. + * ---------------------------------------------------------------- + */ +static void +ExecShutdownGatherMergeWorkers(GatherMergeState *node) +{ + if (node->pei != NULL) + ExecParallelFinish(node->pei); + + /* Flush local copy of reader array */ + if (node->reader) + pfree(node->reader); + node->reader = NULL; +} + +/* ---------------------------------------------------------------- + * ExecReScanGatherMerge + * + * Prepare to re-scan the result of a GatherMerge. 
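+ *
+ * Besides the steps shared with ExecReScanGather, this also frees any
+ * tuples still buffered from the previous cycle (gather_merge_clear_tuples)
+ * so that their memory is not leaked across rescans, and clears
+ * gm_initialized so that gather_merge_init runs again on the next fetch.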
+ * ---------------------------------------------------------------- + */ +void +ExecReScanGatherMerge(GatherMergeState *node) +{ + GatherMerge *gm = (GatherMerge *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ + ExecShutdownGatherMergeWorkers(node); + + /* Free any unused tuples, so we don't leak memory across rescans */ + gather_merge_clear_tuples(node); + + /* Mark node so that shared state will be rebuilt at next call */ + node->initialized = false; + node->gm_initialized = false; + + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gm->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gm->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* + * Set up the data structures that we'll need for Gather Merge. + * + * We allocate these once on the basis of gm->num_workers, which is an + * upper bound for the number of workers we'll actually have. During + * a rescan, we reset the structures to empty. This approach simplifies + * not leaking memory across rescans. + * + * In the gm_slots[] array, index 0 is for the leader, and indexes 1 to n + * are for workers. The values placed into gm_heap correspond to indexes + * in gm_slots[]. The gm_tuple_buffers[] array, however, is indexed from + * 0 to n-1; it has no entry for the leader. + */ +static void +gather_merge_setup(GatherMergeState *gm_state) +{ + GatherMerge *gm = castNode(GatherMerge, gm_state->ps.plan); + int nreaders = gm->num_workers; + int i; + + /* + * Allocate gm_slots for the number of workers + one more slot for leader. + * Slot 0 is always for the leader. Leader always calls ExecProcNode() to + * read the tuple, and then stores it directly into its gm_slots entry. + * For other slots, code below will call ExecInitExtraTupleSlot() to + * create a slot for the worker's results. Note that during any single + * scan, we might have fewer than num_workers available workers, in which + * case the extra array entries go unused. 
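As a standalone illustration of the layout just described (leader at slot index 0, worker i at slot index i, while the per-worker read-ahead buffers are shifted down by one), here is a minimal sketch with hypothetical names; it mirrors only the indexing convention, not the executor's actual TupleTableSlot machinery.

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_PENDING 10          /* stand-in for MAX_TUPLE_STORE */

    /* Hypothetical stand-ins for the executor's per-source state. */
    typedef struct PendingTuples
    {
        int         values[MAX_PENDING];
        int         ntuples;        /* how many are buffered */
        int         readpos;        /* next one to hand out */
        int         done;           /* source has no more tuples */
    } PendingTuples;

    typedef struct MergeSources
    {
        int         nworkers;
        int        *current;        /* current item per source; [0] is the leader */
        PendingTuples *pending;     /* workers only: worker i uses pending[i - 1] */
    } MergeSources;

    static MergeSources *
    merge_setup(int nworkers)
    {
        MergeSources *ms = malloc(sizeof(MergeSources));

        ms->nworkers = nworkers;
        /* one entry per worker plus one extra, index 0, for the leader */
        ms->current = calloc(nworkers + 1, sizeof(int));
        /* the leader has no read-ahead buffer, so only nworkers entries here */
        ms->pending = calloc(nworkers, sizeof(PendingTuples));
        return ms;
    }

    int
    main(void)
    {
        MergeSources *ms = merge_setup(3);

        printf("%d sources: leader at index 0 plus %d workers\n",
               ms->nworkers + 1, ms->nworkers);
        printf("worker 2 buffers its tuples in pending[%d]\n", 2 - 1);

        free(ms->current);
        free(ms->pending);
        free(ms);
        return 0;
    }

Keeping the leader out of the buffer array reflects the fact that the leader produces tuples by running the plan directly rather than by draining a tuple queue.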
+ */ + gm_state->gm_slots = (TupleTableSlot **) + palloc0((nreaders + 1) * sizeof(TupleTableSlot *)); + + /* Allocate the tuple slot and tuple array for each worker */ + gm_state->gm_tuple_buffers = (GMReaderTupleBuffer *) + palloc0(nreaders * sizeof(GMReaderTupleBuffer)); + + for (i = 0; i < nreaders; i++) + { + /* Allocate the tuple array with length MAX_TUPLE_STORE */ + gm_state->gm_tuple_buffers[i].tuple = + (MinimalTuple *) palloc0(sizeof(MinimalTuple) * MAX_TUPLE_STORE); + + /* Initialize tuple slot for worker */ + gm_state->gm_slots[i + 1] = + ExecInitExtraTupleSlot(gm_state->ps.state, gm_state->tupDesc, + &TTSOpsMinimalTuple); + } + + /* Allocate the resources for the merge */ + gm_state->gm_heap = binaryheap_allocate(nreaders + 1, + heap_compare_slots, + gm_state); +} + +/* + * Initialize the Gather Merge. + * + * Reset data structures to ensure they're empty. Then pull at least one + * tuple from leader + each worker (or set its "done" indicator), and set up + * the heap. + */ +static void +gather_merge_init(GatherMergeState *gm_state) +{ + int nreaders = gm_state->nreaders; + bool nowait = true; + int i; + + /* Assert that gather_merge_setup made enough space */ + Assert(nreaders <= castNode(GatherMerge, gm_state->ps.plan)->num_workers); + + /* Reset leader's tuple slot to empty */ + gm_state->gm_slots[0] = NULL; + + /* Reset the tuple slot and tuple array for each worker */ + for (i = 0; i < nreaders; i++) + { + /* Reset tuple array to empty */ + gm_state->gm_tuple_buffers[i].nTuples = 0; + gm_state->gm_tuple_buffers[i].readCounter = 0; + /* Reset done flag to not-done */ + gm_state->gm_tuple_buffers[i].done = false; + /* Ensure output slot is empty */ + ExecClearTuple(gm_state->gm_slots[i + 1]); + } + + /* Reset binary heap to empty */ + binaryheap_reset(gm_state->gm_heap); + + /* + * First, try to read a tuple from each worker (including leader) in + * nowait mode. After this, if not all workers were able to produce a + * tuple (or a "done" indication), then re-read from remaining workers, + * this time using wait mode. Add all live readers (those producing at + * least one tuple) to the heap. + */ +reread: + for (i = 0; i <= nreaders; i++) + { + CHECK_FOR_INTERRUPTS(); + + /* skip this source if already known done */ + if ((i == 0) ? gm_state->need_to_scan_locally : + !gm_state->gm_tuple_buffers[i - 1].done) + { + if (TupIsNull(gm_state->gm_slots[i])) + { + /* Don't have a tuple yet, try to get one */ + if (gather_merge_readnext(gm_state, i, nowait)) + binaryheap_add_unordered(gm_state->gm_heap, + Int32GetDatum(i)); + } + else + { + /* + * We already got at least one tuple from this worker, but + * might as well see if it has any more ready by now. + */ + load_tuple_array(gm_state, i); + } + } + } + + /* need not recheck leader, since nowait doesn't matter for it */ + for (i = 1; i <= nreaders; i++) + { + if (!gm_state->gm_tuple_buffers[i - 1].done && + TupIsNull(gm_state->gm_slots[i])) + { + nowait = false; + goto reread; + } + } + + /* Now heapify the heap. */ + binaryheap_build(gm_state->gm_heap); + + gm_state->gm_initialized = true; +} + +/* + * Clear out the tuple table slot, and any unused pending tuples, + * for each gather merge input. 
+ */ +static void +gather_merge_clear_tuples(GatherMergeState *gm_state) +{ + int i; + + for (i = 0; i < gm_state->nreaders; i++) + { + GMReaderTupleBuffer *tuple_buffer = &gm_state->gm_tuple_buffers[i]; + + while (tuple_buffer->readCounter < tuple_buffer->nTuples) + pfree(tuple_buffer->tuple[tuple_buffer->readCounter++]); + + ExecClearTuple(gm_state->gm_slots[i + 1]); + } +} + +/* + * Read the next tuple for gather merge. + * + * Fetch the sorted tuple out of the heap. + */ +static TupleTableSlot * +gather_merge_getnext(GatherMergeState *gm_state) +{ + int i; + + if (!gm_state->gm_initialized) + { + /* + * First time through: pull the first tuple from each participant, and + * set up the heap. + */ + gather_merge_init(gm_state); + } + else + { + /* + * Otherwise, pull the next tuple from whichever participant we + * returned from last time, and reinsert that participant's index into + * the heap, because it might now compare differently against the + * other elements of the heap. + */ + i = DatumGetInt32(binaryheap_first(gm_state->gm_heap)); + + if (gather_merge_readnext(gm_state, i, false)) + binaryheap_replace_first(gm_state->gm_heap, Int32GetDatum(i)); + else + { + /* reader exhausted, remove it from heap */ + (void) binaryheap_remove_first(gm_state->gm_heap); + } + } + + if (binaryheap_empty(gm_state->gm_heap)) + { + /* All the queues are exhausted, and so is the heap */ + gather_merge_clear_tuples(gm_state); + return NULL; + } + else + { + /* Return next tuple from whichever participant has the leading one */ + i = DatumGetInt32(binaryheap_first(gm_state->gm_heap)); + return gm_state->gm_slots[i]; + } +} + +/* + * Read tuple(s) for given reader in nowait mode, and load into its tuple + * array, until we have MAX_TUPLE_STORE of them or would have to block. + */ +static void +load_tuple_array(GatherMergeState *gm_state, int reader) +{ + GMReaderTupleBuffer *tuple_buffer; + int i; + + /* Don't do anything if this is the leader. */ + if (reader == 0) + return; + + tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1]; + + /* If there's nothing in the array, reset the counters to zero. */ + if (tuple_buffer->nTuples == tuple_buffer->readCounter) + tuple_buffer->nTuples = tuple_buffer->readCounter = 0; + + /* Try to fill additional slots in the array. */ + for (i = tuple_buffer->nTuples; i < MAX_TUPLE_STORE; i++) + { + MinimalTuple tuple; + + tuple = gm_readnext_tuple(gm_state, + reader, + true, + &tuple_buffer->done); + if (!tuple) + break; + tuple_buffer->tuple[i] = tuple; + tuple_buffer->nTuples++; + } +} + +/* + * Store the next tuple for a given reader into the appropriate slot. + * + * Returns true if successful, false if not (either reader is exhausted, + * or we didn't want to wait for a tuple). Sets done flag if reader + * is found to be exhausted. + */ +static bool +gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait) +{ + GMReaderTupleBuffer *tuple_buffer; + MinimalTuple tup; + + /* + * If we're being asked to generate a tuple from the leader, then we just + * call ExecProcNode as normal to produce one. + */ + if (reader == 0) + { + if (gm_state->need_to_scan_locally) + { + PlanState *outerPlan = outerPlanState(gm_state); + TupleTableSlot *outerTupleSlot; + EState *estate = gm_state->ps.state; + + /* Install our DSA area while executing the plan. */ + estate->es_query_dsa = gm_state->pei ? 
gm_state->pei->area : NULL; + outerTupleSlot = ExecProcNode(outerPlan); + estate->es_query_dsa = NULL; + + if (!TupIsNull(outerTupleSlot)) + { + gm_state->gm_slots[0] = outerTupleSlot; + return true; + } + /* need_to_scan_locally serves as "done" flag for leader */ + gm_state->need_to_scan_locally = false; + } + return false; + } + + /* Otherwise, check the state of the relevant tuple buffer. */ + tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1]; + + if (tuple_buffer->nTuples > tuple_buffer->readCounter) + { + /* Return any tuple previously read that is still buffered. */ + tup = tuple_buffer->tuple[tuple_buffer->readCounter++]; + } + else if (tuple_buffer->done) + { + /* Reader is known to be exhausted. */ + return false; + } + else + { + /* Read and buffer next tuple. */ + tup = gm_readnext_tuple(gm_state, + reader, + nowait, + &tuple_buffer->done); + if (!tup) + return false; + + /* + * Attempt to read more tuples in nowait mode and store them in the + * pending-tuple array for the reader. + */ + load_tuple_array(gm_state, reader); + } + + Assert(tup); + + /* Build the TupleTableSlot for the given tuple */ + ExecStoreMinimalTuple(tup, /* tuple to store */ + gm_state->gm_slots[reader], /* slot in which to + * store the tuple */ + true); /* pfree tuple when done with it */ + + return true; +} + +/* + * Attempt to read a tuple from given worker. + */ +static MinimalTuple +gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait, + bool *done) +{ + TupleQueueReader *reader; + MinimalTuple tup; + + /* Check for async events, particularly messages from workers. */ + CHECK_FOR_INTERRUPTS(); + + /* + * Attempt to read a tuple. + * + * Note that TupleQueueReaderNext will just return NULL for a worker which + * fails to initialize. We'll treat that worker as having produced no + * tuples; WaitForParallelWorkersToFinish will error out when we get + * there. + */ + reader = gm_state->reader[nreader - 1]; + tup = TupleQueueReaderNext(reader, nowait, done); + + /* + * Since we'll be buffering these across multiple calls, we need to make a + * copy. + */ + return tup ? heap_copy_minimal_tuple(tup) : NULL; +} + +/* + * We have one slot for each item in the heap array. We use SlotNumber + * to store slot indexes. This doesn't actually provide any formal + * type-safety, but it makes the code more self-documenting. + */ +typedef int32 SlotNumber; + +/* + * Compare the tuples in the two given slots. 
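A note on the comparator that follows: the generic binary heap (src/backend/lib/binaryheap.c) keeps the element its comparator ranks highest at the top, so merge-style consumers invert the result to surface the smallest sort key first. The self-contained sketch below shows the underlying k-way merge idea with plain integer sources and a linear scan for the minimum head; the executor performs the same selection through the heap, applying this comparator to the slots named by the heap entries.

    #include <stdio.h>

    #define NSOURCES 3

    /*
     * Three already-sorted inputs standing in for the leader and two workers.
     * The merge repeatedly emits the smallest current head, which is exactly
     * the decision heap_compare_slots drives (via the binary heap) for tuples.
     */
    static const int inputs[NSOURCES][4] = {
        {1, 4, 7, 9},
        {2, 3, 8, 10},
        {5, 6, 11, 12},
    };

    int
    main(void)
    {
        int         pos[NSOURCES] = {0, 0, 0};  /* per-source read position */

        for (;;)
        {
            int         best = -1;

            /* Find the source whose current head compares smallest. */
            for (int i = 0; i < NSOURCES; i++)
            {
                if (pos[i] >= 4)
                    continue;       /* source exhausted */
                if (best < 0 || inputs[i][pos[i]] < inputs[best][pos[best]])
                    best = i;
            }
            if (best < 0)
                break;              /* all sources exhausted */

            printf("%d ", inputs[best][pos[best]]);
            pos[best]++;            /* advance only the source we consumed */
        }
        printf("\n");
        return 0;
    }

With k sources, the heap brings the per-tuple selection cost down from the O(k) scan in this sketch to O(log k).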
+ */ +static int32 +heap_compare_slots(Datum a, Datum b, void *arg) +{ + GatherMergeState *node = (GatherMergeState *) arg; + SlotNumber slot1 = DatumGetInt32(a); + SlotNumber slot2 = DatumGetInt32(b); + + TupleTableSlot *s1 = node->gm_slots[slot1]; + TupleTableSlot *s2 = node->gm_slots[slot2]; + int nkey; + + Assert(!TupIsNull(s1)); + Assert(!TupIsNull(s2)); + + for (nkey = 0; nkey < node->gm_nkeys; nkey++) + { + SortSupport sortKey = node->gm_sortkeys + nkey; + AttrNumber attno = sortKey->ssup_attno; + Datum datum1, + datum2; + bool isNull1, + isNull2; + int compare; + + datum1 = slot_getattr(s1, attno, &isNull1); + datum2 = slot_getattr(s2, attno, &isNull2); + + compare = ApplySortComparator(datum1, isNull1, + datum2, isNull2, + sortKey); + if (compare != 0) + { + INVERT_COMPARE_RESULT(compare); + return compare; + } + } + return 0; +} diff --git a/src/backend/executor/nodeGroup.c b/src/backend/executor/nodeGroup.c new file mode 100644 index 0000000..1721b2a --- /dev/null +++ b/src/backend/executor/nodeGroup.c @@ -0,0 +1,255 @@ +/*------------------------------------------------------------------------- + * + * nodeGroup.c + * Routines to handle group nodes (used for queries with GROUP BY clause). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * DESCRIPTION + * The Group node is designed for handling queries with a GROUP BY clause. + * Its outer plan must deliver tuples that are sorted in the order + * specified by the grouping columns (ie. tuples from the same group are + * consecutive). That way, we just have to compare adjacent tuples to + * locate group boundaries. + * + * IDENTIFICATION + * src/backend/executor/nodeGroup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeGroup.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* + * ExecGroup - + * + * Return one tuple for each group of matching input tuples. + */ +static TupleTableSlot * +ExecGroup(PlanState *pstate) +{ + GroupState *node = castNode(GroupState, pstate); + ExprContext *econtext; + TupleTableSlot *firsttupleslot; + TupleTableSlot *outerslot; + + CHECK_FOR_INTERRUPTS(); + + /* + * get state info from node + */ + if (node->grp_done) + return NULL; + econtext = node->ss.ps.ps_ExprContext; + + /* + * The ScanTupleSlot holds the (copied) first tuple of each group. + */ + firsttupleslot = node->ss.ss_ScanTupleSlot; + + /* + * We need not call ResetExprContext here because ExecQualAndReset() will + * reset the per-tuple memory context once per input tuple. + */ + + /* + * If first time through, acquire first input tuple and determine whether + * to return it or not. + */ + if (TupIsNull(firsttupleslot)) + { + outerslot = ExecProcNode(outerPlanState(node)); + if (TupIsNull(outerslot)) + { + /* empty input, so return nothing */ + node->grp_done = true; + return NULL; + } + /* Copy tuple into firsttupleslot */ + ExecCopySlot(firsttupleslot, outerslot); + + /* + * Set it up as input for qual test and projection. The expressions + * will access the input tuple as varno OUTER. + */ + econtext->ecxt_outertuple = firsttupleslot; + + /* + * Check the qual (HAVING clause); if the group does not match, ignore + * it and fall into scan loop. + */ + if (ExecQual(node->ss.ps.qual, econtext)) + { + /* + * Form and return a projection tuple using the first input tuple. 
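Setting the executor machinery aside for a moment, the Group node's strategy is easiest to see on a toy input: because rows arrive sorted on the grouping key, a group boundary is simply the first row that compares different from the current group's first row. The sketch below applies that idea to plain integers; it is only an illustration of the adjacent-comparison approach, not the slot-and-qual code in this file.

    #include <stdio.h>

    int
    main(void)
    {
        /* Sorted on the grouping key, so equal keys are adjacent. */
        int         keys[] = {1, 1, 2, 2, 2, 5, 7, 7};
        int         nkeys = sizeof(keys) / sizeof(keys[0]);
        int         i = 0;

        while (i < nkeys)
        {
            int         first = keys[i];    /* first row of the group */

            printf("group key %d\n", first);

            /* Skip the remaining members of the same group. */
            while (i < nkeys && keys[i] == first)
                i++;
        }
        return 0;
    }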
+ */ + return ExecProject(node->ss.ps.ps_ProjInfo); + } + else + InstrCountFiltered1(node, 1); + } + + /* + * This loop iterates once per input tuple group. At the head of the + * loop, we have finished processing the first tuple of the group and now + * need to scan over all the other group members. + */ + for (;;) + { + /* + * Scan over all remaining tuples that belong to this group + */ + for (;;) + { + outerslot = ExecProcNode(outerPlanState(node)); + if (TupIsNull(outerslot)) + { + /* no more groups, so we're done */ + node->grp_done = true; + return NULL; + } + + /* + * Compare with first tuple and see if this tuple is of the same + * group. If so, ignore it and keep scanning. + */ + econtext->ecxt_innertuple = firsttupleslot; + econtext->ecxt_outertuple = outerslot; + if (!ExecQualAndReset(node->eqfunction, econtext)) + break; + } + + /* + * We have the first tuple of the next input group. See if we want to + * return it. + */ + /* Copy tuple, set up as input for qual test and projection */ + ExecCopySlot(firsttupleslot, outerslot); + econtext->ecxt_outertuple = firsttupleslot; + + /* + * Check the qual (HAVING clause); if the group does not match, ignore + * it and loop back to scan the rest of the group. + */ + if (ExecQual(node->ss.ps.qual, econtext)) + { + /* + * Form and return a projection tuple using the first input tuple. + */ + return ExecProject(node->ss.ps.ps_ProjInfo); + } + else + InstrCountFiltered1(node, 1); + } +} + +/* ----------------- + * ExecInitGroup + * + * Creates the run-time information for the group node produced by the + * planner and initializes its outer subtree + * ----------------- + */ +GroupState * +ExecInitGroup(Group *node, EState *estate, int eflags) +{ + GroupState *grpstate; + const TupleTableSlotOps *tts_ops; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + grpstate = makeNode(GroupState); + grpstate->ss.ps.plan = (Plan *) node; + grpstate->ss.ps.state = estate; + grpstate->ss.ps.ExecProcNode = ExecGroup; + grpstate->grp_done = false; + + /* + * create expression context + */ + ExecAssignExprContext(estate, &grpstate->ss.ps); + + /* + * initialize child nodes + */ + outerPlanState(grpstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + tts_ops = ExecGetResultSlotOps(outerPlanState(&grpstate->ss), NULL); + ExecCreateScanSlotFromOuterPlan(estate, &grpstate->ss, tts_ops); + + /* + * Initialize result slot, type and projection. 
+ */ + ExecInitResultTupleSlotTL(&grpstate->ss.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&grpstate->ss.ps, NULL); + + /* + * initialize child expressions + */ + grpstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) grpstate); + + /* + * Precompute fmgr lookup data for inner loop + */ + grpstate->eqfunction = + execTuplesMatchPrepare(ExecGetResultType(outerPlanState(grpstate)), + node->numCols, + node->grpColIdx, + node->grpOperators, + node->grpCollations, + &grpstate->ss.ps); + + return grpstate; +} + +/* ------------------------ + * ExecEndGroup(node) + * + * ----------------------- + */ +void +ExecEndGroup(GroupState *node) +{ + PlanState *outerPlan; + + ExecFreeExprContext(&node->ss.ps); + + /* clean up tuple table */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + +void +ExecReScanGroup(GroupState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + node->grp_done = false; + /* must clear first tuple */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c new file mode 100644 index 0000000..15d8bbe --- /dev/null +++ b/src/backend/executor/nodeHash.c @@ -0,0 +1,3434 @@ +/*------------------------------------------------------------------------- + * + * nodeHash.c + * Routines to hash relations for hashjoin + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeHash.c + * + * See note on parallelism in nodeHashjoin.c. 
+ * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * MultiExecHash - generate an in-memory hash table of the relation + * ExecInitHash - initialize node and subnodes + * ExecEndHash - shutdown node and subnodes + */ + +#include "postgres.h" + +#include <math.h> +#include <limits.h> + +#include "access/htup_details.h" +#include "access/parallel.h" +#include "catalog/pg_statistic.h" +#include "commands/tablespace.h" +#include "executor/execdebug.h" +#include "executor/hashjoin.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "port/pg_bitutils.h" +#include "utils/dynahash.h" +#include "utils/guc.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" + +static void ExecHashIncreaseNumBatches(HashJoinTable hashtable); +static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable); +static void ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable); +static void ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable); +static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, + int mcvsToUse); +static void ExecHashSkewTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue, + int bucketNumber); +static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable); + +static void *dense_alloc(HashJoinTable hashtable, Size size); +static HashJoinTuple ExecParallelHashTupleAlloc(HashJoinTable hashtable, + size_t size, + dsa_pointer *shared); +static void MultiExecPrivateHash(HashState *node); +static void MultiExecParallelHash(HashState *node); +static inline HashJoinTuple ExecParallelHashFirstTuple(HashJoinTable table, + int bucketno); +static inline HashJoinTuple ExecParallelHashNextTuple(HashJoinTable table, + HashJoinTuple tuple); +static inline void ExecParallelHashPushTuple(dsa_pointer_atomic *head, + HashJoinTuple tuple, + dsa_pointer tuple_shared); +static void ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch); +static void ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable); +static void ExecParallelHashRepartitionFirst(HashJoinTable hashtable); +static void ExecParallelHashRepartitionRest(HashJoinTable hashtable); +static HashMemoryChunk ExecParallelHashPopChunkQueue(HashJoinTable table, + dsa_pointer *shared); +static bool ExecParallelHashTuplePrealloc(HashJoinTable hashtable, + int batchno, + size_t size); +static void ExecParallelHashMergeCounters(HashJoinTable hashtable); +static void ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable); + + +/* ---------------------------------------------------------------- + * ExecHash + * + * stub for pro forma compliance + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecHash(PlanState *pstate) +{ + elog(ERROR, "Hash node does not support ExecProcNode call convention"); + return NULL; +} + +/* ---------------------------------------------------------------- + * MultiExecHash + * + * build hash table for hashjoin, doing partitioning if more + * than one batch is required. 
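For orientation before the driver routine itself: the core of the build side is the textbook hash-join build phase, inserting every inner tuple into a chained hash table with a power-of-two number of buckets that the join will later probe. The standalone sketch below shows that shape with integers and a made-up hash function; the real code computes hash values with the join operators' hash support functions, enforces a memory budget, spills overflow tuples to batch files, and maintains the skew table.

    #include <stdio.h>
    #include <stdlib.h>

    #define NBUCKETS 8              /* power of two, as in the real code */

    typedef struct HashEntry
    {
        int         key;
        struct HashEntry *next;     /* chain within a bucket */
    } HashEntry;

    static HashEntry *buckets[NBUCKETS];

    /* Toy hash function; the executor uses the type's hash operator instead. */
    static unsigned int
    hash_int(int key)
    {
        return ((unsigned int) key * 2654435761u) & (NBUCKETS - 1);
    }

    /* Build phase: insert every inner-side key into its bucket's chain. */
    static void
    build(const int *keys, int n)
    {
        for (int i = 0; i < n; i++)
        {
            HashEntry  *e = malloc(sizeof(HashEntry));
            unsigned int b = hash_int(keys[i]);

            e->key = keys[i];
            e->next = buckets[b];
            buckets[b] = e;
        }
    }

    /* Probe phase: look each outer-side key up in the finished table. */
    static int
    probe(int key)
    {
        for (HashEntry *e = buckets[hash_int(key)]; e != NULL; e = e->next)
            if (e->key == key)
                return 1;
        return 0;
    }

    int
    main(void)
    {
        int         inner[] = {1, 5, 7, 12};
        int         outer[] = {5, 6, 12};

        build(inner, 4);
        for (int i = 0; i < 3; i++)
            printf("outer key %d %s\n", outer[i],
                   probe(outer[i]) ? "matches" : "has no match");
        return 0;
    }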
+ * ---------------------------------------------------------------- + */ +Node * +MultiExecHash(HashState *node) +{ + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStartNode(node->ps.instrument); + + if (node->parallel_state != NULL) + MultiExecParallelHash(node); + else + MultiExecPrivateHash(node); + + /* must provide our own instrumentation support */ + if (node->ps.instrument) + InstrStopNode(node->ps.instrument, node->hashtable->partialTuples); + + /* + * We do not return the hash table directly because it's not a subtype of + * Node, and so would violate the MultiExecProcNode API. Instead, our + * parent Hashjoin node is expected to know how to fish it out of our node + * state. Ugly but not really worth cleaning up, since Hashjoin knows + * quite a bit more about Hash besides that. + */ + return NULL; +} + +/* ---------------------------------------------------------------- + * MultiExecPrivateHash + * + * parallel-oblivious version, building a backend-private + * hash table and (if necessary) batch files. + * ---------------------------------------------------------------- + */ +static void +MultiExecPrivateHash(HashState *node) +{ + PlanState *outerNode; + List *hashkeys; + HashJoinTable hashtable; + TupleTableSlot *slot; + ExprContext *econtext; + uint32 hashvalue; + + /* + * get state info from node + */ + outerNode = outerPlanState(node); + hashtable = node->hashtable; + + /* + * set expression context + */ + hashkeys = node->hashkeys; + econtext = node->ps.ps_ExprContext; + + /* + * Get all tuples from the node below the Hash node and insert into the + * hash table (or temp files). + */ + for (;;) + { + slot = ExecProcNode(outerNode); + if (TupIsNull(slot)) + break; + /* We have to compute the hash value */ + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, hashkeys, + false, hashtable->keepNulls, + &hashvalue)) + { + int bucketNumber; + + bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue); + if (bucketNumber != INVALID_SKEW_BUCKET_NO) + { + /* It's a skew tuple, so put it into that hash table */ + ExecHashSkewTableInsert(hashtable, slot, hashvalue, + bucketNumber); + hashtable->skewTuples += 1; + } + else + { + /* Not subject to skew optimization, so insert normally */ + ExecHashTableInsert(hashtable, slot, hashvalue); + } + hashtable->totalTuples += 1; + } + } + + /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */ + if (hashtable->nbuckets != hashtable->nbuckets_optimal) + ExecHashIncreaseNumBuckets(hashtable); + + /* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */ + hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple); + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + + hashtable->partialTuples = hashtable->totalTuples; +} + +/* ---------------------------------------------------------------- + * MultiExecParallelHash + * + * parallel-aware version, building a shared hash table and + * (if necessary) batch files using the combined effort of + * a set of co-operating backends. 
+ * ---------------------------------------------------------------- + */ +static void +MultiExecParallelHash(HashState *node) +{ + ParallelHashJoinState *pstate; + PlanState *outerNode; + List *hashkeys; + HashJoinTable hashtable; + TupleTableSlot *slot; + ExprContext *econtext; + uint32 hashvalue; + Barrier *build_barrier; + int i; + + /* + * get state info from node + */ + outerNode = outerPlanState(node); + hashtable = node->hashtable; + + /* + * set expression context + */ + hashkeys = node->hashkeys; + econtext = node->ps.ps_ExprContext; + + /* + * Synchronize the parallel hash table build. At this stage we know that + * the shared hash table has been or is being set up by + * ExecHashTableCreate(), but we don't know if our peers have returned + * from there or are here in MultiExecParallelHash(), and if so how far + * through they are. To find out, we check the build_barrier phase then + * and jump to the right step in the build algorithm. + */ + pstate = hashtable->parallel_state; + build_barrier = &pstate->build_barrier; + Assert(BarrierPhase(build_barrier) >= PHJ_BUILD_ALLOCATING); + switch (BarrierPhase(build_barrier)) + { + case PHJ_BUILD_ALLOCATING: + + /* + * Either I just allocated the initial hash table in + * ExecHashTableCreate(), or someone else is doing that. Either + * way, wait for everyone to arrive here so we can proceed. + */ + BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ALLOCATE); + /* Fall through. */ + + case PHJ_BUILD_HASHING_INNER: + + /* + * It's time to begin hashing, or if we just arrived here then + * hashing is already underway, so join in that effort. While + * hashing we have to be prepared to help increase the number of + * batches or buckets at any time, and if we arrived here when + * that was already underway we'll have to help complete that work + * immediately so that it's safe to access batches and buckets + * below. + */ + if (PHJ_GROW_BATCHES_PHASE(BarrierAttach(&pstate->grow_batches_barrier)) != + PHJ_GROW_BATCHES_ELECTING) + ExecParallelHashIncreaseNumBatches(hashtable); + if (PHJ_GROW_BUCKETS_PHASE(BarrierAttach(&pstate->grow_buckets_barrier)) != + PHJ_GROW_BUCKETS_ELECTING) + ExecParallelHashIncreaseNumBuckets(hashtable); + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + for (;;) + { + slot = ExecProcNode(outerNode); + if (TupIsNull(slot)) + break; + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, hashkeys, + false, hashtable->keepNulls, + &hashvalue)) + ExecParallelHashTableInsert(hashtable, slot, hashvalue); + hashtable->partialTuples++; + } + + /* + * Make sure that any tuples we wrote to disk are visible to + * others before anyone tries to load them. + */ + for (i = 0; i < hashtable->nbatch; ++i) + sts_end_write(hashtable->batches[i].inner_tuples); + + /* + * Update shared counters. We need an accurate total tuple count + * to control the empty table optimization. + */ + ExecParallelHashMergeCounters(hashtable); + + BarrierDetach(&pstate->grow_buckets_barrier); + BarrierDetach(&pstate->grow_batches_barrier); + + /* + * Wait for everyone to finish building and flushing files and + * counters. + */ + if (BarrierArriveAndWait(build_barrier, + WAIT_EVENT_HASH_BUILD_HASH_INNER)) + { + /* + * Elect one backend to disable any further growth. Batches + * are now fixed. 
While building them we made sure they'd fit + * in our memory budget when we load them back in later (or we + * tried to do that and gave up because we detected extreme + * skew). + */ + pstate->growth = PHJ_GROWTH_DISABLED; + } + } + + /* + * We're not yet attached to a batch. We all agree on the dimensions and + * number of inner tuples (for the empty table optimization). + */ + hashtable->curbatch = -1; + hashtable->nbuckets = pstate->nbuckets; + hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->totalTuples = pstate->total_tuples; + ExecParallelHashEnsureBatchAccessors(hashtable); + + /* + * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE + * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't + * there already). + */ + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || + BarrierPhase(build_barrier) == PHJ_BUILD_DONE); +} + +/* ---------------------------------------------------------------- + * ExecInitHash + * + * Init routine for Hash node + * ---------------------------------------------------------------- + */ +HashState * +ExecInitHash(Hash *node, EState *estate, int eflags) +{ + HashState *hashstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + hashstate = makeNode(HashState); + hashstate->ps.plan = (Plan *) node; + hashstate->ps.state = estate; + hashstate->ps.ExecProcNode = ExecHash; + hashstate->hashtable = NULL; + hashstate->hashkeys = NIL; /* will be set by parent HashJoin */ + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &hashstate->ps); + + /* + * initialize child nodes + */ + outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * initialize our result slot and type. No need to build projection + * because this node doesn't do projections. + */ + ExecInitResultTupleSlotTL(&hashstate->ps, &TTSOpsMinimalTuple); + hashstate->ps.ps_ProjInfo = NULL; + + /* + * initialize child expressions + */ + Assert(node->plan.qual == NIL); + hashstate->hashkeys = + ExecInitExprList(node->hashkeys, (PlanState *) hashstate); + + return hashstate; +} + +/* --------------------------------------------------------------- + * ExecEndHash + * + * clean up routine for Hash node + * ---------------------------------------------------------------- + */ +void +ExecEndHash(HashState *node) +{ + PlanState *outerPlan; + + /* + * free exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * shut down the subplan + */ + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + + +/* ---------------------------------------------------------------- + * ExecHashTableCreate + * + * create an empty hashtable data structure for hashjoin. + * ---------------------------------------------------------------- + */ +HashJoinTable +ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, bool keepNulls) +{ + Hash *node; + HashJoinTable hashtable; + Plan *outerNode; + size_t space_allowed; + int nbuckets; + int nbatch; + double rows; + int num_skew_mcvs; + int log2_nbuckets; + int nkeys; + int i; + ListCell *ho; + ListCell *hc; + MemoryContext oldcxt; + + /* + * Get information about the size of the relation to be hashed (it's the + * "outer" subtree of this node, but the inner relation of the hashjoin). + * Compute the appropriate size of the hash table. 
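As a rough, self-contained illustration of the sizing arithmetic performed here by ExecChooseHashTableSize (defined further down): estimate a per-tuple footprint, pick a power-of-two bucket count aiming at about one tuple per bucket, and if the whole build side plus the bucket array will not fit the memory budget, size the buckets for one full in-memory batch and round the required batch count up to a power of two. The overhead constant and the 4MB budget below are made up for the example; the real function also accounts for skew buckets, parallel workers, and allocator limits.

    #include <stdio.h>
    #include <math.h>

    /* Round a positive value up to the next power of two. */
    static size_t
    next_pow2(size_t v)
    {
        size_t      p = 1;

        while (p < v)
            p <<= 1;
        return p;
    }

    int
    main(void)
    {
        double      ntuples = 1e6;  /* planner's row estimate (hypothetical) */
        int         tupwidth = 40;  /* average payload width in bytes */
        size_t      mem_budget = 4 * 1024 * 1024;   /* stand-in for hash_mem */
        size_t      tup_overhead = 24;  /* made-up per-tuple header overhead */

        size_t      tupsize = tup_overhead + tupwidth;
        double      inner_bytes = ntuples * tupsize;

        /* Start with ~1 tuple per bucket, capped by how many pointers fit. */
        size_t      max_pointers = mem_budget / sizeof(void *);
        size_t      nbuckets = next_pow2((size_t) ntuples);

        if (nbuckets > max_pointers)
            nbuckets = max_pointers;    /* already a power of 2 for these numbers */

        size_t      bucket_bytes = nbuckets * sizeof(void *);
        size_t      nbatch = 1;

        if (inner_bytes + bucket_bytes > mem_budget)
        {
            /* Multiple batches: re-aim buckets at one "full" in-memory batch... */
            size_t      bucket_size = tupsize + sizeof(void *);

            nbuckets = next_pow2(mem_budget / bucket_size);
            bucket_bytes = nbuckets * sizeof(void *);

            /* ...and round the required batch count up to a power of two. */
            nbatch = next_pow2((size_t) ceil(inner_bytes /
                                             (double) (mem_budget - bucket_bytes)));
        }

        printf("nbuckets = %zu, nbatch = %zu\n", nbuckets, nbatch);
        return 0;
    }

With these inputs the sketch prints nbuckets = 65536, nbatch = 32.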
+ */ + node = (Hash *) state->ps.plan; + outerNode = outerPlan(node); + + /* + * If this is shared hash table with a partial plan, then we can't use + * outerNode->plan_rows to estimate its size. We need an estimate of the + * total number of rows across all copies of the partial plan. + */ + rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows; + + ExecChooseHashTableSize(rows, outerNode->plan_width, + OidIsValid(node->skewTable), + state->parallel_state != NULL, + state->parallel_state != NULL ? + state->parallel_state->nparticipants - 1 : 0, + &space_allowed, + &nbuckets, &nbatch, &num_skew_mcvs); + + /* nbuckets must be a power of 2 */ + log2_nbuckets = my_log2(nbuckets); + Assert(nbuckets == (1 << log2_nbuckets)); + + /* + * Initialize the hash table control block. + * + * The hashtable control block is just palloc'd from the executor's + * per-query memory context. Everything else should be kept inside the + * subsidiary hashCxt or batchCxt. + */ + hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData)); + hashtable->nbuckets = nbuckets; + hashtable->nbuckets_original = nbuckets; + hashtable->nbuckets_optimal = nbuckets; + hashtable->log2_nbuckets = log2_nbuckets; + hashtable->log2_nbuckets_optimal = log2_nbuckets; + hashtable->buckets.unshared = NULL; + hashtable->keepNulls = keepNulls; + hashtable->skewEnabled = false; + hashtable->skewBucket = NULL; + hashtable->skewBucketLen = 0; + hashtable->nSkewBuckets = 0; + hashtable->skewBucketNums = NULL; + hashtable->nbatch = nbatch; + hashtable->curbatch = 0; + hashtable->nbatch_original = nbatch; + hashtable->nbatch_outstart = nbatch; + hashtable->growEnabled = true; + hashtable->totalTuples = 0; + hashtable->partialTuples = 0; + hashtable->skewTuples = 0; + hashtable->innerBatchFile = NULL; + hashtable->outerBatchFile = NULL; + hashtable->spaceUsed = 0; + hashtable->spacePeak = 0; + hashtable->spaceAllowed = space_allowed; + hashtable->spaceUsedSkew = 0; + hashtable->spaceAllowedSkew = + hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100; + hashtable->chunks = NULL; + hashtable->current_chunk = NULL; + hashtable->parallel_state = state->parallel_state; + hashtable->area = state->ps.state->es_query_dsa; + hashtable->batches = NULL; + +#ifdef HJDEBUG + printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n", + hashtable, nbatch, nbuckets); +#endif + + /* + * Create temporary memory contexts in which to keep the hashtable working + * storage. See notes in executor/hashjoin.h. + */ + hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext, + "HashTableContext", + ALLOCSET_DEFAULT_SIZES); + + hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt, + "HashBatchContext", + ALLOCSET_DEFAULT_SIZES); + + /* Allocate data that will live for the life of the hashjoin */ + + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + /* + * Get info about the hash functions to be used for each hash key. Also + * remember whether the join operators are strict. 
+ */ + nkeys = list_length(hashOperators); + hashtable->outer_hashfunctions = + (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); + hashtable->inner_hashfunctions = + (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); + hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool)); + hashtable->collations = (Oid *) palloc(nkeys * sizeof(Oid)); + i = 0; + forboth(ho, hashOperators, hc, hashCollations) + { + Oid hashop = lfirst_oid(ho); + Oid left_hashfn; + Oid right_hashfn; + + if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn)) + elog(ERROR, "could not find hash function for hash operator %u", + hashop); + fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]); + fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]); + hashtable->hashStrict[i] = op_strict(hashop); + hashtable->collations[i] = lfirst_oid(hc); + i++; + } + + if (nbatch > 1 && hashtable->parallel_state == NULL) + { + /* + * allocate and initialize the file arrays in hashCxt (not needed for + * parallel case which uses shared tuplestores instead of raw files) + */ + hashtable->innerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + /* The files will not be opened until needed... */ + /* ... but make sure we have temp tablespaces established for them */ + PrepareTempTablespaces(); + } + + MemoryContextSwitchTo(oldcxt); + + if (hashtable->parallel_state) + { + ParallelHashJoinState *pstate = hashtable->parallel_state; + Barrier *build_barrier; + + /* + * Attach to the build barrier. The corresponding detach operation is + * in ExecHashTableDetach. Note that we won't attach to the + * batch_barrier for batch 0 yet. We'll attach later and start it out + * in PHJ_BATCH_PROBING phase, because batch 0 is allocated up front + * and then loaded while hashing (the standard hybrid hash join + * algorithm), and we'll coordinate that using build_barrier. + */ + build_barrier = &pstate->build_barrier; + BarrierAttach(build_barrier); + + /* + * So far we have no idea whether there are any other participants, + * and if so, what phase they are working on. The only thing we care + * about at this point is whether someone has already created the + * SharedHashJoinBatch objects and the hash table for batch 0. One + * backend will be elected to do that now if necessary. + */ + if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECTING && + BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECT)) + { + pstate->nbatch = nbatch; + pstate->space_allowed = space_allowed; + pstate->growth = PHJ_GROWTH_OK; + + /* Set up the shared state for coordinating batches. */ + ExecParallelHashJoinSetUpBatches(hashtable, nbatch); + + /* + * Allocate batch 0's hash table up front so we can load it + * directly while hashing. + */ + pstate->nbuckets = nbuckets; + ExecParallelHashTableAlloc(hashtable, 0); + } + + /* + * The next Parallel Hash synchronization point is in + * MultiExecParallelHash(), which will progress it all the way to + * PHJ_BUILD_DONE. The caller must not return control from this + * executor node between now and then. + */ + } + else + { + /* + * Prepare context for the first-scan space allocations; allocate the + * hashbucket array therein, and set each bucket "empty". + */ + MemoryContextSwitchTo(hashtable->batchCxt); + + hashtable->buckets.unshared = (HashJoinTuple *) + palloc0(nbuckets * sizeof(HashJoinTuple)); + + /* + * Set up for skew optimization, if possible and there's a need for + * more than one batch. 
(In a one-batch join, there's no point in + * it.) + */ + if (nbatch > 1) + ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs); + + MemoryContextSwitchTo(oldcxt); + } + + return hashtable; +} + + +/* + * Compute appropriate size for hashtable given the estimated size of the + * relation to be hashed (number of rows and average row width). + * + * This is exported so that the planner's costsize.c can use it. + */ + +/* Target bucket loading (tuples per bucket) */ +#define NTUP_PER_BUCKET 1 + +void +ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, + bool try_combined_hash_mem, + int parallel_workers, + size_t *space_allowed, + int *numbuckets, + int *numbatches, + int *num_skew_mcvs) +{ + int tupsize; + double inner_rel_bytes; + size_t hash_table_bytes; + size_t bucket_bytes; + size_t max_pointers; + int nbatch = 1; + int nbuckets; + double dbuckets; + + /* Force a plausible relation size if no info */ + if (ntuples <= 0.0) + ntuples = 1000.0; + + /* + * Estimate tupsize based on footprint of tuple in hashtable... note this + * does not allow for any palloc overhead. The manipulations of spaceUsed + * don't count palloc overhead either. + */ + tupsize = HJTUPLE_OVERHEAD + + MAXALIGN(SizeofMinimalTupleHeader) + + MAXALIGN(tupwidth); + inner_rel_bytes = ntuples * tupsize; + + /* + * Compute in-memory hashtable size limit from GUCs. + */ + hash_table_bytes = get_hash_memory_limit(); + + /* + * Parallel Hash tries to use the combined hash_mem of all workers to + * avoid the need to batch. If that won't work, it falls back to hash_mem + * per worker and tries to process batches in parallel. + */ + if (try_combined_hash_mem) + { + /* Careful, this could overflow size_t */ + double newlimit; + + newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1); + newlimit = Min(newlimit, (double) SIZE_MAX); + hash_table_bytes = (size_t) newlimit; + } + + *space_allowed = hash_table_bytes; + + /* + * If skew optimization is possible, estimate the number of skew buckets + * that will fit in the memory allowed, and decrement the assumed space + * available for the main hash table accordingly. + * + * We make the optimistic assumption that each skew bucket will contain + * one inner-relation tuple. If that turns out to be low, we will recover + * at runtime by reducing the number of skew buckets. + * + * hashtable->skewBucket will have up to 8 times as many HashSkewBucket + * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash + * will round up to the next power of 2 and then multiply by 4 to reduce + * collisions. 
+ */ + if (useskew) + { + size_t bytes_per_mcv; + size_t skew_mcvs; + + /*---------- + * Compute number of MCVs we could hold in hash_table_bytes + * + * Divisor is: + * size of a hash tuple + + * worst-case size of skewBucket[] per MCV + + * size of skewBucketNums[] entry + + * size of skew bucket struct itself + *---------- + */ + bytes_per_mcv = tupsize + + (8 * sizeof(HashSkewBucket *)) + + sizeof(int) + + SKEW_BUCKET_OVERHEAD; + skew_mcvs = hash_table_bytes / bytes_per_mcv; + + /* + * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as + * not to worry about size_t overflow in the multiplication) + */ + skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100; + + /* Now clamp to integer range */ + skew_mcvs = Min(skew_mcvs, INT_MAX); + + *num_skew_mcvs = (int) skew_mcvs; + + /* Reduce hash_table_bytes by the amount needed for the skew table */ + if (skew_mcvs > 0) + hash_table_bytes -= skew_mcvs * bytes_per_mcv; + } + else + *num_skew_mcvs = 0; + + /* + * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when + * memory is filled, assuming a single batch; but limit the value so that + * the pointer arrays we'll try to allocate do not exceed hash_table_bytes + * nor MaxAllocSize. + * + * Note that both nbuckets and nbatch must be powers of 2 to make + * ExecHashGetBucketAndBatch fast. + */ + max_pointers = hash_table_bytes / sizeof(HashJoinTuple); + max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple)); + /* If max_pointers isn't a power of 2, must round it down to one */ + max_pointers = pg_prevpower2_size_t(max_pointers); + + /* Also ensure we avoid integer overflow in nbatch and nbuckets */ + /* (this step is redundant given the current value of MaxAllocSize) */ + max_pointers = Min(max_pointers, INT_MAX / 2 + 1); + + dbuckets = ceil(ntuples / NTUP_PER_BUCKET); + dbuckets = Min(dbuckets, max_pointers); + nbuckets = (int) dbuckets; + /* don't let nbuckets be really small, though ... */ + nbuckets = Max(nbuckets, 1024); + /* ... and force it to be a power of 2. */ + nbuckets = pg_nextpower2_32(nbuckets); + + /* + * If there's not enough space to store the projected number of tuples and + * the required bucket headers, we will need multiple batches. + */ + bucket_bytes = sizeof(HashJoinTuple) * nbuckets; + if (inner_rel_bytes + bucket_bytes > hash_table_bytes) + { + /* We'll need multiple batches */ + size_t sbuckets; + double dbatch; + int minbatch; + size_t bucket_size; + + /* + * If Parallel Hash with combined hash_mem would still need multiple + * batches, we'll have to fall back to regular hash_mem budget. + */ + if (try_combined_hash_mem) + { + ExecChooseHashTableSize(ntuples, tupwidth, useskew, + false, parallel_workers, + space_allowed, + numbuckets, + numbatches, + num_skew_mcvs); + return; + } + + /* + * Estimate the number of buckets we'll want to have when hash_mem is + * entirely full. Each bucket will contain a bucket pointer plus + * NTUP_PER_BUCKET tuples, whose projected size already includes + * overhead for the hash code, pointer to the next tuple, etc. + */ + bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple)); + sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size); + sbuckets = Min(sbuckets, max_pointers); + nbuckets = (int) sbuckets; + nbuckets = pg_nextpower2_32(nbuckets); + bucket_bytes = nbuckets * sizeof(HashJoinTuple); + + /* + * Buckets are simple pointers to hashjoin tuples, while tupsize + * includes the pointer, hash code, and MinimalTupleData. 
So buckets + * should never really exceed 25% of hash_mem (even for + * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not + * 2^N bytes, where we might get more because of doubling. So let's + * look for 50% here. + */ + Assert(bucket_bytes <= hash_table_bytes / 2); + + /* Calculate required number of batches. */ + dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes)); + dbatch = Min(dbatch, max_pointers); + minbatch = (int) dbatch; + nbatch = pg_nextpower2_32(Max(2, minbatch)); + } + + Assert(nbuckets > 0); + Assert(nbatch > 0); + + *numbuckets = nbuckets; + *numbatches = nbatch; +} + + +/* ---------------------------------------------------------------- + * ExecHashTableDestroy + * + * destroy a hash table + * ---------------------------------------------------------------- + */ +void +ExecHashTableDestroy(HashJoinTable hashtable) +{ + int i; + + /* + * Make sure all the temp files are closed. We skip batch 0, since it + * can't have any temp files (and the arrays might not even exist if + * nbatch is only 1). Parallel hash joins don't use these files. + */ + if (hashtable->innerBatchFile != NULL) + { + for (i = 1; i < hashtable->nbatch; i++) + { + if (hashtable->innerBatchFile[i]) + BufFileClose(hashtable->innerBatchFile[i]); + if (hashtable->outerBatchFile[i]) + BufFileClose(hashtable->outerBatchFile[i]); + } + } + + /* Release working memory (batchCxt is a child, so it goes away too) */ + MemoryContextDelete(hashtable->hashCxt); + + /* And drop the control block */ + pfree(hashtable); +} + +/* + * ExecHashIncreaseNumBatches + * increase the original number of batches in order to reduce + * current memory consumption + */ +static void +ExecHashIncreaseNumBatches(HashJoinTable hashtable) +{ + int oldnbatch = hashtable->nbatch; + int curbatch = hashtable->curbatch; + int nbatch; + MemoryContext oldcxt; + long ninmemory; + long nfreed; + HashMemoryChunk oldchunks; + + /* do nothing if we've decided to shut off growth */ + if (!hashtable->growEnabled) + return; + + /* safety check to avoid overflow */ + if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2))) + return; + + nbatch = oldnbatch * 2; + Assert(nbatch > 1); + +#ifdef HJDEBUG + printf("Hashjoin %p: increasing nbatch to %d because space = %zu\n", + hashtable, nbatch, hashtable->spaceUsed); +#endif + + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + if (hashtable->innerBatchFile == NULL) + { + /* we had no file arrays before */ + hashtable->innerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); + /* time to establish the temp tablespaces, too */ + PrepareTempTablespaces(); + } + else + { + /* enlarge arrays and zero out added entries */ + hashtable->innerBatchFile = (BufFile **) + repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *)); + hashtable->outerBatchFile = (BufFile **) + repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *)); + MemSet(hashtable->innerBatchFile + oldnbatch, 0, + (nbatch - oldnbatch) * sizeof(BufFile *)); + MemSet(hashtable->outerBatchFile + oldnbatch, 0, + (nbatch - oldnbatch) * sizeof(BufFile *)); + } + + MemoryContextSwitchTo(oldcxt); + + hashtable->nbatch = nbatch; + + /* + * Scan through the existing hash table entries and dump out any that are + * no longer of the current batch. + */ + ninmemory = nfreed = 0; + + /* If know we need to resize nbuckets, we can do it while rebatching. 
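For orientation, the chunk rescan below relies on how a tuple's bucket and batch are carved out of its 32-bit hash value: with power-of-two counts, the bucket comes from the low bits and the batch from bits above them, so doubling nbatch merely adds one more batch bit, and a tuple's batch number can stay the same or move later, never earlier (hence the Assert(batchno > curbatch) on the dump-out path). The sketch uses a plain shift-and-mask split to show the effect; ExecHashGetBucketAndBatch is the authoritative implementation and mixes the bits slightly differently.

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Split a hash value into (bucketno, batchno) for power-of-two bucket and
     * batch counts: low bits select the bucket, the bits just above them
     * select the batch.  Illustrative only.
     */
    static void
    get_bucket_and_batch(uint32_t hashvalue, int log2_nbuckets,
                         uint32_t nbuckets, uint32_t nbatch,
                         uint32_t *bucketno, uint32_t *batchno)
    {
        *bucketno = hashvalue & (nbuckets - 1);
        *batchno = (hashvalue >> log2_nbuckets) & (nbatch - 1);
    }

    int
    main(void)
    {
        uint32_t    hash = 0xDEADBEEF;
        uint32_t    bucket,
                    batch_before,
                    batch_after;

        /* 1024 buckets; 4 batches before growth, 8 after doubling */
        get_bucket_and_batch(hash, 10, 1024, 4, &bucket, &batch_before);
        get_bucket_and_batch(hash, 10, 1024, 8, &bucket, &batch_after);

        /* Doubling nbatch adds one bit: the batch stays put or moves later. */
        printf("bucket %u: batch %u -> %u\n", bucket, batch_before, batch_after);
        return 0;
    }

With the values in main(), the batch number moves from 3 to 7 when nbatch doubles from 4 to 8, while the bucket stays 751.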
*/ + if (hashtable->nbuckets_optimal != hashtable->nbuckets) + { + /* we never decrease the number of buckets */ + Assert(hashtable->nbuckets_optimal > hashtable->nbuckets); + + hashtable->nbuckets = hashtable->nbuckets_optimal; + hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal; + + hashtable->buckets.unshared = + repalloc(hashtable->buckets.unshared, + sizeof(HashJoinTuple) * hashtable->nbuckets); + } + + /* + * We will scan through the chunks directly, so that we can reset the + * buckets now and not have to keep track which tuples in the buckets have + * already been processed. We will free the old chunks as we go. + */ + memset(hashtable->buckets.unshared, 0, + sizeof(HashJoinTuple) * hashtable->nbuckets); + oldchunks = hashtable->chunks; + hashtable->chunks = NULL; + + /* so, let's scan through the old chunks, and all tuples in each chunk */ + while (oldchunks != NULL) + { + HashMemoryChunk nextchunk = oldchunks->next.unshared; + + /* position within the buffer (up to oldchunks->used) */ + size_t idx = 0; + + /* process all tuples stored in this chunk (and then free it) */ + while (idx < oldchunks->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(oldchunks) + idx); + MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple); + int hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len); + int bucketno; + int batchno; + + ninmemory++; + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + + if (batchno == curbatch) + { + /* keep tuple in memory - copy it into the new chunk */ + HashJoinTuple copyTuple; + + copyTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize); + memcpy(copyTuple, hashTuple, hashTupleSize); + + /* and add it back to the appropriate bucket */ + copyTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = copyTuple; + } + else + { + /* dump it out */ + Assert(batchno > curbatch); + ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple), + hashTuple->hashvalue, + &hashtable->innerBatchFile[batchno]); + + hashtable->spaceUsed -= hashTupleSize; + nfreed++; + } + + /* next tuple in this chunk */ + idx += MAXALIGN(hashTupleSize); + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + + /* we're done with this chunk - free it and proceed to the next one */ + pfree(oldchunks); + oldchunks = nextchunk; + } + +#ifdef HJDEBUG + printf("Hashjoin %p: freed %ld of %ld tuples, space now %zu\n", + hashtable, nfreed, ninmemory, hashtable->spaceUsed); +#endif + + /* + * If we dumped out either all or none of the tuples in the table, disable + * further expansion of nbatch. This situation implies that we have + * enough tuples of identical hashvalues to overflow spaceAllowed. + * Increasing nbatch will not fix it since there's no way to subdivide the + * group any more finely. We have to just gut it out and hope the server + * has enough RAM. + */ + if (nfreed == 0 || nfreed == ninmemory) + { + hashtable->growEnabled = false; +#ifdef HJDEBUG + printf("Hashjoin %p: disabling further increase of nbatch\n", + hashtable); +#endif + } +} + +/* + * ExecParallelHashIncreaseNumBatches + * Every participant attached to grow_batches_barrier must run this + * function when it observes growth == PHJ_GROWTH_NEED_MORE_BATCHES. 
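The phase machine below repeatedly uses an "everyone arrives, exactly one participant is elected to do the serial step" idiom, with BarrierArriveAndWait returning true in just one backend. For readers who know POSIX threads, the hypothetical sketch below shows the same idiom with pthread_barrier_wait, which hands PTHREAD_BARRIER_SERIAL_THREAD to a single thread; the executor's Barrier additionally tracks a phase number so that participants attaching late can tell which step is in progress and join in correctly.

    #include <pthread.h>
    #include <stdio.h>

    #define NPARTICIPANTS 4

    static pthread_barrier_t barrier;
    static int  shared_setup_done = 0;

    static void *
    participant(void *arg)
    {
        int         id = *(int *) arg;

        /* Everyone arrives; exactly one thread is "elected" to do the setup. */
        if (pthread_barrier_wait(&barrier) == PTHREAD_BARRIER_SERIAL_THREAD)
        {
            shared_setup_done = 1;  /* the one-time, serial step */
            printf("participant %d was elected to do the setup\n", id);
        }

        /* A second wait ensures no one proceeds before the setup is visible. */
        pthread_barrier_wait(&barrier);

        printf("participant %d continues, setup_done=%d\n", id, shared_setup_done);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t   threads[NPARTICIPANTS];
        int         ids[NPARTICIPANTS];

        pthread_barrier_init(&barrier, NULL, NPARTICIPANTS);
        for (int i = 0; i < NPARTICIPANTS; i++)
        {
            ids[i] = i;
            pthread_create(&threads[i], NULL, participant, &ids[i]);
        }
        for (int i = 0; i < NPARTICIPANTS; i++)
            pthread_join(threads[i], NULL);
        pthread_barrier_destroy(&barrier);
        return 0;
    }

Build with cc -pthread; the second wait guarantees no participant reads the shared state before the elected one has finished the setup.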
+ */ +static void +ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + + /* + * It's unlikely, but we need to be prepared for new participants to show + * up while we're in the middle of this operation so we need to switch on + * barrier phase here. + */ + switch (PHJ_GROW_BATCHES_PHASE(BarrierPhase(&pstate->grow_batches_barrier))) + { + case PHJ_GROW_BATCHES_ELECTING: + + /* + * Elect one participant to prepare to grow the number of batches. + * This involves reallocating or resetting the buckets of batch 0 + * in preparation for all participants to begin repartitioning the + * tuples. + */ + if (BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_ELECT)) + { + dsa_pointer_atomic *buckets; + ParallelHashJoinBatch *old_batch0; + int new_nbatch; + int i; + + /* Move the old batch out of the way. */ + old_batch0 = hashtable->batches[0].shared; + pstate->old_batches = pstate->batches; + pstate->old_nbatch = hashtable->nbatch; + pstate->batches = InvalidDsaPointer; + + /* Free this backend's old accessors. */ + ExecParallelHashCloseBatchAccessors(hashtable); + + /* Figure out how many batches to use. */ + if (hashtable->nbatch == 1) + { + /* + * We are going from single-batch to multi-batch. We need + * to switch from one large combined memory budget to the + * regular hash_mem budget. + */ + pstate->space_allowed = get_hash_memory_limit(); + + /* + * The combined hash_mem of all participants wasn't + * enough. Therefore one batch per participant would be + * approximately equivalent and would probably also be + * insufficient. So try two batches per participant, + * rounded up to a power of two. + */ + new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2); + } + else + { + /* + * We were already multi-batched. Try doubling the number + * of batches. + */ + new_nbatch = hashtable->nbatch * 2; + } + + /* Allocate new larger generation of batches. */ + Assert(hashtable->nbatch == pstate->nbatch); + ExecParallelHashJoinSetUpBatches(hashtable, new_nbatch); + Assert(hashtable->nbatch == pstate->nbatch); + + /* Replace or recycle batch 0's bucket array. */ + if (pstate->old_nbatch == 1) + { + double dtuples; + double dbuckets; + int new_nbuckets; + + /* + * We probably also need a smaller bucket array. How many + * tuples do we expect per batch, assuming we have only + * half of them so far? Normally we don't need to change + * the bucket array's size, because the size of each batch + * stays the same as we add more batches, but in this + * special case we move from a large batch to many smaller + * batches and it would be wasteful to keep the large + * array. 
+ */ + dtuples = (old_batch0->ntuples * 2.0) / new_nbatch; + dbuckets = ceil(dtuples / NTUP_PER_BUCKET); + dbuckets = Min(dbuckets, + MaxAllocSize / sizeof(dsa_pointer_atomic)); + new_nbuckets = (int) dbuckets; + new_nbuckets = Max(new_nbuckets, 1024); + new_nbuckets = pg_nextpower2_32(new_nbuckets); + dsa_free(hashtable->area, old_batch0->buckets); + hashtable->batches[0].shared->buckets = + dsa_allocate(hashtable->area, + sizeof(dsa_pointer_atomic) * new_nbuckets); + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, + hashtable->batches[0].shared->buckets); + for (i = 0; i < new_nbuckets; ++i) + dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer); + pstate->nbuckets = new_nbuckets; + } + else + { + /* Recycle the existing bucket array. */ + hashtable->batches[0].shared->buckets = old_batch0->buckets; + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, old_batch0->buckets); + for (i = 0; i < hashtable->nbuckets; ++i) + dsa_pointer_atomic_write(&buckets[i], InvalidDsaPointer); + } + + /* Move all chunks to the work queue for parallel processing. */ + pstate->chunk_work_queue = old_batch0->chunks; + + /* Disable further growth temporarily while we're growing. */ + pstate->growth = PHJ_GROWTH_DISABLED; + } + else + { + /* All other participants just flush their tuples to disk. */ + ExecParallelHashCloseBatchAccessors(hashtable); + } + /* Fall through. */ + + case PHJ_GROW_BATCHES_ALLOCATING: + /* Wait for the above to be finished. */ + BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE); + /* Fall through. */ + + case PHJ_GROW_BATCHES_REPARTITIONING: + /* Make sure that we have the current dimensions and buckets. */ + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + /* Then partition, flush counters. */ + ExecParallelHashRepartitionFirst(hashtable); + ExecParallelHashRepartitionRest(hashtable); + ExecParallelHashMergeCounters(hashtable); + /* Wait for the above to be finished. */ + BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_REPARTITION); + /* Fall through. */ + + case PHJ_GROW_BATCHES_DECIDING: + + /* + * Elect one participant to clean up and decide whether further + * repartitioning is needed, or should be disabled because it's + * not helping. + */ + if (BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_DECIDE)) + { + bool space_exhausted = false; + bool extreme_skew_detected = false; + + /* Make sure that we have the current dimensions and buckets. */ + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + + /* Are any of the new generation of batches exhausted? */ + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatch *batch = hashtable->batches[i].shared; + + if (batch->space_exhausted || + batch->estimated_size > pstate->space_allowed) + { + int parent; + + space_exhausted = true; + + /* + * Did this batch receive ALL of the tuples from its + * parent batch? That would indicate that further + * repartitioning isn't going to help (the hash values + * are probably all the same). + */ + parent = i % pstate->old_nbatch; + if (batch->ntuples == hashtable->batches[parent].shared->old_ntuples) + extreme_skew_detected = true; + } + } + + /* Don't keep growing if it's not helping or we'd overflow. 
*/ + if (extreme_skew_detected || hashtable->nbatch >= INT_MAX / 2) + pstate->growth = PHJ_GROWTH_DISABLED; + else if (space_exhausted) + pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; + else + pstate->growth = PHJ_GROWTH_OK; + + /* Free the old batches in shared memory. */ + dsa_free(hashtable->area, pstate->old_batches); + pstate->old_batches = InvalidDsaPointer; + } + /* Fall through. */ + + case PHJ_GROW_BATCHES_FINISHING: + /* Wait for the above to complete. */ + BarrierArriveAndWait(&pstate->grow_batches_barrier, + WAIT_EVENT_HASH_GROW_BATCHES_FINISH); + } +} + +/* + * Repartition the tuples currently loaded into memory for inner batch 0 + * because the number of batches has been increased. Some tuples are retained + * in memory and some are written out to a later batch. + */ +static void +ExecParallelHashRepartitionFirst(HashJoinTable hashtable) +{ + dsa_pointer chunk_shared; + HashMemoryChunk chunk; + + Assert(hashtable->nbatch == hashtable->parallel_state->nbatch); + + while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared))) + { + size_t idx = 0; + + /* Repartition all tuples in this chunk. */ + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); + MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple); + HashJoinTuple copyTuple; + dsa_pointer shared; + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + + Assert(batchno < hashtable->nbatch); + if (batchno == 0) + { + /* It still belongs in batch 0. Copy to a new chunk. */ + copyTuple = + ExecParallelHashTupleAlloc(hashtable, + HJTUPLE_OVERHEAD + tuple->t_len, + &shared); + copyTuple->hashvalue = hashTuple->hashvalue; + memcpy(HJTUPLE_MINTUPLE(copyTuple), tuple, tuple->t_len); + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + copyTuple, shared); + } + else + { + size_t tuple_size = + MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + + /* It belongs in a later batch. */ + hashtable->batches[batchno].estimated_size += tuple_size; + sts_puttuple(hashtable->batches[batchno].inner_tuples, + &hashTuple->hashvalue, tuple); + } + + /* Count this tuple. */ + ++hashtable->batches[0].old_ntuples; + ++hashtable->batches[batchno].ntuples; + + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + + /* Free this chunk. */ + dsa_free(hashtable->area, chunk_shared); + + CHECK_FOR_INTERRUPTS(); + } +} + +/* + * Help repartition inner batches 1..n. + */ +static void +ExecParallelHashRepartitionRest(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int old_nbatch = pstate->old_nbatch; + SharedTuplestoreAccessor **old_inner_tuples; + ParallelHashJoinBatch *old_batches; + int i; + + /* Get our hands on the previous generation of batches. */ + old_batches = (ParallelHashJoinBatch *) + dsa_get_address(hashtable->area, pstate->old_batches); + old_inner_tuples = palloc0(sizeof(SharedTuplestoreAccessor *) * old_nbatch); + for (i = 1; i < old_nbatch; ++i) + { + ParallelHashJoinBatch *shared = + NthParallelHashJoinBatch(old_batches, i); + + old_inner_tuples[i] = sts_attach(ParallelHashJoinBatchInner(shared), + ParallelWorkerNumber + 1, + &pstate->fileset); + } + + /* Join in the effort to repartition them. */ + for (i = 1; i < old_nbatch; ++i) + { + MinimalTuple tuple; + uint32 hashvalue; + + /* Scan one partition from the previous generation. 
*/ + sts_begin_parallel_scan(old_inner_tuples[i]); + while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], &hashvalue))) + { + size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + int bucketno; + int batchno; + + /* Decide which partition it goes to in the new generation. */ + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, + &batchno); + + hashtable->batches[batchno].estimated_size += tuple_size; + ++hashtable->batches[batchno].ntuples; + ++hashtable->batches[i].old_ntuples; + + /* Store the tuple its new batch. */ + sts_puttuple(hashtable->batches[batchno].inner_tuples, + &hashvalue, tuple); + + CHECK_FOR_INTERRUPTS(); + } + sts_end_parallel_scan(old_inner_tuples[i]); + } + + pfree(old_inner_tuples); +} + +/* + * Transfer the backend-local per-batch counters to the shared totals. + */ +static void +ExecParallelHashMergeCounters(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + pstate->total_tuples = 0; + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatchAccessor *batch = &hashtable->batches[i]; + + batch->shared->size += batch->size; + batch->shared->estimated_size += batch->estimated_size; + batch->shared->ntuples += batch->ntuples; + batch->shared->old_ntuples += batch->old_ntuples; + batch->size = 0; + batch->estimated_size = 0; + batch->ntuples = 0; + batch->old_ntuples = 0; + pstate->total_tuples += batch->shared->ntuples; + } + LWLockRelease(&pstate->lock); +} + +/* + * ExecHashIncreaseNumBuckets + * increase the original number of buckets in order to reduce + * number of tuples per bucket + */ +static void +ExecHashIncreaseNumBuckets(HashJoinTable hashtable) +{ + HashMemoryChunk chunk; + + /* do nothing if not an increase (it's called increase for a reason) */ + if (hashtable->nbuckets >= hashtable->nbuckets_optimal) + return; + +#ifdef HJDEBUG + printf("Hashjoin %p: increasing nbuckets %d => %d\n", + hashtable, hashtable->nbuckets, hashtable->nbuckets_optimal); +#endif + + hashtable->nbuckets = hashtable->nbuckets_optimal; + hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal; + + Assert(hashtable->nbuckets > 1); + Assert(hashtable->nbuckets <= (INT_MAX / 2)); + Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets)); + + /* + * Just reallocate the proper number of buckets - we don't need to walk + * through them - we can walk the dense-allocated chunks (just like in + * ExecHashIncreaseNumBatches, but without all the copying into new + * chunks) + */ + hashtable->buckets.unshared = + (HashJoinTuple *) repalloc(hashtable->buckets.unshared, + hashtable->nbuckets * sizeof(HashJoinTuple)); + + memset(hashtable->buckets.unshared, 0, + hashtable->nbuckets * sizeof(HashJoinTuple)); + + /* scan through all tuples in all chunks to rebuild the hash table */ + for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next.unshared) + { + /* process all tuples stored in this chunk */ + size_t idx = 0; + + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + + /* add the tuple to the proper bucket */ + hashTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = hashTuple; + + /* advance index past the tuple */ + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + + /* allow this loop to 
be cancellable */ + CHECK_FOR_INTERRUPTS(); + } +} + +static void +ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + HashMemoryChunk chunk; + dsa_pointer chunk_s; + + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + + /* + * It's unlikely, but we need to be prepared for new participants to show + * up while we're in the middle of this operation so we need to switch on + * barrier phase here. + */ + switch (PHJ_GROW_BUCKETS_PHASE(BarrierPhase(&pstate->grow_buckets_barrier))) + { + case PHJ_GROW_BUCKETS_ELECTING: + /* Elect one participant to prepare to increase nbuckets. */ + if (BarrierArriveAndWait(&pstate->grow_buckets_barrier, + WAIT_EVENT_HASH_GROW_BUCKETS_ELECT)) + { + size_t size; + dsa_pointer_atomic *buckets; + + /* Double the size of the bucket array. */ + pstate->nbuckets *= 2; + size = pstate->nbuckets * sizeof(dsa_pointer_atomic); + hashtable->batches[0].shared->size += size / 2; + dsa_free(hashtable->area, hashtable->batches[0].shared->buckets); + hashtable->batches[0].shared->buckets = + dsa_allocate(hashtable->area, size); + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, + hashtable->batches[0].shared->buckets); + for (i = 0; i < pstate->nbuckets; ++i) + dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer); + + /* Put the chunk list onto the work queue. */ + pstate->chunk_work_queue = hashtable->batches[0].shared->chunks; + + /* Clear the flag. */ + pstate->growth = PHJ_GROWTH_OK; + } + /* Fall through. */ + + case PHJ_GROW_BUCKETS_ALLOCATING: + /* Wait for the above to complete. */ + BarrierArriveAndWait(&pstate->grow_buckets_barrier, + WAIT_EVENT_HASH_GROW_BUCKETS_ALLOCATE); + /* Fall through. */ + + case PHJ_GROW_BUCKETS_REINSERTING: + /* Reinsert all tuples into the hash table. */ + ExecParallelHashEnsureBatchAccessors(hashtable); + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_s))) + { + size_t idx = 0; + + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); + dsa_pointer shared = chunk_s + HASH_CHUNK_HEADER_SIZE + idx; + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, + &bucketno, &batchno); + Assert(batchno == 0); + + /* add the tuple to the proper bucket */ + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + hashTuple, shared); + + /* advance index past the tuple */ + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + BarrierArriveAndWait(&pstate->grow_buckets_barrier, + WAIT_EVENT_HASH_GROW_BUCKETS_REINSERT); + } +} + +/* + * ExecHashTableInsert + * insert a tuple into the hash table depending on the hash value + * it may just go to a temp file for later batches + * + * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual + * tuple; the minimal case in particular is certain to happen while reloading + * tuples from batch files. We could save some cycles in the regular-tuple + * case by not forcing the slot contents into minimal form; not clear if it's + * worth the messiness required. 
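+ *
+ * For illustration, with nbatch = 4 and curbatch = 0 during the initial
+ * build, a tuple whose hash value maps to batchno 0 goes straight into the
+ * in-memory hash table, while one that maps to batchno 3 is written out
+ * via ExecHashJoinSaveTuple() to innerBatchFile[3] and is only reloaded
+ * when batch 3 is processed.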
+ */ +void +ExecHashTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &bucketno, &batchno); + + /* + * decide whether to put the tuple in the hash table or a temp file + */ + if (batchno == hashtable->curbatch) + { + /* + * put the tuple in hash table + */ + HashJoinTuple hashTuple; + int hashTupleSize; + double ntuples = (hashtable->totalTuples - hashtable->skewTuples); + + /* Create the HashJoinTuple */ + hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; + hashTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize); + + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + + /* + * We always reset the tuple-matched flag on insertion. This is okay + * even when reloading a tuple from a batch file, since the tuple + * could not possibly have been matched to an outer tuple before it + * went into the batch file. + */ + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); + + /* Push it onto the front of the bucket's list */ + hashTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = hashTuple; + + /* + * Increase the (optimal) number of buckets if we just exceeded the + * NTUP_PER_BUCKET threshold, but only when there's still a single + * batch. + */ + if (hashtable->nbatch == 1 && + ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)) + { + /* Guard against integer overflow and alloc size overflow */ + if (hashtable->nbuckets_optimal <= INT_MAX / 2 && + hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple)) + { + hashtable->nbuckets_optimal *= 2; + hashtable->log2_nbuckets_optimal += 1; + } + } + + /* Account for space used, and back off if we've used too much */ + hashtable->spaceUsed += hashTupleSize; + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + if (hashtable->spaceUsed + + hashtable->nbuckets_optimal * sizeof(HashJoinTuple) + > hashtable->spaceAllowed) + ExecHashIncreaseNumBatches(hashtable); + } + else + { + /* + * put the tuple into a temp file for later batches + */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(tuple, + hashvalue, + &hashtable->innerBatchFile[batchno]); + } + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * ExecParallelHashTableInsert + * insert a tuple into a shared hash table or shared batch tuplestore + */ +void +ExecParallelHashTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + dsa_pointer shared; + int bucketno; + int batchno; + +retry: + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); + + if (batchno == 0) + { + HashJoinTuple hashTuple; + + /* Try to load it into memory. */ + Assert(BarrierPhase(&hashtable->parallel_state->build_barrier) == + PHJ_BUILD_HASHING_INNER); + hashTuple = ExecParallelHashTupleAlloc(hashtable, + HJTUPLE_OVERHEAD + tuple->t_len, + &shared); + if (hashTuple == NULL) + goto retry; + + /* Store the hash value in the HashJoinTuple header. 
*/ + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + + /* Push it onto the front of the bucket's list */ + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + hashTuple, shared); + } + else + { + size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + + Assert(batchno > 0); + + /* Try to preallocate space in the batch if necessary. */ + if (hashtable->batches[batchno].preallocated < tuple_size) + { + if (!ExecParallelHashTuplePrealloc(hashtable, batchno, tuple_size)) + goto retry; + } + + Assert(hashtable->batches[batchno].preallocated >= tuple_size); + hashtable->batches[batchno].preallocated -= tuple_size; + sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue, + tuple); + } + ++hashtable->batches[batchno].ntuples; + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * Insert a tuple into the current hash table. Unlike + * ExecParallelHashTableInsert, this version is not prepared to send the tuple + * to other batches or to run out of memory, and should only be called with + * tuples that belong in the current batch once growth has been disabled. + */ +void +ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + HashJoinTuple hashTuple; + dsa_pointer shared; + int batchno; + int bucketno; + + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); + Assert(batchno == hashtable->curbatch); + hashTuple = ExecParallelHashTupleAlloc(hashtable, + HJTUPLE_OVERHEAD + tuple->t_len, + &shared); + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + hashTuple, shared); + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * ExecHashGetHashValue + * Compute the hash value for a tuple + * + * The tuple to be tested must be in econtext->ecxt_outertuple (thus Vars in + * the hashkeys expressions need to have OUTER_VAR as varno). If outer_tuple + * is false (meaning it's the HashJoin's inner node, Hash), econtext, + * hashkeys, and slot need to be from Hash, with hashkeys/slot referencing and + * being suitable for tuples from the node below the Hash. Conversely, if + * outer_tuple is true, econtext is from HashJoin, and hashkeys/slot need to + * be appropriate for tuples from HashJoin's outer node. + * + * A true result means the tuple's hash value has been successfully computed + * and stored at *hashvalue. A false result means the tuple cannot match + * because it contains a null attribute, and hence it should be discarded + * immediately. (If keep_nulls is true then false is never returned.) + */ +bool +ExecHashGetHashValue(HashJoinTable hashtable, + ExprContext *econtext, + List *hashkeys, + bool outer_tuple, + bool keep_nulls, + uint32 *hashvalue) +{ + uint32 hashkey = 0; + FmgrInfo *hashfunctions; + ListCell *hk; + int i = 0; + MemoryContext oldContext; + + /* + * We reset the eval context each time to reclaim any memory leaked in the + * hashkey expressions. 
+ */ + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + if (outer_tuple) + hashfunctions = hashtable->outer_hashfunctions; + else + hashfunctions = hashtable->inner_hashfunctions; + + foreach(hk, hashkeys) + { + ExprState *keyexpr = (ExprState *) lfirst(hk); + Datum keyval; + bool isNull; + + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + /* + * Get the join attribute value of the tuple + */ + keyval = ExecEvalExpr(keyexpr, econtext, &isNull); + + /* + * If the attribute is NULL, and the join operator is strict, then + * this tuple cannot pass the join qual so we can reject it + * immediately (unless we're scanning the outside of an outer join, in + * which case we must not reject it). Otherwise we act like the + * hashcode of NULL is zero (this will support operators that act like + * IS NOT DISTINCT, though not any more-random behavior). We treat + * the hash support function as strict even if the operator is not. + * + * Note: currently, all hashjoinable operators must be strict since + * the hash index AM assumes that. However, it takes so little extra + * code here to allow non-strict that we may as well do it. + */ + if (isNull) + { + if (hashtable->hashStrict[i] && !keep_nulls) + { + MemoryContextSwitchTo(oldContext); + return false; /* cannot match */ + } + /* else, leave hashkey unmodified, equivalent to hashcode 0 */ + } + else + { + /* Compute the hash function */ + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], hashtable->collations[i], keyval)); + hashkey ^= hkey; + } + + i++; + } + + MemoryContextSwitchTo(oldContext); + + *hashvalue = hashkey; + return true; +} + +/* + * ExecHashGetBucketAndBatch + * Determine the bucket number and batch number for a hash value + * + * Note: on-the-fly increases of nbatch must not change the bucket number + * for a given hash code (since we don't move tuples to different hash + * chains), and must only cause the batch number to remain the same or + * increase. Our algorithm is + * bucketno = hashvalue MOD nbuckets + * batchno = ROR(hashvalue, log2_nbuckets) MOD nbatch + * where nbuckets and nbatch are both expected to be powers of 2, so we can + * do the computations by shifting and masking. (This assumes that all hash + * functions are good about randomizing all their output bits, else we are + * likely to have very skewed bucket or batch occupancy.) + * + * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic + * bucket count growth. Once we start batching, the value is fixed and does + * not change over the course of the join (making it possible to compute batch + * number the way we do here). + * + * nbatch is always a power of 2; we increase it only by doubling it. This + * effectively adds one more bit to the top of the batchno. In very large + * joins, we might run out of bits to add, so we do this by rotating the hash + * value. This causes batchno to steal bits from bucketno when the number of + * virtual buckets exceeds 2^32. It's better to have longer bucket chains + * than to lose the ability to divide batches. 
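+ *
+ * For illustration, suppose nbuckets = 1024 (log2_nbuckets = 10) and
+ * nbatch = 4.  Then for hashvalue 0xDEADBEEF:
+ *		bucketno = 0xDEADBEEF & (1024 - 1)       = 0x2EF  (bits 0..9)
+ *		batchno  = ROR(0xDEADBEEF, 10) & (4 - 1) = 0x3    (bits 10..11)
+ * so batchno is taken from the bits immediately above the bucket bits;
+ * doubling nbatch merely exposes one more higher-order bit to batchno and
+ * leaves bucketno untouched, as required above.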
+ */ +void +ExecHashGetBucketAndBatch(HashJoinTable hashtable, + uint32 hashvalue, + int *bucketno, + int *batchno) +{ + uint32 nbuckets = (uint32) hashtable->nbuckets; + uint32 nbatch = (uint32) hashtable->nbatch; + + if (nbatch > 1) + { + *bucketno = hashvalue & (nbuckets - 1); + *batchno = pg_rotate_right32(hashvalue, + hashtable->log2_nbuckets) & (nbatch - 1); + } + else + { + *bucketno = hashvalue & (nbuckets - 1); + *batchno = 0; + } +} + +/* + * ExecScanHashBucket + * scan a hash bucket for matches to the current outer tuple + * + * The current outer tuple must be stored in econtext->ecxt_outertuple. + * + * On success, the inner tuple is stored into hjstate->hj_CurTuple and + * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot + * for the latter. + */ +bool +ExecScanHashBucket(HashJoinState *hjstate, + ExprContext *econtext) +{ + ExprState *hjclauses = hjstate->hashclauses; + HashJoinTable hashtable = hjstate->hj_HashTable; + HashJoinTuple hashTuple = hjstate->hj_CurTuple; + uint32 hashvalue = hjstate->hj_CurHashValue; + + /* + * hj_CurTuple is the address of the tuple last returned from the current + * bucket, or NULL if it's time to start scanning a new bucket. + * + * If the tuple hashed to a skew bucket then scan the skew bucket + * otherwise scan the standard hashtable bucket. + */ + if (hashTuple != NULL) + hashTuple = hashTuple->next.unshared; + else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO) + hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples; + else + hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo]; + + while (hashTuple != NULL) + { + if (hashTuple->hashvalue == hashvalue) + { + TupleTableSlot *inntuple; + + /* insert hashtable's tuple into exec slot so ExecQual sees it */ + inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple), + hjstate->hj_HashTupleSlot, + false); /* do not pfree */ + econtext->ecxt_innertuple = inntuple; + + if (ExecQualAndReset(hjclauses, econtext)) + { + hjstate->hj_CurTuple = hashTuple; + return true; + } + } + + hashTuple = hashTuple->next.unshared; + } + + /* + * no match + */ + return false; +} + +/* + * ExecParallelScanHashBucket + * scan a hash bucket for matches to the current outer tuple + * + * The current outer tuple must be stored in econtext->ecxt_outertuple. + * + * On success, the inner tuple is stored into hjstate->hj_CurTuple and + * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot + * for the latter. + */ +bool +ExecParallelScanHashBucket(HashJoinState *hjstate, + ExprContext *econtext) +{ + ExprState *hjclauses = hjstate->hashclauses; + HashJoinTable hashtable = hjstate->hj_HashTable; + HashJoinTuple hashTuple = hjstate->hj_CurTuple; + uint32 hashvalue = hjstate->hj_CurHashValue; + + /* + * hj_CurTuple is the address of the tuple last returned from the current + * bucket, or NULL if it's time to start scanning a new bucket. 
+ */ + if (hashTuple != NULL) + hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple); + else + hashTuple = ExecParallelHashFirstTuple(hashtable, + hjstate->hj_CurBucketNo); + + while (hashTuple != NULL) + { + if (hashTuple->hashvalue == hashvalue) + { + TupleTableSlot *inntuple; + + /* insert hashtable's tuple into exec slot so ExecQual sees it */ + inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple), + hjstate->hj_HashTupleSlot, + false); /* do not pfree */ + econtext->ecxt_innertuple = inntuple; + + if (ExecQualAndReset(hjclauses, econtext)) + { + hjstate->hj_CurTuple = hashTuple; + return true; + } + } + + hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple); + } + + /* + * no match + */ + return false; +} + +/* + * ExecPrepHashTableForUnmatched + * set up for a series of ExecScanHashTableForUnmatched calls + */ +void +ExecPrepHashTableForUnmatched(HashJoinState *hjstate) +{ + /*---------- + * During this scan we use the HashJoinState fields as follows: + * + * hj_CurBucketNo: next regular bucket to scan + * hj_CurSkewBucketNo: next skew bucket (an index into skewBucketNums) + * hj_CurTuple: last tuple returned, or NULL to start next bucket + *---------- + */ + hjstate->hj_CurBucketNo = 0; + hjstate->hj_CurSkewBucketNo = 0; + hjstate->hj_CurTuple = NULL; +} + +/* + * ExecScanHashTableForUnmatched + * scan the hash table for unmatched inner tuples + * + * On success, the inner tuple is stored into hjstate->hj_CurTuple and + * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot + * for the latter. + */ +bool +ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + HashJoinTuple hashTuple = hjstate->hj_CurTuple; + + for (;;) + { + /* + * hj_CurTuple is the address of the tuple last returned from the + * current bucket, or NULL if it's time to start scanning a new + * bucket. + */ + if (hashTuple != NULL) + hashTuple = hashTuple->next.unshared; + else if (hjstate->hj_CurBucketNo < hashtable->nbuckets) + { + hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo]; + hjstate->hj_CurBucketNo++; + } + else if (hjstate->hj_CurSkewBucketNo < hashtable->nSkewBuckets) + { + int j = hashtable->skewBucketNums[hjstate->hj_CurSkewBucketNo]; + + hashTuple = hashtable->skewBucket[j]->tuples; + hjstate->hj_CurSkewBucketNo++; + } + else + break; /* finished all buckets */ + + while (hashTuple != NULL) + { + if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple))) + { + TupleTableSlot *inntuple; + + /* insert hashtable's tuple into exec slot */ + inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple), + hjstate->hj_HashTupleSlot, + false); /* do not pfree */ + econtext->ecxt_innertuple = inntuple; + + /* + * Reset temp memory each time; although this function doesn't + * do any qual eval, the caller will, so let's keep it + * parallel to ExecScanHashBucket. + */ + ResetExprContext(econtext); + + hjstate->hj_CurTuple = hashTuple; + return true; + } + + hashTuple = hashTuple->next.unshared; + } + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + + /* + * no more unmatched tuples + */ + return false; +} + +/* + * ExecHashTableReset + * + * reset hash table header for new batch + */ +void +ExecHashTableReset(HashJoinTable hashtable) +{ + MemoryContext oldcxt; + int nbuckets = hashtable->nbuckets; + + /* + * Release all the hash buckets and tuples acquired in the prior pass, and + * reinitialize the context for a new pass. 
+ */ + MemoryContextReset(hashtable->batchCxt); + oldcxt = MemoryContextSwitchTo(hashtable->batchCxt); + + /* Reallocate and reinitialize the hash bucket headers. */ + hashtable->buckets.unshared = (HashJoinTuple *) + palloc0(nbuckets * sizeof(HashJoinTuple)); + + hashtable->spaceUsed = 0; + + MemoryContextSwitchTo(oldcxt); + + /* Forget the chunks (the memory was freed by the context reset above). */ + hashtable->chunks = NULL; +} + +/* + * ExecHashTableResetMatchFlags + * Clear all the HeapTupleHeaderHasMatch flags in the table + */ +void +ExecHashTableResetMatchFlags(HashJoinTable hashtable) +{ + HashJoinTuple tuple; + int i; + + /* Reset all flags in the main table ... */ + for (i = 0; i < hashtable->nbuckets; i++) + { + for (tuple = hashtable->buckets.unshared[i]; tuple != NULL; + tuple = tuple->next.unshared) + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple)); + } + + /* ... and the same for the skew buckets, if any */ + for (i = 0; i < hashtable->nSkewBuckets; i++) + { + int j = hashtable->skewBucketNums[i]; + HashSkewBucket *skewBucket = hashtable->skewBucket[j]; + + for (tuple = skewBucket->tuples; tuple != NULL; tuple = tuple->next.unshared) + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple)); + } +} + + +void +ExecReScanHash(HashState *node) +{ + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} + + +/* + * ExecHashBuildSkewHash + * + * Set up for skew optimization if we can identify the most common values + * (MCVs) of the outer relation's join key. We make a skew hash bucket + * for the hash value of each MCV, up to the number of slots allowed + * based on available memory. + */ +static void +ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, int mcvsToUse) +{ + HeapTupleData *statsTuple; + AttStatsSlot sslot; + + /* Do nothing if planner didn't identify the outer relation's join key */ + if (!OidIsValid(node->skewTable)) + return; + /* Also, do nothing if we don't have room for at least one skew bucket */ + if (mcvsToUse <= 0) + return; + + /* + * Try to find the MCV statistics for the outer relation's join key. + */ + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(node->skewTable), + Int16GetDatum(node->skewColumn), + BoolGetDatum(node->skewInherit)); + if (!HeapTupleIsValid(statsTuple)) + return; + + if (get_attstatsslot(&sslot, statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)) + { + double frac; + int nbuckets; + FmgrInfo *hashfunctions; + int i; + + if (mcvsToUse > sslot.nvalues) + mcvsToUse = sslot.nvalues; + + /* + * Calculate the expected fraction of outer relation that will + * participate in the skew optimization. If this isn't at least + * SKEW_MIN_OUTER_FRACTION, don't use skew optimization. + */ + frac = 0; + for (i = 0; i < mcvsToUse; i++) + frac += sslot.numbers[i]; + if (frac < SKEW_MIN_OUTER_FRACTION) + { + free_attstatsslot(&sslot); + ReleaseSysCache(statsTuple); + return; + } + + /* + * Okay, set up the skew hashtable. + * + * skewBucket[] is an open addressing hashtable with a power of 2 size + * that is greater than the number of MCV values. (This ensures there + * will be at least one null entry, so searches will always + * terminate.) + * + * Note: this code could fail if mcvsToUse exceeds INT_MAX/8 or + * MaxAllocSize/sizeof(void *)/8, but that is not currently possible + * since we limit pg_statistic entries to much less than that. 
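+		 *
+		 * For illustration, if mcvsToUse = 100 then pg_nextpower2_32(101)
+		 * gives 128, and the two extra bits added below raise that to 512
+		 * skew buckets, so at most about 20% of the entries are occupied
+		 * and a probe quickly reaches a NULL slot.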
+ */ + nbuckets = pg_nextpower2_32(mcvsToUse + 1); + /* use two more bits just to help avoid collisions */ + nbuckets <<= 2; + + hashtable->skewEnabled = true; + hashtable->skewBucketLen = nbuckets; + + /* + * We allocate the bucket memory in the hashtable's batch context. It + * is only needed during the first batch, and this ensures it will be + * automatically removed once the first batch is done. + */ + hashtable->skewBucket = (HashSkewBucket **) + MemoryContextAllocZero(hashtable->batchCxt, + nbuckets * sizeof(HashSkewBucket *)); + hashtable->skewBucketNums = (int *) + MemoryContextAllocZero(hashtable->batchCxt, + mcvsToUse * sizeof(int)); + + hashtable->spaceUsed += nbuckets * sizeof(HashSkewBucket *) + + mcvsToUse * sizeof(int); + hashtable->spaceUsedSkew += nbuckets * sizeof(HashSkewBucket *) + + mcvsToUse * sizeof(int); + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + + /* + * Create a skew bucket for each MCV hash value. + * + * Note: it is very important that we create the buckets in order of + * decreasing MCV frequency. If we have to remove some buckets, they + * must be removed in reverse order of creation (see notes in + * ExecHashRemoveNextSkewBucket) and we want the least common MCVs to + * be removed first. + */ + hashfunctions = hashtable->outer_hashfunctions; + + for (i = 0; i < mcvsToUse; i++) + { + uint32 hashvalue; + int bucket; + + hashvalue = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[0], + hashtable->collations[0], + sslot.values[i])); + + /* + * While we have not hit a hole in the hashtable and have not hit + * the desired bucket, we have collided with some previous hash + * value, so try the next bucket location. NB: this code must + * match ExecHashGetSkewBucket. + */ + bucket = hashvalue & (nbuckets - 1); + while (hashtable->skewBucket[bucket] != NULL && + hashtable->skewBucket[bucket]->hashvalue != hashvalue) + bucket = (bucket + 1) & (nbuckets - 1); + + /* + * If we found an existing bucket with the same hashvalue, leave + * it alone. It's okay for two MCVs to share a hashvalue. + */ + if (hashtable->skewBucket[bucket] != NULL) + continue; + + /* Okay, create a new skew bucket for this hashvalue. */ + hashtable->skewBucket[bucket] = (HashSkewBucket *) + MemoryContextAlloc(hashtable->batchCxt, + sizeof(HashSkewBucket)); + hashtable->skewBucket[bucket]->hashvalue = hashvalue; + hashtable->skewBucket[bucket]->tuples = NULL; + hashtable->skewBucketNums[hashtable->nSkewBuckets] = bucket; + hashtable->nSkewBuckets++; + hashtable->spaceUsed += SKEW_BUCKET_OVERHEAD; + hashtable->spaceUsedSkew += SKEW_BUCKET_OVERHEAD; + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + } + + free_attstatsslot(&sslot); + } + + ReleaseSysCache(statsTuple); +} + +/* + * ExecHashGetSkewBucket + * + * Returns the index of the skew bucket for this hashvalue, + * or INVALID_SKEW_BUCKET_NO if the hashvalue is not + * associated with any active skew bucket. + */ +int +ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue) +{ + int bucket; + + /* + * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in + * particular, this happens after the initial batch is done). + */ + if (!hashtable->skewEnabled) + return INVALID_SKEW_BUCKET_NO; + + /* + * Since skewBucketLen is a power of 2, we can do a modulo by ANDing. 
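+	 * For example, with skewBucketLen = 512 this is hashvalue & 511, which
+	 * keeps just the low 9 bits of the hash value, i.e. hashvalue % 512.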
+ */ + bucket = hashvalue & (hashtable->skewBucketLen - 1); + + /* + * While we have not hit a hole in the hashtable and have not hit the + * desired bucket, we have collided with some other hash value, so try the + * next bucket location. + */ + while (hashtable->skewBucket[bucket] != NULL && + hashtable->skewBucket[bucket]->hashvalue != hashvalue) + bucket = (bucket + 1) & (hashtable->skewBucketLen - 1); + + /* + * Found the desired bucket? + */ + if (hashtable->skewBucket[bucket] != NULL) + return bucket; + + /* + * There must not be any hashtable entry for this hash value. + */ + return INVALID_SKEW_BUCKET_NO; +} + +/* + * ExecHashSkewTableInsert + * + * Insert a tuple into the skew hashtable. + * + * This should generally match up with the current-batch case in + * ExecHashTableInsert. + */ +static void +ExecHashSkewTableInsert(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue, + int bucketNumber) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + HashJoinTuple hashTuple; + int hashTupleSize; + + /* Create the HashJoinTuple */ + hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len; + hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt, + hashTupleSize); + hashTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); + HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); + + /* Push it onto the front of the skew bucket's list */ + hashTuple->next.unshared = hashtable->skewBucket[bucketNumber]->tuples; + hashtable->skewBucket[bucketNumber]->tuples = hashTuple; + Assert(hashTuple != hashTuple->next.unshared); + + /* Account for space used, and back off if we've used too much */ + hashtable->spaceUsed += hashTupleSize; + hashtable->spaceUsedSkew += hashTupleSize; + if (hashtable->spaceUsed > hashtable->spacePeak) + hashtable->spacePeak = hashtable->spaceUsed; + while (hashtable->spaceUsedSkew > hashtable->spaceAllowedSkew) + ExecHashRemoveNextSkewBucket(hashtable); + + /* Check we are not over the total spaceAllowed, either */ + if (hashtable->spaceUsed > hashtable->spaceAllowed) + ExecHashIncreaseNumBatches(hashtable); + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * ExecHashRemoveNextSkewBucket + * + * Remove the least valuable skew bucket by pushing its tuples into + * the main hash table. + */ +static void +ExecHashRemoveNextSkewBucket(HashJoinTable hashtable) +{ + int bucketToRemove; + HashSkewBucket *bucket; + uint32 hashvalue; + int bucketno; + int batchno; + HashJoinTuple hashTuple; + + /* Locate the bucket to remove */ + bucketToRemove = hashtable->skewBucketNums[hashtable->nSkewBuckets - 1]; + bucket = hashtable->skewBucket[bucketToRemove]; + + /* + * Calculate which bucket and batch the tuples belong to in the main + * hashtable. They all have the same hash value, so it's the same for all + * of them. Also note that it's not possible for nbatch to increase while + * we are processing the tuples. + */ + hashvalue = bucket->hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); + + /* Process all tuples in the bucket */ + hashTuple = bucket->tuples; + while (hashTuple != NULL) + { + HashJoinTuple nextHashTuple = hashTuple->next.unshared; + MinimalTuple tuple; + Size tupleSize; + + /* + * This code must agree with ExecHashTableInsert. We do not use + * ExecHashTableInsert directly as ExecHashTableInsert expects a + * TupleTableSlot while we already have HashJoinTuples. 
+ */ + tuple = HJTUPLE_MINTUPLE(hashTuple); + tupleSize = HJTUPLE_OVERHEAD + tuple->t_len; + + /* Decide whether to put the tuple in the hash table or a temp file */ + if (batchno == hashtable->curbatch) + { + /* Move the tuple to the main hash table */ + HashJoinTuple copyTuple; + + /* + * We must copy the tuple into the dense storage, else it will not + * be found by, eg, ExecHashIncreaseNumBatches. + */ + copyTuple = (HashJoinTuple) dense_alloc(hashtable, tupleSize); + memcpy(copyTuple, hashTuple, tupleSize); + pfree(hashTuple); + + copyTuple->next.unshared = hashtable->buckets.unshared[bucketno]; + hashtable->buckets.unshared[bucketno] = copyTuple; + + /* We have reduced skew space, but overall space doesn't change */ + hashtable->spaceUsedSkew -= tupleSize; + } + else + { + /* Put the tuple into a temp file for later batches */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(tuple, hashvalue, + &hashtable->innerBatchFile[batchno]); + pfree(hashTuple); + hashtable->spaceUsed -= tupleSize; + hashtable->spaceUsedSkew -= tupleSize; + } + + hashTuple = nextHashTuple; + + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + } + + /* + * Free the bucket struct itself and reset the hashtable entry to NULL. + * + * NOTE: this is not nearly as simple as it looks on the surface, because + * of the possibility of collisions in the hashtable. Suppose that hash + * values A and B collide at a particular hashtable entry, and that A was + * entered first so B gets shifted to a different table entry. If we were + * to remove A first then ExecHashGetSkewBucket would mistakenly start + * reporting that B is not in the hashtable, because it would hit the NULL + * before finding B. However, we always remove entries in the reverse + * order of creation, so this failure cannot happen. + */ + hashtable->skewBucket[bucketToRemove] = NULL; + hashtable->nSkewBuckets--; + pfree(bucket); + hashtable->spaceUsed -= SKEW_BUCKET_OVERHEAD; + hashtable->spaceUsedSkew -= SKEW_BUCKET_OVERHEAD; + + /* + * If we have removed all skew buckets then give up on skew optimization. + * Release the arrays since they aren't useful any more. + */ + if (hashtable->nSkewBuckets == 0) + { + hashtable->skewEnabled = false; + pfree(hashtable->skewBucket); + pfree(hashtable->skewBucketNums); + hashtable->skewBucket = NULL; + hashtable->skewBucketNums = NULL; + hashtable->spaceUsed -= hashtable->spaceUsedSkew; + hashtable->spaceUsedSkew = 0; + } +} + +/* + * Reserve space in the DSM segment for instrumentation data. + */ +void +ExecHashEstimate(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation)); + size = add_size(size, offsetof(SharedHashInfo, hinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* + * Set up a space in the DSM for all workers to record instrumentation data + * about their hash table. + */ +void +ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedHashInfo, hinstrument) + + pcxt->nworkers * sizeof(HashInstrumentation); + node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size); + + /* Each per-worker area must start out as zeroes. 
*/ + memset(node->shared_info, 0, size); + + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, + node->shared_info); +} + +/* + * Locate the DSM space for hash table instrumentation data that we'll write + * to at shutdown time. + */ +void +ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) +{ + SharedHashInfo *shared_info; + + /* don't need this if not instrumenting */ + if (!node->ps.instrument) + return; + + /* + * Find our entry in the shared area, and set up a pointer to it so that + * we'll accumulate stats there when shutting down or rebuilding the hash + * table. + */ + shared_info = (SharedHashInfo *) + shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false); + node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber]; +} + +/* + * Collect EXPLAIN stats if needed, saving them into DSM memory if + * ExecHashInitializeWorker was called, or local storage if not. In the + * parallel case, this must be done in ExecShutdownHash() rather than + * ExecEndHash() because the latter runs after we've detached from the DSM + * segment. + */ +void +ExecShutdownHash(HashState *node) +{ + /* Allocate save space if EXPLAIN'ing and we didn't do so already */ + if (node->ps.instrument && !node->hinstrument) + node->hinstrument = (HashInstrumentation *) + palloc0(sizeof(HashInstrumentation)); + /* Now accumulate data for the current (final) hash table */ + if (node->hinstrument && node->hashtable) + ExecHashAccumInstrumentation(node->hinstrument, node->hashtable); +} + +/* + * Retrieve instrumentation data from workers before the DSM segment is + * detached, so that EXPLAIN can access it. + */ +void +ExecHashRetrieveInstrumentation(HashState *node) +{ + SharedHashInfo *shared_info = node->shared_info; + size_t size; + + if (shared_info == NULL) + return; + + /* Replace node->shared_info with a copy in backend-local memory. */ + size = offsetof(SharedHashInfo, hinstrument) + + shared_info->num_workers * sizeof(HashInstrumentation); + node->shared_info = palloc(size); + memcpy(node->shared_info, shared_info, size); +} + +/* + * Accumulate instrumentation data from 'hashtable' into an + * initially-zeroed HashInstrumentation struct. + * + * This is used to merge information across successive hash table instances + * within a single plan node. We take the maximum values of each interesting + * number. The largest nbuckets and largest nbatch values might have occurred + * in different instances, so there's some risk of confusion from reporting + * unrelated numbers; but there's a bigger risk of misdiagnosing a performance + * issue if we don't report the largest values. Similarly, we want to report + * the largest spacePeak regardless of whether it happened in the same + * instance as the largest nbuckets or nbatch. All the instances should have + * the same nbuckets_original and nbatch_original; but there's little value + * in depending on that here, so handle them the same way. 
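+ *
+ * For illustration, if one hash table instance under this node peaked at
+ * nbatch = 2 with nbuckets = 1048576 and a later instance peaked at
+ * nbatch = 8 with nbuckets = 262144, the accumulated result reports
+ * nbatch = 8 and nbuckets = 1048576, even though those maxima came from
+ * different instances.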
+ */ +void +ExecHashAccumInstrumentation(HashInstrumentation *instrument, + HashJoinTable hashtable) +{ + instrument->nbuckets = Max(instrument->nbuckets, + hashtable->nbuckets); + instrument->nbuckets_original = Max(instrument->nbuckets_original, + hashtable->nbuckets_original); + instrument->nbatch = Max(instrument->nbatch, + hashtable->nbatch); + instrument->nbatch_original = Max(instrument->nbatch_original, + hashtable->nbatch_original); + instrument->space_peak = Max(instrument->space_peak, + hashtable->spacePeak); +} + +/* + * Allocate 'size' bytes from the currently active HashMemoryChunk + */ +static void * +dense_alloc(HashJoinTable hashtable, Size size) +{ + HashMemoryChunk newChunk; + char *ptr; + + /* just in case the size is not already aligned properly */ + size = MAXALIGN(size); + + /* + * If tuple size is larger than threshold, allocate a separate chunk. + */ + if (size > HASH_CHUNK_THRESHOLD) + { + /* allocate new chunk and put it at the beginning of the list */ + newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt, + HASH_CHUNK_HEADER_SIZE + size); + newChunk->maxlen = size; + newChunk->used = size; + newChunk->ntuples = 1; + + /* + * Add this chunk to the list after the first existing chunk, so that + * we don't lose the remaining space in the "current" chunk. + */ + if (hashtable->chunks != NULL) + { + newChunk->next = hashtable->chunks->next; + hashtable->chunks->next.unshared = newChunk; + } + else + { + newChunk->next.unshared = hashtable->chunks; + hashtable->chunks = newChunk; + } + + return HASH_CHUNK_DATA(newChunk); + } + + /* + * See if we have enough space for it in the current chunk (if any). If + * not, allocate a fresh chunk. + */ + if ((hashtable->chunks == NULL) || + (hashtable->chunks->maxlen - hashtable->chunks->used) < size) + { + /* allocate new chunk and put it at the beginning of the list */ + newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt, + HASH_CHUNK_HEADER_SIZE + HASH_CHUNK_SIZE); + + newChunk->maxlen = HASH_CHUNK_SIZE; + newChunk->used = size; + newChunk->ntuples = 1; + + newChunk->next.unshared = hashtable->chunks; + hashtable->chunks = newChunk; + + return HASH_CHUNK_DATA(newChunk); + } + + /* There is enough space in the current chunk, let's add the tuple */ + ptr = HASH_CHUNK_DATA(hashtable->chunks) + hashtable->chunks->used; + hashtable->chunks->used += size; + hashtable->chunks->ntuples += 1; + + /* return pointer to the start of the tuple memory */ + return ptr; +} + +/* + * Allocate space for a tuple in shared dense storage. This is equivalent to + * dense_alloc but for Parallel Hash using shared memory. + * + * While loading a tuple into shared memory, we might run out of memory and + * decide to repartition, or determine that the load factor is too high and + * decide to expand the bucket array, or discover that another participant has + * commanded us to help do that. Return NULL if number of buckets or batches + * has changed, indicating that the caller must retry (considering the + * possibility that the tuple no longer belongs in the same batch). 
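+ *
+ * For illustration, ExecParallelHashTableInsert() above handles this with
+ * its "retry:" loop: it recomputes bucketno and batchno from the tuple's
+ * hash value, calls this function, and starts over if NULL is returned,
+ * since the tuple may now belong to a different batch.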
+ */ +static HashJoinTuple +ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, + dsa_pointer *shared) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + dsa_pointer chunk_shared; + HashMemoryChunk chunk; + Size chunk_size; + HashJoinTuple result; + int curbatch = hashtable->curbatch; + + size = MAXALIGN(size); + + /* + * Fast path: if there is enough space in this backend's current chunk, + * then we can allocate without any locking. + */ + chunk = hashtable->current_chunk; + if (chunk != NULL && + size <= HASH_CHUNK_THRESHOLD && + chunk->maxlen - chunk->used >= size) + { + + chunk_shared = hashtable->current_chunk_shared; + Assert(chunk == dsa_get_address(hashtable->area, chunk_shared)); + *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE + chunk->used; + result = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + chunk->used); + chunk->used += size; + + Assert(chunk->used <= chunk->maxlen); + Assert(result == dsa_get_address(hashtable->area, *shared)); + + return result; + } + + /* Slow path: try to allocate a new chunk. */ + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + + /* + * Check if we need to help increase the number of buckets or batches. + */ + if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES || + pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + { + ParallelHashGrowth growth = pstate->growth; + + hashtable->current_chunk = NULL; + LWLockRelease(&pstate->lock); + + /* Another participant has commanded us to help grow. */ + if (growth == PHJ_GROWTH_NEED_MORE_BATCHES) + ExecParallelHashIncreaseNumBatches(hashtable); + else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + ExecParallelHashIncreaseNumBuckets(hashtable); + + /* The caller must retry. */ + return NULL; + } + + /* Oversized tuples get their own chunk. */ + if (size > HASH_CHUNK_THRESHOLD) + chunk_size = size + HASH_CHUNK_HEADER_SIZE; + else + chunk_size = HASH_CHUNK_SIZE; + + /* Check if it's time to grow batches or buckets. */ + if (pstate->growth != PHJ_GROWTH_DISABLED) + { + Assert(curbatch == 0); + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + + /* + * Check if our space limit would be exceeded. To avoid choking on + * very large tuples or very low hash_mem setting, we'll always allow + * each backend to allocate at least one chunk. + */ + if (hashtable->batches[0].at_least_one_chunk && + hashtable->batches[0].shared->size + + chunk_size > pstate->space_allowed) + { + pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; + hashtable->batches[0].shared->space_exhausted = true; + LWLockRelease(&pstate->lock); + + return NULL; + } + + /* Check if our load factor limit would be exceeded. */ + if (hashtable->nbatch == 1) + { + hashtable->batches[0].shared->ntuples += hashtable->batches[0].ntuples; + hashtable->batches[0].ntuples = 0; + /* Guard against integer overflow and alloc size overflow */ + if (hashtable->batches[0].shared->ntuples + 1 > + hashtable->nbuckets * NTUP_PER_BUCKET && + hashtable->nbuckets < (INT_MAX / 2) && + hashtable->nbuckets * 2 <= + MaxAllocSize / sizeof(dsa_pointer_atomic)) + { + pstate->growth = PHJ_GROWTH_NEED_MORE_BUCKETS; + LWLockRelease(&pstate->lock); + + return NULL; + } + } + } + + /* We are cleared to allocate a new chunk. */ + chunk_shared = dsa_allocate(hashtable->area, chunk_size); + hashtable->batches[curbatch].shared->size += chunk_size; + hashtable->batches[curbatch].at_least_one_chunk = true; + + /* Set up the chunk. 
*/ + chunk = (HashMemoryChunk) dsa_get_address(hashtable->area, chunk_shared); + *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE; + chunk->maxlen = chunk_size - HASH_CHUNK_HEADER_SIZE; + chunk->used = size; + + /* + * Push it onto the list of chunks, so that it can be found if we need to + * increase the number of buckets or batches (batch 0 only) and later for + * freeing the memory (all batches). + */ + chunk->next.shared = hashtable->batches[curbatch].shared->chunks; + hashtable->batches[curbatch].shared->chunks = chunk_shared; + + if (size <= HASH_CHUNK_THRESHOLD) + { + /* + * Make this the current chunk so that we can use the fast path to + * fill the rest of it up in future calls. + */ + hashtable->current_chunk = chunk; + hashtable->current_chunk_shared = chunk_shared; + } + LWLockRelease(&pstate->lock); + + Assert(HASH_CHUNK_DATA(chunk) == dsa_get_address(hashtable->area, *shared)); + result = (HashJoinTuple) HASH_CHUNK_DATA(chunk); + + return result; +} + +/* + * One backend needs to set up the shared batch state including tuplestores. + * Other backends will ensure they have correctly configured accessors by + * called ExecParallelHashEnsureBatchAccessors(). + */ +static void +ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + ParallelHashJoinBatch *batches; + MemoryContext oldcxt; + int i; + + Assert(hashtable->batches == NULL); + + /* Allocate space. */ + pstate->batches = + dsa_allocate0(hashtable->area, + EstimateParallelHashJoinBatch(hashtable) * nbatch); + pstate->nbatch = nbatch; + batches = dsa_get_address(hashtable->area, pstate->batches); + + /* Use hash join memory context. */ + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + /* Allocate this backend's accessor array. */ + hashtable->nbatch = nbatch; + hashtable->batches = (ParallelHashJoinBatchAccessor *) + palloc0(sizeof(ParallelHashJoinBatchAccessor) * hashtable->nbatch); + + /* Set up the shared state, tuplestores and backend-local accessors. */ + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i]; + ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i); + char name[MAXPGPATH]; + + /* + * All members of shared were zero-initialized. We just need to set + * up the Barrier. + */ + BarrierInit(&shared->batch_barrier, 0); + if (i == 0) + { + /* Batch 0 doesn't need to be loaded. */ + BarrierAttach(&shared->batch_barrier); + while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_PROBING) + BarrierArriveAndWait(&shared->batch_barrier, 0); + BarrierDetach(&shared->batch_barrier); + } + + /* Initialize accessor state. All members were zero-initialized. */ + accessor->shared = shared; + + /* Initialize the shared tuplestores. */ + snprintf(name, sizeof(name), "i%dof%d", i, hashtable->nbatch); + accessor->inner_tuples = + sts_initialize(ParallelHashJoinBatchInner(shared), + pstate->nparticipants, + ParallelWorkerNumber + 1, + sizeof(uint32), + SHARED_TUPLESTORE_SINGLE_PASS, + &pstate->fileset, + name); + snprintf(name, sizeof(name), "o%dof%d", i, hashtable->nbatch); + accessor->outer_tuples = + sts_initialize(ParallelHashJoinBatchOuter(shared, + pstate->nparticipants), + pstate->nparticipants, + ParallelWorkerNumber + 1, + sizeof(uint32), + SHARED_TUPLESTORE_SINGLE_PASS, + &pstate->fileset, + name); + } + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Free the current set of ParallelHashJoinBatchAccessor objects. 
+ */ +static void +ExecParallelHashCloseBatchAccessors(HashJoinTable hashtable) +{ + int i; + + for (i = 0; i < hashtable->nbatch; ++i) + { + /* Make sure no files are left open. */ + sts_end_write(hashtable->batches[i].inner_tuples); + sts_end_write(hashtable->batches[i].outer_tuples); + sts_end_parallel_scan(hashtable->batches[i].inner_tuples); + sts_end_parallel_scan(hashtable->batches[i].outer_tuples); + } + pfree(hashtable->batches); + hashtable->batches = NULL; +} + +/* + * Make sure this backend has up-to-date accessors for the current set of + * batches. + */ +static void +ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + ParallelHashJoinBatch *batches; + MemoryContext oldcxt; + int i; + + if (hashtable->batches != NULL) + { + if (hashtable->nbatch == pstate->nbatch) + return; + ExecParallelHashCloseBatchAccessors(hashtable); + } + + /* + * It's possible for a backend to start up very late so that the whole + * join is finished and the shm state for tracking batches has already + * been freed by ExecHashTableDetach(). In that case we'll just leave + * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives + * up early. + */ + if (!DsaPointerIsValid(pstate->batches)) + return; + + /* Use hash join memory context. */ + oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); + + /* Allocate this backend's accessor array. */ + hashtable->nbatch = pstate->nbatch; + hashtable->batches = (ParallelHashJoinBatchAccessor *) + palloc0(sizeof(ParallelHashJoinBatchAccessor) * hashtable->nbatch); + + /* Find the base of the pseudo-array of ParallelHashJoinBatch objects. */ + batches = (ParallelHashJoinBatch *) + dsa_get_address(hashtable->area, pstate->batches); + + /* Set up the accessor array and attach to the tuplestores. */ + for (i = 0; i < hashtable->nbatch; ++i) + { + ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i]; + ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i); + + accessor->shared = shared; + accessor->preallocated = 0; + accessor->done = false; + accessor->inner_tuples = + sts_attach(ParallelHashJoinBatchInner(shared), + ParallelWorkerNumber + 1, + &pstate->fileset); + accessor->outer_tuples = + sts_attach(ParallelHashJoinBatchOuter(shared, + pstate->nparticipants), + ParallelWorkerNumber + 1, + &pstate->fileset); + } + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Allocate an empty shared memory hash table for a given batch. + */ +void +ExecParallelHashTableAlloc(HashJoinTable hashtable, int batchno) +{ + ParallelHashJoinBatch *batch = hashtable->batches[batchno].shared; + dsa_pointer_atomic *buckets; + int nbuckets = hashtable->parallel_state->nbuckets; + int i; + + batch->buckets = + dsa_allocate(hashtable->area, sizeof(dsa_pointer_atomic) * nbuckets); + buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, batch->buckets); + for (i = 0; i < nbuckets; ++i) + dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer); +} + +/* + * If we are currently attached to a shared hash join batch, detach. If we + * are last to detach, clean up. + */ +void +ExecHashTableDetachBatch(HashJoinTable hashtable) +{ + if (hashtable->parallel_state != NULL && + hashtable->curbatch >= 0) + { + int curbatch = hashtable->curbatch; + ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared; + + /* Make sure any temporary files are closed. 
*/ + sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples); + sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples); + + /* Detach from the batch we were last working on. */ + if (BarrierArriveAndDetach(&batch->batch_barrier)) + { + /* + * Technically we shouldn't access the barrier because we're no + * longer attached, but since there is no way it's moving after + * this point it seems safe to make the following assertion. + */ + Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_DONE); + + /* Free shared chunks and buckets. */ + while (DsaPointerIsValid(batch->chunks)) + { + HashMemoryChunk chunk = + dsa_get_address(hashtable->area, batch->chunks); + dsa_pointer next = chunk->next.shared; + + dsa_free(hashtable->area, batch->chunks); + batch->chunks = next; + } + if (DsaPointerIsValid(batch->buckets)) + { + dsa_free(hashtable->area, batch->buckets); + batch->buckets = InvalidDsaPointer; + } + } + + /* + * Track the largest batch we've been attached to. Though each + * backend might see a different subset of batches, explain.c will + * scan the results from all backends to find the largest value. + */ + hashtable->spacePeak = + Max(hashtable->spacePeak, + batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets); + + /* Remember that we are not attached to a batch. */ + hashtable->curbatch = -1; + } +} + +/* + * Detach from all shared resources. If we are last to detach, clean up. + */ +void +ExecHashTableDetach(HashJoinTable hashtable) +{ + if (hashtable->parallel_state) + { + ParallelHashJoinState *pstate = hashtable->parallel_state; + int i; + + /* Make sure any temporary files are closed. */ + if (hashtable->batches) + { + for (i = 0; i < hashtable->nbatch; ++i) + { + sts_end_write(hashtable->batches[i].inner_tuples); + sts_end_write(hashtable->batches[i].outer_tuples); + sts_end_parallel_scan(hashtable->batches[i].inner_tuples); + sts_end_parallel_scan(hashtable->batches[i].outer_tuples); + } + } + + /* If we're last to detach, clean up shared memory. */ + if (BarrierDetach(&pstate->build_barrier)) + { + if (DsaPointerIsValid(pstate->batches)) + { + dsa_free(hashtable->area, pstate->batches); + pstate->batches = InvalidDsaPointer; + } + } + + hashtable->parallel_state = NULL; + } +} + +/* + * Get the first tuple in a given bucket identified by number. + */ +static inline HashJoinTuple +ExecParallelHashFirstTuple(HashJoinTable hashtable, int bucketno) +{ + HashJoinTuple tuple; + dsa_pointer p; + + Assert(hashtable->parallel_state); + p = dsa_pointer_atomic_read(&hashtable->buckets.shared[bucketno]); + tuple = (HashJoinTuple) dsa_get_address(hashtable->area, p); + + return tuple; +} + +/* + * Get the next tuple in the same bucket as 'tuple'. + */ +static inline HashJoinTuple +ExecParallelHashNextTuple(HashJoinTable hashtable, HashJoinTuple tuple) +{ + HashJoinTuple next; + + Assert(hashtable->parallel_state); + next = (HashJoinTuple) dsa_get_address(hashtable->area, tuple->next.shared); + + return next; +} + +/* + * Insert a tuple at the front of a chain of tuples in DSA memory atomically. + */ +static inline void +ExecParallelHashPushTuple(dsa_pointer_atomic *head, + HashJoinTuple tuple, + dsa_pointer tuple_shared) +{ + for (;;) + { + tuple->next.shared = dsa_pointer_atomic_read(head); + if (dsa_pointer_atomic_compare_exchange(head, + &tuple->next.shared, + tuple_shared)) + break; + } +} + +/* + * Prepare to work on a given batch. 
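/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * retry-until-compare-exchange-succeeds insertion that
 * ExecParallelHashPushTuple() above performs on a bucket head, shown here
 * with C11 <stdatomic.h> and ordinary pointers instead of
 * dsa_pointer_atomic_* operations and DSA offsets.  The node type and the
 * values are made up for the example.
 */
#include <stdatomic.h>
#include <stdio.h>

struct node
{
	int			value;
	struct node *next;
};

/*
 * Push a node onto the front of a lock-free list.  If another thread changes
 * the head between our read and our compare-and-swap, the CAS fails, 'old'
 * is updated to the new head, and we retry with a fixed-up next pointer.
 */
static void
push(_Atomic(struct node *) *head, struct node *n)
{
	struct node *old = atomic_load(head);

	do
		n->next = old;
	while (!atomic_compare_exchange_weak(head, &old, n));
}

int
main(void)
{
	static _Atomic(struct node *) head;
	static struct node a = {1, NULL};
	static struct node b = {2, NULL};

	push(&head, &a);
	push(&head, &b);
	for (struct node *p = atomic_load(&head); p != NULL; p = p->next)
		printf("%d\n", p->value);	/* prints 2 then 1 */
	return 0;
}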
+ */ +void +ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno) +{ + Assert(hashtable->batches[batchno].shared->buckets != InvalidDsaPointer); + + hashtable->curbatch = batchno; + hashtable->buckets.shared = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, + hashtable->batches[batchno].shared->buckets); + hashtable->nbuckets = hashtable->parallel_state->nbuckets; + hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->current_chunk = NULL; + hashtable->current_chunk_shared = InvalidDsaPointer; + hashtable->batches[batchno].at_least_one_chunk = false; +} + +/* + * Take the next available chunk from the queue of chunks being worked on in + * parallel. Return NULL if there are none left. Otherwise return a pointer + * to the chunk, and set *shared to the DSA pointer to the chunk. + */ +static HashMemoryChunk +ExecParallelHashPopChunkQueue(HashJoinTable hashtable, dsa_pointer *shared) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + HashMemoryChunk chunk; + + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + if (DsaPointerIsValid(pstate->chunk_work_queue)) + { + *shared = pstate->chunk_work_queue; + chunk = (HashMemoryChunk) + dsa_get_address(hashtable->area, *shared); + pstate->chunk_work_queue = chunk->next.shared; + } + else + chunk = NULL; + LWLockRelease(&pstate->lock); + + return chunk; +} + +/* + * Increase the space preallocated in this backend for a given inner batch by + * at least a given amount. This allows us to track whether a given batch + * would fit in memory when loaded back in. Also increase the number of + * batches or buckets if required. + * + * This maintains a running estimation of how much space will be taken when we + * load the batch back into memory by simulating the way chunks will be handed + * out to workers. It's not perfectly accurate because the tuples will be + * packed into memory chunks differently by ExecParallelHashTupleAlloc(), but + * it should be pretty close. It tends to overestimate by a fraction of a + * chunk per worker since all workers gang up to preallocate during hashing, + * but workers tend to reload batches alone if there are enough to go around, + * leaving fewer partially filled chunks. This effect is bounded by + * nparticipants. + * + * Return false if the number of batches or buckets has changed, and the + * caller should reconsider which batch a given tuple now belongs in and call + * again. + */ +static bool +ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size) +{ + ParallelHashJoinState *pstate = hashtable->parallel_state; + ParallelHashJoinBatchAccessor *batch = &hashtable->batches[batchno]; + size_t want = Max(size, HASH_CHUNK_SIZE - HASH_CHUNK_HEADER_SIZE); + + Assert(batchno > 0); + Assert(batchno < hashtable->nbatch); + Assert(size == MAXALIGN(size)); + + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + + /* Has another participant commanded us to help grow? 
*/ + if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES || + pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + { + ParallelHashGrowth growth = pstate->growth; + + LWLockRelease(&pstate->lock); + if (growth == PHJ_GROWTH_NEED_MORE_BATCHES) + ExecParallelHashIncreaseNumBatches(hashtable); + else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + ExecParallelHashIncreaseNumBuckets(hashtable); + + return false; + } + + if (pstate->growth != PHJ_GROWTH_DISABLED && + batch->at_least_one_chunk && + (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE + > pstate->space_allowed)) + { + /* + * We have determined that this batch would exceed the space budget if + * loaded into memory. Command all participants to help repartition. + */ + batch->shared->space_exhausted = true; + pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; + LWLockRelease(&pstate->lock); + + return false; + } + + batch->at_least_one_chunk = true; + batch->shared->estimated_size += want + HASH_CHUNK_HEADER_SIZE; + batch->preallocated = want; + LWLockRelease(&pstate->lock); + + return true; +} + +/* + * Calculate the limit on how much memory can be used by Hash and similar + * plan types. This is work_mem times hash_mem_multiplier, and is + * expressed in bytes. + * + * Exported for use by the planner, as well as other hash-like executor + * nodes. This is a rather random place for this, but there is no better + * place. + */ +size_t +get_hash_memory_limit(void) +{ + double mem_limit; + + /* Do initial calculation in double arithmetic */ + mem_limit = (double) work_mem * hash_mem_multiplier * 1024.0; + + /* Clamp in case it doesn't fit in size_t */ + mem_limit = Min(mem_limit, (double) SIZE_MAX); + + return (size_t) mem_limit; +} + +/* + * Convert the hash memory limit to an integer number of kilobytes, + * that is something comparable to work_mem. Like work_mem, we clamp + * the result to ensure that multiplying it by 1024 fits in a long int. + * + * This is deprecated since it may understate the actual memory limit. + * It is unused in core and will eventually be removed. + */ +int +get_hash_mem(void) +{ + size_t mem_limit = get_hash_memory_limit(); + + /* Remove the kilobyte factor */ + mem_limit /= 1024; + + /* Clamp to MAX_KILOBYTES, like work_mem */ + mem_limit = Min(mem_limit, (size_t) MAX_KILOBYTES); + + return (int) mem_limit; +} diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c new file mode 100644 index 0000000..510bdd3 --- /dev/null +++ b/src/backend/executor/nodeHashjoin.c @@ -0,0 +1,1551 @@ +/*------------------------------------------------------------------------- + * + * nodeHashjoin.c + * Routines to handle hash join nodes + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeHashjoin.c + * + * PARALLELISM + * + * Hash joins can participate in parallel query execution in several ways. A + * parallel-oblivious hash join is one where the node is unaware that it is + * part of a parallel plan. In this case, a copy of the inner plan is used to + * build a copy of the hash table in every backend, and the outer plan could + * either be built from a partial or complete path, so that the results of the + * hash join are correspondingly either partial or complete. A parallel-aware + * hash join is one that behaves differently, coordinating work between + * backends, and appears as Parallel Hash Join in EXPLAIN output. 
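/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * arithmetic performed by get_hash_memory_limit() and get_hash_mem() above.
 * work_mem is a kilobyte setting, so the byte limit is
 * work_mem * hash_mem_multiplier * 1024, computed in double arithmetic and
 * clamped to SIZE_MAX; converting back to kilobytes clamps again so the
 * result stays comparable to work_mem (MAX_KILOBYTES is approximated here
 * as INT_MAX / 1024).  The sample settings below are arbitrary example
 * values, not server defaults.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static size_t
hash_memory_limit_bytes(int work_mem_kb, double hash_mem_multiplier)
{
	double		mem_limit = (double) work_mem_kb * hash_mem_multiplier * 1024.0;

	/* Clamp in case it doesn't fit in size_t */
	if (mem_limit > (double) SIZE_MAX)
		mem_limit = (double) SIZE_MAX;
	return (size_t) mem_limit;
}

int
main(void)
{
	size_t		bytes = hash_memory_limit_bytes(4096, 2.0);	/* 4MB x 2.0 */
	size_t		kilobytes = bytes / 1024;

	/* Clamp the kilobyte figure, as the deprecated get_hash_mem() does */
	if (kilobytes > (size_t) (INT_MAX / 1024))
		kilobytes = (size_t) (INT_MAX / 1024);

	printf("%zu bytes (%zu kB)\n", bytes, kilobytes);	/* 8388608 bytes, 8192 kB */
	return 0;
}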
A Parallel + * Hash Join always appears with a Parallel Hash node. + * + * Parallel-aware hash joins use the same per-backend state machine to track + * progress through the hash join algorithm as parallel-oblivious hash joins. + * In a parallel-aware hash join, there is also a shared state machine that + * co-operating backends use to synchronize their local state machines and + * program counters. The shared state machine is managed with a Barrier IPC + * primitive. When all attached participants arrive at a barrier, the phase + * advances and all waiting participants are released. + * + * When a participant begins working on a parallel hash join, it must first + * figure out how much progress has already been made, because participants + * don't wait for each other to begin. For this reason there are switch + * statements at key points in the code where we have to synchronize our local + * state machine with the phase, and then jump to the correct part of the + * algorithm so that we can get started. + * + * One barrier called build_barrier is used to coordinate the hashing phases. + * The phase is represented by an integer which begins at zero and increments + * one by one, but in the code it is referred to by symbolic names as follows: + * + * PHJ_BUILD_ELECTING -- initial state + * PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0 + * PHJ_BUILD_HASHING_INNER -- all hash the inner rel + * PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer + * PHJ_BUILD_DONE -- building done, probing can begin + * + * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may + * be used repeatedly as required to coordinate expansions in the number of + * batches or buckets. Their phases are as follows: + * + * PHJ_GROW_BATCHES_ELECTING -- initial state + * PHJ_GROW_BATCHES_ALLOCATING -- one allocates new batches + * PHJ_GROW_BATCHES_REPARTITIONING -- all repartition + * PHJ_GROW_BATCHES_FINISHING -- one cleans up, detects skew + * + * PHJ_GROW_BUCKETS_ELECTING -- initial state + * PHJ_GROW_BUCKETS_ALLOCATING -- one allocates new buckets + * PHJ_GROW_BUCKETS_REINSERTING -- all insert tuples + * + * If the planner got the number of batches and buckets right, those won't be + * necessary, but on the other hand we might finish up needing to expand the + * buckets or batches multiple times while hashing the inner relation to stay + * within our memory budget and load factor target. For that reason it's a + * separate pair of barriers using circular phases. + * + * The PHJ_BUILD_HASHING_OUTER phase is required only for multi-batch joins, + * because we need to divide the outer relation into batches up front in order + * to be able to process batches entirely independently. In contrast, the + * parallel-oblivious algorithm simply throws tuples 'forward' to 'later' + * batches whenever it encounters them while scanning and probing, which it + * can do because it processes batches in serial order. + * + * Once PHJ_BUILD_DONE is reached, backends then split up and process + * different batches, or gang up and work together on probing batches if there + * aren't enough to go around. 
For each batch there is a separate barrier + * with the following phases: + * + * PHJ_BATCH_ELECTING -- initial state + * PHJ_BATCH_ALLOCATING -- one allocates buckets + * PHJ_BATCH_LOADING -- all load the hash table from disk + * PHJ_BATCH_PROBING -- all probe + * PHJ_BATCH_DONE -- end + * + * Batch 0 is a special case, because it starts out in phase + * PHJ_BATCH_PROBING; populating batch 0's hash table is done during + * PHJ_BUILD_HASHING_INNER so we can skip loading. + * + * Initially we try to plan for a single-batch hash join using the combined + * hash_mem of all participants to create a large shared hash table. If that + * turns out either at planning or execution time to be impossible then we + * fall back to regular hash_mem sized hash tables. + * + * To avoid deadlocks, we never wait for any barrier unless it is known that + * all other backends attached to it are actively executing the node or have + * already arrived. Practically, that means that we never return a tuple + * while attached to a barrier, unless the barrier has reached its final + * state. In the slightly special case of the per-batch barrier, we return + * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use + * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/parallel.h" +#include "executor/executor.h" +#include "executor/hashjoin.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/sharedtuplestore.h" + + +/* + * States of the ExecHashJoin state machine + */ +#define HJ_BUILD_HASHTABLE 1 +#define HJ_NEED_NEW_OUTER 2 +#define HJ_SCAN_BUCKET 3 +#define HJ_FILL_OUTER_TUPLE 4 +#define HJ_FILL_INNER_TUPLES 5 +#define HJ_NEED_NEW_BATCH 6 + +/* Returns true if doing null-fill on outer relation */ +#define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL) +/* Returns true if doing null-fill on inner relation */ +#define HJ_FILL_INNER(hjstate) ((hjstate)->hj_NullOuterTupleSlot != NULL) + +static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue); +static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue); +static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, + BufFile *file, + uint32 *hashvalue, + TupleTableSlot *tupleSlot); +static bool ExecHashJoinNewBatch(HashJoinState *hjstate); +static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate); +static void ExecParallelHashJoinPartitionOuter(HashJoinState *node); + + +/* ---------------------------------------------------------------- + * ExecHashJoinImpl + * + * This function implements the Hybrid Hashjoin algorithm. It is marked + * with an always-inline attribute so that ExecHashJoin() and + * ExecParallelHashJoin() can inline it. Compilers that respect the + * attribute should create versions specialized for parallel == true and + * parallel == false with unnecessary branches removed. + * + * Note: the relation we build hash table on is the "inner" + * the other one is "outer". 
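/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * "synchronize your local state machine with the shared phase, then jump in"
 * pattern described above.  A participant looks at whatever phase has
 * already been reached and falls through the remaining steps of a switch,
 * as the real code does with BarrierAttach() in
 * ExecParallelHashJoinNewBatch().  The phase names, the work functions and
 * the absence of any real barrier or second process are all simplifications
 * for the example.
 */
#include <stdio.h>

enum batch_phase
{
	PHASE_ELECT = 0,			/* nothing set up yet */
	PHASE_LOAD,					/* buckets exist, tuples still being loaded */
	PHASE_PROBE					/* hash table complete, probing under way */
};

static void allocate_buckets(void) { puts("allocate buckets"); }
static void load_hash_table(void)  { puts("load hash table"); }
static void probe(void)            { puts("probe"); }

/* Perform only the work that is still outstanding for the observed phase. */
static void
join_batch_at(enum batch_phase phase)
{
	switch (phase)
	{
		case PHASE_ELECT:
			/* in the real code one elected backend does this while the
			 * others wait at the barrier */
			allocate_buckets();
			/* FALL THRU */
		case PHASE_LOAD:
			load_hash_table();
			/* FALL THRU */
		case PHASE_PROBE:
			probe();
			break;
	}
}

int
main(void)
{
	join_batch_at(PHASE_LOAD);	/* a late arrival skips election/allocation */
	return 0;
}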
+ * ---------------------------------------------------------------- + */ +static pg_attribute_always_inline TupleTableSlot * +ExecHashJoinImpl(PlanState *pstate, bool parallel) +{ + HashJoinState *node = castNode(HashJoinState, pstate); + PlanState *outerNode; + HashState *hashNode; + ExprState *joinqual; + ExprState *otherqual; + ExprContext *econtext; + HashJoinTable hashtable; + TupleTableSlot *outerTupleSlot; + uint32 hashvalue; + int batchno; + ParallelHashJoinState *parallel_state; + + /* + * get information from HashJoin node + */ + joinqual = node->js.joinqual; + otherqual = node->js.ps.qual; + hashNode = (HashState *) innerPlanState(node); + outerNode = outerPlanState(node); + hashtable = node->hj_HashTable; + econtext = node->js.ps.ps_ExprContext; + parallel_state = hashNode->parallel_state; + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * run the hash join state machine + */ + for (;;) + { + /* + * It's possible to iterate this loop many times before returning a + * tuple, in some pathological cases such as needing to move much of + * the current batch to a later batch. So let's check for interrupts + * each time through. + */ + CHECK_FOR_INTERRUPTS(); + + switch (node->hj_JoinState) + { + case HJ_BUILD_HASHTABLE: + + /* + * First time through: build hash table for inner relation. + */ + Assert(hashtable == NULL); + + /* + * If the outer relation is completely empty, and it's not + * right/full join, we can quit without building the hash + * table. However, for an inner join it is only a win to + * check this when the outer relation's startup cost is less + * than the projected cost of building the hash table. + * Otherwise it's best to build the hash table first and see + * if the inner relation is empty. (When it's a left join, we + * should always make this check, since we aren't going to be + * able to skip the join on the strength of an empty inner + * relation anyway.) + * + * If we are rescanning the join, we make use of information + * gained on the previous scan: don't bother to try the + * prefetch if the previous scan found the outer relation + * nonempty. This is not 100% reliable since with new + * parameters the outer relation might yield different + * results, but it's a good heuristic. + * + * The only way to make the check is to try to fetch a tuple + * from the outer plan node. If we succeed, we have to stash + * it away for later consumption by ExecHashJoinOuterGetTuple. + */ + if (HJ_FILL_INNER(node)) + { + /* no chance to not build the hash table */ + node->hj_FirstOuterTupleSlot = NULL; + } + else if (parallel) + { + /* + * The empty-outer optimization is not implemented for + * shared hash tables, because no one participant can + * determine that there are no outer tuples, and it's not + * yet clear that it's worth the synchronization overhead + * of reaching consensus to figure that out. So we have + * to build the hash table. + */ + node->hj_FirstOuterTupleSlot = NULL; + } + else if (HJ_FILL_OUTER(node) || + (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost && + !node->hj_OuterNotEmpty)) + { + node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode); + if (TupIsNull(node->hj_FirstOuterTupleSlot)) + { + node->hj_OuterNotEmpty = false; + return NULL; + } + else + node->hj_OuterNotEmpty = true; + } + else + node->hj_FirstOuterTupleSlot = NULL; + + /* + * Create the hash table. 
If using Parallel Hash, then + * whoever gets here first will create the hash table and any + * later arrivals will merely attach to it. + */ + hashtable = ExecHashTableCreate(hashNode, + node->hj_HashOperators, + node->hj_Collations, + HJ_FILL_INNER(node)); + node->hj_HashTable = hashtable; + + /* + * Execute the Hash node, to build the hash table. If using + * Parallel Hash, then we'll try to help hashing unless we + * arrived too late. + */ + hashNode->hashtable = hashtable; + (void) MultiExecProcNode((PlanState *) hashNode); + + /* + * If the inner relation is completely empty, and we're not + * doing a left outer join, we can quit without scanning the + * outer relation. + */ + if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node)) + return NULL; + + /* + * need to remember whether nbatch has increased since we + * began scanning the outer relation + */ + hashtable->nbatch_outstart = hashtable->nbatch; + + /* + * Reset OuterNotEmpty for scan. (It's OK if we fetched a + * tuple above, because ExecHashJoinOuterGetTuple will + * immediately set it again.) + */ + node->hj_OuterNotEmpty = false; + + if (parallel) + { + Barrier *build_barrier; + + build_barrier = ¶llel_state->build_barrier; + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || + BarrierPhase(build_barrier) == PHJ_BUILD_DONE); + if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER) + { + /* + * If multi-batch, we need to hash the outer relation + * up front. + */ + if (hashtable->nbatch > 1) + ExecParallelHashJoinPartitionOuter(node); + BarrierArriveAndWait(build_barrier, + WAIT_EVENT_HASH_BUILD_HASH_OUTER); + } + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE); + + /* Each backend should now select a batch to work on. */ + hashtable->curbatch = -1; + node->hj_JoinState = HJ_NEED_NEW_BATCH; + + continue; + } + else + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + /* FALL THRU */ + + case HJ_NEED_NEW_OUTER: + + /* + * We don't have an outer tuple, try to get the next one + */ + if (parallel) + outerTupleSlot = + ExecParallelHashJoinOuterGetTuple(outerNode, node, + &hashvalue); + else + outerTupleSlot = + ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue); + + if (TupIsNull(outerTupleSlot)) + { + /* end of batch, or maybe whole join */ + if (HJ_FILL_INNER(node)) + { + /* set up to scan for unmatched inner tuples */ + ExecPrepHashTableForUnmatched(node); + node->hj_JoinState = HJ_FILL_INNER_TUPLES; + } + else + node->hj_JoinState = HJ_NEED_NEW_BATCH; + continue; + } + + econtext->ecxt_outertuple = outerTupleSlot; + node->hj_MatchedOuter = false; + + /* + * Find the corresponding bucket for this tuple in the main + * hash table or skew hash table. + */ + node->hj_CurHashValue = hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &node->hj_CurBucketNo, &batchno); + node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, + hashvalue); + node->hj_CurTuple = NULL; + + /* + * The tuple might not belong to the current batch (where + * "current batch" includes the skew buckets if any). + */ + if (batchno != hashtable->curbatch && + node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) + { + bool shouldFree; + MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, + &shouldFree); + + /* + * Need to postpone this outer tuple to a later batch. + * Save it in the corresponding outer-batch file. 
+ */ + Assert(parallel_state == NULL); + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(mintuple, hashvalue, + &hashtable->outerBatchFile[batchno]); + + if (shouldFree) + heap_free_minimal_tuple(mintuple); + + /* Loop around, staying in HJ_NEED_NEW_OUTER state */ + continue; + } + + /* OK, let's scan the bucket for matches */ + node->hj_JoinState = HJ_SCAN_BUCKET; + + /* FALL THRU */ + + case HJ_SCAN_BUCKET: + + /* + * Scan the selected hash bucket for matches to current outer + */ + if (parallel) + { + if (!ExecParallelScanHashBucket(node, econtext)) + { + /* out of matches; check for possible outer-join fill */ + node->hj_JoinState = HJ_FILL_OUTER_TUPLE; + continue; + } + } + else + { + if (!ExecScanHashBucket(node, econtext)) + { + /* out of matches; check for possible outer-join fill */ + node->hj_JoinState = HJ_FILL_OUTER_TUPLE; + continue; + } + } + + /* + * We've got a match, but still need to test non-hashed quals. + * ExecScanHashBucket already set up all the state needed to + * call ExecQual. + * + * If we pass the qual, then save state for next call and have + * ExecProject form the projection, store it in the tuple + * table, and return the slot. + * + * Only the joinquals determine tuple match status, but all + * quals must pass to actually return the tuple. + */ + if (joinqual == NULL || ExecQual(joinqual, econtext)) + { + node->hj_MatchedOuter = true; + + if (parallel) + { + /* + * Full/right outer joins are currently not supported + * for parallel joins, so we don't need to set the + * match bit. Experiments show that it's worth + * avoiding the shared memory traffic on large + * systems. + */ + Assert(!HJ_FILL_INNER(node)); + } + else + { + /* + * This is really only needed if HJ_FILL_INNER(node), + * but we'll avoid the branch and just set it always. + */ + HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); + } + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->hj_JoinState = HJ_NEED_NEW_OUTER; + continue; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + break; + + case HJ_FILL_OUTER_TUPLE: + + /* + * The current outer tuple has run out of matches, so check + * whether to emit a dummy outer-join tuple. Whether we emit + * one or not, the next state is NEED_NEW_OUTER. + */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + if (!node->hj_MatchedOuter && + HJ_FILL_OUTER(node)) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + else + InstrCountFiltered2(node, 1); + } + break; + + case HJ_FILL_INNER_TUPLES: + + /* + * We have finished a batch, but we are doing right/full join, + * so any unmatched inner tuples in the hashtable have to be + * emitted before we continue to the next batch. 
+ */ + if (!ExecScanHashTableForUnmatched(node, econtext)) + { + /* no more unmatched tuples */ + node->hj_JoinState = HJ_NEED_NEW_BATCH; + continue; + } + + /* + * Generate a fake join tuple with nulls for the outer tuple, + * and return it if it passes the non-join quals. + */ + econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + else + InstrCountFiltered2(node, 1); + break; + + case HJ_NEED_NEW_BATCH: + + /* + * Try to advance to next batch. Done if there are no more. + */ + if (parallel) + { + if (!ExecParallelHashJoinNewBatch(node)) + return NULL; /* end of parallel-aware join */ + } + else + { + if (!ExecHashJoinNewBatch(node)) + return NULL; /* end of parallel-oblivious join */ + } + node->hj_JoinState = HJ_NEED_NEW_OUTER; + break; + + default: + elog(ERROR, "unrecognized hashjoin state: %d", + (int) node->hj_JoinState); + } + } +} + +/* ---------------------------------------------------------------- + * ExecHashJoin + * + * Parallel-oblivious version. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecHashJoin(PlanState *pstate) +{ + /* + * On sufficiently smart compilers this should be inlined with the + * parallel-aware branches removed. + */ + return ExecHashJoinImpl(pstate, false); +} + +/* ---------------------------------------------------------------- + * ExecParallelHashJoin + * + * Parallel-aware version. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecParallelHashJoin(PlanState *pstate) +{ + /* + * On sufficiently smart compilers this should be inlined with the + * parallel-oblivious branches removed. + */ + return ExecHashJoinImpl(pstate, true); +} + +/* ---------------------------------------------------------------- + * ExecInitHashJoin + * + * Init routine for HashJoin node. + * ---------------------------------------------------------------- + */ +HashJoinState * +ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) +{ + HashJoinState *hjstate; + Plan *outerNode; + Hash *hashNode; + TupleDesc outerDesc, + innerDesc; + const TupleTableSlotOps *ops; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + hjstate = makeNode(HashJoinState); + hjstate->js.ps.plan = (Plan *) node; + hjstate->js.ps.state = estate; + + /* + * See ExecHashJoinInitializeDSM() and ExecHashJoinInitializeWorker() + * where this function may be replaced with a parallel version, if we + * managed to launch a parallel query. + */ + hjstate->js.ps.ExecProcNode = ExecHashJoin; + hjstate->js.jointype = node->join.jointype; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &hjstate->js.ps); + + /* + * initialize child nodes + * + * Note: we could suppress the REWIND flag for the inner input, which + * would amount to betting that the hash will be a single batch. Not + * clear if this would be a win or not. 
+ */ + outerNode = outerPlan(node); + hashNode = (Hash *) innerPlan(node); + + outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); + outerDesc = ExecGetResultType(outerPlanState(hjstate)); + innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags); + innerDesc = ExecGetResultType(innerPlanState(hjstate)); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&hjstate->js.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&hjstate->js.ps, NULL); + + /* + * tuple table initialization + */ + ops = ExecGetResultSlotOps(outerPlanState(hjstate), NULL); + hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate, outerDesc, + ops); + + /* + * detect whether we need only consider the first matching inner tuple + */ + hjstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + break; + case JOIN_LEFT: + case JOIN_ANTI: + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + break; + case JOIN_RIGHT: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + break; + case JOIN_FULL: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * now for some voodoo. our temporary tuple slot is actually the result + * tuple slot of the Hash node (which is our inner plan). we can do this + * because Hash nodes don't return tuples via ExecProcNode() -- instead + * the hash join node uses ExecScanHashBucket() to get at the contents of + * the hash table. 
-cim 6/9/91 + */ + { + HashState *hashstate = (HashState *) innerPlanState(hjstate); + TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; + + hjstate->hj_HashTupleSlot = slot; + } + + /* + * initialize child expressions + */ + hjstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) hjstate); + hjstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) hjstate); + hjstate->hashclauses = + ExecInitQual(node->hashclauses, (PlanState *) hjstate); + + /* + * initialize hash-specific info + */ + hjstate->hj_HashTable = NULL; + hjstate->hj_FirstOuterTupleSlot = NULL; + + hjstate->hj_CurHashValue = 0; + hjstate->hj_CurBucketNo = 0; + hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; + hjstate->hj_CurTuple = NULL; + + hjstate->hj_OuterHashKeys = ExecInitExprList(node->hashkeys, + (PlanState *) hjstate); + hjstate->hj_HashOperators = node->hashoperators; + hjstate->hj_Collations = node->hashcollations; + + hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; + hjstate->hj_MatchedOuter = false; + hjstate->hj_OuterNotEmpty = false; + + return hjstate; +} + +/* ---------------------------------------------------------------- + * ExecEndHashJoin + * + * clean up routine for HashJoin node + * ---------------------------------------------------------------- + */ +void +ExecEndHashJoin(HashJoinState *node) +{ + /* + * Free hash table + */ + if (node->hj_HashTable) + { + ExecHashTableDestroy(node->hj_HashTable); + node->hj_HashTable = NULL; + } + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->js.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->js.ps.ps_ResultTupleSlot); + ExecClearTuple(node->hj_OuterTupleSlot); + ExecClearTuple(node->hj_HashTupleSlot); + + /* + * clean up subtrees + */ + ExecEndNode(outerPlanState(node)); + ExecEndNode(innerPlanState(node)); +} + +/* + * ExecHashJoinOuterGetTuple + * + * get the next outer tuple for a parallel oblivious hashjoin: either by + * executing the outer plan node in the first pass, or from the temp + * files for the hashjoin batches. + * + * Returns a null slot if no more outer tuples (within the current batch). + * + * On success, the tuple's hash value is stored at *hashvalue --- this is + * either originally computed, or re-read from the temp file. + */ +static TupleTableSlot * +ExecHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int curbatch = hashtable->curbatch; + TupleTableSlot *slot; + + if (curbatch == 0) /* if it is the first pass */ + { + /* + * Check to see if first outer tuple was already fetched by + * ExecHashJoin() and not used yet. + */ + slot = hjstate->hj_FirstOuterTupleSlot; + if (!TupIsNull(slot)) + hjstate->hj_FirstOuterTupleSlot = NULL; + else + slot = ExecProcNode(outerNode); + + while (!TupIsNull(slot)) + { + /* + * We have to compute the tuple's hash value. + */ + ExprContext *econtext = hjstate->js.ps.ps_ExprContext; + + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, + hjstate->hj_OuterHashKeys, + true, /* outer tuple */ + HJ_FILL_OUTER(hjstate), + hashvalue)) + { + /* remember outer relation is not empty for possible rescan */ + hjstate->hj_OuterNotEmpty = true; + + return slot; + } + + /* + * That tuple couldn't match because of a NULL, so discard it and + * continue with the next one. 
+ */ + slot = ExecProcNode(outerNode); + } + } + else if (curbatch < hashtable->nbatch) + { + BufFile *file = hashtable->outerBatchFile[curbatch]; + + /* + * In outer-join cases, we could get here even though the batch file + * is empty. + */ + if (file == NULL) + return NULL; + + slot = ExecHashJoinGetSavedTuple(hjstate, + file, + hashvalue, + hjstate->hj_OuterTupleSlot); + if (!TupIsNull(slot)) + return slot; + } + + /* End of this batch */ + return NULL; +} + +/* + * ExecHashJoinOuterGetTuple variant for the parallel case. + */ +static TupleTableSlot * +ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, + HashJoinState *hjstate, + uint32 *hashvalue) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int curbatch = hashtable->curbatch; + TupleTableSlot *slot; + + /* + * In the Parallel Hash case we only run the outer plan directly for + * single-batch hash joins. Otherwise we have to go to batch files, even + * for batch 0. + */ + if (curbatch == 0 && hashtable->nbatch == 1) + { + slot = ExecProcNode(outerNode); + + while (!TupIsNull(slot)) + { + ExprContext *econtext = hjstate->js.ps.ps_ExprContext; + + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, + hjstate->hj_OuterHashKeys, + true, /* outer tuple */ + HJ_FILL_OUTER(hjstate), + hashvalue)) + return slot; + + /* + * That tuple couldn't match because of a NULL, so discard it and + * continue with the next one. + */ + slot = ExecProcNode(outerNode); + } + } + else if (curbatch < hashtable->nbatch) + { + MinimalTuple tuple; + + tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples, + hashvalue); + if (tuple != NULL) + { + ExecForceStoreMinimalTuple(tuple, + hjstate->hj_OuterTupleSlot, + false); + slot = hjstate->hj_OuterTupleSlot; + return slot; + } + else + ExecClearTuple(hjstate->hj_OuterTupleSlot); + } + + /* End of this batch */ + return NULL; +} + +/* + * ExecHashJoinNewBatch + * switch to a new hashjoin batch + * + * Returns true if successful, false if there are no more batches. + */ +static bool +ExecHashJoinNewBatch(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int nbatch; + int curbatch; + BufFile *innerFile; + TupleTableSlot *slot; + uint32 hashvalue; + + nbatch = hashtable->nbatch; + curbatch = hashtable->curbatch; + + if (curbatch > 0) + { + /* + * We no longer need the previous outer batch file; close it right + * away to free disk space. + */ + if (hashtable->outerBatchFile[curbatch]) + BufFileClose(hashtable->outerBatchFile[curbatch]); + hashtable->outerBatchFile[curbatch] = NULL; + } + else /* we just finished the first batch */ + { + /* + * Reset some of the skew optimization state variables, since we no + * longer need to consider skew tuples after the first batch. The + * memory context reset we are about to do will release the skew + * hashtable itself. + */ + hashtable->skewEnabled = false; + hashtable->skewBucket = NULL; + hashtable->skewBucketNums = NULL; + hashtable->nSkewBuckets = 0; + hashtable->spaceUsedSkew = 0; + } + + /* + * We can always skip over any batches that are completely empty on both + * sides. We can sometimes skip over batches that are empty on only one + * side, but there are exceptions: + * + * 1. In a left/full outer join, we have to process outer batches even if + * the inner batch is empty. Similarly, in a right/full outer join, we + * have to process inner batches even if the outer batch is empty. + * + * 2. 
If we have increased nbatch since the initial estimate, we have to + * scan inner batches since they might contain tuples that need to be + * reassigned to later inner batches. + * + * 3. Similarly, if we have increased nbatch since starting the outer + * scan, we have to rescan outer batches in case they contain tuples that + * need to be reassigned. + */ + curbatch++; + while (curbatch < nbatch && + (hashtable->outerBatchFile[curbatch] == NULL || + hashtable->innerBatchFile[curbatch] == NULL)) + { + if (hashtable->outerBatchFile[curbatch] && + HJ_FILL_OUTER(hjstate)) + break; /* must process due to rule 1 */ + if (hashtable->innerBatchFile[curbatch] && + HJ_FILL_INNER(hjstate)) + break; /* must process due to rule 1 */ + if (hashtable->innerBatchFile[curbatch] && + nbatch != hashtable->nbatch_original) + break; /* must process due to rule 2 */ + if (hashtable->outerBatchFile[curbatch] && + nbatch != hashtable->nbatch_outstart) + break; /* must process due to rule 3 */ + /* We can ignore this batch. */ + /* Release associated temp files right away. */ + if (hashtable->innerBatchFile[curbatch]) + BufFileClose(hashtable->innerBatchFile[curbatch]); + hashtable->innerBatchFile[curbatch] = NULL; + if (hashtable->outerBatchFile[curbatch]) + BufFileClose(hashtable->outerBatchFile[curbatch]); + hashtable->outerBatchFile[curbatch] = NULL; + curbatch++; + } + + if (curbatch >= nbatch) + return false; /* no more batches */ + + hashtable->curbatch = curbatch; + + /* + * Reload the hash table with the new inner batch (which could be empty) + */ + ExecHashTableReset(hashtable); + + innerFile = hashtable->innerBatchFile[curbatch]; + + if (innerFile != NULL) + { + if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file"))); + + while ((slot = ExecHashJoinGetSavedTuple(hjstate, + innerFile, + &hashvalue, + hjstate->hj_HashTupleSlot))) + { + /* + * NOTE: some tuples may be sent to future batches. Also, it is + * possible for hashtable->nbatch to be increased here! + */ + ExecHashTableInsert(hashtable, slot, hashvalue); + } + + /* + * after we build the hash table, the inner batch file is no longer + * needed + */ + BufFileClose(innerFile); + hashtable->innerBatchFile[curbatch] = NULL; + } + + /* + * Rewind outer batch file (if present), so that we can start reading it. + */ + if (hashtable->outerBatchFile[curbatch] != NULL) + { + if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file"))); + } + + return true; +} + +/* + * Choose a batch to work on, and attach to it. Returns true if successful, + * false if there are no more batches. + */ +static bool +ExecParallelHashJoinNewBatch(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int start_batchno; + int batchno; + + /* + * If we started up so late that the batch tracking array has been freed + * already by ExecHashTableDetach(), then we are finished. See also + * ExecParallelHashEnsureBatchAccessors(). + */ + if (hashtable->batches == NULL) + return false; + + /* + * If we were already attached to a batch, remember not to bother checking + * it again, and detach from it (possibly freeing the hash table if we are + * last to detach). + */ + if (hashtable->curbatch >= 0) + { + hashtable->batches[hashtable->curbatch].done = true; + ExecHashTableDetachBatch(hashtable); + } + + /* + * Search for a batch that isn't done. 
We use an atomic counter to start + * our search at a different batch in every participant when there are + * more batches than participants. + */ + batchno = start_batchno = + pg_atomic_fetch_add_u32(&hashtable->parallel_state->distributor, 1) % + hashtable->nbatch; + do + { + uint32 hashvalue; + MinimalTuple tuple; + TupleTableSlot *slot; + + if (!hashtable->batches[batchno].done) + { + SharedTuplestoreAccessor *inner_tuples; + Barrier *batch_barrier = + &hashtable->batches[batchno].shared->batch_barrier; + + switch (BarrierAttach(batch_barrier)) + { + case PHJ_BATCH_ELECTING: + + /* One backend allocates the hash table. */ + if (BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_ELECT)) + ExecParallelHashTableAlloc(hashtable, batchno); + /* Fall through. */ + + case PHJ_BATCH_ALLOCATING: + /* Wait for allocation to complete. */ + BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_ALLOCATE); + /* Fall through. */ + + case PHJ_BATCH_LOADING: + /* Start (or join in) loading tuples. */ + ExecParallelHashTableSetCurrentBatch(hashtable, batchno); + inner_tuples = hashtable->batches[batchno].inner_tuples; + sts_begin_parallel_scan(inner_tuples); + while ((tuple = sts_parallel_scan_next(inner_tuples, + &hashvalue))) + { + ExecForceStoreMinimalTuple(tuple, + hjstate->hj_HashTupleSlot, + false); + slot = hjstate->hj_HashTupleSlot; + ExecParallelHashTableInsertCurrentBatch(hashtable, slot, + hashvalue); + } + sts_end_parallel_scan(inner_tuples); + BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_LOAD); + /* Fall through. */ + + case PHJ_BATCH_PROBING: + + /* + * This batch is ready to probe. Return control to + * caller. We stay attached to batch_barrier so that the + * hash table stays alive until everyone's finished + * probing it, but no participant is allowed to wait at + * this barrier again (or else a deadlock could occur). + * All attached participants must eventually call + * BarrierArriveAndDetach() so that the final phase + * PHJ_BATCH_DONE can be reached. + */ + ExecParallelHashTableSetCurrentBatch(hashtable, batchno); + sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples); + return true; + + case PHJ_BATCH_DONE: + + /* + * Already done. Detach and go around again (if any + * remain). + */ + BarrierDetach(batch_barrier); + hashtable->batches[batchno].done = true; + hashtable->curbatch = -1; + break; + + default: + elog(ERROR, "unexpected batch phase %d", + BarrierPhase(batch_barrier)); + } + } + batchno = (batchno + 1) % hashtable->nbatch; + } while (batchno != start_batchno); + + return false; +} + +/* + * ExecHashJoinSaveTuple + * save a tuple to a batch file. + * + * The data recorded in the file for each tuple is its hash value, + * then the tuple in MinimalTuple format. + * + * Note: it is important always to call this in the regular executor + * context, not in a shorter-lived context; else the temp file buffers + * will get messed up. + */ +void +ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, + BufFile **fileptr) +{ + BufFile *file = *fileptr; + + if (file == NULL) + { + /* First write to this batch file, so open it. */ + file = BufFileCreateTemp(false); + *fileptr = file; + } + + BufFileWrite(file, (void *) &hashvalue, sizeof(uint32)); + BufFileWrite(file, (void *) tuple, tuple->t_len); +} + +/* + * ExecHashJoinGetSavedTuple + * read the next tuple from a batch file. Return NULL if no more. + * + * On success, *hashvalue is set to the tuple's hash value, and the tuple + * itself is stored in the given slot. 
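/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * batch-file record layout used by ExecHashJoinSaveTuple() and
 * ExecHashJoinGetSavedTuple(), demonstrated with stdio instead of BufFile.
 * Each record is a uint32 hash value followed by the tuple image, whose own
 * first uint32 is its total length, so the reader can pick up the hash and
 * the length in a single read and then fetch the remaining
 * length - sizeof(uint32) bytes.  The "tuple" here is just a length word
 * plus a string payload.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
save_record(FILE *f, uint32_t hash, const void *tuple, uint32_t len)
{
	fwrite(&hash, sizeof(hash), 1, f);
	fwrite(tuple, len, 1, f);
}

static void *
load_record(FILE *f, uint32_t *hash)
{
	uint32_t	header[2];		/* hash value, then tuple length word */
	char	   *tuple;

	if (fread(header, sizeof(header), 1, f) != 1)
		return NULL;			/* end of file */
	*hash = header[0];
	tuple = malloc(header[1]);
	memcpy(tuple, &header[1], sizeof(uint32_t));	/* re-install length word */
	if (fread(tuple + sizeof(uint32_t), header[1] - sizeof(uint32_t), 1, f) != 1)
	{
		free(tuple);
		return NULL;			/* truncated record */
	}
	return tuple;
}

int
main(void)
{
	char		buf[32];
	uint32_t	len = sizeof(uint32_t) + 6;		/* length word + "hello\0" */
	uint32_t	hash;
	char	   *tuple;
	FILE	   *f = tmpfile();

	memcpy(buf, &len, sizeof(len));
	memcpy(buf + sizeof(len), "hello", 6);
	save_record(f, 0xdeadbeef, buf, len);
	rewind(f);

	tuple = load_record(f, &hash);
	printf("hash=%08x payload=%s\n", (unsigned) hash, tuple + sizeof(uint32_t));
	free(tuple);
	fclose(f);
	return 0;
}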
+ */ +static TupleTableSlot * +ExecHashJoinGetSavedTuple(HashJoinState *hjstate, + BufFile *file, + uint32 *hashvalue, + TupleTableSlot *tupleSlot) +{ + uint32 header[2]; + size_t nread; + MinimalTuple tuple; + + /* + * We check for interrupts here because this is typically taken as an + * alternative code path to an ExecProcNode() call, which would include + * such a check. + */ + CHECK_FOR_INTERRUPTS(); + + /* + * Since both the hash value and the MinimalTuple length word are uint32, + * we can read them both in one BufFileRead() call without any type + * cheating. + */ + nread = BufFileRead(file, (void *) header, sizeof(header)); + if (nread == 0) /* end of file */ + { + ExecClearTuple(tupleSlot); + return NULL; + } + if (nread != sizeof(header)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes", + nread, sizeof(header)))); + *hashvalue = header[0]; + tuple = (MinimalTuple) palloc(header[1]); + tuple->t_len = header[1]; + nread = BufFileRead(file, + (void *) ((char *) tuple + sizeof(uint32)), + header[1] - sizeof(uint32)); + if (nread != header[1] - sizeof(uint32)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes", + nread, header[1] - sizeof(uint32)))); + ExecForceStoreMinimalTuple(tuple, tupleSlot, true); + return tupleSlot; +} + + +void +ExecReScanHashJoin(HashJoinState *node) +{ + /* + * In a multi-batch join, we currently have to do rescans the hard way, + * primarily because batch temp files may have already been released. But + * if it's a single-batch join, and there is no parameter change for the + * inner subnode, then we can just re-use the existing hash table without + * rebuilding it. + */ + if (node->hj_HashTable != NULL) + { + if (node->hj_HashTable->nbatch == 1 && + node->js.ps.righttree->chgParam == NULL) + { + /* + * Okay to reuse the hash table; needn't rescan inner, either. + * + * However, if it's a right/full join, we'd better reset the + * inner-tuple match flags contained in the table. + */ + if (HJ_FILL_INNER(node)) + ExecHashTableResetMatchFlags(node->hj_HashTable); + + /* + * Also, we need to reset our state about the emptiness of the + * outer relation, so that the new scan of the outer will update + * it correctly if it turns out to be empty this time. (There's no + * harm in clearing it now because ExecHashJoin won't need the + * info. In the other cases, where the hash table doesn't exist + * or we are destroying it, we leave this state alone because + * ExecHashJoin will need it the first time through.) 
+ */ + node->hj_OuterNotEmpty = false; + + /* ExecHashJoin can skip the BUILD_HASHTABLE step */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + } + else + { + /* must destroy and rebuild hash table */ + HashState *hashNode = castNode(HashState, innerPlanState(node)); + + Assert(hashNode->hashtable == node->hj_HashTable); + /* accumulate stats from old hash table, if wanted */ + /* (this should match ExecShutdownHash) */ + if (hashNode->ps.instrument && !hashNode->hinstrument) + hashNode->hinstrument = (HashInstrumentation *) + palloc0(sizeof(HashInstrumentation)); + if (hashNode->hinstrument) + ExecHashAccumInstrumentation(hashNode->hinstrument, + hashNode->hashtable); + /* for safety, be sure to clear child plan node's pointer too */ + hashNode->hashtable = NULL; + + ExecHashTableDestroy(node->hj_HashTable); + node->hj_HashTable = NULL; + node->hj_JoinState = HJ_BUILD_HASHTABLE; + + /* + * if chgParam of subnode is not null then plan will be re-scanned + * by first ExecProcNode. + */ + if (node->js.ps.righttree->chgParam == NULL) + ExecReScan(node->js.ps.righttree); + } + } + + /* Always reset intra-tuple state */ + node->hj_CurHashValue = 0; + node->hj_CurBucketNo = 0; + node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; + node->hj_CurTuple = NULL; + + node->hj_MatchedOuter = false; + node->hj_FirstOuterTupleSlot = NULL; + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->js.ps.lefttree->chgParam == NULL) + ExecReScan(node->js.ps.lefttree); +} + +void +ExecShutdownHashJoin(HashJoinState *node) +{ + if (node->hj_HashTable) + { + /* + * Detach from shared state before DSM memory goes away. This makes + * sure that we don't have any pointers into DSM memory by the time + * ExecEndHashJoin runs. + */ + ExecHashTableDetachBatch(node->hj_HashTable); + ExecHashTableDetach(node->hj_HashTable); + } +} + +static void +ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) +{ + PlanState *outerState = outerPlanState(hjstate); + ExprContext *econtext = hjstate->js.ps.ps_ExprContext; + HashJoinTable hashtable = hjstate->hj_HashTable; + TupleTableSlot *slot; + uint32 hashvalue; + int i; + + Assert(hjstate->hj_FirstOuterTupleSlot == NULL); + + /* Execute outer plan, writing all tuples to shared tuplestores. */ + for (;;) + { + slot = ExecProcNode(outerState); + if (TupIsNull(slot)) + break; + econtext->ecxt_outertuple = slot; + if (ExecHashGetHashValue(hashtable, econtext, + hjstate->hj_OuterHashKeys, + true, /* outer tuple */ + HJ_FILL_OUTER(hjstate), + &hashvalue)) + { + int batchno; + int bucketno; + bool shouldFree; + MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &shouldFree); + + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, + &batchno); + sts_puttuple(hashtable->batches[batchno].outer_tuples, + &hashvalue, mintup); + + if (shouldFree) + heap_free_minimal_tuple(mintup); + } + CHECK_FOR_INTERRUPTS(); + } + + /* Make sure all outer partitions are readable by any backend. 
*/ + for (i = 0; i < hashtable->nbatch; ++i) + sts_end_write(hashtable->batches[i].outer_tuples); +} + +void +ExecHashJoinEstimate(HashJoinState *state, ParallelContext *pcxt) +{ + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(ParallelHashJoinState)); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +void +ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) +{ + int plan_node_id = state->js.ps.plan->plan_node_id; + HashState *hashNode; + ParallelHashJoinState *pstate; + + /* + * Disable shared hash table mode if we failed to create a real DSM + * segment, because that means that we don't have a DSA area to work with. + */ + if (pcxt->seg == NULL) + return; + + ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); + + /* + * Set up the state needed to coordinate access to the shared hash + * table(s), using the plan node ID as the toc key. + */ + pstate = shm_toc_allocate(pcxt->toc, sizeof(ParallelHashJoinState)); + shm_toc_insert(pcxt->toc, plan_node_id, pstate); + + /* + * Set up the shared hash join state with no batches initially. + * ExecHashTableCreate() will prepare at least one later and set nbatch + * and space_allowed. + */ + pstate->nbatch = 0; + pstate->space_allowed = 0; + pstate->batches = InvalidDsaPointer; + pstate->old_batches = InvalidDsaPointer; + pstate->nbuckets = 0; + pstate->growth = PHJ_GROWTH_OK; + pstate->chunk_work_queue = InvalidDsaPointer; + pg_atomic_init_u32(&pstate->distributor, 0); + pstate->nparticipants = pcxt->nworkers + 1; + pstate->total_tuples = 0; + LWLockInitialize(&pstate->lock, + LWTRANCHE_PARALLEL_HASH_JOIN); + BarrierInit(&pstate->build_barrier, 0); + BarrierInit(&pstate->grow_batches_barrier, 0); + BarrierInit(&pstate->grow_buckets_barrier, 0); + + /* Set up the space we'll use for shared temporary files. */ + SharedFileSetInit(&pstate->fileset, pcxt->seg); + + /* Initialize the shared state in the hash node. */ + hashNode = (HashState *) innerPlanState(state); + hashNode->parallel_state = pstate; +} + +/* ---------------------------------------------------------------- + * ExecHashJoinReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecHashJoinReInitializeDSM(HashJoinState *state, ParallelContext *cxt) +{ + int plan_node_id = state->js.ps.plan->plan_node_id; + ParallelHashJoinState *pstate = + shm_toc_lookup(cxt->toc, plan_node_id, false); + + /* + * It would be possible to reuse the shared hash table in single-batch + * cases by resetting and then fast-forwarding build_barrier to + * PHJ_BUILD_DONE and batch 0's batch_barrier to PHJ_BATCH_PROBING, but + * currently shared hash tables are already freed by now (by the last + * participant to detach from the batch). We could consider keeping it + * around for single-batch joins. We'd also need to adjust + * finalize_plan() so that it doesn't record a dummy dependency for + * Parallel Hash nodes, preventing the rescan optimization. For now we + * don't try. + */ + + /* Detach, freeing any remaining shared memory. */ + if (state->hj_HashTable != NULL) + { + ExecHashTableDetachBatch(state->hj_HashTable); + ExecHashTableDetach(state->hj_HashTable); + } + + /* Clear any shared batch files. */ + SharedFileSetDeleteAll(&pstate->fileset); + + /* Reset build_barrier to PHJ_BUILD_ELECTING so we can go around again. 
*/ + BarrierInit(&pstate->build_barrier, 0); +} + +void +ExecHashJoinInitializeWorker(HashJoinState *state, + ParallelWorkerContext *pwcxt) +{ + HashState *hashNode; + int plan_node_id = state->js.ps.plan->plan_node_id; + ParallelHashJoinState *pstate = + shm_toc_lookup(pwcxt->toc, plan_node_id, false); + + /* Attach to the space for shared temporary files. */ + SharedFileSetAttach(&pstate->fileset, pwcxt->seg); + + /* Attach to the shared state in the hash node. */ + hashNode = (HashState *) innerPlanState(state); + hashNode->parallel_state = pstate; + + ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); +} diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c new file mode 100644 index 0000000..934426a --- /dev/null +++ b/src/backend/executor/nodeIncrementalSort.c @@ -0,0 +1,1257 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.c + * Routines to handle incremental sorting of relations. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeIncrementalSort.c + * + * DESCRIPTION + * + * Incremental sort is an optimized variant of multikey sort for cases + * when the input is already sorted by a prefix of the sort keys. For + * example when a sort by (key1, key2 ... keyN) is requested, and the + * input is already sorted by (key1, key2 ... keyM), M < N, we can + * divide the input into groups where keys (key1, ... keyM) are equal, + * and only sort on the remaining columns. + * + * Consider the following example. We have input tuples consisting of + * two integers (X, Y) already presorted by X, while it's required to + * sort them by both X and Y. Let input tuples be following. + * + * (1, 5) + * (1, 2) + * (2, 9) + * (2, 1) + * (2, 5) + * (3, 3) + * (3, 7) + * + * An incremental sort algorithm would split the input into the following + * groups, which have equal X, and then sort them by Y individually: + * + * (1, 5) (1, 2) + * (2, 9) (2, 1) (2, 5) + * (3, 3) (3, 7) + * + * After sorting these groups and putting them altogether, we would get + * the following result which is sorted by X and Y, as requested: + * + * (1, 2) + * (1, 5) + * (2, 1) + * (2, 5) + * (2, 9) + * (3, 3) + * (3, 7) + * + * Incremental sort may be more efficient than plain sort, particularly + * on large datasets, as it reduces the amount of data to sort at once, + * making it more likely it fits into work_mem (eliminating the need to + * spill to disk). But the main advantage of incremental sort is that + * it can start producing rows early, before sorting the whole dataset, + * which is a significant benefit especially for queries with LIMIT. + * + * The algorithm we've implemented here is modified from the theoretical + * base described above by operating in two different modes: + * - Fetching a minimum number of tuples without checking prefix key + * group membership and sorting on all columns when safe. + * - Fetching all tuples for a single prefix key group and sorting on + * solely the unsorted columns. + * We always begin in the first mode, and employ a heuristic to switch + * into the second mode if we believe it's beneficial. + * + * Sorting incrementally can potentially use less memory, avoid fetching + * and sorting all tuples in the dataset, and begin returning tuples before + * the entire result set is available. 
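/*
 * Illustrative, standalone sketch (not from the PostgreSQL sources): the
 * group-at-a-time strategy described above, applied to the (X, Y) example
 * data from the comment.  Runs of equal X are located in the presorted
 * input and each run is sorted by Y on its own, so rows can be emitted as
 * soon as their group is complete; qsort() stands in for the tuplesort
 * machinery.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	int			x;
	int			y;
} pair;

static int
cmp_y(const void *a, const void *b)
{
	return ((const pair *) a)->y - ((const pair *) b)->y;
}

int
main(void)
{
	pair		input[] = {{1, 5}, {1, 2}, {2, 9}, {2, 1}, {2, 5}, {3, 3}, {3, 7}};
	int			n = sizeof(input) / sizeof(input[0]);

	for (int start = 0; start < n;)
	{
		int			end = start;

		/* find the end of the current prefix-key (X) group */
		while (end < n && input[end].x == input[start].x)
			end++;

		/* sort just this group by the remaining key (Y), then emit it */
		qsort(input + start, end - start, sizeof(pair), cmp_y);
		for (int i = start; i < end; i++)
			printf("(%d, %d)\n", input[i].x, input[i].y);

		start = end;
	}
	return 0;					/* output is sorted by (X, Y) */
}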
+ * + * The hybrid mode approach allows us to optimize for both very small + * groups (where the overhead of a new tuplesort is high) and very large + * groups (where we can lower cost by not having to sort on already sorted + * columns), albeit at some extra cost while switching between modes. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/execdebug.h" +#include "executor/nodeIncrementalSort.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/tuplesort.h" + +/* + * We need to store the instrumentation information in either local node's sort + * info or, for a parallel worker process, in the shared info (this avoids + * having to additionally memcpy the info from local memory to shared memory + * at each instrumentation call). This macro expands to choose the proper sort + * state and group info. + * + * Arguments: + * - node: type IncrementalSortState * + * - groupName: the token fullsort or prefixsort + */ +#define INSTRUMENT_SORT_GROUP(node, groupName) \ + do { \ + if ((node)->ss.ps.instrument != NULL) \ + { \ + if ((node)->shared_info && (node)->am_worker) \ + { \ + Assert(IsParallelWorker()); \ + Assert(ParallelWorkerNumber <= (node)->shared_info->num_workers); \ + instrumentSortedGroup(&(node)->shared_info->sinfo[ParallelWorkerNumber].groupName##GroupInfo, \ + (node)->groupName##_state); \ + } \ + else \ + { \ + instrumentSortedGroup(&(node)->incsort_info.groupName##GroupInfo, \ + (node)->groupName##_state); \ + } \ + } \ + } while (0) + + +/* ---------------------------------------------------------------- + * instrumentSortedGroup + * + * Because incremental sort processes (potentially many) sort batches, we need + * to capture tuplesort stats each time we finalize a sort state. This summary + * data is later used for EXPLAIN ANALYZE output. + * ---------------------------------------------------------------- + */ +static void +instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo, + Tuplesortstate *sortState) +{ + TuplesortInstrumentation sort_instr; + + groupInfo->groupCount++; + + tuplesort_get_stats(sortState, &sort_instr); + + /* Calculate total and maximum memory and disk space used. */ + switch (sort_instr.spaceType) + { + case SORT_SPACE_TYPE_DISK: + groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed) + groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed; + + break; + case SORT_SPACE_TYPE_MEMORY: + groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed) + groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed; + + break; + } + + /* Track each sort method we've used. */ + groupInfo->sortMethods |= sort_instr.sortMethod; +} + +/* ---------------------------------------------------------------- + * preparePresortedCols + * + * Prepare information for presorted_keys comparisons. + * ---------------------------------------------------------------- + */ +static void +preparePresortedCols(IncrementalSortState *node) +{ + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + + node->presorted_keys = + (PresortedKeyData *) palloc(plannode->nPresortedCols * + sizeof(PresortedKeyData)); + + /* Pre-cache comparison functions for each pre-sorted key. 
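instrumentSortedGroup() above folds each finished tuplesort's statistics into running totals and maxima, split by whether the sort stayed in memory or spilled to disk. A hedged standalone analogue of that accumulation follows; the struct, field names, and kilobyte units are invented for illustration and are not the real IncrementalSortGroupInfo.

#include <stdio.h>
#include <stdbool.h>

/* Invented analogue of the per-group sort statistics. */
typedef struct
{
    long groupCount;
    long totalMemKB, maxMemKB;
    long totalDiskKB, maxDiskKB;
} GroupStats;

static void
record_group(GroupStats *s, long spaceKB, bool spilledToDisk)
{
    s->groupCount++;
    if (spilledToDisk)
    {
        s->totalDiskKB += spaceKB;
        if (spaceKB > s->maxDiskKB)
            s->maxDiskKB = spaceKB;
    }
    else
    {
        s->totalMemKB += spaceKB;
        if (spaceKB > s->maxMemKB)
            s->maxMemKB = spaceKB;
    }
}

int
main(void)
{
    GroupStats s = {0};

    record_group(&s, 40, false);     /* small in-memory group */
    record_group(&s, 96, false);
    record_group(&s, 2048, true);    /* one group spilled to disk */
    printf("groups=%ld memKB total=%ld max=%ld diskKB total=%ld max=%ld\n",
           s.groupCount, s.totalMemKB, s.maxMemKB, s.totalDiskKB, s.maxDiskKB);
    return 0;
}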
*/ + for (int i = 0; i < plannode->nPresortedCols; i++) + { + Oid equalityOp, + equalityFunc; + PresortedKeyData *key; + + key = &node->presorted_keys[i]; + key->attno = plannode->sort.sortColIdx[i]; + + equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i], + NULL); + if (!OidIsValid(equalityOp)) + elog(ERROR, "missing equality operator for ordering operator %u", + plannode->sort.sortOperators[i]); + + equalityFunc = get_opcode(equalityOp); + if (!OidIsValid(equalityFunc)) + elog(ERROR, "missing function for operator %u", equalityOp); + + /* Lookup the comparison function */ + fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext); + + /* We can initialize the callinfo just once and re-use it */ + key->fcinfo = palloc0(SizeForFunctionCallInfo(2)); + InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2, + plannode->sort.collations[i], NULL, NULL); + key->fcinfo->args[0].isnull = false; + key->fcinfo->args[1].isnull = false; + } +} + +/* ---------------------------------------------------------------- + * isCurrentGroup + * + * Check whether a given tuple belongs to the current sort group by comparing + * the presorted column values to the pivot tuple of the current group. + * ---------------------------------------------------------------- + */ +static bool +isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple) +{ + int nPresortedCols; + + nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols; + + /* + * That the input is sorted by keys * (0, ... n) implies that the tail + * keys are more likely to change. Therefore we do our comparison starting + * from the last pre-sorted column to optimize for early detection of + * inequality and minimizing the number of function calls.. + */ + for (int i = nPresortedCols - 1; i >= 0; i--) + { + Datum datumA, + datumB, + result; + bool isnullA, + isnullB; + AttrNumber attno = node->presorted_keys[i].attno; + PresortedKeyData *key; + + datumA = slot_getattr(pivot, attno, &isnullA); + datumB = slot_getattr(tuple, attno, &isnullB); + + /* Special case for NULL-vs-NULL, else use standard comparison */ + if (isnullA || isnullB) + { + if (isnullA == isnullB) + continue; + else + return false; + } + + key = &node->presorted_keys[i]; + + key->fcinfo->args[0].value = datumA; + key->fcinfo->args[1].value = datumB; + + /* just for paranoia's sake, we reset isnull each time */ + key->fcinfo->isnull = false; + + result = FunctionCallInvoke(key->fcinfo); + + /* Check for null result, since caller is clearly not expecting one */ + if (key->fcinfo->isnull) + elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid); + + if (!DatumGetBool(result)) + return false; + } + return true; +} + +/* ---------------------------------------------------------------- + * switchToPresortedPrefixMode + * + * When we determine that we've likely encountered a large batch of tuples all + * having the same presorted prefix values, we want to optimize tuplesort by + * only sorting on unsorted suffix keys. + * + * The problem is that we've already accumulated several tuples in another + * tuplesort configured to sort by all columns (assuming that there may be + * more than one prefix key group). So to switch to presorted prefix mode we + * have to go back and look at all the tuples we've already accumulated to + * verify they're all part of the same prefix key group before sorting them + * solely by unsorted suffix keys. 
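isCurrentGroup() above compares the candidate tuple to the group pivot on the presorted columns only, starting from the last of those columns (the one most likely to differ) and treating a NULL pair as equal. Here is a self-contained sketch of that same decision, using invented nullable-int columns in place of Datums and equality-operator lookups.

#include <stdio.h>
#include <stdbool.h>

#define NPREFIX 2

typedef struct
{
    bool isnull[NPREFIX];
    int  val[NPREFIX];
} prefix_cols;                      /* invented stand-in for the presorted columns */

static bool
same_prefix_group(const prefix_cols *pivot, const prefix_cols *tuple)
{
    /* Walk the prefix keys backwards: the tail key changes most often. */
    for (int i = NPREFIX - 1; i >= 0; i--)
    {
        if (pivot->isnull[i] || tuple->isnull[i])
        {
            if (pivot->isnull[i] == tuple->isnull[i])
                continue;           /* NULL vs NULL counts as equal here */
            return false;           /* NULL vs non-NULL: different group */
        }
        if (pivot->val[i] != tuple->val[i])
            return false;
    }
    return true;
}

int
main(void)
{
    prefix_cols pivot = {{false, false}, {1, 7}};
    prefix_cols same  = {{false, false}, {1, 7}};
    prefix_cols other = {{false, false}, {1, 8}};

    printf("%d %d\n", same_prefix_group(&pivot, &same),
           same_prefix_group(&pivot, &other));   /* prints 1 0 */
    return 0;
}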
+ * + * While it's likely that all tuples already fetched are all part of a single + * prefix group, we also have to handle the possibility that there is at least + * one different prefix key group before the large prefix key group. + * ---------------------------------------------------------------- + */ +static void +switchToPresortedPrefixMode(PlanState *pstate) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + ScanDirection dir; + int64 nTuples; + TupleDesc tupDesc; + PlanState *outerNode; + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + + dir = node->ss.ps.state->es_direction; + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Configure the prefix sort state the first time around. */ + if (node->prefixsort_state == NULL) + { + Tuplesortstate *prefixsort_state; + int nPresortedCols = plannode->nPresortedCols; + + /* + * Optimize the sort by assuming the prefix columns are all equal and + * thus we only need to sort by any remaining columns. + */ + prefixsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols - nPresortedCols, + &(plannode->sort.sortColIdx[nPresortedCols]), + &(plannode->sort.sortOperators[nPresortedCols]), + &(plannode->sort.collations[nPresortedCols]), + &(plannode->sort.nullsFirst[nPresortedCols]), + work_mem, + NULL, + false); + node->prefixsort_state = prefixsort_state; + } + else + { + /* Next group of presorted data */ + tuplesort_reset(node->prefixsort_state); + } + + /* + * If the current node has a bound, then it's reasonably likely that a + * large prefix key group will benefit from bounded sort, so configure the + * tuplesort to allow for that optimization. + */ + if (node->bounded) + { + SO1_printf("Setting bound on presorted prefix tuplesort to: " INT64_FORMAT "\n", + node->bound - node->bound_Done); + tuplesort_set_bound(node->prefixsort_state, + node->bound - node->bound_Done); + } + + /* + * Copy as many tuples as we can (i.e., in the same prefix key group) from + * the full sort state to the prefix sort state. + */ + for (nTuples = 0; nTuples < node->n_fullsort_remaining; nTuples++) + { + /* + * When we encounter multiple prefix key groups inside the full sort + * tuplesort we have to carry over the last read tuple into the next + * batch. + */ + if (nTuples == 0 && !TupIsNull(node->transfer_tuple)) + { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + /* The carried over tuple is our new group pivot tuple. */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + } + else + { + tuplesort_gettupleslot(node->fullsort_state, + ScanDirectionIsForward(dir), + false, node->transfer_tuple, NULL); + + /* + * If this is our first time through the loop, then we need to + * save the first tuple we get as our new group pivot. + */ + if (TupIsNull(node->group_pivot)) + ExecCopySlot(node->group_pivot, node->transfer_tuple); + + if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple)) + { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + } + else + { + /* + * The tuple isn't part of the current batch so we need to + * carry it over into the next batch of tuples we transfer out + * of the full sort tuplesort into the presorted prefix + * tuplesort. 
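The transfer loop described here moves tuples already buffered in the full sort into the prefix sort until it reads one whose prefix keys differ; that first non-matching tuple is held aside to start the next group. The standalone sketch below shows the same control flow with an invented row type; pushing the array index back stands in for the carried-over transfer_tuple slot, and same_prefix() stands in for the real pivot comparison.

#include <stdio.h>
#include <stdbool.h>

typedef struct { int x; int y; } row;   /* invented: x is the presorted prefix */

static bool
same_prefix(const row *a, const row *b)
{
    return a->x == b->x;
}

/*
 * Move rows from buf[*next..n) into the current group while the prefix matches
 * the pivot; stop at the first row that does not and leave it for the next
 * group.  Returns the number of rows moved.
 */
static int
drain_current_group(row *buf, int n, int *next, row *pivot, bool *have_pivot)
{
    int moved = 0;

    while (*next < n)
    {
        row r = buf[(*next)++];

        if (!*have_pivot)
        {
            *pivot = r;             /* first row seen becomes the pivot */
            *have_pivot = true;
        }
        else if (!same_prefix(pivot, &r))
        {
            (*next)--;              /* carry it over to seed the next group */
            break;
        }
        moved++;                    /* the real code calls tuplesort_puttupleslot() here */
    }
    return moved;
}

int
main(void)
{
    row buf[] = {{1, 9}, {1, 3}, {2, 4}, {2, 8}};
    int next = 0;
    row pivot;
    bool have_pivot = false;

    printf("first group: %d rows\n",
           drain_current_group(buf, 4, &next, &pivot, &have_pivot));  /* 2 */
    return 0;
}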
We don't actually have to do anything special to + * save the tuple since we've already loaded it into the + * node->transfer_tuple slot, and, even though that slot + * points to memory inside the full sort tuplesort, we can't + * reset that tuplesort anyway until we've fully transferred + * out its tuples, so this reference is safe. We do need to + * reset the group pivot tuple though since we've finished the + * current prefix key group. + */ + ExecClearTuple(node->group_pivot); + + /* Break out of for-loop early */ + break; + } + } + } + + /* + * Track how many tuples remain in the full sort batch so that we know if + * we need to sort multiple prefix key groups before processing tuples + * remaining in the large single prefix key group we think we've + * encountered. + */ + SO1_printf("Moving " INT64_FORMAT " tuples to presorted prefix tuplesort\n", nTuples); + node->n_fullsort_remaining -= nTuples; + SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT "\n", node->n_fullsort_remaining); + + if (node->n_fullsort_remaining == 0) + { + /* + * We've found that all tuples remaining in the full sort batch are in + * the same prefix key group and moved all of those tuples into the + * presorted prefix tuplesort. We don't know that we've yet found the + * last tuple in the current prefix key group, so save our pivot + * comparison tuple and continue fetching tuples from the outer + * execution node to load into the presorted prefix tuplesort. + */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_LOADPREFIXSORT; + + /* + * Make sure we clear the transfer tuple slot so that next time we + * encounter a large prefix key group we don't incorrectly assume we + * have a tuple carried over from the previous group. + */ + ExecClearTuple(node->transfer_tuple); + } + else + { + /* + * We finished a group but didn't consume all of the tuples from the + * full sort state, so we'll sort this batch, let the outer node read + * out all of those tuples, and then come back around to find another + * batch. + */ + SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort); + + if (node->bounded) + { + /* + * If the current node has a bound and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + Min(node->bound, node->bound_Done + nTuples), node->bound_Done); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + } +} + +/* + * Sorting many small groups with tuplesort is inefficient. In order to + * cope with this problem we don't start a new group until the current one + * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also + * means we can't assume small groups of tuples all have the same prefix keys.) + * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start looking + * for the new group as soon as we've met our bound to avoid fetching more + * tuples than we absolutely have to fetch. 
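The two cutoffs discussed here and defined just below work together: prefix boundaries are not even checked until DEFAULT_MIN_GROUP_SIZE tuples (or the remaining bound, if smaller) have been buffered, and once DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples have gone by without a boundary we assume one very large group and switch modes. A small standalone sketch of that decision follows, using the same constant values (32 and 64); the enum and helper names are invented and the real loop interleaves this with reading tuples.

#include <stdio.h>
#include <stdbool.h>

#define MIN_GROUP_SIZE       32   /* mirrors DEFAULT_MIN_GROUP_SIZE */
#define MAX_FULL_SORT_GROUP  64   /* mirrors DEFAULT_MAX_FULL_SORT_GROUP_SIZE */

typedef enum { KEEP_BUFFERING, CHECK_PREFIX, SWITCH_TO_PREFIX_MODE } batch_action;

static batch_action
choose_action(long nTuples, long bound_remaining, bool bounded)
{
    long minGroupSize = (bounded && bound_remaining < MIN_GROUP_SIZE)
        ? bound_remaining : MIN_GROUP_SIZE;

    if (nTuples < minGroupSize)
        return KEEP_BUFFERING;            /* too few tuples to bother comparing */
    if (nTuples > MAX_FULL_SORT_GROUP)
        return SWITCH_TO_PREFIX_MODE;     /* looks like one very large group */
    return CHECK_PREFIX;                  /* compare new tuples against the pivot */
}

int
main(void)
{
    printf("%d %d %d\n",
           choose_action(10, 0, false),    /* 0: KEEP_BUFFERING */
           choose_action(40, 0, false),    /* 1: CHECK_PREFIX */
           choose_action(70, 0, false));   /* 2: SWITCH_TO_PREFIX_MODE */
    return 0;
}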
+ */ +#define DEFAULT_MIN_GROUP_SIZE 32 + +/* + * While we've optimized for small prefix key groups by not starting our prefix + * key comparisons until we've reached a minimum number of tuples, we don't want + * that optimization to cause us to lose out on the benefits of being able to + * assume a large group of tuples is fully presorted by its prefix keys. + * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic + * for determining when we believe we've encountered a large group, and, if we + * get to that point without finding a new prefix key group we transition to + * presorted prefix key mode. + */ +#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE) + +/* ---------------------------------------------------------------- + * ExecIncrementalSort + * + * Assuming that outer subtree returns tuple presorted by some prefix + * of target sort columns, performs incremental sort. + * + * Conditions: + * -- none. + * + * Initial States: + * -- the outer child is prepared to return the first tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecIncrementalSort(PlanState *pstate) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + EState *estate; + ScanDirection dir; + Tuplesortstate *read_sortstate; + Tuplesortstate *fullsort_state; + TupleTableSlot *slot; + IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan; + PlanState *outerNode; + TupleDesc tupDesc; + int64 nTuples = 0; + int64 minGroupSize; + + CHECK_FOR_INTERRUPTS(); + + estate = node->ss.ps.state; + dir = estate->es_direction; + fullsort_state = node->fullsort_state; + + /* + * If a previous iteration has sorted a batch, then we need to check to + * see if there are any remaining tuples in that batch that we can return + * before moving on to other execution states. + */ + if (node->execution_status == INCSORT_READFULLSORT + || node->execution_status == INCSORT_READPREFIXSORT) + { + /* + * Return next tuple from the current sorted group set if available. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + + /* + * We have to populate the slot from the tuplesort before checking + * outerNodeDone because it will set the slot to NULL if no more + * tuples remain. If the tuplesort is empty, but we don't have any + * more tuples available for sort from the outer node, then + * outerNodeDone will have been set so we'll return that now-empty + * slot to the caller. + */ + if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL) || node->outerNodeDone) + + /* + * Note: there isn't a good test case for the node->outerNodeDone + * check directly, but we need it for any plan where the outer + * node will fail when trying to fetch too many tuples. + */ + return slot; + else if (node->n_fullsort_remaining > 0) + { + /* + * When we transition to presorted prefix mode, we might have + * accumulated at least one additional prefix key group in the + * full sort tuplesort. The first call to + * switchToPresortedPrefixMode() will have pulled the first one of + * those groups out, and we've returned those tuples to the parent + * node, but if at this point we still have tuples remaining in + * the full sort state (i.e., n_fullsort_remaining > 0), then we + * need to re-execute the prefix mode transition function to pull + * out the next prefix key group. 
+ */ + SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (" INT64_FORMAT ")\n", + node->n_fullsort_remaining); + switchToPresortedPrefixMode(pstate); + } + else + { + /* + * If we don't have any sorted tuples to read and we're not + * currently transitioning into presorted prefix sort mode, then + * it's time to start the process all over again by building a new + * group in the full sort state. + */ + SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining > 0)\n"); + node->execution_status = INCSORT_LOADFULLSORT; + } + } + + /* + * Scan the subplan in the forward direction while creating the sorted + * data. + */ + estate->es_direction = ForwardScanDirection; + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Load tuples into the full sort state. */ + if (node->execution_status == INCSORT_LOADFULLSORT) + { + /* + * Initialize sorting structures. + */ + if (fullsort_state == NULL) + { + /* + * Initialize presorted column support structures for + * isCurrentGroup(). It's correct to do this along with the + * initial initialization for the full sort state (and not for the + * prefix sort state) since we always load the full sort state + * first. + */ + preparePresortedCols(node); + + /* + * Since we optimize small prefix key groups by accumulating a + * minimum number of tuples before sorting, we can't assume that a + * group of tuples all have the same prefix key values. Hence we + * setup the full sort tuplesort to sort by all requested sort + * keys. + */ + fullsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols, + plannode->sort.sortColIdx, + plannode->sort.sortOperators, + plannode->sort.collations, + plannode->sort.nullsFirst, + work_mem, + NULL, + false); + node->fullsort_state = fullsort_state; + } + else + { + /* Reset sort for the next batch. */ + tuplesort_reset(fullsort_state); + } + + /* + * Calculate the remaining tuples left if bounded and configure both + * bounded sort and the minimum group size accordingly. + */ + if (node->bounded) + { + int64 currentBound = node->bound - node->bound_Done; + + /* + * Bounded sort isn't likely to be a useful optimization for full + * sort mode since we limit full sort mode to a relatively small + * number of tuples and tuplesort doesn't switch over to top-n + * heap sort anyway unless it hits (2 * bound) tuples. + */ + if (currentBound < DEFAULT_MIN_GROUP_SIZE) + tuplesort_set_bound(fullsort_state, currentBound); + + minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound); + } + else + minGroupSize = DEFAULT_MIN_GROUP_SIZE; + + /* + * Because we have to read the next tuple to find out that we've + * encountered a new prefix key group, on subsequent groups we have to + * carry over that extra tuple and add it to the new group's sort here + * before we read any new tuples from the outer node. + */ + if (!TupIsNull(node->group_pivot)) + { + tuplesort_puttupleslot(fullsort_state, node->group_pivot); + nTuples++; + + /* + * We're in full sort mode accumulating a minimum number of tuples + * and not checking for prefix key equality yet, so we can't + * assume the group pivot tuple will remain the same -- unless + * we're using a minimum group size of 1, in which case the pivot + * is obviously still the pivot. + */ + if (nTuples != minGroupSize) + ExecClearTuple(node->group_pivot); + } + + + /* + * Pull as many tuples from the outer node as possible given our + * current operating mode. 
+ */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If the outer node can't provide us any more tuples, then we can + * sort the current group and return those tuples. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + + SO1_printf("Sorting fullsort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + + /* Accumulate the next group of presorted tuples. */ + if (nTuples < minGroupSize) + { + /* + * If we haven't yet hit our target minimum group size, then + * we don't need to bother checking for inclusion in the + * current prefix group since at this point we'll assume that + * we'll full sort this batch to avoid a large number of very + * tiny (and thus inefficient) sorts. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + + /* + * If we've reached our minimum group size, then we need to + * store the most recent tuple as a pivot. + */ + if (nTuples == minGroupSize) + ExecCopySlot(node->group_pivot, slot); + } + else + { + /* + * If we've already accumulated enough tuples to reach our + * minimum group size, then we need to compare any additional + * tuples to our pivot tuple to see if we reach the end of + * that prefix key group. Only after we find changed prefix + * keys can we guarantee sort stability of the tuples we've + * already accumulated. + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + /* + * As long as the prefix keys match the pivot tuple then + * load the tuple into the tuplesort. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + } + else + { + /* + * Since the tuple we fetched isn't part of the current + * prefix key group we don't want to sort it as part of + * the current batch. Instead we use the group_pivot slot + * to carry it over to the next batch (even though we + * won't actually treat it as a group pivot). + */ + ExecCopySlot(node->group_pivot, slot); + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already + * sorted n tuples, then the functional bound + * remaining is (original bound - n), so store the + * current number of processed tuples for later use + * configuring the sort state's bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + /* + * Once we find changed prefix keys we can complete the + * sort and transition modes to reading out the sorted + * tuples. 
+ */ + SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", + nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + } + + /* + * Unless we've already transitioned modes to reading from the + * full sort state, then we assume that having read at least + * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're + * processing a large group of tuples all having equal prefix keys + * (but haven't yet found the final tuple in that prefix key + * group), so we need to transition into presorted prefix mode. + */ + if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE && + node->execution_status != INCSORT_READFULLSORT) + { + /* + * The group pivot we have stored has already been put into + * the tuplesort; we don't want to carry it over. Since we + * haven't yet found the end of the prefix key group, it might + * seem like we should keep this, but we don't actually know + * how many prefix key groups might be represented in the full + * sort state, so we'll let the mode transition function + * manage this state for us. + */ + ExecClearTuple(node->group_pivot); + + /* + * Unfortunately the tuplesort API doesn't include a way to + * retrieve tuples unless a sort has been performed, so we + * perform the sort even though we could just as easily rely + * on FIFO retrieval semantics when transferring them to the + * presorted prefix tuplesort. + */ + SO1_printf("Sorting fullsort tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort); + + /* + * If the full sort tuplesort happened to switch into top-n + * heapsort mode then we will only be able to retrieve + * currentBound tuples (since the tuplesort will have only + * retained the top-n tuples). This is safe even though we + * haven't yet completed fetching the current prefix key group + * because the tuples we've "lost" already sorted "below" the + * retained ones, and we're already contractually guaranteed + * to not need any more than the currentBound tuples. + */ + if (tuplesort_used_bound(node->fullsort_state)) + { + int64 currentBound = node->bound - node->bound_Done; + + SO2_printf("Read " INT64_FORMAT " tuples, but setting to " INT64_FORMAT " because we used bounded sort\n", + nTuples, Min(currentBound, nTuples)); + nTuples = Min(currentBound, nTuples); + } + + SO1_printf("Setting n_fullsort_remaining to " INT64_FORMAT " and calling switchToPresortedPrefixMode()\n", + nTuples); + + /* + * We might have multiple prefix key groups in the full sort + * state, so the mode transition function needs to know that + * it needs to move from the fullsort to presorted prefix + * sort. + */ + node->n_fullsort_remaining = nTuples; + + /* Transition the tuples to the presorted prefix tuplesort. */ + switchToPresortedPrefixMode(pstate); + + /* + * Since we know we had tuples to move to the presorted prefix + * tuplesort, we know that unless that transition has verified + * that all tuples belonged to the same prefix key group (in + * which case we can go straight to continuing to load tuples + * into that tuplesort), we should have a tuple to return + * here. + * + * Either way, the appropriate execution status should have + * been set by switchToPresortedPrefixMode(), so we can drop + * out of the loop here and let the appropriate path kick in. 
+ */ + break; + } + } + } + + if (node->execution_status == INCSORT_LOADPREFIXSORT) + { + /* + * We only enter this state after the mode transition function has + * confirmed all remaining tuples from the full sort state have the + * same prefix and moved those tuples to the prefix sort state. That + * function has also set a group pivot tuple (which doesn't need to be + * carried over; it's already been put into the prefix sort state). + */ + Assert(!TupIsNull(node->group_pivot)); + + /* + * Read tuples from the outer node and load them into the prefix sort + * state until we encounter a tuple whose prefix keys don't match the + * current group_pivot tuple, since we can't guarantee sort stability + * until we have all tuples matching those prefix keys. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If we've exhausted tuples from the outer node we're done + * loading the prefix sort state. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + break; + } + + /* + * If the tuple's prefix keys match our pivot tuple, we're not + * done yet and can load it into the prefix sort state. If not, we + * don't want to sort it as part of the current batch. Instead we + * use the group_pivot slot to carry it over to the next batch + * (even though we won't actually treat it as a group pivot). + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + tuplesort_puttupleslot(node->prefixsort_state, slot); + nTuples++; + } + else + { + ExecCopySlot(node->group_pivot, slot); + break; + } + } + + /* + * Perform the sort and begin returning the tuples to the parent plan + * node. + */ + SO1_printf("Sorting presorted prefix tuplesort with " INT64_FORMAT " tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort); + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from " INT64_FORMAT " to " INT64_FORMAT "\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + } + + /* Restore to user specified direction. */ + estate->es_direction = dir; + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitIncrementalSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. 
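Stepping back, ExecIncrementalSort() above is essentially a small state machine over the four INCSORT_* execution states. The sketch below is a simplified standalone summary of those transitions; the enum and helper are invented, and the real transitions also depend on n_fullsort_remaining and on what switchToPresortedPrefixMode() finds, so treat this only as a map of the branches above.

#include <stdio.h>
#include <stdbool.h>

/* Invented mirror of the INCSORT_* execution states. */
typedef enum
{
    LOADFULLSORT,      /* accumulate a small batch, sorting on all keys */
    READFULLSORT,      /* return tuples from the finished full sort */
    LOADPREFIXSORT,    /* large group detected: accumulate, sorting suffix keys only */
    READPREFIXSORT     /* return tuples from the finished prefix sort */
} incsort_status;

static incsort_status
next_status(incsort_status cur, bool group_boundary_found,
            bool large_group_detected, bool sorted_batch_drained)
{
    switch (cur)
    {
        case LOADFULLSORT:
            if (group_boundary_found)
                return READFULLSORT;      /* sort the batch and read it out */
            if (large_group_detected)
                return READPREFIXSORT;    /* via switchToPresortedPrefixMode(); the real
                                           * code may instead keep loading (LOADPREFIXSORT)
                                           * if the whole buffered batch shared one prefix */
            return LOADFULLSORT;
        case READFULLSORT:
        case READPREFIXSORT:
            return sorted_batch_drained ? LOADFULLSORT : cur;
        case LOADPREFIXSORT:
            return group_boundary_found ? READPREFIXSORT : LOADPREFIXSORT;
    }
    return cur;
}

int
main(void)
{
    incsort_status s = LOADFULLSORT;

    s = next_status(s, true, false, false);
    printf("%d\n", s);          /* 1: READFULLSORT */
    return 0;
}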
+ * ---------------------------------------------------------------- + */ +IncrementalSortState * +ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags) +{ + IncrementalSortState *incrsortstate; + + SO_printf("ExecInitIncrementalSort: initializing sort node\n"); + + /* + * Incremental sort can't be used with EXEC_FLAG_BACKWARD or + * EXEC_FLAG_MARK, because the current sort state contains only one sort + * batch rather than the full result set. + */ + Assert((eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) == 0); + + /* Initialize state structure. */ + incrsortstate = makeNode(IncrementalSortState); + incrsortstate->ss.ps.plan = (Plan *) node; + incrsortstate->ss.ps.state = estate; + incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort; + + incrsortstate->execution_status = INCSORT_LOADFULLSORT; + incrsortstate->bounded = false; + incrsortstate->outerNodeDone = false; + incrsortstate->bound_Done = 0; + incrsortstate->fullsort_state = NULL; + incrsortstate->prefixsort_state = NULL; + incrsortstate->group_pivot = NULL; + incrsortstate->transfer_tuple = NULL; + incrsortstate->n_fullsort_remaining = 0; + incrsortstate->presorted_keys = NULL; + + if (incrsortstate->ss.ps.instrument != NULL) + { + IncrementalSortGroupInfo *fullsortGroupInfo = + &incrsortstate->incsort_info.fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo = + &incrsortstate->incsort_info.prefixsortGroupInfo; + + fullsortGroupInfo->groupCount = 0; + fullsortGroupInfo->maxDiskSpaceUsed = 0; + fullsortGroupInfo->totalDiskSpaceUsed = 0; + fullsortGroupInfo->maxMemorySpaceUsed = 0; + fullsortGroupInfo->totalMemorySpaceUsed = 0; + fullsortGroupInfo->sortMethods = 0; + prefixsortGroupInfo->groupCount = 0; + prefixsortGroupInfo->maxDiskSpaceUsed = 0; + prefixsortGroupInfo->totalDiskSpaceUsed = 0; + prefixsortGroupInfo->maxMemorySpaceUsed = 0; + prefixsortGroupInfo->totalMemorySpaceUsed = 0; + prefixsortGroupInfo->sortMethods = 0; + } + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * Initialize child nodes. + * + * Incremental sort does not support backwards scans and mark/restore, so + * we don't bother removing the flags from eflags here. We allow passing a + * REWIND flag, because although incremental sort can't use it, the child + * nodes may be able to do something more useful. + */ + outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple); + + /* + * Initialize return slot and type. No need to initialize projection info + * because we don't do any projections. + */ + ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple); + incrsortstate->ss.ps.ps_ProjInfo = NULL; + + /* + * Initialize standalone slots to store a tuple for pivot prefix keys and + * for carrying over a tuple from one batch to the next. 
+ */ + incrsortstate->group_pivot = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + incrsortstate->transfer_tuple = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + + SO_printf("ExecInitIncrementalSort: sort node initialized\n"); + + return incrsortstate; +} + +/* ---------------------------------------------------------------- + * ExecEndIncrementalSort(node) + * ---------------------------------------------------------------- + */ +void +ExecEndIncrementalSort(IncrementalSortState *node) +{ + SO_printf("ExecEndIncrementalSort: shutting down sort node\n"); + + /* clean out the scan tuple */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + /* must drop standalone tuple slots from outer node */ + ExecDropSingleTupleTableSlot(node->group_pivot); + ExecDropSingleTupleTableSlot(node->transfer_tuple); + + /* + * Release tuplesort resources. + */ + if (node->fullsort_state != NULL) + { + tuplesort_end(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_end(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * Shut down the subplan. + */ + ExecEndNode(outerPlanState(node)); + + SO_printf("ExecEndIncrementalSort: sort node shutdown\n"); +} + +void +ExecReScanIncrementalSort(IncrementalSortState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * Incremental sort doesn't support efficient rescan even when parameters + * haven't changed (e.g., rewind) because unlike regular sort we don't + * store all tuples at once for the full sort. + * + * So even if EXEC_FLAG_REWIND is set we just reset all of our state and + * re-execute the sort along with the child node. Incremental sort itself + * can't do anything smarter, but maybe the child nodes can. + * + * In theory if we've only filled the full sort with one batch (and + * haven't reset it for a new batch yet) then we could efficiently rewind, + * but that seems a narrow enough case that it's not worth handling + * specially at this time. + */ + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->group_pivot != NULL) + ExecClearTuple(node->group_pivot); + if (node->transfer_tuple != NULL) + ExecClearTuple(node->transfer_tuple); + + node->outerNodeDone = false; + node->n_fullsort_remaining = 0; + node->bound_Done = 0; + node->presorted_keys = NULL; + + node->execution_status = INCSORT_LOADFULLSORT; + + /* + * If we've set up either of the sort states yet, we need to reset them. + * We could end them and null out the pointers, but there's no reason to + * repay the setup cost, and because ExecIncrementalSort guards presorted + * column functions by checking to see if the full sort state has been + * initialized yet, setting the sort states to null here might actually + * cause a leak. + */ + if (node->fullsort_state != NULL) + { + tuplesort_reset(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_reset(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * If chgParam of subnode is not null, then the plan will be re-scanned by + * the first ExecProcNode. 
*/ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecIncrementalSortEstimate + * + * Estimate space required to propagate sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo)); + size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecIncrementalSortInitializeDSM + * + * Initialize DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedIncrementalSortInfo, sinfo) + + pcxt->nworkers * sizeof(IncrementalSortInfo); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecIncrementalSortInitializeWorker + * + * Attach worker to DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); + node->am_worker = true; +} + +/* ---------------------------------------------------------------- + * ExecIncrementalSortRetrieveInstrumentation + * + * Transfer sort statistics from DSM to private memory.
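ExecIncrementalSortInitializeDSM() above sizes the shared area as the struct header plus one per-worker slot, using offsetof() so the flexible array member at the end is counted correctly, and then zeroes the whole thing so unfilled slots read back as zero. The same sizing pattern in a standalone sketch; the WorkerInfo/SharedInfo types are invented and malloc() stands in for shm_toc_allocate().

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>

typedef struct { long groups; long space_used; } WorkerInfo;   /* invented slot type */

typedef struct
{
    int        num_workers;
    WorkerInfo sinfo[];            /* flexible array member, one slot per worker */
} SharedInfo;

int
main(void)
{
    int nworkers = 4;
    size_t size = offsetof(SharedInfo, sinfo) + nworkers * sizeof(WorkerInfo);
    SharedInfo *shared = malloc(size);     /* the real code allocates from the DSM toc */

    memset(shared, 0, size);               /* unfilled slots read back as zeroes */
    shared->num_workers = nworkers;

    printf("header=%zu bytes, total=%zu bytes\n",
           offsetof(SharedInfo, sinfo), size);
    free(shared);
    return 0;
}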
+ * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node) +{ + Size size; + SharedIncrementalSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedIncrementalSortInfo, sinfo) + + node->shared_info->num_workers * sizeof(IncrementalSortInfo); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c new file mode 100644 index 0000000..8fee958 --- /dev/null +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -0,0 +1,735 @@ +/*------------------------------------------------------------------------- + * + * nodeIndexonlyscan.c + * Routines to support index-only scans + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeIndexonlyscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecIndexOnlyScan scans an index + * IndexOnlyNext retrieve next tuple + * ExecInitIndexOnlyScan creates and initializes state info. + * ExecReScanIndexOnlyScan rescans the indexed relation. + * ExecEndIndexOnlyScan releases all storage. + * ExecIndexOnlyMarkPos marks scan position. + * ExecIndexOnlyRestrPos restores scan position. + * ExecIndexOnlyScanEstimate estimates DSM space needed for + * parallel index-only scan + * ExecIndexOnlyScanInitializeDSM initialize DSM for parallel + * index-only scan + * ExecIndexOnlyScanReInitializeDSM reinitialize DSM for fresh scan + * ExecIndexOnlyScanInitializeWorker attach to DSM info in parallel worker + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/tupdesc.h" +#include "access/visibilitymap.h" +#include "executor/execdebug.h" +#include "executor/nodeIndexonlyscan.h" +#include "executor/nodeIndexscan.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); +static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, + TupleDesc itupdesc); + + +/* ---------------------------------------------------------------- + * IndexOnlyNext + * + * Retrieve a tuple from the IndexOnlyScan node's index. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +IndexOnlyNext(IndexOnlyScanState *node) +{ + EState *estate; + ExprContext *econtext; + ScanDirection direction; + IndexScanDesc scandesc; + TupleTableSlot *slot; + ItemPointer tid; + + /* + * extract necessary information from index scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + /* flip direction if this is an overall backward scan */ + if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) + { + if (ScanDirectionIsForward(direction)) + direction = BackwardScanDirection; + else if (ScanDirectionIsBackward(direction)) + direction = ForwardScanDirection; + } + scandesc = node->ioss_ScanDesc; + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the index only scan is not parallel, or if we're + * serially executing an index only scan that was planned to be + * parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + estate->es_snapshot, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys); + + node->ioss_ScanDesc = scandesc; + + + /* Set it up for index-only scan */ + node->ioss_ScanDesc->xs_want_itup = true; + node->ioss_VMBuffer = InvalidBuffer; + + /* + * If no run-time keys to calculate or they are ready, go ahead and + * pass the scankeys to the index AM. + */ + if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady) + index_rescan(scandesc, + node->ioss_ScanKeys, + node->ioss_NumScanKeys, + node->ioss_OrderByKeys, + node->ioss_NumOrderByKeys); + } + + /* + * OK, now that we have what we need, fetch the next tuple. + */ + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + { + bool tuple_from_heap = false; + + CHECK_FOR_INTERRUPTS(); + + /* + * We can skip the heap fetch if the TID references a heap page on + * which all tuples are known visible to everybody. In any case, + * we'll use the index tuple not the heap tuple as the data source. + * + * Note on Memory Ordering Effects: visibilitymap_get_status does not + * lock the visibility map buffer, and therefore the result we read + * here could be slightly stale. However, it can't be stale enough to + * matter. + * + * We need to detect clearing a VM bit due to an insert right away, + * because the tuple is present in the index page but not visible. The + * reading of the TID by this scan (using a shared lock on the index + * buffer) is serialized with the insert of the TID into the index + * (using an exclusive lock on the index buffer). Because the VM bit + * is cleared before updating the index, and locking/unlocking of the + * index page acts as a full memory barrier, we are sure to see the + * cleared bit if we see a recently-inserted TID. + * + * Deletes do not update the index page (only VACUUM will clear out + * the TID), so the clearing of the VM bit by a delete is not + * serialized with this test below, and we may see a value that is + * significantly stale. However, we don't care about the delete right + * away, because the tuple is still visible until the deleting + * transaction commits or the statement ends (if it's our + * transaction). In either case, the lock on the VM buffer will have + * been released (acting as a write barrier) after clearing the bit. 
+ * And for us to have a snapshot that includes the deleting + * transaction (making the tuple invisible), we must have acquired + * ProcArrayLock after that time, acting as a read barrier. + * + * It's worth going through this complexity to avoid needing to lock + * the VM buffer, which could cause significant contention. + */ + if (!VM_ALL_VISIBLE(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer)) + { + /* + * Rats, we have to visit the heap to check visibility. + */ + InstrCountTuples2(node, 1); + if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) + continue; /* no visible tuple, try next index entry */ + + ExecClearTuple(node->ioss_TableSlot); + + /* + * Only MVCC snapshots are supported here, so there should be no + * need to keep following the HOT chain once a visible entry has + * been found. If we did want to allow that, we'd need to keep + * more state to remember not to call index_getnext_tid next time. + */ + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + + /* + * Note: at this point we are holding a pin on the heap page, as + * recorded in scandesc->xs_cbuf. We could release that pin now, + * but it's not clear whether it's a win to do so. The next index + * entry might require a visit to the same heap page. + */ + + tuple_from_heap = true; + } + + /* + * Fill the scan tuple slot with data from the index. This might be + * provided in either HeapTuple or IndexTuple format. Conceivably an + * index AM might fill both fields, in which case we prefer the heap + * format, since it's probably a bit cheaper to fill a slot from. + */ + if (scandesc->xs_hitup) + { + /* + * We don't take the trouble to verify that the provided tuple has + * exactly the slot's format, but it seems worth doing a quick + * check on the number of fields. + */ + Assert(slot->tts_tupleDescriptor->natts == + scandesc->xs_hitupdesc->natts); + ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false); + } + else if (scandesc->xs_itup) + StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc); + else + elog(ERROR, "no data returned for index-only scan"); + + /* + * If the index was lossy, we have to recheck the index quals. + */ + if (scandesc->xs_recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->recheckqual, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + continue; + } + } + + /* + * We don't currently support rechecking ORDER BY distances. (In + * principle, if the index can support retrieval of the originally + * indexed value, it should be able to produce an exact distance + * calculation too. So it's not clear that adding code here for + * recheck/re-sort would be worth the trouble. But we should at least + * throw an error if someone tries it.) + */ + if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("lossy distance functions are not supported in index-only scans"))); + + /* + * If we didn't access the heap, then we'll need to take a predicate + * lock explicitly, as if we had. For now we do that at page level. + */ + if (!tuple_from_heap) + PredicateLockPage(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + estate->es_snapshot); + + return slot; + } + + /* + * if we get here it means the index scan failed so we are at the end of + * the scan.. 
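The loop above boils down to one decision per index entry: if the visibility map says every tuple on the TID's heap page is visible to everyone, answer straight from the index tuple; otherwise pay for a heap fetch (and skip the entry if that fetch finds nothing visible). Below is a hedged standalone sketch of that control flow; the stub functions are invented for illustration, whereas the executor itself uses VM_ALL_VISIBLE and index_fetch_heap.

#include <stdio.h>
#include <stdbool.h>

/* Stubs for illustration only; they fake the visibility map and heap checks. */
static bool page_all_visible(int blkno)  { return blkno % 2 == 0; }
static bool heap_tuple_visible(int tid)  { return tid % 3 != 0; }

/* Returns true if the index entry should produce a row; *from_heap says how. */
static bool
index_only_visible(int blkno, int tid, bool *from_heap, long *heap_fetches)
{
    *from_heap = false;
    if (page_all_visible(blkno))
        return true;               /* answer from the index tuple, no heap visit */

    (*heap_fetches)++;             /* counted toward EXPLAIN ANALYZE's "Heap Fetches" */
    if (!heap_tuple_visible(tid))
        return false;              /* not visible to us: try the next index entry */

    *from_heap = true;
    return true;
}

int
main(void)
{
    long heap_fetches = 0;
    bool from_heap;

    for (int tid = 1; tid <= 6; tid++)
        if (index_only_visible(tid % 2, tid, &from_heap, &heap_fetches))
            printf("tid %d returned (%s)\n", tid,
                   from_heap ? "heap checked" : "index only");
    printf("heap fetches: %ld\n", heap_fetches);
    return 0;
}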
+ */ + return ExecClearTuple(slot); +} + +/* + * StoreIndexTuple + * Fill the slot with data from the index tuple. + * + * At some point this might be generally-useful functionality, but + * right now we don't need it elsewhere. + */ +static void +StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc) +{ + /* + * Note: we must use the tupdesc supplied by the AM in index_deform_tuple, + * not the slot's tupdesc, in case the latter has different datatypes + * (this happens for btree name_ops in particular). They'd better have + * the same number of columns though, as well as being datatype-compatible + * which is something we can't so easily check. + */ + Assert(slot->tts_tupleDescriptor->natts == itupdesc->natts); + + ExecClearTuple(slot); + index_deform_tuple(itup, itupdesc, slot->tts_values, slot->tts_isnull); + ExecStoreVirtualTuple(slot); +} + +/* + * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual + * + * This can't really happen, since an index can't supply CTID which would + * be necessary data for any potential EvalPlanQual target relation. If it + * did happen, the EPQ code would pass us the wrong data, namely a heap + * tuple not an index tuple. So throw an error. + */ +static bool +IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot) +{ + elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans"); + return false; /* keep compiler quiet */ +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScan(node) + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecIndexOnlyScan(PlanState *pstate) +{ + IndexOnlyScanState *node = castNode(IndexOnlyScanState, pstate); + + /* + * If we have runtime keys and they've not already been set up, do it now. + */ + if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady) + ExecReScan((PlanState *) node); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) IndexOnlyNext, + (ExecScanRecheckMtd) IndexOnlyRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanIndexOnlyScan(node) + * + * Recalculates the values of any scan keys whose value depends on + * information known at runtime, then rescans the indexed relation. + * + * Updating the scan key was formerly done separately in + * ExecUpdateIndexScanKeys. Integrating it into ReScan makes + * rescans of indices and relations/general streams more uniform. + * ---------------------------------------------------------------- + */ +void +ExecReScanIndexOnlyScan(IndexOnlyScanState *node) +{ + /* + * If we are doing runtime key calculations (ie, any of the index key + * values weren't simple Consts), compute the new key values. But first, + * reset the context so we don't leak memory as each outer tuple is + * scanned. Note this assumes that we will recalculate *all* runtime keys + * on each call. 
+ */ + if (node->ioss_NumRuntimeKeys != 0) + { + ExprContext *econtext = node->ioss_RuntimeContext; + + ResetExprContext(econtext); + ExecIndexEvalRuntimeKeys(econtext, + node->ioss_RuntimeKeys, + node->ioss_NumRuntimeKeys); + } + node->ioss_RuntimeKeysReady = true; + + /* reset index scan */ + if (node->ioss_ScanDesc) + index_rescan(node->ioss_ScanDesc, + node->ioss_ScanKeys, node->ioss_NumScanKeys, + node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); + + ExecScanReScan(&node->ss); +} + + +/* ---------------------------------------------------------------- + * ExecEndIndexOnlyScan + * ---------------------------------------------------------------- + */ +void +ExecEndIndexOnlyScan(IndexOnlyScanState *node) +{ + Relation indexRelationDesc; + IndexScanDesc indexScanDesc; + + /* + * extract information from the node + */ + indexRelationDesc = node->ioss_RelationDesc; + indexScanDesc = node->ioss_ScanDesc; + + /* Release VM buffer pin, if any. */ + if (node->ioss_VMBuffer != InvalidBuffer) + { + ReleaseBuffer(node->ioss_VMBuffer); + node->ioss_VMBuffer = InvalidBuffer; + } + + /* + * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext + */ +#ifdef NOT_USED + ExecFreeExprContext(&node->ss.ps); + if (node->ioss_RuntimeContext) + FreeExprContext(node->ioss_RuntimeContext, true); +#endif + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close the index relation (no-op if we didn't open it) + */ + if (indexScanDesc) + index_endscan(indexScanDesc); + if (indexRelationDesc) + index_close(indexRelationDesc, NoLock); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyMarkPos + * + * Note: we assume that no caller attempts to set a mark before having read + * at least one tuple. Otherwise, ioss_ScanDesc might still be NULL. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyMarkPos(IndexOnlyScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (epqstate != NULL) + { + /* + * We are inside an EvalPlanQual recheck. If a test tuple exists for + * this relation, then we shouldn't access the index at all. We would + * instead need to save, and later restore, the state of the + * relsubs_done flag, so that re-fetching the test tuple is possible. + * However, given the assumption that no caller sets a mark at the + * start of the scan, we can only get here with relsubs_done[i] + * already set, and so no state need be saved. 
+ */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexOnlyMarkPos call in EPQ recheck"); + return; + } + } + + index_markpos(node->ioss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyRestrPos + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyRestrPos(IndexOnlyScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (estate->es_epq_active != NULL) + { + /* See comments in ExecIndexMarkPos */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexOnlyRestrPos call in EPQ recheck"); + return; + } + } + + index_restrpos(node->ioss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecInitIndexOnlyScan + * + * Initializes the index scan's state information, creates + * scan keys, and opens the base and index relations. + * + * Note: index scans have 2 sets of state information because + * we have to keep track of the base relation and the + * index relation. + * ---------------------------------------------------------------- + */ +IndexOnlyScanState * +ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) +{ + IndexOnlyScanState *indexstate; + Relation currentRelation; + LOCKMODE lockmode; + TupleDesc tupDesc; + + /* + * create state structure + */ + indexstate = makeNode(IndexOnlyScanState); + indexstate->ss.ps.plan = (Plan *) node; + indexstate->ss.ps.state = estate; + indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &indexstate->ss.ps); + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + indexstate->ss.ss_currentRelation = currentRelation; + indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + + /* + * Build the scan tuple type using the indextlist generated by the + * planner. We use this, rather than the index's physical tuple + * descriptor, because the latter contains storage column types not the + * types of the original datums. (It's the AM's responsibility to return + * suitable data anyway.) + */ + tupDesc = ExecTypeFromTL(node->indextlist); + ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc, + &TTSOpsVirtual); + + /* + * We need another slot, in a format that's suitable for the table AM, for + * when we need to fetch a tuple from the table for rechecking visibility. + */ + indexstate->ioss_TableSlot = + ExecAllocTableSlot(&estate->es_tupleTable, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection info. The node's targetlist will + * contain Vars with varno = INDEX_VAR, referencing the scan tuple. 
+ */ + ExecInitResultTypeTL(&indexstate->ss.ps); + ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR); + + /* + * initialize child expressions + * + * Note: we don't initialize all of the indexorderby expression, only the + * sub-parts corresponding to runtime keys (see below). + */ + indexstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate); + indexstate->recheckqual = + ExecInitQual(node->recheckqual, (PlanState *) indexstate); + + /* + * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop + * here. This allows an index-advisor plugin to EXPLAIN a plan containing + * references to nonexistent indexes. + */ + if (eflags & EXEC_FLAG_EXPLAIN_ONLY) + return indexstate; + + /* Open the index relation. */ + lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; + indexstate->ioss_RelationDesc = index_open(node->indexid, lockmode); + + /* + * Initialize index-specific scan state + */ + indexstate->ioss_RuntimeKeysReady = false; + indexstate->ioss_RuntimeKeys = NULL; + indexstate->ioss_NumRuntimeKeys = 0; + + /* + * build the index scan keys from the index qualification + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->ioss_RelationDesc, + node->indexqual, + false, + &indexstate->ioss_ScanKeys, + &indexstate->ioss_NumScanKeys, + &indexstate->ioss_RuntimeKeys, + &indexstate->ioss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* + * any ORDER BY exprs have to be turned into scankeys in the same way + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->ioss_RelationDesc, + node->indexorderby, + true, + &indexstate->ioss_OrderByKeys, + &indexstate->ioss_NumOrderByKeys, + &indexstate->ioss_RuntimeKeys, + &indexstate->ioss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* + * If we have runtime keys, we need an ExprContext to evaluate them. The + * node's standard context won't do because we want to reset that context + * for every tuple. So, build another context just like the other one... + * -tgl 7/11/00 + */ + if (indexstate->ioss_NumRuntimeKeys != 0) + { + ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext; + + ExecAssignExprContext(estate, &indexstate->ss.ps); + indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext; + indexstate->ss.ps.ps_ExprContext = stdecontext; + } + else + { + indexstate->ioss_RuntimeContext = NULL; + } + + /* + * all done. + */ + return indexstate; +} + +/* ---------------------------------------------------------------- + * Parallel Index-only Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc, + estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanInitializeDSM + * + * Set up a parallel index-only scan descriptor. 
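+ *
+ * The leader allocates the descriptor in dynamic shared memory through
+ * the pcxt->toc table of contents and begins its own scan on it; each
+ * worker later attaches to the same descriptor in
+ * ExecIndexOnlyScanInitializeWorker.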
+ * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + ParallelIndexScanDesc piscan; + + piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen); + index_parallelscan_initialize(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + estate->es_snapshot, + piscan); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan); + node->ioss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, + piscan); + node->ioss_ScanDesc->xs_want_itup = true; + node->ioss_VMBuffer = InvalidBuffer; + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. + */ + if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady) + index_rescan(node->ioss_ScanDesc, + node->ioss_ScanKeys, node->ioss_NumScanKeys, + node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->ioss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanInitializeWorker + * + * Copy relevant information from TOC into planstate. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelIndexScanDesc piscan; + + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->ioss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, + piscan); + node->ioss_ScanDesc->xs_want_itup = true; + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. + */ + if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady) + index_rescan(node->ioss_ScanDesc, + node->ioss_ScanKeys, node->ioss_NumScanKeys, + node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); +} diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c new file mode 100644 index 0000000..add29b3 --- /dev/null +++ b/src/backend/executor/nodeIndexscan.c @@ -0,0 +1,1747 @@ +/*------------------------------------------------------------------------- + * + * nodeIndexscan.c + * Routines to support indexed scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeIndexscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecIndexScan scans a relation using an index + * IndexNext retrieve next tuple using index + * IndexNextWithReorder same, but recheck ORDER BY expressions + * ExecInitIndexScan creates and initializes state info. + * ExecReScanIndexScan rescans the indexed relation. + * ExecEndIndexScan releases all storage. + * ExecIndexMarkPos marks scan position. + * ExecIndexRestrPos restores scan position. 
+ * ExecIndexScanEstimate estimates DSM space needed for parallel index scan + * ExecIndexScanInitializeDSM initialize DSM for parallel indexscan + * ExecIndexScanReInitializeDSM reinitialize DSM for fresh scan + * ExecIndexScanInitializeWorker attach to DSM info in parallel worker + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "catalog/pg_am.h" +#include "executor/execdebug.h" +#include "executor/nodeIndexscan.h" +#include "lib/pairingheap.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "utils/array.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * When an ordering operator is used, tuples fetched from the index that + * need to be reordered are queued in a pairing heap, as ReorderTuples. + */ +typedef struct +{ + pairingheap_node ph_node; + HeapTuple htup; + Datum *orderbyvals; + bool *orderbynulls; +} ReorderTuple; + +static TupleTableSlot *IndexNext(IndexScanState *node); +static TupleTableSlot *IndexNextWithReorder(IndexScanState *node); +static void EvalOrderByExpressions(IndexScanState *node, ExprContext *econtext); +static bool IndexRecheck(IndexScanState *node, TupleTableSlot *slot); +static int cmp_orderbyvals(const Datum *adist, const bool *anulls, + const Datum *bdist, const bool *bnulls, + IndexScanState *node); +static int reorderqueue_cmp(const pairingheap_node *a, + const pairingheap_node *b, void *arg); +static void reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, + Datum *orderbyvals, bool *orderbynulls); +static HeapTuple reorderqueue_pop(IndexScanState *node); + + +/* ---------------------------------------------------------------- + * IndexNext + * + * Retrieve a tuple from the IndexScan node's currentRelation + * using the index specified in the IndexScanState information. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +IndexNext(IndexScanState *node) +{ + EState *estate; + ExprContext *econtext; + ScanDirection direction; + IndexScanDesc scandesc; + TupleTableSlot *slot; + + /* + * extract necessary information from index scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + /* flip direction if this is an overall backward scan */ + if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)) + { + if (ScanDirectionIsForward(direction)) + direction = BackwardScanDirection; + else if (ScanDirectionIsBackward(direction)) + direction = ForwardScanDirection; + } + scandesc = node->iss_ScanDesc; + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the index scan is not parallel, or if we're + * serially executing an index scan that was planned to be parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys); + + node->iss_ScanDesc = scandesc; + + /* + * If no run-time keys to calculate or they are ready, go ahead and + * pass the scankeys to the index AM. + */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(scandesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); + } + + /* + * ok, now that we have what we need, fetch the next tuple. 
+ */ + while (index_getnext_slot(scandesc, direction, slot)) + { + CHECK_FOR_INTERRUPTS(); + + /* + * If the index was lossy, we have to recheck the index quals using + * the fetched tuple. + */ + if (scandesc->xs_recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + continue; + } + } + + return slot; + } + + /* + * if we get here it means the index scan failed so we are at the end of + * the scan.. + */ + node->iss_ReachedEnd = true; + return ExecClearTuple(slot); +} + +/* ---------------------------------------------------------------- + * IndexNextWithReorder + * + * Like IndexNext, but this version can also re-check ORDER BY + * expressions, and reorder the tuples as necessary. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +IndexNextWithReorder(IndexScanState *node) +{ + EState *estate; + ExprContext *econtext; + IndexScanDesc scandesc; + TupleTableSlot *slot; + ReorderTuple *topmost = NULL; + bool was_exact; + Datum *lastfetched_vals; + bool *lastfetched_nulls; + int cmp; + + estate = node->ss.ps.state; + + /* + * Only forward scan is supported with reordering. Note: we can get away + * with just Asserting here because the system will not try to run the + * plan backwards if ExecSupportsBackwardScan() says it won't work. + * Currently, that is guaranteed because no index AMs support both + * amcanorderbyop and amcanbackward; if any ever do, + * ExecSupportsBackwardScan() will need to consider indexorderbys + * explicitly. + */ + Assert(!ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)); + Assert(ScanDirectionIsForward(estate->es_direction)); + + scandesc = node->iss_ScanDesc; + econtext = node->ss.ps.ps_ExprContext; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the index scan is not parallel, or if we're + * serially executing an index scan that was planned to be parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys); + + node->iss_ScanDesc = scandesc; + + /* + * If no run-time keys to calculate or they are ready, go ahead and + * pass the scankeys to the index AM. + */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(scandesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); + } + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + /* + * Check the reorder queue first. If the topmost tuple in the queue + * has an ORDER BY value smaller than (or equal to) the value last + * returned by the index, we can return it now. + */ + if (!pairingheap_is_empty(node->iss_ReorderQueue)) + { + topmost = (ReorderTuple *) pairingheap_first(node->iss_ReorderQueue); + + if (node->iss_ReachedEnd || + cmp_orderbyvals(topmost->orderbyvals, + topmost->orderbynulls, + scandesc->xs_orderbyvals, + scandesc->xs_orderbynulls, + node) <= 0) + { + HeapTuple tuple; + + tuple = reorderqueue_pop(node); + + /* Pass 'true', as the tuple in the queue is a palloc'd copy */ + ExecForceStoreHeapTuple(tuple, slot, true); + return slot; + } + } + else if (node->iss_ReachedEnd) + { + /* Queue is empty, and no more tuples from index. We're done. */ + return ExecClearTuple(slot); + } + + /* + * Fetch next tuple from the index. 
+ */ +next_indextuple: + if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + { + /* + * No more tuples from the index. But we still need to drain any + * remaining tuples from the queue before we're done. + */ + node->iss_ReachedEnd = true; + continue; + } + + /* + * If the index was lossy, we have to recheck the index quals and + * ORDER BY expressions using the fetched tuple. + */ + if (scandesc->xs_recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + goto next_indextuple; + } + } + + if (scandesc->xs_recheckorderby) + { + econtext->ecxt_scantuple = slot; + ResetExprContext(econtext); + EvalOrderByExpressions(node, econtext); + + /* + * Was the ORDER BY value returned by the index accurate? The + * recheck flag means that the index can return inaccurate values, + * but then again, the value returned for any particular tuple + * could also be exactly correct. Compare the value returned by + * the index with the recalculated value. (If the value returned + * by the index happened to be exact right, we can often avoid + * pushing the tuple to the queue, just to pop it back out again.) + */ + cmp = cmp_orderbyvals(node->iss_OrderByValues, + node->iss_OrderByNulls, + scandesc->xs_orderbyvals, + scandesc->xs_orderbynulls, + node); + if (cmp < 0) + elog(ERROR, "index returned tuples in wrong order"); + else if (cmp == 0) + was_exact = true; + else + was_exact = false; + lastfetched_vals = node->iss_OrderByValues; + lastfetched_nulls = node->iss_OrderByNulls; + } + else + { + was_exact = true; + lastfetched_vals = scandesc->xs_orderbyvals; + lastfetched_nulls = scandesc->xs_orderbynulls; + } + + /* + * Can we return this tuple immediately, or does it need to be pushed + * to the reorder queue? If the ORDER BY expression values returned + * by the index were inaccurate, we can't return it yet, because the + * next tuple from the index might need to come before this one. Also, + * we can't return it yet if there are any smaller tuples in the queue + * already. + */ + if (!was_exact || (topmost && cmp_orderbyvals(lastfetched_vals, + lastfetched_nulls, + topmost->orderbyvals, + topmost->orderbynulls, + node) > 0)) + { + /* Put this tuple to the queue */ + reorderqueue_push(node, slot, lastfetched_vals, lastfetched_nulls); + continue; + } + else + { + /* Can return this tuple immediately. */ + return slot; + } + } + + /* + * if we get here it means the index scan failed so we are at the end of + * the scan.. + */ + return ExecClearTuple(slot); +} + +/* + * Calculate the expressions in the ORDER BY clause, based on the heap tuple. 
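+ *
+ * The recomputed values are stored in iss_OrderByValues / iss_OrderByNulls,
+ * where the caller compares them against the (possibly inexact) ORDER BY
+ * values that the index AM reported for the same tuple.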
+ */ +static void +EvalOrderByExpressions(IndexScanState *node, ExprContext *econtext) +{ + int i; + ListCell *l; + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + i = 0; + foreach(l, node->indexorderbyorig) + { + ExprState *orderby = (ExprState *) lfirst(l); + + node->iss_OrderByValues[i] = ExecEvalExpr(orderby, + econtext, + &node->iss_OrderByNulls[i]); + i++; + } + + MemoryContextSwitchTo(oldContext); +} + +/* + * IndexRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +IndexRecheck(IndexScanState *node, TupleTableSlot *slot) +{ + ExprContext *econtext; + + /* + * extract necessary information from index scan node + */ + econtext = node->ss.ps.ps_ExprContext; + + /* Does the tuple meet the indexqual condition? */ + econtext->ecxt_scantuple = slot; + return ExecQualAndReset(node->indexqualorig, econtext); +} + + +/* + * Compare ORDER BY expression values. + */ +static int +cmp_orderbyvals(const Datum *adist, const bool *anulls, + const Datum *bdist, const bool *bnulls, + IndexScanState *node) +{ + int i; + int result; + + for (i = 0; i < node->iss_NumOrderByKeys; i++) + { + SortSupport ssup = &node->iss_SortSupport[i]; + + /* + * Handle nulls. We only need to support NULLS LAST ordering, because + * match_pathkeys_to_index() doesn't consider indexorderby + * implementation otherwise. + */ + if (anulls[i] && !bnulls[i]) + return 1; + else if (!anulls[i] && bnulls[i]) + return -1; + else if (anulls[i] && bnulls[i]) + return 0; + + result = ssup->comparator(adist[i], bdist[i], ssup); + if (result != 0) + return result; + } + + return 0; +} + +/* + * Pairing heap provides getting topmost (greatest) element while KNN provides + * ascending sort. That's why we invert the sort order. + */ +static int +reorderqueue_cmp(const pairingheap_node *a, const pairingheap_node *b, + void *arg) +{ + ReorderTuple *rta = (ReorderTuple *) a; + ReorderTuple *rtb = (ReorderTuple *) b; + IndexScanState *node = (IndexScanState *) arg; + + /* exchange argument order to invert the sort order */ + return cmp_orderbyvals(rtb->orderbyvals, rtb->orderbynulls, + rta->orderbyvals, rta->orderbynulls, + node); +} + +/* + * Helper function to push a tuple to the reorder queue. + */ +static void +reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, + Datum *orderbyvals, bool *orderbynulls) +{ + IndexScanDesc scandesc = node->iss_ScanDesc; + EState *estate = node->ss.ps.state; + MemoryContext oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + ReorderTuple *rt; + int i; + + rt = (ReorderTuple *) palloc(sizeof(ReorderTuple)); + rt->htup = ExecCopySlotHeapTuple(slot); + rt->orderbyvals = + (Datum *) palloc(sizeof(Datum) * scandesc->numberOfOrderBys); + rt->orderbynulls = + (bool *) palloc(sizeof(bool) * scandesc->numberOfOrderBys); + for (i = 0; i < node->iss_NumOrderByKeys; i++) + { + if (!orderbynulls[i]) + rt->orderbyvals[i] = datumCopy(orderbyvals[i], + node->iss_OrderByTypByVals[i], + node->iss_OrderByTypLens[i]); + else + rt->orderbyvals[i] = (Datum) 0; + rt->orderbynulls[i] = orderbynulls[i]; + } + pairingheap_add(node->iss_ReorderQueue, &rt->ph_node); + + MemoryContextSwitchTo(oldContext); +} + +/* + * Helper function to pop the next tuple from the reorder queue. 
+ */ +static HeapTuple +reorderqueue_pop(IndexScanState *node) +{ + HeapTuple result; + ReorderTuple *topmost; + int i; + + topmost = (ReorderTuple *) pairingheap_remove_first(node->iss_ReorderQueue); + + result = topmost->htup; + for (i = 0; i < node->iss_NumOrderByKeys; i++) + { + if (!node->iss_OrderByTypByVals[i] && !topmost->orderbynulls[i]) + pfree(DatumGetPointer(topmost->orderbyvals[i])); + } + pfree(topmost->orderbyvals); + pfree(topmost->orderbynulls); + pfree(topmost); + + return result; +} + + +/* ---------------------------------------------------------------- + * ExecIndexScan(node) + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecIndexScan(PlanState *pstate) +{ + IndexScanState *node = castNode(IndexScanState, pstate); + + /* + * If we have runtime keys and they've not already been set up, do it now. + */ + if (node->iss_NumRuntimeKeys != 0 && !node->iss_RuntimeKeysReady) + ExecReScan((PlanState *) node); + + if (node->iss_NumOrderByKeys > 0) + return ExecScan(&node->ss, + (ExecScanAccessMtd) IndexNextWithReorder, + (ExecScanRecheckMtd) IndexRecheck); + else + return ExecScan(&node->ss, + (ExecScanAccessMtd) IndexNext, + (ExecScanRecheckMtd) IndexRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanIndexScan(node) + * + * Recalculates the values of any scan keys whose value depends on + * information known at runtime, then rescans the indexed relation. + * + * Updating the scan key was formerly done separately in + * ExecUpdateIndexScanKeys. Integrating it into ReScan makes + * rescans of indices and relations/general streams more uniform. + * ---------------------------------------------------------------- + */ +void +ExecReScanIndexScan(IndexScanState *node) +{ + /* + * If we are doing runtime key calculations (ie, any of the index key + * values weren't simple Consts), compute the new key values. But first, + * reset the context so we don't leak memory as each outer tuple is + * scanned. Note this assumes that we will recalculate *all* runtime keys + * on each call. + */ + if (node->iss_NumRuntimeKeys != 0) + { + ExprContext *econtext = node->iss_RuntimeContext; + + ResetExprContext(econtext); + ExecIndexEvalRuntimeKeys(econtext, + node->iss_RuntimeKeys, + node->iss_NumRuntimeKeys); + } + node->iss_RuntimeKeysReady = true; + + /* flush the reorder queue */ + if (node->iss_ReorderQueue) + { + HeapTuple tuple; + while (!pairingheap_is_empty(node->iss_ReorderQueue)) + { + tuple = reorderqueue_pop(node); + heap_freetuple(tuple); + } + } + + /* reset index scan */ + if (node->iss_ScanDesc) + index_rescan(node->iss_ScanDesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); + node->iss_ReachedEnd = false; + + ExecScanReScan(&node->ss); +} + + +/* + * ExecIndexEvalRuntimeKeys + * Evaluate any runtime key values, and update the scankeys. + */ +void +ExecIndexEvalRuntimeKeys(ExprContext *econtext, + IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys) +{ + int j; + MemoryContext oldContext; + + /* We want to keep the key values in per-tuple memory */ + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + for (j = 0; j < numRuntimeKeys; j++) + { + ScanKey scan_key = runtimeKeys[j].scan_key; + ExprState *key_expr = runtimeKeys[j].key_expr; + Datum scanvalue; + bool isNull; + + /* + * For each run-time key, extract the run-time expression and evaluate + * it with respect to the current context. 
We then stick the result + * into the proper scan key. + * + * Note: the result of the eval could be a pass-by-ref value that's + * stored in some outer scan's tuple, not in + * econtext->ecxt_per_tuple_memory. We assume that the outer tuple + * will stay put throughout our scan. If this is wrong, we could copy + * the result into our context explicitly, but I think that's not + * necessary. + * + * It's also entirely possible that the result of the eval is a + * toasted value. In this case we should forcibly detoast it, to + * avoid repeat detoastings each time the value is examined by an + * index support function. + */ + scanvalue = ExecEvalExpr(key_expr, + econtext, + &isNull); + if (isNull) + { + scan_key->sk_argument = scanvalue; + scan_key->sk_flags |= SK_ISNULL; + } + else + { + if (runtimeKeys[j].key_toastable) + scanvalue = PointerGetDatum(PG_DETOAST_DATUM(scanvalue)); + scan_key->sk_argument = scanvalue; + scan_key->sk_flags &= ~SK_ISNULL; + } + } + + MemoryContextSwitchTo(oldContext); +} + +/* + * ExecIndexEvalArrayKeys + * Evaluate any array key values, and set up to iterate through arrays. + * + * Returns true if there are array elements to consider; false means there + * is at least one null or empty array, so no match is possible. On true + * result, the scankeys are initialized with the first elements of the arrays. + */ +bool +ExecIndexEvalArrayKeys(ExprContext *econtext, + IndexArrayKeyInfo *arrayKeys, int numArrayKeys) +{ + bool result = true; + int j; + MemoryContext oldContext; + + /* We want to keep the arrays in per-tuple memory */ + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + for (j = 0; j < numArrayKeys; j++) + { + ScanKey scan_key = arrayKeys[j].scan_key; + ExprState *array_expr = arrayKeys[j].array_expr; + Datum arraydatum; + bool isNull; + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; + + /* + * Compute and deconstruct the array expression. (Notes in + * ExecIndexEvalRuntimeKeys() apply here too.) + */ + arraydatum = ExecEvalExpr(array_expr, + econtext, + &isNull); + if (isNull) + { + result = false; + break; /* no point in evaluating more */ + } + arrayval = DatumGetArrayTypeP(arraydatum); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + if (num_elems <= 0) + { + result = false; + break; /* no point in evaluating more */ + } + + /* + * Note: we expect the previous array data, if any, to be + * automatically freed by resetting the per-tuple context; hence no + * pfree's here. + */ + arrayKeys[j].elem_values = elem_values; + arrayKeys[j].elem_nulls = elem_nulls; + arrayKeys[j].num_elems = num_elems; + scan_key->sk_argument = elem_values[0]; + if (elem_nulls[0]) + scan_key->sk_flags |= SK_ISNULL; + else + scan_key->sk_flags &= ~SK_ISNULL; + arrayKeys[j].next_elem = 1; + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * ExecIndexAdvanceArrayKeys + * Advance to the next set of array key values, if any. + * + * Returns true if there is another set of values to consider, false if not. + * On true result, the scankeys are initialized with the next set of values. 
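+ *
+ * As an illustration, if ExecIndexEvalArrayKeys set up two array keys with
+ * element lists {1,2} and {10,20}, the scankeys start out at (1,10) and
+ * successive calls here advance them to (1,20), (2,10) and (2,20) before
+ * returning false; the rightmost key changes fastest (see below).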
+ */ +bool +ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys) +{ + bool found = false; + int j; + + /* + * Note we advance the rightmost array key most quickly, since it will + * correspond to the lowest-order index column among the available + * qualifications. This is hypothesized to result in better locality of + * access in the index. + */ + for (j = numArrayKeys - 1; j >= 0; j--) + { + ScanKey scan_key = arrayKeys[j].scan_key; + int next_elem = arrayKeys[j].next_elem; + int num_elems = arrayKeys[j].num_elems; + Datum *elem_values = arrayKeys[j].elem_values; + bool *elem_nulls = arrayKeys[j].elem_nulls; + + if (next_elem >= num_elems) + { + next_elem = 0; + found = false; /* need to advance next array key */ + } + else + found = true; + scan_key->sk_argument = elem_values[next_elem]; + if (elem_nulls[next_elem]) + scan_key->sk_flags |= SK_ISNULL; + else + scan_key->sk_flags &= ~SK_ISNULL; + arrayKeys[j].next_elem = next_elem + 1; + if (found) + break; + } + + return found; +} + + +/* ---------------------------------------------------------------- + * ExecEndIndexScan + * ---------------------------------------------------------------- + */ +void +ExecEndIndexScan(IndexScanState *node) +{ + Relation indexRelationDesc; + IndexScanDesc indexScanDesc; + + /* + * extract information from the node + */ + indexRelationDesc = node->iss_RelationDesc; + indexScanDesc = node->iss_ScanDesc; + + /* + * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext + */ +#ifdef NOT_USED + ExecFreeExprContext(&node->ss.ps); + if (node->iss_RuntimeContext) + FreeExprContext(node->iss_RuntimeContext, true); +#endif + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close the index relation (no-op if we didn't open it) + */ + if (indexScanDesc) + index_endscan(indexScanDesc); + if (indexRelationDesc) + index_close(indexRelationDesc, NoLock); +} + +/* ---------------------------------------------------------------- + * ExecIndexMarkPos + * + * Note: we assume that no caller attempts to set a mark before having read + * at least one tuple. Otherwise, iss_ScanDesc might still be NULL. + * ---------------------------------------------------------------- + */ +void +ExecIndexMarkPos(IndexScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (epqstate != NULL) + { + /* + * We are inside an EvalPlanQual recheck. If a test tuple exists for + * this relation, then we shouldn't access the index at all. We would + * instead need to save, and later restore, the state of the + * relsubs_done flag, so that re-fetching the test tuple is possible. + * However, given the assumption that no caller sets a mark at the + * start of the scan, we can only get here with relsubs_done[i] + * already set, and so no state need be saved. 
+ */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexMarkPos call in EPQ recheck"); + return; + } + } + + index_markpos(node->iss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexRestrPos + * ---------------------------------------------------------------- + */ +void +ExecIndexRestrPos(IndexScanState *node) +{ + EState *estate = node->ss.ps.state; + EPQState *epqstate = estate->es_epq_active; + + if (estate->es_epq_active != NULL) + { + /* See comments in ExecIndexMarkPos */ + Index scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid; + + Assert(scanrelid > 0); + if (epqstate->relsubs_slot[scanrelid - 1] != NULL || + epqstate->relsubs_rowmark[scanrelid - 1] != NULL) + { + /* Verify the claim above */ + if (!epqstate->relsubs_done[scanrelid - 1]) + elog(ERROR, "unexpected ExecIndexRestrPos call in EPQ recheck"); + return; + } + } + + index_restrpos(node->iss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecInitIndexScan + * + * Initializes the index scan's state information, creates + * scan keys, and opens the base and index relations. + * + * Note: index scans have 2 sets of state information because + * we have to keep track of the base relation and the + * index relation. + * ---------------------------------------------------------------- + */ +IndexScanState * +ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) +{ + IndexScanState *indexstate; + Relation currentRelation; + LOCKMODE lockmode; + + /* + * create state structure + */ + indexstate = makeNode(IndexScanState); + indexstate->ss.ps.plan = (Plan *) node; + indexstate->ss.ps.state = estate; + indexstate->ss.ps.ExecProcNode = ExecIndexScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &indexstate->ss.ps); + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + indexstate->ss.ss_currentRelation = currentRelation; + indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &indexstate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&indexstate->ss.ps); + ExecAssignScanProjectionInfo(&indexstate->ss); + + /* + * initialize child expressions + * + * Note: we don't initialize all of the indexqual expression, only the + * sub-parts corresponding to runtime keys (see below). Likewise for + * indexorderby, if any. But the indexqualorig expression is always + * initialized even though it will only be used in some uncommon cases --- + * would be nice to improve that. (Problem is that any SubPlans present + * in the expression must be found now...) + */ + indexstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate); + indexstate->indexqualorig = + ExecInitQual(node->indexqualorig, (PlanState *) indexstate); + indexstate->indexorderbyorig = + ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate); + + /* + * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop + * here. 
This allows an index-advisor plugin to EXPLAIN a plan containing + * references to nonexistent indexes. + */ + if (eflags & EXEC_FLAG_EXPLAIN_ONLY) + return indexstate; + + /* Open the index relation. */ + lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; + indexstate->iss_RelationDesc = index_open(node->indexid, lockmode); + + /* + * Initialize index-specific scan state + */ + indexstate->iss_RuntimeKeysReady = false; + indexstate->iss_RuntimeKeys = NULL; + indexstate->iss_NumRuntimeKeys = 0; + + /* + * build the index scan keys from the index qualification + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->iss_RelationDesc, + node->indexqual, + false, + &indexstate->iss_ScanKeys, + &indexstate->iss_NumScanKeys, + &indexstate->iss_RuntimeKeys, + &indexstate->iss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* + * any ORDER BY exprs have to be turned into scankeys in the same way + */ + ExecIndexBuildScanKeys((PlanState *) indexstate, + indexstate->iss_RelationDesc, + node->indexorderby, + true, + &indexstate->iss_OrderByKeys, + &indexstate->iss_NumOrderByKeys, + &indexstate->iss_RuntimeKeys, + &indexstate->iss_NumRuntimeKeys, + NULL, /* no ArrayKeys */ + NULL); + + /* Initialize sort support, if we need to re-check ORDER BY exprs */ + if (indexstate->iss_NumOrderByKeys > 0) + { + int numOrderByKeys = indexstate->iss_NumOrderByKeys; + int i; + ListCell *lco; + ListCell *lcx; + + /* + * Prepare sort support, and look up the data type for each ORDER BY + * expression. + */ + Assert(numOrderByKeys == list_length(node->indexorderbyops)); + Assert(numOrderByKeys == list_length(node->indexorderbyorig)); + indexstate->iss_SortSupport = (SortSupportData *) + palloc0(numOrderByKeys * sizeof(SortSupportData)); + indexstate->iss_OrderByTypByVals = (bool *) + palloc(numOrderByKeys * sizeof(bool)); + indexstate->iss_OrderByTypLens = (int16 *) + palloc(numOrderByKeys * sizeof(int16)); + i = 0; + forboth(lco, node->indexorderbyops, lcx, node->indexorderbyorig) + { + Oid orderbyop = lfirst_oid(lco); + Node *orderbyexpr = (Node *) lfirst(lcx); + Oid orderbyType = exprType(orderbyexpr); + Oid orderbyColl = exprCollation(orderbyexpr); + SortSupport orderbysort = &indexstate->iss_SortSupport[i]; + + /* Initialize sort support */ + orderbysort->ssup_cxt = CurrentMemoryContext; + orderbysort->ssup_collation = orderbyColl; + /* See cmp_orderbyvals() comments on NULLS LAST */ + orderbysort->ssup_nulls_first = false; + /* ssup_attno is unused here and elsewhere */ + orderbysort->ssup_attno = 0; + /* No abbreviation */ + orderbysort->abbreviate = false; + PrepareSortSupportFromOrderingOp(orderbyop, orderbysort); + + get_typlenbyval(orderbyType, + &indexstate->iss_OrderByTypLens[i], + &indexstate->iss_OrderByTypByVals[i]); + i++; + } + + /* allocate arrays to hold the re-calculated distances */ + indexstate->iss_OrderByValues = (Datum *) + palloc(numOrderByKeys * sizeof(Datum)); + indexstate->iss_OrderByNulls = (bool *) + palloc(numOrderByKeys * sizeof(bool)); + + /* and initialize the reorder queue */ + indexstate->iss_ReorderQueue = pairingheap_allocate(reorderqueue_cmp, + indexstate); + } + + /* + * If we have runtime keys, we need an ExprContext to evaluate them. The + * node's standard context won't do because we want to reset that context + * for every tuple. So, build another context just like the other one... 
+ * -tgl 7/11/00 + */ + if (indexstate->iss_NumRuntimeKeys != 0) + { + ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext; + + ExecAssignExprContext(estate, &indexstate->ss.ps); + indexstate->iss_RuntimeContext = indexstate->ss.ps.ps_ExprContext; + indexstate->ss.ps.ps_ExprContext = stdecontext; + } + else + { + indexstate->iss_RuntimeContext = NULL; + } + + /* + * all done. + */ + return indexstate; +} + + +/* + * ExecIndexBuildScanKeys + * Build the index scan keys from the index qualification expressions + * + * The index quals are passed to the index AM in the form of a ScanKey array. + * This routine sets up the ScanKeys, fills in all constant fields of the + * ScanKeys, and prepares information about the keys that have non-constant + * comparison values. We divide index qual expressions into five types: + * + * 1. Simple operator with constant comparison value ("indexkey op constant"). + * For these, we just fill in a ScanKey containing the constant value. + * + * 2. Simple operator with non-constant value ("indexkey op expression"). + * For these, we create a ScanKey with everything filled in except the + * expression value, and set up an IndexRuntimeKeyInfo struct to drive + * evaluation of the expression at the right times. + * + * 3. RowCompareExpr ("(indexkey, indexkey, ...) op (expr, expr, ...)"). + * For these, we create a header ScanKey plus a subsidiary ScanKey array, + * as specified in access/skey.h. The elements of the row comparison + * can have either constant or non-constant comparison values. + * + * 4. ScalarArrayOpExpr ("indexkey op ANY (array-expression)"). If the index + * supports amsearcharray, we handle these the same as simple operators, + * setting the SK_SEARCHARRAY flag to tell the AM to handle them. Otherwise, + * we create a ScanKey with everything filled in except the comparison value, + * and set up an IndexArrayKeyInfo struct to drive processing of the qual. + * (Note that if we use an IndexArrayKeyInfo struct, the array expression is + * always treated as requiring runtime evaluation, even if it's a constant.) + * + * 5. NullTest ("indexkey IS NULL/IS NOT NULL"). We just fill in the + * ScanKey properly. + * + * This code is also used to prepare ORDER BY expressions for amcanorderbyop + * indexes. The behavior is exactly the same, except that we have to look up + * the operator differently. Note that only cases 1 and 2 are currently + * possible for ORDER BY. + * + * Input params are: + * + * planstate: executor state node we are working for + * index: the index we are building scan keys for + * quals: indexquals (or indexorderbys) expressions + * isorderby: true if processing ORDER BY exprs, false if processing quals + * *runtimeKeys: ptr to pre-existing IndexRuntimeKeyInfos, or NULL if none + * *numRuntimeKeys: number of pre-existing runtime keys + * + * Output params are: + * + * *scanKeys: receives ptr to array of ScanKeys + * *numScanKeys: receives number of scankeys + * *runtimeKeys: receives ptr to array of IndexRuntimeKeyInfos, or NULL if none + * *numRuntimeKeys: receives number of runtime keys + * *arrayKeys: receives ptr to array of IndexArrayKeyInfos, or NULL if none + * *numArrayKeys: receives number of array keys + * + * Caller may pass NULL for arrayKeys and numArrayKeys to indicate that + * IndexArrayKeyInfos are not supported. 
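+ *
+ * For illustration, the five cases correspond to quals such as "a = 42" (1),
+ * "a = $1" or a nestloop parameter (2), "(a, b) >= (1, 2)" (3),
+ * "a = ANY ('{1,2,3}')" (4), and "a IS NULL" (5), where "a" and "b" are
+ * index columns.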
+ */ +void +ExecIndexBuildScanKeys(PlanState *planstate, Relation index, + List *quals, bool isorderby, + ScanKey *scanKeys, int *numScanKeys, + IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, + IndexArrayKeyInfo **arrayKeys, int *numArrayKeys) +{ + ListCell *qual_cell; + ScanKey scan_keys; + IndexRuntimeKeyInfo *runtime_keys; + IndexArrayKeyInfo *array_keys; + int n_scan_keys; + int n_runtime_keys; + int max_runtime_keys; + int n_array_keys; + int j; + + /* Allocate array for ScanKey structs: one per qual */ + n_scan_keys = list_length(quals); + scan_keys = (ScanKey) palloc(n_scan_keys * sizeof(ScanKeyData)); + + /* + * runtime_keys array is dynamically resized as needed. We handle it this + * way so that the same runtime keys array can be shared between + * indexquals and indexorderbys, which will be processed in separate calls + * of this function. Caller must be sure to pass in NULL/0 for first + * call. + */ + runtime_keys = *runtimeKeys; + n_runtime_keys = max_runtime_keys = *numRuntimeKeys; + + /* Allocate array_keys as large as it could possibly need to be */ + array_keys = (IndexArrayKeyInfo *) + palloc0(n_scan_keys * sizeof(IndexArrayKeyInfo)); + n_array_keys = 0; + + /* + * for each opclause in the given qual, convert the opclause into a single + * scan key + */ + j = 0; + foreach(qual_cell, quals) + { + Expr *clause = (Expr *) lfirst(qual_cell); + ScanKey this_scan_key = &scan_keys[j++]; + Oid opno; /* operator's OID */ + RegProcedure opfuncid; /* operator proc id used in scan */ + Oid opfamily; /* opfamily of index column */ + int op_strategy; /* operator's strategy number */ + Oid op_lefttype; /* operator's declared input types */ + Oid op_righttype; + Expr *leftop; /* expr on lhs of operator */ + Expr *rightop; /* expr on rhs ... */ + AttrNumber varattno; /* att number used in scan */ + int indnkeyatts; + + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(index); + if (IsA(clause, OpExpr)) + { + /* indexkey op const or indexkey op expression */ + int flags = 0; + Datum scanvalue; + + opno = ((OpExpr *) clause)->opno; + opfuncid = ((OpExpr *) clause)->opfuncid; + + /* + * leftop should be the index key Var, possibly relabeled + */ + leftop = (Expr *) get_leftop(clause); + + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "indexqual doesn't have key on left side"); + + varattno = ((Var *) leftop)->varattno; + if (varattno < 1 || varattno > indnkeyatts) + elog(ERROR, "bogus index qualification"); + + /* + * We have to look up the operator's strategy number. This + * provides a cross-check that the operator does match the index. 
+ */ + opfamily = index->rd_opfamily[varattno - 1]; + + get_op_opfamily_properties(opno, opfamily, isorderby, + &op_strategy, + &op_lefttype, + &op_righttype); + + if (isorderby) + flags |= SK_ORDER_BY; + + /* + * rightop is the constant or variable comparison value + */ + rightop = (Expr *) get_rightop(clause); + + if (rightop && IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + Assert(rightop != NULL); + + if (IsA(rightop, Const)) + { + /* OK, simple constant comparison value */ + scanvalue = ((Const *) rightop)->constvalue; + if (((Const *) rightop)->constisnull) + flags |= SK_ISNULL; + } + else + { + /* Need to treat this one as a runtime key */ + if (n_runtime_keys >= max_runtime_keys) + { + if (max_runtime_keys == 0) + { + max_runtime_keys = 8; + runtime_keys = (IndexRuntimeKeyInfo *) + palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + else + { + max_runtime_keys *= 2; + runtime_keys = (IndexRuntimeKeyInfo *) + repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + } + runtime_keys[n_runtime_keys].scan_key = this_scan_key; + runtime_keys[n_runtime_keys].key_expr = + ExecInitExpr(rightop, planstate); + runtime_keys[n_runtime_keys].key_toastable = + TypeIsToastable(op_righttype); + n_runtime_keys++; + scanvalue = (Datum) 0; + } + + /* + * initialize the scan key's fields appropriately + */ + ScanKeyEntryInitialize(this_scan_key, + flags, + varattno, /* attribute number to scan */ + op_strategy, /* op's strategy */ + op_righttype, /* strategy subtype */ + ((OpExpr *) clause)->inputcollid, /* collation */ + opfuncid, /* reg proc to use */ + scanvalue); /* constant */ + } + else if (IsA(clause, RowCompareExpr)) + { + /* (indexkey, indexkey, ...) op (expression, expression, ...) */ + RowCompareExpr *rc = (RowCompareExpr *) clause; + ScanKey first_sub_key; + int n_sub_key; + ListCell *largs_cell; + ListCell *rargs_cell; + ListCell *opnos_cell; + ListCell *collids_cell; + + Assert(!isorderby); + + first_sub_key = (ScanKey) + palloc(list_length(rc->opnos) * sizeof(ScanKeyData)); + n_sub_key = 0; + + /* Scan RowCompare columns and generate subsidiary ScanKey items */ + forfour(largs_cell, rc->largs, rargs_cell, rc->rargs, + opnos_cell, rc->opnos, collids_cell, rc->inputcollids) + { + ScanKey this_sub_key = &first_sub_key[n_sub_key]; + int flags = SK_ROW_MEMBER; + Datum scanvalue; + Oid inputcollation; + + leftop = (Expr *) lfirst(largs_cell); + rightop = (Expr *) lfirst(rargs_cell); + opno = lfirst_oid(opnos_cell); + inputcollation = lfirst_oid(collids_cell); + + /* + * leftop should be the index key Var, possibly relabeled + */ + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "indexqual doesn't have key on left side"); + + varattno = ((Var *) leftop)->varattno; + + /* + * We have to look up the operator's associated btree support + * function + */ + if (index->rd_rel->relam != BTREE_AM_OID || + varattno < 1 || varattno > indnkeyatts) + elog(ERROR, "bogus RowCompare index qualification"); + opfamily = index->rd_opfamily[varattno - 1]; + + get_op_opfamily_properties(opno, opfamily, isorderby, + &op_strategy, + &op_lefttype, + &op_righttype); + + if (op_strategy != rc->rctype) + elog(ERROR, "RowCompare index qualification contains wrong operator"); + + opfuncid = get_opfamily_proc(opfamily, + op_lefttype, + op_righttype, + BTORDER_PROC); + if (!RegProcedureIsValid(opfuncid)) + elog(ERROR, 
"missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, op_lefttype, op_righttype, opfamily); + + /* + * rightop is the constant or variable comparison value + */ + if (rightop && IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + Assert(rightop != NULL); + + if (IsA(rightop, Const)) + { + /* OK, simple constant comparison value */ + scanvalue = ((Const *) rightop)->constvalue; + if (((Const *) rightop)->constisnull) + flags |= SK_ISNULL; + } + else + { + /* Need to treat this one as a runtime key */ + if (n_runtime_keys >= max_runtime_keys) + { + if (max_runtime_keys == 0) + { + max_runtime_keys = 8; + runtime_keys = (IndexRuntimeKeyInfo *) + palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + else + { + max_runtime_keys *= 2; + runtime_keys = (IndexRuntimeKeyInfo *) + repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + } + runtime_keys[n_runtime_keys].scan_key = this_sub_key; + runtime_keys[n_runtime_keys].key_expr = + ExecInitExpr(rightop, planstate); + runtime_keys[n_runtime_keys].key_toastable = + TypeIsToastable(op_righttype); + n_runtime_keys++; + scanvalue = (Datum) 0; + } + + /* + * initialize the subsidiary scan key's fields appropriately + */ + ScanKeyEntryInitialize(this_sub_key, + flags, + varattno, /* attribute number */ + op_strategy, /* op's strategy */ + op_righttype, /* strategy subtype */ + inputcollation, /* collation */ + opfuncid, /* reg proc to use */ + scanvalue); /* constant */ + n_sub_key++; + } + + /* Mark the last subsidiary scankey correctly */ + first_sub_key[n_sub_key - 1].sk_flags |= SK_ROW_END; + + /* + * We don't use ScanKeyEntryInitialize for the header because it + * isn't going to contain a valid sk_func pointer. + */ + MemSet(this_scan_key, 0, sizeof(ScanKeyData)); + this_scan_key->sk_flags = SK_ROW_HEADER; + this_scan_key->sk_attno = first_sub_key->sk_attno; + this_scan_key->sk_strategy = rc->rctype; + /* sk_subtype, sk_collation, sk_func not used in a header */ + this_scan_key->sk_argument = PointerGetDatum(first_sub_key); + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + /* indexkey op ANY (array-expression) */ + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + int flags = 0; + Datum scanvalue; + + Assert(!isorderby); + + Assert(saop->useOr); + opno = saop->opno; + opfuncid = saop->opfuncid; + + /* + * leftop should be the index key Var, possibly relabeled + */ + leftop = (Expr *) linitial(saop->args); + + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "indexqual doesn't have key on left side"); + + varattno = ((Var *) leftop)->varattno; + if (varattno < 1 || varattno > indnkeyatts) + elog(ERROR, "bogus index qualification"); + + /* + * We have to look up the operator's strategy number. This + * provides a cross-check that the operator does match the index. 
+ */ + opfamily = index->rd_opfamily[varattno - 1]; + + get_op_opfamily_properties(opno, opfamily, isorderby, + &op_strategy, + &op_lefttype, + &op_righttype); + + /* + * rightop is the constant or variable array value + */ + rightop = (Expr *) lsecond(saop->args); + + if (rightop && IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + Assert(rightop != NULL); + + if (index->rd_indam->amsearcharray) + { + /* Index AM will handle this like a simple operator */ + flags |= SK_SEARCHARRAY; + if (IsA(rightop, Const)) + { + /* OK, simple constant comparison value */ + scanvalue = ((Const *) rightop)->constvalue; + if (((Const *) rightop)->constisnull) + flags |= SK_ISNULL; + } + else + { + /* Need to treat this one as a runtime key */ + if (n_runtime_keys >= max_runtime_keys) + { + if (max_runtime_keys == 0) + { + max_runtime_keys = 8; + runtime_keys = (IndexRuntimeKeyInfo *) + palloc(max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + else + { + max_runtime_keys *= 2; + runtime_keys = (IndexRuntimeKeyInfo *) + repalloc(runtime_keys, max_runtime_keys * sizeof(IndexRuntimeKeyInfo)); + } + } + runtime_keys[n_runtime_keys].scan_key = this_scan_key; + runtime_keys[n_runtime_keys].key_expr = + ExecInitExpr(rightop, planstate); + + /* + * Careful here: the runtime expression is not of + * op_righttype, but rather is an array of same; so + * TypeIsToastable() isn't helpful. However, we can + * assume that all array types are toastable. + */ + runtime_keys[n_runtime_keys].key_toastable = true; + n_runtime_keys++; + scanvalue = (Datum) 0; + } + } + else + { + /* Executor has to expand the array value */ + array_keys[n_array_keys].scan_key = this_scan_key; + array_keys[n_array_keys].array_expr = + ExecInitExpr(rightop, planstate); + /* the remaining fields were zeroed by palloc0 */ + n_array_keys++; + scanvalue = (Datum) 0; + } + + /* + * initialize the scan key's fields appropriately + */ + ScanKeyEntryInitialize(this_scan_key, + flags, + varattno, /* attribute number to scan */ + op_strategy, /* op's strategy */ + op_righttype, /* strategy subtype */ + saop->inputcollid, /* collation */ + opfuncid, /* reg proc to use */ + scanvalue); /* constant */ + } + else if (IsA(clause, NullTest)) + { + /* indexkey IS NULL or indexkey IS NOT NULL */ + NullTest *ntest = (NullTest *) clause; + int flags; + + Assert(!isorderby); + + /* + * argument should be the index key Var, possibly relabeled + */ + leftop = ntest->arg; + + if (leftop && IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + Assert(leftop != NULL); + + if (!(IsA(leftop, Var) && + ((Var *) leftop)->varno == INDEX_VAR)) + elog(ERROR, "NullTest indexqual has wrong key"); + + varattno = ((Var *) leftop)->varattno; + + /* + * initialize the scan key's fields appropriately + */ + switch (ntest->nulltesttype) + { + case IS_NULL: + flags = SK_ISNULL | SK_SEARCHNULL; + break; + case IS_NOT_NULL: + flags = SK_ISNULL | SK_SEARCHNOTNULL; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + flags = 0; /* keep compiler quiet */ + break; + } + + ScanKeyEntryInitialize(this_scan_key, + flags, + varattno, /* attribute number to scan */ + InvalidStrategy, /* no strategy */ + InvalidOid, /* no strategy subtype */ + InvalidOid, /* no collation */ + InvalidOid, /* no reg proc for this */ + (Datum) 0); /* constant */ + } + else + elog(ERROR, "unsupported indexqual type: %d", + (int) nodeTag(clause)); + } + + Assert(n_runtime_keys <= max_runtime_keys); + + /* Get rid of any unused 
arrays */ + if (n_array_keys == 0) + { + pfree(array_keys); + array_keys = NULL; + } + + /* + * Return info to our caller. + */ + *scanKeys = scan_keys; + *numScanKeys = n_scan_keys; + *runtimeKeys = runtime_keys; + *numRuntimeKeys = n_runtime_keys; + if (arrayKeys) + { + *arrayKeys = array_keys; + *numArrayKeys = n_array_keys; + } + else if (n_array_keys != 0) + elog(ERROR, "ScalarArrayOpExpr index qual found where not allowed"); +} + +/* ---------------------------------------------------------------- + * Parallel Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecIndexScanEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanEstimate(IndexScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc, + estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecIndexScanInitializeDSM + * + * Set up a parallel index scan descriptor. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanInitializeDSM(IndexScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + ParallelIndexScanDesc piscan; + + piscan = shm_toc_allocate(pcxt->toc, node->iss_PscanLen); + index_parallelscan_initialize(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + piscan); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan); + node->iss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->iss_RelationDesc, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + piscan); + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. + */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(node->iss_ScanDesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); +} + +/* ---------------------------------------------------------------- + * ExecIndexScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanReInitializeDSM(IndexScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->iss_ScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecIndexScanInitializeWorker + * + * Copy relevant information from TOC into planstate. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanInitializeWorker(IndexScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelIndexScanDesc piscan; + + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->iss_ScanDesc = + index_beginscan_parallel(node->ss.ss_currentRelation, + node->iss_RelationDesc, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + piscan); + + /* + * If no run-time keys to calculate or they are ready, go ahead and pass + * the scankeys to the index AM. 
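+ *
+ * (If runtime keys exist but are not yet ready, the worker's first call to
+ * ExecIndexScan notices that and goes through ExecReScanIndexScan, which
+ * evaluates them and then performs the index_rescan.)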
+ */ + if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) + index_rescan(node->iss_ScanDesc, + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); +} diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c new file mode 100644 index 0000000..128eb3e --- /dev/null +++ b/src/backend/executor/nodeLimit.c @@ -0,0 +1,558 @@ +/*------------------------------------------------------------------------- + * + * nodeLimit.c + * Routines to handle limiting of query results where appropriate + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeLimit.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecLimit - extract a limited range of tuples + * ExecInitLimit - initialize node and subnodes.. + * ExecEndLimit - shutdown node and subnodes + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeLimit.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" + +static void recompute_limits(LimitState *node); +static int64 compute_tuples_needed(LimitState *node); + + +/* ---------------------------------------------------------------- + * ExecLimit + * + * This is a very simple node which just performs LIMIT/OFFSET + * filtering on the stream of tuples returned by a subplan. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecLimit(PlanState *pstate) +{ + LimitState *node = castNode(LimitState, pstate); + ExprContext *econtext = node->ps.ps_ExprContext; + ScanDirection direction; + TupleTableSlot *slot; + PlanState *outerPlan; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + direction = node->ps.state->es_direction; + outerPlan = outerPlanState(node); + + /* + * The main logic is a simple state machine. + */ + switch (node->lstate) + { + case LIMIT_INITIAL: + + /* + * First call for this node, so compute limit/offset. (We can't do + * this any earlier, because parameters from upper nodes will not + * be set during ExecInitLimit.) This also sets position = 0 and + * changes the state to LIMIT_RESCAN. + */ + recompute_limits(node); + + /* FALL THRU */ + + case LIMIT_RESCAN: + + /* + * If backwards scan, just return NULL without changing state. + */ + if (!ScanDirectionIsForward(direction)) + return NULL; + + /* + * Check for empty window; if so, treat like empty subplan. + */ + if (node->count <= 0 && !node->noCount) + { + node->lstate = LIMIT_EMPTY; + return NULL; + } + + /* + * Fetch rows from subplan until we reach position > offset. + */ + for (;;) + { + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + /* + * The subplan returns too few tuples for us to produce + * any output at all. + */ + node->lstate = LIMIT_EMPTY; + return NULL; + } + + /* + * Tuple at limit is needed for comparison in subsequent + * execution to detect ties. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES && + node->position - node->offset == node->count - 1) + { + ExecCopySlot(node->last_slot, slot); + } + node->subSlot = slot; + if (++node->position > node->offset) + break; + } + + /* + * Okay, we have the first tuple of the window. 
+ */ + node->lstate = LIMIT_INWINDOW; + break; + + case LIMIT_EMPTY: + + /* + * The subplan is known to return no tuples (or not more than + * OFFSET tuples, in general). So we return no tuples. + */ + return NULL; + + case LIMIT_INWINDOW: + if (ScanDirectionIsForward(direction)) + { + /* + * Forwards scan, so check for stepping off end of window. At + * the end of the window, the behavior depends on whether WITH + * TIES was specified: if so, we need to change the state + * machine to WINDOWEND_TIES, and fall through to the code for + * that case. If not (nothing was specified, or ONLY was) + * return NULL without advancing the subplan or the position + * variable, but change the state machine to record having + * done so. + * + * Once at the end, ideally, we would shut down parallel + * resources; but that would destroy the parallel context + * which might be required for rescans. To do that, we'll + * need to find a way to pass down more information about + * whether rescans are possible. + */ + if (!node->noCount && + node->position - node->offset >= node->count) + { + if (node->limitOption == LIMIT_OPTION_COUNT) + { + node->lstate = LIMIT_WINDOWEND; + return NULL; + } + else + { + node->lstate = LIMIT_WINDOWEND_TIES; + /* we'll fall through to the next case */ + } + } + else + { + /* + * Get next tuple from subplan, if any. + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + node->lstate = LIMIT_SUBPLANEOF; + return NULL; + } + + /* + * If WITH TIES is active, and this is the last in-window + * tuple, save it to be used in subsequent WINDOWEND_TIES + * processing. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES && + node->position - node->offset == node->count - 1) + { + ExecCopySlot(node->last_slot, slot); + } + node->subSlot = slot; + node->position++; + break; + } + } + else + { + /* + * Backwards scan, so check for stepping off start of window. + * As above, only change state-machine status if so. + */ + if (node->position <= node->offset + 1) + { + node->lstate = LIMIT_WINDOWSTART; + return NULL; + } + + /* + * Get previous tuple from subplan; there should be one! + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->position--; + break; + } + + Assert(node->lstate == LIMIT_WINDOWEND_TIES); + /* FALL THRU */ + + case LIMIT_WINDOWEND_TIES: + if (ScanDirectionIsForward(direction)) + { + /* + * Advance the subplan until we find the first row with + * different ORDER BY pathkeys. + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + node->lstate = LIMIT_SUBPLANEOF; + return NULL; + } + + /* + * Test if the new tuple and the last tuple match. If so we + * return the tuple. + */ + econtext->ecxt_innertuple = slot; + econtext->ecxt_outertuple = node->last_slot; + if (ExecQualAndReset(node->eqfunction, econtext)) + { + node->subSlot = slot; + node->position++; + } + else + { + node->lstate = LIMIT_WINDOWEND; + return NULL; + } + } + else + { + /* + * Backwards scan, so check for stepping off start of window. + * Change only state-machine status if so. + */ + if (node->position <= node->offset + 1) + { + node->lstate = LIMIT_WINDOWSTART; + return NULL; + } + + /* + * Get previous tuple from subplan; there should be one! And + * change state-machine status. 
+ */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->position--; + node->lstate = LIMIT_INWINDOW; + } + break; + + case LIMIT_SUBPLANEOF: + if (ScanDirectionIsForward(direction)) + return NULL; + + /* + * Backing up from subplan EOF, so re-fetch previous tuple; there + * should be one! Note previous tuple must be in window. + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->lstate = LIMIT_INWINDOW; + /* position does not change 'cause we didn't advance it before */ + break; + + case LIMIT_WINDOWEND: + if (ScanDirectionIsForward(direction)) + return NULL; + + /* + * We already past one position to detect ties so re-fetch + * previous tuple; there should be one! Note previous tuple must + * be in window. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES) + { + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + elog(ERROR, "LIMIT subplan failed to run backwards"); + node->subSlot = slot; + node->lstate = LIMIT_INWINDOW; + } + else + { + /* + * Backing up from window end: simply re-return the last tuple + * fetched from the subplan. + */ + slot = node->subSlot; + node->lstate = LIMIT_INWINDOW; + /* position does not change 'cause we didn't advance it before */ + } + break; + + case LIMIT_WINDOWSTART: + if (!ScanDirectionIsForward(direction)) + return NULL; + + /* + * Advancing after having backed off window start: simply + * re-return the last tuple fetched from the subplan. + */ + slot = node->subSlot; + node->lstate = LIMIT_INWINDOW; + /* position does not change 'cause we didn't change it before */ + break; + + default: + elog(ERROR, "impossible LIMIT state: %d", + (int) node->lstate); + slot = NULL; /* keep compiler quiet */ + break; + } + + /* Return the current tuple */ + Assert(!TupIsNull(slot)); + + return slot; +} + +/* + * Evaluate the limit/offset expressions --- done at startup or rescan. + * + * This is also a handy place to reset the current-position state info. + */ +static void +recompute_limits(LimitState *node) +{ + ExprContext *econtext = node->ps.ps_ExprContext; + Datum val; + bool isNull; + + if (node->limitOffset) + { + val = ExecEvalExprSwitchContext(node->limitOffset, + econtext, + &isNull); + /* Interpret NULL offset as no offset */ + if (isNull) + node->offset = 0; + else + { + node->offset = DatumGetInt64(val); + if (node->offset < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE), + errmsg("OFFSET must not be negative"))); + } + } + else + { + /* No OFFSET supplied */ + node->offset = 0; + } + + if (node->limitCount) + { + val = ExecEvalExprSwitchContext(node->limitCount, + econtext, + &isNull); + /* Interpret NULL count as no count (LIMIT ALL) */ + if (isNull) + { + node->count = 0; + node->noCount = true; + } + else + { + node->count = DatumGetInt64(val); + if (node->count < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_ROW_COUNT_IN_LIMIT_CLAUSE), + errmsg("LIMIT must not be negative"))); + node->noCount = false; + } + } + else + { + /* No COUNT supplied */ + node->count = 0; + node->noCount = true; + } + + /* Reset position to start-of-scan */ + node->position = 0; + node->subSlot = NULL; + + /* Set state-machine state */ + node->lstate = LIMIT_RESCAN; + + /* + * Notify child node about limit. Note: think not to "optimize" by + * skipping ExecSetTupleBound if compute_tuples_needed returns < 0. 
We + * must update the child node anyway, in case this is a rescan and the + * previous time we got a different result. + */ + ExecSetTupleBound(compute_tuples_needed(node), outerPlanState(node)); +} + +/* + * Compute the maximum number of tuples needed to satisfy this Limit node. + * Return a negative value if there is not a determinable limit. + */ +static int64 +compute_tuples_needed(LimitState *node) +{ + if ((node->noCount) || (node->limitOption == LIMIT_OPTION_WITH_TIES)) + return -1; + /* Note: if this overflows, we'll return a negative value, which is OK */ + return node->count + node->offset; +} + +/* ---------------------------------------------------------------- + * ExecInitLimit + * + * This initializes the limit node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +LimitState * +ExecInitLimit(Limit *node, EState *estate, int eflags) +{ + LimitState *limitstate; + Plan *outerPlan; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * create state structure + */ + limitstate = makeNode(LimitState); + limitstate->ps.plan = (Plan *) node; + limitstate->ps.state = estate; + limitstate->ps.ExecProcNode = ExecLimit; + + limitstate->lstate = LIMIT_INITIAL; + + /* + * Miscellaneous initialization + * + * Limit nodes never call ExecQual or ExecProject, but they need an + * exprcontext anyway to evaluate the limit/offset parameters in. + */ + ExecAssignExprContext(estate, &limitstate->ps); + + /* + * initialize outer plan + */ + outerPlan = outerPlan(node); + outerPlanState(limitstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * initialize child expressions + */ + limitstate->limitOffset = ExecInitExpr((Expr *) node->limitOffset, + (PlanState *) limitstate); + limitstate->limitCount = ExecInitExpr((Expr *) node->limitCount, + (PlanState *) limitstate); + limitstate->limitOption = node->limitOption; + + /* + * Initialize result type. + */ + ExecInitResultTypeTL(&limitstate->ps); + + limitstate->ps.resultopsset = true; + limitstate->ps.resultops = ExecGetResultSlotOps(outerPlanState(limitstate), + &limitstate->ps.resultopsfixed); + + /* + * limit nodes do no projections, so initialize projection info for this + * node appropriately + */ + limitstate->ps.ps_ProjInfo = NULL; + + /* + * Initialize the equality evaluation, to detect ties. + */ + if (node->limitOption == LIMIT_OPTION_WITH_TIES) + { + TupleDesc desc; + const TupleTableSlotOps *ops; + + desc = ExecGetResultType(outerPlanState(limitstate)); + ops = ExecGetResultSlotOps(outerPlanState(limitstate), NULL); + + limitstate->last_slot = ExecInitExtraTupleSlot(estate, desc, ops); + limitstate->eqfunction = execTuplesMatchPrepare(desc, + node->uniqNumCols, + node->uniqColIdx, + node->uniqOperators, + node->uniqCollations, + &limitstate->ps); + } + + return limitstate; +} + +/* ---------------------------------------------------------------- + * ExecEndLimit + * + * This shuts down the subplan and frees resources allocated + * to this node. + * ---------------------------------------------------------------- + */ +void +ExecEndLimit(LimitState *node) +{ + ExecFreeExprContext(&node->ps); + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanLimit(LimitState *node) +{ + /* + * Recompute limit/offset in case parameters changed, and reset the state + * machine. We must do this before rescanning our child node, in case + * it's a Sort that we are passing the parameters down to. 
+ */ + recompute_limits(node); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c new file mode 100644 index 0000000..7583973 --- /dev/null +++ b/src/backend/executor/nodeLockRows.c @@ -0,0 +1,403 @@ +/*------------------------------------------------------------------------- + * + * nodeLockRows.c + * Routines to handle FOR UPDATE/FOR SHARE row locking + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeLockRows.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecLockRows - fetch locked rows + * ExecInitLockRows - initialize node and subnodes.. + * ExecEndLockRows - shutdown node and subnodes + */ + +#include "postgres.h" + +#include "access/tableam.h" +#include "access/xact.h" +#include "executor/executor.h" +#include "executor/nodeLockRows.h" +#include "foreign/fdwapi.h" +#include "miscadmin.h" +#include "utils/rel.h" + + +/* ---------------------------------------------------------------- + * ExecLockRows + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecLockRows(PlanState *pstate) +{ + LockRowsState *node = castNode(LockRowsState, pstate); + TupleTableSlot *slot; + EState *estate; + PlanState *outerPlan; + bool epq_needed; + ListCell *lc; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + estate = node->ps.state; + outerPlan = outerPlanState(node); + + /* + * Get next tuple from subplan, if any. + */ +lnext: + slot = ExecProcNode(outerPlan); + + if (TupIsNull(slot)) + { + /* Release any resources held by EPQ mechanism before exiting */ + EvalPlanQualEnd(&node->lr_epqstate); + return NULL; + } + + /* We don't need EvalPlanQual unless we get updated tuple version(s) */ + epq_needed = false; + + /* + * Attempt to lock the source tuple(s). (Note we only have locking + * rowmarks in lr_arowMarks.) + */ + foreach(lc, node->lr_arowMarks) + { + ExecAuxRowMark *aerm = (ExecAuxRowMark *) lfirst(lc); + ExecRowMark *erm = aerm->rowmark; + Datum datum; + bool isNull; + ItemPointerData tid; + TM_FailureData tmfd; + LockTupleMode lockmode; + int lockflags = 0; + TM_Result test; + TupleTableSlot *markSlot; + + /* clear any leftover test tuple for this rel */ + markSlot = EvalPlanQualSlot(&node->lr_epqstate, erm->relation, erm->rti); + ExecClearTuple(markSlot); + + /* if child rel, must check whether it produced this row */ + if (erm->rti != erm->prti) + { + Oid tableoid; + + datum = ExecGetJunkAttribute(slot, + aerm->toidAttNo, + &isNull); + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "tableoid is NULL"); + tableoid = DatumGetObjectId(datum); + + Assert(OidIsValid(erm->relid)); + if (tableoid != erm->relid) + { + /* this child is inactive right now */ + erm->ermActive = false; + ItemPointerSetInvalid(&(erm->curCtid)); + ExecClearTuple(markSlot); + continue; + } + } + erm->ermActive = true; + + /* fetch the tuple's ctid */ + datum = ExecGetJunkAttribute(slot, + aerm->ctidAttNo, + &isNull); + /* shouldn't ever get a null result... 
*/ + if (isNull) + elog(ERROR, "ctid is NULL"); + + /* requests for foreign tables must be passed to their FDW */ + if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + { + FdwRoutine *fdwroutine; + bool updated = false; + + fdwroutine = GetFdwRoutineForRelation(erm->relation, false); + /* this should have been checked already, but let's be safe */ + if (fdwroutine->RefetchForeignRow == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot lock rows in foreign table \"%s\"", + RelationGetRelationName(erm->relation)))); + + fdwroutine->RefetchForeignRow(estate, + erm, + datum, + markSlot, + &updated); + if (TupIsNull(markSlot)) + { + /* couldn't get the lock, so skip this row */ + goto lnext; + } + + /* + * if FDW says tuple was updated before getting locked, we need to + * perform EPQ testing to see if quals are still satisfied + */ + if (updated) + epq_needed = true; + + continue; + } + + /* okay, try to lock (and fetch) the tuple */ + tid = *((ItemPointer) DatumGetPointer(datum)); + switch (erm->markType) + { + case ROW_MARK_EXCLUSIVE: + lockmode = LockTupleExclusive; + break; + case ROW_MARK_NOKEYEXCLUSIVE: + lockmode = LockTupleNoKeyExclusive; + break; + case ROW_MARK_SHARE: + lockmode = LockTupleShare; + break; + case ROW_MARK_KEYSHARE: + lockmode = LockTupleKeyShare; + break; + default: + elog(ERROR, "unsupported rowmark type"); + lockmode = LockTupleNoKeyExclusive; /* keep compiler quiet */ + break; + } + + lockflags = TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS; + if (!IsolationUsesXactSnapshot()) + lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; + + test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot, + markSlot, estate->es_output_cid, + lockmode, erm->waitPolicy, + lockflags, + &tmfd); + + switch (test) + { + case TM_WouldBlock: + /* couldn't lock tuple in SKIP LOCKED mode */ + goto lnext; + + case TM_SelfModified: + + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. We *must* ignore the tuple in the former + * case, so as to avoid the "Halloween problem" of repeated + * update attempts. In the latter case it might be sensible + * to fetch the updated tuple instead, but doing so would + * require changing heap_update and heap_delete to not + * complain about updating "invisible" tuples, which seems + * pretty scary (table_tuple_lock will not complain, but few + * callers expect TM_Invisible, and we're not one of them). So + * for now, treat the tuple as deleted and do not process. + */ + goto lnext; + + case TM_Ok: + + /* + * Got the lock successfully, the locked tuple saved in + * markSlot for, if needed, EvalPlanQual testing below. 
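Each lock attempt above ends one of three ways: the row is kept (TM_Ok, possibly after locking a newer version, which flags an EvalPlanQual recheck), the row is silently skipped (TM_WouldBlock under SKIP LOCKED, TM_SelfModified, TM_Deleted), or an error is raised. A standalone sketch of that skip-or-recheck control flow, simplified to one lock per output row; the LockOutcome enum and try_lock_row are stand-ins for TM_Result and table_tuple_lock, not real API:

#include <stdbool.h>
#include <stdio.h>

typedef enum { LOCK_OK, LOCK_OK_NEWER_VERSION, LOCK_SKIP } LockOutcome;

/* Stand-in for table_tuple_lock(); classify the outcome for row "id". */
static LockOutcome
try_lock_row(int id)
{
    if (id % 3 == 0)
        return LOCK_SKIP;               /* e.g. deleted, or SKIP LOCKED conflict */
    if (id % 5 == 0)
        return LOCK_OK_NEWER_VERSION;   /* locked an updated version: recheck needed */
    return LOCK_OK;
}

int
main(void)
{
    for (int id = 1; id <= 10; id++)
    {
        bool epq_needed = false;

        switch (try_lock_row(id))
        {
            case LOCK_SKIP:
                continue;               /* like "goto lnext": fetch the next row */
            case LOCK_OK_NEWER_VERSION:
                epq_needed = true;
                /* fall through */
            case LOCK_OK:
                break;
        }

        /* the real node would re-evaluate quals via EvalPlanQual when flagged */
        printf("returning row %d%s\n", id, epq_needed ? " (after recheck)" : "");
    }
    return 0;
}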
+ */ + if (tmfd.traversed) + epq_needed = true; + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + elog(ERROR, "unexpected table_tuple_lock status: %u", + test); + break; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + /* tuple was deleted so don't return it */ + goto lnext; + + case TM_Invisible: + elog(ERROR, "attempted to lock invisible tuple"); + break; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", + test); + } + + /* Remember locked tuple's TID for EPQ testing and WHERE CURRENT OF */ + erm->curCtid = tid; + } + + /* + * If we need to do EvalPlanQual testing, do so. + */ + if (epq_needed) + { + /* Initialize EPQ machinery */ + EvalPlanQualBegin(&node->lr_epqstate); + + /* + * To fetch non-locked source rows the EPQ logic needs to access junk + * columns from the tuple being tested. + */ + EvalPlanQualSetSlot(&node->lr_epqstate, slot); + + /* + * And finally we can re-evaluate the tuple. + */ + slot = EvalPlanQualNext(&node->lr_epqstate); + if (TupIsNull(slot)) + { + /* Updated tuple fails qual, so ignore it and go on */ + goto lnext; + } + } + + /* Got all locks, so return the current tuple */ + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitLockRows + * + * This initializes the LockRows node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +LockRowsState * +ExecInitLockRows(LockRows *node, EState *estate, int eflags) +{ + LockRowsState *lrstate; + Plan *outerPlan = outerPlan(node); + List *epq_arowmarks; + ListCell *lc; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * create state structure + */ + lrstate = makeNode(LockRowsState); + lrstate->ps.plan = (Plan *) node; + lrstate->ps.state = estate; + lrstate->ps.ExecProcNode = ExecLockRows; + + /* + * Miscellaneous initialization + * + * LockRows nodes never call ExecQual or ExecProject, therefore no + * ExprContext is needed. + */ + + /* + * Initialize result type. + */ + ExecInitResultTypeTL(&lrstate->ps); + + /* + * then initialize outer plan + */ + outerPlanState(lrstate) = ExecInitNode(outerPlan, estate, eflags); + + /* node returns unmodified slots from the outer plan */ + lrstate->ps.resultopsset = true; + lrstate->ps.resultops = ExecGetResultSlotOps(outerPlanState(lrstate), + &lrstate->ps.resultopsfixed); + + /* + * LockRows nodes do no projections, so initialize projection info for + * this node appropriately + */ + lrstate->ps.ps_ProjInfo = NULL; + + /* + * Locate the ExecRowMark(s) that this node is responsible for, and + * construct ExecAuxRowMarks for them. (InitPlan should already have + * built the global list of ExecRowMarks.) + */ + lrstate->lr_arowMarks = NIL; + epq_arowmarks = NIL; + foreach(lc, node->rowMarks) + { + PlanRowMark *rc = lfirst_node(PlanRowMark, lc); + ExecRowMark *erm; + ExecAuxRowMark *aerm; + + /* ignore "parent" rowmarks; they are irrelevant at runtime */ + if (rc->isParent) + continue; + + /* find ExecRowMark and build ExecAuxRowMark */ + erm = ExecFindRowMark(estate, rc->rti, false); + aerm = ExecBuildAuxRowMark(erm, outerPlan->targetlist); + + /* + * Only locking rowmarks go into our own list. 
Non-locking marks are + * passed off to the EvalPlanQual machinery. This is because we don't + * want to bother fetching non-locked rows unless we actually have to + * do an EPQ recheck. + */ + if (RowMarkRequiresRowShareLock(erm->markType)) + lrstate->lr_arowMarks = lappend(lrstate->lr_arowMarks, aerm); + else + epq_arowmarks = lappend(epq_arowmarks, aerm); + } + + /* Now we have the info needed to set up EPQ state */ + EvalPlanQualInit(&lrstate->lr_epqstate, estate, + outerPlan, epq_arowmarks, node->epqParam); + + return lrstate; +} + +/* ---------------------------------------------------------------- + * ExecEndLockRows + * + * This shuts down the subplan and frees resources allocated + * to this node. + * ---------------------------------------------------------------- + */ +void +ExecEndLockRows(LockRowsState *node) +{ + /* We may have shut down EPQ already, but no harm in another call */ + EvalPlanQualEnd(&node->lr_epqstate); + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanLockRows(LockRowsState *node) +{ + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c new file mode 100644 index 0000000..7c53f8e --- /dev/null +++ b/src/backend/executor/nodeMaterial.c @@ -0,0 +1,368 @@ +/*------------------------------------------------------------------------- + * + * nodeMaterial.c + * Routines to handle materialization nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMaterial.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecMaterial - materialize the result of a subplan + * ExecInitMaterial - initialize node and subnodes + * ExecEndMaterial - shutdown node and subnodes + * + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeMaterial.h" +#include "miscadmin.h" + +/* ---------------------------------------------------------------- + * ExecMaterial + * + * As long as we are at the end of the data collected in the tuplestore, + * we collect one new row from the subplan on each call, and stash it + * aside in the tuplestore before returning it. The tuplestore is + * only read if we are asked to scan backwards, rescan, or mark/restore. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* result tuple from subplan */ +ExecMaterial(PlanState *pstate) +{ + MaterialState *node = castNode(MaterialState, pstate); + EState *estate; + ScanDirection dir; + bool forward; + Tuplestorestate *tuplestorestate; + bool eof_tuplestore; + TupleTableSlot *slot; + + CHECK_FOR_INTERRUPTS(); + + /* + * get state info from node + */ + estate = node->ss.ps.state; + dir = estate->es_direction; + forward = ScanDirectionIsForward(dir); + tuplestorestate = node->tuplestorestate; + + /* + * If first time through, and we need a tuplestore, initialize it. + */ + if (tuplestorestate == NULL && node->eflags != 0) + { + tuplestorestate = tuplestore_begin_heap(true, false, work_mem); + tuplestore_set_eflags(tuplestorestate, node->eflags); + if (node->eflags & EXEC_FLAG_MARK) + { + /* + * Allocate a second read pointer to serve as the mark. 
We know it + * must have index 1, so needn't store that. + */ + int ptrno PG_USED_FOR_ASSERTS_ONLY; + + ptrno = tuplestore_alloc_read_pointer(tuplestorestate, + node->eflags); + Assert(ptrno == 1); + } + node->tuplestorestate = tuplestorestate; + } + + /* + * If we are not at the end of the tuplestore, or are going backwards, try + * to fetch a tuple from tuplestore. + */ + eof_tuplestore = (tuplestorestate == NULL) || + tuplestore_ateof(tuplestorestate); + + if (!forward && eof_tuplestore) + { + if (!node->eof_underlying) + { + /* + * When reversing direction at tuplestore EOF, the first + * gettupleslot call will fetch the last-added tuple; but we want + * to return the one before that, if possible. So do an extra + * fetch. + */ + if (!tuplestore_advance(tuplestorestate, forward)) + return NULL; /* the tuplestore must be empty */ + } + eof_tuplestore = false; + } + + /* + * If we can fetch another tuple from the tuplestore, return it. + */ + slot = node->ss.ps.ps_ResultTupleSlot; + if (!eof_tuplestore) + { + if (tuplestore_gettupleslot(tuplestorestate, forward, false, slot)) + return slot; + if (forward) + eof_tuplestore = true; + } + + /* + * If necessary, try to fetch another row from the subplan. + * + * Note: the eof_underlying state variable exists to short-circuit further + * subplan calls. It's not optional, unfortunately, because some plan + * node types are not robust about being called again when they've already + * returned NULL. + */ + if (eof_tuplestore && !node->eof_underlying) + { + PlanState *outerNode; + TupleTableSlot *outerslot; + + /* + * We can only get here with forward==true, so no need to worry about + * which direction the subplan will go. + */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + node->eof_underlying = true; + return NULL; + } + + /* + * Append a copy of the returned tuple to tuplestore. NOTE: because + * the tuplestore is certainly in EOF state, its read position will + * move forward over the added tuple. This is what we want. + */ + if (tuplestorestate) + tuplestore_puttupleslot(tuplestorestate, outerslot); + + ExecCopySlot(slot, outerslot); + return slot; + } + + /* + * Nothing left ... + */ + return ExecClearTuple(slot); +} + +/* ---------------------------------------------------------------- + * ExecInitMaterial + * ---------------------------------------------------------------- + */ +MaterialState * +ExecInitMaterial(Material *node, EState *estate, int eflags) +{ + MaterialState *matstate; + Plan *outerPlan; + + /* + * create state structure + */ + matstate = makeNode(MaterialState); + matstate->ss.ps.plan = (Plan *) node; + matstate->ss.ps.state = estate; + matstate->ss.ps.ExecProcNode = ExecMaterial; + + /* + * We must have a tuplestore buffering the subplan output to do backward + * scan or mark/restore. We also prefer to materialize the subplan output + * if we might be called on to rewind and replay it many times. However, + * if none of these cases apply, we can skip storing the data. + */ + matstate->eflags = (eflags & (EXEC_FLAG_REWIND | + EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)); + + /* + * Tuplestore's interpretation of the flag bits is subtly different from + * the general executor meaning: it doesn't think BACKWARD necessarily + * means "backwards all the way to start". If told to support BACKWARD we + * must include REWIND in the tuplestore eflags, else tuplestore_trim + * might throw away too much. 
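ExecMaterial above stashes each subplan row in the tuplestore the first time it is read, so that backward scans, rescans, and mark/restore can later be served from the store instead of the subplan; the mark is simply a second read pointer. A standalone sketch of that idea, with a plain array standing in for the tuplestore and saved indexes standing in for read pointers:

#include <stdio.h>

#define NVALS 5

typedef struct Store
{
    int  vals[NVALS];     /* stand-in for the tuplestore contents */
    int  nstored;
    int  readpos;         /* active read pointer */
    int  markpos;         /* second read pointer used as the mark */
} Store;

static int
fetch(Store *st, const int *source)
{
    if (st->readpos == st->nstored)
    {
        if (st->nstored == NVALS)
            return -1;                                   /* subplan exhausted */
        st->vals[st->nstored++] = source[st->readpos];   /* stash as we go */
    }
    return st->vals[st->readpos++];
}

int
main(void)
{
    int   source[NVALS] = {1, 2, 3, 4, 5};
    Store st = {0};

    fetch(&st, source);                 /* 1 */
    fetch(&st, source);                 /* 2 */
    st.markpos = st.readpos;            /* mark, like copying read pointer 0 to 1 */
    fetch(&st, source);                 /* 3 */
    st.readpos = st.markpos;            /* restore */
    printf("%d\n", fetch(&st, source)); /* 3 again, replayed from the store */
    return 0;
}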
+ */ + if (eflags & EXEC_FLAG_BACKWARD) + matstate->eflags |= EXEC_FLAG_REWIND; + + matstate->eof_underlying = false; + matstate->tuplestorestate = NULL; + + /* + * Miscellaneous initialization + * + * Materialization nodes don't need ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * initialize child nodes + * + * We shield the child node from the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlan = outerPlan(node); + outerPlanState(matstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * Initialize result type and slot. No need to initialize projection info + * because this node doesn't do projections. + * + * material nodes only return tuples from their materialized relation. + */ + ExecInitResultTupleSlotTL(&matstate->ss.ps, &TTSOpsMinimalTuple); + matstate->ss.ps.ps_ProjInfo = NULL; + + /* + * initialize tuple type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &matstate->ss, &TTSOpsMinimalTuple); + + return matstate; +} + +/* ---------------------------------------------------------------- + * ExecEndMaterial + * ---------------------------------------------------------------- + */ +void +ExecEndMaterial(MaterialState *node) +{ + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * Release tuplestore resources + */ + if (node->tuplestorestate != NULL) + tuplestore_end(node->tuplestorestate); + node->tuplestorestate = NULL; + + /* + * shut down the subplan + */ + ExecEndNode(outerPlanState(node)); +} + +/* ---------------------------------------------------------------- + * ExecMaterialMarkPos + * + * Calls tuplestore to save the current position in the stored file. + * ---------------------------------------------------------------- + */ +void +ExecMaterialMarkPos(MaterialState *node) +{ + Assert(node->eflags & EXEC_FLAG_MARK); + + /* + * if we haven't materialized yet, just return. + */ + if (!node->tuplestorestate) + return; + + /* + * copy the active read pointer to the mark. + */ + tuplestore_copy_read_pointer(node->tuplestorestate, 0, 1); + + /* + * since we may have advanced the mark, try to truncate the tuplestore. + */ + tuplestore_trim(node->tuplestorestate); +} + +/* ---------------------------------------------------------------- + * ExecMaterialRestrPos + * + * Calls tuplestore to restore the last saved file position. + * ---------------------------------------------------------------- + */ +void +ExecMaterialRestrPos(MaterialState *node) +{ + Assert(node->eflags & EXEC_FLAG_MARK); + + /* + * if we haven't materialized yet, just return. + */ + if (!node->tuplestorestate) + return; + + /* + * copy the mark to the active read pointer. + */ + tuplestore_copy_read_pointer(node->tuplestorestate, 1, 0); +} + +/* ---------------------------------------------------------------- + * ExecReScanMaterial + * + * Rescans the materialized relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanMaterial(MaterialState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->eflags != 0) + { + /* + * If we haven't materialized yet, just return. If outerplan's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. 
+ */ + if (!node->tuplestorestate) + return; + + /* + * If subnode is to be rescanned then we forget previous stored + * results; we have to re-read the subplan and re-store. Also, if we + * told tuplestore it needn't support rescan, we lose and must + * re-read. (This last should not happen in common cases; else our + * caller lied by not passing EXEC_FLAG_REWIND to us.) + * + * Otherwise we can just rewind and rescan the stored output. The + * state of the subnode does not change. + */ + if (outerPlan->chgParam != NULL || + (node->eflags & EXEC_FLAG_REWIND) == 0) + { + tuplestore_end(node->tuplestorestate); + node->tuplestorestate = NULL; + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + node->eof_underlying = false; + } + else + tuplestore_rescan(node->tuplestorestate); + } + else + { + /* In this case we are just passing on the subquery's output */ + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + node->eof_underlying = false; + } +} diff --git a/src/backend/executor/nodeMemoize.c b/src/backend/executor/nodeMemoize.c new file mode 100644 index 0000000..f82f41f --- /dev/null +++ b/src/backend/executor/nodeMemoize.c @@ -0,0 +1,1225 @@ +/*------------------------------------------------------------------------- + * + * nodeMemoize.c + * Routines to handle caching of results from parameterized nodes + * + * Portions Copyright (c) 2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMemoize.c + * + * Memoize nodes are intended to sit above parameterized nodes in the plan + * tree in order to cache results from them. The intention here is that a + * repeat scan with a parameter value that has already been seen by the node + * can fetch tuples from the cache rather than having to re-scan the outer + * node all over again. The query planner may choose to make use of one of + * these when it thinks rescans for previously seen values are likely enough + * to warrant adding the additional node. + * + * The method of cache we use is a hash table. When the cache fills, we never + * spill tuples to disk, instead, we choose to evict the least recently used + * cache entry from the cache. We remember the least recently used entry by + * always pushing new entries and entries we look for onto the tail of a + * doubly linked list. This means that older items always bubble to the top + * of this LRU list. + * + * Sometimes our callers won't run their scans to completion. For example a + * semi-join only needs to run until it finds a matching tuple, and once it + * does, the join operator skips to the next outer tuple and does not execute + * the inner side again on that scan. Because of this, we must keep track of + * when a cache entry is complete, and by default, we know it is when we run + * out of tuples to read during the scan. However, there are cases where we + * can mark the cache entry as complete without exhausting the scan of all + * tuples. One case is unique joins, where the join operator knows that there + * will only be at most one match for any given outer tuple. In order to + * support such cases we allow the "singlerow" option to be set for the cache. + * This option marks the cache entry as complete after we read the first tuple + * from the subnode. 
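The LRU policy described above can be pictured with a tiny standalone sketch: recently used keys are pushed onto the tail, and when space runs out the head, the least recently used key, is evicted. The real node orders entries with an intrusive dlist and evicts on a memory budget (mem_used versus mem_limit) rather than an entry count; the array below is only an illustration:

#include <stdio.h>

#define CAP 3                   /* toy capacity standing in for the memory limit */

static int lru[CAP];
static int nused = 0;

static void
touch(int key)
{
    /* remove the key if it is already present */
    for (int i = 0; i < nused; i++)
    {
        if (lru[i] == key)
        {
            for (int j = i; j < nused - 1; j++)
                lru[j] = lru[j + 1];
            nused--;
            break;
        }
    }

    /* evict the least recently used entry (the head) if we're full */
    if (nused == CAP)
    {
        printf("evict %d\n", lru[0]);
        for (int j = 0; j < nused - 1; j++)
            lru[j] = lru[j + 1];
        nused--;
    }

    lru[nused++] = key;         /* push onto the tail: now the most recently used */
}

int
main(void)
{
    touch(1);
    touch(2);
    touch(3);
    touch(1);                   /* 1 becomes most recently used */
    touch(4);                   /* cache full: evicts 2, the least recently used */
    return 0;
}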
+ * + * It's possible when we're filling the cache for a given set of parameters + * that we're unable to free enough memory to store any more tuples. If this + * happens then we'll have already evicted all other cache entries. When + * caching another tuple would cause us to exceed our memory budget, we must + * free the entry that we're currently populating and move the state machine + * into MEMO_CACHE_BYPASS_MODE. This means that we'll not attempt to cache + * any further tuples for this particular scan. We don't have the memory for + * it. The state machine will be reset again on the next rescan. If the + * memory requirements to cache the next parameter's tuples are less + * demanding, then that may allow us to start putting useful entries back into + * the cache again. + * + * + * INTERFACE ROUTINES + * ExecMemoize - lookup cache, exec subplan when not found + * ExecInitMemoize - initialize node and subnodes + * ExecEndMemoize - shutdown node and subnodes + * ExecReScanMemoize - rescan the memoize node + * + * ExecMemoizeEstimate estimates DSM space needed for parallel plan + * ExecMemoizeInitializeDSM initialize DSM for parallel plan + * ExecMemoizeInitializeWorker attach to DSM info in parallel worker + * ExecMemoizeRetrieveInstrumentation get instrumentation from worker + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/hashfn.h" +#include "executor/executor.h" +#include "executor/nodeMemoize.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" + +/* States of the ExecMemoize state machine */ +#define MEMO_CACHE_LOOKUP 1 /* Attempt to perform a cache lookup */ +#define MEMO_CACHE_FETCH_NEXT_TUPLE 2 /* Get another tuple from the cache */ +#define MEMO_FILLING_CACHE 3 /* Read outer node to fill cache */ +#define MEMO_CACHE_BYPASS_MODE 4 /* Bypass mode. Just read from our + * subplan without caching anything */ +#define MEMO_END_OF_SCAN 5 /* Ready for rescan */ + + +/* Helper macros for memory accounting */ +#define EMPTY_ENTRY_MEMORY_BYTES(e) (sizeof(MemoizeEntry) + \ + sizeof(MemoizeKey) + \ + (e)->key->params->t_len); +#define CACHE_TUPLE_BYTES(t) (sizeof(MemoizeTuple) + \ + (t)->mintuple->t_len) + + /* MemoizeTuple Stores an individually cached tuple */ +typedef struct MemoizeTuple +{ + MinimalTuple mintuple; /* Cached tuple */ + struct MemoizeTuple *next; /* The next tuple with the same parameter + * values or NULL if it's the last one */ +} MemoizeTuple; + +/* + * MemoizeKey + * The hash table key for cached entries plus the LRU list link + */ +typedef struct MemoizeKey +{ + MinimalTuple params; + dlist_node lru_node; /* Pointer to next/prev key in LRU list */ +} MemoizeKey; + +/* + * MemoizeEntry + * The data struct that the cache hash table stores + */ +typedef struct MemoizeEntry +{ + MemoizeKey *key; /* Hash key for hash table lookups */ + MemoizeTuple *tuplehead; /* Pointer to the first tuple or NULL if + * no tuples are cached for this entry */ + uint32 hash; /* Hash value (cached) */ + char status; /* Hash status */ + bool complete; /* Did we read the outer plan to completion? 
*/ +} MemoizeEntry; + + +#define SH_PREFIX memoize +#define SH_ELEMENT_TYPE MemoizeEntry +#define SH_KEY_TYPE MemoizeKey * +#define SH_SCOPE static inline +#define SH_DECLARE +#include "lib/simplehash.h" + +static uint32 MemoizeHash_hash(struct memoize_hash *tb, + const MemoizeKey *key); +static bool MemoizeHash_equal(struct memoize_hash *tb, + const MemoizeKey *params1, + const MemoizeKey *params2); + +#define SH_PREFIX memoize +#define SH_ELEMENT_TYPE MemoizeEntry +#define SH_KEY_TYPE MemoizeKey * +#define SH_KEY key +#define SH_HASH_KEY(tb, key) MemoizeHash_hash(tb, key) +#define SH_EQUAL(tb, a, b) MemoizeHash_equal(tb, a, b) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + +/* + * MemoizeHash_hash + * Hash function for simplehash hashtable. 'key' is unused here as we + * require that all table lookups first populate the MemoizeState's + * probeslot with the key values to be looked up. + */ +static uint32 +MemoizeHash_hash(struct memoize_hash *tb, const MemoizeKey *key) +{ + MemoizeState *mstate = (MemoizeState *) tb->private_data; + TupleTableSlot *pslot = mstate->probeslot; + uint32 hashkey = 0; + int numkeys = mstate->nkeys; + + if (mstate->binary_mode) + { + for (int i = 0; i < numkeys; i++) + { + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + if (!pslot->tts_isnull[i]) /* treat nulls as having hash key 0 */ + { + FormData_pg_attribute *attr; + uint32 hkey; + + attr = &pslot->tts_tupleDescriptor->attrs[i]; + + hkey = datum_image_hash(pslot->tts_values[i], attr->attbyval, attr->attlen); + + hashkey ^= hkey; + } + } + } + else + { + FmgrInfo *hashfunctions = mstate->hashfunctions; + Oid *collations = mstate->collations; + + for (int i = 0; i < numkeys; i++) + { + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + if (!pslot->tts_isnull[i]) /* treat nulls as having hash key 0 */ + { + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], + collations[i], pslot->tts_values[i])); + hashkey ^= hkey; + } + } + } + + return murmurhash32(hashkey); +} + +/* + * MemoizeHash_equal + * Equality function for confirming hash value matches during a hash + * table lookup. 'key2' is never used. Instead the MemoizeState's + * probeslot is always populated with details of what's being looked up. + */ +static bool +MemoizeHash_equal(struct memoize_hash *tb, const MemoizeKey *key1, + const MemoizeKey *key2) +{ + MemoizeState *mstate = (MemoizeState *) tb->private_data; + ExprContext *econtext = mstate->ss.ps.ps_ExprContext; + TupleTableSlot *tslot = mstate->tableslot; + TupleTableSlot *pslot = mstate->probeslot; + + /* probeslot should have already been prepared by prepare_probe_slot() */ + ExecStoreMinimalTuple(key1->params, tslot, false); + + if (mstate->binary_mode) + { + int numkeys = mstate->nkeys; + + slot_getallattrs(tslot); + slot_getallattrs(pslot); + + for (int i = 0; i < numkeys; i++) + { + FormData_pg_attribute *attr; + + if (tslot->tts_isnull[i] != pslot->tts_isnull[i]) + return false; + + /* both NULL? 
they're equal */ + if (tslot->tts_isnull[i]) + continue; + + /* perform binary comparison on the two datums */ + attr = &tslot->tts_tupleDescriptor->attrs[i]; + if (!datum_image_eq(tslot->tts_values[i], pslot->tts_values[i], + attr->attbyval, attr->attlen)) + return false; + } + return true; + } + else + { + econtext->ecxt_innertuple = tslot; + econtext->ecxt_outertuple = pslot; + return ExecQualAndReset(mstate->cache_eq_expr, econtext); + } +} + +/* + * Initialize the hash table to empty. + */ +static void +build_hash_table(MemoizeState *mstate, uint32 size) +{ + /* Make a guess at a good size when we're not given a valid size. */ + if (size == 0) + size = 1024; + + /* memoize_create will convert the size to a power of 2 */ + mstate->hashtable = memoize_create(mstate->tableContext, size, mstate); +} + +/* + * prepare_probe_slot + * Populate mstate's probeslot with the values from the tuple stored + * in 'key'. If 'key' is NULL, then perform the population by evaluating + * mstate's param_exprs. + */ +static inline void +prepare_probe_slot(MemoizeState *mstate, MemoizeKey *key) +{ + TupleTableSlot *pslot = mstate->probeslot; + TupleTableSlot *tslot = mstate->tableslot; + int numKeys = mstate->nkeys; + + ExecClearTuple(pslot); + + if (key == NULL) + { + /* Set the probeslot's values based on the current parameter values */ + for (int i = 0; i < numKeys; i++) + pslot->tts_values[i] = ExecEvalExpr(mstate->param_exprs[i], + mstate->ss.ps.ps_ExprContext, + &pslot->tts_isnull[i]); + } + else + { + /* Process the key's MinimalTuple and store the values in probeslot */ + ExecStoreMinimalTuple(key->params, tslot, false); + slot_getallattrs(tslot); + memcpy(pslot->tts_values, tslot->tts_values, sizeof(Datum) * numKeys); + memcpy(pslot->tts_isnull, tslot->tts_isnull, sizeof(bool) * numKeys); + } + + ExecStoreVirtualTuple(pslot); +} + +/* + * entry_purge_tuples + * Remove all tuples from the cache entry pointed to by 'entry'. This + * leaves an empty cache entry. Also, update the memory accounting to + * reflect the removal of the tuples. + */ +static inline void +entry_purge_tuples(MemoizeState *mstate, MemoizeEntry *entry) +{ + MemoizeTuple *tuple = entry->tuplehead; + uint64 freed_mem = 0; + + while (tuple != NULL) + { + MemoizeTuple *next = tuple->next; + + freed_mem += CACHE_TUPLE_BYTES(tuple); + + /* Free memory used for this tuple */ + pfree(tuple->mintuple); + pfree(tuple); + + tuple = next; + } + + entry->complete = false; + entry->tuplehead = NULL; + + /* Update the memory accounting */ + mstate->mem_used -= freed_mem; +} + +/* + * remove_cache_entry + * Remove 'entry' from the cache and free memory used by it. + */ +static void +remove_cache_entry(MemoizeState *mstate, MemoizeEntry *entry) +{ + MemoizeKey *key = entry->key; + + dlist_delete(&entry->key->lru_node); + + /* Remove all of the tuples from this entry */ + entry_purge_tuples(mstate, entry); + + /* + * Update memory accounting. entry_purge_tuples should have already + * subtracted the memory used for each cached tuple. Here we just update + * the amount used by the entry itself. 
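MemoizeHash_hash above combines one hash per cache key column by rotating the accumulator left one bit, XORing in that column's hash (NULLs contribute zero), and finally mixing the result with murmurhash32. A standalone sketch of that combination step; fmix32 here is the murmur3 32-bit finalizer, which is the mixing that murmurhash32 applies:

#include <stdint.h>
#include <stdio.h>

/* Final mixing step (murmur3's fmix32). */
static uint32_t
fmix32(uint32_t h)
{
    h ^= h >> 16;
    h *= 0x85ebca6b;
    h ^= h >> 13;
    h *= 0xc2b2ae35;
    h ^= h >> 16;
    return h;
}

/* Combine one hash per key column in the same style as MemoizeHash_hash. */
static uint32_t
combine_key_hashes(const uint32_t *colhash, const int *isnull, int nkeys)
{
    uint32_t hashkey = 0;

    for (int i = 0; i < nkeys; i++)
    {
        /* rotate the accumulator left one bit per column */
        hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
        if (!isnull[i])         /* NULL columns contribute hash value 0 */
            hashkey ^= colhash[i];
    }
    return fmix32(hashkey);
}

int
main(void)
{
    uint32_t colhash[] = {0xdeadbeef, 0x12345678};
    int      isnull[]  = {0, 0};

    printf("0x%08x\n", combine_key_hashes(colhash, isnull, 2));
    return 0;
}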
+ */ + mstate->mem_used -= EMPTY_ENTRY_MEMORY_BYTES(entry); + + /* Remove the entry from the cache */ + memoize_delete_item(mstate->hashtable, entry); + + pfree(key->params); + pfree(key); +} + +/* + * cache_purge_all + * Remove all items from the cache + */ +static void +cache_purge_all(MemoizeState *mstate) +{ + uint64 evictions = mstate->hashtable->members; + PlanState *pstate = (PlanState *) mstate; + + /* + * Likely the most efficient way to remove all items is to just reset the + * memory context for the cache and then rebuild a fresh hash table. This + * saves having to remove each item one by one and pfree each cached tuple + */ + MemoryContextReset(mstate->tableContext); + + /* Make the hash table the same size as the original size */ + build_hash_table(mstate, ((Memoize *) pstate->plan)->est_entries); + + /* reset the LRU list */ + dlist_init(&mstate->lru_list); + mstate->last_tuple = NULL; + mstate->entry = NULL; + + mstate->mem_used = 0; + + /* XXX should we add something new to track these purges? */ + mstate->stats.cache_evictions += evictions; /* Update Stats */ +} + +/* + * cache_reduce_memory + * Evict older and less recently used items from the cache in order to + * reduce the memory consumption back to something below the + * MemoizeState's mem_limit. + * + * 'specialkey', if not NULL, causes the function to return false if the entry + * which the key belongs to is removed from the cache. + */ +static bool +cache_reduce_memory(MemoizeState *mstate, MemoizeKey *specialkey) +{ + bool specialkey_intact = true; /* for now */ + dlist_mutable_iter iter; + uint64 evictions = 0; + + /* Update peak memory usage */ + if (mstate->mem_used > mstate->stats.mem_peak) + mstate->stats.mem_peak = mstate->mem_used; + + /* We expect only to be called when we've gone over budget on memory */ + Assert(mstate->mem_used > mstate->mem_limit); + + /* Start the eviction process starting at the head of the LRU list. */ + dlist_foreach_modify(iter, &mstate->lru_list) + { + MemoizeKey *key = dlist_container(MemoizeKey, lru_node, iter.cur); + MemoizeEntry *entry; + + /* + * Populate the hash probe slot in preparation for looking up this LRU + * entry. + */ + prepare_probe_slot(mstate, key); + + /* + * Ideally the LRU list pointers would be stored in the entry itself + * rather than in the key. Unfortunately, we can't do that as the + * simplehash.h code may resize the table and allocate new memory for + * entries which would result in those pointers pointing to the old + * buckets. However, it's fine to use the key to store this as that's + * only referenced by a pointer in the entry, which of course follows + * the entry whenever the hash table is resized. Since we only have a + * pointer to the key here, we must perform a hash table lookup to + * find the entry that the key belongs to. + */ + entry = memoize_lookup(mstate->hashtable, NULL); + + /* + * Sanity check that we found the entry belonging to the LRU list + * item. A misbehaving hash or equality function could cause the + * entry not to be found or the wrong entry to be found. + */ + if (unlikely(entry == NULL || entry->key != key)) + elog(ERROR, "could not find memoization table entry"); + + /* + * If we're being called to free memory while the cache is being + * populated with new tuples, then we'd better take some care as we + * could end up freeing the entry which 'specialkey' belongs to. 
+ * Generally callers will pass 'specialkey' as the key for the cache + * entry which is currently being populated, so we must set + * 'specialkey_intact' to false to inform the caller the specialkey + * entry has been removed. + */ + if (key == specialkey) + specialkey_intact = false; + + /* + * Finally remove the entry. This will remove from the LRU list too. + */ + remove_cache_entry(mstate, entry); + + evictions++; + + /* Exit if we've freed enough memory */ + if (mstate->mem_used <= mstate->mem_limit) + break; + } + + mstate->stats.cache_evictions += evictions; /* Update Stats */ + + return specialkey_intact; +} + +/* + * cache_lookup + * Perform a lookup to see if we've already cached tuples based on the + * scan's current parameters. If we find an existing entry we move it to + * the end of the LRU list, set *found to true then return it. If we + * don't find an entry then we create a new one and add it to the end of + * the LRU list. We also update cache memory accounting and remove older + * entries if we go over the memory budget. If we managed to free enough + * memory we return the new entry, else we return NULL. + * + * Callers can assume we'll never return NULL when *found is true. + */ +static MemoizeEntry * +cache_lookup(MemoizeState *mstate, bool *found) +{ + MemoizeKey *key; + MemoizeEntry *entry; + MemoryContext oldcontext; + + /* prepare the probe slot with the current scan parameters */ + prepare_probe_slot(mstate, NULL); + + /* + * Add the new entry to the cache. No need to pass a valid key since the + * hash function uses mstate's probeslot, which we populated above. + */ + entry = memoize_insert(mstate->hashtable, NULL, found); + + if (*found) + { + /* + * Move existing entry to the tail of the LRU list to mark it as the + * most recently used item. + */ + dlist_move_tail(&mstate->lru_list, &entry->key->lru_node); + + return entry; + } + + oldcontext = MemoryContextSwitchTo(mstate->tableContext); + + /* Allocate a new key */ + entry->key = key = (MemoizeKey *) palloc(sizeof(MemoizeKey)); + key->params = ExecCopySlotMinimalTuple(mstate->probeslot); + + /* Update the total cache memory utilization */ + mstate->mem_used += EMPTY_ENTRY_MEMORY_BYTES(entry); + + /* Initialize this entry */ + entry->complete = false; + entry->tuplehead = NULL; + + /* + * Since this is the most recently used entry, push this entry onto the + * end of the LRU list. + */ + dlist_push_tail(&mstate->lru_list, &entry->key->lru_node); + + mstate->last_tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + /* + * If we've gone over our memory budget, then we'll free up some space in + * the cache. + */ + if (mstate->mem_used > mstate->mem_limit) + { + /* + * Try to free up some memory. It's highly unlikely that we'll fail + * to do so here since the entry we've just added is yet to contain + * any tuples and we're able to remove any other entry to reduce the + * memory consumption. + */ + if (unlikely(!cache_reduce_memory(mstate, key))) + return NULL; + + /* + * The process of removing entries from the cache may have caused the + * code in simplehash.h to shuffle elements to earlier buckets in the + * hash table. If it has, we'll need to find the entry again by + * performing a lookup. Fortunately, we can detect if this has + * happened by seeing if the entry is still in use and that the key + * pointer matches our expected key. 
+ */ + if (entry->status != memoize_SH_IN_USE || entry->key != key) + { + /* + * We need to repopulate the probeslot as lookups performed during + * the cache evictions above will have stored some other key. + */ + prepare_probe_slot(mstate, key); + + /* Re-find the newly added entry */ + entry = memoize_lookup(mstate->hashtable, NULL); + Assert(entry != NULL); + } + } + + return entry; +} + +/* + * cache_store_tuple + * Add the tuple stored in 'slot' to the mstate's current cache entry. + * The cache entry must have already been made with cache_lookup(). + * mstate's last_tuple field must point to the tail of mstate->entry's + * list of tuples. + */ +static bool +cache_store_tuple(MemoizeState *mstate, TupleTableSlot *slot) +{ + MemoizeTuple *tuple; + MemoizeEntry *entry = mstate->entry; + MemoryContext oldcontext; + + Assert(slot != NULL); + Assert(entry != NULL); + + oldcontext = MemoryContextSwitchTo(mstate->tableContext); + + tuple = (MemoizeTuple *) palloc(sizeof(MemoizeTuple)); + tuple->mintuple = ExecCopySlotMinimalTuple(slot); + tuple->next = NULL; + + /* Account for the memory we just consumed */ + mstate->mem_used += CACHE_TUPLE_BYTES(tuple); + + if (entry->tuplehead == NULL) + { + /* + * This is the first tuple for this entry, so just point the list head + * to it. + */ + entry->tuplehead = tuple; + } + else + { + /* push this tuple onto the tail of the list */ + mstate->last_tuple->next = tuple; + } + + mstate->last_tuple = tuple; + MemoryContextSwitchTo(oldcontext); + + /* + * If we've gone over our memory budget then free up some space in the + * cache. + */ + if (mstate->mem_used > mstate->mem_limit) + { + MemoizeKey *key = entry->key; + + if (!cache_reduce_memory(mstate, key)) + return false; + + /* + * The process of removing entries from the cache may have caused the + * code in simplehash.h to shuffle elements to earlier buckets in the + * hash table. If it has, we'll need to find the entry again by + * performing a lookup. Fortunately, we can detect if this has + * happened by seeing if the entry is still in use and that the key + * pointer matches our expected key. + */ + if (entry->status != memoize_SH_IN_USE || entry->key != key) + { + /* + * We need to repopulate the probeslot as lookups performed during + * the cache evictions above will have stored some other key. + */ + prepare_probe_slot(mstate, key); + + /* Re-find the entry */ + mstate->entry = entry = memoize_lookup(mstate->hashtable, NULL); + Assert(entry != NULL); + } + } + + return true; +} + +static TupleTableSlot * +ExecMemoize(PlanState *pstate) +{ + MemoizeState *node = castNode(MemoizeState, pstate); + PlanState *outerNode; + TupleTableSlot *slot; + + switch (node->mstatus) + { + case MEMO_CACHE_LOOKUP: + { + MemoizeEntry *entry; + TupleTableSlot *outerslot; + bool found; + + Assert(node->entry == NULL); + + /* + * We're only ever in this state for the first call of the + * scan. Here we have a look to see if we've already seen the + * current parameters before and if we have already cached a + * complete set of records that the outer plan will return for + * these parameters. + * + * When we find a valid cache entry, we'll return the first + * tuple from it. If not found, we'll create a cache entry and + * then try to fetch a tuple from the outer scan. If we find + * one there, we'll try to cache it. 
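Stripped of the executor details, the lookup state above amounts to: probe the cache with the current parameter values, replay the cached tuples on a hit with a complete entry, and otherwise run the subplan and cache what it returns. A deliberately tiny standalone sketch of that hit/miss behaviour (one integer parameter, one cached value per slot, no eviction); none of these names are the node's own structures:

#include <stdbool.h>
#include <stdio.h>

#define CACHE_SLOTS 8

typedef struct CacheEntry
{
    bool valid;
    int  param;
    int  result;
} CacheEntry;

static CacheEntry cache[CACHE_SLOTS];
static int hits, misses;

static int
expensive_scan(int param)
{
    return param * param;       /* stands in for rescanning the inner side */
}

static int
memoized_scan(int param)
{
    CacheEntry *e = &cache[(unsigned) param % CACHE_SLOTS];

    if (e->valid && e->param == param)
    {
        hits++;
        return e->result;       /* cache hit: no rescan of the subplan */
    }
    misses++;
    e->valid = true;
    e->param = param;
    e->result = expensive_scan(param);
    return e->result;
}

int
main(void)
{
    int params[] = {4, 7, 4, 4, 7};

    for (int i = 0; i < 5; i++)
        printf("%d\n", memoized_scan(params[i]));
    printf("hits=%d misses=%d\n", hits, misses);   /* hits=3 misses=2 */
    return 0;
}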
+ */ + + /* see if we've got anything cached for the current parameters */ + entry = cache_lookup(node, &found); + + if (found && entry->complete) + { + node->stats.cache_hits += 1; /* stats update */ + + /* + * Set last_tuple and entry so that the state + * MEMO_CACHE_FETCH_NEXT_TUPLE can easily find the next + * tuple for these parameters. + */ + node->last_tuple = entry->tuplehead; + node->entry = entry; + + /* Fetch the first cached tuple, if there is one */ + if (entry->tuplehead) + { + node->mstatus = MEMO_CACHE_FETCH_NEXT_TUPLE; + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecStoreMinimalTuple(entry->tuplehead->mintuple, + slot, false); + + return slot; + } + + /* The cache entry is void of any tuples. */ + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + /* Handle cache miss */ + node->stats.cache_misses += 1; /* stats update */ + + if (found) + { + /* + * A cache entry was found, but the scan for that entry + * did not run to completion. We'll just remove all + * tuples and start again. It might be tempting to + * continue where we left off, but there's no guarantee + * the outer node will produce the tuples in the same + * order as it did last time. + */ + entry_purge_tuples(node, entry); + } + + /* Scan the outer node for a tuple to cache */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + /* + * cache_lookup may have returned NULL due to failure to + * free enough cache space, so ensure we don't do anything + * here that assumes it worked. There's no need to go into + * bypass mode here as we're setting mstatus to end of + * scan. + */ + if (likely(entry)) + entry->complete = true; + + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + node->entry = entry; + + /* + * If we failed to create the entry or failed to store the + * tuple in the entry, then go into bypass mode. + */ + if (unlikely(entry == NULL || + !cache_store_tuple(node, outerslot))) + { + node->stats.cache_overflows += 1; /* stats update */ + + node->mstatus = MEMO_CACHE_BYPASS_MODE; + + /* + * No need to clear out last_tuple as we'll stay in bypass + * mode until the end of the scan. + */ + } + else + { + /* + * If we only expect a single row from this scan then we + * can mark that we're not expecting more. This allows + * cache lookups to work even when the scan has not been + * executed to completion. + */ + entry->complete = node->singlerow; + node->mstatus = MEMO_FILLING_CACHE; + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecCopySlot(slot, outerslot); + return slot; + } + + case MEMO_CACHE_FETCH_NEXT_TUPLE: + { + /* We shouldn't be in this state if these are not set */ + Assert(node->entry != NULL); + Assert(node->last_tuple != NULL); + + /* Skip to the next tuple to output */ + node->last_tuple = node->last_tuple->next; + + /* No more tuples in the cache */ + if (node->last_tuple == NULL) + { + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecStoreMinimalTuple(node->last_tuple->mintuple, slot, + false); + + return slot; + } + + case MEMO_FILLING_CACHE: + { + TupleTableSlot *outerslot; + MemoizeEntry *entry = node->entry; + + /* entry should already have been set by MEMO_CACHE_LOOKUP */ + Assert(entry != NULL); + + /* + * When in the MEMO_FILLING_CACHE state, we've just had a + * cache miss and are populating the cache with the current + * scan tuples. 
+ */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + /* No more tuples. Mark it as complete */ + entry->complete = true; + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + /* + * Validate if the planner properly set the singlerow flag. It + * should only set that if each cache entry can, at most, + * return 1 row. + */ + if (unlikely(entry->complete)) + elog(ERROR, "cache entry already complete"); + + /* Record the tuple in the current cache entry */ + if (unlikely(!cache_store_tuple(node, outerslot))) + { + /* Couldn't store it? Handle overflow */ + node->stats.cache_overflows += 1; /* stats update */ + + node->mstatus = MEMO_CACHE_BYPASS_MODE; + + /* + * No need to clear out entry or last_tuple as we'll stay + * in bypass mode until the end of the scan. + */ + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecCopySlot(slot, outerslot); + return slot; + } + + case MEMO_CACHE_BYPASS_MODE: + { + TupleTableSlot *outerslot; + + /* + * When in bypass mode we just continue to read tuples without + * caching. We need to wait until the next rescan before we + * can come out of this mode. + */ + outerNode = outerPlanState(node); + outerslot = ExecProcNode(outerNode); + if (TupIsNull(outerslot)) + { + node->mstatus = MEMO_END_OF_SCAN; + return NULL; + } + + slot = node->ss.ps.ps_ResultTupleSlot; + ExecCopySlot(slot, outerslot); + return slot; + } + + case MEMO_END_OF_SCAN: + + /* + * We've already returned NULL for this scan, but just in case + * something calls us again by mistake. + */ + return NULL; + + default: + elog(ERROR, "unrecognized memoize state: %d", + (int) node->mstatus); + return NULL; + } /* switch */ +} + +MemoizeState * +ExecInitMemoize(Memoize *node, EState *estate, int eflags) +{ + MemoizeState *mstate = makeNode(MemoizeState); + Plan *outerNode; + int i; + int nkeys; + Oid *eqfuncoids; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + mstate->ss.ps.plan = (Plan *) node; + mstate->ss.ps.state = estate; + mstate->ss.ps.ExecProcNode = ExecMemoize; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &mstate->ss.ps); + + outerNode = outerPlan(node); + outerPlanState(mstate) = ExecInitNode(outerNode, estate, eflags); + + /* + * Initialize return slot and type. No need to initialize projection info + * because this node doesn't do projections. + */ + ExecInitResultTupleSlotTL(&mstate->ss.ps, &TTSOpsMinimalTuple); + mstate->ss.ps.ps_ProjInfo = NULL; + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &mstate->ss, &TTSOpsMinimalTuple); + + /* + * Set the state machine to lookup the cache. We won't find anything + * until we cache something, but this saves a special case to create the + * first entry. 
+ */ + mstate->mstatus = MEMO_CACHE_LOOKUP; + + mstate->nkeys = nkeys = node->numKeys; + mstate->hashkeydesc = ExecTypeFromExprList(node->param_exprs); + mstate->tableslot = MakeSingleTupleTableSlot(mstate->hashkeydesc, + &TTSOpsMinimalTuple); + mstate->probeslot = MakeSingleTupleTableSlot(mstate->hashkeydesc, + &TTSOpsVirtual); + + mstate->param_exprs = (ExprState **) palloc(nkeys * sizeof(ExprState *)); + mstate->collations = node->collations; /* Just point directly to the plan + * data */ + mstate->hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); + + eqfuncoids = palloc(nkeys * sizeof(Oid)); + + for (i = 0; i < nkeys; i++) + { + Oid hashop = node->hashOperators[i]; + Oid left_hashfn; + Oid right_hashfn; + Expr *param_expr = (Expr *) list_nth(node->param_exprs, i); + + if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn)) + elog(ERROR, "could not find hash function for hash operator %u", + hashop); + + fmgr_info(left_hashfn, &mstate->hashfunctions[i]); + + mstate->param_exprs[i] = ExecInitExpr(param_expr, (PlanState *) mstate); + eqfuncoids[i] = get_opcode(hashop); + } + + mstate->cache_eq_expr = ExecBuildParamSetEqual(mstate->hashkeydesc, + &TTSOpsMinimalTuple, + &TTSOpsVirtual, + eqfuncoids, + node->collations, + node->param_exprs, + (PlanState *) mstate); + + pfree(eqfuncoids); + mstate->mem_used = 0; + + /* Limit the total memory consumed by the cache to this */ + mstate->mem_limit = get_hash_memory_limit(); + + /* A memory context dedicated for the cache */ + mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext, + "MemoizeHashTable", + ALLOCSET_DEFAULT_SIZES); + + dlist_init(&mstate->lru_list); + mstate->last_tuple = NULL; + mstate->entry = NULL; + + /* + * Mark if we can assume the cache entry is completed after we get the + * first record for it. Some callers might not call us again after + * getting the first match. e.g. A join operator performing a unique join + * is able to skip to the next outer tuple after getting the first + * matching inner tuple. In this case, the cache entry is complete after + * getting the first tuple. This allows us to mark it as so. + */ + mstate->singlerow = node->singlerow; + mstate->keyparamids = node->keyparamids; + + /* + * Record if the cache keys should be compared bit by bit, or logically + * using the type's hash equality operator + */ + mstate->binary_mode = node->binary_mode; + + /* Zero the statistics counters */ + memset(&mstate->stats, 0, sizeof(MemoizeInstrumentation)); + + /* Allocate and set up the actual cache */ + build_hash_table(mstate, node->est_entries); + + return mstate; +} + +void +ExecEndMemoize(MemoizeState *node) +{ +#ifdef USE_ASSERT_CHECKING + /* Validate the memory accounting code is correct in assert builds. */ + { + int count; + uint64 mem = 0; + memoize_iterator i; + MemoizeEntry *entry; + + memoize_start_iterate(node->hashtable, &i); + + count = 0; + while ((entry = memoize_iterate(node->hashtable, &i)) != NULL) + { + MemoizeTuple *tuple = entry->tuplehead; + + mem += EMPTY_ENTRY_MEMORY_BYTES(entry); + while (tuple != NULL) + { + mem += CACHE_TUPLE_BYTES(tuple); + tuple = tuple->next; + } + count++; + } + + Assert(count == node->hashtable->members); + Assert(mem == node->mem_used); + } +#endif + + /* + * When ending a parallel worker, copy the statistics gathered by the + * worker back into shared memory so that it can be picked up by the main + * process to report in EXPLAIN ANALYZE. 
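+ * (If mem_peak was never recorded, it is filled in from the
+ * current mem_used first, so that EXPLAIN can still report this
+ * worker's cache memory consumption.)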
+ */ + if (node->shared_info != NULL && IsParallelWorker()) + { + MemoizeInstrumentation *si; + + /* Make mem_peak available for EXPLAIN */ + if (node->stats.mem_peak == 0) + node->stats.mem_peak = node->mem_used; + + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + memcpy(si, &node->stats, sizeof(MemoizeInstrumentation)); + } + + /* Remove the cache context */ + MemoryContextDelete(node->tableContext); + + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to cache result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + /* + * free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * shut down the subplan + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanMemoize(MemoizeState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* Mark that we must lookup the cache for a new set of parameters */ + node->mstatus = MEMO_CACHE_LOOKUP; + + /* nullify pointers used for the last scan */ + node->entry = NULL; + node->last_tuple = NULL; + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + /* + * Purge the entire cache if a parameter changed that is not part of the + * cache key. + */ + if (bms_nonempty_difference(outerPlan->chgParam, node->keyparamids)) + cache_purge_all(node); +} + +/* + * ExecEstimateCacheEntryOverheadBytes + * For use in the query planner to help it estimate the amount of memory + * required to store a single entry in the cache. + */ +double +ExecEstimateCacheEntryOverheadBytes(double ntuples) +{ + return sizeof(MemoizeEntry) + sizeof(MemoizeKey) + sizeof(MemoizeTuple) * + ntuples; +} + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + + /* ---------------------------------------------------------------- + * ExecMemoizeEstimate + * + * Estimate space required to propagate memoize statistics. + * ---------------------------------------------------------------- + */ +void +ExecMemoizeEstimate(MemoizeState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(MemoizeInstrumentation)); + size = add_size(size, offsetof(SharedMemoizeInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecMemoizeInitializeDSM + * + * Initialize DSM space for memoize statistics. 
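+ * The shared area holds one MemoizeInstrumentation slot per worker;
+ * each worker copies its counters into its own slot at shutdown and
+ * the leader collects them via ExecMemoizeRetrieveInstrumentation.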
+ * ---------------------------------------------------------------- + */ +void +ExecMemoizeInitializeDSM(MemoizeState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedMemoizeInfo, sinstrument) + + pcxt->nworkers * sizeof(MemoizeInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecMemoizeInitializeWorker + * + * Attach worker to DSM space for memoize statistics. + * ---------------------------------------------------------------- + */ +void +ExecMemoizeInitializeWorker(MemoizeState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); +} + +/* ---------------------------------------------------------------- + * ExecMemoizeRetrieveInstrumentation + * + * Transfer memoize statistics from DSM to private memory. + * ---------------------------------------------------------------- + */ +void +ExecMemoizeRetrieveInstrumentation(MemoizeState *node) +{ + Size size; + SharedMemoizeInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedMemoizeInfo, sinstrument) + + node->shared_info->num_workers * sizeof(MemoizeInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c new file mode 100644 index 0000000..617bffb --- /dev/null +++ b/src/backend/executor/nodeMergeAppend.c @@ -0,0 +1,389 @@ +/*------------------------------------------------------------------------- + * + * nodeMergeAppend.c + * routines to handle MergeAppend nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMergeAppend.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitMergeAppend - initialize the MergeAppend node + * ExecMergeAppend - retrieve the next tuple from the node + * ExecEndMergeAppend - shut down the MergeAppend node + * ExecReScanMergeAppend - rescan the MergeAppend node + * + * NOTES + * A MergeAppend node contains a list of one or more subplans. + * These are each expected to deliver tuples that are sorted according + * to a common sort key. The MergeAppend node merges these streams + * to produce output sorted the same way. + * + * MergeAppend nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans so + * a typical MergeAppend node looks like this in the plan tree: + * + * ... + * / + * MergeAppend---+------+------+--- nil + * / \ | | | + * nil nil ... ... ... + * subplans + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/execPartition.h" +#include "executor/nodeMergeAppend.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" + +/* + * We have one slot for each item in the heap array. We use SlotNumber + * to store slot indexes. 
This doesn't actually provide any formal + * type-safety, but it makes the code more self-documenting. + */ +typedef int32 SlotNumber; + +static TupleTableSlot *ExecMergeAppend(PlanState *pstate); +static int heap_compare_slots(Datum a, Datum b, void *arg); + + +/* ---------------------------------------------------------------- + * ExecInitMergeAppend + * + * Begin all of the subscans of the MergeAppend node. + * ---------------------------------------------------------------- + */ +MergeAppendState * +ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) +{ + MergeAppendState *mergestate = makeNode(MergeAppendState); + PlanState **mergeplanstates; + Bitmapset *validsubplans; + int nplans; + int i, + j; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create new MergeAppendState for our node + */ + mergestate->ps.plan = (Plan *) node; + mergestate->ps.state = estate; + mergestate->ps.ExecProcNode = ExecMergeAppend; + + /* If run-time partition pruning is enabled, then set that up now */ + if (node->part_prune_info != NULL) + { + PartitionPruneState *prunestate; + + /* We may need an expression context to evaluate partition exprs */ + ExecAssignExprContext(estate, &mergestate->ps); + + prunestate = ExecCreatePartitionPruneState(&mergestate->ps, + node->part_prune_info); + mergestate->ms_prune_state = prunestate; + + /* Perform an initial partition prune, if required. */ + if (prunestate->do_initial_prune) + { + /* Determine which subplans survive initial pruning */ + validsubplans = ExecFindInitialMatchingSubPlans(prunestate, + list_length(node->mergeplans)); + + nplans = bms_num_members(validsubplans); + } + else + { + /* We'll need to initialize all subplans */ + nplans = list_length(node->mergeplans); + Assert(nplans > 0); + validsubplans = bms_add_range(NULL, 0, nplans - 1); + } + + /* + * When no run-time pruning is required and there's at least one + * subplan, we can fill as_valid_subplans immediately, preventing + * later calls to ExecFindMatchingSubPlans. + */ + if (!prunestate->do_exec_prune && nplans > 0) + mergestate->ms_valid_subplans = bms_add_range(NULL, 0, nplans - 1); + } + else + { + nplans = list_length(node->mergeplans); + + /* + * When run-time partition pruning is not enabled we can just mark all + * subplans as valid; they must also all be initialized. + */ + Assert(nplans > 0); + mergestate->ms_valid_subplans = validsubplans = + bms_add_range(NULL, 0, nplans - 1); + mergestate->ms_prune_state = NULL; + } + + mergeplanstates = (PlanState **) palloc(nplans * sizeof(PlanState *)); + mergestate->mergeplans = mergeplanstates; + mergestate->ms_nplans = nplans; + + mergestate->ms_slots = (TupleTableSlot **) palloc0(sizeof(TupleTableSlot *) * nplans); + mergestate->ms_heap = binaryheap_allocate(nplans, heap_compare_slots, + mergestate); + + /* + * Miscellaneous initialization + * + * MergeAppend nodes do have Result slots, which hold pointers to tuples, + * so we have to initialize them. FIXME + */ + ExecInitResultTupleSlotTL(&mergestate->ps, &TTSOpsVirtual); + + /* node returns slots from each of its subnodes, therefore not fixed */ + mergestate->ps.resultopsset = true; + mergestate->ps.resultopsfixed = false; + + /* + * call ExecInitNode on each of the valid plans to be executed and save + * the results into the mergeplanstates array. 
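+ * For example, if initial pruning left validsubplans = {0, 2, 5},
+ * the loop below visits those members in order and fills
+ * mergeplanstates[0], [1] and [2] with the states for mergeplans
+ * 0, 2 and 5.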
+ */ + j = 0; + i = -1; + while ((i = bms_next_member(validsubplans, i)) >= 0) + { + Plan *initNode = (Plan *) list_nth(node->mergeplans, i); + + mergeplanstates[j++] = ExecInitNode(initNode, estate, eflags); + } + + mergestate->ps.ps_ProjInfo = NULL; + + /* + * initialize sort-key information + */ + mergestate->ms_nkeys = node->numCols; + mergestate->ms_sortkeys = palloc0(sizeof(SortSupportData) * node->numCols); + + for (i = 0; i < node->numCols; i++) + { + SortSupport sortKey = mergestate->ms_sortkeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = node->collations[i]; + sortKey->ssup_nulls_first = node->nullsFirst[i]; + sortKey->ssup_attno = node->sortColIdx[i]; + + /* + * It isn't feasible to perform abbreviated key conversion, since + * tuples are pulled into mergestate's binary heap as needed. It + * would likely be counter-productive to convert tuples into an + * abbreviated representation as they're pulled up, so opt out of that + * additional optimization entirely. + */ + sortKey->abbreviate = false; + + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); + } + + /* + * initialize to show we have not run the subplans yet + */ + mergestate->ms_initialized = false; + + return mergestate; +} + +/* ---------------------------------------------------------------- + * ExecMergeAppend + * + * Handles iteration over multiple subplans. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecMergeAppend(PlanState *pstate) +{ + MergeAppendState *node = castNode(MergeAppendState, pstate); + TupleTableSlot *result; + SlotNumber i; + + CHECK_FOR_INTERRUPTS(); + + if (!node->ms_initialized) + { + /* Nothing to do if all subplans were pruned */ + if (node->ms_nplans == 0) + return ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * If we've yet to determine the valid subplans then do so now. If + * run-time pruning is disabled then the valid subplans will always be + * set to all subplans. + */ + if (node->ms_valid_subplans == NULL) + node->ms_valid_subplans = + ExecFindMatchingSubPlans(node->ms_prune_state); + + /* + * First time through: pull the first tuple from each valid subplan, + * and set up the heap. + */ + i = -1; + while ((i = bms_next_member(node->ms_valid_subplans, i)) >= 0) + { + node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); + if (!TupIsNull(node->ms_slots[i])) + binaryheap_add_unordered(node->ms_heap, Int32GetDatum(i)); + } + binaryheap_build(node->ms_heap); + node->ms_initialized = true; + } + else + { + /* + * Otherwise, pull the next tuple from whichever subplan we returned + * from last time, and reinsert the subplan index into the heap, + * because it might now compare differently against the existing + * elements of the heap. (We could perhaps simplify the logic a bit + * by doing this before returning from the prior call, but it's better + * to not pull tuples until necessary.) 
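+ * As a small illustration: with two inputs A = (1,4) and B = (2,3),
+ * the first call loads 1 and 2 into the heap and returns A's 1; the
+ * next call pulls A's 4, re-inserts A, and returns B's 2; then come
+ * B's 3 and A's 4, so the merged output stays sorted.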
+ */ + i = DatumGetInt32(binaryheap_first(node->ms_heap)); + node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); + if (!TupIsNull(node->ms_slots[i])) + binaryheap_replace_first(node->ms_heap, Int32GetDatum(i)); + else + (void) binaryheap_remove_first(node->ms_heap); + } + + if (binaryheap_empty(node->ms_heap)) + { + /* All the subplans are exhausted, and so is the heap */ + result = ExecClearTuple(node->ps.ps_ResultTupleSlot); + } + else + { + i = DatumGetInt32(binaryheap_first(node->ms_heap)); + result = node->ms_slots[i]; + } + + return result; +} + +/* + * Compare the tuples in the two given slots. + */ +static int32 +heap_compare_slots(Datum a, Datum b, void *arg) +{ + MergeAppendState *node = (MergeAppendState *) arg; + SlotNumber slot1 = DatumGetInt32(a); + SlotNumber slot2 = DatumGetInt32(b); + + TupleTableSlot *s1 = node->ms_slots[slot1]; + TupleTableSlot *s2 = node->ms_slots[slot2]; + int nkey; + + Assert(!TupIsNull(s1)); + Assert(!TupIsNull(s2)); + + for (nkey = 0; nkey < node->ms_nkeys; nkey++) + { + SortSupport sortKey = node->ms_sortkeys + nkey; + AttrNumber attno = sortKey->ssup_attno; + Datum datum1, + datum2; + bool isNull1, + isNull2; + int compare; + + datum1 = slot_getattr(s1, attno, &isNull1); + datum2 = slot_getattr(s2, attno, &isNull2); + + compare = ApplySortComparator(datum1, isNull1, + datum2, isNull2, + sortKey); + if (compare != 0) + { + INVERT_COMPARE_RESULT(compare); + return compare; + } + } + return 0; +} + +/* ---------------------------------------------------------------- + * ExecEndMergeAppend + * + * Shuts down the subscans of the MergeAppend node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndMergeAppend(MergeAppendState *node) +{ + PlanState **mergeplans; + int nplans; + int i; + + /* + * get information from the node + */ + mergeplans = node->mergeplans; + nplans = node->ms_nplans; + + /* + * shut down each of the subscans + */ + for (i = 0; i < nplans; i++) + ExecEndNode(mergeplans[i]); +} + +void +ExecReScanMergeAppend(MergeAppendState *node) +{ + int i; + + /* + * If any PARAM_EXEC Params used in pruning expressions have changed, then + * we'd better unset the valid subplans so that they are reselected for + * the new parameter values. + */ + if (node->ms_prune_state && + bms_overlap(node->ps.chgParam, + node->ms_prune_state->execparamids)) + { + bms_free(node->ms_valid_subplans); + node->ms_valid_subplans = NULL; + } + + for (i = 0; i < node->ms_nplans; i++) + { + PlanState *subnode = node->mergeplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
+ */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } + binaryheap_reset(node->ms_heap); + node->ms_initialized = false; +} diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c new file mode 100644 index 0000000..5ff3f4c --- /dev/null +++ b/src/backend/executor/nodeMergejoin.c @@ -0,0 +1,1678 @@ +/*------------------------------------------------------------------------- + * + * nodeMergejoin.c + * routines supporting merge joins + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMergejoin.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecMergeJoin mergejoin outer and inner relations. + * ExecInitMergeJoin creates and initializes run time states + * ExecEndMergeJoin cleans up the node. + * + * NOTES + * + * Merge-join is done by joining the inner and outer tuples satisfying + * join clauses of the form ((= outerKey innerKey) ...). + * The join clause list is provided by the query planner and may contain + * more than one (= outerKey innerKey) clause (for composite sort key). + * + * However, the query executor needs to know whether an outer + * tuple is "greater/smaller" than an inner tuple so that it can + * "synchronize" the two relations. For example, consider the following + * relations: + * + * outer: (0 ^1 1 2 5 5 5 6 6 7) current tuple: 1 + * inner: (1 ^3 5 5 5 5 6) current tuple: 3 + * + * To continue the merge-join, the executor needs to scan both inner + * and outer relations till the matching tuples 5. It needs to know + * that currently inner tuple 3 is "greater" than outer tuple 1 and + * therefore it should scan the outer relation first to find a + * matching tuple and so on. + * + * Therefore, rather than directly executing the merge join clauses, + * we evaluate the left and right key expressions separately and then + * compare the columns one at a time (see MJCompare). The planner + * passes us enough information about the sort ordering of the inputs + * to allow us to determine how to make the comparison. We may use the + * appropriate btree comparison function, since Postgres' only notion + * of ordering is specified by btree opfamilies. + * + * + * Consider the above relations and suppose that the executor has + * just joined the first outer "5" with the last inner "5". The + * next step is of course to join the second outer "5" with all + * the inner "5's". This requires repositioning the inner "cursor" + * to point at the first inner "5". This is done by "marking" the + * first inner 5 so we can restore the "cursor" to it before joining + * with the second outer 5. The access method interface provides + * routines to mark and restore to a tuple. 
+ * + * + * Essential operation of the merge join algorithm is as follows: + * + * Join { + * get initial outer and inner tuples INITIALIZE + * do forever { + * while (outer != inner) { SKIP_TEST + * if (outer < inner) + * advance outer SKIPOUTER_ADVANCE + * else + * advance inner SKIPINNER_ADVANCE + * } + * mark inner position SKIP_TEST + * do forever { + * while (outer == inner) { + * join tuples JOINTUPLES + * advance inner position NEXTINNER + * } + * advance outer position NEXTOUTER + * if (outer == mark) TESTOUTER + * restore inner position to mark TESTOUTER + * else + * break // return to top of outer loop + * } + * } + * } + * + * The merge join operation is coded in the fashion + * of a state machine. At each state, we do something and then + * proceed to another state. This state is stored in the node's + * execution state information and is preserved across calls to + * ExecMergeJoin. -cim 10/31/89 + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "executor/execdebug.h" +#include "executor/nodeMergejoin.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + + +/* + * States of the ExecMergeJoin state machine + */ +#define EXEC_MJ_INITIALIZE_OUTER 1 +#define EXEC_MJ_INITIALIZE_INNER 2 +#define EXEC_MJ_JOINTUPLES 3 +#define EXEC_MJ_NEXTOUTER 4 +#define EXEC_MJ_TESTOUTER 5 +#define EXEC_MJ_NEXTINNER 6 +#define EXEC_MJ_SKIP_TEST 7 +#define EXEC_MJ_SKIPOUTER_ADVANCE 8 +#define EXEC_MJ_SKIPINNER_ADVANCE 9 +#define EXEC_MJ_ENDOUTER 10 +#define EXEC_MJ_ENDINNER 11 + +/* + * Runtime data for each mergejoin clause + */ +typedef struct MergeJoinClauseData +{ + /* Executable expression trees */ + ExprState *lexpr; /* left-hand (outer) input expression */ + ExprState *rexpr; /* right-hand (inner) input expression */ + + /* + * If we have a current left or right input tuple, the values of the + * expressions are loaded into these fields: + */ + Datum ldatum; /* current left-hand value */ + Datum rdatum; /* current right-hand value */ + bool lisnull; /* and their isnull flags */ + bool risnull; + + /* + * Everything we need to know to compare the left and right values is + * stored here. + */ + SortSupportData ssup; +} MergeJoinClauseData; + +/* Result type for MJEvalOuterValues and MJEvalInnerValues */ +typedef enum +{ + MJEVAL_MATCHABLE, /* normal, potentially matchable tuple */ + MJEVAL_NONMATCHABLE, /* tuple cannot join because it has a null */ + MJEVAL_ENDOFJOIN /* end of input (physical or effective) */ +} MJEvalResult; + + +#define MarkInnerTuple(innerTupleSlot, mergestate) \ + ExecCopySlot((mergestate)->mj_MarkedTupleSlot, (innerTupleSlot)) + + +/* + * MJExamineQuals + * + * This deconstructs the list of mergejoinable expressions, which is given + * to us by the planner in the form of a list of "leftexpr = rightexpr" + * expression trees in the order matching the sort columns of the inputs. + * We build an array of MergeJoinClause structs containing the information + * we will need at runtime. Each struct essentially tells us how to compare + * the two expressions from the original clause. + * + * In addition to the expressions themselves, the planner passes the btree + * opfamily OID, collation OID, btree strategy number (BTLessStrategyNumber or + * BTGreaterStrategyNumber), and nulls-first flag that identify the intended + * sort ordering for each merge key. 
The mergejoinable operator is an + * equality operator in the opfamily, and the two inputs are guaranteed to be + * ordered in either increasing or decreasing (respectively) order according + * to the opfamily and collation, with nulls at the indicated end of the range. + * This allows us to obtain the needed comparison function from the opfamily. + */ +static MergeJoinClause +MJExamineQuals(List *mergeclauses, + Oid *mergefamilies, + Oid *mergecollations, + int *mergestrategies, + bool *mergenullsfirst, + PlanState *parent) +{ + MergeJoinClause clauses; + int nClauses = list_length(mergeclauses); + int iClause; + ListCell *cl; + + clauses = (MergeJoinClause) palloc0(nClauses * sizeof(MergeJoinClauseData)); + + iClause = 0; + foreach(cl, mergeclauses) + { + OpExpr *qual = (OpExpr *) lfirst(cl); + MergeJoinClause clause = &clauses[iClause]; + Oid opfamily = mergefamilies[iClause]; + Oid collation = mergecollations[iClause]; + StrategyNumber opstrategy = mergestrategies[iClause]; + bool nulls_first = mergenullsfirst[iClause]; + int op_strategy; + Oid op_lefttype; + Oid op_righttype; + Oid sortfunc; + + if (!IsA(qual, OpExpr)) + elog(ERROR, "mergejoin clause is not an OpExpr"); + + /* + * Prepare the input expressions for execution. + */ + clause->lexpr = ExecInitExpr((Expr *) linitial(qual->args), parent); + clause->rexpr = ExecInitExpr((Expr *) lsecond(qual->args), parent); + + /* Set up sort support data */ + clause->ssup.ssup_cxt = CurrentMemoryContext; + clause->ssup.ssup_collation = collation; + if (opstrategy == BTLessStrategyNumber) + clause->ssup.ssup_reverse = false; + else if (opstrategy == BTGreaterStrategyNumber) + clause->ssup.ssup_reverse = true; + else /* planner screwed up */ + elog(ERROR, "unsupported mergejoin strategy %d", opstrategy); + clause->ssup.ssup_nulls_first = nulls_first; + + /* Extract the operator's declared left/right datatypes */ + get_op_opfamily_properties(qual->opno, opfamily, false, + &op_strategy, + &op_lefttype, + &op_righttype); + if (op_strategy != BTEqualStrategyNumber) /* should not happen */ + elog(ERROR, "cannot merge using non-equality operator %u", + qual->opno); + + /* + * sortsupport routine must know if abbreviation optimization is + * applicable in principle. It is never applicable for merge joins + * because there is no convenient opportunity to convert to + * alternative representation. + */ + clause->ssup.abbreviate = false; + + /* And get the matching support or comparison function */ + Assert(clause->ssup.comparator == NULL); + sortfunc = get_opfamily_proc(opfamily, + op_lefttype, + op_righttype, + BTSORTSUPPORT_PROC); + if (OidIsValid(sortfunc)) + { + /* The sort support function can provide a comparator */ + OidFunctionCall1(sortfunc, PointerGetDatum(&clause->ssup)); + } + if (clause->ssup.comparator == NULL) + { + /* support not available, get comparison func */ + sortfunc = get_opfamily_proc(opfamily, + op_lefttype, + op_righttype, + BTORDER_PROC); + if (!OidIsValid(sortfunc)) /* should not happen */ + elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", + BTORDER_PROC, op_lefttype, op_righttype, opfamily); + /* We'll use a shim to call the old-style btree comparator */ + PrepareSortSupportComparisonShim(sortfunc, &clause->ssup); + } + + iClause++; + } + + return clauses; +} + +/* + * MJEvalOuterValues + * + * Compute the values of the mergejoined expressions for the current + * outer tuple. 
We also detect whether it's impossible for the current + * outer tuple to match anything --- this is true if it yields a NULL + * input, since we assume mergejoin operators are strict. If the NULL + * is in the first join column, and that column sorts nulls last, then + * we can further conclude that no following tuple can match anything + * either, since they must all have nulls in the first column. However, + * that case is only interesting if we're not in FillOuter mode, else + * we have to visit all the tuples anyway. + * + * For the convenience of callers, we also make this routine responsible + * for testing for end-of-input (null outer tuple), and returning + * MJEVAL_ENDOFJOIN when that's seen. This allows the same code to be used + * for both real end-of-input and the effective end-of-input represented by + * a first-column NULL. + * + * We evaluate the values in OuterEContext, which can be reset each + * time we move to a new tuple. + */ +static MJEvalResult +MJEvalOuterValues(MergeJoinState *mergestate) +{ + ExprContext *econtext = mergestate->mj_OuterEContext; + MJEvalResult result = MJEVAL_MATCHABLE; + int i; + MemoryContext oldContext; + + /* Check for end of outer subplan */ + if (TupIsNull(mergestate->mj_OuterTupleSlot)) + return MJEVAL_ENDOFJOIN; + + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + econtext->ecxt_outertuple = mergestate->mj_OuterTupleSlot; + + for (i = 0; i < mergestate->mj_NumClauses; i++) + { + MergeJoinClause clause = &mergestate->mj_Clauses[i]; + + clause->ldatum = ExecEvalExpr(clause->lexpr, econtext, + &clause->lisnull); + if (clause->lisnull) + { + /* match is impossible; can we end the join early? */ + if (i == 0 && !clause->ssup.ssup_nulls_first && + !mergestate->mj_FillOuter) + result = MJEVAL_ENDOFJOIN; + else if (result == MJEVAL_MATCHABLE) + result = MJEVAL_NONMATCHABLE; + } + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * MJEvalInnerValues + * + * Same as above, but for the inner tuple. Here, we have to be prepared + * to load data from either the true current inner, or the marked inner, + * so caller must tell us which slot to load from. + */ +static MJEvalResult +MJEvalInnerValues(MergeJoinState *mergestate, TupleTableSlot *innerslot) +{ + ExprContext *econtext = mergestate->mj_InnerEContext; + MJEvalResult result = MJEVAL_MATCHABLE; + int i; + MemoryContext oldContext; + + /* Check for end of inner subplan */ + if (TupIsNull(innerslot)) + return MJEVAL_ENDOFJOIN; + + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + econtext->ecxt_innertuple = innerslot; + + for (i = 0; i < mergestate->mj_NumClauses; i++) + { + MergeJoinClause clause = &mergestate->mj_Clauses[i]; + + clause->rdatum = ExecEvalExpr(clause->rexpr, econtext, + &clause->risnull); + if (clause->risnull) + { + /* match is impossible; can we end the join early? */ + if (i == 0 && !clause->ssup.ssup_nulls_first && + !mergestate->mj_FillInner) + result = MJEVAL_ENDOFJOIN; + else if (result == MJEVAL_MATCHABLE) + result = MJEVAL_NONMATCHABLE; + } + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * MJCompare + * + * Compare the mergejoinable values of the current two input tuples + * and return 0 if they are equal (ie, the mergejoin equalities all + * succeed), >0 if outer > inner, <0 if outer < inner. 
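+ * For instance, with an ascending first sort key, outer = 5 versus
+ * inner = 3 compares greater than zero, which the state machine
+ * takes as its cue to advance the inner side (a result less than
+ * zero advances the outer side instead).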
+ * + * MJEvalOuterValues and MJEvalInnerValues must already have been called + * for the current outer and inner tuples, respectively. + */ +static int +MJCompare(MergeJoinState *mergestate) +{ + int result = 0; + bool nulleqnull = false; + ExprContext *econtext = mergestate->js.ps.ps_ExprContext; + int i; + MemoryContext oldContext; + + /* + * Call the comparison functions in short-lived context, in case they leak + * memory. + */ + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + for (i = 0; i < mergestate->mj_NumClauses; i++) + { + MergeJoinClause clause = &mergestate->mj_Clauses[i]; + + /* + * Special case for NULL-vs-NULL, else use standard comparison. + */ + if (clause->lisnull && clause->risnull) + { + nulleqnull = true; /* NULL "=" NULL */ + continue; + } + + result = ApplySortComparator(clause->ldatum, clause->lisnull, + clause->rdatum, clause->risnull, + &clause->ssup); + + if (result != 0) + break; + } + + /* + * If we had any NULL-vs-NULL inputs, we do not want to report that the + * tuples are equal. Instead, if result is still 0, change it to +1. This + * will result in advancing the inner side of the join. + * + * Likewise, if there was a constant-false joinqual, do not report + * equality. We have to check this as part of the mergequals, else the + * rescan logic will do the wrong thing. + */ + if (result == 0 && + (nulleqnull || mergestate->mj_ConstFalseJoin)) + result = 1; + + MemoryContextSwitchTo(oldContext); + + return result; +} + + +/* + * Generate a fake join tuple with nulls for the inner tuple, + * and return it if it passes the non-join quals. + */ +static TupleTableSlot * +MJFillOuter(MergeJoinState *node) +{ + ExprContext *econtext = node->js.ps.ps_ExprContext; + ExprState *otherqual = node->js.ps.qual; + + ResetExprContext(econtext); + + econtext->ecxt_outertuple = node->mj_OuterTupleSlot; + econtext->ecxt_innertuple = node->mj_NullInnerTupleSlot; + + if (ExecQual(otherqual, econtext)) + { + /* + * qualification succeeded. now form the desired projection tuple and + * return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning outer fill tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + + return NULL; +} + +/* + * Generate a fake join tuple with nulls for the outer tuple, + * and return it if it passes the non-join quals. + */ +static TupleTableSlot * +MJFillInner(MergeJoinState *node) +{ + ExprContext *econtext = node->js.ps.ps_ExprContext; + ExprState *otherqual = node->js.ps.qual; + + ResetExprContext(econtext); + + econtext->ecxt_outertuple = node->mj_NullOuterTupleSlot; + econtext->ecxt_innertuple = node->mj_InnerTupleSlot; + + if (ExecQual(otherqual, econtext)) + { + /* + * qualification succeeded. now form the desired projection tuple and + * return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning inner fill tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + + return NULL; +} + + +/* + * Check that a qual condition is constant true or constant false. + * If it is constant false (or null), set *is_const_false to true. + * + * Constant true would normally be represented by a NIL list, but we allow an + * actual bool Const as well. We do expect that the planner will have thrown + * away any non-constant terms that have been ANDed with a constant false. 
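+ * For example, a list holding a single bool Const with value false
+ * (or a null Const) sets *is_const_false; a list containing any
+ * non-Const node makes us return false, meaning the qual is not
+ * constant after all.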
+ */ +static bool +check_constant_qual(List *qual, bool *is_const_false) +{ + ListCell *lc; + + foreach(lc, qual) + { + Const *con = (Const *) lfirst(lc); + + if (!con || !IsA(con, Const)) + return false; + if (con->constisnull || !DatumGetBool(con->constvalue)) + *is_const_false = true; + } + return true; +} + + +/* ---------------------------------------------------------------- + * ExecMergeTupleDump + * + * This function is called through the MJ_dump() macro + * when EXEC_MERGEJOINDEBUG is defined + * ---------------------------------------------------------------- + */ +#ifdef EXEC_MERGEJOINDEBUG + +static void +ExecMergeTupleDumpOuter(MergeJoinState *mergestate) +{ + TupleTableSlot *outerSlot = mergestate->mj_OuterTupleSlot; + + printf("==== outer tuple ====\n"); + if (TupIsNull(outerSlot)) + printf("(nil)\n"); + else + MJ_debugtup(outerSlot); +} + +static void +ExecMergeTupleDumpInner(MergeJoinState *mergestate) +{ + TupleTableSlot *innerSlot = mergestate->mj_InnerTupleSlot; + + printf("==== inner tuple ====\n"); + if (TupIsNull(innerSlot)) + printf("(nil)\n"); + else + MJ_debugtup(innerSlot); +} + +static void +ExecMergeTupleDumpMarked(MergeJoinState *mergestate) +{ + TupleTableSlot *markedSlot = mergestate->mj_MarkedTupleSlot; + + printf("==== marked tuple ====\n"); + if (TupIsNull(markedSlot)) + printf("(nil)\n"); + else + MJ_debugtup(markedSlot); +} + +static void +ExecMergeTupleDump(MergeJoinState *mergestate) +{ + printf("******** ExecMergeTupleDump ********\n"); + + ExecMergeTupleDumpOuter(mergestate); + ExecMergeTupleDumpInner(mergestate); + ExecMergeTupleDumpMarked(mergestate); + + printf("********\n"); +} +#endif + +/* ---------------------------------------------------------------- + * ExecMergeJoin + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecMergeJoin(PlanState *pstate) +{ + MergeJoinState *node = castNode(MergeJoinState, pstate); + ExprState *joinqual; + ExprState *otherqual; + bool qualResult; + int compareResult; + PlanState *innerPlan; + TupleTableSlot *innerTupleSlot; + PlanState *outerPlan; + TupleTableSlot *outerTupleSlot; + ExprContext *econtext; + bool doFillOuter; + bool doFillInner; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from node + */ + innerPlan = innerPlanState(node); + outerPlan = outerPlanState(node); + econtext = node->js.ps.ps_ExprContext; + joinqual = node->js.joinqual; + otherqual = node->js.ps.qual; + doFillOuter = node->mj_FillOuter; + doFillInner = node->mj_FillInner; + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * ok, everything is setup.. let's go to work + */ + for (;;) + { + MJ_dump(node); + + /* + * get the current state of the join and do things accordingly. + */ + switch (node->mj_JoinState) + { + /* + * EXEC_MJ_INITIALIZE_OUTER means that this is the first time + * ExecMergeJoin() has been called and so we have to fetch the + * first matchable tuple for both outer and inner subplans. We + * do the outer side in INITIALIZE_OUTER state, then advance + * to INITIALIZE_INNER state for the inner subplan. 
+ */ + case EXEC_MJ_INITIALIZE_OUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_INITIALIZE_OUTER\n"); + + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* OK to go get the first inner tuple */ + node->mj_JoinState = EXEC_MJ_INITIALIZE_INNER; + break; + case MJEVAL_NONMATCHABLE: + /* Stay in same state to fetch next outer tuple */ + if (doFillOuter) + { + /* + * Generate a fake join tuple with nulls for the + * inner tuple, and return it if it passes the + * non-join quals. + */ + TupleTableSlot *result; + + result = MJFillOuter(node); + if (result) + return result; + } + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: nothing in outer subplan\n"); + if (doFillInner) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. We set MatchedInner = true to + * force the ENDOUTER state to advance inner. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + node->mj_MatchedInner = true; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + case EXEC_MJ_INITIALIZE_INNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_INITIALIZE_INNER\n"); + + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * OK, we have the initial tuples. Begin by skipping + * non-matching tuples. + */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + /* Stay in same state to fetch next inner tuple */ + if (doFillInner) + { + /* + * Generate a fake join tuple with nulls for the + * outer tuple, and return it if it passes the + * non-join quals. + */ + TupleTableSlot *result; + + result = MJFillInner(node); + if (result) + return result; + } + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: nothing in inner subplan\n"); + if (doFillOuter) + { + /* + * Need to emit left-join tuples for all outer + * tuples, including the one we just fetched. We + * set MatchedOuter = false to force the ENDINNER + * state to emit first tuple before advancing + * outer. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + node->mj_MatchedOuter = false; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_JOINTUPLES means we have two tuples which satisfied + * the merge clause so we join them and then proceed to get + * the next inner tuple (EXEC_MJ_NEXTINNER). + */ + case EXEC_MJ_JOINTUPLES: + MJ_printf("ExecMergeJoin: EXEC_MJ_JOINTUPLES\n"); + + /* + * Set the next state machine state. The right things will + * happen whether we return this join tuple or just fall + * through to continue the state machine execution. + */ + node->mj_JoinState = EXEC_MJ_NEXTINNER; + + /* + * Check the extra qual conditions to see if we actually want + * to return this join tuple. If not, can proceed with merge. + * We must distinguish the additional joinquals (which must + * pass to consider the tuples "matched" for outer-join logic) + * from the otherquals (which must pass before we actually + * return the tuple). + * + * We don't bother with a ResetExprContext here, on the + * assumption that we just did one while checking the merge + * qual. 
One per tuple should be sufficient. We do have to + * set up the econtext links to the tuples for ExecQual to + * use. + */ + outerTupleSlot = node->mj_OuterTupleSlot; + econtext->ecxt_outertuple = outerTupleSlot; + innerTupleSlot = node->mj_InnerTupleSlot; + econtext->ecxt_innertuple = innerTupleSlot; + + qualResult = (joinqual == NULL || + ExecQual(joinqual, econtext)); + MJ_DEBUG_QUAL(joinqual, qualResult); + + if (qualResult) + { + node->mj_MatchedOuter = true; + node->mj_MatchedInner = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + + qualResult = (otherqual == NULL || + ExecQual(otherqual, econtext)); + MJ_DEBUG_QUAL(otherqual, qualResult); + + if (qualResult) + { + /* + * qualification succeeded. now form the desired + * projection tuple and return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + break; + + /* + * EXEC_MJ_NEXTINNER means advance the inner scan to the next + * tuple. If the tuple is not nil, we then proceed to test it + * against the join qualification. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. + */ + case EXEC_MJ_NEXTINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTINNER\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* + * now we get the next inner tuple, if any. If there's none, + * advance to next outer tuple (which may be able to join to + * previously marked tuples). + * + * NB: must NOT do "extraMarks" here, since we may need to + * return to previously marked tuples. + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * Test the new inner tuple to see if it matches + * outer. + * + * If they do match, then we join them and move on to + * the next inner tuple (EXEC_MJ_JOINTUPLES). + * + * If they do not match then advance to next outer + * tuple. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + else if (compareResult < 0) + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + else /* compareResult > 0 should not happen */ + elog(ERROR, "mergejoin input data is out of order"); + break; + case MJEVAL_NONMATCHABLE: + + /* + * It contains a NULL and hence can't match any outer + * tuple, so we can skip the comparison and assume the + * new tuple is greater than current outer. + */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + + /* + * No more inner tuples. 
However, this might be only + * effective and not physical end of inner plan, so + * force mj_InnerTupleSlot to null to make sure we + * don't fetch more inner tuples. (We need this hack + * because we are not transiting to a state where the + * inner plan is assumed to be exhausted.) + */ + node->mj_InnerTupleSlot = NULL; + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + break; + + /*------------------------------------------- + * EXEC_MJ_NEXTOUTER means + * + * outer inner + * outer tuple - 5 5 - marked tuple + * 5 5 + * 6 6 - inner tuple + * 7 7 + * + * we know we just bumped into the + * first inner tuple > current outer tuple (or possibly + * the end of the inner stream) + * so get a new outer tuple and then + * proceed to test it against the marked tuple + * (EXEC_MJ_TESTOUTER) + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + *------------------------------------------------ + */ + case EXEC_MJ_NEXTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTOUTER\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the marked tuple */ + node->mj_JoinState = EXEC_MJ_TESTOUTER; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /*-------------------------------------------------------- + * EXEC_MJ_TESTOUTER If the new outer tuple and the marked + * tuple satisfy the merge clause then we know we have + * duplicates in the outer scan so we have to restore the + * inner scan to the marked tuple and proceed to join the + * new outer tuple with the inner tuples. + * + * This is the case when + * outer inner + * 4 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 5 5 + * 6 8 - inner tuple + * 7 12 + * + * new outer tuple == marked tuple + * + * If the outer tuple fails the test, then we are done + * with the marked tuples, and we have to look for a + * match to the current inner tuple. So we will + * proceed to skip outer tuples until outer >= inner + * (EXEC_MJ_SKIP_TEST). + * + * This is the case when + * + * outer inner + * 5 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 6 8 - inner tuple + * 7 12 + * + * new outer tuple > marked tuple + * + *--------------------------------------------------------- + */ + case EXEC_MJ_TESTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_TESTOUTER\n"); + + /* + * Here we must compare the outer tuple with the marked inner + * tuple. 
(We can ignore the result of MJEvalInnerValues, + * since the marked inner tuple is certainly matchable.) + */ + innerTupleSlot = node->mj_MarkedTupleSlot; + (void) MJEvalInnerValues(node, innerTupleSlot); + + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + /* + * the merge clause matched so now we restore the inner + * scan position to the first mark, and go join that tuple + * (and any following ones) to the new outer. + * + * If we were able to determine mark and restore are not + * needed, then we don't have to back up; the current + * inner is already the first possible match. + * + * NOTE: we do not need to worry about the MatchedInner + * state for the rescanned inner tuples. We know all of + * them will match this new outer tuple and therefore + * won't be emitted as fill tuples. This works *only* + * because we require the extra joinquals to be constant + * when doing a right or full join --- otherwise some of + * the rescanned tuples might fail the extra joinquals. + * This obviously won't happen for a constant-true extra + * joinqual, while the constant-false case is handled by + * forcing the merge clause to never match, so we never + * get here. + */ + if (!node->mj_SkipMarkRestore) + { + ExecRestrPos(innerPlan); + + /* + * ExecRestrPos probably should give us back a new + * Slot, but since it doesn't, use the marked slot. + * (The previously returned mj_InnerTupleSlot cannot + * be assumed to hold the required tuple.) + */ + node->mj_InnerTupleSlot = innerTupleSlot; + /* we need not do MJEvalInnerValues again */ + } + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else if (compareResult > 0) + { + /* ---------------- + * if the new outer tuple didn't match the marked inner + * tuple then we have a case like: + * + * outer inner + * 4 4 - marked tuple + * new outer - 5 4 + * 6 5 - inner tuple + * 7 + * + * which means that all subsequent outer tuples will be + * larger than our marked inner tuples. So we need not + * revisit any of the marked tuples but can proceed to + * look for a match to the current inner. If there's + * no more inners, no more matches are possible. + * ---------------- + */ + innerTupleSlot = node->mj_InnerTupleSlot; + + /* reload comparison data for current inner */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the + * outer. + */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + if (doFillOuter) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + } + else /* compareResult < 0 should not happen */ + elog(ERROR, "mergejoin input data is out of order"); + break; + + /*---------------------------------------------------------- + * EXEC_MJ_SKIP means compare tuples and if they do not + * match, skip whichever is lesser. + * + * For example: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 6 8 - inner tuple + * 7 12 + * 8 14 + * + * we have to advance the outer scan + * until we find the outer 8. 
+ * + * On the other hand: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 12 8 - inner tuple + * 14 10 + * 17 12 + * + * we have to advance the inner scan + * until we find the inner 12. + *---------------------------------------------------------- + */ + case EXEC_MJ_SKIP_TEST: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIP_TEST\n"); + + /* + * before we advance, make sure the current tuples do not + * satisfy the mergeclauses. If they do, then we update the + * marked tuple position and go join them. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + if (!node->mj_SkipMarkRestore) + ExecMarkPos(innerPlan); + + MarkInnerTuple(node->mj_InnerTupleSlot, node); + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else if (compareResult < 0) + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + else + /* compareResult > 0 */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + + /* + * SKIPOUTER_ADVANCE: advance over an outer tuple that is + * known not to join to any inner tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + */ + case EXEC_MJ_SKIPOUTER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPOUTER_ADVANCE\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the current inner */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * SKIPINNER_ADVANCE: advance over an inner tuple that is + * known not to join to any outer tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. + */ + case EXEC_MJ_SKIPINNER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPINNER_ADVANCE\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. 
+ */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the outer. + */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + outerTupleSlot = node->mj_OuterTupleSlot; + if (doFillOuter && !TupIsNull(outerTupleSlot)) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_ENDOUTER means we have run out of outer tuples, but + * are doing a right/full join and therefore must null-fill + * any remaining unmatched inner tuples. + */ + case EXEC_MJ_ENDOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDOUTER\n"); + + Assert(doFillInner); + + if (!node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + if (TupIsNull(innerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + return NULL; + } + + /* Else remain in ENDOUTER state and process next tuple. */ + break; + + /* + * EXEC_MJ_ENDINNER means we have run out of inner tuples, but + * are doing a left/full join and therefore must null- fill + * any remaining unmatched outer tuples. + */ + case EXEC_MJ_ENDINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDINNER\n"); + + Assert(doFillOuter); + + if (!node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + if (TupIsNull(outerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + return NULL; + } + + /* Else remain in ENDINNER state and process next tuple. */ + break; + + /* + * broken state value? 
+ */ + default: + elog(ERROR, "unrecognized mergejoin state: %d", + (int) node->mj_JoinState); + } + } +} + +/* ---------------------------------------------------------------- + * ExecInitMergeJoin + * ---------------------------------------------------------------- + */ +MergeJoinState * +ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) +{ + MergeJoinState *mergestate; + TupleDesc outerDesc, + innerDesc; + const TupleTableSlotOps *innerOps; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + MJ1_printf("ExecInitMergeJoin: %s\n", + "initializing node"); + + /* + * create state structure + */ + mergestate = makeNode(MergeJoinState); + mergestate->js.ps.plan = (Plan *) node; + mergestate->js.ps.state = estate; + mergestate->js.ps.ExecProcNode = ExecMergeJoin; + mergestate->js.jointype = node->join.jointype; + mergestate->mj_ConstFalseJoin = false; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &mergestate->js.ps); + + /* + * we need two additional econtexts in which we can compute the join + * expressions from the left and right input tuples. The node's regular + * econtext won't do because it gets reset too often. + */ + mergestate->mj_OuterEContext = CreateExprContext(estate); + mergestate->mj_InnerEContext = CreateExprContext(estate); + + /* + * initialize child nodes + * + * inner child must support MARK/RESTORE, unless we have detected that we + * don't need that. Note that skip_mark_restore must never be set if + * there are non-mergeclause joinquals, since the logic wouldn't work. + */ + Assert(node->join.joinqual == NIL || !node->skip_mark_restore); + mergestate->mj_SkipMarkRestore = node->skip_mark_restore; + + outerPlanState(mergestate) = ExecInitNode(outerPlan(node), estate, eflags); + outerDesc = ExecGetResultType(outerPlanState(mergestate)); + innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate, + mergestate->mj_SkipMarkRestore ? + eflags : + (eflags | EXEC_FLAG_MARK)); + innerDesc = ExecGetResultType(innerPlanState(mergestate)); + + /* + * For certain types of inner child nodes, it is advantageous to issue + * MARK every time we advance past an inner tuple we will never return to. + * For other types, MARK on a tuple we cannot return to is a waste of + * cycles. Detect which case applies and set mj_ExtraMarks if we want to + * issue "unnecessary" MARK calls. + * + * Currently, only Material wants the extra MARKs, and it will be helpful + * only if eflags doesn't specify REWIND. + * + * Note that for IndexScan and IndexOnlyScan, it is *necessary* that we + * not set mj_ExtraMarks; otherwise we might attempt to set a mark before + * the first inner tuple, which they do not support. + */ + if (IsA(innerPlan(node), Material) && + (eflags & EXEC_FLAG_REWIND) == 0 && + !mergestate->mj_SkipMarkRestore) + mergestate->mj_ExtraMarks = true; + else + mergestate->mj_ExtraMarks = false; + + /* + * Initialize result slot, type and projection. 
+ */ + ExecInitResultTupleSlotTL(&mergestate->js.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&mergestate->js.ps, NULL); + + /* + * tuple table initialization + */ + innerOps = ExecGetResultSlotOps(innerPlanState(mergestate), NULL); + mergestate->mj_MarkedTupleSlot = ExecInitExtraTupleSlot(estate, innerDesc, + innerOps); + + /* + * initialize child expressions + */ + mergestate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) mergestate); + mergestate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) mergestate); + /* mergeclauses are handled below */ + + /* + * detect whether we need only consider the first matching inner tuple + */ + mergestate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = false; + break; + case JOIN_LEFT: + case JOIN_ANTI: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = false; + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + break; + case JOIN_RIGHT: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. + */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("RIGHT JOIN is only supported with merge-joinable join conditions"))); + break; + case JOIN_FULL: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, outerDesc, &TTSOpsVirtual); + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, innerDesc, &TTSOpsVirtual); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. + */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("FULL JOIN is only supported with merge-joinable join conditions"))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * preprocess the merge clauses + */ + mergestate->mj_NumClauses = list_length(node->mergeclauses); + mergestate->mj_Clauses = MJExamineQuals(node->mergeclauses, + node->mergeFamilies, + node->mergeCollations, + node->mergeStrategies, + node->mergeNullsFirst, + (PlanState *) mergestate); + + /* + * initialize join state + */ + mergestate->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; + mergestate->mj_MatchedOuter = false; + mergestate->mj_MatchedInner = false; + mergestate->mj_OuterTupleSlot = NULL; + mergestate->mj_InnerTupleSlot = NULL; + + /* + * initialization successful + */ + MJ1_printf("ExecInitMergeJoin: %s\n", + "node initialized"); + + return mergestate; +} + +/* ---------------------------------------------------------------- + * ExecEndMergeJoin + * + * old comments + * frees storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndMergeJoin(MergeJoinState *node) +{ + MJ1_printf("ExecEndMergeJoin: %s\n", + "ending node processing"); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->js.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->js.ps.ps_ResultTupleSlot); + ExecClearTuple(node->mj_MarkedTupleSlot); + + /* + * shut down the subplans + */ + ExecEndNode(innerPlanState(node)); + ExecEndNode(outerPlanState(node)); + + MJ1_printf("ExecEndMergeJoin: %s\n", + "node processing ended"); +} + +void +ExecReScanMergeJoin(MergeJoinState *node) +{ + ExecClearTuple(node->mj_MarkedTupleSlot); + + node->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; + node->mj_MatchedOuter = false; + node->mj_MatchedInner = false; + node->mj_OuterTupleSlot = NULL; + node->mj_InnerTupleSlot = NULL; + + /* + * if chgParam of subnodes is not null then plans will be re-scanned by + * first ExecProcNode. + */ + if (node->js.ps.lefttree->chgParam == NULL) + ExecReScan(node->js.ps.lefttree); + if (node->js.ps.righttree->chgParam == NULL) + ExecReScan(node->js.ps.righttree); + +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c new file mode 100644 index 0000000..1e79d18 --- /dev/null +++ b/src/backend/executor/nodeModifyTable.c @@ -0,0 +1,3243 @@ +/*------------------------------------------------------------------------- + * + * nodeModifyTable.c + * routines to handle ModifyTable nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeModifyTable.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitModifyTable - initialize the ModifyTable node + * ExecModifyTable - retrieve the next tuple from the node + * ExecEndModifyTable - shut down the ModifyTable node + * ExecReScanModifyTable - rescan the ModifyTable node + * + * NOTES + * The ModifyTable node receives input from its outerPlan, which is + * the data to insert for INSERT cases, or the changed columns' new + * values plus row-locating info for UPDATE cases, or just the + * row-locating info for DELETE cases. + * + * If the query specifies RETURNING, then the ModifyTable returns a + * RETURNING tuple after completing each row insert, update, or delete. + * It must be called again to continue the operation. Without RETURNING, + * we just loop within the node until all the work is done, then + * return NULL. This avoids useless call/return overhead. 
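+ *
+ * (Illustration, with hypothetical names: "UPDATE t SET v = v + 1
+ * RETURNING *" causes this node to return one projected tuple per
+ * modified row, one per call; the same UPDATE without RETURNING is
+ * driven to completion within a single call and returns NULL.)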
+ */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "commands/trigger.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "executor/nodeModifyTable.h" +#include "foreign/fdwapi.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "rewrite/rewriteHandler.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +typedef struct MTTargetRelLookup +{ + Oid relationOid; /* hash key, must be first */ + int relationIndex; /* rel's index in resultRelInfo[] array */ +} MTTargetRelLookup; + +static void ExecBatchInsert(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + TupleTableSlot **slots, + TupleTableSlot **planSlots, + int numSlots, + EState *estate, + bool canSetTag); +static bool ExecOnConflictUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer conflictTid, + TupleTableSlot *planSlot, + TupleTableSlot *excludedSlot, + EState *estate, + bool canSetTag, + TupleTableSlot **returning); +static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot, + ResultRelInfo **partRelInfo); + +/* + * Verify that the tuples to be produced by INSERT match the + * target relation's rowtype + * + * We do this to guard against stale plans. If plan invalidation is + * functioning properly then we should never get a failure here, but better + * safe than sorry. Note that this is called after we have obtained lock + * on the target rel, so the rowtype can't change underneath us. + * + * The plan output is represented by its targetlist, because that makes + * handling the dropped-column case easier. + * + * We used to use this for UPDATE as well, but now the equivalent checks + * are done in ExecBuildUpdateProjection. + */ +static void +ExecCheckPlanOutput(Relation resultRel, List *targetList) +{ + TupleDesc resultDesc = RelationGetDescr(resultRel); + int attno = 0; + ListCell *lc; + + foreach(lc, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + Form_pg_attribute attr; + + Assert(!tle->resjunk); /* caller removed junk items already */ + + if (attno >= resultDesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query has too many columns."))); + attr = TupleDescAttr(resultDesc, attno); + attno++; + + if (!attr->attisdropped) + { + /* Normal case: demand type match */ + if (exprType((Node *) tle->expr) != attr->atttypid) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Table has type %s at ordinal position %d, but query expects %s.", + format_type_be(attr->atttypid), + attno, + format_type_be(exprType((Node *) tle->expr))))); + } + else + { + /* + * For a dropped column, we can't check atttypid (it's likely 0). + * In any case the planner has most likely inserted an INT4 null. + * What we insist on is just *some* NULL constant. 
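+ * (Such entries arise when the target table has dropped columns,
+ * e.g. after an illustrative "ALTER TABLE t DROP COLUMN c"; the
+ * plan's targetlist still carries a placeholder at that position,
+ * and all we require is that it be some NULL constant.)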
+ */ + if (!IsA(tle->expr, Const) || + !((Const *) tle->expr)->constisnull) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query provides a value for a dropped column at ordinal position %d.", + attno))); + } + } + if (attno != resultDesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("table row type and query-specified row type do not match"), + errdetail("Query has too few columns."))); +} + +/* + * ExecProcessReturning --- evaluate a RETURNING list + * + * resultRelInfo: current result rel + * tupleSlot: slot holding tuple actually inserted/updated/deleted + * planSlot: slot holding tuple returned by top subplan node + * + * Note: If tupleSlot is NULL, the FDW should have already provided econtext's + * scan tuple. + * + * Returns a slot holding the result tuple + */ +static TupleTableSlot * +ExecProcessReturning(ResultRelInfo *resultRelInfo, + TupleTableSlot *tupleSlot, + TupleTableSlot *planSlot) +{ + ProjectionInfo *projectReturning = resultRelInfo->ri_projectReturning; + ExprContext *econtext = projectReturning->pi_exprContext; + + /* Make tuple and any needed join variables available to ExecProject */ + if (tupleSlot) + econtext->ecxt_scantuple = tupleSlot; + econtext->ecxt_outertuple = planSlot; + + /* + * RETURNING expressions might reference the tableoid column, so + * reinitialize tts_tableOid before evaluating them. + */ + econtext->ecxt_scantuple->tts_tableOid = + RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* Compute the RETURNING expressions */ + return ExecProject(projectReturning); +} + +/* + * ExecCheckTupleVisible -- verify tuple is visible + * + * It would not be consistent with guarantees of the higher isolation levels to + * proceed with avoiding insertion (taking speculative insertion's alternative + * path) on the basis of another tuple that is not visible to MVCC snapshot. + * Check for the need to raise a serialization failure, and do so as necessary. + */ +static void +ExecCheckTupleVisible(EState *estate, + Relation rel, + TupleTableSlot *slot) +{ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) + { + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + /* + * We should not raise a serialization failure if the conflict is + * against a tuple inserted by our own transaction, even if it's not + * visible to our snapshot. (This would happen, for example, if + * conflicting keys are proposed for insertion in a single command.) 
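+ *
+ * Given a unique constraint on "k" (names illustrative), a single
+ * command such as
+ *     INSERT INTO t (k) VALUES (1), (1) ON CONFLICT (k) DO NOTHING;
+ * is one way to hit that case: the second row's conflict is against
+ * a tuple our own transaction just inserted, and must not be turned
+ * into a serialization failure.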
+ */ + if (!TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } +} + +/* + * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() + */ +static void +ExecCheckTIDVisible(EState *estate, + ResultRelInfo *relinfo, + ItemPointer tid, + TupleTableSlot *tempSlot) +{ + Relation rel = relinfo->ri_RelationDesc; + + /* Redundantly check isolation level */ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot)) + elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); + ExecCheckTupleVisible(estate, rel, tempSlot); + ExecClearTuple(tempSlot); +} + +/* + * Compute stored generated columns for a tuple + */ +void +ExecComputeStoredGenerated(ResultRelInfo *resultRelInfo, + EState *estate, TupleTableSlot *slot, + CmdType cmdtype) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + TupleDesc tupdesc = RelationGetDescr(rel); + int natts = tupdesc->natts; + MemoryContext oldContext; + Datum *values; + bool *nulls; + + Assert(tupdesc->constr && tupdesc->constr->has_generated_stored); + + /* + * If first time through for this result relation, build expression + * nodetrees for rel's stored generation expressions. Keep them in the + * per-query memory context so they'll survive throughout the query. + */ + if (resultRelInfo->ri_GeneratedExprs == NULL) + { + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + + resultRelInfo->ri_GeneratedExprs = + (ExprState **) palloc(natts * sizeof(ExprState *)); + resultRelInfo->ri_NumGeneratedNeeded = 0; + + for (int i = 0; i < natts; i++) + { + if (TupleDescAttr(tupdesc, i)->attgenerated == ATTRIBUTE_GENERATED_STORED) + { + Expr *expr; + + /* + * If it's an update and the current column was not marked as + * being updated, then we can skip the computation. But if + * there is a BEFORE ROW UPDATE trigger, we cannot skip + * because the trigger might affect additional columns. + */ + if (cmdtype == CMD_UPDATE && + !(rel->trigdesc && rel->trigdesc->trig_update_before_row) && + !bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, + ExecGetExtraUpdatedCols(resultRelInfo, estate))) + { + resultRelInfo->ri_GeneratedExprs[i] = NULL; + continue; + } + + expr = (Expr *) build_column_default(rel, i + 1); + if (expr == NULL) + elog(ERROR, "no generation expression found for column number %d of table \"%s\"", + i + 1, RelationGetRelationName(rel)); + + resultRelInfo->ri_GeneratedExprs[i] = ExecPrepareExpr(expr, estate); + resultRelInfo->ri_NumGeneratedNeeded++; + } + } + + MemoryContextSwitchTo(oldContext); + } + + /* + * If no generated columns have been affected by this change, then skip + * the rest. 
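+ *
+ * (For context, a stored generated column is declared along the lines
+ * of "b int GENERATED ALWAYS AS (a * 2) STORED"; names are
+ * illustrative. Per the loop above, on UPDATE such a column needs
+ * recomputation only if a column it depends on was updated or a
+ * BEFORE ROW UPDATE trigger could have changed the row.)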
+ */ + if (resultRelInfo->ri_NumGeneratedNeeded == 0) + return; + + oldContext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + values = palloc(sizeof(*values) * natts); + nulls = palloc(sizeof(*nulls) * natts); + + slot_getallattrs(slot); + memcpy(nulls, slot->tts_isnull, sizeof(*nulls) * natts); + + for (int i = 0; i < natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + + if (attr->attgenerated == ATTRIBUTE_GENERATED_STORED && + resultRelInfo->ri_GeneratedExprs[i]) + { + ExprContext *econtext; + Datum val; + bool isnull; + + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + + val = ExecEvalExpr(resultRelInfo->ri_GeneratedExprs[i], econtext, &isnull); + + /* + * We must make a copy of val as we have no guarantees about where + * memory for a pass-by-reference Datum is located. + */ + if (!isnull) + val = datumCopy(val, attr->attbyval, attr->attlen); + + values[i] = val; + nulls[i] = isnull; + } + else + { + if (!nulls[i]) + values[i] = datumCopy(slot->tts_values[i], attr->attbyval, attr->attlen); + } + } + + ExecClearTuple(slot); + memcpy(slot->tts_values, values, sizeof(*values) * natts); + memcpy(slot->tts_isnull, nulls, sizeof(*nulls) * natts); + ExecStoreVirtualTuple(slot); + ExecMaterializeSlot(slot); + + MemoryContextSwitchTo(oldContext); +} + +/* + * ExecInitInsertProjection + * Do one-time initialization of projection data for INSERT tuples. + * + * INSERT queries may need a projection to filter out junk attrs in the tlist. + * + * This is also a convenient place to verify that the + * output of an INSERT matches the target table. + */ +static void +ExecInitInsertProjection(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Plan *subplan = outerPlan(node); + EState *estate = mtstate->ps.state; + List *insertTargetList = NIL; + bool need_projection = false; + ListCell *l; + + /* Extract non-junk columns of the subplan's result tlist. */ + foreach(l, subplan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (!tle->resjunk) + insertTargetList = lappend(insertTargetList, tle); + else + need_projection = true; + } + + /* + * The junk-free list must produce a tuple suitable for the result + * relation. + */ + ExecCheckPlanOutput(resultRelInfo->ri_RelationDesc, insertTargetList); + + /* We'll need a slot matching the table's format. */ + resultRelInfo->ri_newTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + + /* Build ProjectionInfo if needed (it probably isn't). */ + if (need_projection) + { + TupleDesc relDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + + /* need an expression context to do the projection */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + + resultRelInfo->ri_projectNew = + ExecBuildProjectionInfo(insertTargetList, + mtstate->ps.ps_ExprContext, + resultRelInfo->ri_newTupleSlot, + &mtstate->ps, + relDesc); + } + + resultRelInfo->ri_projectNewInfoValid = true; +} + +/* + * ExecInitUpdateProjection + * Do one-time initialization of projection data for UPDATE tuples. + * + * UPDATE always needs a projection, because (1) there's always some junk + * attrs, and (2) we may need to merge values of not-updated columns from + * the old tuple into the final tuple. In UPDATE, the tuple arriving from + * the subplan contains only new values for the changed columns, plus row + * identity info in the junk attrs. 
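+ *
+ * (Illustration, names hypothetical: for "UPDATE t SET b = b + 1", the
+ * subplan emits only the new value of "b" plus row-identity junk such
+ * as "ctid"; the projection built here merges that with the unchanged
+ * columns of the old tuple to form the complete new tuple.)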
+ * + * This is "one-time" for any given result rel, but we might touch more than + * one result rel in the course of an inherited UPDATE, and each one needs + * its own projection due to possible column order variation. + * + * This is also a convenient place to verify that the output of an UPDATE + * matches the target table (ExecBuildUpdateProjection does that). + */ +static void +ExecInitUpdateProjection(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Plan *subplan = outerPlan(node); + EState *estate = mtstate->ps.state; + TupleDesc relDesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + int whichrel; + List *updateColnos; + + /* + * Usually, mt_lastResultIndex matches the target rel. If it happens not + * to, we can get the index the hard way with an integer division. + */ + whichrel = mtstate->mt_lastResultIndex; + if (resultRelInfo != mtstate->resultRelInfo + whichrel) + { + whichrel = resultRelInfo - mtstate->resultRelInfo; + Assert(whichrel >= 0 && whichrel < mtstate->mt_nrels); + } + + updateColnos = (List *) list_nth(node->updateColnosLists, whichrel); + + /* + * For UPDATE, we use the old tuple to fill up missing values in the tuple + * produced by the subplan to get the new tuple. We need two slots, both + * matching the table's desired format. + */ + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + resultRelInfo->ri_newTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + + /* need an expression context to do the projection */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + + resultRelInfo->ri_projectNew = + ExecBuildUpdateProjection(subplan->targetlist, + false, /* subplan did the evaluation */ + updateColnos, + relDesc, + mtstate->ps.ps_ExprContext, + resultRelInfo->ri_newTupleSlot, + &mtstate->ps); + + resultRelInfo->ri_projectNewInfoValid = true; +} + +/* + * ExecGetInsertNewTuple + * This prepares a "new" tuple ready to be inserted into given result + * relation, by removing any junk columns of the plan's output tuple + * and (if necessary) coercing the tuple to the right tuple format. + */ +static TupleTableSlot * +ExecGetInsertNewTuple(ResultRelInfo *relinfo, + TupleTableSlot *planSlot) +{ + ProjectionInfo *newProj = relinfo->ri_projectNew; + ExprContext *econtext; + + /* + * If there's no projection to be done, just make sure the slot is of the + * right type for the target rel. If the planSlot is the right type we + * can use it as-is, else copy the data into ri_newTupleSlot. + */ + if (newProj == NULL) + { + if (relinfo->ri_newTupleSlot->tts_ops != planSlot->tts_ops) + { + ExecCopySlot(relinfo->ri_newTupleSlot, planSlot); + return relinfo->ri_newTupleSlot; + } + else + return planSlot; + } + + /* + * Else project; since the projection output slot is ri_newTupleSlot, this + * will also fix any slot-type problem. + * + * Note: currently, this is dead code, because INSERT cases don't receive + * any junk columns so there's never a projection to be done. + */ + econtext = newProj->pi_exprContext; + econtext->ecxt_outertuple = planSlot; + return ExecProject(newProj); +} + +/* + * ExecGetUpdateNewTuple + * This prepares a "new" tuple by combining an UPDATE subplan's output + * tuple (which contains values of changed columns) with unchanged + * columns taken from the old tuple. 
+ * + * The subplan tuple might also contain junk columns, which are ignored. + * Note that the projection also ensures we have a slot of the right type. + */ +TupleTableSlot * +ExecGetUpdateNewTuple(ResultRelInfo *relinfo, + TupleTableSlot *planSlot, + TupleTableSlot *oldSlot) +{ + ProjectionInfo *newProj = relinfo->ri_projectNew; + ExprContext *econtext; + + /* Use a few extra Asserts to protect against outside callers */ + Assert(relinfo->ri_projectNewInfoValid); + Assert(planSlot != NULL && !TTS_EMPTY(planSlot)); + Assert(oldSlot != NULL && !TTS_EMPTY(oldSlot)); + + econtext = newProj->pi_exprContext; + econtext->ecxt_outertuple = planSlot; + econtext->ecxt_scantuple = oldSlot; + return ExecProject(newProj); +} + + +/* ---------------------------------------------------------------- + * ExecInsert + * + * For INSERT, we have to insert the tuple into the target relation + * (or partition thereof) and insert appropriate tuples into the index + * relations. + * + * slot contains the new tuple value to be stored. + * planSlot is the output of the ModifyTable's subplan; we use it + * to access "junk" columns that are not going to be stored. + * + * Returns RETURNING result if any, otherwise NULL. + * + * This may change the currently active tuple conversion map in + * mtstate->mt_transition_capture, so the callers must take care to + * save the previous value to avoid losing track of it. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecInsert(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot, + EState *estate, + bool canSetTag) +{ + Relation resultRelationDesc; + List *recheckIndexes = NIL; + TupleTableSlot *result = NULL; + TransitionCaptureState *ar_insert_trig_tcs; + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + OnConflictAction onconflict = node->onConflictAction; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + MemoryContext oldContext; + + /* + * If the input result relation is a partitioned table, find the leaf + * partition to insert the tuple into. + */ + if (proute) + { + ResultRelInfo *partRelInfo; + + slot = ExecPrepareTupleRouting(mtstate, estate, proute, + resultRelInfo, slot, + &partRelInfo); + resultRelInfo = partRelInfo; + } + + ExecMaterializeSlot(slot); + + resultRelationDesc = resultRelInfo->ri_RelationDesc; + + /* + * Open the table's indexes, if we have not done so already, so that we + * can add new index entries for the inserted tuple. + */ + if (resultRelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, onconflict != ONCONFLICT_NONE); + + /* + * BEFORE ROW INSERT Triggers. + * + * Note: We fire BEFORE ROW TRIGGERS for every attempted insertion in an + * INSERT ... ON CONFLICT statement. We cannot check for constraint + * violations before firing these triggers, because they can change the + * values to insert. Also, they can run arbitrary user-defined code with + * side-effects that we can't cancel by just not inserting the tuple. 
+ */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row) + { + if (!ExecBRInsertTriggers(estate, resultRelInfo, slot)) + return NULL; /* "do nothing" */ + } + + /* INSTEAD OF ROW INSERT Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_instead_row) + { + if (!ExecIRInsertTriggers(estate, resultRelInfo, slot)) + return NULL; /* "do nothing" */ + } + else if (resultRelInfo->ri_FdwRoutine) + { + /* + * GENERATED expressions might reference the tableoid column, so + * (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_INSERT); + + /* + * If the FDW supports batching, and batching is requested, accumulate + * rows and insert them in batches. Otherwise use the per-row inserts. + */ + if (resultRelInfo->ri_BatchSize > 1) + { + /* + * If a certain number of tuples have already been accumulated, or + * a tuple has come for a different relation than that for the + * accumulated tuples, perform the batch insert + */ + if (resultRelInfo->ri_NumSlots == resultRelInfo->ri_BatchSize) + { + ExecBatchInsert(mtstate, resultRelInfo, + resultRelInfo->ri_Slots, + resultRelInfo->ri_PlanSlots, + resultRelInfo->ri_NumSlots, + estate, canSetTag); + resultRelInfo->ri_NumSlots = 0; + } + + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + + if (resultRelInfo->ri_Slots == NULL) + { + resultRelInfo->ri_Slots = palloc(sizeof(TupleTableSlot *) * + resultRelInfo->ri_BatchSize); + resultRelInfo->ri_PlanSlots = palloc(sizeof(TupleTableSlot *) * + resultRelInfo->ri_BatchSize); + } + + /* + * Initialize the batch slots. We don't know how many slots will + * be needed, so we initialize them as the batch grows, and we + * keep them across batches. To mitigate an inefficiency in how + * resource owner handles objects with many references (as with + * many slots all referencing the same tuple descriptor) we copy + * the appropriate tuple descriptor for each slot. + */ + if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized) + { + TupleDesc tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor); + TupleDesc plan_tdesc = + CreateTupleDescCopy(planSlot->tts_tupleDescriptor); + + resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] = + MakeSingleTupleTableSlot(tdesc, slot->tts_ops); + + resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] = + MakeSingleTupleTableSlot(plan_tdesc, planSlot->tts_ops); + + /* remember how many batch slots we initialized */ + resultRelInfo->ri_NumSlotsInitialized++; + } + + ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots], + slot); + + ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots], + planSlot); + + resultRelInfo->ri_NumSlots++; + + MemoryContextSwitchTo(oldContext); + + return NULL; + } + + /* + * insert into foreign table: let the FDW do it + */ + slot = resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate, + resultRelInfo, + slot, + planSlot); + + if (slot == NULL) /* "do nothing" */ + return NULL; + + /* + * AFTER ROW Triggers or RETURNING expressions might reference the + * tableoid column, so (re-)initialize tts_tableOid before evaluating + * them. (This covers the case where the FDW replaced the slot.) 
+ */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + } + else + { + WCOKind wco_kind; + + /* + * Constraints and GENERATED expressions might reference the tableoid + * column, so (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_INSERT); + + /* + * Check any RLS WITH CHECK policies. + * + * Normally we should check INSERT policies. But if the insert is the + * result of a partition key update that moved the tuple to a new + * partition, we should instead check UPDATE policies, because we are + * executing policies defined on the target table, and not those + * defined on the child partitions. + */ + wco_kind = (mtstate->operation == CMD_UPDATE) ? + WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK; + + /* + * ExecWithCheckOptions() will skip any WCOs which are not of the kind + * we are looking for at this point. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate); + + /* + * Check the constraints of the tuple. + */ + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + + /* + * Also check the tuple against the partition constraint, if there is + * one; except that if we got here via tuple-routing, we don't need to + * if there's no BR trigger defined on the partition. + */ + if (resultRelationDesc->rd_rel->relispartition && + (resultRelInfo->ri_RootResultRelInfo == NULL || + (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row))) + ExecPartitionCheck(resultRelInfo, slot, estate, true); + + if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) + { + /* Perform a speculative insertion. */ + uint32 specToken; + ItemPointerData conflictTid; + bool specConflict; + List *arbiterIndexes; + + arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes; + + /* + * Do a non-conclusive check for conflicts first. + * + * We're not holding any locks yet, so this doesn't guarantee that + * the later insert won't conflict. But it avoids leaving behind + * a lot of canceled speculative insertions, if you run a lot of + * INSERT ON CONFLICT statements that do conflict. + * + * We loop back here if we find a conflict below, either during + * the pre-check, or when we re-check after inserting the tuple + * speculatively. Better allow interrupts in case some bug makes + * this an infinite loop. + */ + vlock: + CHECK_FOR_INTERRUPTS(); + specConflict = false; + if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, + &conflictTid, arbiterIndexes)) + { + /* committed conflict tuple found */ + if (onconflict == ONCONFLICT_UPDATE) + { + /* + * In case of ON CONFLICT DO UPDATE, execute the UPDATE + * part. Be prepared to retry if the UPDATE fails because + * of another concurrent UPDATE/DELETE to the conflict + * tuple. + */ + TupleTableSlot *returning = NULL; + + if (ExecOnConflictUpdate(mtstate, resultRelInfo, + &conflictTid, planSlot, slot, + estate, canSetTag, &returning)) + { + InstrCountTuples2(&mtstate->ps, 1); + return returning; + } + else + goto vlock; + } + else + { + /* + * In case of ON CONFLICT DO NOTHING, do nothing. However, + * verify that the tuple is visible to the executor's MVCC + * snapshot at higher isolation levels. 
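+ * For instance, under REPEATABLE READ, if the conflicting row was
+ * committed by another transaction after our snapshot was taken, we
+ * must raise a serialization failure rather than silently doing
+ * nothing.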
+ * + * Using ExecGetReturningSlot() to store the tuple for the + * recheck isn't that pretty, but we can't trivially use + * the input slot, because it might not be of a compatible + * type. As there's no conflicting usage of + * ExecGetReturningSlot() in the DO NOTHING case... + */ + Assert(onconflict == ONCONFLICT_NOTHING); + ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid, + ExecGetReturningSlot(estate, resultRelInfo)); + InstrCountTuples2(&mtstate->ps, 1); + return NULL; + } + } + + /* + * Before we start insertion proper, acquire our "speculative + * insertion lock". Others can use that to wait for us to decide + * if we're going to go ahead with the insertion, instead of + * waiting for the whole transaction to complete. + */ + specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); + + /* insert the tuple, with the speculative token */ + table_tuple_insert_speculative(resultRelationDesc, slot, + estate->es_output_cid, + 0, + NULL, + specToken); + + /* insert index entries for tuple */ + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, true, + &specConflict, + arbiterIndexes); + + /* adjust the tuple's state accordingly */ + table_tuple_complete_speculative(resultRelationDesc, slot, + specToken, !specConflict); + + /* + * Wake up anyone waiting for our decision. They will re-check + * the tuple, see that it's no longer speculative, and wait on our + * XID as if this was a regularly inserted tuple all along. Or if + * we killed the tuple, they will see it's dead, and proceed as if + * the tuple never existed. + */ + SpeculativeInsertionLockRelease(GetCurrentTransactionId()); + + /* + * If there was a conflict, start from the beginning. We'll do + * the pre-check again, which will now find the conflicting tuple + * (unless it aborts before we get there). + */ + if (specConflict) + { + list_free(recheckIndexes); + goto vlock; + } + + /* Since there was no insertion conflict, we're done */ + } + else + { + /* insert the tuple normally */ + table_tuple_insert(resultRelationDesc, slot, + estate->es_output_cid, + 0, NULL); + + /* insert index entries for tuple */ + if (resultRelInfo->ri_NumIndices > 0) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, + false, NULL, NIL); + } + } + + if (canSetTag) + (estate->es_processed)++; + + /* + * If this insert is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition NEW TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_insert_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_new_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, NULL, + NULL, + slot, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * INSERT trigger fired below doesn't capture it again. + */ + ar_insert_trig_tcs = NULL; + } + + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes, + ar_insert_trig_tcs); + + list_free(recheckIndexes); + + /* + * Check any WITH CHECK OPTION constraints from parent views. We are + * required to do this after testing all constraints and uniqueness + * violations per the SQL spec, so we do it after actually inserting the + * record into the heap and all indexes. 
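+ *
+ * (Illustration with hypothetical names: given
+ *     CREATE VIEW pos_v AS SELECT * FROM t WHERE a > 0
+ *         WITH CHECK OPTION;
+ * an INSERT through pos_v that produces a row with a <= 0 reaches
+ * this point and only then fails the view's check.)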
+ * + * ExecWithCheckOptions will elog(ERROR) if a violation is found, so the + * tuple will never be seen, if it violates the WITH CHECK OPTION. + * + * ExecWithCheckOptions() will skip any WCOs which are not of the kind we + * are looking for at this point. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate); + + /* Process RETURNING if present */ + if (resultRelInfo->ri_projectReturning) + result = ExecProcessReturning(resultRelInfo, slot, planSlot); + + return result; +} + +/* ---------------------------------------------------------------- + * ExecBatchInsert + * + * Insert multiple tuples in an efficient way. + * Currently, this handles inserting into a foreign table without + * RETURNING clause. + * ---------------------------------------------------------------- + */ +static void +ExecBatchInsert(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + TupleTableSlot **slots, + TupleTableSlot **planSlots, + int numSlots, + EState *estate, + bool canSetTag) +{ + int i; + int numInserted = numSlots; + TupleTableSlot *slot = NULL; + TupleTableSlot **rslots; + + /* + * insert into foreign table: let the FDW do it + */ + rslots = resultRelInfo->ri_FdwRoutine->ExecForeignBatchInsert(estate, + resultRelInfo, + slots, + planSlots, + &numInserted); + + for (i = 0; i < numInserted; i++) + { + slot = rslots[i]; + + /* + * AFTER ROW Triggers or RETURNING expressions might reference the + * tableoid column, so (re-)initialize tts_tableOid before evaluating + * them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, slot, NIL, + mtstate->mt_transition_capture); + + /* + * Check any WITH CHECK OPTION constraints from parent views. See the + * comment in ExecInsert. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate); + } + + if (canSetTag && numInserted > 0) + estate->es_processed += numInserted; +} + +/* ---------------------------------------------------------------- + * ExecDelete + * + * DELETE is like UPDATE, except that we delete the tuple and no + * index modifications are needed. + * + * When deleting from a table, tupleid identifies the tuple to + * delete and oldtuple is NULL. When deleting from a view, + * oldtuple is passed to the INSTEAD OF triggers and identifies + * what to delete, and tupleid is invalid. When deleting from a + * foreign table, tupleid is invalid; the FDW has to figure out + * which row to delete using data from the planSlot. oldtuple is + * passed to foreign table triggers; it is NULL when the foreign + * table has no relevant triggers. We use tupleDeleted to indicate + * whether the tuple is actually deleted, callers can use it to + * decide whether to continue the operation. When this DELETE is a + * part of an UPDATE of partition-key, then the slot returned by + * EvalPlanQual() is passed back using output parameter epqslot. + * + * Returns RETURNING result if any, otherwise NULL. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecDelete(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer tupleid, + HeapTuple oldtuple, + TupleTableSlot *planSlot, + EPQState *epqstate, + EState *estate, + bool processReturning, + bool canSetTag, + bool changingPart, + bool *tupleDeleted, + TupleTableSlot **epqreturnslot) +{ + Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; + TM_Result result; + TM_FailureData tmfd; + TupleTableSlot *slot = NULL; + TransitionCaptureState *ar_delete_trig_tcs; + + if (tupleDeleted) + *tupleDeleted = false; + + /* BEFORE ROW DELETE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_before_row) + { + bool dodelete; + + dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, + tupleid, oldtuple, epqreturnslot); + + if (!dodelete) /* "do nothing" */ + return NULL; + } + + /* INSTEAD OF ROW DELETE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_instead_row) + { + bool dodelete; + + Assert(oldtuple != NULL); + dodelete = ExecIRDeleteTriggers(estate, resultRelInfo, oldtuple); + + if (!dodelete) /* "do nothing" */ + return NULL; + } + else if (resultRelInfo->ri_FdwRoutine) + { + /* + * delete from foreign table: let the FDW do it + * + * We offer the returning slot as a place to store RETURNING data, + * although the FDW can return some other slot if it wants. + */ + slot = ExecGetReturningSlot(estate, resultRelInfo); + slot = resultRelInfo->ri_FdwRoutine->ExecForeignDelete(estate, + resultRelInfo, + slot, + planSlot); + + if (slot == NULL) /* "do nothing" */ + return NULL; + + /* + * RETURNING expressions might reference the tableoid column, so + * (re)initialize tts_tableOid before evaluating them. + */ + if (TTS_EMPTY(slot)) + ExecStoreAllNullTuple(slot); + + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + } + else + { + /* + * delete the tuple + * + * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check + * that the row to be deleted is visible to that snapshot, and throw a + * can't-serialize error if not. This is a special-case behavior + * needed for referential integrity updates in transaction-snapshot + * mode transactions. + */ +ldelete:; + result = table_tuple_delete(resultRelationDesc, tupleid, + estate->es_output_cid, + estate->es_snapshot, + estate->es_crosscheck_snapshot, + true /* wait for commit */ , + &tmfd, + changingPart); + + switch (result) + { + case TM_SelfModified: + + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. The former case is possible in a join DELETE + * where multiple tuples join to the same target tuple. This + * is somewhat questionable, but Postgres has always allowed + * it: we just ignore additional deletion attempts. + * + * The latter case arises if the tuple is modified by a + * command in a BEFORE trigger, or perhaps by a command in a + * volatile function used in the query. In such situations we + * should not ignore the deletion, but it is equally unsafe to + * proceed. We don't want to discard the original DELETE + * while keeping the triggered actions based on its deletion; + * and it would be no better to allow the original DELETE + * while discarding updates that it triggered. The row update + * carries some information that might be important according + * to business rules; so throwing an error is the only safe + * course. 
+ * + * If a trigger actually intends this type of interaction, it + * can re-execute the DELETE and then return NULL to cancel + * the outer delete. + */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + + /* Else, already deleted by self; nothing to do */ + return NULL; + + case TM_Ok: + break; + + case TM_Updated: + { + TupleTableSlot *inputslot; + TupleTableSlot *epqslot; + + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * Already know that we're going to need to do EPQ, so + * fetch tuple directly into the right slot. + */ + EvalPlanQualBegin(epqstate); + inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc, + resultRelInfo->ri_RangeTableIndex); + + result = table_tuple_lock(resultRelationDesc, tupleid, + estate->es_snapshot, + inputslot, estate->es_output_cid, + LockTupleExclusive, LockWaitBlock, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + &tmfd); + + switch (result) + { + case TM_Ok: + Assert(tmfd.traversed); + epqslot = EvalPlanQual(epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + inputslot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; + + /* + * If requested, skip delete and pass back the + * updated row. + */ + if (epqreturnslot) + { + *epqreturnslot = epqslot; + return NULL; + } + else + goto ldelete; + + case TM_SelfModified: + + /* + * This can be reached when following an update + * chain from a tuple updated by another session, + * reaching a tuple that was already updated in + * this transaction. If previously updated by this + * command, ignore the delete, otherwise error + * out. + * + * See also TM_SelfModified response to + * table_tuple_delete() above. + */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + return NULL; + + case TM_Deleted: + /* tuple already deleted; nothing to do */ + return NULL; + + default: + + /* + * TM_Invisible should be impossible because we're + * waiting for updated row versions, and would + * already have errored out if the first version + * is invisible. + * + * TM_Updated should be impossible, because we're + * locking the latest version via + * TUPLE_LOCK_FLAG_FIND_LAST_VERSION. + */ + elog(ERROR, "unexpected table_tuple_lock status: %u", + result); + return NULL; + } + + Assert(false); + break; + } + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + /* tuple already deleted; nothing to do */ + return NULL; + + default: + elog(ERROR, "unrecognized table_tuple_delete status: %u", + result); + return NULL; + } + + /* + * Note: Normally one would think that we have to delete index tuples + * associated with the heap tuple now... + * + * ... but in POSTGRES, we have no need to do this because VACUUM will + * take care of it later. 
We can't delete index tuples immediately + * anyway, since the tuple is still visible to other transactions. + */ + } + + if (canSetTag) + (estate->es_processed)++; + + /* Tell caller that the delete actually happened. */ + if (tupleDeleted) + *tupleDeleted = true; + + /* + * If this delete is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition OLD TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_delete_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_old_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, + tupleid, + oldtuple, + NULL, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * DELETE trigger fired below doesn't capture it again. + */ + ar_delete_trig_tcs = NULL; + } + + /* AFTER ROW DELETE Triggers */ + ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, + ar_delete_trig_tcs); + + /* Process RETURNING if present and if requested */ + if (processReturning && resultRelInfo->ri_projectReturning) + { + /* + * We have to put the target tuple into a slot, which means first we + * gotta fetch it. We can use the trigger tuple slot. + */ + TupleTableSlot *rslot; + + if (resultRelInfo->ri_FdwRoutine) + { + /* FDW must have provided a slot containing the deleted row */ + Assert(!TupIsNull(slot)); + } + else + { + slot = ExecGetReturningSlot(estate, resultRelInfo); + if (oldtuple != NULL) + { + ExecForceStoreHeapTuple(oldtuple, slot, false); + } + else + { + if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid, + SnapshotAny, slot)) + elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING"); + } + } + + rslot = ExecProcessReturning(resultRelInfo, slot, planSlot); + + /* + * Before releasing the target tuple again, make sure rslot has a + * local copy of any pass-by-reference values. + */ + ExecMaterializeSlot(rslot); + + ExecClearTuple(slot); + + return rslot; + } + + return NULL; +} + +/* + * ExecCrossPartitionUpdate --- Move an updated tuple to another partition. + * + * This works by first deleting the old tuple from the current partition, + * followed by inserting the new tuple into the root parent table, that is, + * mtstate->rootResultRelInfo. It will be re-routed from there to the + * correct partition. + * + * Returns true if the tuple has been successfully moved, or if it's found + * that the tuple was concurrently deleted so there's nothing more to do + * for the caller. + * + * False is returned if the tuple we're trying to move is found to have been + * concurrently updated. In that case, the caller must to check if the + * updated tuple that's returned in *retry_slot still needs to be re-routed, + * and call this function again or perform a regular update accordingly. 
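+ *
+ * (Illustration, names hypothetical: for a table partitioned by range
+ * on "logdate", an UPDATE like
+ *     UPDATE measurements SET logdate = logdate + interval '1 month';
+ * can move rows into a different partition; each such row is handled
+ * here as a DELETE from its old partition plus an INSERT that is
+ * re-routed from the root, as described above.)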
+ */ +static bool +ExecCrossPartitionUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *slot, TupleTableSlot *planSlot, + EPQState *epqstate, bool canSetTag, + TupleTableSlot **retry_slot, + TupleTableSlot **inserted_tuple) +{ + EState *estate = mtstate->ps.state; + TupleConversionMap *tupconv_map; + bool tuple_deleted; + TupleTableSlot *epqslot = NULL; + + *inserted_tuple = NULL; + *retry_slot = NULL; + + /* + * Disallow an INSERT ON CONFLICT DO UPDATE that causes the original row + * to migrate to a different partition. Maybe this can be implemented + * some day, but it seems a fringe feature with little redeeming value. + */ + if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("invalid ON UPDATE specification"), + errdetail("The result tuple would appear in a different partition than the original tuple."))); + + /* + * When an UPDATE is run directly on a leaf partition, simply fail with a + * partition constraint violation error. + */ + if (resultRelInfo == mtstate->rootResultRelInfo) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + /* Initialize tuple routing info if not already done. */ + if (mtstate->mt_partition_tuple_routing == NULL) + { + Relation rootRel = mtstate->rootResultRelInfo->ri_RelationDesc; + MemoryContext oldcxt; + + /* Things built here have to last for the query duration. */ + oldcxt = MemoryContextSwitchTo(estate->es_query_cxt); + + mtstate->mt_partition_tuple_routing = + ExecSetupPartitionTupleRouting(estate, rootRel); + + /* + * Before a partition's tuple can be re-routed, it must first be + * converted to the root's format, so we'll need a slot for storing + * such tuples. + */ + Assert(mtstate->mt_root_tuple_slot == NULL); + mtstate->mt_root_tuple_slot = table_slot_create(rootRel, NULL); + + MemoryContextSwitchTo(oldcxt); + } + + /* + * Row movement, part 1. Delete the tuple, but skip RETURNING processing. + * We want to return rows from INSERT. + */ + ExecDelete(mtstate, resultRelInfo, tupleid, oldtuple, planSlot, + epqstate, estate, + false, /* processReturning */ + false, /* canSetTag */ + true, /* changingPart */ + &tuple_deleted, &epqslot); + + /* + * For some reason if DELETE didn't happen (e.g. trigger prevented it, or + * it was already deleted by self, or it was concurrently deleted by + * another transaction), then we should skip the insert as well; + * otherwise, an UPDATE could cause an increase in the total number of + * rows across all partitions, which is clearly wrong. + * + * For a normal UPDATE, the case where the tuple has been the subject of a + * concurrent UPDATE or DELETE would be handled by the EvalPlanQual + * machinery, but for an UPDATE that we've translated into a DELETE from + * this partition and an INSERT into some other partition, that's not + * available, because CTID chains can't span relation boundaries. We + * mimic the semantics to a limited extent by skipping the INSERT if the + * DELETE fails to find a tuple. This ensures that two concurrent + * attempts to UPDATE the same tuple at the same time can't turn one tuple + * into two, and that an UPDATE of a just-deleted tuple can't resurrect + * it. + */ + if (!tuple_deleted) + { + /* + * epqslot will be typically NULL. 
But when ExecDelete() finds that + * another transaction has concurrently updated the same row, it + * re-fetches the row, skips the delete, and epqslot is set to the + * re-fetched tuple slot. In that case, we need to do all the checks + * again. + */ + if (TupIsNull(epqslot)) + return true; + else + { + /* Fetch the most recent version of old tuple. */ + TupleTableSlot *oldSlot; + + /* ... but first, make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + oldSlot = resultRelInfo->ri_oldTupleSlot; + if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc, + tupleid, + SnapshotAny, + oldSlot)) + elog(ERROR, "failed to fetch tuple being updated"); + *retry_slot = ExecGetUpdateNewTuple(resultRelInfo, epqslot, + oldSlot); + return false; + } + } + + /* + * resultRelInfo is one of the per-relation resultRelInfos. So we should + * convert the tuple into root's tuple descriptor if needed, since + * ExecInsert() starts the search from root. + */ + tupconv_map = ExecGetChildToRootMap(resultRelInfo); + if (tupconv_map != NULL) + slot = execute_attr_map_slot(tupconv_map->attrMap, + slot, + mtstate->mt_root_tuple_slot); + + /* Tuple routing starts from the root table. */ + *inserted_tuple = ExecInsert(mtstate, mtstate->rootResultRelInfo, slot, + planSlot, estate, canSetTag); + + /* + * Reset the transition state that may possibly have been written by + * INSERT. + */ + if (mtstate->mt_transition_capture) + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + + /* We're done moving. */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecUpdate + * + * note: we can't run UPDATE queries with transactions + * off because UPDATEs are actually INSERTs and our + * scan will mistakenly loop forever, updating the tuple + * it just inserted.. This should be fixed but until it + * is, we don't want to get stuck in an infinite loop + * which corrupts your database.. + * + * When updating a table, tupleid identifies the tuple to + * update and oldtuple is NULL. When updating a view, oldtuple + * is passed to the INSTEAD OF triggers and identifies what to + * update, and tupleid is invalid. When updating a foreign table, + * tupleid is invalid; the FDW has to figure out which row to + * update using data from the planSlot. oldtuple is passed to + * foreign table triggers; it is NULL when the foreign table has + * no relevant triggers. + * + * slot contains the new tuple value to be stored. + * planSlot is the output of the ModifyTable's subplan; we use it + * to access values from other input tables (for RETURNING), + * row-ID junk columns, etc. + * + * Returns RETURNING result if any, otherwise NULL. 
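+ *
+ * As a quick recap of the cases described above:
+ *		plain table:	tupleid valid,   oldtuple NULL
+ *		view:			tupleid invalid, oldtuple the row for INSTEAD OF triggers
+ *		foreign table:	tupleid invalid, oldtuple NULL unless row triggers use it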
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer tupleid, + HeapTuple oldtuple, + TupleTableSlot *slot, + TupleTableSlot *planSlot, + EPQState *epqstate, + EState *estate, + bool canSetTag) +{ + Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; + TM_Result result; + TM_FailureData tmfd; + List *recheckIndexes = NIL; + + /* + * abort the operation if not running transactions + */ + if (IsBootstrapProcessingMode()) + elog(ERROR, "cannot UPDATE during bootstrap"); + + ExecMaterializeSlot(slot); + + /* + * Open the table's indexes, if we have not done so already, so that we + * can add new index entries for the updated tuple. + */ + if (resultRelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, false); + + /* BEFORE ROW UPDATE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row) + { + if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, + tupleid, oldtuple, slot)) + return NULL; /* "do nothing" */ + } + + /* INSTEAD OF ROW UPDATE Triggers */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_instead_row) + { + if (!ExecIRUpdateTriggers(estate, resultRelInfo, + oldtuple, slot)) + return NULL; /* "do nothing" */ + } + else if (resultRelInfo->ri_FdwRoutine) + { + /* + * GENERATED expressions might reference the tableoid column, so + * (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_UPDATE); + + /* + * update in foreign table: let the FDW do it + */ + slot = resultRelInfo->ri_FdwRoutine->ExecForeignUpdate(estate, + resultRelInfo, + slot, + planSlot); + + if (slot == NULL) /* "do nothing" */ + return NULL; + + /* + * AFTER ROW Triggers or RETURNING expressions might reference the + * tableoid column, so (re-)initialize tts_tableOid before evaluating + * them. (This covers the case where the FDW replaced the slot.) + */ + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + } + else + { + LockTupleMode lockmode; + bool partition_constraint_failed; + bool update_indexes; + + /* + * Constraints and GENERATED expressions might reference the tableoid + * column, so (re-)initialize tts_tableOid before evaluating them. + */ + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + + /* + * Compute stored generated columns + */ + if (resultRelationDesc->rd_att->constr && + resultRelationDesc->rd_att->constr->has_generated_stored) + ExecComputeStoredGenerated(resultRelInfo, estate, slot, + CMD_UPDATE); + + /* + * Check any RLS UPDATE WITH CHECK policies + * + * If we generate a new candidate tuple after EvalPlanQual testing, we + * must loop back here and recheck any RLS policies and constraints. + * (We don't need to redo triggers, however. If there are any BEFORE + * triggers then trigger.c will have done table_tuple_lock to lock the + * correct tuple, so there's no need to do them again.) + */ +lreplace:; + + /* ensure slot is independent, consider e.g. 
EPQ */ + ExecMaterializeSlot(slot); + + /* + * If partition constraint fails, this row might get moved to another + * partition, in which case we should check the RLS CHECK policy just + * before inserting into the new partition, rather than doing it here. + * This is because a trigger on that partition might again change the + * row. So skip the WCO checks if the partition constraint fails. + */ + partition_constraint_failed = + resultRelationDesc->rd_rel->relispartition && + !ExecPartitionCheck(resultRelInfo, slot, estate, false); + + if (!partition_constraint_failed && + resultRelInfo->ri_WithCheckOptions != NIL) + { + /* + * ExecWithCheckOptions() will skip any WCOs which are not of the + * kind we are looking for at this point. + */ + ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK, + resultRelInfo, slot, estate); + } + + /* + * If a partition check failed, try to move the row into the right + * partition. + */ + if (partition_constraint_failed) + { + TupleTableSlot *inserted_tuple, + *retry_slot; + bool retry; + + /* + * ExecCrossPartitionUpdate will first DELETE the row from the + * partition it's currently in and then insert it back into the + * root table, which will re-route it to the correct partition. + * The first part may have to be repeated if it is detected that + * the tuple we're trying to move has been concurrently updated. + */ + retry = !ExecCrossPartitionUpdate(mtstate, resultRelInfo, tupleid, + oldtuple, slot, planSlot, + epqstate, canSetTag, + &retry_slot, &inserted_tuple); + if (retry) + { + slot = retry_slot; + goto lreplace; + } + + return inserted_tuple; + } + + /* + * Check the constraints of the tuple. We've already checked the + * partition constraint above; however, we must still ensure the tuple + * passes all other constraints, so we will call ExecConstraints() and + * have it validate all remaining checks. + */ + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + + /* + * replace the heap tuple + * + * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check + * that the row to be updated is visible to that snapshot, and throw a + * can't-serialize error if not. This is a special-case behavior + * needed for referential integrity updates in transaction-snapshot + * mode transactions. + */ + result = table_tuple_update(resultRelationDesc, tupleid, slot, + estate->es_output_cid, + estate->es_snapshot, + estate->es_crosscheck_snapshot, + true /* wait for commit */ , + &tmfd, &lockmode, &update_indexes); + + switch (result) + { + case TM_SelfModified: + + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. The former case is possible in a join UPDATE + * where multiple tuples join to the same target tuple. This + * is pretty questionable, but Postgres has always allowed it: + * we just execute the first update action and ignore + * additional update attempts. + * + * The latter case arises if the tuple is modified by a + * command in a BEFORE trigger, or perhaps by a command in a + * volatile function used in the query. In such situations we + * should not ignore the update, but it is equally unsafe to + * proceed. We don't want to discard the original UPDATE + * while keeping the triggered actions based on it; and we + * have no principled way to merge this update with the + * previous ones. So throwing an error is the only safe + * course. 
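+ *
+ * (The first, multiple-match case can be produced by a join UPDATE
+ * such as, schematically,
+ *		UPDATE target t SET val = s.val FROM src s WHERE s.tid = t.id;
+ * when more than one src row carries the same tid value.)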
+ * + * If a trigger actually intends this type of interaction, it + * can re-execute the UPDATE (assuming it can figure out how) + * and then return NULL to cancel the outer update. + */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be updated was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + + /* Else, already updated by self; nothing to do */ + return NULL; + + case TM_Ok: + break; + + case TM_Updated: + { + TupleTableSlot *inputslot; + TupleTableSlot *epqslot; + TupleTableSlot *oldSlot; + + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * Already know that we're going to need to do EPQ, so + * fetch tuple directly into the right slot. + */ + inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc, + resultRelInfo->ri_RangeTableIndex); + + result = table_tuple_lock(resultRelationDesc, tupleid, + estate->es_snapshot, + inputslot, estate->es_output_cid, + lockmode, LockWaitBlock, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + &tmfd); + + switch (result) + { + case TM_Ok: + Assert(tmfd.traversed); + + epqslot = EvalPlanQual(epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + inputslot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; + + /* Make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + + /* Fetch the most recent version of old tuple. */ + oldSlot = resultRelInfo->ri_oldTupleSlot; + if (!table_tuple_fetch_row_version(resultRelationDesc, + tupleid, + SnapshotAny, + oldSlot)) + elog(ERROR, "failed to fetch tuple being updated"); + slot = ExecGetUpdateNewTuple(resultRelInfo, + epqslot, oldSlot); + goto lreplace; + + case TM_Deleted: + /* tuple already deleted; nothing to do */ + return NULL; + + case TM_SelfModified: + + /* + * This can be reached when following an update + * chain from a tuple updated by another session, + * reaching a tuple that was already updated in + * this transaction. If previously modified by + * this command, ignore the redundant update, + * otherwise error out. + * + * See also TM_SelfModified response to + * table_tuple_update() above. 
+ */ + if (tmfd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be updated was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + return NULL; + + default: + /* see table_tuple_lock call in ExecDelete() */ + elog(ERROR, "unexpected table_tuple_lock status: %u", + result); + return NULL; + } + } + + break; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + /* tuple already deleted; nothing to do */ + return NULL; + + default: + elog(ERROR, "unrecognized table_tuple_update status: %u", + result); + return NULL; + } + + /* insert index entries for tuple if necessary */ + if (resultRelInfo->ri_NumIndices > 0 && update_indexes) + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, true, false, + NULL, NIL); + } + + if (canSetTag) + (estate->es_processed)++; + + /* AFTER ROW UPDATE Triggers */ + ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, slot, + recheckIndexes, + mtstate->operation == CMD_INSERT ? + mtstate->mt_oc_transition_capture : + mtstate->mt_transition_capture); + + list_free(recheckIndexes); + + /* + * Check any WITH CHECK OPTION constraints from parent views. We are + * required to do this after testing all constraints and uniqueness + * violations per the SQL spec, so we do it after actually updating the + * record in the heap and all indexes. + * + * ExecWithCheckOptions() will skip any WCOs which are not of the kind we + * are looking for at this point. + */ + if (resultRelInfo->ri_WithCheckOptions != NIL) + ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate); + + /* Process RETURNING if present */ + if (resultRelInfo->ri_projectReturning) + return ExecProcessReturning(resultRelInfo, slot, planSlot); + + return NULL; +} + +/* + * ExecOnConflictUpdate --- execute UPDATE of INSERT ON CONFLICT DO UPDATE + * + * Try to lock tuple for update as part of speculative insertion. If + * a qual originating from ON CONFLICT DO UPDATE is satisfied, update + * (but still lock row, even though it may not satisfy estate's + * snapshot). + * + * Returns true if we're done (with or without an update), or false if + * the caller must retry the INSERT from scratch. + */ +static bool +ExecOnConflictUpdate(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + ItemPointer conflictTid, + TupleTableSlot *planSlot, + TupleTableSlot *excludedSlot, + EState *estate, + bool canSetTag, + TupleTableSlot **returning) +{ + ExprContext *econtext = mtstate->ps.ps_ExprContext; + Relation relation = resultRelInfo->ri_RelationDesc; + ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; + TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; + TM_FailureData tmfd; + LockTupleMode lockmode; + TM_Result test; + Datum xminDatum; + TransactionId xmin; + bool isnull; + + /* Determine lock mode to use */ + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + + /* + * Lock tuple for update. Don't follow updates when tuple cannot be + * locked without doing so. A row locking conflict here means our + * previous conclusion that the tuple is conclusively committed is not + * true anymore. 
+ */ + test = table_tuple_lock(relation, conflictTid, + estate->es_snapshot, + existing, estate->es_output_cid, + lockmode, LockWaitBlock, 0, + &tmfd); + switch (test) + { + case TM_Ok: + /* success! */ + break; + + case TM_Invisible: + + /* + * This can occur when a just inserted tuple is updated again in + * the same command. E.g. because multiple rows with the same + * conflicting key values are inserted. + * + * This is somewhat similar to the ExecUpdate() TM_SelfModified + * case. We do not want to proceed because it would lead to the + * same row being updated a second time in some unspecified order, + * and in contrast to plain UPDATEs there's no historical behavior + * to break. + * + * It is the user's responsibility to prevent this situation from + * occurring. These problems are why SQL-2003 similarly specifies + * that for SQL MERGE, an exception must be raised in the event of + * an attempt to update the same row twice. + */ + xminDatum = slot_getsysattr(existing, + MinTransactionIdAttributeNumber, + &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + if (TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("ON CONFLICT DO UPDATE command cannot affect row a second time"), + errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); + + /* This shouldn't happen */ + elog(ERROR, "attempted to lock invisible tuple"); + break; + + case TM_SelfModified: + + /* + * This state should never be reached. As a dirty snapshot is used + * to find conflicting tuples, speculative insertion wouldn't have + * seen this row to conflict with. + */ + elog(ERROR, "unexpected self-updated tuple"); + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * As long as we don't support an UPDATE of INSERT ON CONFLICT for + * a partitioned table we shouldn't reach to a case where tuple to + * be lock is moved to another partition due to concurrent update + * of the partition key. + */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + + /* + * Tell caller to try again from the very start. + * + * It does not make sense to use the usual EvalPlanQual() style + * loop here, as the new version of the row might not conflict + * anymore, or the conflicting tuple has actually been deleted. + */ + ExecClearTuple(existing); + return false; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + + /* see TM_Updated case */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + ExecClearTuple(existing); + return false; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + } + + /* Success, the tuple is locked. */ + + /* + * Verify that the tuple is visible to our MVCC snapshot if the current + * isolation level mandates that. + * + * It's not sufficient to rely on the check within ExecUpdate() as e.g. + * CONFLICT ... WHERE clause may prevent us from reaching that. + * + * This means we only ever continue when a new command in the current + * transaction could see the row, even though in READ COMMITTED mode the + * tuple will not be visible according to the current statement's + * snapshot. 
This is in line with the way UPDATE deals with newer tuple + * versions. + */ + ExecCheckTupleVisible(estate, relation, existing); + + /* + * Make tuple and any needed join variables available to ExecQual and + * ExecProject. The EXCLUDED tuple is installed in ecxt_innertuple, while + * the target's existing tuple is installed in the scantuple. EXCLUDED + * has been made to reference INNER_VAR in setrefs.c, but there is no + * other redirection. + */ + econtext->ecxt_scantuple = existing; + econtext->ecxt_innertuple = excludedSlot; + econtext->ecxt_outertuple = NULL; + + if (!ExecQual(onConflictSetWhere, econtext)) + { + ExecClearTuple(existing); /* see return below */ + InstrCountFiltered1(&mtstate->ps, 1); + return true; /* done with the tuple */ + } + + if (resultRelInfo->ri_WithCheckOptions != NIL) + { + /* + * Check target's existing tuple against UPDATE-applicable USING + * security barrier quals (if any), enforced here as RLS checks/WCOs. + * + * The rewriter creates UPDATE RLS checks/WCOs for UPDATE security + * quals, and stores them as WCOs of "kind" WCO_RLS_CONFLICT_CHECK, + * but that's almost the extent of its special handling for ON + * CONFLICT DO UPDATE. + * + * The rewriter will also have associated UPDATE applicable straight + * RLS checks/WCOs for the benefit of the ExecUpdate() call that + * follows. INSERTs and UPDATEs naturally have mutually exclusive WCO + * kinds, so there is no danger of spurious over-enforcement in the + * INSERT or UPDATE path. + */ + ExecWithCheckOptions(WCO_RLS_CONFLICT_CHECK, resultRelInfo, + existing, + mtstate->ps.state); + } + + /* Project the new tuple version */ + ExecProject(resultRelInfo->ri_onConflict->oc_ProjInfo); + + /* + * Note that it is possible that the target tuple has been modified in + * this session, after the above table_tuple_lock. We choose to not error + * out in that case, in line with ExecUpdate's treatment of similar cases. + * This can happen if an UPDATE is triggered from within ExecQual(), + * ExecWithCheckOptions() or ExecProject() above, e.g. by selecting from a + * wCTE in the ON CONFLICT's SET. + */ + + /* Execute UPDATE with projection */ + *returning = ExecUpdate(mtstate, resultRelInfo, conflictTid, NULL, + resultRelInfo->ri_onConflict->oc_ProjSlot, + planSlot, + &mtstate->mt_epqstate, mtstate->ps.state, + canSetTag); + + /* + * Clear out existing tuple, as there might not be another conflict among + * the next input rows. Don't want to hold resources till the end of the + * query. 
+ */ + ExecClearTuple(existing); + return true; +} + + +/* + * Process BEFORE EACH STATEMENT triggers + */ +static void +fireBSTriggers(ModifyTableState *node) +{ + ModifyTable *plan = (ModifyTable *) node->ps.plan; + ResultRelInfo *resultRelInfo = node->rootResultRelInfo; + + switch (node->operation) + { + case CMD_INSERT: + ExecBSInsertTriggers(node->ps.state, resultRelInfo); + if (plan->onConflictAction == ONCONFLICT_UPDATE) + ExecBSUpdateTriggers(node->ps.state, + resultRelInfo); + break; + case CMD_UPDATE: + ExecBSUpdateTriggers(node->ps.state, resultRelInfo); + break; + case CMD_DELETE: + ExecBSDeleteTriggers(node->ps.state, resultRelInfo); + break; + default: + elog(ERROR, "unknown operation"); + break; + } +} + +/* + * Process AFTER EACH STATEMENT triggers + */ +static void +fireASTriggers(ModifyTableState *node) +{ + ModifyTable *plan = (ModifyTable *) node->ps.plan; + ResultRelInfo *resultRelInfo = node->rootResultRelInfo; + + switch (node->operation) + { + case CMD_INSERT: + if (plan->onConflictAction == ONCONFLICT_UPDATE) + ExecASUpdateTriggers(node->ps.state, + resultRelInfo, + node->mt_oc_transition_capture); + ExecASInsertTriggers(node->ps.state, resultRelInfo, + node->mt_transition_capture); + break; + case CMD_UPDATE: + ExecASUpdateTriggers(node->ps.state, resultRelInfo, + node->mt_transition_capture); + break; + case CMD_DELETE: + ExecASDeleteTriggers(node->ps.state, resultRelInfo, + node->mt_transition_capture); + break; + default: + elog(ERROR, "unknown operation"); + break; + } +} + +/* + * Set up the state needed for collecting transition tuples for AFTER + * triggers. + */ +static void +ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) +{ + ModifyTable *plan = (ModifyTable *) mtstate->ps.plan; + ResultRelInfo *targetRelInfo = mtstate->rootResultRelInfo; + + /* Check for transition tables on the directly targeted relation. */ + mtstate->mt_transition_capture = + MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc, + RelationGetRelid(targetRelInfo->ri_RelationDesc), + mtstate->operation); + if (plan->operation == CMD_INSERT && + plan->onConflictAction == ONCONFLICT_UPDATE) + mtstate->mt_oc_transition_capture = + MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc, + RelationGetRelid(targetRelInfo->ri_RelationDesc), + CMD_UPDATE); +} + +/* + * ExecPrepareTupleRouting --- prepare for routing one tuple + * + * Determine the partition in which the tuple in slot is to be inserted, + * and return its ResultRelInfo in *partRelInfo. The return value is + * a slot holding the tuple of the partition rowtype. + * + * This also sets the transition table information in mtstate based on the + * selected partition. + */ +static TupleTableSlot * +ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot, + ResultRelInfo **partRelInfo) +{ + ResultRelInfo *partrel; + TupleConversionMap *map; + + /* + * Lookup the target partition's ResultRelInfo. If ExecFindPartition does + * not find a valid partition for the tuple in 'slot' then an error is + * raised. An error may also be raised if the found partition is not a + * valid target for INSERTs. This is required since a partitioned table + * UPDATE to another partition becomes a DELETE+INSERT. + */ + partrel = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate); + + /* + * If we're capturing transition tuples, we might need to convert from the + * partition rowtype to root partitioned table's rowtype. 
But if there + * are no BEFORE triggers on the partition that could change the tuple, we + * can just remember the original unconverted tuple to avoid a needless + * round trip conversion. + */ + if (mtstate->mt_transition_capture != NULL) + { + bool has_before_insert_row_trig; + + has_before_insert_row_trig = (partrel->ri_TrigDesc && + partrel->ri_TrigDesc->trig_insert_before_row); + + mtstate->mt_transition_capture->tcs_original_insert_tuple = + !has_before_insert_row_trig ? slot : NULL; + } + + /* + * Convert the tuple, if necessary. + */ + map = partrel->ri_RootToPartitionMap; + if (map != NULL) + { + TupleTableSlot *new_slot = partrel->ri_PartitionTupleSlot; + + slot = execute_attr_map_slot(map->attrMap, slot, new_slot); + } + + *partRelInfo = partrel; + return slot; +} + +/* ---------------------------------------------------------------- + * ExecModifyTable + * + * Perform table modifications as required, and return RETURNING results + * if needed. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecModifyTable(PlanState *pstate) +{ + ModifyTableState *node = castNode(ModifyTableState, pstate); + EState *estate = node->ps.state; + CmdType operation = node->operation; + ResultRelInfo *resultRelInfo; + PlanState *subplanstate; + TupleTableSlot *slot; + TupleTableSlot *planSlot; + TupleTableSlot *oldSlot; + ItemPointer tupleid; + ItemPointerData tuple_ctid; + HeapTupleData oldtupdata; + HeapTuple oldtuple; + PartitionTupleRouting *proute = node->mt_partition_tuple_routing; + List *relinfos = NIL; + ListCell *lc; + + CHECK_FOR_INTERRUPTS(); + + /* + * This should NOT get called during EvalPlanQual; we should have passed a + * subplan tree to EvalPlanQual, instead. Use a runtime test not just + * Assert because this condition is easy to miss in testing. (Note: + * although ModifyTable should not get executed within an EvalPlanQual + * operation, we do have to allow it to be initialized and shut down in + * case it is within a CTE subplan. Hence this test must be here, not in + * ExecInitModifyTable.) + */ + if (estate->es_epq_active != NULL) + elog(ERROR, "ModifyTable should not be called during EvalPlanQual"); + + /* + * If we've already completed processing, don't try to do more. We need + * this test because ExecPostprocessPlan might call us an extra time, and + * our subplan's nodes aren't necessarily robust against being called + * extra times. + */ + if (node->mt_done) + return NULL; + + /* + * On first call, fire BEFORE STATEMENT triggers before proceeding. + */ + if (node->fireBSTriggers) + { + fireBSTriggers(node); + node->fireBSTriggers = false; + } + + /* Preload local variables */ + resultRelInfo = node->resultRelInfo + node->mt_lastResultIndex; + subplanstate = outerPlanState(node); + + /* + * Fetch rows from subplan, and execute the required table modification + * for each row. + */ + for (;;) + { + /* + * Reset the per-output-tuple exprcontext. This is needed because + * triggers expect to use that context as workspace. It's a bit ugly + * to do this below the top level of the plan, however. We might need + * to rethink this later. + */ + ResetPerTupleExprContext(estate); + + /* + * Reset per-tuple memory context used for processing on conflict and + * returning clauses, to free any expression evaluation storage + * allocated in the previous cycle. + */ + if (pstate->ps_ExprContext) + ResetExprContext(pstate->ps_ExprContext); + + planSlot = ExecProcNode(subplanstate); + + /* No more tuples to process? 
*/ + if (TupIsNull(planSlot)) + break; + + /* + * When there are multiple result relations, each tuple contains a + * junk column that gives the OID of the rel from which it came. + * Extract it and select the correct result relation. + */ + if (AttributeNumberIsValid(node->mt_resultOidAttno)) + { + Datum datum; + bool isNull; + Oid resultoid; + + datum = ExecGetJunkAttribute(planSlot, node->mt_resultOidAttno, + &isNull); + if (isNull) + elog(ERROR, "tableoid is NULL"); + resultoid = DatumGetObjectId(datum); + + /* If it's not the same as last time, we need to locate the rel */ + if (resultoid != node->mt_lastResultOid) + resultRelInfo = ExecLookupResultRelByOid(node, resultoid, + false, true); + } + + /* + * If resultRelInfo->ri_usesFdwDirectModify is true, all we need to do + * here is compute the RETURNING expressions. + */ + if (resultRelInfo->ri_usesFdwDirectModify) + { + Assert(resultRelInfo->ri_projectReturning); + + /* + * A scan slot containing the data that was actually inserted, + * updated or deleted has already been made available to + * ExecProcessReturning by IterateDirectModify, so no need to + * provide it here. + */ + slot = ExecProcessReturning(resultRelInfo, NULL, planSlot); + + return slot; + } + + EvalPlanQualSetSlot(&node->mt_epqstate, planSlot); + slot = planSlot; + + tupleid = NULL; + oldtuple = NULL; + + /* + * For UPDATE/DELETE, fetch the row identity info for the tuple to be + * updated/deleted. For a heap relation, that's a TID; otherwise we + * may have a wholerow junk attr that carries the old tuple in toto. + * Keep this in step with the part of ExecInitModifyTable that sets up + * ri_RowIdAttNo. + */ + if (operation == CMD_UPDATE || operation == CMD_DELETE) + { + char relkind; + Datum datum; + bool isNull; + + relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind; + if (relkind == RELKIND_RELATION || + relkind == RELKIND_MATVIEW || + relkind == RELKIND_PARTITIONED_TABLE) + { + /* ri_RowIdAttNo refers to a ctid attribute */ + Assert(AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)); + datum = ExecGetJunkAttribute(slot, + resultRelInfo->ri_RowIdAttNo, + &isNull); + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "ctid is NULL"); + + tupleid = (ItemPointer) DatumGetPointer(datum); + tuple_ctid = *tupleid; /* be sure we don't free ctid!! */ + tupleid = &tuple_ctid; + } + + /* + * Use the wholerow attribute, when available, to reconstruct the + * old relation tuple. The old tuple serves one or both of two + * purposes: 1) it serves as the OLD tuple for row triggers, 2) it + * provides values for any unchanged columns for the NEW tuple of + * an UPDATE, because the subplan does not produce all the columns + * of the target table. + * + * Note that the wholerow attribute does not carry system columns, + * so foreign table triggers miss seeing those, except that we + * know enough here to set t_tableOid. Quite separately from + * this, the FDW may fetch its own junk attrs to identify the row. + * + * Other relevant relkinds, currently limited to views, always + * have a wholerow attribute. + */ + else if (AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + { + datum = ExecGetJunkAttribute(slot, + resultRelInfo->ri_RowIdAttNo, + &isNull); + /* shouldn't ever get a null result... 
*/ + if (isNull) + elog(ERROR, "wholerow is NULL"); + + oldtupdata.t_data = DatumGetHeapTupleHeader(datum); + oldtupdata.t_len = + HeapTupleHeaderGetDatumLength(oldtupdata.t_data); + ItemPointerSetInvalid(&(oldtupdata.t_self)); + /* Historically, view triggers see invalid t_tableOid. */ + oldtupdata.t_tableOid = + (relkind == RELKIND_VIEW) ? InvalidOid : + RelationGetRelid(resultRelInfo->ri_RelationDesc); + + oldtuple = &oldtupdata; + } + else + { + /* Only foreign tables are allowed to omit a row-ID attr */ + Assert(relkind == RELKIND_FOREIGN_TABLE); + } + } + + switch (operation) + { + case CMD_INSERT: + /* Initialize projection info if first time for this table */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitInsertProjection(node, resultRelInfo); + slot = ExecGetInsertNewTuple(resultRelInfo, planSlot); + slot = ExecInsert(node, resultRelInfo, slot, planSlot, + estate, node->canSetTag); + break; + case CMD_UPDATE: + /* Initialize projection info if first time for this table */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(node, resultRelInfo); + + /* + * Make the new tuple by combining plan's output tuple with + * the old tuple being updated. + */ + oldSlot = resultRelInfo->ri_oldTupleSlot; + if (oldtuple != NULL) + { + /* Use the wholerow junk attr as the old tuple. */ + ExecForceStoreHeapTuple(oldtuple, oldSlot, false); + } + else + { + /* Fetch the most recent version of old tuple. */ + Relation relation = resultRelInfo->ri_RelationDesc; + + Assert(tupleid != NULL); + if (!table_tuple_fetch_row_version(relation, tupleid, + SnapshotAny, + oldSlot)) + elog(ERROR, "failed to fetch tuple being updated"); + } + slot = ExecGetUpdateNewTuple(resultRelInfo, planSlot, + oldSlot); + + /* Now apply the update. */ + slot = ExecUpdate(node, resultRelInfo, tupleid, oldtuple, slot, + planSlot, &node->mt_epqstate, estate, + node->canSetTag); + break; + case CMD_DELETE: + slot = ExecDelete(node, resultRelInfo, tupleid, oldtuple, + planSlot, &node->mt_epqstate, estate, + true, /* processReturning */ + node->canSetTag, + false, /* changingPart */ + NULL, NULL); + break; + default: + elog(ERROR, "unknown operation"); + break; + } + + /* + * If we got a RETURNING result, return it to caller. We'll continue + * the work on next call. + */ + if (slot) + return slot; + } + + /* + * Insert remaining tuples for batch insert. + */ + if (proute) + relinfos = estate->es_tuple_routing_result_relations; + else + relinfos = estate->es_opened_result_relations; + + foreach(lc, relinfos) + { + resultRelInfo = lfirst(lc); + if (resultRelInfo->ri_NumSlots > 0) + ExecBatchInsert(node, resultRelInfo, + resultRelInfo->ri_Slots, + resultRelInfo->ri_PlanSlots, + resultRelInfo->ri_NumSlots, + estate, node->canSetTag); + } + + /* + * We're done, but fire AFTER STATEMENT triggers before exiting. + */ + fireASTriggers(node); + + node->mt_done = true; + + return NULL; +} + +/* + * ExecLookupResultRelByOid + * If the table with given OID is among the result relations to be + * updated by the given ModifyTable node, return its ResultRelInfo. + * + * If not found, return NULL if missing_ok, else raise error. + * + * If update_cache is true, then upon successful lookup, update the node's + * one-element cache. ONLY ExecModifyTable may pass true for this. 
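+ *
+ * Typical call, as made per tuple from ExecModifyTable above:
+ *		resultRelInfo = ExecLookupResultRelByOid(node, resultoid,
+ *												 false, true);
+ * All other callers must pass update_cache = false.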
+ */ +ResultRelInfo * +ExecLookupResultRelByOid(ModifyTableState *node, Oid resultoid, + bool missing_ok, bool update_cache) +{ + if (node->mt_resultOidHash) + { + /* Use the pre-built hash table to locate the rel */ + MTTargetRelLookup *mtlookup; + + mtlookup = (MTTargetRelLookup *) + hash_search(node->mt_resultOidHash, &resultoid, HASH_FIND, NULL); + if (mtlookup) + { + if (update_cache) + { + node->mt_lastResultOid = resultoid; + node->mt_lastResultIndex = mtlookup->relationIndex; + } + return node->resultRelInfo + mtlookup->relationIndex; + } + } + else + { + /* With few target rels, just search the ResultRelInfo array */ + for (int ndx = 0; ndx < node->mt_nrels; ndx++) + { + ResultRelInfo *rInfo = node->resultRelInfo + ndx; + + if (RelationGetRelid(rInfo->ri_RelationDesc) == resultoid) + { + if (update_cache) + { + node->mt_lastResultOid = resultoid; + node->mt_lastResultIndex = ndx; + } + return rInfo; + } + } + } + + if (!missing_ok) + elog(ERROR, "incorrect result relation OID %u", resultoid); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitModifyTable + * ---------------------------------------------------------------- + */ +ModifyTableState * +ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) +{ + ModifyTableState *mtstate; + Plan *subplan = outerPlan(node); + CmdType operation = node->operation; + int nrels = list_length(node->resultRelations); + ResultRelInfo *resultRelInfo; + List *arowmarks; + ListCell *l; + int i; + Relation rel; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + mtstate = makeNode(ModifyTableState); + mtstate->ps.plan = (Plan *) node; + mtstate->ps.state = estate; + mtstate->ps.ExecProcNode = ExecModifyTable; + + mtstate->operation = operation; + mtstate->canSetTag = node->canSetTag; + mtstate->mt_done = false; + + mtstate->mt_nrels = nrels; + mtstate->resultRelInfo = (ResultRelInfo *) + palloc(nrels * sizeof(ResultRelInfo)); + + /*---------- + * Resolve the target relation. This is the same as: + * + * - the relation for which we will fire FOR STATEMENT triggers, + * - the relation into whose tuple format all captured transition tuples + * must be converted, and + * - the root partitioned table used for tuple routing. + * + * If it's a partitioned table, the root partition doesn't appear + * elsewhere in the plan and its RT index is given explicitly in + * node->rootRelation. Otherwise (i.e. table inheritance) the target + * relation is the first relation in the node->resultRelations list. + *---------- + */ + if (node->rootRelation > 0) + { + mtstate->rootResultRelInfo = makeNode(ResultRelInfo); + ExecInitResultRelation(estate, mtstate->rootResultRelInfo, + node->rootRelation); + } + else + { + mtstate->rootResultRelInfo = mtstate->resultRelInfo; + ExecInitResultRelation(estate, mtstate->resultRelInfo, + linitial_int(node->resultRelations)); + } + + /* set up epqstate with dummy subplan data for the moment */ + EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam); + mtstate->fireBSTriggers = true; + + /* + * Build state for collecting transition tuples. This requires having a + * valid trigger query context, so skip it in explain-only mode. + */ + if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + ExecSetupTransitionCaptureState(mtstate, estate); + + /* + * Open all the result relations and initialize the ResultRelInfo structs. 
+ * (But root relation was initialized above, if it's part of the array.) + * We must do this before initializing the subplan, because direct-modify + * FDWs expect their ResultRelInfos to be available. + */ + resultRelInfo = mtstate->resultRelInfo; + i = 0; + foreach(l, node->resultRelations) + { + Index resultRelation = lfirst_int(l); + + if (resultRelInfo != mtstate->rootResultRelInfo) + { + ExecInitResultRelation(estate, resultRelInfo, resultRelation); + + /* + * For child result relations, store the root result relation + * pointer. We do so for the convenience of places that want to + * look at the query's original target relation but don't have the + * mtstate handy. + */ + resultRelInfo->ri_RootResultRelInfo = mtstate->rootResultRelInfo; + } + + /* Initialize the usesFdwDirectModify flag */ + resultRelInfo->ri_usesFdwDirectModify = bms_is_member(i, + node->fdwDirectModifyPlans); + + /* + * Verify result relation is a valid target for the current operation + */ + CheckValidResultRel(resultRelInfo, operation); + + resultRelInfo++; + i++; + } + + /* + * Now we may initialize the subplan. + */ + outerPlanState(mtstate) = ExecInitNode(subplan, estate, eflags); + + /* + * Do additional per-result-relation initialization. + */ + for (i = 0; i < nrels; i++) + { + resultRelInfo = &mtstate->resultRelInfo[i]; + + /* Let FDWs init themselves for foreign-table result rels */ + if (!resultRelInfo->ri_usesFdwDirectModify && + resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->BeginForeignModify != NULL) + { + List *fdw_private = (List *) list_nth(node->fdwPrivLists, i); + + resultRelInfo->ri_FdwRoutine->BeginForeignModify(mtstate, + resultRelInfo, + fdw_private, + i, + eflags); + } + + /* + * For UPDATE/DELETE, find the appropriate junk attr now, either a + * 'ctid' or 'wholerow' attribute depending on relkind. For foreign + * tables, the FDW might have created additional junk attr(s), but + * those are no concern of ours. + */ + if (operation == CMD_UPDATE || operation == CMD_DELETE) + { + char relkind; + + relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind; + if (relkind == RELKIND_RELATION || + relkind == RELKIND_MATVIEW || + relkind == RELKIND_PARTITIONED_TABLE) + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk ctid column"); + } + else if (relkind == RELKIND_FOREIGN_TABLE) + { + /* + * When there is a row-level trigger, there should be a + * wholerow attribute. We also require it to be present in + * UPDATE, so we can get the values of unchanged columns. + */ + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, + "wholerow"); + if (mtstate->operation == CMD_UPDATE && + !AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk wholerow column"); + } + else + { + /* Other valid target relkinds must provide wholerow */ + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, + "wholerow"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk wholerow column"); + } + } + } + + /* + * If this is an inherited update/delete, there will be a junk attribute + * named "tableoid" present in the subplan's targetlist. It will be used + * to identify the result relation for a given tuple to be + * updated/deleted. 
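+ *
+ * For example (schematically), in
+ *		UPDATE parent SET val = val + 1;
+ * where parent has children c1 and c2 (by inheritance or partitioning),
+ * each subplan output row carries a resjunk "tableoid" column holding the
+ * OID of c1 or c2, which ExecModifyTable resolves per tuple via
+ * ExecLookupResultRelByOid().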
+ */ + mtstate->mt_resultOidAttno = + ExecFindJunkAttributeInTlist(subplan->targetlist, "tableoid"); + Assert(AttributeNumberIsValid(mtstate->mt_resultOidAttno) || nrels == 1); + mtstate->mt_lastResultOid = InvalidOid; /* force lookup at first tuple */ + mtstate->mt_lastResultIndex = 0; /* must be zero if no such attr */ + + /* Get the root target relation */ + rel = mtstate->rootResultRelInfo->ri_RelationDesc; + + /* + * Build state for tuple routing if it's a partitioned INSERT. An UPDATE + * might need this too, but only if it actually moves tuples between + * partitions; in that case setup is done by ExecCrossPartitionUpdate. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + operation == CMD_INSERT) + mtstate->mt_partition_tuple_routing = + ExecSetupPartitionTupleRouting(estate, rel); + + /* + * Initialize any WITH CHECK OPTION constraints if needed. + */ + resultRelInfo = mtstate->resultRelInfo; + foreach(l, node->withCheckOptionLists) + { + List *wcoList = (List *) lfirst(l); + List *wcoExprs = NIL; + ListCell *ll; + + foreach(ll, wcoList) + { + WithCheckOption *wco = (WithCheckOption *) lfirst(ll); + ExprState *wcoExpr = ExecInitQual((List *) wco->qual, + &mtstate->ps); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + resultRelInfo->ri_WithCheckOptions = wcoList; + resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; + resultRelInfo++; + } + + /* + * Initialize RETURNING projections if needed. + */ + if (node->returningLists) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * Initialize result tuple slot and assign its rowtype using the first + * RETURNING list. We assume the rest will look the same. + */ + mtstate->ps.plan->targetlist = (List *) linitial(node->returningLists); + + /* Set up a slot for the output of the RETURNING projection(s) */ + ExecInitResultTupleSlotTL(&mtstate->ps, &TTSOpsVirtual); + slot = mtstate->ps.ps_ResultTupleSlot; + + /* Need an econtext too */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + econtext = mtstate->ps.ps_ExprContext; + + /* + * Build a projection for each result rel. + */ + resultRelInfo = mtstate->resultRelInfo; + foreach(l, node->returningLists) + { + List *rlist = (List *) lfirst(l); + + resultRelInfo->ri_returningList = rlist; + resultRelInfo->ri_projectReturning = + ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, + resultRelInfo->ri_RelationDesc->rd_att); + resultRelInfo++; + } + } + else + { + /* + * We still must construct a dummy result tuple type, because InitPlan + * expects one (maybe should change that?). + */ + mtstate->ps.plan->targetlist = NIL; + ExecInitResultTypeTL(&mtstate->ps); + + mtstate->ps.ps_ExprContext = NULL; + } + + /* Set the list of arbiter indexes if needed for ON CONFLICT */ + resultRelInfo = mtstate->resultRelInfo; + if (node->onConflictAction != ONCONFLICT_NONE) + { + /* insert may only have one relation, inheritance is not expanded */ + Assert(nrels == 1); + resultRelInfo->ri_onConflictArbiterIndexes = node->arbiterIndexes; + } + + /* + * If needed, Initialize target list, projection and qual for ON CONFLICT + * DO UPDATE. 
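+ *
+ * For example (names illustrative), a statement such as
+ *		INSERT INTO t (k, v) VALUES (1, 'x')
+ *			ON CONFLICT (k) DO UPDATE SET v = EXCLUDED.v
+ *			WHERE t.v IS DISTINCT FROM EXCLUDED.v;
+ * takes this path: onConflictSet supplies the SET projection built below,
+ * and onConflictWhere the qual later evaluated by ExecOnConflictUpdate().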
+ */ + if (node->onConflictAction == ONCONFLICT_UPDATE) + { + OnConflictSetState *onconfl = makeNode(OnConflictSetState); + ExprContext *econtext; + TupleDesc relationDesc; + + /* already exists if created by RETURNING processing above */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + + econtext = mtstate->ps.ps_ExprContext; + relationDesc = resultRelInfo->ri_RelationDesc->rd_att; + + /* create state for DO UPDATE SET operation */ + resultRelInfo->ri_onConflict = onconfl; + + /* initialize slot for the existing tuple */ + onconfl->oc_Existing = + table_slot_create(resultRelInfo->ri_RelationDesc, + &mtstate->ps.state->es_tupleTable); + + /* + * Create the tuple slot for the UPDATE SET projection. We want a slot + * of the table's type here, because the slot will be used to insert + * into the table, and for RETURNING processing - which may access + * system attributes. + */ + onconfl->oc_ProjSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &mtstate->ps.state->es_tupleTable); + + /* build UPDATE SET projection state */ + onconfl->oc_ProjInfo = + ExecBuildUpdateProjection(node->onConflictSet, + true, + node->onConflictCols, + relationDesc, + econtext, + onconfl->oc_ProjSlot, + &mtstate->ps); + + /* initialize state to evaluate the WHERE clause, if any */ + if (node->onConflictWhere) + { + ExprState *qualexpr; + + qualexpr = ExecInitQual((List *) node->onConflictWhere, + &mtstate->ps); + onconfl->oc_WhereClause = qualexpr; + } + } + + /* + * If we have any secondary relations in an UPDATE or DELETE, they need to + * be treated like non-locked relations in SELECT FOR UPDATE, ie, the + * EvalPlanQual mechanism needs to be told about them. Locate the + * relevant ExecRowMarks. + */ + arowmarks = NIL; + foreach(l, node->rowMarks) + { + PlanRowMark *rc = lfirst_node(PlanRowMark, l); + ExecRowMark *erm; + ExecAuxRowMark *aerm; + + /* ignore "parent" rowmarks; they are irrelevant at runtime */ + if (rc->isParent) + continue; + + /* Find ExecRowMark and build ExecAuxRowMark */ + erm = ExecFindRowMark(estate, rc->rti, false); + aerm = ExecBuildAuxRowMark(erm, subplan->targetlist); + arowmarks = lappend(arowmarks, aerm); + } + + EvalPlanQualSetPlan(&mtstate->mt_epqstate, subplan, arowmarks); + + /* + * If there are a lot of result relations, use a hash table to speed the + * lookups. If there are not a lot, a simple linear search is faster. + * + * It's not clear where the threshold is, but try 64 for starters. In a + * debugging build, use a small threshold so that we get some test + * coverage of both code paths. 
+ */ +#ifdef USE_ASSERT_CHECKING +#define MT_NRELS_HASH 4 +#else +#define MT_NRELS_HASH 64 +#endif + if (nrels >= MT_NRELS_HASH) + { + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(MTTargetRelLookup); + hash_ctl.hcxt = CurrentMemoryContext; + mtstate->mt_resultOidHash = + hash_create("ModifyTable target hash", + nrels, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + for (i = 0; i < nrels; i++) + { + Oid hashkey; + MTTargetRelLookup *mtlookup; + bool found; + + resultRelInfo = &mtstate->resultRelInfo[i]; + hashkey = RelationGetRelid(resultRelInfo->ri_RelationDesc); + mtlookup = (MTTargetRelLookup *) + hash_search(mtstate->mt_resultOidHash, &hashkey, + HASH_ENTER, &found); + Assert(!found); + mtlookup->relationIndex = i; + } + } + else + mtstate->mt_resultOidHash = NULL; + + /* + * Determine if the FDW supports batch insert and determine the batch size + * (a FDW may support batching, but it may be disabled for the + * server/table). + * + * We only do this for INSERT, so that for UPDATE/DELETE the batch size + * remains set to 0. + */ + if (operation == CMD_INSERT) + { + /* insert may only have one relation, inheritance is not expanded */ + Assert(nrels == 1); + resultRelInfo = mtstate->resultRelInfo; + if (!resultRelInfo->ri_usesFdwDirectModify && + resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize && + resultRelInfo->ri_FdwRoutine->ExecForeignBatchInsert) + { + resultRelInfo->ri_BatchSize = + resultRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(resultRelInfo); + Assert(resultRelInfo->ri_BatchSize >= 1); + } + else + resultRelInfo->ri_BatchSize = 1; + } + + /* + * Lastly, if this is not the primary (canSetTag) ModifyTable node, add it + * to estate->es_auxmodifytables so that it will be run to completion by + * ExecPostprocessPlan. (It'd actually work fine to add the primary + * ModifyTable node too, but there's no need.) Note the use of lcons not + * lappend: we need later-initialized ModifyTable nodes to be shut down + * before earlier ones. This ensures that we don't throw away RETURNING + * rows that need to be seen by a later CTE subplan. + */ + if (!mtstate->canSetTag) + estate->es_auxmodifytables = lcons(mtstate, + estate->es_auxmodifytables); + + return mtstate; +} + +/* ---------------------------------------------------------------- + * ExecEndModifyTable + * + * Shuts down the plan. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndModifyTable(ModifyTableState *node) +{ + int i; + + /* + * Allow any FDWs to shut down + */ + for (i = 0; i < node->mt_nrels; i++) + { + int j; + ResultRelInfo *resultRelInfo = node->resultRelInfo + i; + + if (!resultRelInfo->ri_usesFdwDirectModify && + resultRelInfo->ri_FdwRoutine != NULL && + resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignModify(node->ps.state, + resultRelInfo); + + /* + * Cleanup the initialized batch slots. This only matters for FDWs + * with batching, but the other cases will have ri_NumSlotsInitialized + * == 0. + */ + for (j = 0; j < resultRelInfo->ri_NumSlotsInitialized; j++) + { + ExecDropSingleTupleTableSlot(resultRelInfo->ri_Slots[j]); + ExecDropSingleTupleTableSlot(resultRelInfo->ri_PlanSlots[j]); + } + } + + /* + * Close all the partitioned tables, leaf partitions, and their indices + * and release the slot used for tuple routing, if set. 
+ */ + if (node->mt_partition_tuple_routing) + { + ExecCleanupTupleRouting(node, node->mt_partition_tuple_routing); + + if (node->mt_root_tuple_slot) + ExecDropSingleTupleTableSlot(node->mt_root_tuple_slot); + } + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + if (node->ps.ps_ResultTupleSlot) + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * Terminate EPQ execution if active + */ + EvalPlanQualEnd(&node->mt_epqstate); + + /* + * shut down subplan + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanModifyTable(ModifyTableState *node) +{ + /* + * Currently, we don't need to support rescan on ModifyTable nodes. The + * semantics of that would be a bit debatable anyway. + */ + elog(ERROR, "ExecReScanModifyTable is not implemented"); +} diff --git a/src/backend/executor/nodeNamedtuplestorescan.c b/src/backend/executor/nodeNamedtuplestorescan.c new file mode 100644 index 0000000..c0d1069 --- /dev/null +++ b/src/backend/executor/nodeNamedtuplestorescan.c @@ -0,0 +1,201 @@ +/*------------------------------------------------------------------------- + * + * nodeNamedtuplestorescan.c + * routines to handle NamedTuplestoreScan nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeNamedtuplestorescan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeNamedtuplestorescan.h" +#include "miscadmin.h" +#include "utils/queryenvironment.h" + +static TupleTableSlot *NamedTuplestoreScanNext(NamedTuplestoreScanState *node); + +/* ---------------------------------------------------------------- + * NamedTuplestoreScanNext + * + * This is a workhorse for ExecNamedTuplestoreScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +NamedTuplestoreScanNext(NamedTuplestoreScanState *node) +{ + TupleTableSlot *slot; + + /* We intentionally do not support backward scan. */ + Assert(ScanDirectionIsForward(node->ss.ps.state->es_direction)); + + /* + * Get the next tuple from tuplestore. Return NULL if no more tuples. + */ + slot = node->ss.ss_ScanTupleSlot; + tuplestore_select_read_pointer(node->relation, node->readptr); + (void) tuplestore_gettupleslot(node->relation, true, false, slot); + return slot; +} + +/* + * NamedTuplestoreScanRecheck -- access method routine to recheck a tuple in + * EvalPlanQual + */ +static bool +NamedTuplestoreScanRecheck(NamedTuplestoreScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecNamedTuplestoreScan(node) + * + * Scans the CTE sequentially and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
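+ *		(Specifically, NamedTuplestoreScanNext as the access method and
+ *		NamedTuplestoreScanRecheck as the recheck method, as seen below.)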
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecNamedTuplestoreScan(PlanState *pstate) +{ + NamedTuplestoreScanState *node = castNode(NamedTuplestoreScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) NamedTuplestoreScanNext, + (ExecScanRecheckMtd) NamedTuplestoreScanRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitNamedTuplestoreScan + * ---------------------------------------------------------------- + */ +NamedTuplestoreScanState * +ExecInitNamedTuplestoreScan(NamedTuplestoreScan *node, EState *estate, int eflags) +{ + NamedTuplestoreScanState *scanstate; + EphemeralNamedRelation enr; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * NamedTuplestoreScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new NamedTuplestoreScanState for node + */ + scanstate = makeNode(NamedTuplestoreScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecNamedTuplestoreScan; + + enr = get_ENR(estate->es_queryEnv, node->enrname); + if (!enr) + elog(ERROR, "executor could not find named tuplestore \"%s\"", + node->enrname); + + Assert(enr->reldata); + scanstate->relation = (Tuplestorestate *) enr->reldata; + scanstate->tupdesc = ENRMetadataGetTupDesc(&(enr->md)); + scanstate->readptr = + tuplestore_alloc_read_pointer(scanstate->relation, EXEC_FLAG_REWIND); + + /* + * The new read pointer copies its position from read pointer 0, which + * could be anywhere, so explicitly rewind it. + */ + tuplestore_select_read_pointer(scanstate->relation, scanstate->readptr); + tuplestore_rescan(scanstate->relation); + + /* + * XXX: Should we add a function to free that read pointer when done? + * + * This was attempted, but it did not improve performance or memory usage + * in any tested cases. + */ + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * The scan tuple type is specified for the tuplestore. + */ + ExecInitScanTupleSlot(estate, &scanstate->ss, scanstate->tupdesc, + &TTSOpsMinimalTuple); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndNamedTuplestoreScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndNamedTuplestoreScan(NamedTuplestoreScanState *node) +{ + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanNamedTuplestoreScan + * + * Rescans the relation. 
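+ *		(This only rewinds this node's private read pointer; the
+ *		underlying tuplestore's contents are left untouched.)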
+ * ---------------------------------------------------------------- + */ +void +ExecReScanNamedTuplestoreScan(NamedTuplestoreScanState *node) +{ + Tuplestorestate *tuplestorestate = node->relation; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + /* + * Rewind my own pointer. + */ + tuplestore_select_read_pointer(tuplestorestate, node->readptr); + tuplestore_rescan(tuplestorestate); +} diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c new file mode 100644 index 0000000..41e5eca --- /dev/null +++ b/src/backend/executor/nodeNestloop.c @@ -0,0 +1,411 @@ +/*------------------------------------------------------------------------- + * + * nodeNestloop.c + * routines to support nest-loop joins + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeNestloop.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecNestLoop - process a nestloop join of two plans + * ExecInitNestLoop - initialize the join + * ExecEndNestLoop - shut down the join + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeNestloop.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecNestLoop(node) + * + * old comments + * Returns the tuple joined from inner and outer tuples which + * satisfies the qualification clause. + * + * It scans the inner relation to join with current outer tuple. + * + * If none is found, next tuple from the outer relation is retrieved + * and the inner relation is scanned from the beginning again to join + * with the outer tuple. + * + * NULL is returned if all the remaining outer tuples are tried and + * all fail to join with the inner tuples. + * + * NULL is also returned if there is no tuple from inner relation. + * + * Conditions: + * -- outerTuple contains current tuple from outer relation and + * the right son(inner relation) maintains "cursor" at the tuple + * returned previously. + * This is achieved by maintaining a scan position on the outer + * relation. + * + * Initial States: + * -- the outer child and the inner child + * are prepared to return the first tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecNestLoop(PlanState *pstate) +{ + NestLoopState *node = castNode(NestLoopState, pstate); + NestLoop *nl; + PlanState *innerPlan; + PlanState *outerPlan; + TupleTableSlot *outerTupleSlot; + TupleTableSlot *innerTupleSlot; + ExprState *joinqual; + ExprState *otherqual; + ExprContext *econtext; + ListCell *lc; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + ENL1_printf("getting info from node"); + + nl = (NestLoop *) node->js.ps.plan; + joinqual = node->js.joinqual; + otherqual = node->js.ps.qual; + outerPlan = outerPlanState(node); + innerPlan = innerPlanState(node); + econtext = node->js.ps.ps_ExprContext; + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * Ok, everything is setup for the join so now loop until we return a + * qualifying join tuple. 
+ */ + ENL1_printf("entering main loop"); + + for (;;) + { + /* + * If we don't have an outer tuple, get the next one and reset the + * inner scan. + */ + if (node->nl_NeedNewOuter) + { + ENL1_printf("getting new outer tuple"); + outerTupleSlot = ExecProcNode(outerPlan); + + /* + * if there are no more outer tuples, then the join is complete.. + */ + if (TupIsNull(outerTupleSlot)) + { + ENL1_printf("no outer tuple, ending join"); + return NULL; + } + + ENL1_printf("saving new outer tuple information"); + econtext->ecxt_outertuple = outerTupleSlot; + node->nl_NeedNewOuter = false; + node->nl_MatchedOuter = false; + + /* + * fetch the values of any outer Vars that must be passed to the + * inner scan, and store them in the appropriate PARAM_EXEC slots. + */ + foreach(lc, nl->nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); + int paramno = nlp->paramno; + ParamExecData *prm; + + prm = &(econtext->ecxt_param_exec_vals[paramno]); + /* Param value should be an OUTER_VAR var */ + Assert(IsA(nlp->paramval, Var)); + Assert(nlp->paramval->varno == OUTER_VAR); + Assert(nlp->paramval->varattno > 0); + prm->value = slot_getattr(outerTupleSlot, + nlp->paramval->varattno, + &(prm->isnull)); + /* Flag parameter value as changed */ + innerPlan->chgParam = bms_add_member(innerPlan->chgParam, + paramno); + } + + /* + * now rescan the inner plan + */ + ENL1_printf("rescanning inner plan"); + ExecReScan(innerPlan); + } + + /* + * we have an outerTuple, try to get the next inner tuple. + */ + ENL1_printf("getting new inner tuple"); + + innerTupleSlot = ExecProcNode(innerPlan); + econtext->ecxt_innertuple = innerTupleSlot; + + if (TupIsNull(innerTupleSlot)) + { + ENL1_printf("no inner tuple, need new outer tuple"); + + node->nl_NeedNewOuter = true; + + if (!node->nl_MatchedOuter && + (node->js.jointype == JOIN_LEFT || + node->js.jointype == JOIN_ANTI)) + { + /* + * We are doing an outer join and there were no join matches + * for this outer tuple. Generate a fake join tuple with + * nulls for the inner tuple, and return it if it passes the + * non-join quals. + */ + econtext->ecxt_innertuple = node->nl_NullInnerTupleSlot; + + ENL1_printf("testing qualification for outer-join tuple"); + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return + * the slot containing the result tuple using + * ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + + /* + * Otherwise just return to top of loop for a new outer tuple. + */ + continue; + } + + /* + * at this point we have a new pair of inner and outer tuples so we + * test the inner and outer tuples to see if they satisfy the node's + * qualification. + * + * Only the joinquals determine MatchedOuter status, but all quals + * must pass to actually return the tuple. + */ + ENL1_printf("testing qualification"); + + if (ExecQual(joinqual, econtext)) + { + node->nl_MatchedOuter = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->nl_NeedNewOuter = true; + continue; /* return to top of loop */ + } + + /* + * If we only need to join to the first matching inner tuple, then + * consider returning this one, but after that continue with next + * outer tuple. 
+ */ + if (node->js.single_match) + node->nl_NeedNewOuter = true; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return the + * slot containing the result tuple using ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + + /* + * Tuple fails qual, so free per-tuple memory and try again. + */ + ResetExprContext(econtext); + + ENL1_printf("qualification failed, looping"); + } +} + +/* ---------------------------------------------------------------- + * ExecInitNestLoop + * ---------------------------------------------------------------- + */ +NestLoopState * +ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) +{ + NestLoopState *nlstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + NL1_printf("ExecInitNestLoop: %s\n", + "initializing node"); + + /* + * create state structure + */ + nlstate = makeNode(NestLoopState); + nlstate->js.ps.plan = (Plan *) node; + nlstate->js.ps.state = estate; + nlstate->js.ps.ExecProcNode = ExecNestLoop; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &nlstate->js.ps); + + /* + * initialize child nodes + * + * If we have no parameters to pass into the inner rel from the outer, + * tell the inner child that cheap rescans would be good. If we do have + * such parameters, then there is no point in REWIND support at all in the + * inner child, because it will always be rescanned with fresh parameter + * values. + */ + outerPlanState(nlstate) = ExecInitNode(outerPlan(node), estate, eflags); + if (node->nestParams == NIL) + eflags |= EXEC_FLAG_REWIND; + else + eflags &= ~EXEC_FLAG_REWIND; + innerPlanState(nlstate) = ExecInitNode(innerPlan(node), estate, eflags); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&nlstate->js.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&nlstate->js.ps, NULL); + + /* + * initialize child expressions + */ + nlstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); + nlstate->js.jointype = node->join.jointype; + nlstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) nlstate); + + /* + * detect whether we need only consider the first matching inner tuple + */ + nlstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + break; + case JOIN_LEFT: + case JOIN_ANTI: + nlstate->nl_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(nlstate)), + &TTSOpsVirtual); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * finally, wipe the current outer tuple clean. 
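Stripped of slots, quals and parameter passing, the loop in ExecNestLoop() above is the textbook nested-loop join. The standalone sketch below works over integer arrays with a made-up JoinKindSketch enum; it only illustrates how the matched-outer flag drives LEFT, SEMI (single_match) and ANTI behaviour, and is not the executor's API:

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum
    {
        SKETCH_JOIN_INNER, SKETCH_JOIN_LEFT, SKETCH_JOIN_SEMI, SKETCH_JOIN_ANTI
    } JoinKindSketch;

    static void
    nestloop_join(const int *outer, int nouter,
                  const int *inner, int ninner,
                  JoinKindSketch kind)
    {
        for (int i = 0; i < nouter; i++)
        {
            bool        matched = false;

            /* "rescan" the inner side from the start for every outer row */
            for (int j = 0; j < ninner; j++)
            {
                if (outer[i] != inner[j])       /* the join qual */
                    continue;
                matched = true;
                if (kind == SKETCH_JOIN_ANTI)
                    break;                      /* anti join never emits matches */
                printf("(%d, %d)\n", outer[i], inner[j]);
                if (kind == SKETCH_JOIN_SEMI)
                    break;                      /* single_match: first hit suffices */
            }
            if (!matched && kind == SKETCH_JOIN_LEFT)
                printf("(%d, NULL)\n", outer[i]);   /* null-extended inner side */
            if (!matched && kind == SKETCH_JOIN_ANTI)
                printf("(%d)\n", outer[i]);
        }
    }

    int
    main(void)
    {
        int         outer[] = {1, 2, 3};
        int         inner[] = {2, 2, 3};

        nestloop_join(outer, 3, inner, 3, SKETCH_JOIN_LEFT);
        return 0;
    }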
+ */ + nlstate->nl_NeedNewOuter = true; + nlstate->nl_MatchedOuter = false; + + NL1_printf("ExecInitNestLoop: %s\n", + "node initialized"); + + return nlstate; +} + +/* ---------------------------------------------------------------- + * ExecEndNestLoop + * + * closes down scans and frees allocated storage + * ---------------------------------------------------------------- + */ +void +ExecEndNestLoop(NestLoopState *node) +{ + NL1_printf("ExecEndNestLoop: %s\n", + "ending node processing"); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->js.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->js.ps.ps_ResultTupleSlot); + + /* + * close down subplans + */ + ExecEndNode(outerPlanState(node)); + ExecEndNode(innerPlanState(node)); + + NL1_printf("ExecEndNestLoop: %s\n", + "node processing ended"); +} + +/* ---------------------------------------------------------------- + * ExecReScanNestLoop + * ---------------------------------------------------------------- + */ +void +ExecReScanNestLoop(NestLoopState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * If outerPlan->chgParam is not null then plan will be automatically + * re-scanned by first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + /* + * innerPlan is re-scanned for each new outer tuple and MUST NOT be + * re-scanned from here or you'll get troubles from inner index scans when + * outer Vars are used as run-time keys... + */ + + node->nl_NeedNewOuter = true; + node->nl_MatchedOuter = false; +} diff --git a/src/backend/executor/nodeProjectSet.c b/src/backend/executor/nodeProjectSet.c new file mode 100644 index 0000000..07be814 --- /dev/null +++ b/src/backend/executor/nodeProjectSet.c @@ -0,0 +1,351 @@ +/*------------------------------------------------------------------------- + * + * nodeProjectSet.c + * support for evaluating targetlists containing set-returning functions + * + * DESCRIPTION + * + * ProjectSet nodes are inserted by the planner to evaluate set-returning + * functions in the targetlist. It's guaranteed that all set-returning + * functions are directly at the top level of the targetlist, i.e. they + * can't be inside more-complex expressions. If that'd otherwise be + * the case, the planner adds additional ProjectSet nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeProjectSet.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeProjectSet.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "utils/memutils.h" + + +static TupleTableSlot *ExecProjectSRF(ProjectSetState *node, bool continuing); + + +/* ---------------------------------------------------------------- + * ExecProjectSet(node) + * + * Return tuples after evaluating the targetlist (which contains set + * returning functions). 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecProjectSet(PlanState *pstate) +{ + ProjectSetState *node = castNode(ProjectSetState, pstate); + TupleTableSlot *outerTupleSlot; + TupleTableSlot *resultSlot; + PlanState *outerPlan; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + econtext = node->ps.ps_ExprContext; + + /* + * Reset per-tuple context to free expression-evaluation storage allocated + * for a potentially previously returned tuple. Note that the SRF argument + * context has a different lifetime and is reset below. + */ + ResetExprContext(econtext); + + /* + * Check to see if we're still projecting out tuples from a previous scan + * tuple (because there is a function-returning-set in the projection + * expressions). If so, try to project another one. + */ + if (node->pending_srf_tuples) + { + resultSlot = ExecProjectSRF(node, true); + + if (resultSlot != NULL) + return resultSlot; + } + + /* + * Reset argument context to free any expression evaluation storage + * allocated in the previous tuple cycle. Note this can't happen until + * we're done projecting out tuples from a scan tuple, as ValuePerCall + * functions are allowed to reference the arguments for each returned + * tuple. + */ + MemoryContextReset(node->argcontext); + + /* + * Get another input tuple and project SRFs from it. + */ + for (;;) + { + /* + * Retrieve tuples from the outer plan until there are no more. + */ + outerPlan = outerPlanState(node); + outerTupleSlot = ExecProcNode(outerPlan); + + if (TupIsNull(outerTupleSlot)) + return NULL; + + /* + * Prepare to compute projection expressions, which will expect to + * access the input tuples as varno OUTER. + */ + econtext->ecxt_outertuple = outerTupleSlot; + + /* Evaluate the expressions */ + resultSlot = ExecProjectSRF(node, false); + + /* + * Return the tuple unless the projection produced no rows (due to an + * empty set), in which case we must loop back to see if there are + * more outerPlan tuples. + */ + if (resultSlot) + return resultSlot; + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecProjectSRF + * + * Project a targetlist containing one or more set-returning functions. + * + * 'continuing' indicates whether to continue projecting rows for the + * same input tuple; or whether a new input tuple is being projected. + * + * Returns NULL if no output tuple has been produced. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecProjectSRF(ProjectSetState *node, bool continuing) +{ + TupleTableSlot *resultSlot = node->ps.ps_ResultTupleSlot; + ExprContext *econtext = node->ps.ps_ExprContext; + MemoryContext oldcontext; + bool hassrf PG_USED_FOR_ASSERTS_ONLY; + bool hasresult; + int argno; + + ExecClearTuple(resultSlot); + + /* Call SRFs, as well as plain expressions, in per-tuple context */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Assume no further tuples are produced unless an ExprMultipleResult is + * encountered from a set returning function. 
+ */ + node->pending_srf_tuples = false; + + hassrf = hasresult = false; + for (argno = 0; argno < node->nelems; argno++) + { + Node *elem = node->elems[argno]; + ExprDoneCond *isdone = &node->elemdone[argno]; + Datum *result = &resultSlot->tts_values[argno]; + bool *isnull = &resultSlot->tts_isnull[argno]; + + if (continuing && *isdone == ExprEndResult) + { + /* + * If we're continuing to project output rows from a source tuple, + * return NULLs once the SRF has been exhausted. + */ + *result = (Datum) 0; + *isnull = true; + hassrf = true; + } + else if (IsA(elem, SetExprState)) + { + /* + * Evaluate SRF - possibly continuing previously started output. + */ + *result = ExecMakeFunctionResultSet((SetExprState *) elem, + econtext, node->argcontext, + isnull, isdone); + + if (*isdone != ExprEndResult) + hasresult = true; + if (*isdone == ExprMultipleResult) + node->pending_srf_tuples = true; + hassrf = true; + } + else + { + /* Non-SRF tlist expression, just evaluate normally. */ + *result = ExecEvalExpr((ExprState *) elem, econtext, isnull); + *isdone = ExprSingleResult; + } + } + + MemoryContextSwitchTo(oldcontext); + + /* ProjectSet should not be used if there's no SRFs */ + Assert(hassrf); + + /* + * If all the SRFs returned ExprEndResult, we consider that as no row + * being produced. + */ + if (hasresult) + { + ExecStoreVirtualTuple(resultSlot); + return resultSlot; + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitProjectSet + * + * Creates the run-time state information for the ProjectSet node + * produced by the planner and initializes outer relations + * (child nodes). + * ---------------------------------------------------------------- + */ +ProjectSetState * +ExecInitProjectSet(ProjectSet *node, EState *estate, int eflags) +{ + ProjectSetState *state; + ListCell *lc; + int off; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD))); + + /* + * create state structure + */ + state = makeNode(ProjectSetState); + state->ps.plan = (Plan *) node; + state->ps.state = estate; + state->ps.ExecProcNode = ExecProjectSet; + + state->pending_srf_tuples = false; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &state->ps); + + /* + * initialize child nodes + */ + outerPlanState(state) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * we don't use inner plan + */ + Assert(innerPlan(node) == NULL); + + /* + * tuple table and result type initialization + */ + ExecInitResultTupleSlotTL(&state->ps, &TTSOpsVirtual); + + /* Create workspace for per-tlist-entry expr state & SRF-is-done state */ + state->nelems = list_length(node->plan.targetlist); + state->elems = (Node **) + palloc(sizeof(Node *) * state->nelems); + state->elemdone = (ExprDoneCond *) + palloc(sizeof(ExprDoneCond) * state->nelems); + + /* + * Build expressions to evaluate targetlist. We can't use + * ExecBuildProjectionInfo here, since that doesn't deal with SRFs. + * Instead compile each expression separately, using + * ExecInitFunctionResultSet where applicable. 
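The lockstep behaviour implemented by ExecProjectSRF() above means that, for example, SELECT generate_series(1, 2), generate_series(1, 3) produces three rows, with the exhausted SRF padded by NULLs. A simplified standalone sketch, with a hypothetical Counter type standing in for a SetExprState:

    #include <stdio.h>
    #include <stdbool.h>

    /* A stand-in for a SetExprState: yields 1..limit, then reports it is done. */
    typedef struct Counter
    {
        int         limit;
        int         next;
    } Counter;

    static bool
    counter_next(Counter *c, int *value)
    {
        if (c->next > c->limit)
            return false;               /* plays the role of ExprEndResult */
        *value = c->next++;
        return true;                    /* plays the role of ExprMultipleResult */
    }

    int
    main(void)
    {
        Counter     srfs[2] = {{2, 1}, {3, 1}}; /* like gs(1,2), gs(1,3) */
        bool        pending = true;

        while (pending)
        {
            int         values[2];
            bool        isnull[2];
            bool        hasresult = false;

            pending = false;
            for (int i = 0; i < 2; i++)
            {
                if (counter_next(&srfs[i], &values[i]))
                {
                    isnull[i] = false;
                    hasresult = true;
                    pending = true;     /* keep projecting from this input tuple */
                }
                else
                    isnull[i] = true;   /* exhausted SRF is padded with NULL */
            }
            if (hasresult)              /* all-ExprEndResult means no row at all */
            {
                for (int i = 0; i < 2; i++)
                {
                    if (isnull[i])
                        printf("NULL ");
                    else
                        printf("%d ", values[i]);
                }
                printf("\n");
            }
        }
        return 0;
    }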
+ */ + off = 0; + foreach(lc, node->plan.targetlist) + { + TargetEntry *te = (TargetEntry *) lfirst(lc); + Expr *expr = te->expr; + + if ((IsA(expr, FuncExpr) && ((FuncExpr *) expr)->funcretset) || + (IsA(expr, OpExpr) && ((OpExpr *) expr)->opretset)) + { + state->elems[off] = (Node *) + ExecInitFunctionResultSet(expr, state->ps.ps_ExprContext, + &state->ps); + } + else + { + Assert(!expression_returns_set((Node *) expr)); + state->elems[off] = (Node *) ExecInitExpr(expr, &state->ps); + } + + off++; + } + + /* We don't support any qual on ProjectSet nodes */ + Assert(node->plan.qual == NIL); + + /* + * Create a memory context that ExecMakeFunctionResultSet can use to + * evaluate function arguments in. We can't use the per-tuple context for + * this because it gets reset too often; but we don't want to leak + * evaluation results into the query-lifespan context either. We use one + * context for the arguments of all tSRFs, as they have roughly equivalent + * lifetimes. + */ + state->argcontext = AllocSetContextCreate(CurrentMemoryContext, + "tSRF function arguments", + ALLOCSET_DEFAULT_SIZES); + + return state; +} + +/* ---------------------------------------------------------------- + * ExecEndProjectSet + * + * frees up storage allocated through C routines + * ---------------------------------------------------------------- + */ +void +ExecEndProjectSet(ProjectSetState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * shut down subplans + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanProjectSet(ProjectSetState *node) +{ + /* Forget any incompletely-evaluated SRFs */ + node->pending_srf_tuples = false; + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c new file mode 100644 index 0000000..f9e91fd --- /dev/null +++ b/src/backend/executor/nodeRecursiveunion.c @@ -0,0 +1,331 @@ +/*------------------------------------------------------------------------- + * + * nodeRecursiveunion.c + * routines to handle RecursiveUnion nodes. + * + * To implement UNION (without ALL), we need a hashtable that stores tuples + * already seen. The hash key is computed from the grouping columns. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeRecursiveunion.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeRecursiveunion.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + + +/* + * Initialize the hash table to empty. 
+ */ +static void +build_hash_table(RecursiveUnionState *rustate) +{ + RecursiveUnion *node = (RecursiveUnion *) rustate->ps.plan; + TupleDesc desc = ExecGetResultType(outerPlanState(rustate)); + + Assert(node->numCols > 0); + Assert(node->numGroups > 0); + + rustate->hashtable = BuildTupleHashTableExt(&rustate->ps, + desc, + node->numCols, + node->dupColIdx, + rustate->eqfuncoids, + rustate->hashfunctions, + node->dupCollations, + node->numGroups, + 0, + rustate->ps.state->es_query_cxt, + rustate->tableContext, + rustate->tempContext, + false); +} + + +/* ---------------------------------------------------------------- + * ExecRecursiveUnion(node) + * + * Scans the recursive query sequentially and returns the next + * qualifying tuple. + * + * 1. evaluate non recursive term and assign the result to RT + * + * 2. execute recursive terms + * + * 2.1 WT := RT + * 2.2 while WT is not empty repeat 2.3 to 2.6. if WT is empty returns RT + * 2.3 replace the name of recursive term with WT + * 2.4 evaluate the recursive term and store into WT + * 2.5 append WT to RT + * 2.6 go back to 2.2 + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecRecursiveUnion(PlanState *pstate) +{ + RecursiveUnionState *node = castNode(RecursiveUnionState, pstate); + PlanState *outerPlan = outerPlanState(node); + PlanState *innerPlan = innerPlanState(node); + RecursiveUnion *plan = (RecursiveUnion *) node->ps.plan; + TupleTableSlot *slot; + bool isnew; + + CHECK_FOR_INTERRUPTS(); + + /* 1. Evaluate non-recursive term */ + if (!node->recursing) + { + for (;;) + { + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + break; + if (plan->numCols > 0) + { + /* Find or build hashtable entry for this tuple's group */ + LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL); + /* Must reset temp context after each hashtable lookup */ + MemoryContextReset(node->tempContext); + /* Ignore tuple if already seen */ + if (!isnew) + continue; + } + /* Each non-duplicate tuple goes to the working table ... */ + tuplestore_puttupleslot(node->working_table, slot); + /* ... and to the caller */ + return slot; + } + node->recursing = true; + } + + /* 2. Execute recursive term */ + for (;;) + { + slot = ExecProcNode(innerPlan); + if (TupIsNull(slot)) + { + /* Done if there's nothing in the intermediate table */ + if (node->intermediate_empty) + break; + + /* done with old working table ... */ + tuplestore_end(node->working_table); + + /* intermediate table becomes working table */ + node->working_table = node->intermediate_table; + + /* create new empty intermediate table */ + node->intermediate_table = tuplestore_begin_heap(false, false, + work_mem); + node->intermediate_empty = true; + + /* reset the recursive term */ + innerPlan->chgParam = bms_add_member(innerPlan->chgParam, + plan->wtParam); + + /* and continue fetching from recursive term */ + continue; + } + + if (plan->numCols > 0) + { + /* Find or build hashtable entry for this tuple's group */ + LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL); + /* Must reset temp context after each hashtable lookup */ + MemoryContextReset(node->tempContext); + /* Ignore tuple if already seen */ + if (!isnew) + continue; + } + + /* Else, tuple is good; stash it in intermediate table ... */ + node->intermediate_empty = false; + tuplestore_puttupleslot(node->intermediate_table, slot); + /* ... 
and return it */ + return slot; + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitRecursiveUnion + * ---------------------------------------------------------------- + */ +RecursiveUnionState * +ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags) +{ + RecursiveUnionState *rustate; + ParamExecData *prmdata; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + rustate = makeNode(RecursiveUnionState); + rustate->ps.plan = (Plan *) node; + rustate->ps.state = estate; + rustate->ps.ExecProcNode = ExecRecursiveUnion; + + rustate->eqfuncoids = NULL; + rustate->hashfunctions = NULL; + rustate->hashtable = NULL; + rustate->tempContext = NULL; + rustate->tableContext = NULL; + + /* initialize processing state */ + rustate->recursing = false; + rustate->intermediate_empty = true; + rustate->working_table = tuplestore_begin_heap(false, false, work_mem); + rustate->intermediate_table = tuplestore_begin_heap(false, false, work_mem); + + /* + * If hashing, we need a per-tuple memory context for comparisons, and a + * longer-lived context to store the hash table. The table can't just be + * kept in the per-query context because we want to be able to throw it + * away when rescanning. + */ + if (node->numCols > 0) + { + rustate->tempContext = + AllocSetContextCreate(CurrentMemoryContext, + "RecursiveUnion", + ALLOCSET_DEFAULT_SIZES); + rustate->tableContext = + AllocSetContextCreate(CurrentMemoryContext, + "RecursiveUnion hash table", + ALLOCSET_DEFAULT_SIZES); + } + + /* + * Make the state structure available to descendant WorkTableScan nodes + * via the Param slot reserved for it. + */ + prmdata = &(estate->es_param_exec_vals[node->wtParam]); + Assert(prmdata->execPlan == NULL); + prmdata->value = PointerGetDatum(rustate); + prmdata->isnull = false; + + /* + * Miscellaneous initialization + * + * RecursiveUnion plans don't have expression contexts because they never + * call ExecQual or ExecProject. + */ + Assert(node->plan.qual == NIL); + + /* + * RecursiveUnion nodes still have Result slots, which hold pointers to + * tuples, so we have to initialize them. + */ + ExecInitResultTypeTL(&rustate->ps); + + /* + * Initialize result tuple type. (Note: we have to set up the result type + * before initializing child nodes, because nodeWorktablescan.c expects it + * to be valid.) + */ + rustate->ps.ps_ProjInfo = NULL; + + /* + * initialize child nodes + */ + outerPlanState(rustate) = ExecInitNode(outerPlan(node), estate, eflags); + innerPlanState(rustate) = ExecInitNode(innerPlan(node), estate, eflags); + + /* + * If hashing, precompute fmgr lookup data for inner loop, and create the + * hash table. + */ + if (node->numCols > 0) + { + execTuplesHashPrepare(node->numCols, + node->dupOperators, + &rustate->eqfuncoids, + &rustate->hashfunctions); + build_hash_table(rustate); + } + + return rustate; +} + +/* ---------------------------------------------------------------- + * ExecEndRecursiveUnion + * + * frees any storage allocated through C routines. 
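The working-table iteration described in the header comment of ExecRecursiveUnion() (steps 2.1 through 2.6) can be shown with ordinary arrays. The sketch below evaluates WITH RECURSIVE t(n) AS (VALUES (1) UNION SELECT n + 3 FROM t WHERE n < 10) SELECT * FROM t; the seen[] array plays the role of the duplicate-eliminating hash table and the two arrays stand in for the working and intermediate tuplestores:

    #include <stdio.h>
    #include <stdbool.h>
    #include <string.h>

    #define MAXVAL 100

    int
    main(void)
    {
        bool        seen[MAXVAL] = {false};
        int         working[MAXVAL], nworking = 0;
        int         intermediate[MAXVAL], nintermediate;

        /* 1. non-recursive term: emit it and seed the working table */
        seen[1] = true;
        working[nworking++] = 1;
        printf("%d\n", 1);

        /* 2. recursive term, repeated until the working table comes up empty */
        while (nworking > 0)
        {
            nintermediate = 0;
            for (int i = 0; i < nworking; i++)
            {
                int         n = working[i];

                if (n < 10)                 /* the recursive query */
                {
                    int         next = n + 3;

                    if (!seen[next])        /* duplicate elimination for UNION */
                    {
                        seen[next] = true;
                        intermediate[nintermediate++] = next;
                        printf("%d\n", next);
                    }
                }
            }
            /* steps 2.5/2.2: intermediate table becomes the new working table */
            memcpy(working, intermediate, nintermediate * sizeof(int));
            nworking = nintermediate;
        }
        return 0;
    }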
+ * ---------------------------------------------------------------- + */ +void +ExecEndRecursiveUnion(RecursiveUnionState *node) +{ + /* Release tuplestores */ + tuplestore_end(node->working_table); + tuplestore_end(node->intermediate_table); + + /* free subsidiary stuff including hashtable */ + if (node->tempContext) + MemoryContextDelete(node->tempContext); + if (node->tableContext) + MemoryContextDelete(node->tableContext); + + /* + * close down subplans + */ + ExecEndNode(outerPlanState(node)); + ExecEndNode(innerPlanState(node)); +} + +/* ---------------------------------------------------------------- + * ExecReScanRecursiveUnion + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanRecursiveUnion(RecursiveUnionState *node) +{ + PlanState *outerPlan = outerPlanState(node); + PlanState *innerPlan = innerPlanState(node); + RecursiveUnion *plan = (RecursiveUnion *) node->ps.plan; + + /* + * Set recursive term's chgParam to tell it that we'll modify the working + * table and therefore it has to rescan. + */ + innerPlan->chgParam = bms_add_member(innerPlan->chgParam, plan->wtParam); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Because of above, we only have to do this to the + * non-recursive term. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + + /* Release any hashtable storage */ + if (node->tableContext) + MemoryContextResetAndDeleteChildren(node->tableContext); + + /* Empty hashtable if needed */ + if (plan->numCols > 0) + ResetTupleHashTable(node->hashtable); + + /* reset processing state */ + node->recursing = false; + node->intermediate_empty = true; + tuplestore_clear(node->working_table); + tuplestore_clear(node->intermediate_table); +} diff --git a/src/backend/executor/nodeResult.c b/src/backend/executor/nodeResult.c new file mode 100644 index 0000000..0946af0 --- /dev/null +++ b/src/backend/executor/nodeResult.c @@ -0,0 +1,272 @@ +/*------------------------------------------------------------------------- + * + * nodeResult.c + * support for constant nodes needing special code. + * + * DESCRIPTION + * + * Result nodes are used in queries where no relations are scanned. + * Examples of such queries are: + * + * select 1 * 2 + * + * insert into emp values ('mike', 15000) + * + * (Remember that in an INSERT or UPDATE, we need a plan tree that + * generates the new rows.) + * + * Result nodes are also used to optimise queries with constant + * qualifications (ie, quals that do not depend on the scanned data), + * such as: + * + * select * from emp where 2 > 1 + * + * In this case, the plan generated is + * + * Result (with 2 > 1 qual) + * / + * SeqScan (emp.*) + * + * At runtime, the Result node evaluates the constant qual once, + * which is shown by EXPLAIN as a One-Time Filter. If it's + * false, we can return an empty result set without running the + * controlled plan at all. If it's true, we run the controlled + * plan normally and pass back the results. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeResult.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeResult.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecResult(node) + * + * returns the tuples from the outer plan which satisfy the + * qualification clause. Since result nodes with right + * subtrees are never planned, we ignore the right subtree + * entirely (for now).. -cim 10/7/89 + * + * The qualification containing only constant clauses are + * checked first before any processing is done. It always returns + * 'nil' if the constant qualification is not satisfied. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecResult(PlanState *pstate) +{ + ResultState *node = castNode(ResultState, pstate); + TupleTableSlot *outerTupleSlot; + PlanState *outerPlan; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + econtext = node->ps.ps_ExprContext; + + /* + * check constant qualifications like (2 > 1), if not already done + */ + if (node->rs_checkqual) + { + bool qualResult = ExecQual(node->resconstantqual, econtext); + + node->rs_checkqual = false; + if (!qualResult) + { + node->rs_done = true; + return NULL; + } + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * if rs_done is true then it means that we were asked to return a + * constant tuple and we already did the last time ExecResult() was + * called, OR that we failed the constant qual check. Either way, now we + * are through. + */ + if (!node->rs_done) + { + outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + { + /* + * retrieve tuples from the outer plan until there are no more. + */ + outerTupleSlot = ExecProcNode(outerPlan); + + if (TupIsNull(outerTupleSlot)) + return NULL; + + /* + * prepare to compute projection expressions, which will expect to + * access the input tuples as varno OUTER. + */ + econtext->ecxt_outertuple = outerTupleSlot; + } + else + { + /* + * if we don't have an outer plan, then we are just generating the + * results from a constant target list. Do it only once. 
+ */ + node->rs_done = true; + } + + /* form the result tuple using ExecProject(), and return it */ + return ExecProject(node->ps.ps_ProjInfo); + } + + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecResultMarkPos + * ---------------------------------------------------------------- + */ +void +ExecResultMarkPos(ResultState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + ExecMarkPos(outerPlan); + else + elog(DEBUG2, "Result nodes do not support mark/restore"); +} + +/* ---------------------------------------------------------------- + * ExecResultRestrPos + * ---------------------------------------------------------------- + */ +void +ExecResultRestrPos(ResultState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + ExecRestrPos(outerPlan); + else + elog(ERROR, "Result nodes do not support mark/restore"); +} + +/* ---------------------------------------------------------------- + * ExecInitResult + * + * Creates the run-time state information for the result node + * produced by the planner and initializes outer relations + * (child nodes). + * ---------------------------------------------------------------- + */ +ResultState * +ExecInitResult(Result *node, EState *estate, int eflags) +{ + ResultState *resstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)) || + outerPlan(node) != NULL); + + /* + * create state structure + */ + resstate = makeNode(ResultState); + resstate->ps.plan = (Plan *) node; + resstate->ps.state = estate; + resstate->ps.ExecProcNode = ExecResult; + + resstate->rs_done = false; + resstate->rs_checkqual = (node->resconstantqual == NULL) ? false : true; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &resstate->ps); + + /* + * initialize child nodes + */ + outerPlanState(resstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * we don't use inner plan + */ + Assert(innerPlan(node) == NULL); + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&resstate->ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&resstate->ps, NULL); + + /* + * initialize child expressions + */ + resstate->ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) resstate); + resstate->resconstantqual = + ExecInitQual((List *) node->resconstantqual, (PlanState *) resstate); + + return resstate; +} + +/* ---------------------------------------------------------------- + * ExecEndResult + * + * frees up storage allocated through C routines + * ---------------------------------------------------------------- + */ +void +ExecEndResult(ResultState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * shut down subplans + */ + ExecEndNode(outerPlanState(node)); +} + +void +ExecReScanResult(ResultState *node) +{ + node->rs_done = false; + node->rs_checkqual = (node->resconstantqual == NULL) ? false : true; + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
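The one-time filter in ExecResult() boils down to evaluating the constant qual a single time and only then consuming the child plan. A deliberately small sketch, where next_row() is a hypothetical stand-in for ExecProcNode() on the outer plan:

    #include <stdio.h>
    #include <stdbool.h>

    static int  rows[] = {10, 20, 30};
    static int  pos = 0;

    static bool
    next_row(int *out)
    {
        if (pos >= 3)
            return false;
        *out = rows[pos++];
        return true;
    }

    int
    main(void)
    {
        bool        one_time_filter = (2 > 1);  /* the constant qual */
        int         value;

        if (one_time_filter)                    /* checked once, not per row */
        {
            while (next_row(&value))
                printf("%d\n", value);
        }
        /* had the qual been false, the child would never have been run at all */
        return 0;
    }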
+ */ + if (node->ps.lefttree && + node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c new file mode 100644 index 0000000..44232d5 --- /dev/null +++ b/src/backend/executor/nodeSamplescan.c @@ -0,0 +1,378 @@ +/*------------------------------------------------------------------------- + * + * nodeSamplescan.c + * Support routines for sample scans of relations (table sampling). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSamplescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/tsmapi.h" +#include "executor/executor.h" +#include "executor/nodeSamplescan.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/builtins.h" +#include "utils/rel.h" + +static TupleTableSlot *SampleNext(SampleScanState *node); +static void tablesample_init(SampleScanState *scanstate); +static TupleTableSlot *tablesample_getnext(SampleScanState *scanstate); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * SampleNext + * + * This is a workhorse for ExecSampleScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SampleNext(SampleScanState *node) +{ + /* + * if this is first call within a scan, initialize + */ + if (!node->begun) + tablesample_init(node); + + /* + * get the next tuple, and store it in our result slot + */ + return tablesample_getnext(node); +} + +/* + * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SampleRecheck(SampleScanState *node, TupleTableSlot *slot) +{ + /* + * No need to recheck for SampleScan, since like SeqScan we don't pass any + * checkable keys to heap_beginscan. + */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecSampleScan(node) + * + * Scans the relation using the sampling method and returns + * the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSampleScan(PlanState *pstate) +{ + SampleScanState *node = castNode(SampleScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) SampleNext, + (ExecScanRecheckMtd) SampleRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitSampleScan + * ---------------------------------------------------------------- + */ +SampleScanState * +ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) +{ + SampleScanState *scanstate; + TableSampleClause *tsc = node->tablesample; + TsmRoutine *tsm; + + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + scanstate = makeNode(SampleScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecSampleScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation + */ + scanstate->ss.ss_currentRelation = + ExecOpenScanRelation(estate, + node->scan.scanrelid, + eflags); + + /* we won't set up the HeapScanDesc till later */ + scanstate->ss.ss_currentScanDesc = NULL; + + /* and create slot with appropriate rowtype */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + RelationGetDescr(scanstate->ss.ss_currentRelation), + table_slot_callbacks(scanstate->ss.ss_currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + scanstate->args = ExecInitExprList(tsc->args, (PlanState *) scanstate); + scanstate->repeatable = + ExecInitExpr(tsc->repeatable, (PlanState *) scanstate); + + /* + * If we don't have a REPEATABLE clause, select a random seed. We want to + * do this just once, since the seed shouldn't change over rescans. + */ + if (tsc->repeatable == NULL) + scanstate->seed = random(); + + /* + * Finally, initialize the TABLESAMPLE method handler. + */ + tsm = GetTsmRoutine(tsc->tsmhandler); + scanstate->tsmroutine = tsm; + scanstate->tsm_state = NULL; + + if (tsm->InitSampleScan) + tsm->InitSampleScan(scanstate, eflags); + + /* We'll do BeginSampleScan later; we can't evaluate params yet */ + scanstate->begun = false; + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSampleScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndSampleScan(SampleScanState *node) +{ + /* + * Tell sampling function that we finished the scan. + */ + if (node->tsmroutine->EndSampleScan) + node->tsmroutine->EndSampleScan(node); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + if (node->ss.ss_currentScanDesc) + table_endscan(node->ss.ss_currentScanDesc); +} + +/* ---------------------------------------------------------------- + * ExecReScanSampleScan + * + * Rescans the relation. 
+ * + * ---------------------------------------------------------------- + */ +void +ExecReScanSampleScan(SampleScanState *node) +{ + /* Remember we need to do BeginSampleScan again (if we did it at all) */ + node->begun = false; + node->done = false; + node->haveblock = false; + node->donetuples = 0; + + ExecScanReScan(&node->ss); +} + + +/* + * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan. + */ +static void +tablesample_init(SampleScanState *scanstate) +{ + TsmRoutine *tsm = scanstate->tsmroutine; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + Datum *params; + Datum datum; + bool isnull; + uint32 seed; + bool allow_sync; + int i; + ListCell *arg; + + scanstate->donetuples = 0; + params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum)); + + i = 0; + foreach(arg, scanstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + params[i] = ExecEvalExprSwitchContext(argstate, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("TABLESAMPLE parameter cannot be null"))); + i++; + } + + if (scanstate->repeatable) + { + datum = ExecEvalExprSwitchContext(scanstate->repeatable, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT), + errmsg("TABLESAMPLE REPEATABLE parameter cannot be null"))); + + /* + * The REPEATABLE parameter has been coerced to float8 by the parser. + * The reason for using float8 at the SQL level is that it will + * produce unsurprising results both for users used to databases that + * accept only integers in the REPEATABLE clause and for those who + * might expect that REPEATABLE works like setseed() (a float in the + * range from -1 to 1). + * + * We use hashfloat8() to convert the supplied value into a suitable + * seed. For regression-testing purposes, that has the convenient + * property that REPEATABLE(0) gives a machine-independent result. + */ + seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum)); + } + else + { + /* Use the seed selected by ExecInitSampleScan */ + seed = scanstate->seed; + } + + /* Set default values for params that BeginSampleScan can adjust */ + scanstate->use_bulkread = true; + scanstate->use_pagemode = true; + + /* Let tablesample method do its thing */ + tsm->BeginSampleScan(scanstate, + params, + list_length(scanstate->args), + seed); + + /* We'll use syncscan if there's no NextSampleBlock function */ + allow_sync = (tsm->NextSampleBlock == NULL); + + /* Now we can create or reset the HeapScanDesc */ + if (scanstate->ss.ss_currentScanDesc == NULL) + { + scanstate->ss.ss_currentScanDesc = + table_beginscan_sampling(scanstate->ss.ss_currentRelation, + scanstate->ss.ps.state->es_snapshot, + 0, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + else + { + table_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + + pfree(params); + + /* And we're initialized. */ + scanstate->begun = true; +} + +/* + * Get next tuple from TABLESAMPLE method. 
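The REPEATABLE handling in tablesample_init() above hashes the float8 argument down to a 32-bit seed so that the same argument always selects the same sample. The sketch below uses a stand-in hash (FNV-1a over the raw bytes) and a toy PRNG rather than hashfloat8() and a real sampling method; only the "same input, same seed, same sample" property is the point:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Stand-in hash over the raw float8 bytes; not PostgreSQL's hashfloat8(). */
    static uint32_t
    seed_from_double(double repeatable)
    {
        uint64_t    bits;
        uint32_t    h = 2166136261u;

        memcpy(&bits, &repeatable, sizeof(bits));
        for (int i = 0; i < 8; i++)
        {
            h ^= (uint32_t) (bits >> (i * 8)) & 0xff;
            h *= 16777619u;
        }
        return h ? h : 1;           /* keep the xorshift state nonzero */
    }

    int
    main(void)
    {
        uint32_t    state = seed_from_double(42.0); /* REPEATABLE (42) */

        /* sample roughly half of ten "rows"; same seed => same rows every run */
        for (int row = 0; row < 10; row++)
        {
            state ^= state << 13;   /* xorshift32 step */
            state ^= state >> 17;
            state ^= state << 5;
            if (state % 100 < 50)
                printf("row %d sampled\n", row);
        }
        return 0;
    }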
+ */ +static TupleTableSlot * +tablesample_getnext(SampleScanState *scanstate) +{ + TableScanDesc scan = scanstate->ss.ss_currentScanDesc; + TupleTableSlot *slot = scanstate->ss.ss_ScanTupleSlot; + + ExecClearTuple(slot); + + if (scanstate->done) + return NULL; + + for (;;) + { + if (!scanstate->haveblock) + { + if (!table_scan_sample_next_block(scan, scanstate)) + { + scanstate->haveblock = false; + scanstate->done = true; + + /* exhausted relation */ + return NULL; + } + + scanstate->haveblock = true; + } + + if (!table_scan_sample_next_tuple(scan, scanstate, slot)) + { + /* + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. + */ + scanstate->haveblock = false; + continue; + } + + /* Found visible tuple, return it. */ + break; + } + + scanstate->donetuples++; + + return slot; +} diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c new file mode 100644 index 0000000..066f9ae --- /dev/null +++ b/src/backend/executor/nodeSeqscan.c @@ -0,0 +1,314 @@ +/*------------------------------------------------------------------------- + * + * nodeSeqscan.c + * Support routines for sequential scans of relations. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSeqscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecSeqScan sequentially scans a relation. + * ExecSeqNext retrieve next tuple in sequential order. + * ExecInitSeqScan creates and initializes a seqscan node. + * ExecEndSeqScan releases any storage allocated. + * ExecReScanSeqScan rescans the relation + * + * ExecSeqScanEstimate estimates DSM space needed for parallel scan + * ExecSeqScanInitializeDSM initialize DSM for parallel scan + * ExecSeqScanReInitializeDSM reinitialize DSM for fresh parallel scan + * ExecSeqScanInitializeWorker attach to DSM info in parallel worker + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/tableam.h" +#include "executor/execdebug.h" +#include "executor/nodeSeqscan.h" +#include "utils/rel.h" + +static TupleTableSlot *SeqNext(SeqScanState *node); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * SeqNext + * + * This is a workhorse for ExecSeqScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SeqNext(SeqScanState *node) +{ + TableScanDesc scandesc; + EState *estate; + ScanDirection direction; + TupleTableSlot *slot; + + /* + * get information from the estate and scan state + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + direction = estate->es_direction; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the scan is not parallel, or if we're serially + * executing a scan that was planned to be parallel. 
+ */ + scandesc = table_beginscan(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL); + node->ss.ss_currentScanDesc = scandesc; + } + + /* + * get the next tuple from the table + */ + if (table_scan_getnextslot(scandesc, direction, slot)) + return slot; + return NULL; +} + +/* + * SeqRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SeqRecheck(SeqScanState *node, TupleTableSlot *slot) +{ + /* + * Note that unlike IndexScan, SeqScan never use keys in heap_beginscan + * (and this is very bad) - so, here we do not check are keys ok or not. + */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecSeqScan(node) + * + * Scans the relation sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSeqScan(PlanState *pstate) +{ + SeqScanState *node = castNode(SeqScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) SeqNext, + (ExecScanRecheckMtd) SeqRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitSeqScan + * ---------------------------------------------------------------- + */ +SeqScanState * +ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) +{ + SeqScanState *scanstate; + + /* + * Once upon a time it was possible to have an outerPlan of a SeqScan, but + * not any more. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + scanstate = makeNode(SeqScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecSeqScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * open the scan relation + */ + scanstate->ss.ss_currentRelation = + ExecOpenScanRelation(estate, + node->scanrelid, + eflags); + + /* and create slot with the appropriate rowtype */ + ExecInitScanTupleSlot(estate, &scanstate->ss, + RelationGetDescr(scanstate->ss.ss_currentRelation), + table_slot_callbacks(scanstate->ss.ss_currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) scanstate); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSeqScan + * + * frees any storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndSeqScan(SeqScanState *node) +{ + TableScanDesc scanDesc; + + /* + * get information from node + */ + scanDesc = node->ss.ss_currentScanDesc; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + if (scanDesc != NULL) + table_endscan(scanDesc); +} + +/* ---------------------------------------------------------------- + * Join Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecReScanSeqScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanSeqScan(SeqScanState *node) +{ + TableScanDesc scan; + + scan = node->ss.ss_currentScanDesc; + + if (scan != NULL) + table_rescan(scan, /* scan desc */ + NULL); /* new scan keys */ + + ExecScanReScan((ScanState *) node); +} + +/* ---------------------------------------------------------------- + * Parallel Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecSeqScanEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanEstimate(SeqScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->pscan_len = table_parallelscan_estimate(node->ss.ss_currentRelation, + estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecSeqScanInitializeDSM + * + * Set up a parallel heap scan descriptor. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanInitializeDSM(SeqScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + ParallelTableScanDesc pscan; + + pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); + table_parallelscan_initialize(node->ss.ss_currentRelation, + pscan, + estate->es_snapshot); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + node->ss.ss_currentScanDesc = + table_beginscan_parallel(node->ss.ss_currentRelation, pscan); +} + +/* ---------------------------------------------------------------- + * ExecSeqScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanReInitializeDSM(SeqScanState *node, + ParallelContext *pcxt) +{ + ParallelTableScanDesc pscan; + + pscan = node->ss.ss_currentScanDesc->rs_parallel; + table_parallelscan_reinitialize(node->ss.ss_currentRelation, pscan); +} + +/* ---------------------------------------------------------------- + * ExecSeqScanInitializeWorker + * + * Copy relevant information from TOC into planstate. 
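The parallel-scan hooks above follow a common pattern: the leader estimates and initializes one shared scan descriptor, and every participant then claims work from it so that no block is scanned twice. A plain-C sketch of that pattern, not the shm_toc / table_beginscan_parallel machinery:

    #include <stdio.h>
    #include <stdatomic.h>

    /* A stand-in for a parallel scan descriptor living in shared memory. */
    typedef struct ParallelScanSketch
    {
        int         nblocks;        /* total blocks in the relation */
        atomic_int  next_block;     /* next block nobody has claimed yet */
    } ParallelScanSketch;

    static int
    claim_next_block(ParallelScanSketch *pscan)
    {
        int         block = atomic_fetch_add(&pscan->next_block, 1);

        return (block < pscan->nblocks) ? block : -1;   /* -1: scan is done */
    }

    int
    main(void)
    {
        ParallelScanSketch pscan;
        int         block;

        /* "leader": size and initialize the shared descriptor once */
        pscan.nblocks = 5;
        atomic_init(&pscan.next_block, 0);

        /*
         * Each participant (leader and workers alike) would attach to this
         * same descriptor and run the loop concurrently; fetch-and-add hands
         * out every block exactly once.
         */
        while ((block = claim_next_block(&pscan)) != -1)
            printf("claimed block %d\n", block);
        return 0;
    }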
+ * ---------------------------------------------------------------- + */ +void +ExecSeqScanInitializeWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt) +{ + ParallelTableScanDesc pscan; + + pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->ss.ss_currentScanDesc = + table_beginscan_parallel(node->ss.ss_currentRelation, pscan); +} diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c new file mode 100644 index 0000000..aad7ac0 --- /dev/null +++ b/src/backend/executor/nodeSetOp.c @@ -0,0 +1,651 @@ +/*------------------------------------------------------------------------- + * + * nodeSetOp.c + * Routines to handle INTERSECT and EXCEPT selection + * + * The input of a SetOp node consists of tuples from two relations, + * which have been combined into one dataset, with a junk attribute added + * that shows which relation each tuple came from. In SETOP_SORTED mode, + * the input has furthermore been sorted according to all the grouping + * columns (ie, all the non-junk attributes). The SetOp node scans each + * group of identical tuples to determine how many came from each input + * relation. Then it is a simple matter to emit the output demanded by the + * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL. + * + * In SETOP_HASHED mode, the input is delivered in no particular order, + * except that we know all the tuples from one input relation will come before + * all the tuples of the other. The planner guarantees that the first input + * relation is the left-hand one for EXCEPT, and tries to make the smaller + * input relation come first for INTERSECT. We build a hash table in memory + * with one entry for each group of identical tuples, and count the number of + * tuples in the group from each relation. After seeing all the input, we + * scan the hashtable and generate the correct output using those counts. + * We can avoid making hashtable entries for any tuples appearing only in the + * second input relation, since they cannot result in any output. + * + * This node type is not used for UNION or UNION ALL, since those can be + * implemented more cheaply (there's no need for the junk attribute to + * identify the source relation). + * + * Note that SetOp does no qual checking nor projection. The delivered + * output tuples are just copies of the first-to-arrive tuple in each + * input group. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSetOp.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/executor.h" +#include "executor/nodeSetOp.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* + * SetOpStatePerGroupData - per-group working state + * + * These values are working state that is initialized at the start of + * an input tuple group and updated for each input tuple. + * + * In SETOP_SORTED mode, we need only one of these structs, and it's kept in + * the plan state node. In SETOP_HASHED mode, the hash table contains one + * of these for each tuple group. 
+ */ +typedef struct SetOpStatePerGroupData +{ + long numLeft; /* number of left-input dups in group */ + long numRight; /* number of right-input dups in group */ +} SetOpStatePerGroupData; + + +static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate); +static void setop_fill_hash_table(SetOpState *setopstate); +static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate); + + +/* + * Initialize state for a new group of input values. + */ +static inline void +initialize_counts(SetOpStatePerGroup pergroup) +{ + pergroup->numLeft = pergroup->numRight = 0; +} + +/* + * Advance the appropriate counter for one input tuple. + */ +static inline void +advance_counts(SetOpStatePerGroup pergroup, int flag) +{ + if (flag) + pergroup->numRight++; + else + pergroup->numLeft++; +} + +/* + * Fetch the "flag" column from an input tuple. + * This is an integer column with value 0 for left side, 1 for right side. + */ +static int +fetch_tuple_flag(SetOpState *setopstate, TupleTableSlot *inputslot) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + int flag; + bool isNull; + + flag = DatumGetInt32(slot_getattr(inputslot, + node->flagColIdx, + &isNull)); + Assert(!isNull); + Assert(flag == 0 || flag == 1); + return flag; +} + +/* + * Initialize the hash table to empty. + */ +static void +build_hash_table(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + ExprContext *econtext = setopstate->ps.ps_ExprContext; + TupleDesc desc = ExecGetResultType(outerPlanState(setopstate)); + + Assert(node->strategy == SETOP_HASHED); + Assert(node->numGroups > 0); + + setopstate->hashtable = BuildTupleHashTableExt(&setopstate->ps, + desc, + node->numCols, + node->dupColIdx, + setopstate->eqfuncoids, + setopstate->hashfunctions, + node->dupCollations, + node->numGroups, + 0, + setopstate->ps.state->es_query_cxt, + setopstate->tableContext, + econtext->ecxt_per_tuple_memory, + false); +} + +/* + * We've completed processing a tuple group. Decide how many copies (if any) + * of its representative row to emit, and store the count into numOutput. + * This logic is straight from the SQL92 specification. + */ +static void +set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup) +{ + SetOp *plannode = (SetOp *) setopstate->ps.plan; + + switch (plannode->cmd) + { + case SETOPCMD_INTERSECT: + if (pergroup->numLeft > 0 && pergroup->numRight > 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_INTERSECT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? + pergroup->numLeft : pergroup->numRight; + break; + case SETOPCMD_EXCEPT: + if (pergroup->numLeft > 0 && pergroup->numRight == 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_EXCEPT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? 
+ 0 : (pergroup->numLeft - pergroup->numRight); + break; + default: + elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd); + break; + } +} + + +/* ---------------------------------------------------------------- + * ExecSetOp + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecSetOp(PlanState *pstate) +{ + SetOpState *node = castNode(SetOpState, pstate); + SetOp *plannode = (SetOp *) node->ps.plan; + TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot; + + CHECK_FOR_INTERRUPTS(); + + /* + * If the previously-returned tuple needs to be returned more than once, + * keep returning it. + */ + if (node->numOutput > 0) + { + node->numOutput--; + return resultTupleSlot; + } + + /* Otherwise, we're done if we are out of groups */ + if (node->setop_done) + return NULL; + + /* Fetch the next tuple group according to the correct strategy */ + if (plannode->strategy == SETOP_HASHED) + { + if (!node->table_filled) + setop_fill_hash_table(node); + return setop_retrieve_hash_table(node); + } + else + return setop_retrieve_direct(node); +} + +/* + * ExecSetOp for non-hashed case + */ +static TupleTableSlot * +setop_retrieve_direct(SetOpState *setopstate) +{ + PlanState *outerPlan; + SetOpStatePerGroup pergroup; + TupleTableSlot *outerslot; + TupleTableSlot *resultTupleSlot; + ExprContext *econtext = setopstate->ps.ps_ExprContext; + + /* + * get state info from node + */ + outerPlan = outerPlanState(setopstate); + pergroup = (SetOpStatePerGroup) setopstate->pergroup; + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) + { + /* + * If we don't already have the first tuple of the new group, fetch it + * from the outer plan. + */ + if (setopstate->grp_firstTuple == NULL) + { + outerslot = ExecProcNode(outerPlan); + if (!TupIsNull(outerslot)) + { + /* Make a copy of the first input tuple */ + setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + } + else + { + /* outer plan produced no tuples at all */ + setopstate->setop_done = true; + return NULL; + } + } + + /* + * Store the copied first input tuple in the tuple table slot reserved + * for it. The tuple will be deleted when it is cleared from the + * slot. + */ + ExecStoreHeapTuple(setopstate->grp_firstTuple, + resultTupleSlot, + true); + setopstate->grp_firstTuple = NULL; /* don't keep two pointers */ + + /* Initialize working state for a new input tuple group */ + initialize_counts(pergroup); + + /* Count the first input tuple */ + advance_counts(pergroup, + fetch_tuple_flag(setopstate, resultTupleSlot)); + + /* + * Scan the outer plan until we exhaust it or cross a group boundary. + */ + for (;;) + { + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + { + /* no more outer-plan tuples available */ + setopstate->setop_done = true; + break; + } + + /* + * Check whether we've crossed a group boundary. + */ + econtext->ecxt_outertuple = resultTupleSlot; + econtext->ecxt_innertuple = outerslot; + + if (!ExecQualAndReset(setopstate->eqfunction, econtext)) + { + /* + * Save the first input tuple of the next group. + */ + setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + break; + } + + /* Still in same group, so count this tuple */ + advance_counts(pergroup, + fetch_tuple_flag(setopstate, outerslot)); + } + + /* + * Done scanning input tuple group. 
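set_output_count above is a direct transcription of the SQL spec's rules for how many copies of a group's representative row each set operation emits, given how many duplicates arrived from each input. The standalone function below restates those four rules outside the executor so they can be checked in isolation; it takes plain longs rather than a SetOpStatePerGroup and is only an illustration of the arithmetic.

/*
 * Standalone restatement of the output-count rules implemented by
 * set_output_count: given the number of duplicates of one group seen in
 * the left and right inputs, how many copies does each set operation emit?
 */
#include <assert.h>
#include <stdio.h>

typedef enum
{
	CMD_INTERSECT,
	CMD_INTERSECT_ALL,
	CMD_EXCEPT,
	CMD_EXCEPT_ALL
} SetOpCmd;

static long
output_count(SetOpCmd cmd, long numLeft, long numRight)
{
	switch (cmd)
	{
		case CMD_INTERSECT:
			return (numLeft > 0 && numRight > 0) ? 1 : 0;
		case CMD_INTERSECT_ALL:
			return (numLeft < numRight) ? numLeft : numRight;
		case CMD_EXCEPT:
			return (numLeft > 0 && numRight == 0) ? 1 : 0;
		case CMD_EXCEPT_ALL:
			return (numLeft < numRight) ? 0 : numLeft - numRight;
	}
	return 0;					/* not reached */
}

int
main(void)
{
	/* a group with 3 copies in the left input and 1 in the right */
	assert(output_count(CMD_INTERSECT, 3, 1) == 1);
	assert(output_count(CMD_INTERSECT_ALL, 3, 1) == 1);
	assert(output_count(CMD_EXCEPT, 3, 1) == 0);
	assert(output_count(CMD_EXCEPT_ALL, 3, 1) == 2);
	printf("set-op counting rules check out\n");
	return 0;
}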
See if we should emit any copies + * of result tuple, and if so return the first copy. + */ + set_output_count(setopstate, pergroup); + + if (setopstate->numOutput > 0) + { + setopstate->numOutput--; + return resultTupleSlot; + } + } + + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; +} + +/* + * ExecSetOp for hashed case: phase 1, read input and build hash table + */ +static void +setop_fill_hash_table(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + PlanState *outerPlan; + int firstFlag; + bool in_first_rel PG_USED_FOR_ASSERTS_ONLY; + ExprContext *econtext = setopstate->ps.ps_ExprContext; + + /* + * get state info from node + */ + outerPlan = outerPlanState(setopstate); + firstFlag = node->firstFlag; + /* verify planner didn't mess up */ + Assert(firstFlag == 0 || + (firstFlag == 1 && + (node->cmd == SETOPCMD_INTERSECT || + node->cmd == SETOPCMD_INTERSECT_ALL))); + + /* + * Process each outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. + */ + in_first_rel = true; + for (;;) + { + TupleTableSlot *outerslot; + int flag; + TupleHashEntryData *entry; + bool isnew; + + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + break; + + /* Identify whether it's left or right input */ + flag = fetch_tuple_flag(setopstate, outerslot); + + if (flag == firstFlag) + { + /* (still) in first input relation */ + Assert(in_first_rel); + + /* Find or build hashtable entry for this tuple's group */ + entry = LookupTupleHashEntry(setopstate->hashtable, outerslot, + &isnew, NULL); + + /* If new tuple group, initialize counts */ + if (isnew) + { + entry->additional = (SetOpStatePerGroup) + MemoryContextAlloc(setopstate->hashtable->tablecxt, + sizeof(SetOpStatePerGroupData)); + initialize_counts((SetOpStatePerGroup) entry->additional); + } + + /* Advance the counts */ + advance_counts((SetOpStatePerGroup) entry->additional, flag); + } + else + { + /* reached second relation */ + in_first_rel = false; + + /* For tuples not seen previously, do not make hashtable entry */ + entry = LookupTupleHashEntry(setopstate->hashtable, outerslot, + NULL, NULL); + + /* Advance the counts if entry is already present */ + if (entry) + advance_counts((SetOpStatePerGroup) entry->additional, flag); + } + + /* Must reset expression context after each hashtable lookup */ + ResetExprContext(econtext); + } + + setopstate->table_filled = true; + /* Initialize to walk the hash table */ + ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter); +} + +/* + * ExecSetOp for hashed case: phase 2, retrieving groups from hash table + */ +static TupleTableSlot * +setop_retrieve_hash_table(SetOpState *setopstate) +{ + TupleHashEntryData *entry; + TupleTableSlot *resultTupleSlot; + + /* + * get state info from node + */ + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) + { + CHECK_FOR_INTERRUPTS(); + + /* + * Find the next entry in the hash table + */ + entry = ScanTupleHashTable(setopstate->hashtable, &setopstate->hashiter); + if (entry == NULL) + { + /* No more entries in hashtable, so done */ + setopstate->setop_done = true; + return NULL; + } + + /* + * See if we should emit any copies of this tuple, and if so return + * the first copy. 
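setop_fill_hash_table relies on the planner's guarantee that all tuples of the first input arrive before any tuple of the second, so a group seen only in the second input never needs a hash table entry. The sketch below mirrors that two-phase shape with small integer keys and a direct-addressed array standing in for the tuple hash table; it illustrates the control flow only and does not use the TupleHashTable API.

/*
 * Toy two-phase hashed set operation over small integer keys.  A plain
 * array indexed by key stands in for the tuple hash table: phase 1 creates
 * entries only for first-input keys, phase 2 updates existing entries and
 * deliberately never creates new ones, and the final pass applies the same
 * counting rule as INTERSECT ALL.
 */
#include <stdio.h>
#include <string.h>

#define NKEYS 8

typedef struct
{
	int			present;		/* does this group have an entry? */
	long		numLeft;
	long		numRight;
} Group;

int
main(void)
{
	Group		table[NKEYS];
	int			left[] = {1, 1, 2, 5};	/* first (left) input */
	int			right[] = {1, 2, 2, 7}; /* second (right) input */

	memset(table, 0, sizeof(table));

	/* Phase 1: first input -- create entries as needed. */
	for (int i = 0; i < 4; i++)
	{
		Group	   *g = &table[left[i]];

		g->present = 1;
		g->numLeft++;
	}

	/* Phase 2: second input -- only touch entries that already exist. */
	for (int i = 0; i < 4; i++)
	{
		Group	   *g = &table[right[i]];

		if (g->present)
			g->numRight++;
		/* key 7 appears only on the right: no entry, cannot affect output */
	}

	/* Emit per the INTERSECT ALL rule: min(numLeft, numRight) copies. */
	for (int k = 0; k < NKEYS; k++)
	{
		if (!table[k].present)
			continue;

		long		n = table[k].numLeft < table[k].numRight ?
			table[k].numLeft : table[k].numRight;

		for (long j = 0; j < n; j++)
			printf("emit key %d\n", k);
	}
	return 0;
}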
+ */ + set_output_count(setopstate, (SetOpStatePerGroup) entry->additional); + + if (setopstate->numOutput > 0) + { + setopstate->numOutput--; + return ExecStoreMinimalTuple(entry->firstTuple, + resultTupleSlot, + false); + } + } + + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; +} + +/* ---------------------------------------------------------------- + * ExecInitSetOp + * + * This initializes the setop node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +SetOpState * +ExecInitSetOp(SetOp *node, EState *estate, int eflags) +{ + SetOpState *setopstate; + TupleDesc outerDesc; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + setopstate = makeNode(SetOpState); + setopstate->ps.plan = (Plan *) node; + setopstate->ps.state = estate; + setopstate->ps.ExecProcNode = ExecSetOp; + + setopstate->eqfuncoids = NULL; + setopstate->hashfunctions = NULL; + setopstate->setop_done = false; + setopstate->numOutput = 0; + setopstate->pergroup = NULL; + setopstate->grp_firstTuple = NULL; + setopstate->hashtable = NULL; + setopstate->tableContext = NULL; + + /* + * create expression context + */ + ExecAssignExprContext(estate, &setopstate->ps); + + /* + * If hashing, we also need a longer-lived context to store the hash + * table. The table can't just be kept in the per-query context because + * we want to be able to throw it away in ExecReScanSetOp. + */ + if (node->strategy == SETOP_HASHED) + setopstate->tableContext = + AllocSetContextCreate(CurrentMemoryContext, + "SetOp hash table", + ALLOCSET_DEFAULT_SIZES); + + /* + * initialize child nodes + * + * If we are hashing then the child plan does not need to handle REWIND + * efficiently; see ExecReScanSetOp. + */ + if (node->strategy == SETOP_HASHED) + eflags &= ~EXEC_FLAG_REWIND; + outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags); + outerDesc = ExecGetResultType(outerPlanState(setopstate)); + + /* + * Initialize result slot and type. Setop nodes do no projections, so + * initialize projection info for this node appropriately. + */ + ExecInitResultTupleSlotTL(&setopstate->ps, + node->strategy == SETOP_HASHED ? + &TTSOpsMinimalTuple : &TTSOpsHeapTuple); + setopstate->ps.ps_ProjInfo = NULL; + + /* + * Precompute fmgr lookup data for inner loop. We need both equality and + * hashing functions to do it by hashing, but only equality if not + * hashing. + */ + if (node->strategy == SETOP_HASHED) + execTuplesHashPrepare(node->numCols, + node->dupOperators, + &setopstate->eqfuncoids, + &setopstate->hashfunctions); + else + setopstate->eqfunction = + execTuplesMatchPrepare(outerDesc, + node->numCols, + node->dupColIdx, + node->dupOperators, + node->dupCollations, + &setopstate->ps); + + if (node->strategy == SETOP_HASHED) + { + build_hash_table(setopstate); + setopstate->table_filled = false; + } + else + { + setopstate->pergroup = + (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData)); + } + + return setopstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSetOp + * + * This shuts down the subplan and frees resources allocated + * to this node. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndSetOp(SetOpState *node) +{ + /* clean up tuple table */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* free subsidiary stuff including hashtable */ + if (node->tableContext) + MemoryContextDelete(node->tableContext); + ExecFreeExprContext(&node->ps); + + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanSetOp(SetOpState *node) +{ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + node->setop_done = false; + node->numOutput = 0; + + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + /* + * In the hashed case, if we haven't yet built the hash table then we + * can just return; nothing done yet, so nothing to undo. If subnode's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. + */ + if (!node->table_filled) + return; + + /* + * If we do have the hash table and the subplan does not have any + * parameter changes, then we can just rescan the existing hash table; + * no need to build it again. + */ + if (node->ps.lefttree->chgParam == NULL) + { + ResetTupleHashIterator(node->hashtable, &node->hashiter); + return; + } + } + + /* Release first tuple of group, if we have made a copy */ + if (node->grp_firstTuple != NULL) + { + heap_freetuple(node->grp_firstTuple); + node->grp_firstTuple = NULL; + } + + /* Release any hashtable storage */ + if (node->tableContext) + MemoryContextResetAndDeleteChildren(node->tableContext); + + /* And rebuild empty hashtable if needed */ + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + ResetTupleHashTable(node->hashtable); + node->table_filled = false; + } + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c new file mode 100644 index 0000000..b99027e --- /dev/null +++ b/src/backend/executor/nodeSort.c @@ -0,0 +1,430 @@ +/*------------------------------------------------------------------------- + * + * nodeSort.c + * Routines to handle sorting of relations. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "executor/execdebug.h" +#include "executor/nodeSort.h" +#include "miscadmin.h" +#include "utils/tuplesort.h" + + +/* ---------------------------------------------------------------- + * ExecSort + * + * Sorts tuples from the outer subtree of the node using tuplesort, + * which saves the results in a temporary file or memory. After the + * initial call, returns a tuple from the file with each call. + * + * Conditions: + * -- none. + * + * Initial States: + * -- the outer child is prepared to return the first tuple. 
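ExecSort follows the classic "materialize on first call" pattern: the first call drains the child plan into the tuplesort, performs the sort, and sets sort_Done; every later call just fetches the next sorted row. The standalone sketch below shows the same shape with an integer array and qsort; it is only an illustration of the control flow, with all names invented for the example.

/*
 * Minimal load-once / fetch-many sketch of ExecSort's control flow: the
 * first fetch consumes the whole "child" and sorts it, later fetches just
 * step through the sorted result.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int	child_rows[] = {42, 7, 19, 3, 25};
static int	child_pos = 0;

/* stand-in for ExecProcNode on the outer plan; returns false at EOF */
static bool
child_next(int *val)
{
	if (child_pos >= 5)
		return false;
	*val = child_rows[child_pos++];
	return true;
}

static int
cmp_int(const void *a, const void *b)
{
	return (*(const int *) a > *(const int *) b) -
		(*(const int *) a < *(const int *) b);
}

typedef struct
{
	bool		sort_done;		/* have we sorted yet? (cf. sort_Done) */
	int			sorted[16];
	int			nsorted;
	int			next;			/* next index to return */
} ToySortState;

/* returns false when the sorted stream is exhausted */
static bool
toy_sort_next(ToySortState *state, int *val)
{
	if (!state->sort_done)
	{
		int			v;

		while (child_next(&v))
			state->sorted[state->nsorted++] = v;
		qsort(state->sorted, state->nsorted, sizeof(int), cmp_int);
		state->sort_done = true;
	}
	if (state->next >= state->nsorted)
		return false;
	*val = state->sorted[state->next++];
	return true;
}

int
main(void)
{
	ToySortState state = {0};
	int			v;

	while (toy_sort_next(&state, &v))
		printf("%d\n", v);
	return 0;
}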
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSort(PlanState *pstate) +{ + SortState *node = castNode(SortState, pstate); + EState *estate; + ScanDirection dir; + Tuplesortstate *tuplesortstate; + TupleTableSlot *slot; + + CHECK_FOR_INTERRUPTS(); + + /* + * get state info from node + */ + SO1_printf("ExecSort: %s\n", + "entering routine"); + + estate = node->ss.ps.state; + dir = estate->es_direction; + tuplesortstate = (Tuplesortstate *) node->tuplesortstate; + + /* + * If first time through, read all tuples from outer plan and pass them to + * tuplesort.c. Subsequent calls just fetch tuples from tuplesort. + */ + + if (!node->sort_Done) + { + Sort *plannode = (Sort *) node->ss.ps.plan; + PlanState *outerNode; + TupleDesc tupDesc; + + SO1_printf("ExecSort: %s\n", + "sorting subplan"); + + /* + * Want to scan subplan in the forward direction while creating the + * sorted data. + */ + estate->es_direction = ForwardScanDirection; + + /* + * Initialize tuplesort module. + */ + SO1_printf("ExecSort: %s\n", + "calling tuplesort_begin"); + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + tuplesortstate = tuplesort_begin_heap(tupDesc, + plannode->numCols, + plannode->sortColIdx, + plannode->sortOperators, + plannode->collations, + plannode->nullsFirst, + work_mem, + NULL, + node->randomAccess); + if (node->bounded) + tuplesort_set_bound(tuplesortstate, node->bound); + node->tuplesortstate = (void *) tuplesortstate; + + /* + * Scan the subplan and feed all the tuples to tuplesort. + */ + + for (;;) + { + slot = ExecProcNode(outerNode); + + if (TupIsNull(slot)) + break; + + tuplesort_puttupleslot(tuplesortstate, slot); + } + + /* + * Complete the sort. + */ + tuplesort_performsort(tuplesortstate); + + /* + * restore to user specified direction + */ + estate->es_direction = dir; + + /* + * finally set the sorted flag to true + */ + node->sort_Done = true; + node->bounded_Done = node->bounded; + node->bound_Done = node->bound; + if (node->shared_info && node->am_worker) + { + TuplesortInstrumentation *si; + + Assert(IsParallelWorker()); + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + tuplesort_get_stats(tuplesortstate, si); + } + SO1_printf("ExecSort: %s\n", "sorting done"); + } + + SO1_printf("ExecSort: %s\n", + "retrieving tuple from tuplesort"); + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. Note that we only rely on slot tuple remaining valid until the + * next fetch from the tuplesort. + */ + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(tuplesortstate, + ScanDirectionIsForward(dir), + false, slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. + * ---------------------------------------------------------------- + */ +SortState * +ExecInitSort(Sort *node, EState *estate, int eflags) +{ + SortState *sortstate; + + SO1_printf("ExecInitSort: %s\n", + "initializing sort node"); + + /* + * create state structure + */ + sortstate = makeNode(SortState); + sortstate->ss.ps.plan = (Plan *) node; + sortstate->ss.ps.state = estate; + sortstate->ss.ps.ExecProcNode = ExecSort; + + /* + * We must have random access to the sort output to do backward scan or + * mark/restore. 
We also prefer to materialize the sort output if we + * might be called on to rewind and replay it many times. + */ + sortstate->randomAccess = (eflags & (EXEC_FLAG_REWIND | + EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)) != 0; + + sortstate->bounded = false; + sortstate->sort_Done = false; + sortstate->tuplesortstate = NULL; + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * initialize child nodes + * + * We shield the child node from the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlanState(sortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &sortstate->ss, &TTSOpsVirtual); + + /* + * Initialize return slot and type. No need to initialize projection info + * because this node doesn't do projections. + */ + ExecInitResultTupleSlotTL(&sortstate->ss.ps, &TTSOpsMinimalTuple); + sortstate->ss.ps.ps_ProjInfo = NULL; + + SO1_printf("ExecInitSort: %s\n", + "sort node initialized"); + + return sortstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSort(node) + * ---------------------------------------------------------------- + */ +void +ExecEndSort(SortState *node) +{ + SO1_printf("ExecEndSort: %s\n", + "shutting down sort node"); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + /* + * Release tuplesort resources + */ + if (node->tuplesortstate != NULL) + tuplesort_end((Tuplesortstate *) node->tuplesortstate); + node->tuplesortstate = NULL; + + /* + * shut down the subplan + */ + ExecEndNode(outerPlanState(node)); + + SO1_printf("ExecEndSort: %s\n", + "sort node shutdown"); +} + +/* ---------------------------------------------------------------- + * ExecSortMarkPos + * + * Calls tuplesort to save the current position in the sorted file. + * ---------------------------------------------------------------- + */ +void +ExecSortMarkPos(SortState *node) +{ + /* + * if we haven't sorted yet, just return + */ + if (!node->sort_Done) + return; + + tuplesort_markpos((Tuplesortstate *) node->tuplesortstate); +} + +/* ---------------------------------------------------------------- + * ExecSortRestrPos + * + * Calls tuplesort to restore the last saved sort file position. + * ---------------------------------------------------------------- + */ +void +ExecSortRestrPos(SortState *node) +{ + /* + * if we haven't sorted yet, just return. + */ + if (!node->sort_Done) + return; + + /* + * restore the scan to the previously marked position + */ + tuplesort_restorepos((Tuplesortstate *) node->tuplesortstate); +} + +void +ExecReScanSort(SortState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * If we haven't sorted yet, just return. If outerplan's chgParam is not + * NULL then it will be re-scanned by ExecProcNode, else no reason to + * re-scan it at all. + */ + if (!node->sort_Done) + return; + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + /* + * If subnode is to be rescanned then we forget previous sort results; we + * have to re-read the subplan and re-sort. Also must re-sort if the + * bounded-sort parameters changed or we didn't select randomAccess. 
+ * + * Otherwise we can just rewind and rescan the sorted output. + */ + if (outerPlan->chgParam != NULL || + node->bounded != node->bounded_Done || + node->bound != node->bound_Done || + !node->randomAccess) + { + node->sort_Done = false; + tuplesort_end((Tuplesortstate *) node->tuplesortstate); + node->tuplesortstate = NULL; + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); + } + else + tuplesort_rescan((Tuplesortstate *) node->tuplesortstate); +} + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecSortEstimate + * + * Estimate space required to propagate sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortEstimate(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(TuplesortInstrumentation)); + size = add_size(size, offsetof(SharedSortInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeDSM + * + * Initialize DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + pcxt->nworkers * sizeof(TuplesortInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeWorker + * + * Attach worker to DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeWorker(SortState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); + node->am_worker = true; +} + +/* ---------------------------------------------------------------- + * ExecSortRetrieveInstrumentation + * + * Transfer sort statistics from DSM to private memory. 
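ExecSortEstimate and ExecSortInitializeDSM size the shared instrumentation block as offsetof(SharedSortInfo, sinstrument) plus one TuplesortInstrumentation slot per worker, the standard idiom for a struct whose last member is a flexible array. The snippet below applies the same arithmetic to a standalone struct; the type names are invented stand-ins for the example.

/*
 * The sizing idiom used for SharedSortInfo: a header plus a flexible array
 * member, allocated as offsetof(header, array) + n * sizeof(element).
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct WorkerStats		/* stand-in for TuplesortInstrumentation */
{
	long		spaceUsed;
} WorkerStats;

typedef struct SharedStats		/* stand-in for SharedSortInfo */
{
	int			num_workers;
	WorkerStats sinstrument[];	/* per-worker slots follow the header */
} SharedStats;

int
main(void)
{
	int			nworkers = 4;
	size_t		size = offsetof(SharedStats, sinstrument) +
		nworkers * sizeof(WorkerStats);
	SharedStats *shared = malloc(size);

	/* ensure any unfilled slots will contain zeroes, as the executor does */
	memset(shared, 0, size);
	shared->num_workers = nworkers;

	shared->sinstrument[2].spaceUsed = 1024;	/* worker 2 reports in */
	printf("allocated %zu bytes for %d workers\n", size, shared->num_workers);

	free(shared);
	return 0;
}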
+ * ---------------------------------------------------------------- + */ +void +ExecSortRetrieveInstrumentation(SortState *node) +{ + Size size; + SharedSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + node->shared_info->num_workers * sizeof(TuplesortInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c new file mode 100644 index 0000000..d46227e --- /dev/null +++ b/src/backend/executor/nodeSubplan.c @@ -0,0 +1,1313 @@ +/*------------------------------------------------------------------------- + * + * nodeSubplan.c + * routines to support sub-selects appearing in expressions + * + * This module is concerned with executing SubPlan expression nodes, which + * should not be confused with sub-SELECTs appearing in FROM. SubPlans are + * divided into "initplans", which are those that need only one evaluation per + * query (among other restrictions, this requires that they don't use any + * direct correlation variables from the parent plan level), and "regular" + * subplans, which are re-evaluated every time their result is required. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeSubplan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecSubPlan - process a subselect + * ExecInitSubPlan - initialize a subselect + */ +#include "postgres.h" + +#include +#include + +#include "access/htup_details.h" +#include "executor/executor.h" +#include "executor/nodeSubplan.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "utils/array.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +static Datum ExecHashSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull); +static Datum ExecScanSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull); +static void buildSubPlanHash(SubPlanState *node, ExprContext *econtext); +static bool findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot, + FmgrInfo *eqfunctions); +static bool slotAllNulls(TupleTableSlot *slot); +static bool slotNoNulls(TupleTableSlot *slot); + + +/* ---------------------------------------------------------------- + * ExecSubPlan + * + * This is the main entry point for execution of a regular SubPlan. 
+ * ---------------------------------------------------------------- + */ +Datum +ExecSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull) +{ + SubPlan *subplan = node->subplan; + EState *estate = node->planstate->state; + ScanDirection dir = estate->es_direction; + Datum retval; + + CHECK_FOR_INTERRUPTS(); + + /* Set non-null as default */ + *isNull = false; + + /* Sanity checks */ + if (subplan->subLinkType == CTE_SUBLINK) + elog(ERROR, "CTE subplans should not be executed via ExecSubPlan"); + if (subplan->setParam != NIL && subplan->subLinkType != MULTIEXPR_SUBLINK) + elog(ERROR, "cannot set parent params from subquery"); + + /* Force forward-scan mode for evaluation */ + estate->es_direction = ForwardScanDirection; + + /* Select appropriate evaluation strategy */ + if (subplan->useHashTable) + retval = ExecHashSubPlan(node, econtext, isNull); + else + retval = ExecScanSubPlan(node, econtext, isNull); + + /* restore scan direction */ + estate->es_direction = dir; + + return retval; +} + +/* + * ExecHashSubPlan: store subselect result in an in-memory hash table + */ +static Datum +ExecHashSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + TupleTableSlot *slot; + + /* Shouldn't have any direct correlation Vars */ + if (subplan->parParam != NIL || node->args != NIL) + elog(ERROR, "hashed subplan with direct correlation not supported"); + + /* + * If first time through or we need to rescan the subplan, build the hash + * table. + */ + if (node->hashtable == NULL || planstate->chgParam != NULL) + buildSubPlanHash(node, econtext); + + /* + * The result for an empty subplan is always FALSE; no need to evaluate + * lefthand side. + */ + *isNull = false; + if (!node->havehashrows && !node->havenullrows) + return BoolGetDatum(false); + + /* + * Evaluate lefthand expressions and form a projection tuple. First we + * have to set the econtext to use (hack alert!). + */ + node->projLeft->pi_exprContext = econtext; + slot = ExecProject(node->projLeft); + + /* + * Note: because we are typically called in a per-tuple context, we have + * to explicitly clear the projected tuple before returning. Otherwise, + * we'll have a double-free situation: the per-tuple context will probably + * be reset before we're called again, and then the tuple slot will think + * it still needs to free the tuple. + */ + + /* + * If the LHS is all non-null, probe for an exact match in the main hash + * table. If we find one, the result is TRUE. Otherwise, scan the + * partly-null table to see if there are any rows that aren't provably + * unequal to the LHS; if so, the result is UNKNOWN. (We skip that part + * if we don't care about UNKNOWN.) Otherwise, the result is FALSE. + * + * Note: the reason we can avoid a full scan of the main hash table is + * that the combining operators are assumed never to yield NULL when both + * inputs are non-null. If they were to do so, we might need to produce + * UNKNOWN instead of FALSE because of an UNKNOWN result in comparing the + * LHS to some main-table entry --- which is a comparison we will not even + * make, unless there's a chance match of hash keys. 
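The comments in ExecHashSubPlan spell out a three-valued decision procedure: with a fully non-null LHS, an exact hit in the main table means TRUE, a partial match against stored partly-null rows means UNKNOWN, otherwise FALSE; with a partly or wholly null LHS the answer can only be FALSE or UNKNOWN. The helper below restates that decision table over plain booleans as a summary of the logic above; the tri-state type and flag names are invented for the illustration, and it glosses over the shortcut taken when UNKNOWN need not be distinguished from FALSE.

/*
 * Restatement of ExecHashSubPlan's result logic as a decision table.
 * "exact_hit" means the LHS matched a row in the main hash table;
 * "partial_null_hit" means some stored row is not provably unequal to the
 * LHS (see findPartialMatch), whichever table it lives in.
 */
#include <stdio.h>

typedef enum
{
	TRI_FALSE,
	TRI_TRUE,
	TRI_UNKNOWN
} TriState;

static TriState
hashed_in_result(int subplan_empty,
				 int lhs_no_nulls, int lhs_all_nulls,
				 int exact_hit, int partial_null_hit)
{
	if (subplan_empty)
		return TRI_FALSE;		/* IN over an empty set is FALSE */

	if (lhs_no_nulls)
	{
		if (exact_hit)
			return TRI_TRUE;
		return partial_null_hit ? TRI_UNKNOWN : TRI_FALSE;
	}

	/* LHS contains at least one NULL: TRUE is impossible */
	if (lhs_all_nulls)
		return TRI_UNKNOWN;
	return partial_null_hit ? TRI_UNKNOWN : TRI_FALSE;
}

int
main(void)
{
	printf("%d\n", hashed_in_result(0, 1, 0, 1, 0));	/* TRI_TRUE */
	printf("%d\n", hashed_in_result(0, 1, 0, 0, 1));	/* TRI_UNKNOWN */
	printf("%d\n", hashed_in_result(0, 0, 1, 0, 0));	/* TRI_UNKNOWN */
	printf("%d\n", hashed_in_result(1, 1, 0, 0, 0));	/* TRI_FALSE */
	return 0;
}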
+ */ + if (slotNoNulls(slot)) + { + if (node->havehashrows && + FindTupleHashEntry(node->hashtable, + slot, + node->cur_eq_comp, + node->lhs_hash_funcs) != NULL) + { + ExecClearTuple(slot); + return BoolGetDatum(true); + } + if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + ExecClearTuple(slot); + return BoolGetDatum(false); + } + + /* + * When the LHS is partly or wholly NULL, we can never return TRUE. If we + * don't care about UNKNOWN, just return FALSE. Otherwise, if the LHS is + * wholly NULL, immediately return UNKNOWN. (Since the combining + * operators are strict, the result could only be FALSE if the sub-select + * were empty, but we already handled that case.) Otherwise, we must scan + * both the main and partly-null tables to see if there are any rows that + * aren't provably unequal to the LHS; if so, the result is UNKNOWN. + * Otherwise, the result is FALSE. + */ + if (node->hashnulls == NULL) + { + ExecClearTuple(slot); + return BoolGetDatum(false); + } + if (slotAllNulls(slot)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + /* Scan partly-null table first, since more likely to get a match */ + if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + if (node->havehashrows && + findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) + { + ExecClearTuple(slot); + *isNull = true; + return BoolGetDatum(false); + } + ExecClearTuple(slot); + return BoolGetDatum(false); +} + +/* + * ExecScanSubPlan: default case where we have to rescan subplan each time + */ +static Datum +ExecScanSubPlan(SubPlanState *node, + ExprContext *econtext, + bool *isNull) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + SubLinkType subLinkType = subplan->subLinkType; + MemoryContext oldcontext; + TupleTableSlot *slot; + Datum result; + bool found = false; /* true if got at least one subplan tuple */ + ListCell *pvar; + ListCell *l; + ArrayBuildStateAny *astate = NULL; + + /* + * MULTIEXPR subplans, when "executed", just return NULL; but first we + * mark the subplan's output parameters as needing recalculation. (This + * is a bit of a hack: it relies on the subplan appearing later in its + * targetlist than any of the referencing Params, so that all the Params + * have been evaluated before we re-mark them for the next evaluation + * cycle. But in general resjunk tlist items appear after non-resjunk + * ones, so this should be safe.) Unlike ExecReScanSetParamPlan, we do + * *not* set bits in the parent plan node's chgParam, because we don't + * want to cause a rescan of the parent. + */ + if (subLinkType == MULTIEXPR_SUBLINK) + { + EState *estate = node->parent->state; + + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(estate->es_param_exec_vals[paramid]); + + prm->execPlan = node; + } + *isNull = true; + return (Datum) 0; + } + + /* Initialize ArrayBuildStateAny in caller's context, if needed */ + if (subLinkType == ARRAY_SUBLINK) + astate = initArrayResultAny(subplan->firstColType, + CurrentMemoryContext, true); + + /* + * We are probably in a short-lived expression-evaluation context. Switch + * to the per-query context for manipulating the child plan's chgParam, + * calling ExecProcNode on it, etc. 
+ */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * Set Params of this plan from parent plan correlation values. (Any + * calculation we have to do is done in the parent econtext, since the + * Param values don't need to have per-query lifetime.) + */ + Assert(list_length(subplan->parParam) == list_length(node->args)); + + forboth(l, subplan->parParam, pvar, node->args) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar), + econtext, + &(prm->isnull)); + planstate->chgParam = bms_add_member(planstate->chgParam, paramid); + } + + /* + * Now that we've set up its parameters, we can reset the subplan. + */ + ExecReScan(planstate); + + /* + * For all sublink types except EXPR_SUBLINK and ARRAY_SUBLINK, the result + * is boolean as are the results of the combining operators. We combine + * results across tuples (if the subplan produces more than one) using OR + * semantics for ANY_SUBLINK or AND semantics for ALL_SUBLINK. + * (ROWCOMPARE_SUBLINK doesn't allow multiple tuples from the subplan.) + * NULL results from the combining operators are handled according to the + * usual SQL semantics for OR and AND. The result for no input tuples is + * FALSE for ANY_SUBLINK, TRUE for ALL_SUBLINK, NULL for + * ROWCOMPARE_SUBLINK. + * + * For EXPR_SUBLINK we require the subplan to produce no more than one + * tuple, else an error is raised. If zero tuples are produced, we return + * NULL. Assuming we get a tuple, we just use its first column (there can + * be only one non-junk column in this case). + * + * For ARRAY_SUBLINK we allow the subplan to produce any number of tuples, + * and form an array of the first column's values. Note in particular + * that we produce a zero-element array if no tuples are produced (this is + * a change from pre-8.3 behavior of returning NULL). + */ + result = BoolGetDatum(subLinkType == ALL_SUBLINK); + *isNull = false; + + for (slot = ExecProcNode(planstate); + !TupIsNull(slot); + slot = ExecProcNode(planstate)) + { + TupleDesc tdesc = slot->tts_tupleDescriptor; + Datum rowresult; + bool rownull; + int col; + ListCell *plst; + + if (subLinkType == EXISTS_SUBLINK) + { + found = true; + result = BoolGetDatum(true); + break; + } + + if (subLinkType == EXPR_SUBLINK) + { + /* cannot allow multiple input tuples for EXPR sublink */ + if (found) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); + found = true; + + /* + * We need to copy the subplan's tuple in case the result is of + * pass-by-ref type --- our return value will point into this + * copied tuple! Can't use the subplan's instance of the tuple + * since it won't still be valid after next ExecProcNode() call. + * node->curTuple keeps track of the copied tuple for eventual + * freeing. 
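For ANY and ALL sublinks, the loop described above folds per-row boolean results together with SQL's three-valued OR and AND: a true row short-circuits ANY, a false row short-circuits ALL, NULL rows only matter if no deciding row ever appears, and the empty-input defaults are FALSE for ANY and TRUE for ALL. The fragment below performs the same fold over an array of tri-state row results; it is a sketch of the combining rule, not of the executor loop itself.

/*
 * Fold per-row results with SQL three-valued OR (ANY_SUBLINK) or AND
 * (ALL_SUBLINK) semantics, including the empty-input defaults.
 */
#include <stdio.h>

typedef enum
{
	ROW_FALSE,
	ROW_TRUE,
	ROW_NULL
} RowResult;

/* any = 1: OR semantics (ANY/IN); any = 0: AND semantics (ALL) */
static RowResult
combine_rows(const RowResult *rows, int nrows, int any)
{
	RowResult	result = any ? ROW_FALSE : ROW_TRUE;	/* empty-input default */

	for (int i = 0; i < nrows; i++)
	{
		if (rows[i] == ROW_NULL)
			result = ROW_NULL;	/* provisional UNKNOWN */
		else if (any && rows[i] == ROW_TRUE)
			return ROW_TRUE;	/* a TRUE row decides OR immediately */
		else if (!any && rows[i] == ROW_FALSE)
			return ROW_FALSE;	/* a FALSE row decides AND immediately */
	}
	return result;
}

int
main(void)
{
	RowResult	rows[] = {ROW_FALSE, ROW_NULL, ROW_TRUE};

	printf("ANY -> %d\n", combine_rows(rows, 3, 1));	/* ROW_TRUE */
	printf("ALL -> %d\n", combine_rows(rows, 3, 0));	/* ROW_FALSE */
	printf("empty ANY -> %d\n", combine_rows(rows, 0, 1)); /* ROW_FALSE */
	return 0;
}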
+ */ + if (node->curTuple) + heap_freetuple(node->curTuple); + node->curTuple = ExecCopySlotHeapTuple(slot); + + result = heap_getattr(node->curTuple, 1, tdesc, isNull); + /* keep scanning subplan to make sure there's only one tuple */ + continue; + } + + if (subLinkType == ARRAY_SUBLINK) + { + Datum dvalue; + bool disnull; + + found = true; + /* stash away current value */ + Assert(subplan->firstColType == TupleDescAttr(tdesc, 0)->atttypid); + dvalue = slot_getattr(slot, 1, &disnull); + astate = accumArrayResultAny(astate, dvalue, disnull, + subplan->firstColType, oldcontext); + /* keep scanning subplan to collect all values */ + continue; + } + + /* cannot allow multiple input tuples for ROWCOMPARE sublink either */ + if (subLinkType == ROWCOMPARE_SUBLINK && found) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); + + found = true; + + /* + * For ALL, ANY, and ROWCOMPARE sublinks, load up the Params + * representing the columns of the sub-select, and then evaluate the + * combining expression. + */ + col = 1; + foreach(plst, subplan->paramIds) + { + int paramid = lfirst_int(plst); + ParamExecData *prmdata; + + prmdata = &(econtext->ecxt_param_exec_vals[paramid]); + Assert(prmdata->execPlan == NULL); + prmdata->value = slot_getattr(slot, col, &(prmdata->isnull)); + col++; + } + + rowresult = ExecEvalExprSwitchContext(node->testexpr, econtext, + &rownull); + + if (subLinkType == ANY_SUBLINK) + { + /* combine across rows per OR semantics */ + if (rownull) + *isNull = true; + else if (DatumGetBool(rowresult)) + { + result = BoolGetDatum(true); + *isNull = false; + break; /* needn't look at any more rows */ + } + } + else if (subLinkType == ALL_SUBLINK) + { + /* combine across rows per AND semantics */ + if (rownull) + *isNull = true; + else if (!DatumGetBool(rowresult)) + { + result = BoolGetDatum(false); + *isNull = false; + break; /* needn't look at any more rows */ + } + } + else + { + /* must be ROWCOMPARE_SUBLINK */ + result = rowresult; + *isNull = rownull; + } + } + + MemoryContextSwitchTo(oldcontext); + + if (subLinkType == ARRAY_SUBLINK) + { + /* We return the result in the caller's context */ + result = makeArrayResultAny(astate, oldcontext, true); + } + else if (!found) + { + /* + * deal with empty subplan result. result/isNull were previously + * initialized correctly for all sublink types except EXPR and + * ROWCOMPARE; for those, return NULL. + */ + if (subLinkType == EXPR_SUBLINK || + subLinkType == ROWCOMPARE_SUBLINK) + { + result = (Datum) 0; + *isNull = true; + } + } + + return result; +} + +/* + * buildSubPlanHash: load hash table by scanning subplan output. + */ +static void +buildSubPlanHash(SubPlanState *node, ExprContext *econtext) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + int ncols = node->numCols; + ExprContext *innerecontext = node->innerecontext; + MemoryContext oldcontext; + long nbuckets; + TupleTableSlot *slot; + + Assert(subplan->subLinkType == ANY_SUBLINK); + + /* + * If we already had any hash tables, reset 'em; otherwise create empty + * hash table(s). + * + * If we need to distinguish accurately between FALSE and UNKNOWN (i.e., + * NULL) results of the IN operation, then we have to store subplan output + * rows that are partly or wholly NULL. We store such rows in a separate + * hash table that we expect will be much smaller than the main table. (We + * can use hashing to eliminate partly-null rows that are not distinct. 
We + * keep them separate to minimize the cost of the inevitable full-table + * searches; see findPartialMatch.) + * + * If it's not necessary to distinguish FALSE and UNKNOWN, then we don't + * need to store subplan output rows that contain NULL. + */ + MemoryContextReset(node->hashtablecxt); + node->havehashrows = false; + node->havenullrows = false; + + nbuckets = (long) Min(planstate->plan->plan_rows, (double) LONG_MAX); + if (nbuckets < 1) + nbuckets = 1; + + if (node->hashtable) + ResetTupleHashTable(node->hashtable); + else + node->hashtable = BuildTupleHashTableExt(node->parent, + node->descRight, + ncols, + node->keyColIdx, + node->tab_eq_funcoids, + node->tab_hash_funcs, + node->tab_collations, + nbuckets, + 0, + node->planstate->state->es_query_cxt, + node->hashtablecxt, + node->hashtempcxt, + false); + + if (!subplan->unknownEqFalse) + { + if (ncols == 1) + nbuckets = 1; /* there can only be one entry */ + else + { + nbuckets /= 16; + if (nbuckets < 1) + nbuckets = 1; + } + + if (node->hashnulls) + ResetTupleHashTable(node->hashnulls); + else + node->hashnulls = BuildTupleHashTableExt(node->parent, + node->descRight, + ncols, + node->keyColIdx, + node->tab_eq_funcoids, + node->tab_hash_funcs, + node->tab_collations, + nbuckets, + 0, + node->planstate->state->es_query_cxt, + node->hashtablecxt, + node->hashtempcxt, + false); + } + else + node->hashnulls = NULL; + + /* + * We are probably in a short-lived expression-evaluation context. Switch + * to the per-query context for manipulating the child plan. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * Reset subplan to start. + */ + ExecReScan(planstate); + + /* + * Scan the subplan and load the hash table(s). Note that when there are + * duplicate rows coming out of the sub-select, only one copy is stored. + */ + for (slot = ExecProcNode(planstate); + !TupIsNull(slot); + slot = ExecProcNode(planstate)) + { + int col = 1; + ListCell *plst; + bool isnew; + + /* + * Load up the Params representing the raw sub-select outputs, then + * form the projection tuple to store in the hashtable. + */ + foreach(plst, subplan->paramIds) + { + int paramid = lfirst_int(plst); + ParamExecData *prmdata; + + prmdata = &(innerecontext->ecxt_param_exec_vals[paramid]); + Assert(prmdata->execPlan == NULL); + prmdata->value = slot_getattr(slot, col, + &(prmdata->isnull)); + col++; + } + slot = ExecProject(node->projRight); + + /* + * If result contains any nulls, store separately or not at all. + */ + if (slotNoNulls(slot)) + { + (void) LookupTupleHashEntry(node->hashtable, slot, &isnew, NULL); + node->havehashrows = true; + } + else if (node->hashnulls) + { + (void) LookupTupleHashEntry(node->hashnulls, slot, &isnew, NULL); + node->havenullrows = true; + } + + /* + * Reset innerecontext after each inner tuple to free any memory used + * during ExecProject. + */ + ResetExprContext(innerecontext); + } + + /* + * Since the projected tuples are in the sub-query's context and not the + * main context, we'd better clear the tuple slot before there's any + * chance of a reset of the sub-query's context. Else we will have the + * potential for a double free attempt. (XXX possibly no longer needed, + * but can't hurt.) + */ + ExecClearTuple(node->projRight->pi_state.resultslot); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * execTuplesUnequal + * Return true if two tuples are definitely unequal in the indicated + * fields. + * + * Nulls are neither equal nor unequal to anything else. 
A true result + * is obtained only if there are non-null fields that compare not-equal. + * + * slot1, slot2: the tuples to compare (must have same columns!) + * numCols: the number of attributes to be examined + * matchColIdx: array of attribute column numbers + * eqFunctions: array of fmgr lookup info for the equality functions to use + * evalContext: short-term memory context for executing the functions + */ +static bool +execTuplesUnequal(TupleTableSlot *slot1, + TupleTableSlot *slot2, + int numCols, + AttrNumber *matchColIdx, + FmgrInfo *eqfunctions, + const Oid *collations, + MemoryContext evalContext) +{ + MemoryContext oldContext; + bool result; + int i; + + /* Reset and switch into the temp context. */ + MemoryContextReset(evalContext); + oldContext = MemoryContextSwitchTo(evalContext); + + /* + * We cannot report a match without checking all the fields, but we can + * report a non-match as soon as we find unequal fields. So, start + * comparing at the last field (least significant sort key). That's the + * most likely to be different if we are dealing with sorted input. + */ + result = false; + + for (i = numCols; --i >= 0;) + { + AttrNumber att = matchColIdx[i]; + Datum attr1, + attr2; + bool isNull1, + isNull2; + + attr1 = slot_getattr(slot1, att, &isNull1); + + if (isNull1) + continue; /* can't prove anything here */ + + attr2 = slot_getattr(slot2, att, &isNull2); + + if (isNull2) + continue; /* can't prove anything here */ + + /* Apply the type-specific equality function */ + if (!DatumGetBool(FunctionCall2Coll(&eqfunctions[i], + collations[i], + attr1, attr2))) + { + result = true; /* they are unequal */ + break; + } + } + + MemoryContextSwitchTo(oldContext); + + return result; +} + +/* + * findPartialMatch: does the hashtable contain an entry that is not + * provably distinct from the tuple? + * + * We have to scan the whole hashtable; we can't usefully use hashkeys + * to guide probing, since we might get partial matches on tuples with + * hashkeys quite unrelated to what we'd get from the given tuple. + * + * Caller must provide the equality functions to use, since in cross-type + * cases these are different from the hashtable's internal functions. + */ +static bool +findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot, + FmgrInfo *eqfunctions) +{ + int numCols = hashtable->numCols; + AttrNumber *keyColIdx = hashtable->keyColIdx; + TupleHashIterator hashiter; + TupleHashEntry entry; + + InitTupleHashIterator(hashtable, &hashiter); + while ((entry = ScanTupleHashTable(hashtable, &hashiter)) != NULL) + { + CHECK_FOR_INTERRUPTS(); + + ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false); + if (!execTuplesUnequal(slot, hashtable->tableslot, + numCols, keyColIdx, + eqfunctions, + hashtable->tab_collations, + hashtable->tempcxt)) + { + TermTupleHashIterator(&hashiter); + return true; + } + } + /* No TermTupleHashIterator call needed here */ + return false; +} + +/* + * slotAllNulls: is the slot completely NULL? + * + * This does not test for dropped columns, which is OK because we only + * use it on projected tuples. + */ +static bool +slotAllNulls(TupleTableSlot *slot) +{ + int ncols = slot->tts_tupleDescriptor->natts; + int i; + + for (i = 1; i <= ncols; i++) + { + if (!slot_attisnull(slot, i)) + return false; + } + return true; +} + +/* + * slotNoNulls: is the slot entirely not NULL? + * + * This does not test for dropped columns, which is OK because we only + * use it on projected tuples. 
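execTuplesUnequal can only ever prove a non-match: a NULL in either tuple proves nothing for that column, and the columns are walked from the last one backward because, for sorted input, the least significant keys are the most likely to differ. The standalone version below applies the same rule to two arrays of ints with null flags; it is an illustration of the predicate, not of the slot API.

/*
 * NULL-aware "provably unequal" test in the style of execTuplesUnequal:
 * only non-null pairs can prove anything, and columns are examined from
 * the last (least significant) one backward.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
tuples_provably_unequal(const int *vals1, const bool *nulls1,
						const int *vals2, const bool *nulls2,
						int ncols)
{
	for (int i = ncols; --i >= 0;)
	{
		if (nulls1[i] || nulls2[i])
			continue;			/* a NULL proves nothing for this column */
		if (vals1[i] != vals2[i])
			return true;		/* found a definite mismatch */
	}
	return false;				/* could still be equal (or unknown) */
}

int
main(void)
{
	int			a[] = {1, 2, 3};
	bool		an[] = {false, false, false};
	int			b[] = {1, 2, 4};
	bool		bn[] = {false, false, false};
	int			c[] = {1, 2, 4};
	bool		cn[] = {false, false, true};	/* last column is NULL */

	printf("%d\n", tuples_provably_unequal(a, an, b, bn, 3));	/* 1 */
	printf("%d\n", tuples_provably_unequal(a, an, c, cn, 3));	/* 0 */
	return 0;
}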
+ */ +static bool +slotNoNulls(TupleTableSlot *slot) +{ + int ncols = slot->tts_tupleDescriptor->natts; + int i; + + for (i = 1; i <= ncols; i++) + { + if (slot_attisnull(slot, i)) + return false; + } + return true; +} + +/* ---------------------------------------------------------------- + * ExecInitSubPlan + * + * Create a SubPlanState for a SubPlan; this is the SubPlan-specific part + * of ExecInitExpr(). We split it out so that it can be used for InitPlans + * as well as regular SubPlans. Note that we don't link the SubPlan into + * the parent's subPlan list, because that shouldn't happen for InitPlans. + * Instead, ExecInitExpr() does that one part. + * ---------------------------------------------------------------- + */ +SubPlanState * +ExecInitSubPlan(SubPlan *subplan, PlanState *parent) +{ + SubPlanState *sstate = makeNode(SubPlanState); + EState *estate = parent->state; + + sstate->subplan = subplan; + + /* Link the SubPlanState to already-initialized subplan */ + sstate->planstate = (PlanState *) list_nth(estate->es_subplanstates, + subplan->plan_id - 1); + + /* + * This check can fail if the planner mistakenly puts a parallel-unsafe + * subplan into a parallelized subquery; see ExecSerializePlan. + */ + if (sstate->planstate == NULL) + elog(ERROR, "subplan \"%s\" was not initialized", + subplan->plan_name); + + /* Link to parent's state, too */ + sstate->parent = parent; + + /* Initialize subexpressions */ + sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent); + sstate->args = ExecInitExprList(subplan->args, parent); + + /* + * initialize my state + */ + sstate->curTuple = NULL; + sstate->curArray = PointerGetDatum(NULL); + sstate->projLeft = NULL; + sstate->projRight = NULL; + sstate->hashtable = NULL; + sstate->hashnulls = NULL; + sstate->hashtablecxt = NULL; + sstate->hashtempcxt = NULL; + sstate->innerecontext = NULL; + sstate->keyColIdx = NULL; + sstate->tab_eq_funcoids = NULL; + sstate->tab_hash_funcs = NULL; + sstate->tab_eq_funcs = NULL; + sstate->tab_collations = NULL; + sstate->lhs_hash_funcs = NULL; + sstate->cur_eq_funcs = NULL; + + /* + * If this is an initplan or MULTIEXPR subplan, it has output parameters + * that the parent plan will use, so mark those parameters as needing + * evaluation. We don't actually run the subplan until we first need one + * of its outputs. + * + * A CTE subplan's output parameter is never to be evaluated in the normal + * way, so skip this in that case. + * + * Note that we don't set parent->chgParam here: the parent plan hasn't + * been run yet, so no need to force it to re-run. + */ + if (subplan->setParam != NIL && subplan->subLinkType != CTE_SUBLINK) + { + ListCell *lst; + + foreach(lst, subplan->setParam) + { + int paramid = lfirst_int(lst); + ParamExecData *prm = &(estate->es_param_exec_vals[paramid]); + + prm->execPlan = sstate; + } + } + + /* + * If we are going to hash the subquery output, initialize relevant stuff. + * (We don't create the hashtable until needed, though.) 
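Marking prm->execPlan in ExecInitSubPlan is what makes initplans lazy: nothing runs at initialization, and the first parameter fetch that finds a non-NULL execPlan calls ExecSetParamPlan, which fills in the value and clears the field so the work happens at most once per (re)scan. The deferred creation of the hash table is the same "don't do it until needed" idea. The sketch below captures that self-clearing thunk pattern with ordinary function pointers; all names are invented for the illustration.

/*
 * The lazy-parameter pattern behind prm->execPlan: a parameter carries a
 * pointer to the code that can compute it; the first read runs that code,
 * stores the value, and clears the pointer so later reads are cheap.
 */
#include <stdio.h>

typedef struct LazyParam
{
	int			value;
	int			isnull;
	void		(*compute) (struct LazyParam *);	/* cf. prm->execPlan */
} LazyParam;

static int	subplan_runs = 0;

/* stand-in for ExecSetParamPlan running the initplan once */
static void
run_initplan(LazyParam *prm)
{
	subplan_runs++;
	prm->value = 42;			/* pretend the subquery returned 42 */
	prm->isnull = 0;
	prm->compute = NULL;		/* mark as evaluated */
}

/* stand-in for fetching a PARAM_EXEC value */
static int
param_value(LazyParam *prm)
{
	if (prm->compute != NULL)
		prm->compute(prm);		/* not evaluated yet: go do it */
	return prm->value;
}

int
main(void)
{
	LazyParam	prm = {0, 1, run_initplan};

	printf("%d %d\n", param_value(&prm), param_value(&prm));
	printf("initplan ran %d time(s)\n", subplan_runs);	/* 1 */
	return 0;
}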
+ */ + if (subplan->useHashTable) + { + int ncols, + i; + TupleDesc tupDescLeft; + TupleDesc tupDescRight; + Oid *cross_eq_funcoids; + TupleTableSlot *slot; + List *oplist, + *lefttlist, + *righttlist; + ListCell *l; + + /* We need a memory context to hold the hash table(s) */ + sstate->hashtablecxt = + AllocSetContextCreate(CurrentMemoryContext, + "Subplan HashTable Context", + ALLOCSET_DEFAULT_SIZES); + /* and a small one for the hash tables to use as temp storage */ + sstate->hashtempcxt = + AllocSetContextCreate(CurrentMemoryContext, + "Subplan HashTable Temp Context", + ALLOCSET_SMALL_SIZES); + /* and a short-lived exprcontext for function evaluation */ + sstate->innerecontext = CreateExprContext(estate); + + /* + * We use ExecProject to evaluate the lefthand and righthand + * expression lists and form tuples. (You might think that we could + * use the sub-select's output tuples directly, but that is not the + * case if we had to insert any run-time coercions of the sub-select's + * output datatypes; anyway this avoids storing any resjunk columns + * that might be in the sub-select's output.) Run through the + * combining expressions to build tlists for the lefthand and + * righthand sides. + * + * We also extract the combining operators themselves to initialize + * the equality and hashing functions for the hash tables. + */ + if (IsA(subplan->testexpr, OpExpr)) + { + /* single combining operator */ + oplist = list_make1(subplan->testexpr); + } + else if (is_andclause(subplan->testexpr)) + { + /* multiple combining operators */ + oplist = castNode(BoolExpr, subplan->testexpr)->args; + } + else + { + /* shouldn't see anything else in a hashable subplan */ + elog(ERROR, "unrecognized testexpr type: %d", + (int) nodeTag(subplan->testexpr)); + oplist = NIL; /* keep compiler quiet */ + } + ncols = list_length(oplist); + + lefttlist = righttlist = NIL; + sstate->numCols = ncols; + sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber)); + sstate->tab_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid)); + sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid)); + sstate->tab_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + sstate->tab_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + sstate->lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + sstate->cur_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); + /* we'll need the cross-type equality fns below, but not in sstate */ + cross_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid)); + + i = 1; + foreach(l, oplist) + { + OpExpr *opexpr = lfirst_node(OpExpr, l); + Expr *expr; + TargetEntry *tle; + Oid rhs_eq_oper; + Oid left_hashfn; + Oid right_hashfn; + + Assert(list_length(opexpr->args) == 2); + + /* Process lefthand argument */ + expr = (Expr *) linitial(opexpr->args); + tle = makeTargetEntry(expr, + i, + NULL, + false); + lefttlist = lappend(lefttlist, tle); + + /* Process righthand argument */ + expr = (Expr *) lsecond(opexpr->args); + tle = makeTargetEntry(expr, + i, + NULL, + false); + righttlist = lappend(righttlist, tle); + + /* Lookup the equality function (potentially cross-type) */ + cross_eq_funcoids[i - 1] = opexpr->opfuncid; + fmgr_info(opexpr->opfuncid, &sstate->cur_eq_funcs[i - 1]); + fmgr_info_set_expr((Node *) opexpr, &sstate->cur_eq_funcs[i - 1]); + + /* Look up the equality function for the RHS type */ + if (!get_compatible_hash_operators(opexpr->opno, + NULL, &rhs_eq_oper)) + elog(ERROR, "could not find compatible hash operator for operator %u", + 
opexpr->opno); + sstate->tab_eq_funcoids[i - 1] = get_opcode(rhs_eq_oper); + fmgr_info(sstate->tab_eq_funcoids[i - 1], + &sstate->tab_eq_funcs[i - 1]); + + /* Lookup the associated hash functions */ + if (!get_op_hash_functions(opexpr->opno, + &left_hashfn, &right_hashfn)) + elog(ERROR, "could not find hash function for hash operator %u", + opexpr->opno); + fmgr_info(left_hashfn, &sstate->lhs_hash_funcs[i - 1]); + fmgr_info(right_hashfn, &sstate->tab_hash_funcs[i - 1]); + + /* Set collation */ + sstate->tab_collations[i - 1] = opexpr->inputcollid; + + /* keyColIdx is just column numbers 1..n */ + sstate->keyColIdx[i - 1] = i; + + i++; + } + + /* + * Construct tupdescs, slots and projection nodes for left and right + * sides. The lefthand expressions will be evaluated in the parent + * plan node's exprcontext, which we don't have access to here. + * Fortunately we can just pass NULL for now and fill it in later + * (hack alert!). The righthand expressions will be evaluated in our + * own innerecontext. + */ + tupDescLeft = ExecTypeFromTL(lefttlist); + slot = ExecInitExtraTupleSlot(estate, tupDescLeft, &TTSOpsVirtual); + sstate->projLeft = ExecBuildProjectionInfo(lefttlist, + NULL, + slot, + parent, + NULL); + + sstate->descRight = tupDescRight = ExecTypeFromTL(righttlist); + slot = ExecInitExtraTupleSlot(estate, tupDescRight, &TTSOpsVirtual); + sstate->projRight = ExecBuildProjectionInfo(righttlist, + sstate->innerecontext, + slot, + sstate->planstate, + NULL); + + /* + * Create comparator for lookups of rows in the table (potentially + * cross-type comparisons). + */ + sstate->cur_eq_comp = ExecBuildGroupingEqual(tupDescLeft, tupDescRight, + &TTSOpsVirtual, &TTSOpsMinimalTuple, + ncols, + sstate->keyColIdx, + cross_eq_funcoids, + sstate->tab_collations, + parent); + } + + return sstate; +} + +/* ---------------------------------------------------------------- + * ExecSetParamPlan + * + * Executes a subplan and sets its output parameters. + * + * This is called from ExecEvalParamExec() when the value of a PARAM_EXEC + * parameter is requested and the param's execPlan field is set (indicating + * that the param has not yet been evaluated). This allows lazy evaluation + * of initplans: we don't run the subplan until/unless we need its output. + * Note that this routine MUST clear the execPlan fields of the plan's + * output parameters after evaluating them! + * + * The results of this function are stored in the EState associated with the + * ExprContext (particularly, its ecxt_param_exec_vals); any pass-by-ref + * result Datums are allocated in the EState's per-query memory. The passed + * econtext can be any ExprContext belonging to that EState; which one is + * important only to the extent that the ExprContext's per-tuple memory + * context is used to evaluate any parameters passed down to the subplan. + * (Thus in principle, the shorter-lived the ExprContext the better, since + * that data isn't needed after we return. In practice, because initplan + * parameters are never more complex than Vars, Aggrefs, etc, evaluating them + * currently never leaks any memory anyway.) 
+ * ---------------------------------------------------------------- + */ +void +ExecSetParamPlan(SubPlanState *node, ExprContext *econtext) +{ + SubPlan *subplan = node->subplan; + PlanState *planstate = node->planstate; + SubLinkType subLinkType = subplan->subLinkType; + EState *estate = planstate->state; + ScanDirection dir = estate->es_direction; + MemoryContext oldcontext; + TupleTableSlot *slot; + ListCell *pvar; + ListCell *l; + bool found = false; + ArrayBuildStateAny *astate = NULL; + + if (subLinkType == ANY_SUBLINK || + subLinkType == ALL_SUBLINK) + elog(ERROR, "ANY/ALL subselect unsupported as initplan"); + if (subLinkType == CTE_SUBLINK) + elog(ERROR, "CTE subplans should not be executed via ExecSetParamPlan"); + + /* + * Enforce forward scan direction regardless of caller. It's hard but not + * impossible to get here in backward scan, so make it work anyway. + */ + estate->es_direction = ForwardScanDirection; + + /* Initialize ArrayBuildStateAny in caller's context, if needed */ + if (subLinkType == ARRAY_SUBLINK) + astate = initArrayResultAny(subplan->firstColType, + CurrentMemoryContext, true); + + /* + * Must switch to per-query memory context. + */ + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + /* + * Set Params of this plan from parent plan correlation values. (Any + * calculation we have to do is done in the parent econtext, since the + * Param values don't need to have per-query lifetime.) Currently, we + * expect only MULTIEXPR_SUBLINK plans to have any correlation values. + */ + Assert(subplan->parParam == NIL || subLinkType == MULTIEXPR_SUBLINK); + Assert(list_length(subplan->parParam) == list_length(node->args)); + + forboth(l, subplan->parParam, pvar, node->args) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar), + econtext, + &(prm->isnull)); + planstate->chgParam = bms_add_member(planstate->chgParam, paramid); + } + + /* + * Run the plan. (If it needs to be rescanned, the first ExecProcNode + * call will take care of that.) + */ + for (slot = ExecProcNode(planstate); + !TupIsNull(slot); + slot = ExecProcNode(planstate)) + { + TupleDesc tdesc = slot->tts_tupleDescriptor; + int i = 1; + + if (subLinkType == EXISTS_SUBLINK) + { + /* There can be only one setParam... */ + int paramid = linitial_int(subplan->setParam); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = BoolGetDatum(true); + prm->isnull = false; + found = true; + break; + } + + if (subLinkType == ARRAY_SUBLINK) + { + Datum dvalue; + bool disnull; + + found = true; + /* stash away current value */ + Assert(subplan->firstColType == TupleDescAttr(tdesc, 0)->atttypid); + dvalue = slot_getattr(slot, 1, &disnull); + astate = accumArrayResultAny(astate, dvalue, disnull, + subplan->firstColType, oldcontext); + /* keep scanning subplan to collect all values */ + continue; + } + + if (found && + (subLinkType == EXPR_SUBLINK || + subLinkType == MULTIEXPR_SUBLINK || + subLinkType == ROWCOMPARE_SUBLINK)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); + + found = true; + + /* + * We need to copy the subplan's tuple into our own context, in case + * any of the params are pass-by-ref type --- the pointers stored in + * the param structs will point at this copied tuple! 
node->curTuple + * keeps track of the copied tuple for eventual freeing. + */ + if (node->curTuple) + heap_freetuple(node->curTuple); + node->curTuple = ExecCopySlotHeapTuple(slot); + + /* + * Now set all the setParam params from the columns of the tuple + */ + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = heap_getattr(node->curTuple, i, tdesc, + &(prm->isnull)); + i++; + } + } + + if (subLinkType == ARRAY_SUBLINK) + { + /* There can be only one setParam... */ + int paramid = linitial_int(subplan->setParam); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + /* + * We build the result array in query context so it won't disappear; + * to avoid leaking memory across repeated calls, we have to remember + * the latest value, much as for curTuple above. + */ + if (node->curArray != PointerGetDatum(NULL)) + pfree(DatumGetPointer(node->curArray)); + node->curArray = makeArrayResultAny(astate, + econtext->ecxt_per_query_memory, + true); + prm->execPlan = NULL; + prm->value = node->curArray; + prm->isnull = false; + } + else if (!found) + { + if (subLinkType == EXISTS_SUBLINK) + { + /* There can be only one setParam... */ + int paramid = linitial_int(subplan->setParam); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = BoolGetDatum(false); + prm->isnull = false; + } + else + { + /* For other sublink types, set all the output params to NULL */ + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + prm->execPlan = NULL; + prm->value = (Datum) 0; + prm->isnull = true; + } + } + } + + MemoryContextSwitchTo(oldcontext); + + /* restore scan direction */ + estate->es_direction = dir; +} + +/* + * ExecSetParamPlanMulti + * + * Apply ExecSetParamPlan to evaluate any not-yet-evaluated initplan output + * parameters whose ParamIDs are listed in "params". Any listed params that + * are not initplan outputs are ignored. + * + * As with ExecSetParamPlan, any ExprContext belonging to the current EState + * can be used, but in principle a shorter-lived ExprContext is better than a + * longer-lived one. + */ +void +ExecSetParamPlanMulti(const Bitmapset *params, ExprContext *econtext) +{ + int paramid; + + paramid = -1; + while ((paramid = bms_next_member(params, paramid)) >= 0) + { + ParamExecData *prm = &(econtext->ecxt_param_exec_vals[paramid]); + + if (prm->execPlan != NULL) + { + /* Parameter not evaluated yet, so go do it */ + ExecSetParamPlan(prm->execPlan, econtext); + /* ExecSetParamPlan should have processed this param... */ + Assert(prm->execPlan == NULL); + } + } +} + +/* + * Mark an initplan as needing recalculation + */ +void +ExecReScanSetParamPlan(SubPlanState *node, PlanState *parent) +{ + PlanState *planstate = node->planstate; + SubPlan *subplan = node->subplan; + EState *estate = parent->state; + ListCell *l; + + /* sanity checks */ + if (subplan->parParam != NIL) + elog(ERROR, "direct correlated subquery unsupported as initplan"); + if (subplan->setParam == NIL) + elog(ERROR, "setParam list of initplan is empty"); + if (bms_is_empty(planstate->plan->extParam)) + elog(ERROR, "extParam set of initplan is empty"); + + /* + * Don't actually re-scan: it'll happen inside ExecSetParamPlan if needed. + */ + + /* + * Mark this subplan's output parameters as needing recalculation. 
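+ * (Re-setting prm->execPlan to this SubPlanState is what makes the next
+ * evaluation of the corresponding Param re-run the subplan via
+ * ExecSetParamPlan.)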
+ * + * CTE subplans are never executed via parameter recalculation; instead + * they get run when called by nodeCtescan.c. So don't mark the output + * parameter of a CTE subplan as dirty, but do set the chgParam bit for it + * so that dependent plan nodes will get told to rescan. + */ + foreach(l, subplan->setParam) + { + int paramid = lfirst_int(l); + ParamExecData *prm = &(estate->es_param_exec_vals[paramid]); + + if (subplan->subLinkType != CTE_SUBLINK) + prm->execPlan = node; + + parent->chgParam = bms_add_member(parent->chgParam, paramid); + } +} diff --git a/src/backend/executor/nodeSubqueryscan.c b/src/backend/executor/nodeSubqueryscan.c new file mode 100644 index 0000000..c09f628 --- /dev/null +++ b/src/backend/executor/nodeSubqueryscan.c @@ -0,0 +1,213 @@ +/*------------------------------------------------------------------------- + * + * nodeSubqueryscan.c + * Support routines for scanning subqueries (subselects in rangetable). + * + * This is just enough different from sublinks (nodeSubplan.c) to mean that + * we need two sets of code. Ought to look at trying to unify the cases. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSubqueryscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecSubqueryScan scans a subquery. + * ExecSubqueryNext retrieve next tuple in sequential order. + * ExecInitSubqueryScan creates and initializes a subqueryscan node. + * ExecEndSubqueryScan releases any storage allocated. + * ExecReScanSubqueryScan rescans the relation + * + */ +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeSubqueryscan.h" + +static TupleTableSlot *SubqueryNext(SubqueryScanState *node); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ +/* ---------------------------------------------------------------- + * SubqueryNext + * + * This is a workhorse for ExecSubqueryScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SubqueryNext(SubqueryScanState *node) +{ + TupleTableSlot *slot; + + /* + * Get the next tuple from the sub-query. + */ + slot = ExecProcNode(node->subplan); + + /* + * We just return the subplan's result slot, rather than expending extra + * cycles for ExecCopySlot(). (Our own ScanTupleSlot is used only for + * EvalPlanQual rechecks.) + */ + return slot; +} + +/* + * SubqueryRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SubqueryRecheck(SubqueryScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecSubqueryScan(node) + * + * Scans the subquery sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
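+ *
+ * (A SubqueryScan node appears, for example, above a sub-SELECT in FROM
+ * that the planner could not flatten into the parent query.)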
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecSubqueryScan(PlanState *pstate) +{ + SubqueryScanState *node = castNode(SubqueryScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) SubqueryNext, + (ExecScanRecheckMtd) SubqueryRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitSubqueryScan + * ---------------------------------------------------------------- + */ +SubqueryScanState * +ExecInitSubqueryScan(SubqueryScan *node, EState *estate, int eflags) +{ + SubqueryScanState *subquerystate; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* SubqueryScan should not have any "normal" children */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + subquerystate = makeNode(SubqueryScanState); + subquerystate->ss.ps.plan = (Plan *) node; + subquerystate->ss.ps.state = estate; + subquerystate->ss.ps.ExecProcNode = ExecSubqueryScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &subquerystate->ss.ps); + + /* + * initialize subquery + */ + subquerystate->subplan = ExecInitNode(node->subplan, estate, eflags); + + /* + * Initialize scan slot and type (needed by ExecAssignScanProjectionInfo) + */ + ExecInitScanTupleSlot(estate, &subquerystate->ss, + ExecGetResultType(subquerystate->subplan), + ExecGetResultSlotOps(subquerystate->subplan, NULL)); + + /* + * The slot used as the scantuple isn't the slot above (outside of EPQ), + * but the one from the node below. + */ + subquerystate->ss.ps.scanopsset = true; + subquerystate->ss.ps.scanops = ExecGetResultSlotOps(subquerystate->subplan, + &subquerystate->ss.ps.scanopsfixed); + subquerystate->ss.ps.resultopsset = true; + subquerystate->ss.ps.resultops = subquerystate->ss.ps.scanops; + subquerystate->ss.ps.resultopsfixed = subquerystate->ss.ps.scanopsfixed; + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&subquerystate->ss.ps); + ExecAssignScanProjectionInfo(&subquerystate->ss); + + /* + * initialize child expressions + */ + subquerystate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) subquerystate); + + return subquerystate; +} + +/* ---------------------------------------------------------------- + * ExecEndSubqueryScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndSubqueryScan(SubqueryScanState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the upper tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close down subquery + */ + ExecEndNode(node->subplan); +} + +/* ---------------------------------------------------------------- + * ExecReScanSubqueryScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanSubqueryScan(SubqueryScanState *node) +{ + ExecScanReScan(&node->ss); + + /* + * ExecReScan doesn't know about my subplan, so I have to do + * changed-parameter signaling myself. This is just as well, because the + * subplan has its own memory context in which its chgParam state lives. 
+ */ + if (node->ss.ps.chgParam != NULL) + UpdateChangedParamSet(node->subplan, node->ss.ps.chgParam); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->subplan->chgParam == NULL) + ExecReScan(node->subplan); +} diff --git a/src/backend/executor/nodeTableFuncscan.c b/src/backend/executor/nodeTableFuncscan.c new file mode 100644 index 0000000..4d7eca4 --- /dev/null +++ b/src/backend/executor/nodeTableFuncscan.c @@ -0,0 +1,523 @@ +/*------------------------------------------------------------------------- + * + * nodeTableFuncscan.c + * Support routines for scanning RangeTableFunc (XMLTABLE like functions). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTableFuncscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecTableFuncscan scans a function. + * ExecFunctionNext retrieve next tuple in sequential order. + * ExecInitTableFuncscan creates and initializes a TableFuncscan node. + * ExecEndTableFuncscan releases any storage allocated. + * ExecReScanTableFuncscan rescans the function + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeTableFuncscan.h" +#include "executor/tablefunc.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/xml.h" + +static TupleTableSlot *TableFuncNext(TableFuncScanState *node); +static bool TableFuncRecheck(TableFuncScanState *node, TupleTableSlot *slot); + +static void tfuncFetchRows(TableFuncScanState *tstate, ExprContext *econtext); +static void tfuncInitialize(TableFuncScanState *tstate, ExprContext *econtext, Datum doc); +static void tfuncLoadRows(TableFuncScanState *tstate, ExprContext *econtext); + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ +/* ---------------------------------------------------------------- + * TableFuncNext + * + * This is a workhorse for ExecTableFuncscan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TableFuncNext(TableFuncScanState *node) +{ + TupleTableSlot *scanslot; + + scanslot = node->ss.ss_ScanTupleSlot; + + /* + * If first time through, read all tuples from function and put them in a + * tuplestore. Subsequent calls just fetch tuples from tuplestore. + */ + if (node->tupstore == NULL) + tfuncFetchRows(node, node->ss.ps.ps_ExprContext); + + /* + * Get the next tuple from tuplestore. + */ + (void) tuplestore_gettupleslot(node->tupstore, + true, + false, + scanslot); + return scanslot; +} + +/* + * TableFuncRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TableFuncRecheck(TableFuncScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecTableFuncscan(node) + * + * Scans the function sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. 
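+ *
+ * (Illustrative example, not part of this file: a query such as
+ *		SELECT t.* FROM xmldata,
+ *			XMLTABLE('/rows/row' PASSING data COLUMNS id int) AS t
+ * reaches the executor as a TableFuncScan handled here; per the note below,
+ * XMLTABLE is currently the only such table function.)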
+ * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTableFuncScan(PlanState *pstate) +{ + TableFuncScanState *node = castNode(TableFuncScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TableFuncNext, + (ExecScanRecheckMtd) TableFuncRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitTableFuncscan + * ---------------------------------------------------------------- + */ +TableFuncScanState * +ExecInitTableFuncScan(TableFuncScan *node, EState *estate, int eflags) +{ + TableFuncScanState *scanstate; + TableFunc *tf = node->tablefunc; + TupleDesc tupdesc; + int i; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * TableFuncscan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new ScanState for node + */ + scanstate = makeNode(TableFuncScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecTableFuncScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * initialize source tuple type + */ + tupdesc = BuildDescFromLists(tf->colnames, + tf->coltypes, + tf->coltypmods, + tf->colcollations); + /* and the corresponding scan slot */ + ExecInitScanTupleSlot(estate, &scanstate->ss, tupdesc, + &TTSOpsMinimalTuple); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, &scanstate->ss.ps); + + /* Only XMLTABLE is supported currently */ + scanstate->routine = &XmlTableRoutine; + + scanstate->perTableCxt = + AllocSetContextCreate(CurrentMemoryContext, + "TableFunc per value context", + ALLOCSET_DEFAULT_SIZES); + scanstate->opaque = NULL; /* initialized at runtime */ + + scanstate->ns_names = tf->ns_names; + + scanstate->ns_uris = + ExecInitExprList(tf->ns_uris, (PlanState *) scanstate); + scanstate->docexpr = + ExecInitExpr((Expr *) tf->docexpr, (PlanState *) scanstate); + scanstate->rowexpr = + ExecInitExpr((Expr *) tf->rowexpr, (PlanState *) scanstate); + scanstate->colexprs = + ExecInitExprList(tf->colexprs, (PlanState *) scanstate); + scanstate->coldefexprs = + ExecInitExprList(tf->coldefexprs, (PlanState *) scanstate); + + scanstate->notnulls = tf->notnulls; + + /* these are allocated now and initialized later */ + scanstate->in_functions = palloc(sizeof(FmgrInfo) * tupdesc->natts); + scanstate->typioparams = palloc(sizeof(Oid) * tupdesc->natts); + + /* + * Fill in the necessary fmgr infos. + */ + for (i = 0; i < tupdesc->natts; i++) + { + Oid in_funcid; + + getTypeInputInfo(TupleDescAttr(tupdesc, i)->atttypid, + &in_funcid, &scanstate->typioparams[i]); + fmgr_info(in_funcid, &scanstate->in_functions[i]); + } + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndTableFuncscan + * + * frees any storage allocated through C routines. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndTableFuncScan(TableFuncScanState *node) +{ + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * Release tuplestore resources + */ + if (node->tupstore != NULL) + tuplestore_end(node->tupstore); + node->tupstore = NULL; +} + +/* ---------------------------------------------------------------- + * ExecReScanTableFuncscan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanTableFuncScan(TableFuncScanState *node) +{ + Bitmapset *chgparam = node->ss.ps.chgParam; + + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecScanReScan(&node->ss); + + /* + * Recompute when parameters are changed. + */ + if (chgparam) + { + if (node->tupstore != NULL) + { + tuplestore_end(node->tupstore); + node->tupstore = NULL; + } + } + + if (node->tupstore != NULL) + tuplestore_rescan(node->tupstore); +} + +/* ---------------------------------------------------------------- + * tfuncFetchRows + * + * Read rows from a TableFunc producer + * ---------------------------------------------------------------- + */ +static void +tfuncFetchRows(TableFuncScanState *tstate, ExprContext *econtext) +{ + const TableFuncRoutine *routine = tstate->routine; + MemoryContext oldcxt; + Datum value; + bool isnull; + + Assert(tstate->opaque == NULL); + + /* build tuplestore for the result */ + oldcxt = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + tstate->tupstore = tuplestore_begin_heap(false, false, work_mem); + + /* + * Each call to fetch a new set of rows - of which there may be very many + * if XMLTABLE is being used in a lateral join - will allocate a possibly + * substantial amount of memory, so we cannot use the per-query context + * here. perTableCxt now serves the same function as "argcontext" does in + * FunctionScan - a place to store per-one-call (i.e. one result table) + * lifetime data (as opposed to per-query or per-result-tuple). + */ + MemoryContextSwitchTo(tstate->perTableCxt); + + PG_TRY(); + { + routine->InitOpaque(tstate, + tstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor->natts); + + /* + * If evaluating the document expression returns NULL, the table + * expression is empty and we return immediately. + */ + value = ExecEvalExpr(tstate->docexpr, econtext, &isnull); + + if (!isnull) + { + /* otherwise, pass the document value to the table builder */ + tfuncInitialize(tstate, econtext, value); + + /* initialize ordinality counter */ + tstate->ordinal = 1; + + /* Load all rows into the tuplestore, and we're done */ + tfuncLoadRows(tstate, econtext); + } + } + PG_CATCH(); + { + if (tstate->opaque != NULL) + routine->DestroyOpaque(tstate); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* clean up and return to original memory context */ + + if (tstate->opaque != NULL) + { + routine->DestroyOpaque(tstate); + tstate->opaque = NULL; + } + + MemoryContextSwitchTo(oldcxt); + MemoryContextReset(tstate->perTableCxt); +} + +/* + * Fill in namespace declarations, the row filter, and column filters in a + * table expression builder context. 
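+ * For XMLTABLE these are the row-level XPath expression and the per-column
+ * PATH expressions (or, when a column has no PATH, the column name itself,
+ * as arranged below).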
+ */ +static void +tfuncInitialize(TableFuncScanState *tstate, ExprContext *econtext, Datum doc) +{ + const TableFuncRoutine *routine = tstate->routine; + TupleDesc tupdesc; + ListCell *lc1, + *lc2; + bool isnull; + int colno; + Datum value; + int ordinalitycol = + ((TableFuncScan *) (tstate->ss.ps.plan))->tablefunc->ordinalitycol; + + /* + * Install the document as a possibly-toasted Datum into the tablefunc + * context. + */ + routine->SetDocument(tstate, doc); + + /* Evaluate namespace specifications */ + forboth(lc1, tstate->ns_uris, lc2, tstate->ns_names) + { + ExprState *expr = (ExprState *) lfirst(lc1); + Value *ns_node = (Value *) lfirst(lc2); + char *ns_uri; + char *ns_name; + + value = ExecEvalExpr((ExprState *) expr, econtext, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("namespace URI must not be null"))); + ns_uri = TextDatumGetCString(value); + + /* DEFAULT is passed down to SetNamespace as NULL */ + ns_name = ns_node ? strVal(ns_node) : NULL; + + routine->SetNamespace(tstate, ns_name, ns_uri); + } + + /* Install the row filter expression into the table builder context */ + value = ExecEvalExpr(tstate->rowexpr, econtext, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("row filter expression must not be null"))); + + routine->SetRowFilter(tstate, TextDatumGetCString(value)); + + /* + * Install the column filter expressions into the table builder context. + * If an expression is given, use that; otherwise the column name itself + * is the column filter. + */ + colno = 0; + tupdesc = tstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + foreach(lc1, tstate->colexprs) + { + char *colfilter; + Form_pg_attribute att = TupleDescAttr(tupdesc, colno); + + if (colno != ordinalitycol) + { + ExprState *colexpr = lfirst(lc1); + + if (colexpr != NULL) + { + value = ExecEvalExpr(colexpr, econtext, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("column filter expression must not be null"), + errdetail("Filter for column \"%s\" is null.", + NameStr(att->attname)))); + colfilter = TextDatumGetCString(value); + } + else + colfilter = NameStr(att->attname); + + routine->SetColumnFilter(tstate, colfilter, colno); + } + + colno++; + } +} + +/* + * Load all the rows from the TableFunc table builder into a tuplestore. + */ +static void +tfuncLoadRows(TableFuncScanState *tstate, ExprContext *econtext) +{ + const TableFuncRoutine *routine = tstate->routine; + TupleTableSlot *slot = tstate->ss.ss_ScanTupleSlot; + TupleDesc tupdesc = slot->tts_tupleDescriptor; + Datum *values = slot->tts_values; + bool *nulls = slot->tts_isnull; + int natts = tupdesc->natts; + MemoryContext oldcxt; + int ordinalitycol; + + ordinalitycol = + ((TableFuncScan *) (tstate->ss.ps.plan))->tablefunc->ordinalitycol; + + /* + * We need a short-lived memory context that we can clean up each time + * around the loop, to avoid wasting space. Our default per-tuple context + * is fine for the job, since we won't have used it for anything yet in + * this tuple cycle. + */ + oldcxt = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Keep requesting rows from the table builder until there aren't any. 
+ */ + while (routine->FetchRow(tstate)) + { + ListCell *cell = list_head(tstate->coldefexprs); + int colno; + + CHECK_FOR_INTERRUPTS(); + + ExecClearTuple(tstate->ss.ss_ScanTupleSlot); + + /* + * Obtain the value of each column for this row, installing them into + * the slot; then add the tuple to the tuplestore. + */ + for (colno = 0; colno < natts; colno++) + { + Form_pg_attribute att = TupleDescAttr(tupdesc, colno); + + if (colno == ordinalitycol) + { + /* Fast path for ordinality column */ + values[colno] = Int32GetDatum(tstate->ordinal++); + nulls[colno] = false; + } + else + { + bool isnull; + + values[colno] = routine->GetValue(tstate, + colno, + att->atttypid, + att->atttypmod, + &isnull); + + /* No value? Evaluate and apply the default, if any */ + if (isnull && cell != NULL) + { + ExprState *coldefexpr = (ExprState *) lfirst(cell); + + if (coldefexpr != NULL) + values[colno] = ExecEvalExpr(coldefexpr, econtext, + &isnull); + } + + /* Verify a possible NOT NULL constraint */ + if (isnull && bms_is_member(colno, tstate->notnulls)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("null is not allowed in column \"%s\"", + NameStr(att->attname)))); + + nulls[colno] = isnull; + } + + /* advance list of default expressions */ + if (cell != NULL) + cell = lnext(tstate->coldefexprs, cell); + } + + tuplestore_putvalues(tstate->tupstore, tupdesc, values, nulls); + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + } + + MemoryContextSwitchTo(oldcxt); +} diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c new file mode 100644 index 0000000..2b0d205 --- /dev/null +++ b/src/backend/executor/nodeTidrangescan.c @@ -0,0 +1,413 @@ +/*------------------------------------------------------------------------- + * + * nodeTidrangescan.c + * Routines to support TID range scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTidrangescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "catalog/pg_operator.h" +#include "executor/execdebug.h" +#include "executor/nodeTidrangescan.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + + +#define IsCTIDVar(node) \ + ((node) != NULL && \ + IsA((node), Var) && \ + ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \ + ((Var *) (node))->varlevelsup == 0) + +typedef enum +{ + TIDEXPR_UPPER_BOUND, + TIDEXPR_LOWER_BOUND +} TidExprType; + +/* Upper or lower range bound for scan */ +typedef struct TidOpExpr +{ + TidExprType exprtype; /* type of op; lower or upper */ + ExprState *exprstate; /* ExprState for a TID-yielding subexpr */ + bool inclusive; /* whether op is inclusive */ +} TidOpExpr; + +/* + * For the given 'expr', build and return an appropriate TidOpExpr taking into + * account the expr's operator and operand order. 
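+ *
+ * For example (illustrative): in "ctid < '(10,1)'" the CTID Var is the left
+ * operand, so the right operand supplies an upper bound; in
+ * "'(10,1)' > ctid" the operands are swapped and the same upper bound is
+ * obtained by inverting the interpretation of the operator.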
+ */ +static TidOpExpr * +MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate) +{ + Node *arg1 = get_leftop((Expr *) expr); + Node *arg2 = get_rightop((Expr *) expr); + ExprState *exprstate = NULL; + bool invert = false; + TidOpExpr *tidopexpr; + + if (IsCTIDVar(arg1)) + exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps); + else if (IsCTIDVar(arg2)) + { + exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps); + invert = true; + } + else + elog(ERROR, "could not identify CTID variable"); + + tidopexpr = (TidOpExpr *) palloc(sizeof(TidOpExpr)); + tidopexpr->inclusive = false; /* for now */ + + switch (expr->opno) + { + case TIDLessEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDLessOperator: + tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND; + break; + case TIDGreaterEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDGreaterOperator: + tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND; + break; + default: + elog(ERROR, "could not identify CTID operator"); + } + + tidopexpr->exprstate = exprstate; + + return tidopexpr; +} + +/* + * Extract the qual subexpressions that yield TIDs to search for, + * and compile them into ExprStates if they're ordinary expressions. + */ +static void +TidExprListCreate(TidRangeScanState *tidrangestate) +{ + TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan; + List *tidexprs = NIL; + ListCell *l; + + foreach(l, node->tidrangequals) + { + OpExpr *opexpr = lfirst(l); + TidOpExpr *tidopexpr; + + if (!IsA(opexpr, OpExpr)) + elog(ERROR, "could not identify CTID expression"); + + tidopexpr = MakeTidOpExpr(opexpr, tidrangestate); + tidexprs = lappend(tidexprs, tidopexpr); + } + + tidrangestate->trss_tidexprs = tidexprs; +} + +/* ---------------------------------------------------------------- + * TidRangeEval + * + * Compute and set node's block and offset range to scan by evaluating + * the trss_tidexprs. Returns false if we detect the range cannot + * contain any tuples. Returns true if it's possible for the range to + * contain tuples. + * ---------------------------------------------------------------- + */ +static bool +TidRangeEval(TidRangeScanState *node) +{ + ExprContext *econtext = node->ss.ps.ps_ExprContext; + ItemPointerData lowerBound; + ItemPointerData upperBound; + ListCell *l; + + /* + * Set the upper and lower bounds to the absolute limits of the range of + * the ItemPointer type. Below we'll try to narrow this range on either + * side by looking at the TidOpExprs. + */ + ItemPointerSet(&lowerBound, 0, 0); + ItemPointerSet(&upperBound, InvalidBlockNumber, PG_UINT16_MAX); + + foreach(l, node->trss_tidexprs) + { + TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l); + ItemPointer itemptr; + bool isNull; + + /* Evaluate this bound. */ + itemptr = (ItemPointer) + DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate, + econtext, + &isNull)); + + /* If the bound is NULL, *nothing* matches the qual. */ + if (isNull) + return false; + + if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND) + { + ItemPointerData lb; + + ItemPointerCopy(itemptr, &lb); + + /* + * Normalize non-inclusive ranges to become inclusive. The + * resulting ItemPointer here may not be a valid item pointer. 
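+ * (Its offset number may be zero or lie beyond the last line pointer of its
+ * block; that is harmless, since it is used only as a comparison bound and
+ * never dereferenced.)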
+ */ + if (!tidopexpr->inclusive) + ItemPointerInc(&lb); + + /* Check if we can narrow the range using this qual */ + if (ItemPointerCompare(&lb, &lowerBound) > 0) + ItemPointerCopy(&lb, &lowerBound); + } + + else if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND) + { + ItemPointerData ub; + + ItemPointerCopy(itemptr, &ub); + + /* + * Normalize non-inclusive ranges to become inclusive. The + * resulting ItemPointer here may not be a valid item pointer. + */ + if (!tidopexpr->inclusive) + ItemPointerDec(&ub); + + /* Check if we can narrow the range using this qual */ + if (ItemPointerCompare(&ub, &upperBound) < 0) + ItemPointerCopy(&ub, &upperBound); + } + } + + ItemPointerCopy(&lowerBound, &node->trss_mintid); + ItemPointerCopy(&upperBound, &node->trss_maxtid); + + return true; +} + +/* ---------------------------------------------------------------- + * TidRangeNext + * + * Retrieve a tuple from the TidRangeScan node's currentRelation + * using the TIDs in the TidRangeScanState information. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TidRangeNext(TidRangeScanState *node) +{ + TableScanDesc scandesc; + EState *estate; + ScanDirection direction; + TupleTableSlot *slot; + + /* + * extract necessary information from TID scan node + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + slot = node->ss.ss_ScanTupleSlot; + direction = estate->es_direction; + + if (!node->trss_inScan) + { + /* First time through, compute TID range to scan */ + if (!TidRangeEval(node)) + return NULL; + + if (scandesc == NULL) + { + scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation, + estate->es_snapshot, + &node->trss_mintid, + &node->trss_maxtid); + node->ss.ss_currentScanDesc = scandesc; + } + else + { + /* rescan with the updated TID range */ + table_rescan_tidrange(scandesc, &node->trss_mintid, + &node->trss_maxtid); + } + + node->trss_inScan = true; + } + + /* Fetch the next tuple. */ + if (!table_scan_getnextslot_tidrange(scandesc, direction, slot)) + { + node->trss_inScan = false; + ExecClearTuple(slot); + } + + return slot; +} + +/* + * TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot) +{ + return true; +} + +/* ---------------------------------------------------------------- + * ExecTidRangeScan(node) + * + * Scans the relation using tids and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for TID range scanning. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTidRangeScan(PlanState *pstate) +{ + TidRangeScanState *node = castNode(TidRangeScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TidRangeNext, + (ExecScanRecheckMtd) TidRangeRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanTidRangeScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanTidRangeScan(TidRangeScanState *node) +{ + /* mark scan as not in progress, and tid range list as not computed yet */ + node->trss_inScan = false; + + /* + * We must wait until TidRangeNext before calling table_rescan_tidrange. 
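+ * (The range bounds may depend on parameters whose new values are only
+ * evaluated when the scan is next executed.)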
+ */ + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecEndTidRangeScan + * + * Releases any storage allocated through C routines. + * Returns nothing. + * ---------------------------------------------------------------- + */ +void +ExecEndTidRangeScan(TidRangeScanState *node) +{ + TableScanDesc scan = node->ss.ss_currentScanDesc; + + if (scan != NULL) + table_endscan(scan); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecInitTidRangeScan + * + * Initializes the tid range scan's state information, creates + * scan keys, and opens the scan relation. + * + * Parameters: + * node: TidRangeScan node produced by the planner. + * estate: the execution state initialized in InitPlan. + * ---------------------------------------------------------------- + */ +TidRangeScanState * +ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags) +{ + TidRangeScanState *tidrangestate; + Relation currentRelation; + + /* + * create state structure + */ + tidrangestate = makeNode(TidRangeScanState); + tidrangestate->ss.ps.plan = (Plan *) node; + tidrangestate->ss.ps.state = estate; + tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &tidrangestate->ss.ps); + + /* + * mark scan as not in progress, and TID range as not computed yet + */ + tidrangestate->trss_inScan = false; + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + tidrangestate->ss.ss_currentRelation = currentRelation; + tidrangestate->ss.ss_currentScanDesc = NULL; /* no table scan here */ + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &tidrangestate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&tidrangestate->ss.ps); + ExecAssignScanProjectionInfo(&tidrangestate->ss); + + /* + * initialize child expressions + */ + tidrangestate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate); + + TidExprListCreate(tidrangestate); + + /* + * all done. + */ + return tidrangestate; +} diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c new file mode 100644 index 0000000..48c3737 --- /dev/null +++ b/src/backend/executor/nodeTidscan.c @@ -0,0 +1,558 @@ +/*------------------------------------------------------------------------- + * + * nodeTidscan.c + * Routines to support direct tid scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTidscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * + * ExecTidScan scans a relation using tids + * ExecInitTidScan creates and initializes state info. + * ExecReScanTidScan rescans the tid relation. + * ExecEndTidScan releases all storage. 
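+ *
+ * (A TID scan is chosen when the quals pin ctid to specific values, for
+ * example "WHERE ctid = '(0,1)'", "WHERE ctid = ANY (...)", or
+ * WHERE CURRENT OF.)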
+ */ +#include "postgres.h" + +#include "access/sysattr.h" +#include "access/tableam.h" +#include "catalog/pg_type.h" +#include "executor/execdebug.h" +#include "executor/nodeTidscan.h" +#include "lib/qunique.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/array.h" +#include "utils/rel.h" + + +#define IsCTIDVar(node) \ + ((node) != NULL && \ + IsA((node), Var) && \ + ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \ + ((Var *) (node))->varlevelsup == 0) + +/* one element in tss_tidexprs */ +typedef struct TidExpr +{ + ExprState *exprstate; /* ExprState for a TID-yielding subexpr */ + bool isarray; /* if true, it yields tid[] not just tid */ + CurrentOfExpr *cexpr; /* alternatively, we can have CURRENT OF */ +} TidExpr; + +static void TidExprListCreate(TidScanState *tidstate); +static void TidListEval(TidScanState *tidstate); +static int itemptr_comparator(const void *a, const void *b); +static TupleTableSlot *TidNext(TidScanState *node); + + +/* + * Extract the qual subexpressions that yield TIDs to search for, + * and compile them into ExprStates if they're ordinary expressions. + * + * CURRENT OF is a special case that we can't compile usefully; + * just drop it into the TidExpr list as-is. + */ +static void +TidExprListCreate(TidScanState *tidstate) +{ + TidScan *node = (TidScan *) tidstate->ss.ps.plan; + ListCell *l; + + tidstate->tss_tidexprs = NIL; + tidstate->tss_isCurrentOf = false; + + foreach(l, node->tidquals) + { + Expr *expr = (Expr *) lfirst(l); + TidExpr *tidexpr = (TidExpr *) palloc0(sizeof(TidExpr)); + + if (is_opclause(expr)) + { + Node *arg1; + Node *arg2; + + arg1 = get_leftop(expr); + arg2 = get_rightop(expr); + if (IsCTIDVar(arg1)) + tidexpr->exprstate = ExecInitExpr((Expr *) arg2, + &tidstate->ss.ps); + else if (IsCTIDVar(arg2)) + tidexpr->exprstate = ExecInitExpr((Expr *) arg1, + &tidstate->ss.ps); + else + elog(ERROR, "could not identify CTID variable"); + tidexpr->isarray = false; + } + else if (expr && IsA(expr, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saex = (ScalarArrayOpExpr *) expr; + + Assert(IsCTIDVar(linitial(saex->args))); + tidexpr->exprstate = ExecInitExpr(lsecond(saex->args), + &tidstate->ss.ps); + tidexpr->isarray = true; + } + else if (expr && IsA(expr, CurrentOfExpr)) + { + CurrentOfExpr *cexpr = (CurrentOfExpr *) expr; + + tidexpr->cexpr = cexpr; + tidstate->tss_isCurrentOf = true; + } + else + elog(ERROR, "could not identify CTID expression"); + + tidstate->tss_tidexprs = lappend(tidstate->tss_tidexprs, tidexpr); + } + + /* CurrentOfExpr could never appear OR'd with something else */ + Assert(list_length(tidstate->tss_tidexprs) == 1 || + !tidstate->tss_isCurrentOf); +} + +/* + * Compute the list of TIDs to be visited, by evaluating the expressions + * for them. + * + * (The result is actually an array, not a list.) + */ +static void +TidListEval(TidScanState *tidstate) +{ + ExprContext *econtext = tidstate->ss.ps.ps_ExprContext; + TableScanDesc scan; + ItemPointerData *tidList; + int numAllocTids; + int numTids; + ListCell *l; + + /* + * Start scan on-demand - initializing a scan isn't free (e.g. heap stats + * the size of the table), so it makes sense to delay that until needed - + * the node might never get executed. 
+ */ + if (tidstate->ss.ss_currentScanDesc == NULL) + tidstate->ss.ss_currentScanDesc = + table_beginscan_tid(tidstate->ss.ss_currentRelation, + tidstate->ss.ps.state->es_snapshot); + scan = tidstate->ss.ss_currentScanDesc; + + /* + * We initialize the array with enough slots for the case that all quals + * are simple OpExprs or CurrentOfExprs. If there are any + * ScalarArrayOpExprs, we may have to enlarge the array. + */ + numAllocTids = list_length(tidstate->tss_tidexprs); + tidList = (ItemPointerData *) + palloc(numAllocTids * sizeof(ItemPointerData)); + numTids = 0; + + foreach(l, tidstate->tss_tidexprs) + { + TidExpr *tidexpr = (TidExpr *) lfirst(l); + ItemPointer itemptr; + bool isNull; + + if (tidexpr->exprstate && !tidexpr->isarray) + { + itemptr = (ItemPointer) + DatumGetPointer(ExecEvalExprSwitchContext(tidexpr->exprstate, + econtext, + &isNull)); + if (isNull) + continue; + + /* + * We silently discard any TIDs that the AM considers invalid + * (E.g. for heap, they could be out of range at the time of scan + * start. Since we hold at least AccessShareLock on the table, it + * won't be possible for someone to truncate away the blocks we + * intend to visit.). + */ + if (!table_tuple_tid_valid(scan, itemptr)) + continue; + + if (numTids >= numAllocTids) + { + numAllocTids *= 2; + tidList = (ItemPointerData *) + repalloc(tidList, + numAllocTids * sizeof(ItemPointerData)); + } + tidList[numTids++] = *itemptr; + } + else if (tidexpr->exprstate && tidexpr->isarray) + { + Datum arraydatum; + ArrayType *itemarray; + Datum *ipdatums; + bool *ipnulls; + int ndatums; + int i; + + arraydatum = ExecEvalExprSwitchContext(tidexpr->exprstate, + econtext, + &isNull); + if (isNull) + continue; + itemarray = DatumGetArrayTypeP(arraydatum); + deconstruct_array(itemarray, + TIDOID, sizeof(ItemPointerData), false, TYPALIGN_SHORT, + &ipdatums, &ipnulls, &ndatums); + if (numTids + ndatums > numAllocTids) + { + numAllocTids = numTids + ndatums; + tidList = (ItemPointerData *) + repalloc(tidList, + numAllocTids * sizeof(ItemPointerData)); + } + for (i = 0; i < ndatums; i++) + { + if (ipnulls[i]) + continue; + + itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]); + + if (!table_tuple_tid_valid(scan, itemptr)) + continue; + + tidList[numTids++] = *itemptr; + } + pfree(ipdatums); + pfree(ipnulls); + } + else + { + ItemPointerData cursor_tid; + + Assert(tidexpr->cexpr); + if (execCurrentOf(tidexpr->cexpr, econtext, + RelationGetRelid(tidstate->ss.ss_currentRelation), + &cursor_tid)) + { + if (numTids >= numAllocTids) + { + numAllocTids *= 2; + tidList = (ItemPointerData *) + repalloc(tidList, + numAllocTids * sizeof(ItemPointerData)); + } + tidList[numTids++] = cursor_tid; + } + } + } + + /* + * Sort the array of TIDs into order, and eliminate duplicates. + * Eliminating duplicates is necessary since we want OR semantics across + * the list. Sorting makes it easier to detect duplicates, and as a bonus + * ensures that we will visit the heap in the most efficient way. 
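+ *
+ * For example (illustrative): "WHERE ctid = '(0,2)' OR
+ * ctid = ANY ('{(0,1),(0,2)}')" should visit (0,1) and (0,2) exactly once
+ * each, in block/offset order.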
+ */ + if (numTids > 1) + { + /* CurrentOfExpr could never appear OR'd with something else */ + Assert(!tidstate->tss_isCurrentOf); + + qsort((void *) tidList, numTids, sizeof(ItemPointerData), + itemptr_comparator); + numTids = qunique(tidList, numTids, sizeof(ItemPointerData), + itemptr_comparator); + } + + tidstate->tss_TidList = tidList; + tidstate->tss_NumTids = numTids; + tidstate->tss_TidPtr = -1; +} + +/* + * qsort comparator for ItemPointerData items + */ +static int +itemptr_comparator(const void *a, const void *b) +{ + const ItemPointerData *ipa = (const ItemPointerData *) a; + const ItemPointerData *ipb = (const ItemPointerData *) b; + BlockNumber ba = ItemPointerGetBlockNumber(ipa); + BlockNumber bb = ItemPointerGetBlockNumber(ipb); + OffsetNumber oa = ItemPointerGetOffsetNumber(ipa); + OffsetNumber ob = ItemPointerGetOffsetNumber(ipb); + + if (ba < bb) + return -1; + if (ba > bb) + return 1; + if (oa < ob) + return -1; + if (oa > ob) + return 1; + return 0; +} + +/* ---------------------------------------------------------------- + * TidNext + * + * Retrieve a tuple from the TidScan node's currentRelation + * using the tids in the TidScanState information. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TidNext(TidScanState *node) +{ + EState *estate; + ScanDirection direction; + Snapshot snapshot; + TableScanDesc scan; + Relation heapRelation; + TupleTableSlot *slot; + ItemPointerData *tidList; + int numTids; + bool bBackward; + + /* + * extract necessary information from tid scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + snapshot = estate->es_snapshot; + heapRelation = node->ss.ss_currentRelation; + slot = node->ss.ss_ScanTupleSlot; + + /* + * First time through, compute the list of TIDs to be visited + */ + if (node->tss_TidList == NULL) + TidListEval(node); + + scan = node->ss.ss_currentScanDesc; + tidList = node->tss_TidList; + numTids = node->tss_NumTids; + + /* + * Initialize or advance scan position, depending on direction. + */ + bBackward = ScanDirectionIsBackward(direction); + if (bBackward) + { + if (node->tss_TidPtr < 0) + { + /* initialize for backward scan */ + node->tss_TidPtr = numTids - 1; + } + else + node->tss_TidPtr--; + } + else + { + if (node->tss_TidPtr < 0) + { + /* initialize for forward scan */ + node->tss_TidPtr = 0; + } + else + node->tss_TidPtr++; + } + + while (node->tss_TidPtr >= 0 && node->tss_TidPtr < numTids) + { + ItemPointerData tid = tidList[node->tss_TidPtr]; + + /* + * For WHERE CURRENT OF, the tuple retrieved from the cursor might + * since have been updated; if so, we should fetch the version that is + * current according to our snapshot. + */ + if (node->tss_isCurrentOf) + table_tuple_get_latest_tid(scan, &tid); + + if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot)) + return slot; + + /* Bad TID or failed snapshot qual; try next */ + if (bBackward) + node->tss_TidPtr--; + else + node->tss_TidPtr++; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * if we get here it means the tid scan failed so we are at the end of the + * scan.. + */ + return ExecClearTuple(slot); +} + +/* + * TidRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TidRecheck(TidScanState *node, TupleTableSlot *slot) +{ + /* + * XXX shouldn't we check here to make sure tuple matches TID list? In + * runtime-key case this is not certain, is it? However, in the WHERE + * CURRENT OF case it might not match anyway ... 
+ */ + return true; +} + + +/* ---------------------------------------------------------------- + * ExecTidScan(node) + * + * Scans the relation using tids and returns + * the next qualifying tuple in the direction specified. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for scanning so that the + * "cursor" is positioned before the first qualifying tuple. + * -- tss_TidPtr is -1. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTidScan(PlanState *pstate) +{ + TidScanState *node = castNode(TidScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TidNext, + (ExecScanRecheckMtd) TidRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanTidScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanTidScan(TidScanState *node) +{ + if (node->tss_TidList) + pfree(node->tss_TidList); + node->tss_TidList = NULL; + node->tss_NumTids = 0; + node->tss_TidPtr = -1; + + /* not really necessary, but seems good form */ + if (node->ss.ss_currentScanDesc) + table_rescan(node->ss.ss_currentScanDesc, NULL); + + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecEndTidScan + * + * Releases any storage allocated through C routines. + * Returns nothing. + * ---------------------------------------------------------------- + */ +void +ExecEndTidScan(TidScanState *node) +{ + if (node->ss.ss_currentScanDesc) + table_endscan(node->ss.ss_currentScanDesc); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecInitTidScan + * + * Initializes the tid scan's state information, creates + * scan keys, and opens the base and tid relations. + * + * Parameters: + * node: TidScan node produced by the planner. + * estate: the execution state initialized in InitPlan. + * ---------------------------------------------------------------- + */ +TidScanState * +ExecInitTidScan(TidScan *node, EState *estate, int eflags) +{ + TidScanState *tidstate; + Relation currentRelation; + + /* + * create state structure + */ + tidstate = makeNode(TidScanState); + tidstate->ss.ps.plan = (Plan *) node; + tidstate->ss.ps.state = estate; + tidstate->ss.ps.ExecProcNode = ExecTidScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &tidstate->ss.ps); + + /* + * mark tid list as not computed yet + */ + tidstate->tss_TidList = NULL; + tidstate->tss_NumTids = 0; + tidstate->tss_TidPtr = -1; + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + tidstate->ss.ss_currentRelation = currentRelation; + tidstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + + /* + * get the scan type from the relation descriptor. + */ + ExecInitScanTupleSlot(estate, &tidstate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. 
+ */ + ExecInitResultTypeTL(&tidstate->ss.ps); + ExecAssignScanProjectionInfo(&tidstate->ss); + + /* + * initialize child expressions + */ + tidstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) tidstate); + + TidExprListCreate(tidstate); + + /* + * all done. + */ + return tidstate; +} diff --git a/src/backend/executor/nodeUnique.c b/src/backend/executor/nodeUnique.c new file mode 100644 index 0000000..9214d6f --- /dev/null +++ b/src/backend/executor/nodeUnique.c @@ -0,0 +1,192 @@ +/*------------------------------------------------------------------------- + * + * nodeUnique.c + * Routines to handle unique'ing of queries where appropriate + * + * Unique is a very simple node type that just filters out duplicate + * tuples from a stream of sorted tuples from its subplan. It's essentially + * a dumbed-down form of Group: the duplicate-removal functionality is + * identical. However, Unique doesn't do projection nor qual checking, + * so it's marginally more efficient for cases where neither is needed. + * (It's debatable whether the savings justifies carrying two plan node + * types, though.) + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeUnique.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecUnique - generate a unique'd temporary relation + * ExecInitUnique - initialize node and subnodes + * ExecEndUnique - shutdown node and subnodes + * + * NOTES + * Assumes tuples returned from subplan arrive in + * sorted order. + */ + +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeUnique.h" +#include "miscadmin.h" +#include "utils/memutils.h" + + +/* ---------------------------------------------------------------- + * ExecUnique + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecUnique(PlanState *pstate) +{ + UniqueState *node = castNode(UniqueState, pstate); + ExprContext *econtext = node->ps.ps_ExprContext; + TupleTableSlot *resultTupleSlot; + TupleTableSlot *slot; + PlanState *outerPlan; + + CHECK_FOR_INTERRUPTS(); + + /* + * get information from the node + */ + outerPlan = outerPlanState(node); + resultTupleSlot = node->ps.ps_ResultTupleSlot; + + /* + * now loop, returning only non-duplicate tuples. We assume that the + * tuples arrive in sorted order so we can detect duplicates easily. The + * first tuple of each group is returned. + */ + for (;;) + { + /* + * fetch a tuple from the outer subplan + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + /* end of subplan, so we're done */ + ExecClearTuple(resultTupleSlot); + return NULL; + } + + /* + * Always return the first tuple from the subplan. + */ + if (TupIsNull(resultTupleSlot)) + break; + + /* + * Else test if the new tuple and the previously returned tuple match. + * If so then we loop back and fetch another new tuple from the + * subplan. + */ + econtext->ecxt_innertuple = slot; + econtext->ecxt_outertuple = resultTupleSlot; + if (!ExecQualAndReset(node->eqfunction, econtext)) + break; + } + + /* + * We have a new tuple different from the previous saved tuple (if any). + * Save it and return it. We must copy it because the source subplan + * won't guarantee that this source tuple is still accessible after + * fetching the next source tuple. 
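+ * (ExecCopySlot materializes the tuple into resultTupleSlot's own storage,
+ * so it remains valid across the next ExecProcNode call and serves as the
+ * comparison value for the following iteration.)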
+ */ + return ExecCopySlot(resultTupleSlot, slot); +} + +/* ---------------------------------------------------------------- + * ExecInitUnique + * + * This initializes the unique node state structures and + * the node's subplan. + * ---------------------------------------------------------------- + */ +UniqueState * +ExecInitUnique(Unique *node, EState *estate, int eflags) +{ + UniqueState *uniquestate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + uniquestate = makeNode(UniqueState); + uniquestate->ps.plan = (Plan *) node; + uniquestate->ps.state = estate; + uniquestate->ps.ExecProcNode = ExecUnique; + + /* + * create expression context + */ + ExecAssignExprContext(estate, &uniquestate->ps); + + /* + * then initialize outer plan + */ + outerPlanState(uniquestate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize result slot and type. Unique nodes do no projections, so + * initialize projection info for this node appropriately. + */ + ExecInitResultTupleSlotTL(&uniquestate->ps, &TTSOpsMinimalTuple); + uniquestate->ps.ps_ProjInfo = NULL; + + /* + * Precompute fmgr lookup data for inner loop + */ + uniquestate->eqfunction = + execTuplesMatchPrepare(ExecGetResultType(outerPlanState(uniquestate)), + node->numCols, + node->uniqColIdx, + node->uniqOperators, + node->uniqCollations, + &uniquestate->ps); + + return uniquestate; +} + +/* ---------------------------------------------------------------- + * ExecEndUnique + * + * This shuts down the subplan and frees resources allocated + * to this node. + * ---------------------------------------------------------------- + */ +void +ExecEndUnique(UniqueState *node) +{ + /* clean up tuple table */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + ExecFreeExprContext(&node->ps); + + ExecEndNode(outerPlanState(node)); +} + + +void +ExecReScanUnique(UniqueState *node) +{ + /* must clear result tuple so first input tuple is returned */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} diff --git a/src/backend/executor/nodeValuesscan.c b/src/backend/executor/nodeValuesscan.c new file mode 100644 index 0000000..5de1429 --- /dev/null +++ b/src/backend/executor/nodeValuesscan.c @@ -0,0 +1,361 @@ +/*------------------------------------------------------------------------- + * + * nodeValuesscan.c + * Support routines for scanning Values lists + * ("VALUES (...), (...), ..." in rangetable). + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeValuesscan.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * ExecValuesScan scans a values list. + * ExecValuesNext retrieve next tuple in sequential order. + * ExecInitValuesScan creates and initializes a valuesscan node. + * ExecEndValuesScan releases any storage allocated. 
+ * ExecReScanValuesScan rescans the values list + */ +#include "postgres.h" + +#include "executor/executor.h" +#include "executor/nodeValuesscan.h" +#include "jit/jit.h" +#include "optimizer/clauses.h" +#include "utils/expandeddatum.h" + + +static TupleTableSlot *ValuesNext(ValuesScanState *node); + + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ValuesNext + * + * This is a workhorse for ExecValuesScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ValuesNext(ValuesScanState *node) +{ + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + ScanDirection direction; + int curr_idx; + + /* + * get information from the estate and scan state + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + slot = node->ss.ss_ScanTupleSlot; + econtext = node->rowcontext; + + /* + * Get the next tuple. Return NULL if no more tuples. + */ + if (ScanDirectionIsForward(direction)) + { + if (node->curr_idx < node->array_len) + node->curr_idx++; + } + else + { + if (node->curr_idx >= 0) + node->curr_idx--; + } + + /* + * Always clear the result slot; this is appropriate if we are at the end + * of the data, and if we're not, we still need it as the first step of + * the store-virtual-tuple protocol. It seems wise to clear the slot + * before we reset the context it might have pointers into. + */ + ExecClearTuple(slot); + + curr_idx = node->curr_idx; + if (curr_idx >= 0 && curr_idx < node->array_len) + { + List *exprlist = node->exprlists[curr_idx]; + List *exprstatelist = node->exprstatelists[curr_idx]; + MemoryContext oldContext; + Datum *values; + bool *isnull; + ListCell *lc; + int resind; + + /* + * Get rid of any prior cycle's leftovers. We use ReScanExprContext + * not just ResetExprContext because we want any registered shutdown + * callbacks to be called. + */ + ReScanExprContext(econtext); + + /* + * Do per-VALUES-row work in the per-tuple context. + */ + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* + * Unless we already made the expression eval state for this row, + * build it in the econtext's per-tuple memory. This is a tad + * unusual, but we want to delete the eval state again when we move to + * the next row, to avoid growth of memory requirements over a long + * values list. For rows in which that won't work, we already built + * the eval state at plan startup. + */ + if (exprstatelist == NIL) + { + /* + * Pass parent as NULL, not my plan node, because we don't want + * anything in this transient state linking into permanent state. + * The only expression type that might wish to do so is a SubPlan, + * and we already checked that there aren't any. + * + * Note that passing parent = NULL also disables JIT compilation + * of the expressions, which is a win, because they're only going + * to be used once under normal circumstances. + */ + exprstatelist = ExecInitExprList(exprlist, NULL); + } + + /* parser should have checked all sublists are the same length */ + Assert(list_length(exprstatelist) == slot->tts_tupleDescriptor->natts); + + /* + * Compute the expressions and build a virtual result tuple. We + * already did ExecClearTuple(slot). 
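The clear / fill / store-virtual protocol referred to here is the standard way executor nodes build virtual tuples. A minimal sketch, using the real slot APIs but a hypothetical helper name and assuming a slot whose tuple descriptor has exactly two attributes of suitable types:

#include "postgres.h"
#include "executor/tuptable.h"

/*
 * Hypothetical helper (not part of this file): build a two-column virtual
 * tuple, following the same clear -> fill tts_values/tts_isnull -> store
 * protocol used by ValuesNext().
 */
static void
store_two_column_virtual_tuple(TupleTableSlot *slot, Datum col1,
							   Datum col2, bool col2_isnull)
{
	ExecClearTuple(slot);			/* step 1: empty the slot */

	slot->tts_values[0] = col1;		/* step 2: fill the value/null arrays */
	slot->tts_isnull[0] = false;
	slot->tts_values[1] = col2;
	slot->tts_isnull[1] = col2_isnull;

	ExecStoreVirtualTuple(slot);	/* step 3: mark the slot as holding a
									 * complete virtual tuple */
}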
+ */ + values = slot->tts_values; + isnull = slot->tts_isnull; + + resind = 0; + foreach(lc, exprstatelist) + { + ExprState *estate = (ExprState *) lfirst(lc); + Form_pg_attribute attr = TupleDescAttr(slot->tts_tupleDescriptor, + resind); + + values[resind] = ExecEvalExpr(estate, + econtext, + &isnull[resind]); + + /* + * We must force any R/W expanded datums to read-only state, in + * case they are multiply referenced in the plan node's output + * expressions, or in case we skip the output projection and the + * output column is multiply referenced in higher plan nodes. + */ + values[resind] = MakeExpandedObjectReadOnly(values[resind], + isnull[resind], + attr->attlen); + + resind++; + } + + MemoryContextSwitchTo(oldContext); + + /* + * And return the virtual tuple. + */ + ExecStoreVirtualTuple(slot); + } + + return slot; +} + +/* + * ValuesRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +ValuesRecheck(ValuesScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecValuesScan(node) + * + * Scans the values lists sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecValuesScan(PlanState *pstate) +{ + ValuesScanState *node = castNode(ValuesScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) ValuesNext, + (ExecScanRecheckMtd) ValuesRecheck); +} + +/* ---------------------------------------------------------------- + * ExecInitValuesScan + * ---------------------------------------------------------------- + */ +ValuesScanState * +ExecInitValuesScan(ValuesScan *node, EState *estate, int eflags) +{ + ValuesScanState *scanstate; + TupleDesc tupdesc; + ListCell *vtl; + int i; + PlanState *planstate; + + /* + * ValuesScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new ScanState for node + */ + scanstate = makeNode(ValuesScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecValuesScan; + + /* + * Miscellaneous initialization + */ + planstate = &scanstate->ss.ps; + + /* + * Create expression contexts. We need two, one for per-sublist + * processing and one for execScan.c to use for quals and projections. We + * cheat a little by using ExecAssignExprContext() to build both. + */ + ExecAssignExprContext(estate, planstate); + scanstate->rowcontext = planstate->ps_ExprContext; + ExecAssignExprContext(estate, planstate); + + /* + * Get info about values list, initialize scan slot with it. + */ + tupdesc = ExecTypeFromExprList((List *) linitial(node->values_lists)); + ExecInitScanTupleSlot(estate, &scanstate->ss, tupdesc, &TTSOpsVirtual); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + /* + * Other node-specific setup + */ + scanstate->curr_idx = -1; + scanstate->array_len = list_length(node->values_lists); + + /* + * Convert the list of expression sublists into an array for easier + * addressing at runtime. 
Also, detect whether any sublists contain + * SubPlans; for just those sublists, go ahead and do expression + * initialization. (This avoids problems with SubPlans wanting to connect + * themselves up to the outer plan tree. Notably, EXPLAIN won't see the + * subplans otherwise; also we will have troubles with dangling pointers + * and/or leaked resources if we try to handle SubPlans the same as + * simpler expressions.) + */ + scanstate->exprlists = (List **) + palloc(scanstate->array_len * sizeof(List *)); + scanstate->exprstatelists = (List **) + palloc0(scanstate->array_len * sizeof(List *)); + i = 0; + foreach(vtl, node->values_lists) + { + List *exprs = castNode(List, lfirst(vtl)); + + scanstate->exprlists[i] = exprs; + + /* + * We can avoid the cost of a contain_subplans() scan in the simple + * case where there are no SubPlans anywhere. + */ + if (estate->es_subplanstates && + contain_subplans((Node *) exprs)) + { + int saved_jit_flags; + + /* + * As these expressions are only used once, disable JIT for them. + * This is worthwhile because it's common to insert significant + * amounts of data via VALUES(). Note that this doesn't prevent + * use of JIT *within* a subplan, since that's initialized + * separately; this just affects the upper-level subexpressions. + */ + saved_jit_flags = estate->es_jit_flags; + estate->es_jit_flags = PGJIT_NONE; + + scanstate->exprstatelists[i] = ExecInitExprList(exprs, + &scanstate->ss.ps); + + estate->es_jit_flags = saved_jit_flags; + } + i++; + } + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndValuesScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndValuesScan(ValuesScanState *node) +{ + /* + * Free both exprcontexts + */ + ExecFreeExprContext(&node->ss.ps); + node->ss.ps.ps_ExprContext = node->rowcontext; + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanValuesScan + * + * Rescans the relation. + * ---------------------------------------------------------------- + */ +void +ExecReScanValuesScan(ValuesScanState *node) +{ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + node->curr_idx = -1; +} diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c new file mode 100644 index 0000000..f8ea9e9 --- /dev/null +++ b/src/backend/executor/nodeWindowAgg.c @@ -0,0 +1,3463 @@ +/*------------------------------------------------------------------------- + * + * nodeWindowAgg.c + * routines to handle WindowAgg nodes. + * + * A WindowAgg node evaluates "window functions" across suitable partitions + * of the input tuple set. Any one WindowAgg works for just a single window + * specification, though it can evaluate multiple window functions sharing + * identical window specifications. The input tuples are required to be + * delivered in sorted order, with the PARTITION BY columns (if any) as + * major sort keys and the ORDER BY columns (if any) as minor sort keys. + * (The planner generates a stack of WindowAggs with intervening Sort nodes + * as needed, if a query involves more than one window specification.) 
+ * + * Since window functions can require access to any or all of the rows in + * the current partition, we accumulate rows of the partition into a + * tuplestore. The window functions are called using the WindowObject API + * so that they can access those rows as needed. + * + * We also support using plain aggregate functions as window functions. + * For these, the regular Agg-node environment is emulated for each partition. + * As required by the SQL spec, the output represents the value of the + * aggregate function over all rows in the current row's window frame. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeWindowAgg.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_proc.h" +#include "executor/executor.h" +#include "executor/nodeWindowAgg.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" +#include "parser/parse_agg.h" +#include "parser/parse_coerce.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/expandeddatum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/regproc.h" +#include "utils/syscache.h" +#include "windowapi.h" + +/* + * All the window function APIs are called with this object, which is passed + * to window functions as fcinfo->context. + */ +typedef struct WindowObjectData +{ + NodeTag type; + WindowAggState *winstate; /* parent WindowAggState */ + List *argstates; /* ExprState trees for fn's arguments */ + void *localmem; /* WinGetPartitionLocalMemory's chunk */ + int markptr; /* tuplestore mark pointer for this fn */ + int readptr; /* tuplestore read pointer for this fn */ + int64 markpos; /* row that markptr is positioned on */ + int64 seekpos; /* row that readptr is positioned on */ +} WindowObjectData; + +/* + * We have one WindowStatePerFunc struct for each window function and + * window aggregate handled by this node. + */ +typedef struct WindowStatePerFuncData +{ + /* Links to WindowFunc expr and state nodes this working state is for */ + WindowFuncExprState *wfuncstate; + WindowFunc *wfunc; + + int numArguments; /* number of arguments */ + + FmgrInfo flinfo; /* fmgr lookup data for window function */ + + Oid winCollation; /* collation derived for window function */ + + /* + * We need the len and byval info for the result of each function in order + * to know how to copy/delete values. + */ + int16 resulttypeLen; + bool resulttypeByVal; + + bool plain_agg; /* is it just a plain aggregate function? */ + int aggno; /* if so, index of its WindowStatePerAggData */ + + WindowObject winobj; /* object used in window function API */ +} WindowStatePerFuncData; + +/* + * For plain aggregate window functions, we also have one of these. + */ +typedef struct WindowStatePerAggData +{ + /* Oids of transition functions */ + Oid transfn_oid; + Oid invtransfn_oid; /* may be InvalidOid */ + Oid finalfn_oid; /* may be InvalidOid */ + + /* + * fmgr lookup data for transition functions --- only valid when + * corresponding oid is not InvalidOid. Note in particular that fn_strict + * flags are kept here. 
+ */ + FmgrInfo transfn; + FmgrInfo invtransfn; + FmgrInfo finalfn; + + int numFinalArgs; /* number of arguments to pass to finalfn */ + + /* + * initial value from pg_aggregate entry + */ + Datum initValue; + bool initValueIsNull; + + /* + * cached value for current frame boundaries + */ + Datum resultValue; + bool resultValueIsNull; + + /* + * We need the len and byval info for the agg's input, result, and + * transition data types in order to know how to copy/delete values. + */ + int16 inputtypeLen, + resulttypeLen, + transtypeLen; + bool inputtypeByVal, + resulttypeByVal, + transtypeByVal; + + int wfuncno; /* index of associated WindowStatePerFuncData */ + + /* Context holding transition value and possibly other subsidiary data */ + MemoryContext aggcontext; /* may be private, or winstate->aggcontext */ + + /* Current transition value */ + Datum transValue; /* current transition value */ + bool transValueIsNull; + + int64 transValueCount; /* number of currently-aggregated rows */ + + /* Data local to eval_windowaggregates() */ + bool restart; /* need to restart this agg in this cycle? */ +} WindowStatePerAggData; + +static void initialize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate); +static void advance_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate); +static bool advance_windowaggregate_base(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate); +static void finalize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate, + Datum *result, bool *isnull); + +static void eval_windowaggregates(WindowAggState *winstate); +static void eval_windowfunction(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + Datum *result, bool *isnull); + +static void begin_partition(WindowAggState *winstate); +static void spool_tuples(WindowAggState *winstate, int64 pos); +static void release_partition(WindowAggState *winstate); + +static int row_is_in_frame(WindowAggState *winstate, int64 pos, + TupleTableSlot *slot); +static void update_frameheadpos(WindowAggState *winstate); +static void update_frametailpos(WindowAggState *winstate); +static void update_grouptailpos(WindowAggState *winstate); + +static WindowStatePerAggData *initialize_peragg(WindowAggState *winstate, + WindowFunc *wfunc, + WindowStatePerAgg peraggstate); +static Datum GetAggInitVal(Datum textInitVal, Oid transtype); + +static bool are_peers(WindowAggState *winstate, TupleTableSlot *slot1, + TupleTableSlot *slot2); +static bool window_gettupleslot(WindowObject winobj, int64 pos, + TupleTableSlot *slot); + + +/* + * initialize_windowaggregate + * parallel to initialize_aggregates in nodeAgg.c + */ +static void +initialize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate) +{ + MemoryContext oldContext; + + /* + * If we're using a private aggcontext, we may reset it here. But if the + * context is shared, we don't know which other aggregates may still need + * it, so we must leave it to the caller to reset at an appropriate time. 
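The code below copies the initial transition value into the per-aggregate context with datumCopy() so that it survives per-row resets. In isolation, that idiom looks like the following sketch (hypothetical helper name; the by-value flag and type length would normally come from the type's catalog entry and are simply parameters here):

#include "postgres.h"
#include "utils/datum.h"

/*
 * Hypothetical helper: copy a Datum into 'target' so it outlives resets of
 * the context it was computed in.  datumCopy() needs the type's by-value
 * and length properties, just as the executor code below passes
 * transtypeByVal/transtypeLen.
 */
static Datum
copy_datum_into(MemoryContext target, Datum value, bool typByVal, int typLen)
{
	MemoryContext oldcxt = MemoryContextSwitchTo(target);
	Datum		result = datumCopy(value, typByVal, typLen);

	MemoryContextSwitchTo(oldcxt);
	return result;
}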
+ */ + if (peraggstate->aggcontext != winstate->aggcontext) + MemoryContextResetAndDeleteChildren(peraggstate->aggcontext); + + if (peraggstate->initValueIsNull) + peraggstate->transValue = peraggstate->initValue; + else + { + oldContext = MemoryContextSwitchTo(peraggstate->aggcontext); + peraggstate->transValue = datumCopy(peraggstate->initValue, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + MemoryContextSwitchTo(oldContext); + } + peraggstate->transValueIsNull = peraggstate->initValueIsNull; + peraggstate->transValueCount = 0; + peraggstate->resultValue = (Datum) 0; + peraggstate->resultValueIsNull = true; +} + +/* + * advance_windowaggregate + * parallel to advance_aggregates in nodeAgg.c + */ +static void +advance_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + WindowFuncExprState *wfuncstate = perfuncstate->wfuncstate; + int numArguments = perfuncstate->numArguments; + Datum newVal; + ListCell *arg; + int i; + MemoryContext oldContext; + ExprContext *econtext = winstate->tmpcontext; + ExprState *filter = wfuncstate->aggfilter; + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* Skip anything FILTERed out */ + if (filter) + { + bool isnull; + Datum res = ExecEvalExpr(filter, econtext, &isnull); + + if (isnull || !DatumGetBool(res)) + { + MemoryContextSwitchTo(oldContext); + return; + } + } + + /* We start from 1, since the 0th arg will be the transition value */ + i = 1; + foreach(arg, wfuncstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + fcinfo->args[i].value = ExecEvalExpr(argstate, econtext, + &fcinfo->args[i].isnull); + i++; + } + + if (peraggstate->transfn.fn_strict) + { + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. Note transValueCount doesn't + * change either. + */ + for (i = 1; i <= numArguments; i++) + { + if (fcinfo->args[i].isnull) + { + MemoryContextSwitchTo(oldContext); + return; + } + } + + /* + * For strict transition functions with initial value NULL we use the + * first non-NULL input as the initial state. (We already checked + * that the agg's input type is binary-compatible with its transtype, + * so straight copy here is OK.) + * + * We must copy the datum into aggcontext if it is pass-by-ref. We do + * not need to pfree the old transValue, since it's NULL. + */ + if (peraggstate->transValueCount == 0 && peraggstate->transValueIsNull) + { + MemoryContextSwitchTo(peraggstate->aggcontext); + peraggstate->transValue = datumCopy(fcinfo->args[1].value, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + peraggstate->transValueIsNull = false; + peraggstate->transValueCount = 1; + MemoryContextSwitchTo(oldContext); + return; + } + + if (peraggstate->transValueIsNull) + { + /* + * Don't call a strict function with NULL inputs. Note it is + * possible to get here despite the above tests, if the transfn is + * strict *and* returned a NULL on a prior cycle. If that happens + * we will propagate the NULL all the way to the end. That can + * only happen if there's no inverse transition function, though, + * since we disallow transitions back to NULL when there is one. + */ + MemoryContextSwitchTo(oldContext); + Assert(!OidIsValid(peraggstate->invtransfn_oid)); + return; + } + } + + /* + * OK to call the transition function. Set winstate->curaggcontext while + * calling it, for possible use by AggCheckCallContext. 
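Setting curaggcontext is what lets a transition function discover its memory context through AggCheckCallContext(), whether it runs under nodeAgg or under this node. A minimal hypothetical C-language transition function is sketched below; its name and its SQL-level CREATE FUNCTION/CREATE AGGREGATE declarations are assumptions, not part of this patch, and the usual extension boilerplate (PG_MODULE_MAGIC) is omitted.

#include "postgres.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(my_int4_sum_trans);

/*
 * Hypothetical transition function: int8 state, int4 input, nulls handled
 * explicitly.  AggCheckCallContext() succeeds for both plain and window
 * aggregation, because nodeWindowAgg sets curaggcontext around the call as
 * noted in the comment above.
 */
Datum
my_int4_sum_trans(PG_FUNCTION_ARGS)
{
	MemoryContext aggcontext;
	int64		state;

	if (!AggCheckCallContext(fcinfo, &aggcontext))
		elog(ERROR, "my_int4_sum_trans called in non-aggregate context");

	state = PG_ARGISNULL(0) ? 0 : PG_GETARG_INT64(0);
	if (!PG_ARGISNULL(1))
		state += PG_GETARG_INT32(1);

	PG_RETURN_INT64(state);
}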
+ */ + InitFunctionCallInfoData(*fcinfo, &(peraggstate->transfn), + numArguments + 1, + perfuncstate->winCollation, + (void *) winstate, NULL); + fcinfo->args[0].value = peraggstate->transValue; + fcinfo->args[0].isnull = peraggstate->transValueIsNull; + winstate->curaggcontext = peraggstate->aggcontext; + newVal = FunctionCallInvoke(fcinfo); + winstate->curaggcontext = NULL; + + /* + * Moving-aggregate transition functions must not return null, see + * advance_windowaggregate_base(). + */ + if (fcinfo->isnull && OidIsValid(peraggstate->invtransfn_oid)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("moving-aggregate transition function must not return null"))); + + /* + * We must track the number of rows included in transValue, since to + * remove the last input, advance_windowaggregate_base() mustn't call the + * inverse transition function, but simply reset transValue back to its + * initial value. + */ + peraggstate->transValueCount++; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. Also, if transfn returned a + * pointer to a R/W expanded object that is already a child of the + * aggcontext, assume we can adopt that value without copying it. + */ + if (!peraggstate->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(peraggstate->transValue)) + { + if (!fcinfo->isnull) + { + MemoryContextSwitchTo(peraggstate->aggcontext); + if (DatumIsReadWriteExpandedObject(newVal, + false, + peraggstate->transtypeLen) && + MemoryContextGetParent(DatumGetEOHP(newVal)->eoh_context) == CurrentMemoryContext) + /* do nothing */ ; + else + newVal = datumCopy(newVal, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + } + if (!peraggstate->transValueIsNull) + { + if (DatumIsReadWriteExpandedObject(peraggstate->transValue, + false, + peraggstate->transtypeLen)) + DeleteExpandedObject(peraggstate->transValue); + else + pfree(DatumGetPointer(peraggstate->transValue)); + } + } + + MemoryContextSwitchTo(oldContext); + peraggstate->transValue = newVal; + peraggstate->transValueIsNull = fcinfo->isnull; +} + +/* + * advance_windowaggregate_base + * Remove the oldest tuple from an aggregation. + * + * This is very much like advance_windowaggregate, except that we will call + * the inverse transition function (which caller must have checked is + * available). + * + * Returns true if we successfully removed the current row from this + * aggregate, false if not (in the latter case, caller is responsible + * for cleaning up by restarting the aggregation). 
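The add/remove protocol implemented by this function can be pictured with a toy moving sum, deliberately kept outside the executor (plain C, invented names). Note the same special case: removing the last remaining row re-initializes the state rather than inverting.

#include <stdio.h>

/* Toy model of a moving aggregate: forward "add" step, invertible "remove" step. */
typedef struct MovingSum
{
	long	sum;		/* transition value */
	int		count;		/* number of currently-aggregated rows */
} MovingSum;

static void
moving_sum_init(MovingSum *st)
{
	st->sum = 0;
	st->count = 0;
}

static void
moving_sum_add(MovingSum *st, int x)	/* forward transition */
{
	st->sum += x;
	st->count++;
}

static void
moving_sum_remove(MovingSum *st, int x) /* inverse transition */
{
	if (st->count == 1)
	{
		moving_sum_init(st);	/* last row: re-initialize, don't invert */
		return;
	}
	st->sum -= x;
	st->count--;
}

int
main(void)
{
	int			data[] = {3, 1, 4, 1, 5};
	MovingSum	st;
	int			head = 0;

	moving_sum_init(&st);
	for (int tail = 0; tail < 5; tail++)
	{
		moving_sum_add(&st, data[tail]);
		while (tail - head + 1 > 3)		/* frame: ROWS 2 PRECEDING */
			moving_sum_remove(&st, data[head++]);
		printf("frame [%d..%d] sum=%ld\n", head, tail, st.sum);
	}
	return 0;
}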
+ */ +static bool +advance_windowaggregate_base(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + WindowFuncExprState *wfuncstate = perfuncstate->wfuncstate; + int numArguments = perfuncstate->numArguments; + Datum newVal; + ListCell *arg; + int i; + MemoryContext oldContext; + ExprContext *econtext = winstate->tmpcontext; + ExprState *filter = wfuncstate->aggfilter; + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + /* Skip anything FILTERed out */ + if (filter) + { + bool isnull; + Datum res = ExecEvalExpr(filter, econtext, &isnull); + + if (isnull || !DatumGetBool(res)) + { + MemoryContextSwitchTo(oldContext); + return true; + } + } + + /* We start from 1, since the 0th arg will be the transition value */ + i = 1; + foreach(arg, wfuncstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + fcinfo->args[i].value = ExecEvalExpr(argstate, econtext, + &fcinfo->args[i].isnull); + i++; + } + + if (peraggstate->invtransfn.fn_strict) + { + /* + * For a strict (inv)transfn, nothing happens when there's a NULL + * input; we just keep the prior transValue. Note transValueCount + * doesn't change either. + */ + for (i = 1; i <= numArguments; i++) + { + if (fcinfo->args[i].isnull) + { + MemoryContextSwitchTo(oldContext); + return true; + } + } + } + + /* There should still be an added but not yet removed value */ + Assert(peraggstate->transValueCount > 0); + + /* + * In moving-aggregate mode, the state must never be NULL, except possibly + * before any rows have been aggregated (which is surely not the case at + * this point). This restriction allows us to interpret a NULL result + * from the inverse function as meaning "sorry, can't do an inverse + * transition in this case". We already checked this in + * advance_windowaggregate, but just for safety, check again. + */ + if (peraggstate->transValueIsNull) + elog(ERROR, "aggregate transition value is NULL before inverse transition"); + + /* + * We mustn't use the inverse transition function to remove the last + * input. Doing so would yield a non-NULL state, whereas we should be in + * the initial state afterwards which may very well be NULL. So instead, + * we simply re-initialize the aggregate in this case. + */ + if (peraggstate->transValueCount == 1) + { + MemoryContextSwitchTo(oldContext); + initialize_windowaggregate(winstate, + &winstate->perfunc[peraggstate->wfuncno], + peraggstate); + return true; + } + + /* + * OK to call the inverse transition function. Set + * winstate->curaggcontext while calling it, for possible use by + * AggCheckCallContext. + */ + InitFunctionCallInfoData(*fcinfo, &(peraggstate->invtransfn), + numArguments + 1, + perfuncstate->winCollation, + (void *) winstate, NULL); + fcinfo->args[0].value = peraggstate->transValue; + fcinfo->args[0].isnull = peraggstate->transValueIsNull; + winstate->curaggcontext = peraggstate->aggcontext; + newVal = FunctionCallInvoke(fcinfo); + winstate->curaggcontext = NULL; + + /* + * If the function returns NULL, report failure, forcing a restart. + */ + if (fcinfo->isnull) + { + MemoryContextSwitchTo(oldContext); + return false; + } + + /* Update number of rows included in transValue */ + peraggstate->transValueCount--; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * free the prior transValue. But if invtransfn returned a pointer to its + * first input, we don't need to do anything. 
Also, if invtransfn + * returned a pointer to a R/W expanded object that is already a child of + * the aggcontext, assume we can adopt that value without copying it. + * + * Note: the checks for null values here will never fire, but it seems + * best to have this stanza look just like advance_windowaggregate. + */ + if (!peraggstate->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(peraggstate->transValue)) + { + if (!fcinfo->isnull) + { + MemoryContextSwitchTo(peraggstate->aggcontext); + if (DatumIsReadWriteExpandedObject(newVal, + false, + peraggstate->transtypeLen) && + MemoryContextGetParent(DatumGetEOHP(newVal)->eoh_context) == CurrentMemoryContext) + /* do nothing */ ; + else + newVal = datumCopy(newVal, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + } + if (!peraggstate->transValueIsNull) + { + if (DatumIsReadWriteExpandedObject(peraggstate->transValue, + false, + peraggstate->transtypeLen)) + DeleteExpandedObject(peraggstate->transValue); + else + pfree(DatumGetPointer(peraggstate->transValue)); + } + } + + MemoryContextSwitchTo(oldContext); + peraggstate->transValue = newVal; + peraggstate->transValueIsNull = fcinfo->isnull; + + return true; +} + +/* + * finalize_windowaggregate + * parallel to finalize_aggregate in nodeAgg.c + */ +static void +finalize_windowaggregate(WindowAggState *winstate, + WindowStatePerFunc perfuncstate, + WindowStatePerAgg peraggstate, + Datum *result, bool *isnull) +{ + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * Apply the agg's finalfn if one is provided, else return transValue. + */ + if (OidIsValid(peraggstate->finalfn_oid)) + { + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + int numFinalArgs = peraggstate->numFinalArgs; + bool anynull; + int i; + + InitFunctionCallInfoData(fcinfodata.fcinfo, &(peraggstate->finalfn), + numFinalArgs, + perfuncstate->winCollation, + (void *) winstate, NULL); + fcinfo->args[0].value = + MakeExpandedObjectReadOnly(peraggstate->transValue, + peraggstate->transValueIsNull, + peraggstate->transtypeLen); + fcinfo->args[0].isnull = peraggstate->transValueIsNull; + anynull = peraggstate->transValueIsNull; + + /* Fill any remaining argument positions with nulls */ + for (i = 1; i < numFinalArgs; i++) + { + fcinfo->args[i].value = (Datum) 0; + fcinfo->args[i].isnull = true; + anynull = true; + } + + if (fcinfo->flinfo->fn_strict && anynull) + { + /* don't call a strict function with NULL inputs */ + *result = (Datum) 0; + *isnull = true; + } + else + { + winstate->curaggcontext = peraggstate->aggcontext; + *result = FunctionCallInvoke(fcinfo); + winstate->curaggcontext = NULL; + *isnull = fcinfo->isnull; + } + } + else + { + /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */ + *result = peraggstate->transValue; + *isnull = peraggstate->transValueIsNull; + } + + /* + * If result is pass-by-ref, make sure it is in the right context. + */ + if (!peraggstate->resulttypeByVal && !*isnull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*result))) + *result = datumCopy(*result, + peraggstate->resulttypeByVal, + peraggstate->resulttypeLen); + MemoryContextSwitchTo(oldContext); +} + +/* + * eval_windowaggregates + * evaluate plain aggregates being used as window functions + * + * This differs from nodeAgg.c in two ways. First, if the window's frame + * start position moves, we use the inverse transition function (if it exists) + * to remove rows from the transition value. 
And second, we expect to be + * able to call aggregate final functions repeatedly after aggregating more + * data onto the same transition value. This is not a behavior required by + * nodeAgg.c. + */ +static void +eval_windowaggregates(WindowAggState *winstate) +{ + WindowStatePerAgg peraggstate; + int wfuncno, + numaggs, + numaggs_restart, + i; + int64 aggregatedupto_nonrestarted; + MemoryContext oldContext; + ExprContext *econtext; + WindowObject agg_winobj; + TupleTableSlot *agg_row_slot; + TupleTableSlot *temp_slot; + + numaggs = winstate->numaggs; + if (numaggs == 0) + return; /* nothing to do */ + + /* final output execution is in ps_ExprContext */ + econtext = winstate->ss.ps.ps_ExprContext; + agg_winobj = winstate->agg_winobj; + agg_row_slot = winstate->agg_row_slot; + temp_slot = winstate->temp_slot_1; + + /* + * If the window's frame start clause is UNBOUNDED_PRECEDING and no + * exclusion clause is specified, then the window frame consists of a + * contiguous group of rows extending forward from the start of the + * partition, and rows only enter the frame, never exit it, as the current + * row advances forward. This makes it possible to use an incremental + * strategy for evaluating aggregates: we run the transition function for + * each row added to the frame, and run the final function whenever we + * need the current aggregate value. This is considerably more efficient + * than the naive approach of re-running the entire aggregate calculation + * for each current row. It does assume that the final function doesn't + * damage the running transition value, but we have the same assumption in + * nodeAgg.c too (when it rescans an existing hash table). + * + * If the frame start does sometimes move, we can still optimize as above + * whenever successive rows share the same frame head, but if the frame + * head moves beyond the previous head we try to remove those rows using + * the aggregate's inverse transition function. This function restores + * the aggregate's current state to what it would be if the removed row + * had never been aggregated in the first place. Inverse transition + * functions may optionally return NULL, indicating that the function was + * unable to remove the tuple from aggregation. If this happens, or if + * the aggregate doesn't have an inverse transition function at all, we + * must perform the aggregation all over again for all tuples within the + * new frame boundaries. + * + * If there's any exclusion clause, then we may have to aggregate over a + * non-contiguous set of rows, so we punt and recalculate for every row. + * (For some frame end choices, it might be that the frame is always + * contiguous anyway, but that's an optimization to investigate later.) + * + * In many common cases, multiple rows share the same frame and hence the + * same aggregate value. (In particular, if there's no ORDER BY in a RANGE + * window, then all rows are peers and so they all have window frame equal + * to the whole partition.) We optimize such cases by calculating the + * aggregate value once when we reach the first row of a peer group, and + * then returning the saved value for all subsequent rows. + * + * 'aggregatedupto' keeps track of the first row that has not yet been + * accumulated into the aggregate transition values. Whenever we start a + * new peer group, we accumulate forward to the end of the peer group. + */ + + /* + * First, update the frame head position. 
+ * + * The frame head should never move backwards, and the code below wouldn't + * cope if it did, so for safety we complain if it does. + */ + update_frameheadpos(winstate); + if (winstate->frameheadpos < winstate->aggregatedbase) + elog(ERROR, "window frame head moved backward"); + + /* + * If the frame didn't change compared to the previous row, we can re-use + * the result values that were previously saved at the bottom of this + * function. Since we don't know the current frame's end yet, this is not + * possible to check for fully. But if the frame end mode is UNBOUNDED + * FOLLOWING or CURRENT ROW, no exclusion clause is specified, and the + * current row lies within the previous row's frame, then the two frames' + * ends must coincide. Note that on the first row aggregatedbase == + * aggregatedupto, meaning this test must fail, so we don't need to check + * the "there was no previous row" case explicitly here. + */ + if (winstate->aggregatedbase == winstate->frameheadpos && + (winstate->frameOptions & (FRAMEOPTION_END_UNBOUNDED_FOLLOWING | + FRAMEOPTION_END_CURRENT_ROW)) && + !(winstate->frameOptions & FRAMEOPTION_EXCLUSION) && + winstate->aggregatedbase <= winstate->currentpos && + winstate->aggregatedupto > winstate->currentpos) + { + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + wfuncno = peraggstate->wfuncno; + econtext->ecxt_aggvalues[wfuncno] = peraggstate->resultValue; + econtext->ecxt_aggnulls[wfuncno] = peraggstate->resultValueIsNull; + } + return; + } + + /*---------- + * Initialize restart flags. + * + * We restart the aggregation: + * - if we're processing the first row in the partition, or + * - if the frame's head moved and we cannot use an inverse + * transition function, or + * - we have an EXCLUSION clause, or + * - if the new frame doesn't overlap the old one + * + * Note that we don't strictly need to restart in the last case, but if + * we're going to remove all rows from the aggregation anyway, a restart + * surely is faster. + *---------- + */ + numaggs_restart = 0; + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + if (winstate->currentpos == 0 || + (winstate->aggregatedbase != winstate->frameheadpos && + !OidIsValid(peraggstate->invtransfn_oid)) || + (winstate->frameOptions & FRAMEOPTION_EXCLUSION) || + winstate->aggregatedupto <= winstate->frameheadpos) + { + peraggstate->restart = true; + numaggs_restart++; + } + else + peraggstate->restart = false; + } + + /* + * If we have any possibly-moving aggregates, attempt to advance + * aggregatedbase to match the frame's head by removing input rows that + * fell off the top of the frame from the aggregations. This can fail, + * i.e. advance_windowaggregate_base() can return false, in which case + * we'll restart that aggregate below. + */ + while (numaggs_restart < numaggs && + winstate->aggregatedbase < winstate->frameheadpos) + { + /* + * Fetch the next tuple of those being removed. This should never fail + * as we should have been here before. + */ + if (!window_gettupleslot(agg_winobj, winstate->aggregatedbase, + temp_slot)) + elog(ERROR, "could not re-fetch previously fetched frame row"); + + /* Set tuple context for evaluation of aggregate arguments */ + winstate->tmpcontext->ecxt_outertuple = temp_slot; + + /* + * Perform the inverse transition for each aggregate function in the + * window, unless it has already been marked as needing a restart. 
+ */ + for (i = 0; i < numaggs; i++) + { + bool ok; + + peraggstate = &winstate->peragg[i]; + if (peraggstate->restart) + continue; + + wfuncno = peraggstate->wfuncno; + ok = advance_windowaggregate_base(winstate, + &winstate->perfunc[wfuncno], + peraggstate); + if (!ok) + { + /* Inverse transition function has failed, must restart */ + peraggstate->restart = true; + numaggs_restart++; + } + } + + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(winstate->tmpcontext); + + /* And advance the aggregated-row state */ + winstate->aggregatedbase++; + ExecClearTuple(temp_slot); + } + + /* + * If we successfully advanced the base rows of all the aggregates, + * aggregatedbase now equals frameheadpos; but if we failed for any, we + * must forcibly update aggregatedbase. + */ + winstate->aggregatedbase = winstate->frameheadpos; + + /* + * If we created a mark pointer for aggregates, keep it pushed up to frame + * head, so that tuplestore can discard unnecessary rows. + */ + if (agg_winobj->markptr >= 0) + WinSetMarkPosition(agg_winobj, winstate->frameheadpos); + + /* + * Now restart the aggregates that require it. + * + * We assume that aggregates using the shared context always restart if + * *any* aggregate restarts, and we may thus clean up the shared + * aggcontext if that is the case. Private aggcontexts are reset by + * initialize_windowaggregate() if their owning aggregate restarts. If we + * aren't restarting an aggregate, we need to free any previously saved + * result for it, else we'll leak memory. + */ + if (numaggs_restart > 0) + MemoryContextResetAndDeleteChildren(winstate->aggcontext); + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + + /* Aggregates using the shared ctx must restart if *any* agg does */ + Assert(peraggstate->aggcontext != winstate->aggcontext || + numaggs_restart == 0 || + peraggstate->restart); + + if (peraggstate->restart) + { + wfuncno = peraggstate->wfuncno; + initialize_windowaggregate(winstate, + &winstate->perfunc[wfuncno], + peraggstate); + } + else if (!peraggstate->resultValueIsNull) + { + if (!peraggstate->resulttypeByVal) + pfree(DatumGetPointer(peraggstate->resultValue)); + peraggstate->resultValue = (Datum) 0; + peraggstate->resultValueIsNull = true; + } + } + + /* + * Non-restarted aggregates now contain the rows between aggregatedbase + * (i.e., frameheadpos) and aggregatedupto, while restarted aggregates + * contain no rows. If there are any restarted aggregates, we must thus + * begin aggregating anew at frameheadpos, otherwise we may simply + * continue at aggregatedupto. We must remember the old value of + * aggregatedupto to know how long to skip advancing non-restarted + * aggregates. If we modify aggregatedupto, we must also clear + * agg_row_slot, per the loop invariant below. + */ + aggregatedupto_nonrestarted = winstate->aggregatedupto; + if (numaggs_restart > 0 && + winstate->aggregatedupto != winstate->frameheadpos) + { + winstate->aggregatedupto = winstate->frameheadpos; + ExecClearTuple(agg_row_slot); + } + + /* + * Advance until we reach a row not in frame (or end of partition). + * + * Note the loop invariant: agg_row_slot is either empty or holds the row + * at position aggregatedupto. We advance aggregatedupto after processing + * a row. 
+ */ + for (;;) + { + int ret; + + /* Fetch next row if we didn't already */ + if (TupIsNull(agg_row_slot)) + { + if (!window_gettupleslot(agg_winobj, winstate->aggregatedupto, + agg_row_slot)) + break; /* must be end of partition */ + } + + /* + * Exit loop if no more rows can be in frame. Skip aggregation if + * current row is not in frame but there might be more in the frame. + */ + ret = row_is_in_frame(winstate, winstate->aggregatedupto, agg_row_slot); + if (ret < 0) + break; + if (ret == 0) + goto next_tuple; + + /* Set tuple context for evaluation of aggregate arguments */ + winstate->tmpcontext->ecxt_outertuple = agg_row_slot; + + /* Accumulate row into the aggregates */ + for (i = 0; i < numaggs; i++) + { + peraggstate = &winstate->peragg[i]; + + /* Non-restarted aggs skip until aggregatedupto_nonrestarted */ + if (!peraggstate->restart && + winstate->aggregatedupto < aggregatedupto_nonrestarted) + continue; + + wfuncno = peraggstate->wfuncno; + advance_windowaggregate(winstate, + &winstate->perfunc[wfuncno], + peraggstate); + } + +next_tuple: + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(winstate->tmpcontext); + + /* And advance the aggregated-row state */ + winstate->aggregatedupto++; + ExecClearTuple(agg_row_slot); + } + + /* The frame's end is not supposed to move backwards, ever */ + Assert(aggregatedupto_nonrestarted <= winstate->aggregatedupto); + + /* + * finalize aggregates and fill result/isnull fields. + */ + for (i = 0; i < numaggs; i++) + { + Datum *result; + bool *isnull; + + peraggstate = &winstate->peragg[i]; + wfuncno = peraggstate->wfuncno; + result = &econtext->ecxt_aggvalues[wfuncno]; + isnull = &econtext->ecxt_aggnulls[wfuncno]; + finalize_windowaggregate(winstate, + &winstate->perfunc[wfuncno], + peraggstate, + result, isnull); + + /* + * save the result in case next row shares the same frame. + * + * XXX in some framing modes, eg ROWS/END_CURRENT_ROW, we can know in + * advance that the next row can't possibly share the same frame. Is + * it worth detecting that and skipping this code? + */ + if (!peraggstate->resulttypeByVal && !*isnull) + { + oldContext = MemoryContextSwitchTo(peraggstate->aggcontext); + peraggstate->resultValue = + datumCopy(*result, + peraggstate->resulttypeByVal, + peraggstate->resulttypeLen); + MemoryContextSwitchTo(oldContext); + } + else + { + peraggstate->resultValue = *result; + } + peraggstate->resultValueIsNull = *isnull; + } +} + +/* + * eval_windowfunction + * + * Arguments of window functions are not evaluated here, because a window + * function can need random access to arbitrary rows in the partition. + * The window function uses the special WinGetFuncArgInPartition and + * WinGetFuncArgInFrame functions to evaluate the arguments for the rows + * it wants. + */ +static void +eval_windowfunction(WindowAggState *winstate, WindowStatePerFunc perfuncstate, + Datum *result, bool *isnull) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * We don't pass any normal arguments to a window function, but we do pass + * it the number of arguments, in order to permit window function + * implementations to support varying numbers of arguments. The real info + * goes through the WindowObject, which is passed via fcinfo->context. 
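Seen from the other side of this calling convention, a user-defined window function pulls everything it needs through the WindowObject API rather than through its nominal arguments. A minimal hypothetical example mirroring the built-in row_number() is sketched below; the function name and its SQL-level declaration as a WINDOW function are assumptions.

#include "postgres.h"
#include "fmgr.h"
#include "windowapi.h"

PG_FUNCTION_INFO_V1(my_row_number);

/*
 * Hypothetical window function: returns the 1-based position of the current
 * row within its partition.  All information comes from the WindowObject
 * passed via fcinfo->context, as described above; the regular argument
 * slots are not consulted.
 */
Datum
my_row_number(PG_FUNCTION_ARGS)
{
	WindowObject winobj = PG_WINDOW_OBJECT();
	int64		curpos = WinGetCurrentPosition(winobj);

	/* Rows before the current one will never be needed again */
	WinSetMarkPosition(winobj, curpos);

	PG_RETURN_INT64(curpos + 1);
}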
+ */ + InitFunctionCallInfoData(*fcinfo, &(perfuncstate->flinfo), + perfuncstate->numArguments, + perfuncstate->winCollation, + (void *) perfuncstate->winobj, NULL); + /* Just in case, make all the regular argument slots be null */ + for (int argno = 0; argno < perfuncstate->numArguments; argno++) + fcinfo->args[argno].isnull = true; + /* Window functions don't have a current aggregate context, either */ + winstate->curaggcontext = NULL; + + *result = FunctionCallInvoke(fcinfo); + *isnull = fcinfo->isnull; + + /* + * Make sure pass-by-ref data is allocated in the appropriate context. (We + * need this in case the function returns a pointer into some short-lived + * tuple, as is entirely possible.) + */ + if (!perfuncstate->resulttypeByVal && !fcinfo->isnull && + !MemoryContextContains(CurrentMemoryContext, + DatumGetPointer(*result))) + *result = datumCopy(*result, + perfuncstate->resulttypeByVal, + perfuncstate->resulttypeLen); + + MemoryContextSwitchTo(oldContext); +} + +/* + * begin_partition + * Start buffering rows of the next partition. + */ +static void +begin_partition(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + PlanState *outerPlan = outerPlanState(winstate); + int frameOptions = winstate->frameOptions; + int numfuncs = winstate->numfuncs; + int i; + + winstate->partition_spooled = false; + winstate->framehead_valid = false; + winstate->frametail_valid = false; + winstate->grouptail_valid = false; + winstate->spooled_rows = 0; + winstate->currentpos = 0; + winstate->frameheadpos = 0; + winstate->frametailpos = 0; + winstate->currentgroup = 0; + winstate->frameheadgroup = 0; + winstate->frametailgroup = 0; + winstate->groupheadpos = 0; + winstate->grouptailpos = -1; /* see update_grouptailpos */ + ExecClearTuple(winstate->agg_row_slot); + if (winstate->framehead_slot) + ExecClearTuple(winstate->framehead_slot); + if (winstate->frametail_slot) + ExecClearTuple(winstate->frametail_slot); + + /* + * If this is the very first partition, we need to fetch the first input + * row to store in first_part_slot. + */ + if (TupIsNull(winstate->first_part_slot)) + { + TupleTableSlot *outerslot = ExecProcNode(outerPlan); + + if (!TupIsNull(outerslot)) + ExecCopySlot(winstate->first_part_slot, outerslot); + else + { + /* outer plan is empty, so we have nothing to do */ + winstate->partition_spooled = true; + winstate->more_partitions = false; + return; + } + } + + /* Create new tuplestore for this partition */ + winstate->buffer = tuplestore_begin_heap(false, false, work_mem); + + /* + * Set up read pointers for the tuplestore. The current pointer doesn't + * need BACKWARD capability, but the per-window-function read pointers do, + * and the aggregate pointer does if we might need to restart aggregation. + */ + winstate->current_ptr = 0; /* read pointer 0 is pre-allocated */ + + /* reset default REWIND capability bit for current ptr */ + tuplestore_set_eflags(winstate->buffer, 0); + + /* create read pointers for aggregates, if needed */ + if (winstate->numaggs > 0) + { + WindowObject agg_winobj = winstate->agg_winobj; + int readptr_flags = 0; + + /* + * If the frame head is potentially movable, or we have an EXCLUSION + * clause, we might need to restart aggregation ... + */ + if (!(frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) || + (frameOptions & FRAMEOPTION_EXCLUSION)) + { + /* ... 
so create a mark pointer to track the frame head */ + agg_winobj->markptr = tuplestore_alloc_read_pointer(winstate->buffer, 0); + /* and the read pointer will need BACKWARD capability */ + readptr_flags |= EXEC_FLAG_BACKWARD; + } + + agg_winobj->readptr = tuplestore_alloc_read_pointer(winstate->buffer, + readptr_flags); + agg_winobj->markpos = -1; + agg_winobj->seekpos = -1; + + /* Also reset the row counters for aggregates */ + winstate->aggregatedbase = 0; + winstate->aggregatedupto = 0; + } + + /* create mark and read pointers for each real window function */ + for (i = 0; i < numfuncs; i++) + { + WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]); + + if (!perfuncstate->plain_agg) + { + WindowObject winobj = perfuncstate->winobj; + + winobj->markptr = tuplestore_alloc_read_pointer(winstate->buffer, + 0); + winobj->readptr = tuplestore_alloc_read_pointer(winstate->buffer, + EXEC_FLAG_BACKWARD); + winobj->markpos = -1; + winobj->seekpos = -1; + } + } + + /* + * If we are in RANGE or GROUPS mode, then determining frame boundaries + * requires physical access to the frame endpoint rows, except in certain + * degenerate cases. We create read pointers to point to those rows, to + * simplify access and ensure that the tuplestore doesn't discard the + * endpoint rows prematurely. (Must create pointers in exactly the same + * cases that update_frameheadpos and update_frametailpos need them.) + */ + winstate->framehead_ptr = winstate->frametail_ptr = -1; /* if not used */ + + if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + if (((frameOptions & FRAMEOPTION_START_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_START_OFFSET)) + winstate->framehead_ptr = + tuplestore_alloc_read_pointer(winstate->buffer, 0); + if (((frameOptions & FRAMEOPTION_END_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_END_OFFSET)) + winstate->frametail_ptr = + tuplestore_alloc_read_pointer(winstate->buffer, 0); + } + + /* + * If we have an exclusion clause that requires knowing the boundaries of + * the current row's peer group, we create a read pointer to track the + * tail position of the peer group (i.e., first row of the next peer + * group). The head position does not require its own pointer because we + * maintain that as a side effect of advancing the current row. + */ + winstate->grouptail_ptr = -1; + + if ((frameOptions & (FRAMEOPTION_EXCLUDE_GROUP | + FRAMEOPTION_EXCLUDE_TIES)) && + node->ordNumCols != 0) + { + winstate->grouptail_ptr = + tuplestore_alloc_read_pointer(winstate->buffer, 0); + } + + /* + * Store the first tuple into the tuplestore (it's always available now; + * we either read it above, or saved it at the end of previous partition) + */ + tuplestore_puttupleslot(winstate->buffer, winstate->first_part_slot); + winstate->spooled_rows++; +} + +/* + * Read tuples from the outer node, up to and including position 'pos', and + * store them into the tuplestore. If pos is -1, reads the whole partition. + */ +static void +spool_tuples(WindowAggState *winstate, int64 pos) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + PlanState *outerPlan; + TupleTableSlot *outerslot; + MemoryContext oldcontext; + + if (!winstate->buffer) + return; /* just a safety check */ + if (winstate->partition_spooled) + return; /* whole partition done already */ + + /* + * If the tuplestore has spilled to disk, alternate reading and writing + * becomes quite expensive due to frequent buffer flushes. 
It's cheaper + * to force the entire partition to get spooled in one go. + * + * XXX this is a horrid kluge --- it'd be better to fix the performance + * problem inside tuplestore. FIXME + */ + if (!tuplestore_in_memory(winstate->buffer)) + pos = -1; + + outerPlan = outerPlanState(winstate); + + /* Must be in query context to call outerplan */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + while (winstate->spooled_rows <= pos || pos == -1) + { + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + { + /* reached the end of the last partition */ + winstate->partition_spooled = true; + winstate->more_partitions = false; + break; + } + + if (node->partNumCols > 0) + { + ExprContext *econtext = winstate->tmpcontext; + + econtext->ecxt_innertuple = winstate->first_part_slot; + econtext->ecxt_outertuple = outerslot; + + /* Check if this tuple still belongs to the current partition */ + if (!ExecQualAndReset(winstate->partEqfunction, econtext)) + { + /* + * end of partition; copy the tuple for the next cycle. + */ + ExecCopySlot(winstate->first_part_slot, outerslot); + winstate->partition_spooled = true; + winstate->more_partitions = true; + break; + } + } + + /* Still in partition, so save it into the tuplestore */ + tuplestore_puttupleslot(winstate->buffer, outerslot); + winstate->spooled_rows++; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * release_partition + * clear information kept within a partition, including + * tuplestore and aggregate results. + */ +static void +release_partition(WindowAggState *winstate) +{ + int i; + + for (i = 0; i < winstate->numfuncs; i++) + { + WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]); + + /* Release any partition-local state of this window function */ + if (perfuncstate->winobj) + perfuncstate->winobj->localmem = NULL; + } + + /* + * Release all partition-local memory (in particular, any partition-local + * state that we might have trashed our pointers to in the above loop, and + * any aggregate temp data). We don't rely on retail pfree because some + * aggregates might have allocated data we don't have direct pointers to. + */ + MemoryContextResetAndDeleteChildren(winstate->partcontext); + MemoryContextResetAndDeleteChildren(winstate->aggcontext); + for (i = 0; i < winstate->numaggs; i++) + { + if (winstate->peragg[i].aggcontext != winstate->aggcontext) + MemoryContextResetAndDeleteChildren(winstate->peragg[i].aggcontext); + } + + if (winstate->buffer) + tuplestore_end(winstate->buffer); + winstate->buffer = NULL; + winstate->partition_spooled = false; +} + +/* + * row_is_in_frame + * Determine whether a row is in the current row's window frame according + * to our window framing rule + * + * The caller must have already determined that the row is in the partition + * and fetched it into a slot. This function just encapsulates the framing + * rules. + * + * Returns: + * -1, if the row is out of frame and no succeeding rows can be in frame + * 0, if the row is out of frame but succeeding rows might be in frame + * 1, if the row is in frame + * + * May clobber winstate->temp_slot_2. + */ +static int +row_is_in_frame(WindowAggState *winstate, int64 pos, TupleTableSlot *slot) +{ + int frameOptions = winstate->frameOptions; + + Assert(pos >= 0); /* else caller error */ + + /* + * First, check frame starting conditions. We might as well delegate this + * to update_frameheadpos always; it doesn't add any notable cost. 
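Callers consume row_is_in_frame()'s three-way result exactly as its header above describes: stop on -1, skip but keep scanning on 0, accumulate on 1. A toy standalone illustration of that contract (plain C, invented names):

#include <stdio.h>

/*
 * Toy classifier: positions against a frame [frame_head, frame_tail] with
 * one excluded position, where -1 also promises that no later position can
 * be in frame.
 */
static int
toy_row_is_in_frame(int pos, int frame_head, int frame_tail, int excluded)
{
	if (pos > frame_tail)
		return -1;				/* out of frame, and so is everything later */
	if (pos < frame_head || pos == excluded)
		return 0;				/* out of frame, but keep scanning */
	return 1;					/* in frame */
}

int
main(void)
{
	long	sum = 0;

	for (int pos = 0;; pos++)
	{
		int		ret = toy_row_is_in_frame(pos, 2, 6, 4);

		if (ret < 0)
			break;				/* nothing further can be in frame */
		if (ret == 0)
			continue;			/* skip, but later rows may still qualify */
		sum += pos;				/* "accumulate" the in-frame row */
	}
	printf("sum of in-frame positions = %ld\n", sum);	/* 2+3+5+6 = 16 */
	return 0;
}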
+ */ + update_frameheadpos(winstate); + if (pos < winstate->frameheadpos) + return 0; + + /* + * Okay so far, now check frame ending conditions. Here, we avoid calling + * update_frametailpos in simple cases, so as not to spool tuples further + * ahead than necessary. + */ + if (frameOptions & FRAMEOPTION_END_CURRENT_ROW) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* rows after current row are out of frame */ + if (pos > winstate->currentpos) + return -1; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* following row that is not peer is out of frame */ + if (pos > winstate->currentpos && + !are_peers(winstate, slot, winstate->ss.ss_ScanTupleSlot)) + return -1; + } + else + Assert(false); + } + else if (frameOptions & FRAMEOPTION_END_OFFSET) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + int64 offset = DatumGetInt64(winstate->endOffsetValue); + + /* rows after current row + offset are out of frame */ + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + offset = -offset; + + if (pos > winstate->currentpos + offset) + return -1; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* hard cases, so delegate to update_frametailpos */ + update_frametailpos(winstate); + if (pos >= winstate->frametailpos) + return -1; + } + else + Assert(false); + } + + /* Check exclusion clause */ + if (frameOptions & FRAMEOPTION_EXCLUDE_CURRENT_ROW) + { + if (pos == winstate->currentpos) + return 0; + } + else if ((frameOptions & FRAMEOPTION_EXCLUDE_GROUP) || + ((frameOptions & FRAMEOPTION_EXCLUDE_TIES) && + pos != winstate->currentpos)) + { + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + return 0; + /* Otherwise, check the group boundaries */ + if (pos >= winstate->groupheadpos) + { + update_grouptailpos(winstate); + if (pos < winstate->grouptailpos) + return 0; + } + } + + /* If we get here, it's in frame */ + return 1; +} + +/* + * update_frameheadpos + * make frameheadpos valid for the current row + * + * Note that frameheadpos is computed without regard for any window exclusion + * clause; the current row and/or its peers are considered part of the frame + * for this purpose even if they must be excluded later. + * + * May clobber winstate->temp_slot_2. 
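+ *
+ * framehead_valid is cleared each time the current row advances, so the
+ * recomputation below happens at most once per output row.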
+ */ +static void +update_frameheadpos(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + int frameOptions = winstate->frameOptions; + MemoryContext oldcontext; + + if (winstate->framehead_valid) + return; /* already known for current row */ + + /* We may be called in a short-lived context */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + if (frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) + { + /* In UNBOUNDED PRECEDING mode, frame head is always row 0 */ + winstate->frameheadpos = 0; + winstate->framehead_valid = true; + } + else if (frameOptions & FRAMEOPTION_START_CURRENT_ROW) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, frame head is the same as current */ + winstate->frameheadpos = winstate->currentpos; + winstate->framehead_valid = true; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + { + winstate->frameheadpos = 0; + winstate->framehead_valid = true; + MemoryContextSwitchTo(oldcontext); + return; + } + + /* + * In RANGE or GROUPS START_CURRENT_ROW mode, frame head is the + * first row that is a peer of current row. We keep a copy of the + * last-known frame head row in framehead_slot, and advance as + * necessary. Note that if we reach end of partition, we will + * leave frameheadpos = end+1 and framehead_slot empty. + */ + tuplestore_select_read_pointer(winstate->buffer, + winstate->framehead_ptr); + if (winstate->frameheadpos == 0 && + TupIsNull(winstate->framehead_slot)) + { + /* fetch first row into framehead_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->framehead_slot)) + { + if (are_peers(winstate, winstate->framehead_slot, + winstate->ss.ss_ScanTupleSlot)) + break; /* this row is the correct frame head */ + /* Note we advance frameheadpos even if the fetch fails */ + winstate->frameheadpos++; + spool_tuples(winstate, winstate->frameheadpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + break; /* end of partition */ + } + winstate->framehead_valid = true; + } + else + Assert(false); + } + else if (frameOptions & FRAMEOPTION_START_OFFSET) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, bound is physically n before/after current */ + int64 offset = DatumGetInt64(winstate->startOffsetValue); + + if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING) + offset = -offset; + + winstate->frameheadpos = winstate->currentpos + offset; + /* frame head can't go before first row */ + if (winstate->frameheadpos < 0) + winstate->frameheadpos = 0; + else if (winstate->frameheadpos > winstate->currentpos + 1) + { + /* make sure frameheadpos is not past end of partition */ + spool_tuples(winstate, winstate->frameheadpos - 1); + if (winstate->frameheadpos > winstate->spooled_rows) + winstate->frameheadpos = winstate->spooled_rows; + } + winstate->framehead_valid = true; + } + else if (frameOptions & FRAMEOPTION_RANGE) + { + /* + * In RANGE START_OFFSET mode, frame head is the first row that + * satisfies the in_range constraint relative to the current row. + * We keep a copy of the last-known frame head row in + * framehead_slot, and advance as necessary. 
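+ * (For instance, with an ascending integer ORDER BY column and RANGE
+ * BETWEEN 5 PRECEDING AND CURRENT ROW, the head is the first row whose
+ * sort value is >= the current row's value minus 5.)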
Note that if we + * reach end of partition, we will leave frameheadpos = end+1 and + * framehead_slot empty. + */ + int sortCol = node->ordColIdx[0]; + bool sub, + less; + + /* We must have an ordering column */ + Assert(node->ordNumCols == 1); + + /* Precompute flags for in_range checks */ + if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING) + sub = true; /* subtract startOffset from current row */ + else + sub = false; /* add it */ + less = false; /* normally, we want frame head >= sum */ + /* If sort order is descending, flip both flags */ + if (!winstate->inRangeAsc) + { + sub = !sub; + less = true; + } + + tuplestore_select_read_pointer(winstate->buffer, + winstate->framehead_ptr); + if (winstate->frameheadpos == 0 && + TupIsNull(winstate->framehead_slot)) + { + /* fetch first row into framehead_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->framehead_slot)) + { + Datum headval, + currval; + bool headisnull, + currisnull; + + headval = slot_getattr(winstate->framehead_slot, sortCol, + &headisnull); + currval = slot_getattr(winstate->ss.ss_ScanTupleSlot, sortCol, + &currisnull); + if (headisnull || currisnull) + { + /* order of the rows depends only on nulls_first */ + if (winstate->inRangeNullsFirst) + { + /* advance head if head is null and curr is not */ + if (!headisnull || currisnull) + break; + } + else + { + /* advance head if head is not null and curr is null */ + if (headisnull || !currisnull) + break; + } + } + else + { + if (DatumGetBool(FunctionCall5Coll(&winstate->startInRangeFunc, + winstate->inRangeColl, + headval, + currval, + winstate->startOffsetValue, + BoolGetDatum(sub), + BoolGetDatum(less)))) + break; /* this row is the correct frame head */ + } + /* Note we advance frameheadpos even if the fetch fails */ + winstate->frameheadpos++; + spool_tuples(winstate, winstate->frameheadpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + break; /* end of partition */ + } + winstate->framehead_valid = true; + } + else if (frameOptions & FRAMEOPTION_GROUPS) + { + /* + * In GROUPS START_OFFSET mode, frame head is the first row of the + * first peer group whose number satisfies the offset constraint. + * We keep a copy of the last-known frame head row in + * framehead_slot, and advance as necessary. Note that if we + * reach end of partition, we will leave frameheadpos = end+1 and + * framehead_slot empty. 
+ */ + int64 offset = DatumGetInt64(winstate->startOffsetValue); + int64 minheadgroup; + + if (frameOptions & FRAMEOPTION_START_OFFSET_PRECEDING) + minheadgroup = winstate->currentgroup - offset; + else + minheadgroup = winstate->currentgroup + offset; + + tuplestore_select_read_pointer(winstate->buffer, + winstate->framehead_ptr); + if (winstate->frameheadpos == 0 && + TupIsNull(winstate->framehead_slot)) + { + /* fetch first row into framehead_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->framehead_slot)) + { + if (winstate->frameheadgroup >= minheadgroup) + break; /* this row is the correct frame head */ + ExecCopySlot(winstate->temp_slot_2, winstate->framehead_slot); + /* Note we advance frameheadpos even if the fetch fails */ + winstate->frameheadpos++; + spool_tuples(winstate, winstate->frameheadpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->framehead_slot)) + break; /* end of partition */ + if (!are_peers(winstate, winstate->temp_slot_2, + winstate->framehead_slot)) + winstate->frameheadgroup++; + } + ExecClearTuple(winstate->temp_slot_2); + winstate->framehead_valid = true; + } + else + Assert(false); + } + else + Assert(false); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * update_frametailpos + * make frametailpos valid for the current row + * + * Note that frametailpos is computed without regard for any window exclusion + * clause; the current row and/or its peers are considered part of the frame + * for this purpose even if they must be excluded later. + * + * May clobber winstate->temp_slot_2. + */ +static void +update_frametailpos(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + int frameOptions = winstate->frameOptions; + MemoryContext oldcontext; + + if (winstate->frametail_valid) + return; /* already known for current row */ + + /* We may be called in a short-lived context */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + if (frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING) + { + /* In UNBOUNDED FOLLOWING mode, all partition rows are in frame */ + spool_tuples(winstate, -1); + winstate->frametailpos = winstate->spooled_rows; + winstate->frametail_valid = true; + } + else if (frameOptions & FRAMEOPTION_END_CURRENT_ROW) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, exactly the rows up to current are in frame */ + winstate->frametailpos = winstate->currentpos + 1; + winstate->frametail_valid = true; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + { + spool_tuples(winstate, -1); + winstate->frametailpos = winstate->spooled_rows; + winstate->frametail_valid = true; + MemoryContextSwitchTo(oldcontext); + return; + } + + /* + * In RANGE or GROUPS END_CURRENT_ROW mode, frame end is the last + * row that is a peer of current row, frame tail is the row after + * that (if any). We keep a copy of the last-known frame tail row + * in frametail_slot, and advance as necessary. Note that if we + * reach end of partition, we will leave frametailpos = end+1 and + * frametail_slot empty. 
+ */ + tuplestore_select_read_pointer(winstate->buffer, + winstate->frametail_ptr); + if (winstate->frametailpos == 0 && + TupIsNull(winstate->frametail_slot)) + { + /* fetch first row into frametail_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->frametail_slot)) + { + if (winstate->frametailpos > winstate->currentpos && + !are_peers(winstate, winstate->frametail_slot, + winstate->ss.ss_ScanTupleSlot)) + break; /* this row is the frame tail */ + /* Note we advance frametailpos even if the fetch fails */ + winstate->frametailpos++; + spool_tuples(winstate, winstate->frametailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + break; /* end of partition */ + } + winstate->frametail_valid = true; + } + else + Assert(false); + } + else if (frameOptions & FRAMEOPTION_END_OFFSET) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* In ROWS mode, bound is physically n before/after current */ + int64 offset = DatumGetInt64(winstate->endOffsetValue); + + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + offset = -offset; + + winstate->frametailpos = winstate->currentpos + offset + 1; + /* smallest allowable value of frametailpos is 0 */ + if (winstate->frametailpos < 0) + winstate->frametailpos = 0; + else if (winstate->frametailpos > winstate->currentpos + 1) + { + /* make sure frametailpos is not past end of partition */ + spool_tuples(winstate, winstate->frametailpos - 1); + if (winstate->frametailpos > winstate->spooled_rows) + winstate->frametailpos = winstate->spooled_rows; + } + winstate->frametail_valid = true; + } + else if (frameOptions & FRAMEOPTION_RANGE) + { + /* + * In RANGE END_OFFSET mode, frame end is the last row that + * satisfies the in_range constraint relative to the current row, + * frame tail is the row after that (if any). We keep a copy of + * the last-known frame tail row in frametail_slot, and advance as + * necessary. Note that if we reach end of partition, we will + * leave frametailpos = end+1 and frametail_slot empty. 
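+ *
+ * (For instance, with an ascending integer ORDER BY column and RANGE
+ * BETWEEN CURRENT ROW AND 5 FOLLOWING, the loop below stops just past the
+ * last row whose sort value is <= the current row's value plus 5.)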
+ */ + int sortCol = node->ordColIdx[0]; + bool sub, + less; + + /* We must have an ordering column */ + Assert(node->ordNumCols == 1); + + /* Precompute flags for in_range checks */ + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + sub = true; /* subtract endOffset from current row */ + else + sub = false; /* add it */ + less = true; /* normally, we want frame tail <= sum */ + /* If sort order is descending, flip both flags */ + if (!winstate->inRangeAsc) + { + sub = !sub; + less = false; + } + + tuplestore_select_read_pointer(winstate->buffer, + winstate->frametail_ptr); + if (winstate->frametailpos == 0 && + TupIsNull(winstate->frametail_slot)) + { + /* fetch first row into frametail_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->frametail_slot)) + { + Datum tailval, + currval; + bool tailisnull, + currisnull; + + tailval = slot_getattr(winstate->frametail_slot, sortCol, + &tailisnull); + currval = slot_getattr(winstate->ss.ss_ScanTupleSlot, sortCol, + &currisnull); + if (tailisnull || currisnull) + { + /* order of the rows depends only on nulls_first */ + if (winstate->inRangeNullsFirst) + { + /* advance tail if tail is null or curr is not */ + if (!tailisnull) + break; + } + else + { + /* advance tail if tail is not null or curr is null */ + if (!currisnull) + break; + } + } + else + { + if (!DatumGetBool(FunctionCall5Coll(&winstate->endInRangeFunc, + winstate->inRangeColl, + tailval, + currval, + winstate->endOffsetValue, + BoolGetDatum(sub), + BoolGetDatum(less)))) + break; /* this row is the correct frame tail */ + } + /* Note we advance frametailpos even if the fetch fails */ + winstate->frametailpos++; + spool_tuples(winstate, winstate->frametailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + break; /* end of partition */ + } + winstate->frametail_valid = true; + } + else if (frameOptions & FRAMEOPTION_GROUPS) + { + /* + * In GROUPS END_OFFSET mode, frame end is the last row of the + * last peer group whose number satisfies the offset constraint, + * and frame tail is the row after that (if any). We keep a copy + * of the last-known frame tail row in frametail_slot, and advance + * as necessary. Note that if we reach end of partition, we will + * leave frametailpos = end+1 and frametail_slot empty. 
+ */ + int64 offset = DatumGetInt64(winstate->endOffsetValue); + int64 maxtailgroup; + + if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + maxtailgroup = winstate->currentgroup - offset; + else + maxtailgroup = winstate->currentgroup + offset; + + tuplestore_select_read_pointer(winstate->buffer, + winstate->frametail_ptr); + if (winstate->frametailpos == 0 && + TupIsNull(winstate->frametail_slot)) + { + /* fetch first row into frametail_slot, if we didn't already */ + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + while (!TupIsNull(winstate->frametail_slot)) + { + if (winstate->frametailgroup > maxtailgroup) + break; /* this row is the correct frame tail */ + ExecCopySlot(winstate->temp_slot_2, winstate->frametail_slot); + /* Note we advance frametailpos even if the fetch fails */ + winstate->frametailpos++; + spool_tuples(winstate, winstate->frametailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->frametail_slot)) + break; /* end of partition */ + if (!are_peers(winstate, winstate->temp_slot_2, + winstate->frametail_slot)) + winstate->frametailgroup++; + } + ExecClearTuple(winstate->temp_slot_2); + winstate->frametail_valid = true; + } + else + Assert(false); + } + else + Assert(false); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * update_grouptailpos + * make grouptailpos valid for the current row + * + * May clobber winstate->temp_slot_2. + */ +static void +update_grouptailpos(WindowAggState *winstate) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + MemoryContext oldcontext; + + if (winstate->grouptail_valid) + return; /* already known for current row */ + + /* We may be called in a short-lived context */ + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + { + spool_tuples(winstate, -1); + winstate->grouptailpos = winstate->spooled_rows; + winstate->grouptail_valid = true; + MemoryContextSwitchTo(oldcontext); + return; + } + + /* + * Because grouptail_valid is reset only when current row advances into a + * new peer group, we always reach here knowing that grouptailpos needs to + * be advanced by at least one row. Hence, unlike the otherwise similar + * case for frame tail tracking, we do not need persistent storage of the + * group tail row. + */ + Assert(winstate->grouptailpos <= winstate->currentpos); + tuplestore_select_read_pointer(winstate->buffer, + winstate->grouptail_ptr); + for (;;) + { + /* Note we advance grouptailpos even if the fetch fails */ + winstate->grouptailpos++; + spool_tuples(winstate, winstate->grouptailpos); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->temp_slot_2)) + break; /* end of partition */ + if (winstate->grouptailpos > winstate->currentpos && + !are_peers(winstate, winstate->temp_slot_2, + winstate->ss.ss_ScanTupleSlot)) + break; /* this row is the group tail */ + } + ExecClearTuple(winstate->temp_slot_2); + winstate->grouptail_valid = true; + + MemoryContextSwitchTo(oldcontext); +} + + +/* ----------------- + * ExecWindowAgg + * + * ExecWindowAgg receives tuples from its outer subplan and + * stores them into a tuplestore, then processes window functions. + * This node doesn't reduce nor qualify any row so the number of + * returned rows is exactly the same as its outer subplan's result. 
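+ *
+ * Each call spools input rows as needed, advances the current row within
+ * its partition (moving on to the next partition when the current one is
+ * exhausted), evaluates the window functions and aggregates for that row,
+ * and projects a single output tuple.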
+ * ----------------- + */ +static TupleTableSlot * +ExecWindowAgg(PlanState *pstate) +{ + WindowAggState *winstate = castNode(WindowAggState, pstate); + ExprContext *econtext; + int i; + int numfuncs; + + CHECK_FOR_INTERRUPTS(); + + if (winstate->all_done) + return NULL; + + /* + * Compute frame offset values, if any, during first call (or after a + * rescan). These are assumed to hold constant throughout the scan; if + * user gives us a volatile expression, we'll only use its initial value. + */ + if (winstate->all_first) + { + int frameOptions = winstate->frameOptions; + ExprContext *econtext = winstate->ss.ps.ps_ExprContext; + Datum value; + bool isnull; + int16 len; + bool byval; + + if (frameOptions & FRAMEOPTION_START_OFFSET) + { + Assert(winstate->startOffset != NULL); + value = ExecEvalExprSwitchContext(winstate->startOffset, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("frame starting offset must not be null"))); + /* copy value into query-lifespan context */ + get_typlenbyval(exprType((Node *) winstate->startOffset->expr), + &len, &byval); + winstate->startOffsetValue = datumCopy(value, byval, len); + if (frameOptions & (FRAMEOPTION_ROWS | FRAMEOPTION_GROUPS)) + { + /* value is known to be int8 */ + int64 offset = DatumGetInt64(value); + + if (offset < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PRECEDING_OR_FOLLOWING_SIZE), + errmsg("frame starting offset must not be negative"))); + } + } + if (frameOptions & FRAMEOPTION_END_OFFSET) + { + Assert(winstate->endOffset != NULL); + value = ExecEvalExprSwitchContext(winstate->endOffset, + econtext, + &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("frame ending offset must not be null"))); + /* copy value into query-lifespan context */ + get_typlenbyval(exprType((Node *) winstate->endOffset->expr), + &len, &byval); + winstate->endOffsetValue = datumCopy(value, byval, len); + if (frameOptions & (FRAMEOPTION_ROWS | FRAMEOPTION_GROUPS)) + { + /* value is known to be int8 */ + int64 offset = DatumGetInt64(value); + + if (offset < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PRECEDING_OR_FOLLOWING_SIZE), + errmsg("frame ending offset must not be negative"))); + } + } + winstate->all_first = false; + } + + if (winstate->buffer == NULL) + { + /* Initialize for first partition and set current row = 0 */ + begin_partition(winstate); + /* If there are no input rows, we'll detect that and exit below */ + } + else + { + /* Advance current row within partition */ + winstate->currentpos++; + /* This might mean that the frame moves, too */ + winstate->framehead_valid = false; + winstate->frametail_valid = false; + /* we don't need to invalidate grouptail here; see below */ + } + + /* + * Spool all tuples up to and including the current row, if we haven't + * already + */ + spool_tuples(winstate, winstate->currentpos); + + /* Move to the next partition if we reached the end of this partition */ + if (winstate->partition_spooled && + winstate->currentpos >= winstate->spooled_rows) + { + release_partition(winstate); + + if (winstate->more_partitions) + { + begin_partition(winstate); + Assert(winstate->spooled_rows > 0); + } + else + { + winstate->all_done = true; + return NULL; + } + } + + /* final output execution is in ps_ExprContext */ + econtext = winstate->ss.ps.ps_ExprContext; + + /* Clear the per-output-tuple context for current row */ + ResetExprContext(econtext); + + /* + * Read the current row from the tuplestore, and save in 
ScanTupleSlot. + * (We can't rely on the outerplan's output slot because we may have to + * read beyond the current row. Also, we have to actually copy the row + * out of the tuplestore, since window function evaluation might cause the + * tuplestore to dump its state to disk.) + * + * In GROUPS mode, or when tracking a group-oriented exclusion clause, we + * must also detect entering a new peer group and update associated state + * when that happens. We use temp_slot_2 to temporarily hold the previous + * row for this purpose. + * + * Current row must be in the tuplestore, since we spooled it above. + */ + tuplestore_select_read_pointer(winstate->buffer, winstate->current_ptr); + if ((winstate->frameOptions & (FRAMEOPTION_GROUPS | + FRAMEOPTION_EXCLUDE_GROUP | + FRAMEOPTION_EXCLUDE_TIES)) && + winstate->currentpos > 0) + { + ExecCopySlot(winstate->temp_slot_2, winstate->ss.ss_ScanTupleSlot); + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->ss.ss_ScanTupleSlot)) + elog(ERROR, "unexpected end of tuplestore"); + if (!are_peers(winstate, winstate->temp_slot_2, + winstate->ss.ss_ScanTupleSlot)) + { + winstate->currentgroup++; + winstate->groupheadpos = winstate->currentpos; + winstate->grouptail_valid = false; + } + ExecClearTuple(winstate->temp_slot_2); + } + else + { + if (!tuplestore_gettupleslot(winstate->buffer, true, true, + winstate->ss.ss_ScanTupleSlot)) + elog(ERROR, "unexpected end of tuplestore"); + } + + /* + * Evaluate true window functions + */ + numfuncs = winstate->numfuncs; + for (i = 0; i < numfuncs; i++) + { + WindowStatePerFunc perfuncstate = &(winstate->perfunc[i]); + + if (perfuncstate->plain_agg) + continue; + eval_windowfunction(winstate, perfuncstate, + &(econtext->ecxt_aggvalues[perfuncstate->wfuncstate->wfuncno]), + &(econtext->ecxt_aggnulls[perfuncstate->wfuncstate->wfuncno])); + } + + /* + * Evaluate aggregates + */ + if (winstate->numaggs > 0) + eval_windowaggregates(winstate); + + /* + * If we have created auxiliary read pointers for the frame or group + * boundaries, force them to be kept up-to-date, because we don't know + * whether the window function(s) will do anything that requires that. + * Failing to advance the pointers would result in being unable to trim + * data from the tuplestore, which is bad. (If we could know in advance + * whether the window functions will use frame boundary info, we could + * skip creating these pointers in the first place ... but unfortunately + * the window function API doesn't require that.) + */ + if (winstate->framehead_ptr >= 0) + update_frameheadpos(winstate); + if (winstate->frametail_ptr >= 0) + update_frametailpos(winstate); + if (winstate->grouptail_ptr >= 0) + update_grouptailpos(winstate); + + /* + * Truncate any no-longer-needed rows from the tuplestore. + */ + tuplestore_trim(winstate->buffer); + + /* + * Form and return a projection tuple using the windowfunc results and the + * current row. Setting ecxt_outertuple arranges that any Vars will be + * evaluated with respect to that row. 
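+ * The window function results themselves are picked up from
+ * ecxt_aggvalues/ecxt_aggnulls by the WindowFunc expressions in the
+ * projection's targetlist.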
+ */ + econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot; + + return ExecProject(winstate->ss.ps.ps_ProjInfo); +} + +/* ----------------- + * ExecInitWindowAgg + * + * Creates the run-time information for the WindowAgg node produced by the + * planner and initializes its outer subtree + * ----------------- + */ +WindowAggState * +ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags) +{ + WindowAggState *winstate; + Plan *outerPlan; + ExprContext *econtext; + ExprContext *tmpcontext; + WindowStatePerFunc perfunc; + WindowStatePerAgg peragg; + int frameOptions = node->frameOptions; + int numfuncs, + wfuncno, + numaggs, + aggno; + TupleDesc scanDesc; + ListCell *l; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + winstate = makeNode(WindowAggState); + winstate->ss.ps.plan = (Plan *) node; + winstate->ss.ps.state = estate; + winstate->ss.ps.ExecProcNode = ExecWindowAgg; + + /* + * Create expression contexts. We need two, one for per-input-tuple + * processing and one for per-output-tuple processing. We cheat a little + * by using ExecAssignExprContext() to build both. + */ + ExecAssignExprContext(estate, &winstate->ss.ps); + tmpcontext = winstate->ss.ps.ps_ExprContext; + winstate->tmpcontext = tmpcontext; + ExecAssignExprContext(estate, &winstate->ss.ps); + + /* Create long-lived context for storage of partition-local memory etc */ + winstate->partcontext = + AllocSetContextCreate(CurrentMemoryContext, + "WindowAgg Partition", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create mid-lived context for aggregate trans values etc. + * + * Note that moving aggregates each use their own private context, not + * this one. + */ + winstate->aggcontext = + AllocSetContextCreate(CurrentMemoryContext, + "WindowAgg Aggregates", + ALLOCSET_DEFAULT_SIZES); + + /* + * WindowAgg nodes never have quals, since they can only occur at the + * logical top level of a query (ie, after any WHERE or HAVING filters) + */ + Assert(node->plan.qual == NIL); + winstate->ss.ps.qual = NULL; + + /* + * initialize child nodes + */ + outerPlan = outerPlan(node); + outerPlanState(winstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * initialize source tuple type (which is also the tuple type that we'll + * store in the tuplestore and use in all our working slots). 
+ */ + ExecCreateScanSlotFromOuterPlan(estate, &winstate->ss, &TTSOpsMinimalTuple); + scanDesc = winstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + + /* the outer tuple isn't the child's tuple, but always a minimal tuple */ + winstate->ss.ps.outeropsset = true; + winstate->ss.ps.outerops = &TTSOpsMinimalTuple; + winstate->ss.ps.outeropsfixed = true; + + /* + * tuple table initialization + */ + winstate->first_part_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + winstate->agg_row_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + winstate->temp_slot_1 = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + winstate->temp_slot_2 = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + + /* + * create frame head and tail slots only if needed (must create slots in + * exactly the same cases that update_frameheadpos and update_frametailpos + * need them) + */ + winstate->framehead_slot = winstate->frametail_slot = NULL; + + if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + if (((frameOptions & FRAMEOPTION_START_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_START_OFFSET)) + winstate->framehead_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + if (((frameOptions & FRAMEOPTION_END_CURRENT_ROW) && + node->ordNumCols != 0) || + (frameOptions & FRAMEOPTION_END_OFFSET)) + winstate->frametail_slot = ExecInitExtraTupleSlot(estate, scanDesc, + &TTSOpsMinimalTuple); + } + + /* + * Initialize result slot, type and projection. + */ + ExecInitResultTupleSlotTL(&winstate->ss.ps, &TTSOpsVirtual); + ExecAssignProjectionInfo(&winstate->ss.ps, NULL); + + /* Set up data for comparing tuples */ + if (node->partNumCols > 0) + winstate->partEqfunction = + execTuplesMatchPrepare(scanDesc, + node->partNumCols, + node->partColIdx, + node->partOperators, + node->partCollations, + &winstate->ss.ps); + + if (node->ordNumCols > 0) + winstate->ordEqfunction = + execTuplesMatchPrepare(scanDesc, + node->ordNumCols, + node->ordColIdx, + node->ordOperators, + node->ordCollations, + &winstate->ss.ps); + + /* + * WindowAgg nodes use aggvalues and aggnulls as well as Agg nodes. + */ + numfuncs = winstate->numfuncs; + numaggs = winstate->numaggs; + econtext = winstate->ss.ps.ps_ExprContext; + econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numfuncs); + econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numfuncs); + + /* + * allocate per-wfunc/per-agg state information. + */ + perfunc = (WindowStatePerFunc) palloc0(sizeof(WindowStatePerFuncData) * numfuncs); + peragg = (WindowStatePerAgg) palloc0(sizeof(WindowStatePerAggData) * numaggs); + winstate->perfunc = perfunc; + winstate->peragg = peragg; + + wfuncno = -1; + aggno = -1; + foreach(l, winstate->funcs) + { + WindowFuncExprState *wfuncstate = (WindowFuncExprState *) lfirst(l); + WindowFunc *wfunc = wfuncstate->wfunc; + WindowStatePerFunc perfuncstate; + AclResult aclresult; + int i; + + if (wfunc->winref != node->winref) /* planner screwed up? 
*/ + elog(ERROR, "WindowFunc with winref %u assigned to WindowAgg with winref %u", + wfunc->winref, node->winref); + + /* Look for a previous duplicate window function */ + for (i = 0; i <= wfuncno; i++) + { + if (equal(wfunc, perfunc[i].wfunc) && + !contain_volatile_functions((Node *) wfunc)) + break; + } + if (i <= wfuncno) + { + /* Found a match to an existing entry, so just mark it */ + wfuncstate->wfuncno = i; + continue; + } + + /* Nope, so assign a new PerAgg record */ + perfuncstate = &perfunc[++wfuncno]; + + /* Mark WindowFunc state node with assigned index in the result array */ + wfuncstate->wfuncno = wfuncno; + + /* Check permission to call window function */ + aclresult = pg_proc_aclcheck(wfunc->winfnoid, GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(wfunc->winfnoid)); + InvokeFunctionExecuteHook(wfunc->winfnoid); + + /* Fill in the perfuncstate data */ + perfuncstate->wfuncstate = wfuncstate; + perfuncstate->wfunc = wfunc; + perfuncstate->numArguments = list_length(wfuncstate->args); + perfuncstate->winCollation = wfunc->inputcollid; + + get_typlenbyval(wfunc->wintype, + &perfuncstate->resulttypeLen, + &perfuncstate->resulttypeByVal); + + /* + * If it's really just a plain aggregate function, we'll emulate the + * Agg environment for it. + */ + perfuncstate->plain_agg = wfunc->winagg; + if (wfunc->winagg) + { + WindowStatePerAgg peraggstate; + + perfuncstate->aggno = ++aggno; + peraggstate = &winstate->peragg[aggno]; + initialize_peragg(winstate, wfunc, peraggstate); + peraggstate->wfuncno = wfuncno; + } + else + { + WindowObject winobj = makeNode(WindowObjectData); + + winobj->winstate = winstate; + winobj->argstates = wfuncstate->args; + winobj->localmem = NULL; + perfuncstate->winobj = winobj; + + /* It's a real window function, so set up to call it. */ + fmgr_info_cxt(wfunc->winfnoid, &perfuncstate->flinfo, + econtext->ecxt_per_query_memory); + fmgr_info_set_expr((Node *) wfunc, &perfuncstate->flinfo); + } + } + + /* Update numfuncs, numaggs to match number of unique functions found */ + winstate->numfuncs = wfuncno + 1; + winstate->numaggs = aggno + 1; + + /* Set up WindowObject for aggregates, if needed */ + if (winstate->numaggs > 0) + { + WindowObject agg_winobj = makeNode(WindowObjectData); + + agg_winobj->winstate = winstate; + agg_winobj->argstates = NIL; + agg_winobj->localmem = NULL; + /* make sure markptr = -1 to invalidate. 
It may not get used */ + agg_winobj->markptr = -1; + agg_winobj->readptr = -1; + winstate->agg_winobj = agg_winobj; + } + + /* copy frame options to state node for easy access */ + winstate->frameOptions = frameOptions; + + /* initialize frame bound offset expressions */ + winstate->startOffset = ExecInitExpr((Expr *) node->startOffset, + (PlanState *) winstate); + winstate->endOffset = ExecInitExpr((Expr *) node->endOffset, + (PlanState *) winstate); + + /* Lookup in_range support functions if needed */ + if (OidIsValid(node->startInRangeFunc)) + fmgr_info(node->startInRangeFunc, &winstate->startInRangeFunc); + if (OidIsValid(node->endInRangeFunc)) + fmgr_info(node->endInRangeFunc, &winstate->endInRangeFunc); + winstate->inRangeColl = node->inRangeColl; + winstate->inRangeAsc = node->inRangeAsc; + winstate->inRangeNullsFirst = node->inRangeNullsFirst; + + winstate->all_first = true; + winstate->partition_spooled = false; + winstate->more_partitions = false; + + return winstate; +} + +/* ----------------- + * ExecEndWindowAgg + * ----------------- + */ +void +ExecEndWindowAgg(WindowAggState *node) +{ + PlanState *outerPlan; + int i; + + release_partition(node); + + ExecClearTuple(node->ss.ss_ScanTupleSlot); + ExecClearTuple(node->first_part_slot); + ExecClearTuple(node->agg_row_slot); + ExecClearTuple(node->temp_slot_1); + ExecClearTuple(node->temp_slot_2); + if (node->framehead_slot) + ExecClearTuple(node->framehead_slot); + if (node->frametail_slot) + ExecClearTuple(node->frametail_slot); + + /* + * Free both the expr contexts. + */ + ExecFreeExprContext(&node->ss.ps); + node->ss.ps.ps_ExprContext = node->tmpcontext; + ExecFreeExprContext(&node->ss.ps); + + for (i = 0; i < node->numaggs; i++) + { + if (node->peragg[i].aggcontext != node->aggcontext) + MemoryContextDelete(node->peragg[i].aggcontext); + } + MemoryContextDelete(node->partcontext); + MemoryContextDelete(node->aggcontext); + + pfree(node->perfunc); + pfree(node->peragg); + + outerPlan = outerPlanState(node); + ExecEndNode(outerPlan); +} + +/* ----------------- + * ExecReScanWindowAgg + * ----------------- + */ +void +ExecReScanWindowAgg(WindowAggState *node) +{ + PlanState *outerPlan = outerPlanState(node); + ExprContext *econtext = node->ss.ps.ps_ExprContext; + + node->all_done = false; + node->all_first = true; + + /* release tuplestore et al */ + release_partition(node); + + /* release all temp tuples, but especially first_part_slot */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + ExecClearTuple(node->first_part_slot); + ExecClearTuple(node->agg_row_slot); + ExecClearTuple(node->temp_slot_1); + ExecClearTuple(node->temp_slot_2); + if (node->framehead_slot) + ExecClearTuple(node->framehead_slot); + if (node->frametail_slot) + ExecClearTuple(node->frametail_slot); + + /* Forget current wfunc values */ + MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numfuncs); + MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numfuncs); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +} + +/* + * initialize_peragg + * + * Almost same as in nodeAgg.c, except we don't support DISTINCT currently. 
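+ * (Ordered-set and variadic aggregates are likewise not possible here;
+ * note the zero direct arguments and non-variadic argument list passed to
+ * build_aggregate_transfn_expr below.)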
+ */ +static WindowStatePerAggData * +initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc, + WindowStatePerAgg peraggstate) +{ + Oid inputTypes[FUNC_MAX_ARGS]; + int numArguments; + HeapTuple aggTuple; + Form_pg_aggregate aggform; + Oid aggtranstype; + AttrNumber initvalAttNo; + AclResult aclresult; + bool use_ma_code; + Oid transfn_oid, + invtransfn_oid, + finalfn_oid; + bool finalextra; + char finalmodify; + Expr *transfnexpr, + *invtransfnexpr, + *finalfnexpr; + Datum textInitVal; + int i; + ListCell *lc; + + numArguments = list_length(wfunc->args); + + i = 0; + foreach(lc, wfunc->args) + { + inputTypes[i++] = exprType((Node *) lfirst(lc)); + } + + aggTuple = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(wfunc->winfnoid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", + wfunc->winfnoid); + aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); + + /* + * Figure out whether we want to use the moving-aggregate implementation, + * and collect the right set of fields from the pg_attribute entry. + * + * It's possible that an aggregate would supply a safe moving-aggregate + * implementation and an unsafe normal one, in which case our hand is + * forced. Otherwise, if the frame head can't move, we don't need + * moving-aggregate code. Even if we'd like to use it, don't do so if the + * aggregate's arguments (and FILTER clause if any) contain any calls to + * volatile functions. Otherwise, the difference between restarting and + * not restarting the aggregation would be user-visible. + */ + if (!OidIsValid(aggform->aggminvtransfn)) + use_ma_code = false; /* sine qua non */ + else if (aggform->aggmfinalmodify == AGGMODIFY_READ_ONLY && + aggform->aggfinalmodify != AGGMODIFY_READ_ONLY) + use_ma_code = true; /* decision forced by safety */ + else if (winstate->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) + use_ma_code = false; /* non-moving frame head */ + else if (contain_volatile_functions((Node *) wfunc)) + use_ma_code = false; /* avoid possible behavioral change */ + else + use_ma_code = true; /* yes, let's use it */ + if (use_ma_code) + { + peraggstate->transfn_oid = transfn_oid = aggform->aggmtransfn; + peraggstate->invtransfn_oid = invtransfn_oid = aggform->aggminvtransfn; + peraggstate->finalfn_oid = finalfn_oid = aggform->aggmfinalfn; + finalextra = aggform->aggmfinalextra; + finalmodify = aggform->aggmfinalmodify; + aggtranstype = aggform->aggmtranstype; + initvalAttNo = Anum_pg_aggregate_aggminitval; + } + else + { + peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn; + peraggstate->invtransfn_oid = invtransfn_oid = InvalidOid; + peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn; + finalextra = aggform->aggfinalextra; + finalmodify = aggform->aggfinalmodify; + aggtranstype = aggform->aggtranstype; + initvalAttNo = Anum_pg_aggregate_agginitval; + } + + /* + * ExecInitWindowAgg already checked permission to call aggregate function + * ... 
but we still need to check the component functions + */ + + /* Check that aggregate owner has permission to call component fns */ + { + HeapTuple procTuple; + Oid aggOwner; + + procTuple = SearchSysCache1(PROCOID, + ObjectIdGetDatum(wfunc->winfnoid)); + if (!HeapTupleIsValid(procTuple)) + elog(ERROR, "cache lookup failed for function %u", + wfunc->winfnoid); + aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner; + ReleaseSysCache(procTuple); + + aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(transfn_oid)); + InvokeFunctionExecuteHook(transfn_oid); + + if (OidIsValid(invtransfn_oid)) + { + aclresult = pg_proc_aclcheck(invtransfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(invtransfn_oid)); + InvokeFunctionExecuteHook(invtransfn_oid); + } + + if (OidIsValid(finalfn_oid)) + { + aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(finalfn_oid)); + InvokeFunctionExecuteHook(finalfn_oid); + } + } + + /* + * If the selected finalfn isn't read-only, we can't run this aggregate as + * a window function. This is a user-facing error, so we take a bit more + * care with the error message than elsewhere in this function. + */ + if (finalmodify != AGGMODIFY_READ_ONLY) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("aggregate function %s does not support use as a window function", + format_procedure(wfunc->winfnoid)))); + + /* Detect how many arguments to pass to the finalfn */ + if (finalextra) + peraggstate->numFinalArgs = numArguments + 1; + else + peraggstate->numFinalArgs = 1; + + /* resolve actual type of transition state, if polymorphic */ + aggtranstype = resolve_aggregate_transtype(wfunc->winfnoid, + aggtranstype, + inputTypes, + numArguments); + + /* build expression trees using actual argument & result types */ + build_aggregate_transfn_expr(inputTypes, + numArguments, + 0, /* no ordered-set window functions yet */ + false, /* no variadic window functions yet */ + aggtranstype, + wfunc->inputcollid, + transfn_oid, + invtransfn_oid, + &transfnexpr, + &invtransfnexpr); + + /* set up infrastructure for calling the transfn(s) and finalfn */ + fmgr_info(transfn_oid, &peraggstate->transfn); + fmgr_info_set_expr((Node *) transfnexpr, &peraggstate->transfn); + + if (OidIsValid(invtransfn_oid)) + { + fmgr_info(invtransfn_oid, &peraggstate->invtransfn); + fmgr_info_set_expr((Node *) invtransfnexpr, &peraggstate->invtransfn); + } + + if (OidIsValid(finalfn_oid)) + { + build_aggregate_finalfn_expr(inputTypes, + peraggstate->numFinalArgs, + aggtranstype, + wfunc->wintype, + wfunc->inputcollid, + finalfn_oid, + &finalfnexpr); + fmgr_info(finalfn_oid, &peraggstate->finalfn); + fmgr_info_set_expr((Node *) finalfnexpr, &peraggstate->finalfn); + } + + /* get info about relevant datatypes */ + get_typlenbyval(wfunc->wintype, + &peraggstate->resulttypeLen, + &peraggstate->resulttypeByVal); + get_typlenbyval(aggtranstype, + &peraggstate->transtypeLen, + &peraggstate->transtypeByVal); + + /* + * initval is potentially null, so don't try to access it as a struct + * field. Must do it the hard way with SysCacheGetAttr. 
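+ * The value is stored as text in pg_aggregate; GetAggInitVal converts it
+ * to the transition data type by calling that type's input function.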
+ */ + textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, initvalAttNo, + &peraggstate->initValueIsNull); + + if (peraggstate->initValueIsNull) + peraggstate->initValue = (Datum) 0; + else + peraggstate->initValue = GetAggInitVal(textInitVal, + aggtranstype); + + /* + * If the transfn is strict and the initval is NULL, make sure input type + * and transtype are the same (or at least binary-compatible), so that + * it's OK to use the first input value as the initial transValue. This + * should have been checked at agg definition time, but we must check + * again in case the transfn's strictness property has been changed. + */ + if (peraggstate->transfn.fn_strict && peraggstate->initValueIsNull) + { + if (numArguments < 1 || + !IsBinaryCoercible(inputTypes[0], aggtranstype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate %u needs to have compatible input type and transition type", + wfunc->winfnoid))); + } + + /* + * Insist that forward and inverse transition functions have the same + * strictness setting. Allowing them to differ would require handling + * more special cases in advance_windowaggregate and + * advance_windowaggregate_base, for no discernible benefit. This should + * have been checked at agg definition time, but we must check again in + * case either function's strictness property has been changed. + */ + if (OidIsValid(invtransfn_oid) && + peraggstate->transfn.fn_strict != peraggstate->invtransfn.fn_strict) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("strictness of aggregate's forward and inverse transition functions must match"))); + + /* + * Moving aggregates use their own aggcontext. + * + * This is necessary because they might restart at different times, so we + * might never be able to reset the shared context otherwise. We can't + * make it the aggregates' responsibility to clean up after themselves, + * because strict aggregates must be restarted whenever we remove their + * last non-NULL input, which the aggregate won't be aware is happening. + * Also, just pfree()ing the transValue upon restarting wouldn't help, + * since we'd miss any indirectly referenced data. We could, in theory, + * make the memory allocation rules for moving aggregates different than + * they have historically been for plain aggregates, but that seems grotty + * and likely to lead to memory leaks. + */ + if (OidIsValid(invtransfn_oid)) + peraggstate->aggcontext = + AllocSetContextCreate(CurrentMemoryContext, + "WindowAgg Per Aggregate", + ALLOCSET_DEFAULT_SIZES); + else + peraggstate->aggcontext = winstate->aggcontext; + + ReleaseSysCache(aggTuple); + + return peraggstate; +} + +static Datum +GetAggInitVal(Datum textInitVal, Oid transtype) +{ + Oid typinput, + typioparam; + char *strInitVal; + Datum initVal; + + getTypeInputInfo(transtype, &typinput, &typioparam); + strInitVal = TextDatumGetCString(textInitVal); + initVal = OidInputFunctionCall(typinput, strInitVal, + typioparam, -1); + pfree(strInitVal); + return initVal; +} + +/* + * are_peers + * compare two rows to see if they are equal according to the ORDER BY clause + * + * NB: this does not consider the window frame mode. 
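+ *
+ * Two rows are peers if they compare equal on all ORDER BY columns, per
+ * ordEqfunction; with no ORDER BY clause, every row in the partition is a
+ * peer of every other.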
+ */ +static bool +are_peers(WindowAggState *winstate, TupleTableSlot *slot1, + TupleTableSlot *slot2) +{ + WindowAgg *node = (WindowAgg *) winstate->ss.ps.plan; + ExprContext *econtext = winstate->tmpcontext; + + /* If no ORDER BY, all rows are peers with each other */ + if (node->ordNumCols == 0) + return true; + + econtext->ecxt_outertuple = slot1; + econtext->ecxt_innertuple = slot2; + return ExecQualAndReset(winstate->ordEqfunction, econtext); +} + +/* + * window_gettupleslot + * Fetch the pos'th tuple of the current partition into the slot, + * using the winobj's read pointer + * + * Returns true if successful, false if no such row + */ +static bool +window_gettupleslot(WindowObject winobj, int64 pos, TupleTableSlot *slot) +{ + WindowAggState *winstate = winobj->winstate; + MemoryContext oldcontext; + + /* often called repeatedly in a row */ + CHECK_FOR_INTERRUPTS(); + + /* Don't allow passing -1 to spool_tuples here */ + if (pos < 0) + return false; + + /* If necessary, fetch the tuple into the spool */ + spool_tuples(winstate, pos); + + if (pos >= winstate->spooled_rows) + return false; + + if (pos < winobj->markpos) + elog(ERROR, "cannot fetch row before WindowObject's mark position"); + + oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory); + + tuplestore_select_read_pointer(winstate->buffer, winobj->readptr); + + /* + * Advance or rewind until we are within one tuple of the one we want. + */ + if (winobj->seekpos < pos - 1) + { + if (!tuplestore_skiptuples(winstate->buffer, + pos - 1 - winobj->seekpos, + true)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos = pos - 1; + } + else if (winobj->seekpos > pos + 1) + { + if (!tuplestore_skiptuples(winstate->buffer, + winobj->seekpos - (pos + 1), + false)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos = pos + 1; + } + else if (winobj->seekpos == pos) + { + /* + * There's no API to refetch the tuple at the current position. We + * have to move one tuple forward, and then one backward. (We don't + * do it the other way because we might try to fetch the row before + * our mark, which isn't allowed.) XXX this case could stand to be + * optimized. + */ + tuplestore_advance(winstate->buffer, true); + winobj->seekpos++; + } + + /* + * Now we should be on the tuple immediately before or after the one we + * want, so just fetch forwards or backwards as appropriate. + */ + if (winobj->seekpos > pos) + { + if (!tuplestore_gettupleslot(winstate->buffer, false, true, slot)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos--; + } + else + { + if (!tuplestore_gettupleslot(winstate->buffer, true, true, slot)) + elog(ERROR, "unexpected end of tuplestore"); + winobj->seekpos++; + } + + Assert(winobj->seekpos == pos); + + MemoryContextSwitchTo(oldcontext); + + return true; +} + + +/*********************************************************************** + * API exposed to window functions + ***********************************************************************/ + + +/* + * WinGetPartitionLocalMemory + * Get working memory that lives till end of partition processing + * + * On first call within a given partition, this allocates and zeroes the + * requested amount of space. Subsequent calls just return the same chunk. + * + * Memory obtained this way is normally used to hold state that should be + * automatically reset for each new partition. 
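+ * (The chunk is allocated in winstate->partcontext; release_partition
+ * resets that context and clears winobj->localmem, so a fresh chunk is
+ * handed out on the first call in the next partition.)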
If a window function wants + * to hold state across the whole query, fcinfo->fn_extra can be used in the + * usual way for that. + */ +void * +WinGetPartitionLocalMemory(WindowObject winobj, Size sz) +{ + Assert(WindowObjectIsValid(winobj)); + if (winobj->localmem == NULL) + winobj->localmem = + MemoryContextAllocZero(winobj->winstate->partcontext, sz); + return winobj->localmem; +} + +/* + * WinGetCurrentPosition + * Return the current row's position (counting from 0) within the current + * partition. + */ +int64 +WinGetCurrentPosition(WindowObject winobj) +{ + Assert(WindowObjectIsValid(winobj)); + return winobj->winstate->currentpos; +} + +/* + * WinGetPartitionRowCount + * Return total number of rows contained in the current partition. + * + * Note: this is a relatively expensive operation because it forces the + * whole partition to be "spooled" into the tuplestore at once. Once + * executed, however, additional calls within the same partition are cheap. + */ +int64 +WinGetPartitionRowCount(WindowObject winobj) +{ + Assert(WindowObjectIsValid(winobj)); + spool_tuples(winobj->winstate, -1); + return winobj->winstate->spooled_rows; +} + +/* + * WinSetMarkPosition + * Set the "mark" position for the window object, which is the oldest row + * number (counting from 0) it is allowed to fetch during all subsequent + * operations within the current partition. + * + * Window functions do not have to call this, but are encouraged to move the + * mark forward when possible to keep the tuplestore size down and prevent + * having to spill rows to disk. + */ +void +WinSetMarkPosition(WindowObject winobj, int64 markpos) +{ + WindowAggState *winstate; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + + if (markpos < winobj->markpos) + elog(ERROR, "cannot move WindowObject's mark position backward"); + tuplestore_select_read_pointer(winstate->buffer, winobj->markptr); + if (markpos > winobj->markpos) + { + tuplestore_skiptuples(winstate->buffer, + markpos - winobj->markpos, + true); + winobj->markpos = markpos; + } + tuplestore_select_read_pointer(winstate->buffer, winobj->readptr); + if (markpos > winobj->seekpos) + { + tuplestore_skiptuples(winstate->buffer, + markpos - winobj->seekpos, + true); + winobj->seekpos = markpos; + } +} + +/* + * WinRowsArePeers + * Compare two rows (specified by absolute position in partition) to see + * if they are equal according to the ORDER BY clause. + * + * NB: this does not consider the window frame mode. + */ +bool +WinRowsArePeers(WindowObject winobj, int64 pos1, int64 pos2) +{ + WindowAggState *winstate; + WindowAgg *node; + TupleTableSlot *slot1; + TupleTableSlot *slot2; + bool res; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + node = (WindowAgg *) winstate->ss.ps.plan; + + /* If no ORDER BY, all rows are peers; don't bother to fetch them */ + if (node->ordNumCols == 0) + return true; + + /* + * Note: OK to use temp_slot_2 here because we aren't calling any + * frame-related functions (those tend to clobber temp_slot_2). 
+ */ + slot1 = winstate->temp_slot_1; + slot2 = winstate->temp_slot_2; + + if (!window_gettupleslot(winobj, pos1, slot1)) + elog(ERROR, "specified position is out of window: " INT64_FORMAT, + pos1); + if (!window_gettupleslot(winobj, pos2, slot2)) + elog(ERROR, "specified position is out of window: " INT64_FORMAT, + pos2); + + res = are_peers(winstate, slot1, slot2); + + ExecClearTuple(slot1); + ExecClearTuple(slot2); + + return res; +} + +/* + * WinGetFuncArgInPartition + * Evaluate a window function's argument expression on a specified + * row of the partition. The row is identified in lseek(2) style, + * i.e. relative to the current, first, or last row. + * + * argno: argument number to evaluate (counted from 0) + * relpos: signed rowcount offset from the seek position + * seektype: WINDOW_SEEK_CURRENT, WINDOW_SEEK_HEAD, or WINDOW_SEEK_TAIL + * set_mark: If the row is found and set_mark is true, the mark is moved to + * the row as a side-effect. + * isnull: output argument, receives isnull status of result + * isout: output argument, set to indicate whether target row position + * is out of partition (can pass NULL if caller doesn't care about this) + * + * Specifying a nonexistent row is not an error, it just causes a null result + * (plus setting *isout true, if isout isn't NULL). + */ +Datum +WinGetFuncArgInPartition(WindowObject winobj, int argno, + int relpos, int seektype, bool set_mark, + bool *isnull, bool *isout) +{ + WindowAggState *winstate; + ExprContext *econtext; + TupleTableSlot *slot; + bool gottuple; + int64 abs_pos; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + econtext = winstate->ss.ps.ps_ExprContext; + slot = winstate->temp_slot_1; + + switch (seektype) + { + case WINDOW_SEEK_CURRENT: + abs_pos = winstate->currentpos + relpos; + break; + case WINDOW_SEEK_HEAD: + abs_pos = relpos; + break; + case WINDOW_SEEK_TAIL: + spool_tuples(winstate, -1); + abs_pos = winstate->spooled_rows - 1 + relpos; + break; + default: + elog(ERROR, "unrecognized window seek type: %d", seektype); + abs_pos = 0; /* keep compiler quiet */ + break; + } + + gottuple = window_gettupleslot(winobj, abs_pos, slot); + + if (!gottuple) + { + if (isout) + *isout = true; + *isnull = true; + return (Datum) 0; + } + else + { + if (isout) + *isout = false; + if (set_mark) + WinSetMarkPosition(winobj, abs_pos); + econtext->ecxt_outertuple = slot; + return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno), + econtext, isnull); + } +} + +/* + * WinGetFuncArgInFrame + * Evaluate a window function's argument expression on a specified + * row of the window frame. The row is identified in lseek(2) style, + * i.e. relative to the first or last row of the frame. (We do not + * support WINDOW_SEEK_CURRENT here, because it's not very clear what + * that should mean if the current row isn't part of the frame.) + * + * argno: argument number to evaluate (counted from 0) + * relpos: signed rowcount offset from the seek position + * seektype: WINDOW_SEEK_HEAD or WINDOW_SEEK_TAIL + * set_mark: If the row is found/in frame and set_mark is true, the mark is + * moved to the row as a side-effect. + * isnull: output argument, receives isnull status of result + * isout: output argument, set to indicate whether target row position + * is out of frame (can pass NULL if caller doesn't care about this) + * + * Specifying a nonexistent or not-in-frame row is not an error, it just + * causes a null result (plus setting *isout true, if isout isn't NULL). 
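+ *
+ * For example, first_value() is implemented essentially as
+ * WinGetFuncArgInFrame(winobj, 0, 0, WINDOW_SEEK_HEAD, true, &isnull, NULL);
+ * while last_value() seeks from WINDOW_SEEK_TAIL and nth_value(x, n)
+ * passes relpos = n - 1 from WINDOW_SEEK_HEAD.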
+ * + * Note that some exclusion-clause options lead to situations where the + * rows that are in-frame are not consecutive in the partition. But we + * count only in-frame rows when measuring relpos. + * + * The set_mark flag is interpreted as meaning that the caller will specify + * a constant (or, perhaps, monotonically increasing) relpos in successive + * calls, so that *if there is no exclusion clause* there will be no need + * to fetch a row before the previously fetched row. But we do not expect + * the caller to know how to account for exclusion clauses. Therefore, + * if there is an exclusion clause we take responsibility for adjusting the + * mark request to something that will be safe given the above assumption + * about relpos. + */ +Datum +WinGetFuncArgInFrame(WindowObject winobj, int argno, + int relpos, int seektype, bool set_mark, + bool *isnull, bool *isout) +{ + WindowAggState *winstate; + ExprContext *econtext; + TupleTableSlot *slot; + int64 abs_pos; + int64 mark_pos; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + econtext = winstate->ss.ps.ps_ExprContext; + slot = winstate->temp_slot_1; + + switch (seektype) + { + case WINDOW_SEEK_CURRENT: + elog(ERROR, "WINDOW_SEEK_CURRENT is not supported for WinGetFuncArgInFrame"); + abs_pos = mark_pos = 0; /* keep compiler quiet */ + break; + case WINDOW_SEEK_HEAD: + /* rejecting relpos < 0 is easy and simplifies code below */ + if (relpos < 0) + goto out_of_frame; + update_frameheadpos(winstate); + abs_pos = winstate->frameheadpos + relpos; + mark_pos = abs_pos; + + /* + * Account for exclusion option if one is active, but advance only + * abs_pos not mark_pos. This prevents changes of the current + * row's peer group from resulting in trying to fetch a row before + * some previous mark position. + * + * Note that in some corner cases such as current row being + * outside frame, these calculations are theoretically too simple, + * but it doesn't matter because we'll end up deciding the row is + * out of frame. We do not attempt to avoid fetching rows past + * end of frame; that would happen in some cases anyway. + */ + switch (winstate->frameOptions & FRAMEOPTION_EXCLUSION) + { + case 0: + /* no adjustment needed */ + break; + case FRAMEOPTION_EXCLUDE_CURRENT_ROW: + if (abs_pos >= winstate->currentpos && + winstate->currentpos >= winstate->frameheadpos) + abs_pos++; + break; + case FRAMEOPTION_EXCLUDE_GROUP: + update_grouptailpos(winstate); + if (abs_pos >= winstate->groupheadpos && + winstate->grouptailpos > winstate->frameheadpos) + { + int64 overlapstart = Max(winstate->groupheadpos, + winstate->frameheadpos); + + abs_pos += winstate->grouptailpos - overlapstart; + } + break; + case FRAMEOPTION_EXCLUDE_TIES: + update_grouptailpos(winstate); + if (abs_pos >= winstate->groupheadpos && + winstate->grouptailpos > winstate->frameheadpos) + { + int64 overlapstart = Max(winstate->groupheadpos, + winstate->frameheadpos); + + if (abs_pos == overlapstart) + abs_pos = winstate->currentpos; + else + abs_pos += winstate->grouptailpos - overlapstart - 1; + } + break; + default: + elog(ERROR, "unrecognized frame option state: 0x%x", + winstate->frameOptions); + break; + } + break; + case WINDOW_SEEK_TAIL: + /* rejecting relpos > 0 is easy and simplifies code below */ + if (relpos > 0) + goto out_of_frame; + update_frametailpos(winstate); + abs_pos = winstate->frametailpos - 1 + relpos; + + /* + * Account for exclusion option if one is active. 
If there is no + * exclusion, we can safely set the mark at the accessed row. But + * if there is, we can only mark the frame start, because we can't + * be sure how far back in the frame the exclusion might cause us + * to fetch in future. Furthermore, we have to actually check + * against frameheadpos here, since it's unsafe to try to fetch a + * row before frame start if the mark might be there already. + */ + switch (winstate->frameOptions & FRAMEOPTION_EXCLUSION) + { + case 0: + /* no adjustment needed */ + mark_pos = abs_pos; + break; + case FRAMEOPTION_EXCLUDE_CURRENT_ROW: + if (abs_pos <= winstate->currentpos && + winstate->currentpos < winstate->frametailpos) + abs_pos--; + update_frameheadpos(winstate); + if (abs_pos < winstate->frameheadpos) + goto out_of_frame; + mark_pos = winstate->frameheadpos; + break; + case FRAMEOPTION_EXCLUDE_GROUP: + update_grouptailpos(winstate); + if (abs_pos < winstate->grouptailpos && + winstate->groupheadpos < winstate->frametailpos) + { + int64 overlapend = Min(winstate->grouptailpos, + winstate->frametailpos); + + abs_pos -= overlapend - winstate->groupheadpos; + } + update_frameheadpos(winstate); + if (abs_pos < winstate->frameheadpos) + goto out_of_frame; + mark_pos = winstate->frameheadpos; + break; + case FRAMEOPTION_EXCLUDE_TIES: + update_grouptailpos(winstate); + if (abs_pos < winstate->grouptailpos && + winstate->groupheadpos < winstate->frametailpos) + { + int64 overlapend = Min(winstate->grouptailpos, + winstate->frametailpos); + + if (abs_pos == overlapend - 1) + abs_pos = winstate->currentpos; + else + abs_pos -= overlapend - 1 - winstate->groupheadpos; + } + update_frameheadpos(winstate); + if (abs_pos < winstate->frameheadpos) + goto out_of_frame; + mark_pos = winstate->frameheadpos; + break; + default: + elog(ERROR, "unrecognized frame option state: 0x%x", + winstate->frameOptions); + mark_pos = 0; /* keep compiler quiet */ + break; + } + break; + default: + elog(ERROR, "unrecognized window seek type: %d", seektype); + abs_pos = mark_pos = 0; /* keep compiler quiet */ + break; + } + + if (!window_gettupleslot(winobj, abs_pos, slot)) + goto out_of_frame; + + /* The code above does not detect all out-of-frame cases, so check */ + if (row_is_in_frame(winstate, abs_pos, slot) <= 0) + goto out_of_frame; + + if (isout) + *isout = false; + if (set_mark) + WinSetMarkPosition(winobj, mark_pos); + econtext->ecxt_outertuple = slot; + return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno), + econtext, isnull); + +out_of_frame: + if (isout) + *isout = true; + *isnull = true; + return (Datum) 0; +} + +/* + * WinGetFuncArgCurrent + * Evaluate a window function's argument expression on the current row. + * + * argno: argument number to evaluate (counted from 0) + * isnull: output argument, receives isnull status of result + * + * Note: this isn't quite equivalent to WinGetFuncArgInPartition or + * WinGetFuncArgInFrame targeting the current row, because it will succeed + * even if the WindowObject's mark has been set beyond the current row. + * This should generally be used for "ordinary" arguments of a window + * function, such as the offset argument of lead() or lag(). 
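/*
 * Illustrative sketch (not part of this file): a simplified nth_value(val, n)
 * combining the two functions documented above.  The "ordinary" second
 * argument is read with WinGetFuncArgCurrent(), then the n'th row of the
 * frame is fetched with WinGetFuncArgInFrame().  The function name is
 * invented, error handling is minimal, and set_mark is passed as false
 * because n may differ from row to row.
 */
#include "postgres.h"
#include "fmgr.h"
#include "windowapi.h"

PG_FUNCTION_INFO_V1(my_nth_value);

Datum
my_nth_value(PG_FUNCTION_ARGS)
{
	WindowObject winobj = PG_WINDOW_OBJECT();
	Datum		result;
	bool		isnull;
	int32		nth;

	nth = DatumGetInt32(WinGetFuncArgCurrent(winobj, 1, &isnull));
	if (isnull)
		PG_RETURN_NULL();
	if (nth <= 0)
		elog(ERROR, "argument of nth_value must be greater than zero");

	/* n is 1-based for the user, 0-based relative to WINDOW_SEEK_HEAD */
	result = WinGetFuncArgInFrame(winobj, 0,
								  nth - 1, WINDOW_SEEK_HEAD, false,
								  &isnull, NULL);
	if (isnull)
		PG_RETURN_NULL();

	PG_RETURN_DATUM(result);
}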
+ */ +Datum +WinGetFuncArgCurrent(WindowObject winobj, int argno, bool *isnull) +{ + WindowAggState *winstate; + ExprContext *econtext; + + Assert(WindowObjectIsValid(winobj)); + winstate = winobj->winstate; + + econtext = winstate->ss.ps.ps_ExprContext; + + econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot; + return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno), + econtext, isnull); +} diff --git a/src/backend/executor/nodeWorktablescan.c b/src/backend/executor/nodeWorktablescan.c new file mode 100644 index 0000000..91d3bf3 --- /dev/null +++ b/src/backend/executor/nodeWorktablescan.c @@ -0,0 +1,223 @@ +/*------------------------------------------------------------------------- + * + * nodeWorktablescan.c + * routines to handle WorkTableScan nodes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeWorktablescan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execdebug.h" +#include "executor/nodeWorktablescan.h" + +static TupleTableSlot *WorkTableScanNext(WorkTableScanState *node); + +/* ---------------------------------------------------------------- + * WorkTableScanNext + * + * This is a workhorse for ExecWorkTableScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +WorkTableScanNext(WorkTableScanState *node) +{ + TupleTableSlot *slot; + Tuplestorestate *tuplestorestate; + + /* + * get information from the estate and scan state + * + * Note: we intentionally do not support backward scan. Although it would + * take only a couple more lines here, it would force nodeRecursiveunion.c + * to create the tuplestore with backward scan enabled, which has a + * performance cost. In practice backward scan is never useful for a + * worktable plan node, since it cannot appear high enough in the plan + * tree of a scrollable cursor to be exposed to a backward-scan + * requirement. So it's not worth expending effort to support it. + * + * Note: we are also assuming that this node is the only reader of the + * worktable. Therefore, we don't need a private read pointer for the + * tuplestore, nor do we need to tell tuplestore_gettupleslot to copy. + */ + Assert(ScanDirectionIsForward(node->ss.ps.state->es_direction)); + + tuplestorestate = node->rustate->working_table; + + /* + * Get the next tuple from tuplestore. Return NULL if no more tuples. + */ + slot = node->ss.ss_ScanTupleSlot; + (void) tuplestore_gettupleslot(tuplestorestate, true, false, slot); + return slot; +} + +/* + * WorkTableScanRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +WorkTableScanRecheck(WorkTableScanState *node, TupleTableSlot *slot) +{ + /* nothing to check */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecWorkTableScan(node) + * + * Scans the worktable sequentially and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecWorkTableScan(PlanState *pstate) +{ + WorkTableScanState *node = castNode(WorkTableScanState, pstate); + + /* + * On the first call, find the ancestor RecursiveUnion's state via the + * Param slot reserved for it. 
(We can't do this during node init because + * there are corner cases where we'll get the init call before the + * RecursiveUnion does.) + */ + if (node->rustate == NULL) + { + WorkTableScan *plan = (WorkTableScan *) node->ss.ps.plan; + EState *estate = node->ss.ps.state; + ParamExecData *param; + + param = &(estate->es_param_exec_vals[plan->wtParam]); + Assert(param->execPlan == NULL); + Assert(!param->isnull); + node->rustate = castNode(RecursiveUnionState, DatumGetPointer(param->value)); + Assert(node->rustate); + + /* + * The scan tuple type (ie, the rowtype we expect to find in the work + * table) is the same as the result rowtype of the ancestor + * RecursiveUnion node. Note this depends on the assumption that + * RecursiveUnion doesn't allow projection. + */ + ExecAssignScanType(&node->ss, + ExecGetResultType(&node->rustate->ps)); + + /* + * Now we can initialize the projection info. This must be completed + * before we can call ExecScan(). + */ + ExecAssignScanProjectionInfo(&node->ss); + } + + return ExecScan(&node->ss, + (ExecScanAccessMtd) WorkTableScanNext, + (ExecScanRecheckMtd) WorkTableScanRecheck); +} + + +/* ---------------------------------------------------------------- + * ExecInitWorkTableScan + * ---------------------------------------------------------------- + */ +WorkTableScanState * +ExecInitWorkTableScan(WorkTableScan *node, EState *estate, int eflags) +{ + WorkTableScanState *scanstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * WorkTableScan should not have any children. + */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create new WorkTableScanState for node + */ + scanstate = makeNode(WorkTableScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecWorkTableScan; + scanstate->rustate = NULL; /* we'll set this later */ + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * tuple table initialization + */ + ExecInitResultTypeTL(&scanstate->ss.ps); + + /* signal that return type is not yet known */ + scanstate->ss.ps.resultopsset = true; + scanstate->ss.ps.resultopsfixed = false; + + ExecInitScanTupleSlot(estate, &scanstate->ss, NULL, &TTSOpsMinimalTuple); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate); + + /* + * Do not yet initialize projection info, see ExecWorkTableScan() for + * details. + */ + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndWorkTableScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndWorkTableScan(WorkTableScanState *node) +{ + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecReScanWorkTableScan + * + * Rescans the relation. 
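/*
 * Illustrative note (not part of this file): the producer side of this
 * wtParam handshake lives in nodeRecursiveunion.c.  During its own
 * initialization, the RecursiveUnion node stores a pointer to its planstate
 * in the reserved Param slot, roughly (paraphrased):
 *
 *		prmdata = &(estate->es_param_exec_vals[node->wtParam]);
 *		prmdata->value = PointerGetDatum(rustate);
 *		prmdata->isnull = false;
 *
 * which is what ExecWorkTableScan() above reads back on its first call to
 * locate the shared working_table tuplestore.
 */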
+ * ---------------------------------------------------------------- + */ +void +ExecReScanWorkTableScan(WorkTableScanState *node) +{ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + ExecScanReScan(&node->ss); + + /* No need (or way) to rescan if ExecWorkTableScan not called yet */ + if (node->rustate) + tuplestore_rescan(node->rustate->working_table); +} diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c new file mode 100644 index 0000000..f73c1e7 --- /dev/null +++ b/src/backend/executor/spi.c @@ -0,0 +1,3383 @@ +/*------------------------------------------------------------------------- + * + * spi.c + * Server Programming Interface + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/spi.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/printtup.h" +#include "access/sysattr.h" +#include "access/xact.h" +#include "catalog/heap.h" +#include "catalog/pg_type.h" +#include "commands/trigger.h" +#include "executor/executor.h" +#include "executor/spi_priv.h" +#include "miscadmin.h" +#include "tcop/pquery.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + + +/* + * These global variables are part of the API for various SPI functions + * (a horrible API choice, but it's too late now). To reduce the risk of + * interference between different SPI callers, we save and restore them + * when entering/exiting a SPI nesting level. 
+ */ +uint64 SPI_processed = 0; +SPITupleTable *SPI_tuptable = NULL; +int SPI_result = 0; + +static _SPI_connection *_SPI_stack = NULL; +static _SPI_connection *_SPI_current = NULL; +static int _SPI_stack_depth = 0; /* allocated size of _SPI_stack */ +static int _SPI_connected = -1; /* current stack index */ + +typedef struct SPICallbackArg +{ + const char *query; + RawParseMode mode; +} SPICallbackArg; + +static Portal SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, + ParamListInfo paramLI, bool read_only); + +static void _SPI_prepare_plan(const char *src, SPIPlanPtr plan); + +static void _SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan); + +static int _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, + Snapshot snapshot, Snapshot crosscheck_snapshot, + bool fire_triggers); + +static ParamListInfo _SPI_convert_params(int nargs, Oid *argtypes, + Datum *Values, const char *Nulls); + +static int _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount); + +static void _SPI_error_callback(void *arg); + +static void _SPI_cursor_operation(Portal portal, + FetchDirection direction, long count, + DestReceiver *dest); + +static SPIPlanPtr _SPI_make_plan_non_temp(SPIPlanPtr plan); +static SPIPlanPtr _SPI_save_plan(SPIPlanPtr plan); + +static int _SPI_begin_call(bool use_exec); +static int _SPI_end_call(bool use_exec); +static MemoryContext _SPI_execmem(void); +static MemoryContext _SPI_procmem(void); +static bool _SPI_checktuples(void); + + +/* =================== interface functions =================== */ + +int +SPI_connect(void) +{ + return SPI_connect_ext(0); +} + +int +SPI_connect_ext(int options) +{ + int newdepth; + + /* Enlarge stack if necessary */ + if (_SPI_stack == NULL) + { + if (_SPI_connected != -1 || _SPI_stack_depth != 0) + elog(ERROR, "SPI stack corrupted"); + newdepth = 16; + _SPI_stack = (_SPI_connection *) + MemoryContextAlloc(TopMemoryContext, + newdepth * sizeof(_SPI_connection)); + _SPI_stack_depth = newdepth; + } + else + { + if (_SPI_stack_depth <= 0 || _SPI_stack_depth <= _SPI_connected) + elog(ERROR, "SPI stack corrupted"); + if (_SPI_stack_depth == _SPI_connected + 1) + { + newdepth = _SPI_stack_depth * 2; + _SPI_stack = (_SPI_connection *) + repalloc(_SPI_stack, + newdepth * sizeof(_SPI_connection)); + _SPI_stack_depth = newdepth; + } + } + + /* Enter new stack level */ + _SPI_connected++; + Assert(_SPI_connected >= 0 && _SPI_connected < _SPI_stack_depth); + + _SPI_current = &(_SPI_stack[_SPI_connected]); + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + _SPI_current->execSubid = InvalidSubTransactionId; + slist_init(&_SPI_current->tuptables); + _SPI_current->procCxt = NULL; /* in case we fail to create 'em */ + _SPI_current->execCxt = NULL; + _SPI_current->connectSubid = GetCurrentSubTransactionId(); + _SPI_current->queryEnv = NULL; + _SPI_current->atomic = (options & SPI_OPT_NONATOMIC ? false : true); + _SPI_current->internal_xact = false; + _SPI_current->outer_processed = SPI_processed; + _SPI_current->outer_tuptable = SPI_tuptable; + _SPI_current->outer_result = SPI_result; + + /* + * Create memory contexts for this procedure + * + * In atomic contexts (the normal case), we use TopTransactionContext, + * otherwise PortalContext, so that it lives across transaction + * boundaries. + * + * XXX It could be better to use PortalContext as the parent context in + * all cases, but we may not be inside a portal (consider deferred-trigger + * execution). Perhaps CurTransactionContext could be an option? 
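/*
 * Illustrative sketch (not part of this file): the canonical calling pattern
 * the machinery above supports.  All SPI work is bracketed between
 * SPI_connect() and SPI_finish(), and the per-nesting-level globals
 * (SPI_processed, SPI_tuptable) are consumed in between.  The function and
 * the "widgets" table are invented.
 */
#include "postgres.h"
#include "executor/spi.h"

static void
count_widgets(void)
{
	int			ret;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	ret = SPI_execute("SELECT count(*) FROM widgets", true, 0);
	if (ret != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute failed: error code %d", ret);

	if (SPI_processed > 0)
	{
		bool		isnull;
		Datum		count;

		count = SPI_getbinval(SPI_tuptable->vals[0],
							  SPI_tuptable->tupdesc,
							  1, &isnull);
		if (!isnull)
			elog(INFO, "widgets: " INT64_FORMAT, DatumGetInt64(count));
	}

	SPI_finish();
}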
For now + * it doesn't matter because we clean up explicitly in AtEOSubXact_SPI(); + * but see also AtEOXact_SPI(). + */ + _SPI_current->procCxt = AllocSetContextCreate(_SPI_current->atomic ? TopTransactionContext : PortalContext, + "SPI Proc", + ALLOCSET_DEFAULT_SIZES); + _SPI_current->execCxt = AllocSetContextCreate(_SPI_current->atomic ? TopTransactionContext : _SPI_current->procCxt, + "SPI Exec", + ALLOCSET_DEFAULT_SIZES); + /* ... and switch to procedure's context */ + _SPI_current->savedcxt = MemoryContextSwitchTo(_SPI_current->procCxt); + + /* + * Reset API global variables so that current caller cannot accidentally + * depend on state of an outer caller. + */ + SPI_processed = 0; + SPI_tuptable = NULL; + SPI_result = 0; + + return SPI_OK_CONNECT; +} + +int +SPI_finish(void) +{ + int res; + + res = _SPI_begin_call(false); /* just check we're connected */ + if (res < 0) + return res; + + /* Restore memory context as it was before procedure call */ + MemoryContextSwitchTo(_SPI_current->savedcxt); + + /* Release memory used in procedure call (including tuptables) */ + MemoryContextDelete(_SPI_current->execCxt); + _SPI_current->execCxt = NULL; + MemoryContextDelete(_SPI_current->procCxt); + _SPI_current->procCxt = NULL; + + /* + * Restore outer API variables, especially SPI_tuptable which is probably + * pointing at a just-deleted tuptable + */ + SPI_processed = _SPI_current->outer_processed; + SPI_tuptable = _SPI_current->outer_tuptable; + SPI_result = _SPI_current->outer_result; + + /* Exit stack level */ + _SPI_connected--; + if (_SPI_connected < 0) + _SPI_current = NULL; + else + _SPI_current = &(_SPI_stack[_SPI_connected]); + + return SPI_OK_FINISH; +} + +/* + * SPI_start_transaction is a no-op, kept for backwards compatibility. + * SPI callers are *always* inside a transaction. + */ +void +SPI_start_transaction(void) +{ +} + +static void +_SPI_commit(bool chain) +{ + MemoryContext oldcontext = CurrentMemoryContext; + + /* + * Complain if we are in a context that doesn't permit transaction + * termination. (Note: here and _SPI_rollback should be the only places + * that throw ERRCODE_INVALID_TRANSACTION_TERMINATION, so that callers can + * test for that with security that they know what happened.) + */ + if (_SPI_current->atomic) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("invalid transaction termination"))); + + /* + * This restriction is required by PLs implemented on top of SPI. They + * use subtransactions to establish exception blocks that are supposed to + * be rolled back together if there is an error. Terminating the + * top-level transaction in such a block violates that idea. A future PL + * implementation might have different ideas about this, in which case + * this restriction would have to be refined or the check possibly be + * moved out of SPI into the PLs. Note however that the code below relies + * on not being within a subtransaction. + */ + if (IsSubTransaction()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("cannot commit while a subtransaction is active"))); + + /* XXX this ain't re-entrant enough for my taste */ + if (chain) + SaveTransactionCharacteristics(); + + /* Catch any error occurring during the COMMIT */ + PG_TRY(); + { + /* Protect current SPI stack entry against deletion */ + _SPI_current->internal_xact = true; + + /* + * Hold any pinned portals that any PLs might be using. 
We have to do + * this before changing transaction state, since this will run + * user-defined code that might throw an error. + */ + HoldPinnedPortals(); + + /* Release snapshots associated with portals */ + ForgetPortalSnapshots(); + + /* Do the deed */ + CommitTransactionCommand(); + + /* Immediately start a new transaction */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + } + PG_CATCH(); + { + ErrorData *edata; + + /* Save error info in caller's context */ + MemoryContextSwitchTo(oldcontext); + edata = CopyErrorData(); + FlushErrorState(); + + /* + * Abort the failed transaction. If this fails too, we'll just + * propagate the error out ... there's not that much we can do. + */ + AbortCurrentTransaction(); + + /* ... and start a new one */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + + /* Now that we've cleaned up the transaction, re-throw the error */ + ReThrowError(edata); + } + PG_END_TRY(); +} + +void +SPI_commit(void) +{ + _SPI_commit(false); +} + +void +SPI_commit_and_chain(void) +{ + _SPI_commit(true); +} + +static void +_SPI_rollback(bool chain) +{ + MemoryContext oldcontext = CurrentMemoryContext; + + /* see under SPI_commit() */ + if (_SPI_current->atomic) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("invalid transaction termination"))); + + /* see under SPI_commit() */ + if (IsSubTransaction()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), + errmsg("cannot roll back while a subtransaction is active"))); + + /* XXX this ain't re-entrant enough for my taste */ + if (chain) + SaveTransactionCharacteristics(); + + /* Catch any error occurring during the ROLLBACK */ + PG_TRY(); + { + /* Protect current SPI stack entry against deletion */ + _SPI_current->internal_xact = true; + + /* + * Hold any pinned portals that any PLs might be using. We have to do + * this before changing transaction state, since this will run + * user-defined code that might throw an error, and in any case + * couldn't be run in an already-aborted transaction. + */ + HoldPinnedPortals(); + + /* Release snapshots associated with portals */ + ForgetPortalSnapshots(); + + /* Do the deed */ + AbortCurrentTransaction(); + + /* Immediately start a new transaction */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + } + PG_CATCH(); + { + ErrorData *edata; + + /* Save error info in caller's context */ + MemoryContextSwitchTo(oldcontext); + edata = CopyErrorData(); + FlushErrorState(); + + /* + * Try again to abort the failed transaction. If this fails too, + * we'll just propagate the error out ... there's not that much we can + * do. + */ + AbortCurrentTransaction(); + + /* ... and start a new one */ + StartTransactionCommand(); + if (chain) + RestoreTransactionCharacteristics(); + + MemoryContextSwitchTo(oldcontext); + + _SPI_current->internal_xact = false; + + /* Now that we've cleaned up the transaction, re-throw the error */ + ReThrowError(edata); + } + PG_END_TRY(); +} + +void +SPI_rollback(void) +{ + _SPI_rollback(false); +} + +void +SPI_rollback_and_chain(void) +{ + _SPI_rollback(true); +} + +/* + * SPICleanup is a no-op, kept for backwards compatibility. We rely on + * AtEOXact_SPI to cleanup. 
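/*
 * Illustrative sketch (not part of this file): how a procedural language
 * would use the nonatomic machinery above.  Connecting with
 * SPI_OPT_NONATOMIC makes SPI_commit()/SPI_rollback() permissible; this
 * still only succeeds when the surrounding context actually allows
 * transaction control (e.g. a CALL outside an explicit transaction block),
 * otherwise _SPI_commit() raises the error shown above.  The function and
 * table names are invented.
 */
#include "postgres.h"
#include "executor/spi.h"

static void
log_and_commit(void)
{
	if (SPI_connect_ext(SPI_OPT_NONATOMIC) != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect_ext failed");

	if (SPI_execute("INSERT INTO audit_log DEFAULT VALUES", false, 0) < 0)
		elog(ERROR, "SPI_execute failed");

	/* Commit the surrounding transaction; a fresh one is started for us. */
	SPI_commit();

	SPI_finish();
}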
Extensions should not (need to) fiddle with the + * internal SPI state directly. + */ +void +SPICleanup(void) +{ +} + +/* + * Clean up SPI state at transaction commit or abort. + */ +void +AtEOXact_SPI(bool isCommit) +{ + bool found = false; + + /* + * Pop stack entries, stopping if we find one marked internal_xact (that + * one belongs to the caller of SPI_commit or SPI_abort). + */ + while (_SPI_connected >= 0) + { + _SPI_connection *connection = &(_SPI_stack[_SPI_connected]); + + if (connection->internal_xact) + break; + + found = true; + + /* + * We need not release the procedure's memory contexts explicitly, as + * they'll go away automatically when their parent context does; see + * notes in SPI_connect_ext. + */ + + /* + * Restore outer global variables and pop the stack entry. Unlike + * SPI_finish(), we don't risk switching to memory contexts that might + * be already gone. + */ + SPI_processed = connection->outer_processed; + SPI_tuptable = connection->outer_tuptable; + SPI_result = connection->outer_result; + + _SPI_connected--; + if (_SPI_connected < 0) + _SPI_current = NULL; + else + _SPI_current = &(_SPI_stack[_SPI_connected]); + } + + /* We should only find entries to pop during an ABORT. */ + if (found && isCommit) + ereport(WARNING, + (errcode(ERRCODE_WARNING), + errmsg("transaction left non-empty SPI stack"), + errhint("Check for missing \"SPI_finish\" calls."))); +} + +/* + * Clean up SPI state at subtransaction commit or abort. + * + * During commit, there shouldn't be any unclosed entries remaining from + * the current subtransaction; we emit a warning if any are found. + */ +void +AtEOSubXact_SPI(bool isCommit, SubTransactionId mySubid) +{ + bool found = false; + + while (_SPI_connected >= 0) + { + _SPI_connection *connection = &(_SPI_stack[_SPI_connected]); + + if (connection->connectSubid != mySubid) + break; /* couldn't be any underneath it either */ + + if (connection->internal_xact) + break; + + found = true; + + /* + * Release procedure memory explicitly (see note in SPI_connect) + */ + if (connection->execCxt) + { + MemoryContextDelete(connection->execCxt); + connection->execCxt = NULL; + } + if (connection->procCxt) + { + MemoryContextDelete(connection->procCxt); + connection->procCxt = NULL; + } + + /* + * Restore outer global variables and pop the stack entry. Unlike + * SPI_finish(), we don't risk switching to memory contexts that might + * be already gone. + */ + SPI_processed = connection->outer_processed; + SPI_tuptable = connection->outer_tuptable; + SPI_result = connection->outer_result; + + _SPI_connected--; + if (_SPI_connected < 0) + _SPI_current = NULL; + else + _SPI_current = &(_SPI_stack[_SPI_connected]); + } + + if (found && isCommit) + ereport(WARNING, + (errcode(ERRCODE_WARNING), + errmsg("subtransaction left non-empty SPI stack"), + errhint("Check for missing \"SPI_finish\" calls."))); + + /* + * If we are aborting a subtransaction and there is an open SPI context + * surrounding the subxact, clean up to prevent memory leakage. + */ + if (_SPI_current && !isCommit) + { + slist_mutable_iter siter; + + /* + * Throw away executor state if current executor operation was started + * within current subxact (essentially, force a _SPI_end_call(true)). 
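/*
 * Illustrative note (not part of this file): the WARNING paths above fire
 * when a C function connects to SPI but returns without calling
 * SPI_finish(), for example:
 *
 *		SPI_connect();
 *		(void) SPI_execute("SELECT 1", true, 0);
 *		return;		(missing SPI_finish() here)
 *
 * At transaction commit, AtEOXact_SPI() pops the leftover stack entry and
 * reports "transaction left non-empty SPI stack", with a hint to check for
 * missing SPI_finish calls.
 */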
+ */ + if (_SPI_current->execSubid >= mySubid) + { + _SPI_current->execSubid = InvalidSubTransactionId; + MemoryContextResetAndDeleteChildren(_SPI_current->execCxt); + } + + /* throw away any tuple tables created within current subxact */ + slist_foreach_modify(siter, &_SPI_current->tuptables) + { + SPITupleTable *tuptable; + + tuptable = slist_container(SPITupleTable, next, siter.cur); + if (tuptable->subid >= mySubid) + { + /* + * If we used SPI_freetuptable() here, its internal search of + * the tuptables list would make this operation O(N^2). + * Instead, just free the tuptable manually. This should + * match what SPI_freetuptable() does. + */ + slist_delete_current(&siter); + if (tuptable == _SPI_current->tuptable) + _SPI_current->tuptable = NULL; + if (tuptable == SPI_tuptable) + SPI_tuptable = NULL; + MemoryContextDelete(tuptable->tuptabcxt); + } + } + } +} + +/* + * Are we executing inside a procedure (that is, a nonatomic SPI context)? + */ +bool +SPI_inside_nonatomic_context(void) +{ + if (_SPI_current == NULL) + return false; /* not in any SPI context at all */ + if (_SPI_current->atomic) + return false; /* it's atomic (ie function not procedure) */ + return true; +} + + +/* Parse, plan, and execute a query string */ +int +SPI_execute(const char *src, bool read_only, long tcount) +{ + _SPI_plan plan; + SPIExecuteOptions options; + int res; + + if (src == NULL || tcount < 0) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = CURSOR_OPT_PARALLEL_OK; + + _SPI_prepare_oneshot_plan(src, &plan); + + memset(&options, 0, sizeof(options)); + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(&plan, &options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* Obsolete version of SPI_execute */ +int +SPI_exec(const char *src, long tcount) +{ + return SPI_execute(src, false, tcount); +} + +/* Parse, plan, and execute a query string, with extensible options */ +int +SPI_execute_extended(const char *src, + const SPIExecuteOptions *options) +{ + int res; + _SPI_plan plan; + + if (src == NULL || options == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = CURSOR_OPT_PARALLEL_OK; + if (options->params) + { + plan.parserSetup = options->params->parserSetup; + plan.parserSetupArg = options->params->parserSetupArg; + } + + _SPI_prepare_oneshot_plan(src, &plan); + + res = _SPI_execute_plan(&plan, options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* Execute a previously prepared plan */ +int +SPI_execute_plan(SPIPlanPtr plan, Datum *Values, const char *Nulls, + bool read_only, long tcount) +{ + SPIExecuteOptions options; + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0) + return SPI_ERROR_ARGUMENT; + + if (plan->nargs > 0 && Values == NULL) + return SPI_ERROR_PARAM; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&options, 0, sizeof(options)); + options.params = _SPI_convert_params(plan->nargs, plan->argtypes, + Values, Nulls); + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(plan, &options, + InvalidSnapshot, InvalidSnapshot, + 
true); + + _SPI_end_call(true); + return res; +} + +/* Obsolete version of SPI_execute_plan */ +int +SPI_execp(SPIPlanPtr plan, Datum *Values, const char *Nulls, long tcount) +{ + return SPI_execute_plan(plan, Values, Nulls, false, tcount); +} + +/* Execute a previously prepared plan */ +int +SPI_execute_plan_extended(SPIPlanPtr plan, + const SPIExecuteOptions *options) +{ + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || options == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + res = _SPI_execute_plan(plan, options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* Execute a previously prepared plan */ +int +SPI_execute_plan_with_paramlist(SPIPlanPtr plan, ParamListInfo params, + bool read_only, long tcount) +{ + SPIExecuteOptions options; + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&options, 0, sizeof(options)); + options.params = params; + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(plan, &options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +/* + * SPI_execute_snapshot -- identical to SPI_execute_plan, except that we allow + * the caller to specify exactly which snapshots to use, which will be + * registered here. Also, the caller may specify that AFTER triggers should be + * queued as part of the outer query rather than being fired immediately at the + * end of the command. + * + * This is currently not documented in spi.sgml because it is only intended + * for use by RI triggers. + * + * Passing snapshot == InvalidSnapshot will select the normal behavior of + * fetching a new snapshot for each query. + */ +int +SPI_execute_snapshot(SPIPlanPtr plan, + Datum *Values, const char *Nulls, + Snapshot snapshot, Snapshot crosscheck_snapshot, + bool read_only, bool fire_triggers, long tcount) +{ + SPIExecuteOptions options; + int res; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || tcount < 0) + return SPI_ERROR_ARGUMENT; + + if (plan->nargs > 0 && Values == NULL) + return SPI_ERROR_PARAM; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&options, 0, sizeof(options)); + options.params = _SPI_convert_params(plan->nargs, plan->argtypes, + Values, Nulls); + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(plan, &options, + snapshot, crosscheck_snapshot, + fire_triggers); + + _SPI_end_call(true); + return res; +} + +/* + * SPI_execute_with_args -- plan and execute a query with supplied arguments + * + * This is functionally equivalent to SPI_prepare followed by + * SPI_execute_plan. 
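/*
 * Illustrative sketch (not part of this file): the prepared-plan variant of
 * the calling pattern.  A plan is built once with SPI_prepare() (shown just
 * below) and run with SPI_execute_plan(), passing parameter values
 * positionally; a NULL "Nulls" string means no parameter is null.  The
 * function, table, and column names are invented.
 */
#include "postgres.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"

static uint64
delete_old_orders(int32 cutoff_id)
{
	Oid			argtypes[1] = {INT4OID};
	Datum		values[1];
	SPIPlanPtr	plan;
	uint64		ndeleted;
	int			ret;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	plan = SPI_prepare("DELETE FROM orders WHERE order_id < $1",
					   1, argtypes);
	if (plan == NULL)
		elog(ERROR, "SPI_prepare failed: %s",
			 SPI_result_code_string(SPI_result));

	values[0] = Int32GetDatum(cutoff_id);

	ret = SPI_execute_plan(plan, values, NULL, false, 0);
	if (ret != SPI_OK_DELETE)
		elog(ERROR, "SPI_execute_plan failed: %s",
			 SPI_result_code_string(ret));

	ndeleted = SPI_processed;	/* save before SPI_finish() restores it */

	SPI_finish();
	return ndeleted;
}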
+ */ +int +SPI_execute_with_args(const char *src, + int nargs, Oid *argtypes, + Datum *Values, const char *Nulls, + bool read_only, long tcount) +{ + int res; + _SPI_plan plan; + ParamListInfo paramLI; + SPIExecuteOptions options; + + if (src == NULL || nargs < 0 || tcount < 0) + return SPI_ERROR_ARGUMENT; + + if (nargs > 0 && (argtypes == NULL || Values == NULL)) + return SPI_ERROR_PARAM; + + res = _SPI_begin_call(true); + if (res < 0) + return res; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = CURSOR_OPT_PARALLEL_OK; + plan.nargs = nargs; + plan.argtypes = argtypes; + plan.parserSetup = NULL; + plan.parserSetupArg = NULL; + + paramLI = _SPI_convert_params(nargs, argtypes, + Values, Nulls); + + _SPI_prepare_oneshot_plan(src, &plan); + + memset(&options, 0, sizeof(options)); + options.params = paramLI; + options.read_only = read_only; + options.tcount = tcount; + + res = _SPI_execute_plan(&plan, &options, + InvalidSnapshot, InvalidSnapshot, + true); + + _SPI_end_call(true); + return res; +} + +SPIPlanPtr +SPI_prepare(const char *src, int nargs, Oid *argtypes) +{ + return SPI_prepare_cursor(src, nargs, argtypes, 0); +} + +SPIPlanPtr +SPI_prepare_cursor(const char *src, int nargs, Oid *argtypes, + int cursorOptions) +{ + _SPI_plan plan; + SPIPlanPtr result; + + if (src == NULL || nargs < 0 || (nargs > 0 && argtypes == NULL)) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + return NULL; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = cursorOptions; + plan.nargs = nargs; + plan.argtypes = argtypes; + plan.parserSetup = NULL; + plan.parserSetupArg = NULL; + + _SPI_prepare_plan(src, &plan); + + /* copy plan to procedure context */ + result = _SPI_make_plan_non_temp(&plan); + + _SPI_end_call(true); + + return result; +} + +SPIPlanPtr +SPI_prepare_extended(const char *src, + const SPIPrepareOptions *options) +{ + _SPI_plan plan; + SPIPlanPtr result; + + if (src == NULL || options == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + return NULL; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = options->parseMode; + plan.cursor_options = options->cursorOptions; + plan.nargs = 0; + plan.argtypes = NULL; + plan.parserSetup = options->parserSetup; + plan.parserSetupArg = options->parserSetupArg; + + _SPI_prepare_plan(src, &plan); + + /* copy plan to procedure context */ + result = _SPI_make_plan_non_temp(&plan); + + _SPI_end_call(true); + + return result; +} + +SPIPlanPtr +SPI_prepare_params(const char *src, + ParserSetupHook parserSetup, + void *parserSetupArg, + int cursorOptions) +{ + _SPI_plan plan; + SPIPlanPtr result; + + if (src == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + return NULL; + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = cursorOptions; + plan.nargs = 0; + plan.argtypes = NULL; + plan.parserSetup = parserSetup; + plan.parserSetupArg = parserSetupArg; + + _SPI_prepare_plan(src, &plan); + + /* copy plan to procedure context */ + result = _SPI_make_plan_non_temp(&plan); + + _SPI_end_call(true); + + return result; +} + +int +SPI_keepplan(SPIPlanPtr 
plan) +{ + ListCell *lc; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || + plan->saved || plan->oneshot) + return SPI_ERROR_ARGUMENT; + + /* + * Mark it saved, reparent it under CacheMemoryContext, and mark all the + * component CachedPlanSources as saved. This sequence cannot fail + * partway through, so there's no risk of long-term memory leakage. + */ + plan->saved = true; + MemoryContextSetParent(plan->plancxt, CacheMemoryContext); + + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + SaveCachedPlan(plansource); + } + + return 0; +} + +SPIPlanPtr +SPI_saveplan(SPIPlanPtr plan) +{ + SPIPlanPtr newplan; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + SPI_result = _SPI_begin_call(false); /* don't change context */ + if (SPI_result < 0) + return NULL; + + newplan = _SPI_save_plan(plan); + + SPI_result = _SPI_end_call(false); + + return newplan; +} + +int +SPI_freeplan(SPIPlanPtr plan) +{ + ListCell *lc; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + return SPI_ERROR_ARGUMENT; + + /* Release the plancache entries */ + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + DropCachedPlan(plansource); + } + + /* Now get rid of the _SPI_plan and subsidiary data in its plancxt */ + MemoryContextDelete(plan->plancxt); + + return 0; +} + +HeapTuple +SPI_copytuple(HeapTuple tuple) +{ + MemoryContext oldcxt; + HeapTuple ctuple; + + if (tuple == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + if (_SPI_current == NULL) + { + SPI_result = SPI_ERROR_UNCONNECTED; + return NULL; + } + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + ctuple = heap_copytuple(tuple); + + MemoryContextSwitchTo(oldcxt); + + return ctuple; +} + +HeapTupleHeader +SPI_returntuple(HeapTuple tuple, TupleDesc tupdesc) +{ + MemoryContext oldcxt; + HeapTupleHeader dtup; + + if (tuple == NULL || tupdesc == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + if (_SPI_current == NULL) + { + SPI_result = SPI_ERROR_UNCONNECTED; + return NULL; + } + + /* For RECORD results, make sure a typmod has been assigned */ + if (tupdesc->tdtypeid == RECORDOID && + tupdesc->tdtypmod < 0) + assign_record_type_typmod(tupdesc); + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + dtup = DatumGetHeapTupleHeader(heap_copy_tuple_as_datum(tuple, tupdesc)); + + MemoryContextSwitchTo(oldcxt); + + return dtup; +} + +HeapTuple +SPI_modifytuple(Relation rel, HeapTuple tuple, int natts, int *attnum, + Datum *Values, const char *Nulls) +{ + MemoryContext oldcxt; + HeapTuple mtuple; + int numberOfAttributes; + Datum *v; + bool *n; + int i; + + if (rel == NULL || tuple == NULL || natts < 0 || attnum == NULL || Values == NULL) + { + SPI_result = SPI_ERROR_ARGUMENT; + return NULL; + } + + if (_SPI_current == NULL) + { + SPI_result = SPI_ERROR_UNCONNECTED; + return NULL; + } + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + SPI_result = 0; + + numberOfAttributes = rel->rd_att->natts; + v = (Datum *) palloc(numberOfAttributes * sizeof(Datum)); + n = (bool *) palloc(numberOfAttributes * sizeof(bool)); + + /* fetch old values and nulls */ + heap_deform_tuple(tuple, rel->rd_att, v, n); + + /* replace values and nulls */ + for (i = 0; i < natts; i++) + { + if (attnum[i] <= 0 || attnum[i] > numberOfAttributes) + break; + v[attnum[i] - 1] = Values[i]; + n[attnum[i] - 1] = (Nulls && Nulls[i] == 'n') ? 
true : false; + } + + if (i == natts) /* no errors in *attnum */ + { + mtuple = heap_form_tuple(rel->rd_att, v, n); + + /* + * copy the identification info of the old tuple: t_ctid, t_self, and + * OID (if any) + */ + mtuple->t_data->t_ctid = tuple->t_data->t_ctid; + mtuple->t_self = tuple->t_self; + mtuple->t_tableOid = tuple->t_tableOid; + } + else + { + mtuple = NULL; + SPI_result = SPI_ERROR_NOATTRIBUTE; + } + + pfree(v); + pfree(n); + + MemoryContextSwitchTo(oldcxt); + + return mtuple; +} + +int +SPI_fnumber(TupleDesc tupdesc, const char *fname) +{ + int res; + const FormData_pg_attribute *sysatt; + + for (res = 0; res < tupdesc->natts; res++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, res); + + if (namestrcmp(&attr->attname, fname) == 0 && + !attr->attisdropped) + return res + 1; + } + + sysatt = SystemAttributeByName(fname); + if (sysatt != NULL) + return sysatt->attnum; + + /* SPI_ERROR_NOATTRIBUTE is different from all sys column numbers */ + return SPI_ERROR_NOATTRIBUTE; +} + +char * +SPI_fname(TupleDesc tupdesc, int fnumber) +{ + const FormData_pg_attribute *att; + + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return NULL; + } + + if (fnumber > 0) + att = TupleDescAttr(tupdesc, fnumber - 1); + else + att = SystemAttributeDefinition(fnumber); + + return pstrdup(NameStr(att->attname)); +} + +char * +SPI_getvalue(HeapTuple tuple, TupleDesc tupdesc, int fnumber) +{ + Datum val; + bool isnull; + Oid typoid, + foutoid; + bool typisvarlena; + + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return NULL; + } + + val = heap_getattr(tuple, fnumber, tupdesc, &isnull); + if (isnull) + return NULL; + + if (fnumber > 0) + typoid = TupleDescAttr(tupdesc, fnumber - 1)->atttypid; + else + typoid = (SystemAttributeDefinition(fnumber))->atttypid; + + getTypeOutputInfo(typoid, &foutoid, &typisvarlena); + + return OidOutputFunctionCall(foutoid, val); +} + +Datum +SPI_getbinval(HeapTuple tuple, TupleDesc tupdesc, int fnumber, bool *isnull) +{ + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + *isnull = true; + return (Datum) NULL; + } + + return heap_getattr(tuple, fnumber, tupdesc, isnull); +} + +char * +SPI_gettype(TupleDesc tupdesc, int fnumber) +{ + Oid typoid; + HeapTuple typeTuple; + char *result; + + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return NULL; + } + + if (fnumber > 0) + typoid = TupleDescAttr(tupdesc, fnumber - 1)->atttypid; + else + typoid = (SystemAttributeDefinition(fnumber))->atttypid; + + typeTuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); + + if (!HeapTupleIsValid(typeTuple)) + { + SPI_result = SPI_ERROR_TYPUNKNOWN; + return NULL; + } + + result = pstrdup(NameStr(((Form_pg_type) GETSTRUCT(typeTuple))->typname)); + ReleaseSysCache(typeTuple); + return result; +} + +/* + * Get the data type OID for a column. + * + * There's nothing similar for typmod and typcollation. The rare consumers + * thereof should inspect the TupleDesc directly. 
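/*
 * Illustrative sketch (not part of this file): tying the column accessors
 * above together.  A result column is looked up by name with SPI_fnumber()
 * and then read as text with SPI_getvalue().  The function name is invented;
 * pg_user/usename are the regular system view and column.
 */
#include "postgres.h"
#include "executor/spi.h"

static void
print_user_names(void)
{
	int			ret;
	int			col;
	uint64		i;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	ret = SPI_execute("SELECT usename FROM pg_user", true, 0);
	if (ret != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute failed: %s", SPI_result_code_string(ret));

	col = SPI_fnumber(SPI_tuptable->tupdesc, "usename");
	if (col <= 0)
		elog(ERROR, "column \"usename\" not found");

	for (i = 0; i < SPI_processed; i++)
	{
		char	   *val = SPI_getvalue(SPI_tuptable->vals[i],
									   SPI_tuptable->tupdesc, col);

		elog(INFO, "user: %s", val ? val : "(null)");
	}

	SPI_finish();
}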
+ */ +Oid +SPI_gettypeid(TupleDesc tupdesc, int fnumber) +{ + SPI_result = 0; + + if (fnumber > tupdesc->natts || fnumber == 0 || + fnumber <= FirstLowInvalidHeapAttributeNumber) + { + SPI_result = SPI_ERROR_NOATTRIBUTE; + return InvalidOid; + } + + if (fnumber > 0) + return TupleDescAttr(tupdesc, fnumber - 1)->atttypid; + else + return (SystemAttributeDefinition(fnumber))->atttypid; +} + +char * +SPI_getrelname(Relation rel) +{ + return pstrdup(RelationGetRelationName(rel)); +} + +char * +SPI_getnspname(Relation rel) +{ + return get_namespace_name(RelationGetNamespace(rel)); +} + +void * +SPI_palloc(Size size) +{ + if (_SPI_current == NULL) + elog(ERROR, "SPI_palloc called while not connected to SPI"); + + return MemoryContextAlloc(_SPI_current->savedcxt, size); +} + +void * +SPI_repalloc(void *pointer, Size size) +{ + /* No longer need to worry which context chunk was in... */ + return repalloc(pointer, size); +} + +void +SPI_pfree(void *pointer) +{ + /* No longer need to worry which context chunk was in... */ + pfree(pointer); +} + +Datum +SPI_datumTransfer(Datum value, bool typByVal, int typLen) +{ + MemoryContext oldcxt; + Datum result; + + if (_SPI_current == NULL) + elog(ERROR, "SPI_datumTransfer called while not connected to SPI"); + + oldcxt = MemoryContextSwitchTo(_SPI_current->savedcxt); + + result = datumTransfer(value, typByVal, typLen); + + MemoryContextSwitchTo(oldcxt); + + return result; +} + +void +SPI_freetuple(HeapTuple tuple) +{ + /* No longer need to worry which context tuple was in... */ + heap_freetuple(tuple); +} + +void +SPI_freetuptable(SPITupleTable *tuptable) +{ + bool found = false; + + /* ignore call if NULL pointer */ + if (tuptable == NULL) + return; + + /* + * Search only the topmost SPI context for a matching tuple table. + */ + if (_SPI_current != NULL) + { + slist_mutable_iter siter; + + /* find tuptable in active list, then remove it */ + slist_foreach_modify(siter, &_SPI_current->tuptables) + { + SPITupleTable *tt; + + tt = slist_container(SPITupleTable, next, siter.cur); + if (tt == tuptable) + { + slist_delete_current(&siter); + found = true; + break; + } + } + } + + /* + * Refuse the deletion if we didn't find it in the topmost SPI context. + * This is primarily a guard against double deletion, but might prevent + * other errors as well. Since the worst consequence of not deleting a + * tuptable would be a transient memory leak, this is just a WARNING. + */ + if (!found) + { + elog(WARNING, "attempt to delete invalid SPITupleTable %p", tuptable); + return; + } + + /* for safety, reset global variables that might point at tuptable */ + if (tuptable == _SPI_current->tuptable) + _SPI_current->tuptable = NULL; + if (tuptable == SPI_tuptable) + SPI_tuptable = NULL; + + /* release all memory belonging to tuptable */ + MemoryContextDelete(tuptable->tuptabcxt); +} + + +/* + * SPI_cursor_open() + * + * Open a prepared SPI plan as a portal + */ +Portal +SPI_cursor_open(const char *name, SPIPlanPtr plan, + Datum *Values, const char *Nulls, + bool read_only) +{ + Portal portal; + ParamListInfo paramLI; + + /* build transient ParamListInfo in caller's context */ + paramLI = _SPI_convert_params(plan->nargs, plan->argtypes, + Values, Nulls); + + portal = SPI_cursor_open_internal(name, plan, paramLI, read_only); + + /* done with the transient ParamListInfo */ + if (paramLI) + pfree(paramLI); + + return portal; +} + + +/* + * SPI_cursor_open_with_args() + * + * Parse and plan a query and open it as a portal. 
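/*
 * Illustrative sketch (not part of this file): the intended use of
 * SPI_palloc() above.  Anything that must outlive SPI_finish() (here, a
 * string handed back to the caller) is allocated in the upper executor
 * context instead of the SPI procedure context.  The function name is
 * invented.
 */
#include "postgres.h"
#include "executor/spi.h"

static char *
current_database_name(void)
{
	char	   *result = NULL;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	if (SPI_execute("SELECT current_database()", true, 1) != SPI_OK_SELECT)
		elog(ERROR, "SPI_execute failed");

	if (SPI_processed > 0)
	{
		char	   *tmp = SPI_getvalue(SPI_tuptable->vals[0],
									   SPI_tuptable->tupdesc, 1);

		if (tmp != NULL)
		{
			/* copy into the caller's (pre-SPI_connect) memory context */
			result = SPI_palloc(strlen(tmp) + 1);
			strcpy(result, tmp);
		}
	}

	SPI_finish();				/* frees tmp along with the SPI contexts */
	return result;
}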
+ */ +Portal +SPI_cursor_open_with_args(const char *name, + const char *src, + int nargs, Oid *argtypes, + Datum *Values, const char *Nulls, + bool read_only, int cursorOptions) +{ + Portal result; + _SPI_plan plan; + ParamListInfo paramLI; + + if (src == NULL || nargs < 0) + elog(ERROR, "SPI_cursor_open_with_args called with invalid arguments"); + + if (nargs > 0 && (argtypes == NULL || Values == NULL)) + elog(ERROR, "SPI_cursor_open_with_args called with missing parameters"); + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + elog(ERROR, "SPI_cursor_open_with_args called while not connected"); + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = cursorOptions; + plan.nargs = nargs; + plan.argtypes = argtypes; + plan.parserSetup = NULL; + plan.parserSetupArg = NULL; + + /* build transient ParamListInfo in executor context */ + paramLI = _SPI_convert_params(nargs, argtypes, + Values, Nulls); + + _SPI_prepare_plan(src, &plan); + + /* We needn't copy the plan; SPI_cursor_open_internal will do so */ + + result = SPI_cursor_open_internal(name, &plan, paramLI, read_only); + + /* And clean up */ + _SPI_end_call(true); + + return result; +} + + +/* + * SPI_cursor_open_with_paramlist() + * + * Same as SPI_cursor_open except that parameters (if any) are passed + * as a ParamListInfo, which supports dynamic parameter set determination + */ +Portal +SPI_cursor_open_with_paramlist(const char *name, SPIPlanPtr plan, + ParamListInfo params, bool read_only) +{ + return SPI_cursor_open_internal(name, plan, params, read_only); +} + +/* Parse a query and open it as a cursor */ +Portal +SPI_cursor_parse_open(const char *name, + const char *src, + const SPIParseOpenOptions *options) +{ + Portal result; + _SPI_plan plan; + + if (src == NULL || options == NULL) + elog(ERROR, "SPI_cursor_parse_open called with invalid arguments"); + + SPI_result = _SPI_begin_call(true); + if (SPI_result < 0) + elog(ERROR, "SPI_cursor_parse_open called while not connected"); + + memset(&plan, 0, sizeof(_SPI_plan)); + plan.magic = _SPI_PLAN_MAGIC; + plan.parse_mode = RAW_PARSE_DEFAULT; + plan.cursor_options = options->cursorOptions; + if (options->params) + { + plan.parserSetup = options->params->parserSetup; + plan.parserSetupArg = options->params->parserSetupArg; + } + + _SPI_prepare_plan(src, &plan); + + /* We needn't copy the plan; SPI_cursor_open_internal will do so */ + + result = SPI_cursor_open_internal(name, &plan, + options->params, options->read_only); + + /* And clean up */ + _SPI_end_call(true); + + return result; +} + + +/* + * SPI_cursor_open_internal() + * + * Common code for SPI_cursor_open variants + */ +static Portal +SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, + ParamListInfo paramLI, bool read_only) +{ + CachedPlanSource *plansource; + CachedPlan *cplan; + List *stmt_list; + char *query_string; + Snapshot snapshot; + MemoryContext oldcontext; + Portal portal; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + /* + * Check that the plan is something the Portal code will special-case as + * returning one tupleset. 
+ */ + if (!SPI_is_cursor_plan(plan)) + { + /* try to give a good error message */ + const char *cmdtag; + + if (list_length(plan->plancache_list) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_DEFINITION), + errmsg("cannot open multi-query plan as cursor"))); + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + /* A SELECT that fails SPI_is_cursor_plan() must be SELECT INTO */ + if (plansource->commandTag == CMDTAG_SELECT) + cmdtag = "SELECT INTO"; + else + cmdtag = GetCommandTagName(plansource->commandTag); + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_DEFINITION), + /* translator: %s is name of a SQL command, eg INSERT */ + errmsg("cannot open %s query as cursor", cmdtag))); + } + + Assert(list_length(plan->plancache_list) == 1); + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + + /* Push the SPI stack */ + if (_SPI_begin_call(true) < 0) + elog(ERROR, "SPI_cursor_open called while not connected"); + + /* Reset SPI result (note we deliberately don't touch lastoid) */ + SPI_processed = 0; + SPI_tuptable = NULL; + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + + /* Create the portal */ + if (name == NULL || name[0] == '\0') + { + /* Use a random nonconflicting name */ + portal = CreateNewPortal(); + } + else + { + /* In this path, error if portal of same name already exists */ + portal = CreatePortal(name, false, false); + } + + /* Copy the plan's query string into the portal */ + query_string = MemoryContextStrdup(portal->portalContext, + plansource->query_string); + + /* + * Setup error traceback support for ereport(), in case GetCachedPlan + * throws an error. + */ + spicallbackarg.query = plansource->query_string; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * Note: for a saved plan, we mustn't have any failure occur between + * GetCachedPlan and PortalDefineQuery; that would result in leaking our + * plancache refcount. + */ + + /* Replan if needed, and increment plan refcount for portal */ + cplan = GetCachedPlan(plansource, paramLI, NULL, _SPI_current->queryEnv); + stmt_list = cplan->stmt_list; + + if (!plan->saved) + { + /* + * We don't want the portal to depend on an unsaved CachedPlanSource, + * so must copy the plan into the portal's context. An error here + * will result in leaking our refcount on the plan, but it doesn't + * matter because the plan is unsaved and hence transient anyway. + */ + oldcontext = MemoryContextSwitchTo(portal->portalContext); + stmt_list = copyObject(stmt_list); + MemoryContextSwitchTo(oldcontext); + ReleaseCachedPlan(cplan, NULL); + cplan = NULL; /* portal shouldn't depend on cplan */ + } + + /* + * Set up the portal. + */ + PortalDefineQuery(portal, + NULL, /* no statement name */ + query_string, + plansource->commandTag, + stmt_list, + cplan); + + /* + * Set up options for portal. Default SCROLL type is chosen the same way + * as PerformCursorOpen does it. 
+ */ + portal->cursorOptions = plan->cursor_options; + if (!(portal->cursorOptions & (CURSOR_OPT_SCROLL | CURSOR_OPT_NO_SCROLL))) + { + if (list_length(stmt_list) == 1 && + linitial_node(PlannedStmt, stmt_list)->commandType != CMD_UTILITY && + linitial_node(PlannedStmt, stmt_list)->rowMarks == NIL && + ExecSupportsBackwardScan(linitial_node(PlannedStmt, stmt_list)->planTree)) + portal->cursorOptions |= CURSOR_OPT_SCROLL; + else + portal->cursorOptions |= CURSOR_OPT_NO_SCROLL; + } + + /* + * Disallow SCROLL with SELECT FOR UPDATE. This is not redundant with the + * check in transformDeclareCursorStmt because the cursor options might + * not have come through there. + */ + if (portal->cursorOptions & CURSOR_OPT_SCROLL) + { + if (list_length(stmt_list) == 1 && + linitial_node(PlannedStmt, stmt_list)->commandType != CMD_UTILITY && + linitial_node(PlannedStmt, stmt_list)->rowMarks != NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DECLARE SCROLL CURSOR ... FOR UPDATE/SHARE is not supported"), + errdetail("Scrollable cursors must be READ ONLY."))); + } + + /* Make current query environment available to portal at execution time. */ + portal->queryEnv = _SPI_current->queryEnv; + + /* + * If told to be read-only, we'd better check for read-only queries. This + * can't be done earlier because we need to look at the finished, planned + * queries. (In particular, we don't want to do it between GetCachedPlan + * and PortalDefineQuery, because throwing an error between those steps + * would result in leaking our plancache refcount.) + */ + if (read_only) + { + ListCell *lc; + + foreach(lc, stmt_list) + { + PlannedStmt *pstmt = lfirst_node(PlannedStmt, lc); + + if (!CommandIsReadOnly(pstmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in a non-volatile function", + CreateCommandName((Node *) pstmt)))); + } + } + + /* Set up the snapshot to use. */ + if (read_only) + snapshot = GetActiveSnapshot(); + else + { + CommandCounterIncrement(); + snapshot = GetTransactionSnapshot(); + } + + /* + * If the plan has parameters, copy them into the portal. Note that this + * must be done after revalidating the plan, because in dynamic parameter + * cases the set of parameters could have changed during re-parsing. + */ + if (paramLI) + { + oldcontext = MemoryContextSwitchTo(portal->portalContext); + paramLI = copyParamList(paramLI); + MemoryContextSwitchTo(oldcontext); + } + + /* + * Start portal execution. + */ + PortalStart(portal, paramLI, 0, snapshot); + + Assert(portal->strategy != PORTAL_MULTI_QUERY); + + /* Pop the error context stack */ + error_context_stack = spierrcontext.previous; + + /* Pop the SPI stack */ + _SPI_end_call(true); + + /* Return the created portal */ + return portal; +} + + +/* + * SPI_cursor_find() + * + * Find the portal of an existing open cursor + */ +Portal +SPI_cursor_find(const char *name) +{ + return GetPortalByName(name); +} + + +/* + * SPI_cursor_fetch() + * + * Fetch rows in a cursor + */ +void +SPI_cursor_fetch(Portal portal, bool forward, long count) +{ + _SPI_cursor_operation(portal, + forward ? FETCH_FORWARD : FETCH_BACKWARD, count, + CreateDestReceiver(DestSPI)); + /* we know that the DestSPI receiver doesn't need a destroy call */ +} + + +/* + * SPI_cursor_move() + * + * Move in a cursor + */ +void +SPI_cursor_move(Portal portal, bool forward, long count) +{ + _SPI_cursor_operation(portal, + forward ? 
FETCH_FORWARD : FETCH_BACKWARD, count, + None_Receiver); +} + + +/* + * SPI_scroll_cursor_fetch() + * + * Fetch rows in a scrollable cursor + */ +void +SPI_scroll_cursor_fetch(Portal portal, FetchDirection direction, long count) +{ + _SPI_cursor_operation(portal, + direction, count, + CreateDestReceiver(DestSPI)); + /* we know that the DestSPI receiver doesn't need a destroy call */ +} + + +/* + * SPI_scroll_cursor_move() + * + * Move in a scrollable cursor + */ +void +SPI_scroll_cursor_move(Portal portal, FetchDirection direction, long count) +{ + _SPI_cursor_operation(portal, direction, count, None_Receiver); +} + + +/* + * SPI_cursor_close() + * + * Close a cursor + */ +void +SPI_cursor_close(Portal portal) +{ + if (!PortalIsValid(portal)) + elog(ERROR, "invalid portal in SPI cursor operation"); + + PortalDrop(portal, false); +} + +/* + * Returns the Oid representing the type id for argument at argIndex. First + * parameter is at index zero. + */ +Oid +SPI_getargtypeid(SPIPlanPtr plan, int argIndex) +{ + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC || + argIndex < 0 || argIndex >= plan->nargs) + { + SPI_result = SPI_ERROR_ARGUMENT; + return InvalidOid; + } + return plan->argtypes[argIndex]; +} + +/* + * Returns the number of arguments for the prepared plan. + */ +int +SPI_getargcount(SPIPlanPtr plan) +{ + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + { + SPI_result = SPI_ERROR_ARGUMENT; + return -1; + } + return plan->nargs; +} + +/* + * Returns true if the plan contains exactly one command + * and that command returns tuples to the caller (eg, SELECT or + * INSERT ... RETURNING, but not SELECT ... INTO). In essence, + * the result indicates if the command can be used with SPI_cursor_open + * + * Parameters + * plan: A plan previously prepared using SPI_prepare + */ +bool +SPI_is_cursor_plan(SPIPlanPtr plan) +{ + CachedPlanSource *plansource; + + if (plan == NULL || plan->magic != _SPI_PLAN_MAGIC) + { + SPI_result = SPI_ERROR_ARGUMENT; + return false; + } + + if (list_length(plan->plancache_list) != 1) + { + SPI_result = 0; + return false; /* not exactly 1 pre-rewrite command */ + } + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + + /* + * We used to force revalidation of the cached plan here, but that seems + * unnecessary: invalidation could mean a change in the rowtype of the + * tuples returned by a plan, but not whether it returns tuples at all. + */ + SPI_result = 0; + + /* Does it return tuples? */ + if (plansource->resultDesc) + return true; + + return false; +} + +/* + * SPI_plan_is_valid --- test whether a SPI plan is currently valid + * (that is, not marked as being in need of revalidation). + * + * See notes for CachedPlanIsValid before using this. + */ +bool +SPI_plan_is_valid(SPIPlanPtr plan) +{ + ListCell *lc; + + Assert(plan->magic == _SPI_PLAN_MAGIC); + + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + if (!CachedPlanIsValid(plansource)) + return false; + } + return true; +} + +/* + * SPI_result_code_string --- convert any SPI return code to a string + * + * This is often useful in error messages. Most callers will probably + * only pass negative (error-case) codes, but for generality we recognize + * the success codes too. 
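The return-code convention is easiest to see from the caller's side. A minimal sketch, in the style of a C extension, of turning an SPI failure code into an error message with SPI_result_code_string(); the helper name my_run_command is hypothetical:

#include "postgres.h"
#include "executor/spi.h"

/* Hypothetical helper: run one SQL command via SPI, erroring out on failure */
static void
my_run_command(const char *sql)
{
	int		rc;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	rc = SPI_execute(sql, false, 0);
	if (rc < 0)
		elog(ERROR, "SPI_execute failed: %s", SPI_result_code_string(rc));

	SPI_finish();
}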
+ */ +const char * +SPI_result_code_string(int code) +{ + static char buf[64]; + + switch (code) + { + case SPI_ERROR_CONNECT: + return "SPI_ERROR_CONNECT"; + case SPI_ERROR_COPY: + return "SPI_ERROR_COPY"; + case SPI_ERROR_OPUNKNOWN: + return "SPI_ERROR_OPUNKNOWN"; + case SPI_ERROR_UNCONNECTED: + return "SPI_ERROR_UNCONNECTED"; + case SPI_ERROR_ARGUMENT: + return "SPI_ERROR_ARGUMENT"; + case SPI_ERROR_PARAM: + return "SPI_ERROR_PARAM"; + case SPI_ERROR_TRANSACTION: + return "SPI_ERROR_TRANSACTION"; + case SPI_ERROR_NOATTRIBUTE: + return "SPI_ERROR_NOATTRIBUTE"; + case SPI_ERROR_NOOUTFUNC: + return "SPI_ERROR_NOOUTFUNC"; + case SPI_ERROR_TYPUNKNOWN: + return "SPI_ERROR_TYPUNKNOWN"; + case SPI_ERROR_REL_DUPLICATE: + return "SPI_ERROR_REL_DUPLICATE"; + case SPI_ERROR_REL_NOT_FOUND: + return "SPI_ERROR_REL_NOT_FOUND"; + case SPI_OK_CONNECT: + return "SPI_OK_CONNECT"; + case SPI_OK_FINISH: + return "SPI_OK_FINISH"; + case SPI_OK_FETCH: + return "SPI_OK_FETCH"; + case SPI_OK_UTILITY: + return "SPI_OK_UTILITY"; + case SPI_OK_SELECT: + return "SPI_OK_SELECT"; + case SPI_OK_SELINTO: + return "SPI_OK_SELINTO"; + case SPI_OK_INSERT: + return "SPI_OK_INSERT"; + case SPI_OK_DELETE: + return "SPI_OK_DELETE"; + case SPI_OK_UPDATE: + return "SPI_OK_UPDATE"; + case SPI_OK_CURSOR: + return "SPI_OK_CURSOR"; + case SPI_OK_INSERT_RETURNING: + return "SPI_OK_INSERT_RETURNING"; + case SPI_OK_DELETE_RETURNING: + return "SPI_OK_DELETE_RETURNING"; + case SPI_OK_UPDATE_RETURNING: + return "SPI_OK_UPDATE_RETURNING"; + case SPI_OK_REWRITTEN: + return "SPI_OK_REWRITTEN"; + case SPI_OK_REL_REGISTER: + return "SPI_OK_REL_REGISTER"; + case SPI_OK_REL_UNREGISTER: + return "SPI_OK_REL_UNREGISTER"; + } + /* Unrecognized code ... return something useful ... */ + sprintf(buf, "Unrecognized SPI code %d", code); + return buf; +} + +/* + * SPI_plan_get_plan_sources --- get a SPI plan's underlying list of + * CachedPlanSources. + * + * This is exported so that PL/pgSQL can use it (this beats letting PL/pgSQL + * look directly into the SPIPlan for itself). It's not documented in + * spi.sgml because we'd just as soon not have too many places using this. + */ +List * +SPI_plan_get_plan_sources(SPIPlanPtr plan) +{ + Assert(plan->magic == _SPI_PLAN_MAGIC); + return plan->plancache_list; +} + +/* + * SPI_plan_get_cached_plan --- get a SPI plan's generic CachedPlan, + * if the SPI plan contains exactly one CachedPlanSource. If not, + * return NULL. + * + * The plan's refcount is incremented (and logged in CurrentResourceOwner, + * if it's a saved plan). Caller is responsible for doing ReleaseCachedPlan. + * + * This is exported so that PL/pgSQL can use it (this beats letting PL/pgSQL + * look directly into the SPIPlan for itself). It's not documented in + * spi.sgml because we'd just as soon not have too many places using this. 
+ */ +CachedPlan * +SPI_plan_get_cached_plan(SPIPlanPtr plan) +{ + CachedPlanSource *plansource; + CachedPlan *cplan; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + Assert(plan->magic == _SPI_PLAN_MAGIC); + + /* Can't support one-shot plans here */ + if (plan->oneshot) + return NULL; + + /* Must have exactly one CachedPlanSource */ + if (list_length(plan->plancache_list) != 1) + return NULL; + plansource = (CachedPlanSource *) linitial(plan->plancache_list); + + /* Setup error traceback support for ereport() */ + spicallbackarg.query = plansource->query_string; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* Get the generic plan for the query */ + cplan = GetCachedPlan(plansource, NULL, + plan->saved ? CurrentResourceOwner : NULL, + _SPI_current->queryEnv); + Assert(cplan == plansource->gplan); + + /* Pop the error context stack */ + error_context_stack = spierrcontext.previous; + + return cplan; +} + + +/* =================== private functions =================== */ + +/* + * spi_dest_startup + * Initialize to receive tuples from Executor into SPITupleTable + * of current SPI procedure + */ +void +spi_dest_startup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + SPITupleTable *tuptable; + MemoryContext oldcxt; + MemoryContext tuptabcxt; + + if (_SPI_current == NULL) + elog(ERROR, "spi_dest_startup called while not connected to SPI"); + + if (_SPI_current->tuptable != NULL) + elog(ERROR, "improper call to spi_dest_startup"); + + /* We create the tuple table context as a child of procCxt */ + + oldcxt = _SPI_procmem(); /* switch to procedure memory context */ + + tuptabcxt = AllocSetContextCreate(CurrentMemoryContext, + "SPI TupTable", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(tuptabcxt); + + _SPI_current->tuptable = tuptable = (SPITupleTable *) + palloc0(sizeof(SPITupleTable)); + tuptable->tuptabcxt = tuptabcxt; + tuptable->subid = GetCurrentSubTransactionId(); + + /* + * The tuptable is now valid enough to be freed by AtEOSubXact_SPI, so put + * it onto the SPI context's tuptables list. This will ensure it's not + * leaked even in the unlikely event the following few lines fail. 
+ */ + slist_push_head(&_SPI_current->tuptables, &tuptable->next); + + /* set up initial allocations */ + tuptable->alloced = 128; + tuptable->vals = (HeapTuple *) palloc(tuptable->alloced * sizeof(HeapTuple)); + tuptable->numvals = 0; + tuptable->tupdesc = CreateTupleDescCopy(typeinfo); + + MemoryContextSwitchTo(oldcxt); +} + +/* + * spi_printtup + * store tuple retrieved by Executor into SPITupleTable + * of current SPI procedure + */ +bool +spi_printtup(TupleTableSlot *slot, DestReceiver *self) +{ + SPITupleTable *tuptable; + MemoryContext oldcxt; + + if (_SPI_current == NULL) + elog(ERROR, "spi_printtup called while not connected to SPI"); + + tuptable = _SPI_current->tuptable; + if (tuptable == NULL) + elog(ERROR, "improper call to spi_printtup"); + + oldcxt = MemoryContextSwitchTo(tuptable->tuptabcxt); + + if (tuptable->numvals >= tuptable->alloced) + { + /* Double the size of the pointer array */ + uint64 newalloced = tuptable->alloced * 2; + + tuptable->vals = (HeapTuple *) repalloc_huge(tuptable->vals, + newalloced * sizeof(HeapTuple)); + tuptable->alloced = newalloced; + } + + tuptable->vals[tuptable->numvals] = ExecCopySlotHeapTuple(slot); + (tuptable->numvals)++; + + MemoryContextSwitchTo(oldcxt); + + return true; +} + +/* + * Static functions + */ + +/* + * Parse and analyze a querystring. + * + * At entry, plan->argtypes and plan->nargs (or alternatively plan->parserSetup + * and plan->parserSetupArg) must be valid, as must plan->parse_mode and + * plan->cursor_options. + * + * Results are stored into *plan (specifically, plan->plancache_list). + * Note that the result data is all in CurrentMemoryContext or child contexts + * thereof; in practice this means it is in the SPI executor context, and + * what we are creating is a "temporary" SPIPlan. Cruft generated during + * parsing is also left in CurrentMemoryContext. + */ +static void +_SPI_prepare_plan(const char *src, SPIPlanPtr plan) +{ + List *raw_parsetree_list; + List *plancache_list; + ListCell *list_item; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + /* + * Setup error traceback support for ereport() + */ + spicallbackarg.query = src; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * Parse the request string into a list of raw parse trees. + */ + raw_parsetree_list = raw_parser(src, plan->parse_mode); + + /* + * Do parse analysis and rule rewrite for each raw parsetree, storing the + * results into unsaved plancache entries. + */ + plancache_list = NIL; + + foreach(list_item, raw_parsetree_list) + { + RawStmt *parsetree = lfirst_node(RawStmt, list_item); + List *stmt_list; + CachedPlanSource *plansource; + + /* + * Create the CachedPlanSource before we do parse analysis, since it + * needs to see the unmodified raw parse tree. + */ + plansource = CreateCachedPlan(parsetree, + src, + CreateCommandTag(parsetree->stmt)); + + /* + * Parameter datatypes are driven by parserSetup hook if provided, + * otherwise we use the fixed parameter list. 
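As a concrete illustration of the fixed-parameter-list path, a minimal sketch (helper name hypothetical) of preparing a parameterized query through SPI_prepare(), which is the usual way plan->argtypes and plan->nargs get populated before reaching this code:

#include "postgres.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"

/* Hypothetical: prepare a query with one fixed-type parameter */
static SPIPlanPtr
my_prepare_lookup(void)
{
	Oid			argtypes[1] = {OIDOID};
	SPIPlanPtr	plan;

	plan = SPI_prepare("SELECT relname FROM pg_class WHERE oid = $1",
					   1, argtypes);
	if (plan == NULL)
		elog(ERROR, "SPI_prepare failed: %s",
			 SPI_result_code_string(SPI_result));
	return plan;
}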
+ */ + if (plan->parserSetup != NULL) + { + Assert(plan->nargs == 0); + stmt_list = pg_analyze_and_rewrite_params(parsetree, + src, + plan->parserSetup, + plan->parserSetupArg, + _SPI_current->queryEnv); + } + else + { + stmt_list = pg_analyze_and_rewrite(parsetree, + src, + plan->argtypes, + plan->nargs, + _SPI_current->queryEnv); + } + + /* Finish filling in the CachedPlanSource */ + CompleteCachedPlan(plansource, + stmt_list, + NULL, + plan->argtypes, + plan->nargs, + plan->parserSetup, + plan->parserSetupArg, + plan->cursor_options, + false); /* not fixed result */ + + plancache_list = lappend(plancache_list, plansource); + } + + plan->plancache_list = plancache_list; + plan->oneshot = false; + + /* + * Pop the error context stack + */ + error_context_stack = spierrcontext.previous; +} + +/* + * Parse, but don't analyze, a querystring. + * + * This is a stripped-down version of _SPI_prepare_plan that only does the + * initial raw parsing. It creates "one shot" CachedPlanSources + * that still require parse analysis before execution is possible. + * + * The advantage of using the "one shot" form of CachedPlanSource is that + * we eliminate data copying and invalidation overhead. Postponing parse + * analysis also prevents issues if some of the raw parsetrees are DDL + * commands that affect validity of later parsetrees. Both of these + * attributes are good things for SPI_execute() and similar cases. + * + * Results are stored into *plan (specifically, plan->plancache_list). + * Note that the result data is all in CurrentMemoryContext or child contexts + * thereof; in practice this means it is in the SPI executor context, and + * what we are creating is a "temporary" SPIPlan. Cruft generated during + * parsing is also left in CurrentMemoryContext. + */ +static void +_SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan) +{ + List *raw_parsetree_list; + List *plancache_list; + ListCell *list_item; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + + /* + * Setup error traceback support for ereport() + */ + spicallbackarg.query = src; + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * Parse the request string into a list of raw parse trees. + */ + raw_parsetree_list = raw_parser(src, plan->parse_mode); + + /* + * Construct plancache entries, but don't do parse analysis yet. 
+ */ + plancache_list = NIL; + + foreach(list_item, raw_parsetree_list) + { + RawStmt *parsetree = lfirst_node(RawStmt, list_item); + CachedPlanSource *plansource; + + plansource = CreateOneShotCachedPlan(parsetree, + src, + CreateCommandTag(parsetree->stmt)); + + plancache_list = lappend(plancache_list, plansource); + } + + plan->plancache_list = plancache_list; + plan->oneshot = true; + + /* + * Pop the error context stack + */ + error_context_stack = spierrcontext.previous; +} + +/* + * _SPI_execute_plan: execute the given plan with the given options + * + * options contains options accessible from outside SPI: + * params: parameter values to pass to query + * read_only: true for read-only execution (no CommandCounterIncrement) + * allow_nonatomic: true to allow nonatomic CALL/DO execution + * must_return_tuples: throw error if query doesn't return tuples + * tcount: execution tuple-count limit, or 0 for none + * dest: DestReceiver to receive output, or NULL for normal SPI output + * owner: ResourceOwner that will be used to hold refcount on plan; + * if NULL, CurrentResourceOwner is used (ignored for non-saved plan) + * + * Additional, only-internally-accessible options: + * snapshot: query snapshot to use, or InvalidSnapshot for the normal + * behavior of taking a new snapshot for each query. + * crosscheck_snapshot: for RI use, all others pass InvalidSnapshot + * fire_triggers: true to fire AFTER triggers at end of query (normal case); + * false means any AFTER triggers are postponed to end of outer query + */ +static int +_SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, + Snapshot snapshot, Snapshot crosscheck_snapshot, + bool fire_triggers) +{ + int my_res = 0; + uint64 my_processed = 0; + SPITupleTable *my_tuptable = NULL; + int res = 0; + bool pushed_active_snap = false; + ResourceOwner plan_owner = options->owner; + SPICallbackArg spicallbackarg; + ErrorContextCallback spierrcontext; + CachedPlan *cplan = NULL; + ListCell *lc1; + + /* + * Setup error traceback support for ereport() + */ + spicallbackarg.query = NULL; /* we'll fill this below */ + spicallbackarg.mode = plan->parse_mode; + spierrcontext.callback = _SPI_error_callback; + spierrcontext.arg = &spicallbackarg; + spierrcontext.previous = error_context_stack; + error_context_stack = &spierrcontext; + + /* + * We support four distinct snapshot management behaviors: + * + * snapshot != InvalidSnapshot, read_only = true: use exactly the given + * snapshot. + * + * snapshot != InvalidSnapshot, read_only = false: use the given snapshot, + * modified by advancing its command ID before each querytree. + * + * snapshot == InvalidSnapshot, read_only = true: use the entry-time + * ActiveSnapshot, if any (if there isn't one, we run with no snapshot). + * + * snapshot == InvalidSnapshot, read_only = false: take a full new + * snapshot for each user command, and advance its command ID before each + * querytree within the command. + * + * In the first two cases, we can just push the snap onto the stack once + * for the whole plan list. + * + * Note that snapshot != InvalidSnapshot implies an atomic execution + * context. 
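The read_only flag that selects among these behaviors normally comes from the calling function's volatility; a minimal sketch of a hypothetical PL handler fragment, modeled loosely on what PL/pgSQL does:

#include "postgres.h"
#include "catalog/pg_proc.h"
#include "executor/spi.h"

/* Hypothetical: non-volatile functions run their SPI queries read-only */
static int
my_execute_plan(SPIPlanPtr plan, Datum *values, const char *nulls,
				Form_pg_proc procStruct)
{
	bool		read_only = (procStruct->provolatile != PROVOLATILE_VOLATILE);

	/* read_only = true selects the "entry-time ActiveSnapshot" case above */
	return SPI_execute_plan(plan, values, nulls, read_only, 0);
}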
+ */ + if (snapshot != InvalidSnapshot) + { + Assert(!options->allow_nonatomic); + if (options->read_only) + { + PushActiveSnapshot(snapshot); + pushed_active_snap = true; + } + else + { + /* Make sure we have a private copy of the snapshot to modify */ + PushCopiedSnapshot(snapshot); + pushed_active_snap = true; + } + } + + /* + * Ensure that we have a resource owner if plan is saved, and not if it + * isn't. + */ + if (!plan->saved) + plan_owner = NULL; + else if (plan_owner == NULL) + plan_owner = CurrentResourceOwner; + + /* + * We interpret must_return_tuples as "there must be at least one query, + * and all of them must return tuples". This is a bit laxer than + * SPI_is_cursor_plan's check, but there seems no reason to enforce that + * there be only one query. + */ + if (options->must_return_tuples && plan->plancache_list == NIL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("empty query does not return tuples"))); + + foreach(lc1, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc1); + List *stmt_list; + ListCell *lc2; + + spicallbackarg.query = plansource->query_string; + + /* + * If this is a one-shot plan, we still need to do parse analysis. + */ + if (plan->oneshot) + { + RawStmt *parsetree = plansource->raw_parse_tree; + const char *src = plansource->query_string; + List *stmt_list; + + /* + * Parameter datatypes are driven by parserSetup hook if provided, + * otherwise we use the fixed parameter list. + */ + if (parsetree == NULL) + stmt_list = NIL; + else if (plan->parserSetup != NULL) + { + Assert(plan->nargs == 0); + stmt_list = pg_analyze_and_rewrite_params(parsetree, + src, + plan->parserSetup, + plan->parserSetupArg, + _SPI_current->queryEnv); + } + else + { + stmt_list = pg_analyze_and_rewrite(parsetree, + src, + plan->argtypes, + plan->nargs, + _SPI_current->queryEnv); + } + + /* Finish filling in the CachedPlanSource */ + CompleteCachedPlan(plansource, + stmt_list, + NULL, + plan->argtypes, + plan->nargs, + plan->parserSetup, + plan->parserSetupArg, + plan->cursor_options, + false); /* not fixed result */ + } + + /* + * If asked to, complain when query does not return tuples. + * (Replanning can't change this, so we can check it before that. + * However, we can't check it till after parse analysis, so in the + * case of a one-shot plan this is the earliest we could check.) + */ + if (options->must_return_tuples && !plansource->resultDesc) + { + /* try to give a good error message */ + const char *cmdtag; + + /* A SELECT without resultDesc must be SELECT INTO */ + if (plansource->commandTag == CMDTAG_SELECT) + cmdtag = "SELECT INTO"; + else + cmdtag = GetCommandTagName(plansource->commandTag); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + /* translator: %s is name of a SQL command, eg INSERT */ + errmsg("%s query does not return tuples", cmdtag))); + } + + /* + * Replan if needed, and increment plan refcount. If it's a saved + * plan, the refcount must be backed by the plan_owner. + */ + cplan = GetCachedPlan(plansource, options->params, + plan_owner, _SPI_current->queryEnv); + + stmt_list = cplan->stmt_list; + + /* + * If we weren't given a specific snapshot to use, and the statement + * list requires a snapshot, set that up. + */ + if (snapshot == InvalidSnapshot && + (list_length(stmt_list) > 1 || + (list_length(stmt_list) == 1 && + PlannedStmtRequiresSnapshot(linitial_node(PlannedStmt, + stmt_list))))) + { + /* + * First, ensure there's a Portal-level snapshot. 
This back-fills + * the snapshot stack in case the previous operation was a COMMIT + * or ROLLBACK inside a procedure or DO block. (We can't put back + * the Portal snapshot any sooner, or we'd break cases like doing + * SET or LOCK just after COMMIT.) It's enough to check once per + * statement list, since COMMIT/ROLLBACK/CALL/DO can't appear + * within a multi-statement list. + */ + EnsurePortalSnapshotExists(); + + /* + * In the default non-read-only case, get a new per-statement-list + * snapshot, replacing any that we pushed in a previous cycle. + * Skip it when doing non-atomic execution, though (we rely + * entirely on the Portal snapshot in that case). + */ + if (!options->read_only && !options->allow_nonatomic) + { + if (pushed_active_snap) + PopActiveSnapshot(); + PushActiveSnapshot(GetTransactionSnapshot()); + pushed_active_snap = true; + } + } + + foreach(lc2, stmt_list) + { + PlannedStmt *stmt = lfirst_node(PlannedStmt, lc2); + bool canSetTag = stmt->canSetTag; + DestReceiver *dest; + + /* + * Reset output state. (Note that if a non-SPI receiver is used, + * _SPI_current->processed will stay zero, and that's what we'll + * report to the caller. It's the receiver's job to count tuples + * in that case.) + */ + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + + /* Check for unsupported cases. */ + if (stmt->utilityStmt) + { + if (IsA(stmt->utilityStmt, CopyStmt)) + { + CopyStmt *cstmt = (CopyStmt *) stmt->utilityStmt; + + if (cstmt->filename == NULL) + { + my_res = SPI_ERROR_COPY; + goto fail; + } + } + else if (IsA(stmt->utilityStmt, TransactionStmt)) + { + my_res = SPI_ERROR_TRANSACTION; + goto fail; + } + } + + if (options->read_only && !CommandIsReadOnly(stmt)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in a non-volatile function", + CreateCommandName((Node *) stmt)))); + + /* + * If not read-only mode, advance the command counter before each + * command and update the snapshot. (But skip it if the snapshot + * isn't under our control.) + */ + if (!options->read_only && pushed_active_snap) + { + CommandCounterIncrement(); + UpdateActiveSnapshotCommandId(); + } + + /* + * Select appropriate tuple receiver. Output from non-canSetTag + * subqueries always goes to the bit bucket. + */ + if (!canSetTag) + dest = CreateDestReceiver(DestNone); + else if (options->dest) + dest = options->dest; + else + dest = CreateDestReceiver(DestSPI); + + if (stmt->utilityStmt == NULL) + { + QueryDesc *qdesc; + Snapshot snap; + + if (ActiveSnapshotSet()) + snap = GetActiveSnapshot(); + else + snap = InvalidSnapshot; + + qdesc = CreateQueryDesc(stmt, + plansource->query_string, + snap, crosscheck_snapshot, + dest, + options->params, + _SPI_current->queryEnv, + 0); + res = _SPI_pquery(qdesc, fire_triggers, + canSetTag ? options->tcount : 0); + FreeQueryDesc(qdesc); + } + else + { + ProcessUtilityContext context; + QueryCompletion qc; + + /* + * If the SPI context is atomic, or we were not told to allow + * nonatomic operations, tell ProcessUtility this is an atomic + * execution context. 
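Whether _SPI_current->atomic is set is decided at connection time, not here. A minimal sketch (hypothetical handler fragment) of how a procedural language opts in to nonatomic execution so that CALL and transaction control statements can work; the nonatomic flag is typically derived from the CallContext node the caller passes in fcinfo->context:

#include "postgres.h"
#include "executor/spi.h"

/*
 * Hypothetical PL call-handler fragment: connect in nonatomic mode when the
 * calling context allows transaction control.
 */
static void
my_connect_spi(bool nonatomic)
{
	if (SPI_connect_ext(nonatomic ? SPI_OPT_NONATOMIC : 0) != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect_ext failed");
}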
+ */ + if (_SPI_current->atomic || !options->allow_nonatomic) + context = PROCESS_UTILITY_QUERY; + else + context = PROCESS_UTILITY_QUERY_NONATOMIC; + + InitializeQueryCompletion(&qc); + ProcessUtility(stmt, + plansource->query_string, + true, /* protect plancache's node tree */ + context, + options->params, + _SPI_current->queryEnv, + dest, + &qc); + + /* Update "processed" if stmt returned tuples */ + if (_SPI_current->tuptable) + _SPI_current->processed = _SPI_current->tuptable->numvals; + + res = SPI_OK_UTILITY; + + /* + * Some utility statements return a row count, even though the + * tuples are not returned to the caller. + */ + if (IsA(stmt->utilityStmt, CreateTableAsStmt)) + { + CreateTableAsStmt *ctastmt = (CreateTableAsStmt *) stmt->utilityStmt; + + if (qc.commandTag == CMDTAG_SELECT) + _SPI_current->processed = qc.nprocessed; + else + { + /* + * Must be an IF NOT EXISTS that did nothing, or a + * CREATE ... WITH NO DATA. + */ + Assert(ctastmt->if_not_exists || + ctastmt->into->skipData); + _SPI_current->processed = 0; + } + + /* + * For historical reasons, if CREATE TABLE AS was spelled + * as SELECT INTO, return a special return code. + */ + if (ctastmt->is_select_into) + res = SPI_OK_SELINTO; + } + else if (IsA(stmt->utilityStmt, CopyStmt)) + { + Assert(qc.commandTag == CMDTAG_COPY); + _SPI_current->processed = qc.nprocessed; + } + } + + /* + * The last canSetTag query sets the status values returned to the + * caller. Be careful to free any tuptables not returned, to + * avoid intra-transaction memory leak. + */ + if (canSetTag) + { + my_processed = _SPI_current->processed; + SPI_freetuptable(my_tuptable); + my_tuptable = _SPI_current->tuptable; + my_res = res; + } + else + { + SPI_freetuptable(_SPI_current->tuptable); + _SPI_current->tuptable = NULL; + } + + /* + * We don't issue a destroy call to the receiver. The SPI and + * None receivers would ignore it anyway, while if the caller + * supplied a receiver, it's not our job to destroy it. + */ + + if (res < 0) + { + my_res = res; + goto fail; + } + } + + /* Done with this plan, so release refcount */ + ReleaseCachedPlan(cplan, plan_owner); + cplan = NULL; + + /* + * If not read-only mode, advance the command counter after the last + * command. This ensures that its effects are visible, in case it was + * DDL that would affect the next CachedPlanSource. + */ + if (!options->read_only) + CommandCounterIncrement(); + } + +fail: + + /* Pop the snapshot off the stack if we pushed one */ + if (pushed_active_snap) + PopActiveSnapshot(); + + /* We no longer need the cached plan refcount, if any */ + if (cplan) + ReleaseCachedPlan(cplan, plan_owner); + + /* + * Pop the error context stack + */ + error_context_stack = spierrcontext.previous; + + /* Save results for caller */ + SPI_processed = my_processed; + SPI_tuptable = my_tuptable; + + /* tuptable now is caller's responsibility, not SPI's */ + _SPI_current->tuptable = NULL; + + /* + * If none of the queries had canSetTag, return SPI_OK_REWRITTEN. Prior to + * 8.4, we used return the last query's result code, but not its auxiliary + * results, but that's confusing. 
+ */ + if (my_res == 0) + my_res = SPI_OK_REWRITTEN; + + return my_res; +} + +/* + * Convert arrays of query parameters to form wanted by planner and executor + */ +static ParamListInfo +_SPI_convert_params(int nargs, Oid *argtypes, + Datum *Values, const char *Nulls) +{ + ParamListInfo paramLI; + + if (nargs > 0) + { + paramLI = makeParamList(nargs); + + for (int i = 0; i < nargs; i++) + { + ParamExternData *prm = ¶mLI->params[i]; + + prm->value = Values[i]; + prm->isnull = (Nulls && Nulls[i] == 'n'); + prm->pflags = PARAM_FLAG_CONST; + prm->ptype = argtypes[i]; + } + } + else + paramLI = NULL; + return paramLI; +} + +static int +_SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount) +{ + int operation = queryDesc->operation; + int eflags; + int res; + + switch (operation) + { + case CMD_SELECT: + if (queryDesc->dest->mydest == DestNone) + { + /* Don't return SPI_OK_SELECT if we're discarding result */ + res = SPI_OK_UTILITY; + } + else + res = SPI_OK_SELECT; + break; + case CMD_INSERT: + if (queryDesc->plannedstmt->hasReturning) + res = SPI_OK_INSERT_RETURNING; + else + res = SPI_OK_INSERT; + break; + case CMD_DELETE: + if (queryDesc->plannedstmt->hasReturning) + res = SPI_OK_DELETE_RETURNING; + else + res = SPI_OK_DELETE; + break; + case CMD_UPDATE: + if (queryDesc->plannedstmt->hasReturning) + res = SPI_OK_UPDATE_RETURNING; + else + res = SPI_OK_UPDATE; + break; + default: + return SPI_ERROR_OPUNKNOWN; + } + +#ifdef SPI_EXECUTOR_STATS + if (ShowExecutorStats) + ResetUsage(); +#endif + + /* Select execution options */ + if (fire_triggers) + eflags = 0; /* default run-to-completion flags */ + else + eflags = EXEC_FLAG_SKIP_TRIGGERS; + + ExecutorStart(queryDesc, eflags); + + ExecutorRun(queryDesc, ForwardScanDirection, tcount, true); + + _SPI_current->processed = queryDesc->estate->es_processed; + + if ((res == SPI_OK_SELECT || queryDesc->plannedstmt->hasReturning) && + queryDesc->dest->mydest == DestSPI) + { + if (_SPI_checktuples()) + elog(ERROR, "consistency check on SPI tuple count failed"); + } + + ExecutorFinish(queryDesc); + ExecutorEnd(queryDesc); + /* FreeQueryDesc is done by the caller */ + +#ifdef SPI_EXECUTOR_STATS + if (ShowExecutorStats) + ShowUsage("SPI EXECUTOR STATS"); +#endif + + return res; +} + +/* + * _SPI_error_callback + * + * Add context information when a query invoked via SPI fails + */ +static void +_SPI_error_callback(void *arg) +{ + SPICallbackArg *carg = (SPICallbackArg *) arg; + const char *query = carg->query; + int syntaxerrposition; + + if (query == NULL) /* in case arg wasn't set yet */ + return; + + /* + * If there is a syntax error position, convert to internal syntax error; + * otherwise treat the query as an item of context stack + */ + syntaxerrposition = geterrposition(); + if (syntaxerrposition > 0) + { + errposition(0); + internalerrposition(syntaxerrposition); + internalerrquery(query); + } + else + { + /* Use the parse mode to decide how to describe the query */ + switch (carg->mode) + { + case RAW_PARSE_PLPGSQL_EXPR: + errcontext("SQL expression \"%s\"", query); + break; + case RAW_PARSE_PLPGSQL_ASSIGN1: + case RAW_PARSE_PLPGSQL_ASSIGN2: + case RAW_PARSE_PLPGSQL_ASSIGN3: + errcontext("PL/pgSQL assignment \"%s\"", query); + break; + default: + errcontext("SQL statement \"%s\"", query); + break; + } + } +} + +/* + * _SPI_cursor_operation() + * + * Do a FETCH or MOVE in a cursor + */ +static void +_SPI_cursor_operation(Portal portal, FetchDirection direction, long count, + DestReceiver *dest) +{ + uint64 nfetched; + + /* Check that 
the portal is valid */ + if (!PortalIsValid(portal)) + elog(ERROR, "invalid portal in SPI cursor operation"); + + /* Push the SPI stack */ + if (_SPI_begin_call(true) < 0) + elog(ERROR, "SPI cursor operation called while not connected"); + + /* Reset the SPI result (note we deliberately don't touch lastoid) */ + SPI_processed = 0; + SPI_tuptable = NULL; + _SPI_current->processed = 0; + _SPI_current->tuptable = NULL; + + /* Run the cursor */ + nfetched = PortalRunFetch(portal, + direction, + count, + dest); + + /* + * Think not to combine this store with the preceding function call. If + * the portal contains calls to functions that use SPI, then _SPI_stack is + * likely to move around while the portal runs. When control returns, + * _SPI_current will point to the correct stack entry... but the pointer + * may be different than it was beforehand. So we must be sure to re-fetch + * the pointer after the function call completes. + */ + _SPI_current->processed = nfetched; + + if (dest->mydest == DestSPI && _SPI_checktuples()) + elog(ERROR, "consistency check on SPI tuple count failed"); + + /* Put the result into place for access by caller */ + SPI_processed = _SPI_current->processed; + SPI_tuptable = _SPI_current->tuptable; + + /* tuptable now is caller's responsibility, not SPI's */ + _SPI_current->tuptable = NULL; + + /* Pop the SPI stack */ + _SPI_end_call(true); +} + + +static MemoryContext +_SPI_execmem(void) +{ + return MemoryContextSwitchTo(_SPI_current->execCxt); +} + +static MemoryContext +_SPI_procmem(void) +{ + return MemoryContextSwitchTo(_SPI_current->procCxt); +} + +/* + * _SPI_begin_call: begin a SPI operation within a connected procedure + * + * use_exec is true if we intend to make use of the procedure's execCxt + * during this SPI operation. We'll switch into that context, and arrange + * for it to be cleaned up at _SPI_end_call or if an error occurs. + */ +static int +_SPI_begin_call(bool use_exec) +{ + if (_SPI_current == NULL) + return SPI_ERROR_UNCONNECTED; + + if (use_exec) + { + /* remember when the Executor operation started */ + _SPI_current->execSubid = GetCurrentSubTransactionId(); + /* switch to the Executor memory context */ + _SPI_execmem(); + } + + return 0; +} + +/* + * _SPI_end_call: end a SPI operation within a connected procedure + * + * use_exec must be the same as in the previous _SPI_begin_call + * + * Note: this currently has no failure return cases, so callers don't check + */ +static int +_SPI_end_call(bool use_exec) +{ + if (use_exec) + { + /* switch to the procedure memory context */ + _SPI_procmem(); + /* mark Executor context no longer in use */ + _SPI_current->execSubid = InvalidSubTransactionId; + /* and free Executor memory */ + MemoryContextResetAndDeleteChildren(_SPI_current->execCxt); + } + + return 0; +} + +static bool +_SPI_checktuples(void) +{ + uint64 processed = _SPI_current->processed; + SPITupleTable *tuptable = _SPI_current->tuptable; + bool failed = false; + + if (tuptable == NULL) /* spi_dest_startup was not called */ + failed = true; + else if (processed != tuptable->numvals) + failed = true; + + return failed; +} + +/* + * Convert a "temporary" SPIPlan into an "unsaved" plan. + * + * The passed _SPI_plan struct is on the stack, and all its subsidiary data + * is in or under the current SPI executor context. Copy the plan into the + * SPI procedure context so it will survive _SPI_end_call(). 
To minimize + * data copying, this destructively modifies the input plan, by taking the + * plancache entries away from it and reparenting them to the new SPIPlan. + */ +static SPIPlanPtr +_SPI_make_plan_non_temp(SPIPlanPtr plan) +{ + SPIPlanPtr newplan; + MemoryContext parentcxt = _SPI_current->procCxt; + MemoryContext plancxt; + MemoryContext oldcxt; + ListCell *lc; + + /* Assert the input is a temporary SPIPlan */ + Assert(plan->magic == _SPI_PLAN_MAGIC); + Assert(plan->plancxt == NULL); + /* One-shot plans can't be saved */ + Assert(!plan->oneshot); + + /* + * Create a memory context for the plan, underneath the procedure context. + * We don't expect the plan to be very large. + */ + plancxt = AllocSetContextCreate(parentcxt, + "SPI Plan", + ALLOCSET_SMALL_SIZES); + oldcxt = MemoryContextSwitchTo(plancxt); + + /* Copy the _SPI_plan struct and subsidiary data into the new context */ + newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan)); + newplan->magic = _SPI_PLAN_MAGIC; + newplan->plancxt = plancxt; + newplan->parse_mode = plan->parse_mode; + newplan->cursor_options = plan->cursor_options; + newplan->nargs = plan->nargs; + if (plan->nargs > 0) + { + newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid)); + memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid)); + } + else + newplan->argtypes = NULL; + newplan->parserSetup = plan->parserSetup; + newplan->parserSetupArg = plan->parserSetupArg; + + /* + * Reparent all the CachedPlanSources into the procedure context. In + * theory this could fail partway through due to the pallocs, but we don't + * care too much since both the procedure context and the executor context + * would go away on error. + */ + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + CachedPlanSetParentContext(plansource, parentcxt); + + /* Build new list, with list cells in plancxt */ + newplan->plancache_list = lappend(newplan->plancache_list, plansource); + } + + MemoryContextSwitchTo(oldcxt); + + /* For safety, unlink the CachedPlanSources from the temporary plan */ + plan->plancache_list = NIL; + + return newplan; +} + +/* + * Make a "saved" copy of the given plan. + */ +static SPIPlanPtr +_SPI_save_plan(SPIPlanPtr plan) +{ + SPIPlanPtr newplan; + MemoryContext plancxt; + MemoryContext oldcxt; + ListCell *lc; + + /* One-shot plans can't be saved */ + Assert(!plan->oneshot); + + /* + * Create a memory context for the plan. We don't expect the plan to be + * very large, so use smaller-than-default alloc parameters. It's a + * transient context until we finish copying everything. 
+ */ + plancxt = AllocSetContextCreate(CurrentMemoryContext, + "SPI Plan", + ALLOCSET_SMALL_SIZES); + oldcxt = MemoryContextSwitchTo(plancxt); + + /* Copy the SPI plan into its own context */ + newplan = (SPIPlanPtr) palloc0(sizeof(_SPI_plan)); + newplan->magic = _SPI_PLAN_MAGIC; + newplan->plancxt = plancxt; + newplan->parse_mode = plan->parse_mode; + newplan->cursor_options = plan->cursor_options; + newplan->nargs = plan->nargs; + if (plan->nargs > 0) + { + newplan->argtypes = (Oid *) palloc(plan->nargs * sizeof(Oid)); + memcpy(newplan->argtypes, plan->argtypes, plan->nargs * sizeof(Oid)); + } + else + newplan->argtypes = NULL; + newplan->parserSetup = plan->parserSetup; + newplan->parserSetupArg = plan->parserSetupArg; + + /* Copy all the plancache entries */ + foreach(lc, plan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + CachedPlanSource *newsource; + + newsource = CopyCachedPlan(plansource); + newplan->plancache_list = lappend(newplan->plancache_list, newsource); + } + + MemoryContextSwitchTo(oldcxt); + + /* + * Mark it saved, reparent it under CacheMemoryContext, and mark all the + * component CachedPlanSources as saved. This sequence cannot fail + * partway through, so there's no risk of long-term memory leakage. + */ + newplan->saved = true; + MemoryContextSetParent(newplan->plancxt, CacheMemoryContext); + + foreach(lc, newplan->plancache_list) + { + CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc); + + SaveCachedPlan(plansource); + } + + return newplan; +} + +/* + * Internal lookup of ephemeral named relation by name. + */ +static EphemeralNamedRelation +_SPI_find_ENR_by_name(const char *name) +{ + /* internal static function; any error is bug in SPI itself */ + Assert(name != NULL); + + /* fast exit if no tuplestores have been added */ + if (_SPI_current->queryEnv == NULL) + return NULL; + + return get_ENR(_SPI_current->queryEnv, name); +} + +/* + * Register an ephemeral named relation for use by the planner and executor on + * subsequent calls using this SPI connection. + */ +int +SPI_register_relation(EphemeralNamedRelation enr) +{ + EphemeralNamedRelation match; + int res; + + if (enr == NULL || enr->md.name == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(false); /* keep current memory context */ + if (res < 0) + return res; + + match = _SPI_find_ENR_by_name(enr->md.name); + if (match) + res = SPI_ERROR_REL_DUPLICATE; + else + { + if (_SPI_current->queryEnv == NULL) + _SPI_current->queryEnv = create_queryEnv(); + + register_ENR(_SPI_current->queryEnv, enr); + res = SPI_OK_REL_REGISTER; + } + + _SPI_end_call(false); + + return res; +} + +/* + * Unregister an ephemeral named relation by name. This will probably be a + * rarely used function, since SPI_finish will clear it automatically. + */ +int +SPI_unregister_relation(const char *name) +{ + EphemeralNamedRelation match; + int res; + + if (name == NULL) + return SPI_ERROR_ARGUMENT; + + res = _SPI_begin_call(false); /* keep current memory context */ + if (res < 0) + return res; + + match = _SPI_find_ENR_by_name(name); + if (match) + { + unregister_ENR(_SPI_current->queryEnv, match->md.name); + res = SPI_OK_REL_UNREGISTER; + } + else + res = SPI_ERROR_REL_NOT_FOUND; + + _SPI_end_call(false); + + return res; +} + +/* + * Register the transient relations from 'tdata' using this SPI connection. 
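A usage sketch for the function defined just below (the handler fragment and its name are hypothetical): a PL's trigger handler would call SPI_register_trigger_data() right after connecting, so that the queries it runs can see the transition tables named in the trigger definition:

#include "postgres.h"
#include "commands/trigger.h"
#include "executor/spi.h"

/* Hypothetical PL trigger-handler fragment */
static void
my_trigger_begin(TriggerData *trigdata)
{
	int			rc;

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	rc = SPI_register_trigger_data(trigdata);
	if (rc != SPI_OK_TD_REGISTER)
		elog(ERROR, "SPI_register_trigger_data failed: %s",
			 SPI_result_code_string(rc));
}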
+ * This should be called by PL implementations' trigger handlers after + * connecting, in order to make transition tables visible to any queries run + * in this connection. + */ +int +SPI_register_trigger_data(TriggerData *tdata) +{ + if (tdata == NULL) + return SPI_ERROR_ARGUMENT; + + if (tdata->tg_newtable) + { + EphemeralNamedRelation enr = + palloc(sizeof(EphemeralNamedRelationData)); + int rc; + + enr->md.name = tdata->tg_trigger->tgnewtable; + enr->md.reliddesc = tdata->tg_relation->rd_id; + enr->md.tupdesc = NULL; + enr->md.enrtype = ENR_NAMED_TUPLESTORE; + enr->md.enrtuples = tuplestore_tuple_count(tdata->tg_newtable); + enr->reldata = tdata->tg_newtable; + rc = SPI_register_relation(enr); + if (rc != SPI_OK_REL_REGISTER) + return rc; + } + + if (tdata->tg_oldtable) + { + EphemeralNamedRelation enr = + palloc(sizeof(EphemeralNamedRelationData)); + int rc; + + enr->md.name = tdata->tg_trigger->tgoldtable; + enr->md.reliddesc = tdata->tg_relation->rd_id; + enr->md.tupdesc = NULL; + enr->md.enrtype = ENR_NAMED_TUPLESTORE; + enr->md.enrtuples = tuplestore_tuple_count(tdata->tg_oldtable); + enr->reldata = tdata->tg_oldtable; + rc = SPI_register_relation(enr); + if (rc != SPI_OK_REL_REGISTER) + return rc; + } + + return SPI_OK_TD_REGISTER; +} diff --git a/src/backend/executor/tqueue.c b/src/backend/executor/tqueue.c new file mode 100644 index 0000000..7af9fbe --- /dev/null +++ b/src/backend/executor/tqueue.c @@ -0,0 +1,210 @@ +/*------------------------------------------------------------------------- + * + * tqueue.c + * Use shm_mq to send & receive tuples between parallel backends + * + * A DestReceiver of type DestTupleQueue, which is a TQueueDestReceiver + * under the hood, writes tuples from the executor to a shm_mq. + * + * A TupleQueueReader reads tuples from a shm_mq and returns the tuples. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/tqueue.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/tqueue.h" + +/* + * DestReceiver object's private contents + * + * queue is a pointer to data supplied by DestReceiver's caller. + */ +typedef struct TQueueDestReceiver +{ + DestReceiver pub; /* public fields */ + shm_mq_handle *queue; /* shm_mq to send to */ +} TQueueDestReceiver; + +/* + * TupleQueueReader object's private contents + * + * queue is a pointer to data supplied by reader's caller. + * + * "typedef struct TupleQueueReader TupleQueueReader" is in tqueue.h + */ +struct TupleQueueReader +{ + shm_mq_handle *queue; /* shm_mq to receive from */ +}; + +/* + * Receive a tuple from a query, and send it to the designated shm_mq. + * + * Returns true if successful, false if shm_mq has been detached. + */ +static bool +tqueueReceiveSlot(TupleTableSlot *slot, DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self; + MinimalTuple tuple; + shm_mq_result result; + bool should_free; + + /* Send the tuple itself. */ + tuple = ExecFetchSlotMinimalTuple(slot, &should_free); + result = shm_mq_send(tqueue->queue, tuple->t_len, tuple, false); + + if (should_free) + pfree(tuple); + + /* Check for failure. 
*/ + if (result == SHM_MQ_DETACHED) + return false; + else if (result != SHM_MQ_SUCCESS) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not send tuple to shared-memory queue"))); + + return true; +} + +/* + * Prepare to receive tuples from executor. + */ +static void +tqueueStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + /* do nothing */ +} + +/* + * Clean up at end of an executor run + */ +static void +tqueueShutdownReceiver(DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self; + + if (tqueue->queue != NULL) + shm_mq_detach(tqueue->queue); + tqueue->queue = NULL; +} + +/* + * Destroy receiver when done with it + */ +static void +tqueueDestroyReceiver(DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *) self; + + /* We probably already detached from queue, but let's be sure */ + if (tqueue->queue != NULL) + shm_mq_detach(tqueue->queue); + pfree(self); +} + +/* + * Create a DestReceiver that writes tuples to a tuple queue. + */ +DestReceiver * +CreateTupleQueueDestReceiver(shm_mq_handle *handle) +{ + TQueueDestReceiver *self; + + self = (TQueueDestReceiver *) palloc0(sizeof(TQueueDestReceiver)); + + self->pub.receiveSlot = tqueueReceiveSlot; + self->pub.rStartup = tqueueStartupReceiver; + self->pub.rShutdown = tqueueShutdownReceiver; + self->pub.rDestroy = tqueueDestroyReceiver; + self->pub.mydest = DestTupleQueue; + self->queue = handle; + + return (DestReceiver *) self; +} + +/* + * Create a tuple queue reader. + */ +TupleQueueReader * +CreateTupleQueueReader(shm_mq_handle *handle) +{ + TupleQueueReader *reader = palloc0(sizeof(TupleQueueReader)); + + reader->queue = handle; + + return reader; +} + +/* + * Destroy a tuple queue reader. + * + * Note: cleaning up the underlying shm_mq is the caller's responsibility. + * We won't access it here, as it may be detached already. + */ +void +DestroyTupleQueueReader(TupleQueueReader *reader) +{ + pfree(reader); +} + +/* + * Fetch a tuple from a tuple queue reader. + * + * The return value is NULL if there are no remaining tuples or if + * nowait = true and no tuple is ready to return. *done, if not NULL, + * is set to true when there are no remaining tuples and otherwise to false. + * + * The returned tuple, if any, is either in shared memory or a private buffer + * and should not be freed. The pointer is invalid after the next call to + * TupleQueueReaderNext(). + * + * Even when shm_mq_receive() returns SHM_MQ_WOULD_BLOCK, this can still + * accumulate bytes from a partially-read message, so it's useful to call + * this with nowait = true even if nothing is returned. + */ +MinimalTuple +TupleQueueReaderNext(TupleQueueReader *reader, bool nowait, bool *done) +{ + MinimalTuple tuple; + shm_mq_result result; + Size nbytes; + void *data; + + if (done != NULL) + *done = false; + + /* Attempt to read a message. */ + result = shm_mq_receive(reader->queue, &nbytes, &data, nowait); + + /* If queue is detached, set *done and return NULL. */ + if (result == SHM_MQ_DETACHED) + { + if (done != NULL) + *done = true; + return NULL; + } + + /* In non-blocking mode, bail out if no message ready yet. */ + if (result == SHM_MQ_WOULD_BLOCK) + return NULL; + Assert(result == SHM_MQ_SUCCESS); + + /* + * Return a pointer to the queue memory directly (which had better be + * sufficiently aligned). 
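A minimal consumer-side sketch of the contract described in this function's header comment (loosely modeled on how Gather drains its workers; latch handling is elided and the helper name is hypothetical):

#include "postgres.h"
#include "access/htup_details.h"
#include "executor/tqueue.h"

/* Hypothetical: drain one tuple queue until the writer detaches */
static void
my_drain_queue(TupleQueueReader *reader)
{
	for (;;)
	{
		bool		done;
		MinimalTuple tup = TupleQueueReaderNext(reader, true, &done);

		if (done)
			break;				/* writer detached: no more tuples */
		if (tup == NULL)
			continue;			/* nothing ready; a real caller would wait on
								 * its latch here rather than spin */
		/* process tup; it is not ours to pfree */
	}
}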
+ */ + tuple = (MinimalTuple) data; + Assert(tuple->t_len == nbytes); + + return tuple; +} diff --git a/src/backend/executor/tstoreReceiver.c b/src/backend/executor/tstoreReceiver.c new file mode 100644 index 0000000..e07664f --- /dev/null +++ b/src/backend/executor/tstoreReceiver.c @@ -0,0 +1,283 @@ +/*------------------------------------------------------------------------- + * + * tstoreReceiver.c + * An implementation of DestReceiver that stores the result tuples in + * a Tuplestore. + * + * Optionally, we can force detoasting (but not decompression) of out-of-line + * toasted values. This is to support cursors WITH HOLD, which must retain + * data even if the underlying table is dropped. + * + * Also optionally, we can apply a tuple conversion map before storing. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/tstoreReceiver.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/tupconvert.h" +#include "executor/tstoreReceiver.h" + + +typedef struct +{ + DestReceiver pub; + /* parameters: */ + Tuplestorestate *tstore; /* where to put the data */ + MemoryContext cxt; /* context containing tstore */ + bool detoast; /* were we told to detoast? */ + TupleDesc target_tupdesc; /* target tupdesc, or NULL if none */ + const char *map_failure_msg; /* tupdesc mapping failure message */ + /* workspace: */ + Datum *outvalues; /* values array for result tuple */ + Datum *tofree; /* temp values to be pfree'd */ + TupleConversionMap *tupmap; /* conversion map, if needed */ + TupleTableSlot *mapslot; /* slot for mapped tuples */ +} TStoreState; + + +static bool tstoreReceiveSlot_notoast(TupleTableSlot *slot, DestReceiver *self); +static bool tstoreReceiveSlot_detoast(TupleTableSlot *slot, DestReceiver *self); +static bool tstoreReceiveSlot_tupmap(TupleTableSlot *slot, DestReceiver *self); + + +/* + * Prepare to receive tuples from executor. 
+ */ +static void +tstoreStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + TStoreState *myState = (TStoreState *) self; + bool needtoast = false; + int natts = typeinfo->natts; + int i; + + /* Check if any columns require detoast work */ + if (myState->detoast) + { + for (i = 0; i < natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(typeinfo, i); + + if (attr->attisdropped) + continue; + if (attr->attlen == -1) + { + needtoast = true; + break; + } + } + } + + /* Check if tuple conversion is needed */ + if (myState->target_tupdesc) + myState->tupmap = convert_tuples_by_position(typeinfo, + myState->target_tupdesc, + myState->map_failure_msg); + else + myState->tupmap = NULL; + + /* Set up appropriate callback */ + if (needtoast) + { + Assert(!myState->tupmap); + myState->pub.receiveSlot = tstoreReceiveSlot_detoast; + /* Create workspace */ + myState->outvalues = (Datum *) + MemoryContextAlloc(myState->cxt, natts * sizeof(Datum)); + myState->tofree = (Datum *) + MemoryContextAlloc(myState->cxt, natts * sizeof(Datum)); + myState->mapslot = NULL; + } + else if (myState->tupmap) + { + myState->pub.receiveSlot = tstoreReceiveSlot_tupmap; + myState->outvalues = NULL; + myState->tofree = NULL; + myState->mapslot = MakeSingleTupleTableSlot(myState->target_tupdesc, + &TTSOpsVirtual); + } + else + { + myState->pub.receiveSlot = tstoreReceiveSlot_notoast; + myState->outvalues = NULL; + myState->tofree = NULL; + myState->mapslot = NULL; + } +} + +/* + * Receive a tuple from the executor and store it in the tuplestore. + * This is for the easy case where we don't have to detoast nor map anything. + */ +static bool +tstoreReceiveSlot_notoast(TupleTableSlot *slot, DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + + tuplestore_puttupleslot(myState->tstore, slot); + + return true; +} + +/* + * Receive a tuple from the executor and store it in the tuplestore. + * This is for the case where we have to detoast any toasted values. + */ +static bool +tstoreReceiveSlot_detoast(TupleTableSlot *slot, DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + TupleDesc typeinfo = slot->tts_tupleDescriptor; + int natts = typeinfo->natts; + int nfree; + int i; + MemoryContext oldcxt; + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + /* + * Fetch back any out-of-line datums. We build the new datums array in + * myState->outvalues[] (but we can re-use the slot's isnull array). Also, + * remember the fetched values to free afterwards. + */ + nfree = 0; + for (i = 0; i < natts; i++) + { + Datum val = slot->tts_values[i]; + Form_pg_attribute attr = TupleDescAttr(typeinfo, i); + + if (!attr->attisdropped && attr->attlen == -1 && !slot->tts_isnull[i]) + { + if (VARATT_IS_EXTERNAL(DatumGetPointer(val))) + { + val = PointerGetDatum(detoast_external_attr((struct varlena *) + DatumGetPointer(val))); + myState->tofree[nfree++] = val; + } + } + + myState->outvalues[i] = val; + } + + /* + * Push the modified tuple into the tuplestore. + */ + oldcxt = MemoryContextSwitchTo(myState->cxt); + tuplestore_putvalues(myState->tstore, typeinfo, + myState->outvalues, slot->tts_isnull); + MemoryContextSwitchTo(oldcxt); + + /* And release any temporary detoasted values */ + for (i = 0; i < nfree; i++) + pfree(DatumGetPointer(myState->tofree[i])); + + return true; +} + +/* + * Receive a tuple from the executor and store it in the tuplestore. + * This is for the case where we must apply a tuple conversion map. 
+ */ +static bool +tstoreReceiveSlot_tupmap(TupleTableSlot *slot, DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + + execute_attr_map_slot(myState->tupmap->attrMap, slot, myState->mapslot); + tuplestore_puttupleslot(myState->tstore, myState->mapslot); + + return true; +} + +/* + * Clean up at end of an executor run + */ +static void +tstoreShutdownReceiver(DestReceiver *self) +{ + TStoreState *myState = (TStoreState *) self; + + /* Release workspace if any */ + if (myState->outvalues) + pfree(myState->outvalues); + myState->outvalues = NULL; + if (myState->tofree) + pfree(myState->tofree); + myState->tofree = NULL; + if (myState->tupmap) + free_conversion_map(myState->tupmap); + myState->tupmap = NULL; + if (myState->mapslot) + ExecDropSingleTupleTableSlot(myState->mapslot); + myState->mapslot = NULL; +} + +/* + * Destroy receiver when done with it + */ +static void +tstoreDestroyReceiver(DestReceiver *self) +{ + pfree(self); +} + +/* + * Initially create a DestReceiver object. + */ +DestReceiver * +CreateTuplestoreDestReceiver(void) +{ + TStoreState *self = (TStoreState *) palloc0(sizeof(TStoreState)); + + self->pub.receiveSlot = tstoreReceiveSlot_notoast; /* might change */ + self->pub.rStartup = tstoreStartupReceiver; + self->pub.rShutdown = tstoreShutdownReceiver; + self->pub.rDestroy = tstoreDestroyReceiver; + self->pub.mydest = DestTuplestore; + + /* private fields will be set by SetTuplestoreDestReceiverParams */ + + return (DestReceiver *) self; +} + +/* + * Set parameters for a TuplestoreDestReceiver + * + * tStore: where to store the tuples + * tContext: memory context containing tStore + * detoast: forcibly detoast contained data? + * target_tupdesc: if not NULL, forcibly convert tuples to this rowtype + * map_failure_msg: error message to use if mapping to target_tupdesc fails + * + * We don't currently support both detoast and target_tupdesc at the same + * time, just because no existing caller needs that combination. + */ +void +SetTuplestoreDestReceiverParams(DestReceiver *self, + Tuplestorestate *tStore, + MemoryContext tContext, + bool detoast, + TupleDesc target_tupdesc, + const char *map_failure_msg) +{ + TStoreState *myState = (TStoreState *) self; + + Assert(!(detoast && target_tupdesc)); + + Assert(myState->pub.mydest == DestTuplestore); + myState->tstore = tStore; + myState->cxt = tContext; + myState->detoast = detoast; + myState->target_tupdesc = target_tupdesc; + myState->map_failure_msg = map_failure_msg; +} -- cgit v1.2.3
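To round out the receiver's API, a minimal sketch (helper name hypothetical) of creating and configuring a tuplestore DestReceiver with the parameters documented above, here with forced detoasting and tuple conversion both disabled:

#include "postgres.h"
#include "executor/tstoreReceiver.h"
#include "utils/tuplestore.h"

/* Hypothetical: capture query output into an existing tuplestore */
static DestReceiver *
my_make_tstore_dest(Tuplestorestate *store, MemoryContext cxt)
{
	DestReceiver *dest = CreateTuplestoreDestReceiver();

	/* no forced detoasting, no tuple conversion */
	SetTuplestoreDestReceiverParams(dest, store, cxt, false, NULL, NULL);
	return dest;
}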