summaryrefslogtreecommitdiffstats
path: root/src/pl/plpgsql/src/pl_scanner.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/pl/plpgsql/src/pl_scanner.c')
-rw-r--r--src/pl/plpgsql/src/pl_scanner.c620
1 files changed, 620 insertions, 0 deletions
diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c
new file mode 100644
index 0000000..e4c7a91
--- /dev/null
+++ b/src/pl/plpgsql/src/pl_scanner.c
@@ -0,0 +1,620 @@
+/*-------------------------------------------------------------------------
+ *
+ * pl_scanner.c
+ * lexical scanning for PL/pgSQL
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/pl/plpgsql/src/pl_scanner.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "mb/pg_wchar.h"
+#include "parser/scanner.h"
+
+#include "plpgsql.h"
+#include "pl_gram.h" /* must be after parser/scanner.h */
+
+
+/*
+ * Klugy flag to tell scanner how to look up identifiers.
+ *
+ * This is a global (non-static) variable.  plpgsql_scanner_init() resets it
+ * to IDENTIFIER_LOOKUP_NORMAL at the start of each scan.
+ */
+IdentifierLookup plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
+
+/*
+ * A word about keywords:
+ *
+ * We keep reserved and unreserved keywords in separate headers. Be careful
+ * not to put the same word in both headers. Also be sure that pl_gram.y's
+ * unreserved_keyword production agrees with the unreserved header. The
+ * reserved keywords are passed to the core scanner, so they will be
+ * recognized before (and instead of) any variable name. Unreserved words
+ * are checked for separately, usually after determining that the identifier
+ * isn't a known variable name. If plpgsql_IdentifierLookup is DECLARE then
+ * no variable names will be recognized, so the unreserved words always work.
+ * (Note in particular that this helps us avoid reserving keywords that are
+ * only needed in DECLARE sections.)
+ *
+ * In certain contexts it is desirable to prefer recognizing an unreserved
+ * keyword over recognizing a variable name. In particular, at the start
+ * of a statement we should prefer unreserved keywords unless the statement
+ * looks like an assignment (i.e., first token is followed by ':=', '=',
+ * or '[').
+ * This rule allows most statement-introducing keywords to be kept unreserved.
+ * (We still have to reserve initial keywords that might follow a block
+ * label, unfortunately, since the method used to determine if we are at
+ * start of statement doesn't recognize such cases. We'd also have to
+ * reserve any keyword that could legitimately be followed by ':=', '=',
+ * or '['.)
+ * Some additional cases are handled in pl_gram.y using tok_is_keyword().
+ *
+ * We try to avoid reserving more keywords than we have to; but there's
+ * little point in not reserving a word if it's reserved in the core grammar.
+ * Currently, the following words are reserved here but not in the core:
+ * BEGIN BY DECLARE EXECUTE FOREACH IF LOOP STRICT WHILE
+ */
+
+/* ScanKeywordList lookup data for PL/pgSQL keywords */
+#include "pl_reserved_kwlist_d.h"
+#include "pl_unreserved_kwlist_d.h"
+
+/*
+ * Token codes for PL/pgSQL keywords
+ *
+ * While the kwlist headers are included below, PG_KEYWORD(kwname, value)
+ * expands to just "value,", so each array ends up holding one token code
+ * per keyword entry.  The arrays are indexed by the keyword number that
+ * ScanKeywordLookup() returns (see its use in plpgsql_yylex()).
+ */
+#define PG_KEYWORD(kwname, value) value,
+
+static const uint16 ReservedPLKeywordTokens[] = {
+#include "pl_reserved_kwlist.h"
+};
+
+static const uint16 UnreservedPLKeywordTokens[] = {
+#include "pl_unreserved_kwlist.h"
+};
+
+#undef PG_KEYWORD
+
+/*
+ * This macro must recognize all tokens that can immediately precede a
+ * PL/pgSQL executable statement (that is, proc_sect or proc_stmt in the
+ * grammar). Fortunately, there are not very many, so hard-coding in this
+ * fashion seems sufficient.
+ *
+ * The argument is the code of the previously returned token; the macro
+ * yields nonzero if that token is ';', BEGIN, THEN, ELSE or LOOP.  Note
+ * that (prev_token) can be evaluated up to five times, so it must be an
+ * expression without side effects.
+ */
+#define AT_STMT_START(prev_token) \
+ ((prev_token) == ';' || \
+ (prev_token) == K_BEGIN || \
+ (prev_token) == K_THEN || \
+ (prev_token) == K_ELSE || \
+ (prev_token) == K_LOOP)
+
+
+/*
+ * Auxiliary data about a token (other than the token type).
+ *
+ * One of these is saved next to the token code whenever a token is pushed
+ * back, and its fields are copied into plpgsql_yylval, plpgsql_yylloc and
+ * plpgsql_yyleng when plpgsql_yylex() hands the token to the grammar.
+ */
+typedef struct
+{
+ YYSTYPE lval; /* semantic information */
+ YYLTYPE lloc; /* offset in scanbuf */
+ int leng; /* length in bytes */
+} TokenAuxData;
+
+/*
+ * Scanner working state. At some point we might wish to fold all this
+ * into a YY_EXTRA struct. For the moment, there is no need for plpgsql's
+ * lexer to be re-entrant, and the notational burden of passing a yyscanner
+ * pointer around is great enough to not want to do it without need.
+ */
+
+/* The stuff the core lexer needs */
+static core_yyscan_t yyscanner = NULL;
+static core_yy_extra_type core_yy;
+
+/*
+ * The original input string (set by plpgsql_scanner_init(), reset to NULL
+ * by plpgsql_scanner_finish())
+ */
+static const char *scanorig;
+
+/* Current token's length (corresponds to plpgsql_yylval and plpgsql_yylloc) */
+static int plpgsql_yyleng;
+
+/*
+ * Current token's code (corresponds to plpgsql_yylval and plpgsql_yylloc).
+ * plpgsql_yylex() also inspects the previous value of this variable to
+ * decide whether the word it is looking at starts a statement.
+ */
+static int plpgsql_yytoken;
+
+/*
+ * Token pushback stack.  push_back_token() stores at index num_pushbacks
+ * and increments it; internal_yylex() decrements it and returns that entry,
+ * so the most recently pushed token is re-read first.
+ */
+#define MAX_PUSHBACKS 4
+
+static int num_pushbacks;
+static int pushback_token[MAX_PUSHBACKS];
+static TokenAuxData pushback_auxdata[MAX_PUSHBACKS];
+
+/*
+ * State for plpgsql_location_to_lineno(): start of the current line, the
+ * newline that ends it (NULL if there is no further newline), and the
+ * current line's number, counting from 1.
+ */
+static const char *cur_line_start;
+static const char *cur_line_end;
+static int cur_line_num;
+
+/* Internal functions */
+static int internal_yylex(TokenAuxData *auxdata);
+static void push_back_token(int token, TokenAuxData *auxdata);
+static void location_lineno_init(void);
+
+
+/*
+ * Resolve a simple (one-part) identifier whose data is already in *aux.
+ *
+ * plpgsql_parse_word() is called unconditionally, even when "lookup" is
+ * false, because it also sets up aux->lval.word for the non-variable cases.
+ * If it reports a variable match the token is T_DATUM.  Otherwise an
+ * unquoted word is tried against the unreserved PL/pgSQL keywords, and if
+ * that fails too the token is T_WORD.
+ */
+static int
+resolve_single_word(TokenAuxData *aux, bool lookup)
+{
+	int			kwnum;
+
+	if (plpgsql_parse_word(aux->lval.str,
+						   core_yy.scanbuf + aux->lloc,
+						   lookup,
+						   &aux->lval.wdatum,
+						   &aux->lval.word))
+		return T_DATUM;
+
+	if (!aux->lval.word.quoted)
+	{
+		kwnum = ScanKeywordLookup(aux->lval.word.ident,
+								  &UnreservedPLKeywords);
+		if (kwnum >= 0)
+		{
+			aux->lval.keyword = GetScanKeyword(kwnum,
+											   &UnreservedPLKeywords);
+			return UnreservedPLKeywordTokens[kwnum];
+		}
+	}
+
+	return T_WORD;
+}
+
+/*
+ * Resolve a two-part name A.B, where A's data is in *aux and B's text is
+ * "second".  The result is T_DATUM if plpgsql_parse_dblword() accepts the
+ * pair, else T_CWORD.
+ */
+static int
+resolve_double_word(TokenAuxData *aux, char *second)
+{
+	if (plpgsql_parse_dblword(aux->lval.str,
+							  second,
+							  &aux->lval.wdatum,
+							  &aux->lval.cword))
+		return T_DATUM;
+
+	return T_CWORD;
+}
+
+/*
+ * yylex entry point used by the PL/pgSQL grammar.
+ *
+ * This sits on top of the core lexer and adds recognition of PL/pgSQL
+ * variables, which come back as T_DATUM tokens.  A word or compound word
+ * that matches no variable (or for which matching is disabled through
+ * plpgsql_IdentifierLookup) is returned as T_WORD or T_CWORD respectively,
+ * except that a simple word matching an unreserved keyword is returned as
+ * that keyword's token.
+ */
+int
+plpgsql_yylex(void)
+{
+	int			tok1;
+	TokenAuxData aux1;
+
+	tok1 = internal_yylex(&aux1);
+
+	/*
+	 * Only IDENT and PARAM can begin a plpgsql variable reference.  Anything
+	 * else is passed through untouched.  That includes T_DATUM, T_CWORD,
+	 * T_WORD and unreserved-keyword tokens that the grammar pushed back
+	 * after an earlier lookup cycle, so a pushed-back token never goes
+	 * through the lookup code a second time.  The same property is what
+	 * makes it safe to consult the old plpgsql_yytoken value in the
+	 * start-of-statement test below.
+	 */
+	if (tok1 == IDENT || tok1 == PARAM)
+	{
+		int			tok2;
+		TokenAuxData aux2;
+
+		tok2 = internal_yylex(&aux2);
+
+		if (tok2 != '.')
+		{
+			bool		lookup;
+
+			/* no dot follows, so this is just A */
+			push_back_token(tok2, &aux2);
+
+			/*
+			 * Look for a variable of this name, unless we are at the start
+			 * of a statement and the following token is neither an
+			 * assignment operator nor '['.  In that situation the word
+			 * cannot validly be a variable, and skipping the lookup lets
+			 * users have variables named like plpgsql or core keywords
+			 * that introduce statements (e.g., "comment").  Otherwise every
+			 * such keyword would effectively be reserved in PL/pgSQL.
+			 */
+			lookup = !AT_STMT_START(plpgsql_yytoken) ||
+				tok2 == '=' ||
+				tok2 == COLON_EQUALS ||
+				tok2 == '[';
+
+			tok1 = resolve_single_word(&aux1, lookup);
+		}
+		else
+		{
+			int			tok3;
+			TokenAuxData aux3;
+
+			tok3 = internal_yylex(&aux3);
+
+			if (tok3 != IDENT)
+			{
+				/* "A ." followed by a non-identifier: treat as just A */
+				push_back_token(tok3, &aux3);
+				push_back_token(tok2, &aux2);
+				tok1 = resolve_single_word(&aux1, true);
+			}
+			else
+			{
+				int			tok4;
+				TokenAuxData aux4;
+
+				tok4 = internal_yylex(&aux4);
+
+				if (tok4 != '.')
+				{
+					/* A.B with nothing dotted after it */
+					push_back_token(tok4, &aux4);
+					tok1 = resolve_double_word(&aux1, aux3.lval.str);
+				}
+				else
+				{
+					int			tok5;
+					TokenAuxData aux5;
+
+					tok5 = internal_yylex(&aux5);
+
+					if (tok5 != IDENT)
+					{
+						/* "A.B ." but no third name: treat as A.B */
+						push_back_token(tok5, &aux5);
+						push_back_token(tok4, &aux4);
+						tok1 = resolve_double_word(&aux1, aux3.lval.str);
+					}
+					else if (plpgsql_parse_tripword(aux1.lval.str,
+													aux3.lval.str,
+													aux5.lval.str,
+													&aux1.lval.wdatum,
+													&aux1.lval.cword))
+						tok1 = T_DATUM;
+					else
+						tok1 = T_CWORD;
+				}
+			}
+		}
+	}
+
+	/* Publish the result as the current token */
+	plpgsql_yytoken = tok1;
+	plpgsql_yyleng = aux1.leng;
+	plpgsql_yylloc = aux1.lloc;
+	plpgsql_yylval = aux1.lval;
+
+	return tok1;
+}
+
+/*
+ * Fetch the next raw token, filling *auxdata with its auxiliary data.
+ *
+ * Tokens waiting on the pushback stack are returned first (most recently
+ * pushed first) exactly as they were saved.  Otherwise the core lexer is
+ * called, and a few single-token translations are applied so that its
+ * core_YYSTYPE results fit what the PL/pgSQL grammar expects.
+ */
+static int
+internal_yylex(TokenAuxData *auxdata)
+{
+	int			token;
+	const char *tokstart;
+
+	/* Anything pushed back takes priority over reading new input */
+	if (num_pushbacks > 0)
+	{
+		int			top = --num_pushbacks;
+
+		*auxdata = pushback_auxdata[top];
+		return pushback_token[top];
+	}
+
+	token = core_yylex(&auxdata->lval.core_yystype,
+					   &auxdata->lloc,
+					   yyscanner);
+
+	/* remember the length of yytext before it gets changed */
+	tokstart = core_yy.scanbuf + auxdata->lloc;
+	auxdata->leng = strlen(tokstart);
+
+	switch (token)
+	{
+		case Op:
+			/* Check for << >> and #, which the core considers operators */
+			if (strcmp(auxdata->lval.str, "<<") == 0)
+				token = LESS_LESS;
+			else if (strcmp(auxdata->lval.str, ">>") == 0)
+				token = GREATER_GREATER;
+			else if (strcmp(auxdata->lval.str, "#") == 0)
+				token = '#';
+			break;
+
+		case PARAM:
+			/* The core returns PARAM as ival, but we treat it like IDENT */
+			auxdata->lval.str = pstrdup(tokstart);
+			break;
+
+		default:
+			break;
+	}
+
+	return token;
+}
+
+/*
+ * Save a token (code plus auxiliary data) on the pushback stack so that
+ * the next internal_yylex() call returns it.  Raises an error if the
+ * stack already holds MAX_PUSHBACKS entries.
+ */
+static void
+push_back_token(int token, TokenAuxData *auxdata)
+{
+	int			slot = num_pushbacks;
+
+	if (slot >= MAX_PUSHBACKS)
+		elog(ERROR, "too many tokens pushed back");
+
+	pushback_auxdata[slot] = *auxdata;
+	pushback_token[slot] = token;
+	num_pushbacks = slot + 1;
+}
+
+/*
+ * Push back one token so that the next plpgsql_yylex() call re-reads it.
+ *
+ * The token is saved together with the current plpgsql_yylval,
+ * plpgsql_yylloc and plpgsql_yyleng values.
+ *
+ * NOTE: yylval and yylloc themselves are not "backed up" by this.  Also,
+ * pushing back a token code other than the one you just read is not a
+ * good idea.
+ */
+void
+plpgsql_push_back_token(int token)
+{
+	TokenAuxData current = {
+		.lval = plpgsql_yylval,
+		.lloc = plpgsql_yylloc,
+		.leng = plpgsql_yyleng
+	};
+
+	push_back_token(token, &current);
+}
+
+/*
+ * Report whether "token" is the token code of an unreserved keyword.
+ *
+ * (For such a token, the keyword's lowercased form was returned as the
+ * token value, so there is no need to hand that data back from here.)
+ */
+bool
+plpgsql_token_is_unreserved_keyword(int token)
+{
+	const uint16 *cur = UnreservedPLKeywordTokens;
+	const uint16 *stop = cur + lengthof(UnreservedPLKeywordTokens);
+
+	/* linear search through the unreserved-keyword token codes */
+	while (cur < stop)
+	{
+		if (*cur == token)
+			return true;
+		cur++;
+	}
+
+	return false;
+}
+
+/*
+ * Add a slice of the function source text to the end of "buf".
+ *
+ * The slice begins at byte offset startlocation and stops just before
+ * endlocation; both are offsets into the original input string.
+ */
+void
+plpgsql_append_source_text(StringInfo buf,
+						   int startlocation, int endlocation)
+{
+	const char *from;
+	int			nbytes;
+
+	Assert(startlocation <= endlocation);
+
+	from = scanorig + startlocation;
+	nbytes = endlocation - startlocation;
+	appendBinaryStringInfo(buf, from, nbytes);
+}
+
+/*
+ * Look at the next token without consuming it.  Just the token code is
+ * returned; auxiliary info such as the location is not made available.
+ *
+ * NB: this performs no variable or unreserved keyword lookup, so those
+ * come back as IDENT.  Reserved keywords are resolved as usual.
+ */
+int
+plpgsql_peek(void)
+{
+	TokenAuxData nextaux;
+	int			nexttok = internal_yylex(&nextaux);
+
+	/* put it straight back so the next read sees it again */
+	push_back_token(nexttok, &nextaux);
+
+	return nexttok;
+}
+
+/*
+ * Look at the next two tokens without consuming them.
+ *
+ * *tok1_p and *tok2_p receive the first and second token codes.  Their
+ * locations in the query are stored in *tok1_loc and *tok2_loc, either of
+ * which may be passed as NULL if not wanted.
+ *
+ * NB: this performs no variable or unreserved keyword lookup, so those
+ * come back as IDENT.  Reserved keywords are resolved as usual.
+ */
+void
+plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc)
+{
+	TokenAuxData firstaux;
+	TokenAuxData secondaux;
+	int			firsttok;
+	int			secondtok;
+
+	firsttok = internal_yylex(&firstaux);
+	secondtok = internal_yylex(&secondaux);
+
+	*tok1_p = firsttok;
+	if (tok1_loc != NULL)
+		*tok1_loc = firstaux.lloc;
+
+	*tok2_p = secondtok;
+	if (tok2_loc != NULL)
+		*tok2_loc = secondaux.lloc;
+
+	/* push back in reverse, so the first token is the next one re-read */
+	push_back_token(secondtok, &secondaux);
+	push_back_token(firsttok, &firstaux);
+}
+
+/*
+ * plpgsql_scanner_errposition
+ *		Attach an error cursor position to the error being reported, if
+ *		the position is known.
+ *
+ * Meant to be called inside an ereport() call.  The return value is only
+ * a dummy (in fact always 0).
+ *
+ * This is usable only for messages emitted during the initial parsing of
+ * a plpgsql function, because it needs the scanorig string to still be
+ * available.
+ */
+int
+plpgsql_scanner_errposition(int location)
+{
+	int			charpos;
+
+	/* nothing to do when the location is unknown */
+	if (scanorig == NULL || location < 0)
+		return 0;
+
+	/* Translate the byte offset into a 1-based character number */
+	charpos = 1 + pg_mbstrlen_with_len(scanorig, location);
+
+	/* Hand the position to the ereport mechanism ... */
+	(void) internalerrposition(charpos);
+
+	/* ... together with the function body string */
+	return internalerrquery(scanorig);
+}
+
+/*
+ * plpgsql_yyerror
+ *		Raise a syntax error on behalf of the lexer or grammar.
+ *
+ * The reported cursor position is that of the current token, i.e. the one
+ * most recently returned by plpgsql_yylex().  That suits syntax errors
+ * coming from the Bison parser, since Bison parsers complain as soon as
+ * the first unparsable token is reached.  Be wary of calling yyerror for
+ * any other purpose: the cursor position may then be misleading!
+ */
+void
+plpgsql_yyerror(const char *message)
+{
+	char	   *curtok = core_yy.scanbuf + plpgsql_yylloc;
+
+	if (*curtok != '\0')
+	{
+		/*
+		 * After any lookahead, flex will have put back the character that
+		 * follows the end of the token.  Overwrite it with a terminator
+		 * again so that only this one token is shown.  scanbuf gets
+		 * modified by this, but we no longer care about that.
+		 */
+		curtok[plpgsql_yyleng] = '\0';
+
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+		/* translator: first %s is typically the translation of "syntax error" */
+				 errmsg("%s at or near \"%s\"", _(message), curtok),
+				 plpgsql_scanner_errposition(plpgsql_yylloc)));
+	}
+	else
+	{
+		/* the current token is empty: we ran off the end of the input */
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+		/* translator: %s is typically the translation of "syntax error" */
+				 errmsg("%s at end of input", _(message)),
+				 plpgsql_scanner_errposition(plpgsql_yylloc)));
+	}
+}
+
+/*
+ * Convert a location (byte offset into the function source text) to a
+ * line number.  Returns 0 if the location is negative or no source text
+ * is currently set.
+ *
+ * Callers are expected to ask about increasing locations most of the
+ * time, so the endpoints of the "current" line are remembered between
+ * calls and the scan normally just moves forward from there.
+ */
+int
+plpgsql_location_to_lineno(int location)
+{
+	const char *target;
+
+	if (scanorig == NULL || location < 0)
+		return 0;				/* garbage in, garbage out */
+
+	target = scanorig + location;
+
+	/* a backwards step is handled correctly, just slowly: rescan from top */
+	if (target < cur_line_start)
+		location_lineno_init();
+
+	/* step over whole lines until the one containing the target */
+	for (;;)
+	{
+		if (cur_line_end == NULL || target <= cur_line_end)
+			break;
+
+		cur_line_num++;
+		cur_line_start = cur_line_end + 1;
+		cur_line_end = strchr(cur_line_start, '\n');
+	}
+
+	return cur_line_num;
+}
+
+/*
+ * Set up (or rewind) the state used by plpgsql_location_to_lineno():
+ * the current line becomes line 1, starting at the beginning of scanorig.
+ */
+static void
+location_lineno_init(void)
+{
+	cur_line_num = 1;
+	cur_line_start = scanorig;
+	cur_line_end = strchr(scanorig, '\n');
+}
+
+/*
+ * return the most recently computed lineno
+ *
+ * This is simply the current value of cur_line_num, i.e. the result of the
+ * latest plpgsql_location_to_lineno() call that advanced or reset the line
+ * tracking state (or 1 if only the initialization has happened so far).
+ */
+int
+plpgsql_latest_lineno(void)
+{
+ return cur_line_num;
+}
+
+
+/*
+ * Prepare the scanner; must be called before any actual parsing is done.
+ *
+ * Note: "str" has to stay valid until plpgsql_scanner_finish() is called.
+ * It is not fed directly to flex, but the original string is needed for
+ * citing in error messages.
+ */
+void
+plpgsql_scanner_init(const char *str)
+{
+	/* Fire up the core scanner, handing it our reserved keywords */
+	yyscanner = scanner_init(str, &core_yy,
+							 &ReservedPLKeywords, ReservedPLKeywordTokens);
+
+	/*
+	 * Keep a pointer to the original string in scanorig.  Unlike the
+	 * scanner's scanbuf, it is not modified on-the-fly by flex.  Note that
+	 * yytext points into scanbuf, yet we count on locations (offsets from
+	 * the start of the string) being applicable to scanorig too.
+	 */
+	scanorig = str;
+
+	/* Reset the remaining scanner state */
+	num_pushbacks = 0;
+	plpgsql_yytoken = 0;
+	plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
+
+	/* line tracking starts over at the top of the new string */
+	location_lineno_init();
+}
+
+/*
+ * Undo plpgsql_scanner_init(); to be called once parsing is done.
+ */
+void
+plpgsql_scanner_finish(void)
+{
+	/* let the core scanner release its storage */
+	scanner_finish(yyscanner);
+
+	/* and make sure no dangling pointers are left behind */
+	scanorig = NULL;
+	yyscanner = NULL;
+}