summary | refs | log | tree | commit | diff | stats
path: root/src/backend/parser/parser.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/backend/parser/parser.c 501
1 files changed, 501 insertions, 0 deletions
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
new file mode 100644
index 0000000..875de7b
--- /dev/null
+++ b/src/backend/parser/parser.c
@@ -0,0 +1,501 @@
+/*-------------------------------------------------------------------------
+ *
+ * parser.c
+ * Main entry point/driver for PostgreSQL grammar
+ *
+ * Note that the grammar is not allowed to perform any table access
+ * (since we need to be able to do basic parsing even while inside an
+ * aborted transaction). Therefore, the data structures returned by
+ * the grammar are "raw" parsetrees that still need to be analyzed by
+ * analyze.c and related files.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/parser/parser.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "mb/pg_wchar.h"
+#include "parser/gramparse.h"
+#include "parser/parser.h"
+#include "parser/scansup.h"
+
+static bool check_uescapechar(unsigned char escape); /* is 'escape' usable with UESCAPE? */
+static char *str_udeescape(const char *str, char escape,
+ int position, core_yyscan_t yyscanner); /* expands Unicode escapes in str */
+
+
+/*
+ * raw_parser
+ *		Run the lexer and the grammar over a query given in string form.
+ *
+ * The result is a list of raw (un-analyzed) parse trees whose contents have
+ * the form required by the given RawParseMode; NIL if the grammar fails.
+ */
+List *
+raw_parser(const char *str, RawParseMode mode)
+{
+	/* token pre-loaded as lookahead for each mode; indexed by RawParseMode */
+	static const int mode_token[] = {
+		0,						/* RAW_PARSE_DEFAULT */
+		MODE_TYPE_NAME,			/* RAW_PARSE_TYPE_NAME */
+		MODE_PLPGSQL_EXPR,		/* RAW_PARSE_PLPGSQL_EXPR */
+		MODE_PLPGSQL_ASSIGN1,	/* RAW_PARSE_PLPGSQL_ASSIGN1 */
+		MODE_PLPGSQL_ASSIGN2,	/* RAW_PARSE_PLPGSQL_ASSIGN2 */
+		MODE_PLPGSQL_ASSIGN3	/* RAW_PARSE_PLPGSQL_ASSIGN3 */
+	};
+	base_yy_extra_type extra;
+	core_yyscan_t scanner;
+	int			parse_status;
+
+	/* set up the flex scanner */
+	scanner = scanner_init(str, &extra.core_yy_extra,
+						   &ScanKeywords, ScanKeywordTokens);
+
+	/*
+	 * Only the lookahead fields need initializing for base_yylex(): every
+	 * mode but the default one starts with its mode token already buffered.
+	 */
+	extra.have_lookahead = (mode != RAW_PARSE_DEFAULT);
+	if (extra.have_lookahead)
+	{
+		extra.lookahead_token = mode_token[mode];
+		extra.lookahead_yylloc = 0;
+		extra.lookahead_end = NULL;
+	}
+
+	/* set up the bison parser */
+	parser_init(&extra);
+
+	/* run the grammar */
+	parse_status = base_yyparse(scanner);
+
+	/* release the scanner's memory */
+	scanner_finish(scanner);
+
+	/* a nonzero status from the parser means failure */
+	if (parse_status != 0)
+		return NIL;
+	return extra.parsetree;
+}
+
+
+/*
+ * Intermediate filter between parser and core lexer (core_yylex in scan.l).
+ *
+ * This filter is needed because in some cases the standard SQL grammar
+ * requires more than one token lookahead. We reduce these cases to one-token
+ * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
+ *
+ * Using a filter is simpler than trying to recognize multiword tokens
+ * directly in scan.l, because we'd have to allow for comments between the
+ * words. Furthermore it's not clear how to do that without re-introducing
+ * scanner backtrack, which would cost more performance than this filter
+ * layer does.
+ *
+ * We also use this filter to convert UIDENT and USCONST sequences into
+ * plain IDENT and SCONST tokens. While that could be handled by additional
+ * productions in the main grammar, it's more efficient to do it like this.
+ *
+ * The filter also provides a convenient place to translate between
+ * the core_YYSTYPE and YYSTYPE representations (which are really the
+ * same thing anyway, but notationally they're different).
+ */
+int
+base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
+{
+ base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner); /* holds the lookahead buffer */
+ int cur_token; /* token code to return, possibly replaced below */
+ int next_token; /* token fetched ahead of cur_token */
+ int cur_token_length; /* length of cur_token's text in scanbuf */
+ YYLTYPE cur_yylloc; /* saved copy of *llocp for cur_token */
+
+ /* Get next token --- an earlier call may have buffered it as lookahead */
+ if (yyextra->have_lookahead)
+ {
+ cur_token = yyextra->lookahead_token;
+ lvalp->core_yystype = yyextra->lookahead_yylval;
+ *llocp = yyextra->lookahead_yylloc;
+ if (yyextra->lookahead_end)
+ *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; /* undo our '\0' */
+ yyextra->have_lookahead = false; /* buffer is consumed */
+ }
+ else
+ cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
+
+ /*
+ * If this token isn't one that requires lookahead, just return it. If it
+ * does, determine the token length. (We could get that via strlen(), but
+ * since we have such a small set of possibilities, hardwiring seems
+ * feasible and more efficient --- at least for the fixed-length cases.)
+ */
+ switch (cur_token)
+ {
+ case NOT:
+ cur_token_length = 3;
+ break;
+ case NULLS_P:
+ cur_token_length = 5;
+ break;
+ case WITH:
+ cur_token_length = 4;
+ break;
+ case UIDENT:
+ case USCONST: /* variable length, so measure the '\0'-terminated text */
+ cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
+ break;
+ default:
+ return cur_token; /* no lookahead needed */
+ }
+
+ /*
+ * Identify end+1 of current token. core_yylex() has temporarily stored a
+ * '\0' here, and will undo that when we call it again. We need to redo
+ * it to fully revert the lookahead call for error reporting purposes.
+ */
+ yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
+ *llocp + cur_token_length;
+ Assert(*(yyextra->lookahead_end) == '\0');
+
+ /*
+ * Save and restore *llocp around the call. It might look like we could
+ * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
+ * does not work because flex actually holds onto the last-passed pointer
+ * internally, and will use that for error reporting. We need any error
+ * reports to point to the current token, not the next one.
+ */
+ cur_yylloc = *llocp;
+
+ /* Get next token, saving outputs into lookahead variables */
+ next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
+ yyextra->lookahead_token = next_token;
+ yyextra->lookahead_yylloc = *llocp; /* location of the lookahead token */
+
+ *llocp = cur_yylloc; /* caller sees the current token's location */
+
+ /* Now revert the un-truncation of the current token */
+ yyextra->lookahead_hold_char = *(yyextra->lookahead_end); /* keep the real char */
+ *(yyextra->lookahead_end) = '\0';
+
+ yyextra->have_lookahead = true; /* next call returns the buffered token */
+
+ /* Replace cur_token if needed, based on lookahead */
+ switch (cur_token)
+ {
+ case NOT:
+ /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
+ switch (next_token)
+ {
+ case BETWEEN:
+ case IN_P:
+ case LIKE:
+ case ILIKE:
+ case SIMILAR:
+ cur_token = NOT_LA;
+ break;
+ }
+ break;
+
+ case NULLS_P:
+ /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
+ switch (next_token)
+ {
+ case FIRST_P:
+ case LAST_P:
+ cur_token = NULLS_LA;
+ break;
+ }
+ break;
+
+ case WITH:
+ /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
+ switch (next_token)
+ {
+ case TIME:
+ case ORDINALITY:
+ cur_token = WITH_LA;
+ break;
+ }
+ break;
+
+ case UIDENT:
+ case USCONST:
+ /* Look ahead for UESCAPE, which supplies a custom escape character */
+ if (next_token == UESCAPE)
+ {
+ /* Yup, so get third token, which had better be SCONST */
+ const char *escstr; /* text of the third token */
+
+ /* Again save and restore *llocp */
+ cur_yylloc = *llocp;
+
+ /* Un-truncate current token so errors point to third token */
+ *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
+
+ /* Get third token */
+ next_token = core_yylex(&(yyextra->lookahead_yylval),
+ llocp, yyscanner);
+
+ /* If we throw error here, it will point to third token */
+ if (next_token != SCONST)
+ scanner_yyerror("UESCAPE must be followed by a simple string literal",
+ yyscanner);
+
+ escstr = yyextra->lookahead_yylval.str;
+ if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) /* exactly one legal char */
+ scanner_yyerror("invalid Unicode escape character",
+ yyscanner);
+
+ /* Now restore *llocp; errors will point to first token */
+ *llocp = cur_yylloc;
+
+ /* Apply Unicode conversion */
+ lvalp->core_yystype.str =
+ str_udeescape(lvalp->core_yystype.str,
+ escstr[0],
+ *llocp,
+ yyscanner);
+
+ /*
+ * We don't need to revert the un-truncation of UESCAPE. What
+ * we do want to do is clear have_lookahead, thereby consuming
+ * all three tokens.
+ */
+ yyextra->have_lookahead = false;
+ }
+ else
+ {
+ /* No UESCAPE, so convert using default escape character */
+ lvalp->core_yystype.str =
+ str_udeescape(lvalp->core_yystype.str,
+ '\\',
+ *llocp,
+ yyscanner);
+ }
+
+ if (cur_token == UIDENT)
+ {
+ /* It's an identifier, so truncate as appropriate */
+ truncate_identifier(lvalp->core_yystype.str,
+ strlen(lvalp->core_yystype.str),
+ true);
+ cur_token = IDENT; /* hand back a plain identifier token */
+ }
+ else if (cur_token == USCONST)
+ {
+ cur_token = SCONST; /* hand back a plain string constant token */
+ }
+ break;
+ }
+
+ return cur_token;
+}
+
+/* Translate one hex digit (the caller should already have validated it) to its value. */
+static unsigned int
+hexval(unsigned char c)
+{
+	if ('0' <= c && c <= '9')
+		return c - '0';
+	else if ('a' <= c && c <= 'f')
+		return 10 + (c - 'a');
+	else if ('A' <= c && c <= 'F')
+		return 10 + (c - 'A');
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* Raise a syntax error unless "c" is a valid Unicode code point. */
+static void
+check_unicode_value(pg_wchar c)
+{
+	if (is_valid_unicode_codepoint(c))
+		return;					/* acceptable, nothing to report */
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid Unicode escape value")));
+}
+
+/* Report whether 'escape' may serve as the Unicode escape character (UESCAPE syntax). */
+static bool
+check_uescapechar(unsigned char escape)
+{
+	/* hexadecimal digits are never acceptable */
+	if (isxdigit(escape))
+		return false;
+	/* neither are the plus sign, the single quote, or the double quote */
+	if (escape == '+' || escape == '\'' || escape == '"')
+		return false;
+	/* finally, anything the scanner treats as whitespace is rejected */
+	return !scanner_isspace(escape);
+}
+
+/*
+ * Process Unicode escapes in "str", returning a palloc'd plain string.
+ * Invalid escapes and bad surrogate pairs are reported via ereport(ERROR).
+ * escape: the escape character to use
+ * position: start position of U&'' or U&"" string token
+ * yyscanner: context information needed for error reports
+ */
+static char *
+str_udeescape(const char *str, char escape,
+ int position, core_yyscan_t yyscanner)
+{
+ const char *in; /* read cursor into str */
+ char *new, /* result buffer */
+ *out; /* write cursor into new */
+ size_t new_len; /* allocated size of new */
+ pg_wchar pair_first = 0; /* pending first surrogate, or 0 if none */
+ ScannerCallbackState scbstate; /* state for the error-position callback */
+
+ /*
+ * Guesstimate that result will be no longer than input, but allow enough
+ * padding for Unicode conversion.
+ */
+ new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
+ new = palloc(new_len);
+
+ in = str;
+ out = new;
+ while (*in)
+ {
+ /* Enlarge string if needed, keeping room for one more conversion */
+ size_t out_dist = out - new; /* bytes written so far */
+
+ if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
+ {
+ new_len *= 2;
+ new = repalloc(new, new_len);
+ out = new + out_dist; /* re-derive cursor from the new buffer address */
+ }
+
+ if (in[0] == escape)
+ {
+ /*
+ * Any errors reported while processing this escape sequence will
+ * have an error cursor pointing at the escape.
+ */
+ setup_scanner_errposition_callback(&scbstate, yyscanner,
+ in - str + position + 3); /* 3 for U&" */
+ if (in[1] == escape) /* doubled escape stands for one literal escape char */
+ {
+ if (pair_first)
+ goto invalid_pair;
+ *out++ = escape;
+ in += 2; /* skip both escape characters */
+ }
+ else if (isxdigit((unsigned char) in[1]) &&
+ isxdigit((unsigned char) in[2]) &&
+ isxdigit((unsigned char) in[3]) &&
+ isxdigit((unsigned char) in[4]))
+ {
+ pg_wchar unicode; /* value of the 4-hex-digit form */
+
+ unicode = (hexval(in[1]) << 12) +
+ (hexval(in[2]) << 8) +
+ (hexval(in[3]) << 4) +
+ hexval(in[4]);
+ check_unicode_value(unicode);
+ if (pair_first)
+ {
+ if (is_utf16_surrogate_second(unicode))
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0; /* pair is complete */
+ }
+ else
+ goto invalid_pair; /* first surrogate not followed by a second */
+ }
+ else if (is_utf16_surrogate_second(unicode))
+ goto invalid_pair; /* second surrogate without a first */
+
+ if (is_utf16_surrogate_first(unicode))
+ pair_first = unicode; /* hold it until the second half arrives */
+ else
+ {
+ pg_unicode_to_server(unicode, (unsigned char *) out);
+ out += strlen(out); /* advance to the end of what was just written */
+ }
+ in += 5; /* escape char plus 4 hex digits */
+ }
+ else if (in[1] == '+' &&
+ isxdigit((unsigned char) in[2]) &&
+ isxdigit((unsigned char) in[3]) &&
+ isxdigit((unsigned char) in[4]) &&
+ isxdigit((unsigned char) in[5]) &&
+ isxdigit((unsigned char) in[6]) &&
+ isxdigit((unsigned char) in[7]))
+ {
+ pg_wchar unicode; /* value of the '+' 6-hex-digit form */
+
+ unicode = (hexval(in[2]) << 20) +
+ (hexval(in[3]) << 16) +
+ (hexval(in[4]) << 12) +
+ (hexval(in[5]) << 8) +
+ (hexval(in[6]) << 4) +
+ hexval(in[7]);
+ check_unicode_value(unicode);
+ if (pair_first)
+ {
+ if (is_utf16_surrogate_second(unicode))
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0; /* pair is complete */
+ }
+ else
+ goto invalid_pair; /* first surrogate not followed by a second */
+ }
+ else if (is_utf16_surrogate_second(unicode))
+ goto invalid_pair; /* second surrogate without a first */
+
+ if (is_utf16_surrogate_first(unicode))
+ pair_first = unicode; /* hold it until the second half arrives */
+ else
+ {
+ pg_unicode_to_server(unicode, (unsigned char *) out);
+ out += strlen(out); /* advance to the end of what was just written */
+ }
+ in += 8; /* escape char, '+', and 6 hex digits */
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("invalid Unicode escape"),
+ errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+
+ cancel_scanner_errposition_callback(&scbstate);
+ }
+ else
+ {
+ if (pair_first)
+ goto invalid_pair; /* a plain character cannot complete a pair */
+
+ *out++ = *in++; /* ordinary character, copied unchanged */
+ }
+ }
+
+ /* unfinished surrogate pair? */
+ if (pair_first)
+ goto invalid_pair;
+
+ *out = '\0';
+ return new;
+
+ /*
+ * We might get here with the error callback active, or not. Call
+ * scanner_errposition to make sure an error cursor appears; if the
+ * callback is active, this is duplicative but harmless.
+ */
+invalid_pair:
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("invalid Unicode surrogate pair"),
+ scanner_errposition(in - str + position + 3, /* 3 for U&" */
+ yyscanner)));
+ return NULL; /* keep compiler quiet */
+}