diff options
Diffstat (limited to 'src/backend/parser/parser.c')
-rw-r--r-- | src/backend/parser/parser.c | 501 |
1 files changed, 501 insertions, 0 deletions
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c new file mode 100644 index 0000000..875de7b --- /dev/null +++ b/src/backend/parser/parser.c @@ -0,0 +1,501 @@ +/*------------------------------------------------------------------------- + * + * parser.c + * Main entry point/driver for PostgreSQL grammar + * + * Note that the grammar is not allowed to perform any table access + * (since we need to be able to do basic parsing even while inside an + * aborted transaction). Therefore, the data structures returned by + * the grammar are "raw" parsetrees that still need to be analyzed by + * analyze.c and related files. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/parser/parser.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "mb/pg_wchar.h" +#include "parser/gramparse.h" +#include "parser/parser.h" +#include "parser/scansup.h" + +static bool check_uescapechar(unsigned char escape); +static char *str_udeescape(const char *str, char escape, + int position, core_yyscan_t yyscanner); + + +/* + * raw_parser + * Given a query in string form, do lexical and grammatical analysis. + * + * Returns a list of raw (un-analyzed) parse trees. The contents of the + * list have the form required by the specified RawParseMode. + */ +List * +raw_parser(const char *str, RawParseMode mode) +{ + core_yyscan_t yyscanner; + base_yy_extra_type yyextra; + int yyresult; + + /* initialize the flex scanner */ + yyscanner = scanner_init(str, &yyextra.core_yy_extra, + &ScanKeywords, ScanKeywordTokens); + + /* base_yylex() only needs us to initialize the lookahead token, if any */ + if (mode == RAW_PARSE_DEFAULT) + yyextra.have_lookahead = false; + else + { + /* this array is indexed by RawParseMode enum */ + static const int mode_token[] = { + 0, /* RAW_PARSE_DEFAULT */ + MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */ + MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */ + MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */ + MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */ + MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */ + }; + + yyextra.have_lookahead = true; + yyextra.lookahead_token = mode_token[mode]; + yyextra.lookahead_yylloc = 0; + yyextra.lookahead_end = NULL; + } + + /* initialize the bison parser */ + parser_init(&yyextra); + + /* Parse! */ + yyresult = base_yyparse(yyscanner); + + /* Clean up (release memory) */ + scanner_finish(yyscanner); + + if (yyresult) /* error */ + return NIL; + + return yyextra.parsetree; +} + + +/* + * Intermediate filter between parser and core lexer (core_yylex in scan.l). + * + * This filter is needed because in some cases the standard SQL grammar + * requires more than one token lookahead. We reduce these cases to one-token + * lookahead by replacing tokens here, in order to keep the grammar LALR(1). + * + * Using a filter is simpler than trying to recognize multiword tokens + * directly in scan.l, because we'd have to allow for comments between the + * words. Furthermore it's not clear how to do that without re-introducing + * scanner backtrack, which would cost more performance than this filter + * layer does. + * + * We also use this filter to convert UIDENT and USCONST sequences into + * plain IDENT and SCONST tokens. While that could be handled by additional + * productions in the main grammar, it's more efficient to do it like this. + * + * The filter also provides a convenient place to translate between + * the core_YYSTYPE and YYSTYPE representations (which are really the + * same thing anyway, but notationally they're different). + */ +int +base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) +{ + base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner); + int cur_token; + int next_token; + int cur_token_length; + YYLTYPE cur_yylloc; + + /* Get next token --- we might already have it */ + if (yyextra->have_lookahead) + { + cur_token = yyextra->lookahead_token; + lvalp->core_yystype = yyextra->lookahead_yylval; + *llocp = yyextra->lookahead_yylloc; + if (yyextra->lookahead_end) + *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; + yyextra->have_lookahead = false; + } + else + cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); + + /* + * If this token isn't one that requires lookahead, just return it. If it + * does, determine the token length. (We could get that via strlen(), but + * since we have such a small set of possibilities, hardwiring seems + * feasible and more efficient --- at least for the fixed-length cases.) + */ + switch (cur_token) + { + case NOT: + cur_token_length = 3; + break; + case NULLS_P: + cur_token_length = 5; + break; + case WITH: + cur_token_length = 4; + break; + case UIDENT: + case USCONST: + cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp); + break; + default: + return cur_token; + } + + /* + * Identify end+1 of current token. core_yylex() has temporarily stored a + * '\0' here, and will undo that when we call it again. We need to redo + * it to fully revert the lookahead call for error reporting purposes. + */ + yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + + *llocp + cur_token_length; + Assert(*(yyextra->lookahead_end) == '\0'); + + /* + * Save and restore *llocp around the call. It might look like we could + * avoid this by just passing &lookahead_yylloc to core_yylex(), but that + * does not work because flex actually holds onto the last-passed pointer + * internally, and will use that for error reporting. We need any error + * reports to point to the current token, not the next one. + */ + cur_yylloc = *llocp; + + /* Get next token, saving outputs into lookahead variables */ + next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner); + yyextra->lookahead_token = next_token; + yyextra->lookahead_yylloc = *llocp; + + *llocp = cur_yylloc; + + /* Now revert the un-truncation of the current token */ + yyextra->lookahead_hold_char = *(yyextra->lookahead_end); + *(yyextra->lookahead_end) = '\0'; + + yyextra->have_lookahead = true; + + /* Replace cur_token if needed, based on lookahead */ + switch (cur_token) + { + case NOT: + /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ + switch (next_token) + { + case BETWEEN: + case IN_P: + case LIKE: + case ILIKE: + case SIMILAR: + cur_token = NOT_LA; + break; + } + break; + + case NULLS_P: + /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ + switch (next_token) + { + case FIRST_P: + case LAST_P: + cur_token = NULLS_LA; + break; + } + break; + + case WITH: + /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ + switch (next_token) + { + case TIME: + case ORDINALITY: + cur_token = WITH_LA; + break; + } + break; + + case UIDENT: + case USCONST: + /* Look ahead for UESCAPE */ + if (next_token == UESCAPE) + { + /* Yup, so get third token, which had better be SCONST */ + const char *escstr; + + /* Again save and restore *llocp */ + cur_yylloc = *llocp; + + /* Un-truncate current token so errors point to third token */ + *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; + + /* Get third token */ + next_token = core_yylex(&(yyextra->lookahead_yylval), + llocp, yyscanner); + + /* If we throw error here, it will point to third token */ + if (next_token != SCONST) + scanner_yyerror("UESCAPE must be followed by a simple string literal", + yyscanner); + + escstr = yyextra->lookahead_yylval.str; + if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) + scanner_yyerror("invalid Unicode escape character", + yyscanner); + + /* Now restore *llocp; errors will point to first token */ + *llocp = cur_yylloc; + + /* Apply Unicode conversion */ + lvalp->core_yystype.str = + str_udeescape(lvalp->core_yystype.str, + escstr[0], + *llocp, + yyscanner); + + /* + * We don't need to revert the un-truncation of UESCAPE. What + * we do want to do is clear have_lookahead, thereby consuming + * all three tokens. + */ + yyextra->have_lookahead = false; + } + else + { + /* No UESCAPE, so convert using default escape character */ + lvalp->core_yystype.str = + str_udeescape(lvalp->core_yystype.str, + '\\', + *llocp, + yyscanner); + } + + if (cur_token == UIDENT) + { + /* It's an identifier, so truncate as appropriate */ + truncate_identifier(lvalp->core_yystype.str, + strlen(lvalp->core_yystype.str), + true); + cur_token = IDENT; + } + else if (cur_token == USCONST) + { + cur_token = SCONST; + } + break; + } + + return cur_token; +} + +/* convert hex digit (caller should have verified that) to value */ +static unsigned int +hexval(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xA; + elog(ERROR, "invalid hexadecimal digit"); + return 0; /* not reached */ +} + +/* is Unicode code point acceptable? */ +static void +check_unicode_value(pg_wchar c) +{ + if (!is_valid_unicode_codepoint(c)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape value"))); +} + +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ +static bool +check_uescapechar(unsigned char escape) +{ + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || scanner_isspace(escape)) + return false; + else + return true; +} + +/* + * Process Unicode escapes in "str", producing a palloc'd plain string + * + * escape: the escape character to use + * position: start position of U&'' or U&"" string token + * yyscanner: context information needed for error reports + */ +static char * +str_udeescape(const char *str, char escape, + int position, core_yyscan_t yyscanner) +{ + const char *in; + char *new, + *out; + size_t new_len; + pg_wchar pair_first = 0; + ScannerCallbackState scbstate; + + /* + * Guesstimate that result will be no longer than input, but allow enough + * padding for Unicode conversion. + */ + new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1; + new = palloc(new_len); + + in = str; + out = new; + while (*in) + { + /* Enlarge string if needed */ + size_t out_dist = out - new; + + if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1)) + { + new_len *= 2; + new = repalloc(new, new_len); + out = new + out_dist; + } + + if (in[0] == escape) + { + /* + * Any errors reported while processing this escape sequence will + * have an error cursor pointing at the escape. + */ + setup_scanner_errposition_callback(&scbstate, yyscanner, + in - str + position + 3); /* 3 for U&" */ + if (in[1] == escape) + { + if (pair_first) + goto invalid_pair; + *out++ = escape; + in += 2; + } + else if (isxdigit((unsigned char) in[1]) && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4])) + { + pg_wchar unicode; + + unicode = (hexval(in[1]) << 12) + + (hexval(in[2]) << 8) + + (hexval(in[3]) << 4) + + hexval(in[4]); + check_unicode_value(unicode); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + pg_unicode_to_server(unicode, (unsigned char *) out); + out += strlen(out); + } + in += 5; + } + else if (in[1] == '+' && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4]) && + isxdigit((unsigned char) in[5]) && + isxdigit((unsigned char) in[6]) && + isxdigit((unsigned char) in[7])) + { + pg_wchar unicode; + + unicode = (hexval(in[2]) << 20) + + (hexval(in[3]) << 16) + + (hexval(in[4]) << 12) + + (hexval(in[5]) << 8) + + (hexval(in[6]) << 4) + + hexval(in[7]); + check_unicode_value(unicode); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + pg_unicode_to_server(unicode, (unsigned char *) out); + out += strlen(out); + } + in += 8; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape"), + errhint("Unicode escapes must be \\XXXX or \\+XXXXXX."))); + + cancel_scanner_errposition_callback(&scbstate); + } + else + { + if (pair_first) + goto invalid_pair; + + *out++ = *in++; + } + } + + /* unfinished surrogate pair? */ + if (pair_first) + goto invalid_pair; + + *out = '\0'; + return new; + + /* + * We might get here with the error callback active, or not. Call + * scanner_errposition to make sure an error cursor appears; if the + * callback is active, this is duplicative but harmless. + */ +invalid_pair: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode surrogate pair"), + scanner_errposition(in - str + position + 3, /* 3 for U&" */ + yyscanner))); + return NULL; /* keep compiler quiet */ +} |