diff options
Diffstat (limited to 'src/backend/parser/scan.l')
-rw-r--r-- | src/backend/parser/scan.l | 1429 |
1 files changed, 1429 insertions, 0 deletions
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l new file mode 100644 index 0000000..9f9d8a1 --- /dev/null +++ b/src/backend/parser/scan.l @@ -0,0 +1,1429 @@ +%top{ +/*------------------------------------------------------------------------- + * + * scan.l + * lexical scanner for PostgreSQL + * + * NOTE NOTE NOTE: + * + * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l + * and src/interfaces/ecpg/preproc/pgc.l! + * + * The rules are designed so that the scanner never has to backtrack, + * in the sense that there is always a rule that can match the input + * consumed so far (the rule action may internally throw back some input + * with yyless(), however). As explained in the flex manual, this makes + * for a useful speed increase --- several percent faster when measuring + * raw parsing (Flex + Bison). The extra complexity is mostly in the rules + * for handling float numbers and continued string literals. If you change + * the lexical rules, verify that you haven't broken the no-backtrack + * property by running flex with the "-b" option and checking that the + * resulting "lex.backup" file says that no backing up is needed. (As of + * Postgres 9.2, this check is made automatically by the Makefile.) + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/parser/scan.l + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <ctype.h> +#include <unistd.h> + +#include "common/string.h" +#include "parser/gramparse.h" +#include "parser/parser.h" /* only needed for GUC variables */ +#include "parser/scansup.h" +#include "mb/pg_wchar.h" +} + +%{ + +/* LCOV_EXCL_START */ + +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg) + +static void +fprintf_to_ereport(const char *fmt, const char *msg) +{ + ereport(ERROR, (errmsg_internal("%s", msg))); +} + +/* + * GUC variables. This is a DIRECT violation of the warning given at the + * head of gram.y, ie flex/bison code must not depend on any GUC variables; + * as such, changing their values can induce very unintuitive behavior. + * But we shall have to live with it until we can remove these variables. + */ +int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING; +bool escape_string_warning = true; +bool standard_conforming_strings = true; + +/* + * Constant data exported from this file. This array maps from the + * zero-based keyword numbers returned by ScanKeywordLookup to the + * Bison token numbers needed by gram.y. This is exported because + * callers need to pass it to scanner_init, if they are using the + * standard keyword list ScanKeywords. + */ +#define PG_KEYWORD(kwname, value, category, collabel) value, + +const uint16 ScanKeywordTokens[] = { +#include "parser/kwlist.h" +}; + +#undef PG_KEYWORD + +/* + * Set the type of YYSTYPE. + */ +#define YYSTYPE core_YYSTYPE + +/* + * Set the type of yyextra. All state variables used by the scanner should + * be in yyextra, *not* statically allocated. + */ +#define YY_EXTRA_TYPE core_yy_extra_type * + +/* + * Each call to yylex must set yylloc to the location of the found token + * (expressed as a byte offset from the start of the input text). + * When we parse a token that requires multiple lexer rules to process, + * this should be done in the first such rule, else yylloc will point + * into the middle of the token. + */ +#define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf) + +/* + * Advance yylloc by the given number of bytes. + */ +#define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) ) + +/* + * Sometimes, we do want yylloc to point into the middle of a token; this is + * useful for instance to throw an error about an escape sequence within a + * string literal. But if we find no error there, we want to revert yylloc + * to the token start, so that that's the location reported to the parser. + * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code. + * (Currently the implied "stack" is just one location, but someday we might + * need to nest these.) + */ +#define PUSH_YYLLOC() (yyextra->save_yylloc = *(yylloc)) +#define POP_YYLLOC() (*(yylloc) = yyextra->save_yylloc) + +#define startlit() ( yyextra->literallen = 0 ) +static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner); +static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner); +static char *litbufdup(core_yyscan_t yyscanner); +static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner); +static int process_integer_literal(const char *token, YYSTYPE *lval); +static void addunicode(pg_wchar c, yyscan_t yyscanner); + +#define yyerror(msg) scanner_yyerror(msg, yyscanner) + +#define lexer_errposition() scanner_errposition(*(yylloc), yyscanner) + +static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner); +static void check_escape_warning(core_yyscan_t yyscanner); + +/* + * Work around a bug in flex 2.5.35: it emits a couple of functions that + * it forgets to emit declarations for. Since we use -Wmissing-prototypes, + * this would cause warnings. Providing our own declarations should be + * harmless even when the bug gets fixed. + */ +extern int core_yyget_column(yyscan_t yyscanner); +extern void core_yyset_column(int column_no, yyscan_t yyscanner); + +%} + +%option reentrant +%option bison-bridge +%option bison-locations +%option 8bit +%option never-interactive +%option nodefault +%option noinput +%option nounput +%option noyywrap +%option noyyalloc +%option noyyrealloc +%option noyyfree +%option warn +%option prefix="core_yy" + +/* + * OK, here is a short description of lex/flex rules behavior. + * The longest pattern which matches an input string is always chosen. + * For equal-length patterns, the first occurring in the rules list is chosen. + * INITIAL is the starting state, to which all non-conditional rules apply. + * Exclusive states change parsing rules while the state is active. When in + * an exclusive state, only those rules defined for that state apply. + * + * We use exclusive states for quoted strings, extended comments, + * and to eliminate parsing troubles for numeric strings. + * Exclusive states: + * <xb> bit string literal + * <xc> extended C-style comments + * <xd> delimited identifiers (double-quoted identifiers) + * <xh> hexadecimal numeric string + * <xq> standard quoted strings + * <xqs> quote stop (detect continued strings) + * <xe> extended quoted strings (support backslash escape sequences) + * <xdolq> $foo$ quoted strings + * <xui> quoted identifier with Unicode escapes + * <xus> quoted string with Unicode escapes + * <xeu> Unicode surrogate pair in extended quoted string + * + * Remember to add an <<EOF>> case whenever you add a new exclusive state! + * The default one is probably not the right thing. + */ + +%x xb +%x xc +%x xd +%x xh +%x xq +%x xqs +%x xe +%x xdolq +%x xui +%x xus +%x xeu + +/* + * In order to make the world safe for Windows and Mac clients as well as + * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n + * sequence will be seen as two successive newlines, but that doesn't cause + * any problems. Comments that start with -- and extend to the next + * newline are treated as equivalent to a single whitespace character. + * + * NOTE a fine point: if there is no newline following --, we will absorb + * everything to the end of the input as a comment. This is correct. Older + * versions of Postgres failed to recognize -- as a comment if the input + * did not end with a newline. + * + * XXX perhaps \f (formfeed) should be treated as a newline as well? + * + * XXX if you change the set of whitespace characters, fix scanner_isspace() + * to agree. + */ + +space [ \t\n\r\f] +horiz_space [ \t\f] +newline [\n\r] +non_newline [^\n\r] + +comment ("--"{non_newline}*) + +whitespace ({space}+|{comment}) + +/* + * SQL requires at least one newline in the whitespace separating + * string literals that are to be concatenated. Silly, but who are we + * to argue? Note that {whitespace_with_newline} should not have * after + * it, whereas {whitespace} should generally have a * after it... + */ + +special_whitespace ({space}+|{comment}{newline}) +horiz_whitespace ({horiz_space}|{comment}) +whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) + +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + +/* + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. + */ +quotecontinuefail {whitespace}*"-"? + +/* Bit string + * It is tempting to scan the string for only those characters + * which are allowed. However, this leads to silently swallowed + * characters if illegal characters are included in the string. + * For example, if xbinside is [01] then B'ABCD' is interpreted + * as a zero-length string, and the ABCD' is lost! + * Better to pass the string forward and let the input routines + * validate the contents. + */ +xbstart [bB]{quote} +xbinside [^']* + +/* Hexadecimal number */ +xhstart [xX]{quote} +xhinside [^']* + +/* National character */ +xnstart [nN]{quote} + +/* Quoted string that allows backslash escapes */ +xestart [eE]{quote} +xeinside [^\\']+ +xeescape [\\][^0-7] +xeoctesc [\\][0-7]{1,3} +xehexesc [\\]x[0-9A-Fa-f]{1,2} +xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) +xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7}) + +/* Extended quote + * xqdouble implements embedded quote, '''' + */ +xqstart {quote} +xqdouble {quote}{quote} +xqinside [^']+ + +/* $foo$ style quotes ("dollar quoting") + * The quoted string starts with $foo$ where "foo" is an optional string + * in the form of an identifier, except that it may not contain "$", + * and extends to the first occurrence of an identical string. + * There is *no* processing of the quoted text. + * + * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim} + * fails to match its trailing "$". + */ +dolq_start [A-Za-z\200-\377_] +dolq_cont [A-Za-z\200-\377_0-9] +dolqdelim \$({dolq_start}{dolq_cont}*)?\$ +dolqfailed \${dolq_start}{dolq_cont}* +dolqinside [^$]+ + +/* Double quote + * Allows embedded spaces and other special characters into identifiers. + */ +dquote \" +xdstart {dquote} +xdstop {dquote} +xddouble {dquote}{dquote} +xdinside [^"]+ + +/* Quoted identifier with Unicode escapes */ +xuistart [uU]&{dquote} + +/* Quoted string with Unicode escapes */ +xusstart [uU]&{quote} + +/* error rule to avoid backup */ +xufailed [uU]& + + +/* C-style comments + * + * The "extended comment" syntax closely resembles allowable operator syntax. + * The tricky part here is to get lex to recognize a string starting with + * slash-star as a comment, when interpreting it as an operator would produce + * a longer match --- remember lex will prefer a longer match! Also, if we + * have something like plus-slash-star, lex will think this is a 3-character + * operator whereas we want to see it as a + operator and a comment start. + * The solution is two-fold: + * 1. append {op_chars}* to xcstart so that it matches as much text as + * {operator} would. Then the tie-breaker (first matching rule of same + * length) ensures xcstart wins. We put back the extra stuff with yyless() + * in case it contains a star-slash that should terminate the comment. + * 2. In the operator rule, check for slash-star within the operator, and + * if found throw it back with yyless(). This handles the plus-slash-star + * problem. + * Dash-dash comments have similar interactions with the operator rule. + */ +xcstart \/\*{op_chars}* +xcstop \*+\/ +xcinside [^*/]+ + +digit [0-9] +ident_start [A-Za-z\200-\377_] +ident_cont [A-Za-z\200-\377_0-9\$] + +identifier {ident_start}{ident_cont}* + +/* Assorted special-case operators and operator-like tokens */ +typecast "::" +dot_dot \.\. +colon_equals ":=" + +/* + * These operator-like tokens (unlike the above ones) also match the {operator} + * rule, which means that they might be overridden by a longer match if they + * are followed by a comment start or a + or - character. Accordingly, if you + * add to this list, you must also add corresponding code to the {operator} + * block to return the correct token in such cases. (This is not needed in + * psqlscan.l since the token value is ignored there.) + */ +equals_greater "=>" +less_equals "<=" +greater_equals ">=" +less_greater "<>" +not_equals "!=" + +/* + * "self" is the set of chars that should be returned as single-character + * tokens. "op_chars" is the set of chars that can make up "Op" tokens, + * which can be one or more characters long (but if a single-char token + * appears in the "self" set, it is not to be returned as an Op). Note + * that the sets overlap, but each has some chars that are not in the other. + * + * If you change either set, adjust the character lists appearing in the + * rule for "operator"! + */ +self [,()\[\].;\:\+\-\*\/\%\^\<\>\=] +op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=] +operator {op_chars}+ + +/* we no longer allow unary minus in numbers. + * instead we pass it separately to parser. there it gets + * coerced via doNegate() -- Leon aug 20 1999 + * + * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10. + * + * {realfail1} and {realfail2} are added to prevent the need for scanner + * backup when the {real} rule fails to match completely. + */ + +integer {digit}+ +decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*)) +decimalfail {digit}+\.\. +real ({integer}|{decimal})[Ee][-+]?{digit}+ +realfail1 ({integer}|{decimal})[Ee] +realfail2 ({integer}|{decimal})[Ee][-+] + +param \${integer} + +other . + +/* + * Dollar quoted strings are totally opaque, and no escaping is done on them. + * Other quoted strings must allow some special characters such as single-quote + * and newline. + * Embedded single-quotes are implemented both in the SQL standard + * style of two adjacent single quotes "''" and in the Postgres/Java style + * of escaped-quote "\'". + * Other embedded escaped characters are matched explicitly and the leading + * backslash is dropped from the string. + * Note that xcstart must appear before operator, as explained above! + * Also whitespace (comment) must appear before operator. + */ + +%% + +{whitespace} { + /* ignore */ + } + +{xcstart} { + /* Set location in case of syntax error in comment */ + SET_YYLLOC(); + yyextra->xcdepth = 0; + BEGIN(xc); + /* Put back any characters past slash-star; see above */ + yyless(2); + } + +<xc>{ +{xcstart} { + (yyextra->xcdepth)++; + /* Put back any characters past slash-star; see above */ + yyless(2); + } + +{xcstop} { + if (yyextra->xcdepth <= 0) + BEGIN(INITIAL); + else + (yyextra->xcdepth)--; + } + +{xcinside} { + /* ignore */ + } + +{op_chars} { + /* ignore */ + } + +\*+ { + /* ignore */ + } + +<<EOF>> { + yyerror("unterminated /* comment"); + } +} /* <xc> */ + +{xbstart} { + /* Binary bit type. + * At some point we should simply pass the string + * forward to the parser and label it there. + * In the meantime, place a leading "b" on the string + * to mark it for the input routine as a binary string. + */ + SET_YYLLOC(); + BEGIN(xb); + startlit(); + addlitchar('b', yyscanner); + } +<xh>{xhinside} | +<xb>{xbinside} { + addlit(yytext, yyleng, yyscanner); + } +<xb><<EOF>> { yyerror("unterminated bit string literal"); } + +{xhstart} { + /* Hexadecimal bit type. + * At some point we should simply pass the string + * forward to the parser and label it there. + * In the meantime, place a leading "x" on the string + * to mark it for the input routine as a hex string. + */ + SET_YYLLOC(); + BEGIN(xh); + startlit(); + addlitchar('x', yyscanner); + } +<xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); } + +{xnstart} { + /* National character. + * We will pass this along as a normal character string, + * but preceded with an internally-generated "NCHAR". + */ + int kwnum; + + SET_YYLLOC(); + yyless(1); /* eat only 'n' this time */ + + kwnum = ScanKeywordLookup("nchar", + yyextra->keywordlist); + if (kwnum >= 0) + { + yylval->keyword = GetScanKeyword(kwnum, + yyextra->keywordlist); + return yyextra->keyword_tokens[kwnum]; + } + else + { + /* If NCHAR isn't a keyword, just return "n" */ + yylval->str = pstrdup("n"); + return IDENT; + } + } + +{xqstart} { + yyextra->warn_on_first_escape = true; + yyextra->saw_non_ascii = false; + SET_YYLLOC(); + if (yyextra->standard_conforming_strings) + BEGIN(xq); + else + BEGIN(xe); + startlit(); + } +{xestart} { + yyextra->warn_on_first_escape = false; + yyextra->saw_non_ascii = false; + SET_YYLLOC(); + BEGIN(xe); + startlit(); + } +{xusstart} { + SET_YYLLOC(); + if (!yyextra->standard_conforming_strings) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsafe use of string constant with Unicode escapes"), + errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."), + lexer_errposition())); + BEGIN(xus); + startlit(); + } + +<xb,xh,xq,xe,xus>{quote} { + /* + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. + */ + yyextra->state_before_str_stop = YYSTATE; + BEGIN(xqs); + } +<xqs>{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. Nothing is + * added to the literal's contents. + */ + BEGIN(yyextra->state_before_str_stop); + } +<xqs>{quotecontinuefail} | +<xqs>{other} | +<xqs><<EOF>> { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. + */ + yyless(0); + BEGIN(INITIAL); + + switch (yyextra->state_before_str_stop) + { + case xb: + yylval->str = litbufdup(yyscanner); + return BCONST; + case xh: + yylval->str = litbufdup(yyscanner); + return XCONST; + case xq: + case xe: + /* + * Check that the data remains valid, if it might + * have been made invalid by unescaping any chars. + */ + if (yyextra->saw_non_ascii) + pg_verifymbstr(yyextra->literalbuf, + yyextra->literallen, + false); + yylval->str = litbufdup(yyscanner); + return SCONST; + case xus: + yylval->str = litbufdup(yyscanner); + return USCONST; + default: + yyerror("unhandled previous state in xqs"); + } + } + +<xq,xe,xus>{xqdouble} { + addlitchar('\'', yyscanner); + } +<xq,xus>{xqinside} { + addlit(yytext, yyleng, yyscanner); + } +<xe>{xeinside} { + addlit(yytext, yyleng, yyscanner); + } +<xe>{xeunicode} { + pg_wchar c = strtoul(yytext + 2, NULL, 16); + + /* + * For consistency with other productions, issue any + * escape warning with cursor pointing to start of string. + * We might want to change that, someday. + */ + check_escape_warning(yyscanner); + + /* Remember start of overall string token ... */ + PUSH_YYLLOC(); + /* ... and set the error cursor to point at this esc seq */ + SET_YYLLOC(); + + if (is_utf16_surrogate_first(c)) + { + yyextra->utf16_first_part = c; + BEGIN(xeu); + } + else if (is_utf16_surrogate_second(c)) + yyerror("invalid Unicode surrogate pair"); + else + addunicode(c, yyscanner); + + /* Restore yylloc to be start of string token */ + POP_YYLLOC(); + } +<xeu>{xeunicode} { + pg_wchar c = strtoul(yytext + 2, NULL, 16); + + /* Remember start of overall string token ... */ + PUSH_YYLLOC(); + /* ... and set the error cursor to point at this esc seq */ + SET_YYLLOC(); + + if (!is_utf16_surrogate_second(c)) + yyerror("invalid Unicode surrogate pair"); + + c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c); + + addunicode(c, yyscanner); + + /* Restore yylloc to be start of string token */ + POP_YYLLOC(); + + BEGIN(xe); + } +<xeu>. | +<xeu>\n | +<xeu><<EOF>> { + /* Set the error cursor to point at missing esc seq */ + SET_YYLLOC(); + yyerror("invalid Unicode surrogate pair"); + } +<xe,xeu>{xeunicodefail} { + /* Set the error cursor to point at malformed esc seq */ + SET_YYLLOC(); + ereport(ERROR, + (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), + errmsg("invalid Unicode escape"), + errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."), + lexer_errposition())); + } +<xe>{xeescape} { + if (yytext[1] == '\'') + { + if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF || + (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING && + PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding()))) + ereport(ERROR, + (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), + errmsg("unsafe use of \\' in a string literal"), + errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."), + lexer_errposition())); + } + check_string_escape_warning(yytext[1], yyscanner); + addlitchar(unescape_single_char(yytext[1], yyscanner), + yyscanner); + } +<xe>{xeoctesc} { + unsigned char c = strtoul(yytext + 1, NULL, 8); + + check_escape_warning(yyscanner); + addlitchar(c, yyscanner); + if (c == '\0' || IS_HIGHBIT_SET(c)) + yyextra->saw_non_ascii = true; + } +<xe>{xehexesc} { + unsigned char c = strtoul(yytext + 2, NULL, 16); + + check_escape_warning(yyscanner); + addlitchar(c, yyscanner); + if (c == '\0' || IS_HIGHBIT_SET(c)) + yyextra->saw_non_ascii = true; + } +<xe>. { + /* This is only needed for \ just before EOF */ + addlitchar(yytext[0], yyscanner); + } +<xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); } + +{dolqdelim} { + SET_YYLLOC(); + yyextra->dolqstart = pstrdup(yytext); + BEGIN(xdolq); + startlit(); + } +{dolqfailed} { + SET_YYLLOC(); + /* throw back all but the initial "$" */ + yyless(1); + /* and treat it as {other} */ + return yytext[0]; + } +<xdolq>{dolqdelim} { + if (strcmp(yytext, yyextra->dolqstart) == 0) + { + pfree(yyextra->dolqstart); + yyextra->dolqstart = NULL; + BEGIN(INITIAL); + yylval->str = litbufdup(yyscanner); + return SCONST; + } + else + { + /* + * When we fail to match $...$ to dolqstart, transfer + * the $... part to the output, but put back the final + * $ for rescanning. Consider $delim$...$junk$delim$ + */ + addlit(yytext, yyleng - 1, yyscanner); + yyless(yyleng - 1); + } + } +<xdolq>{dolqinside} { + addlit(yytext, yyleng, yyscanner); + } +<xdolq>{dolqfailed} { + addlit(yytext, yyleng, yyscanner); + } +<xdolq>. { + /* This is only needed for $ inside the quoted text */ + addlitchar(yytext[0], yyscanner); + } +<xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); } + +{xdstart} { + SET_YYLLOC(); + BEGIN(xd); + startlit(); + } +{xuistart} { + SET_YYLLOC(); + BEGIN(xui); + startlit(); + } +<xd>{xdstop} { + char *ident; + + BEGIN(INITIAL); + if (yyextra->literallen == 0) + yyerror("zero-length delimited identifier"); + ident = litbufdup(yyscanner); + if (yyextra->literallen >= NAMEDATALEN) + truncate_identifier(ident, yyextra->literallen, true); + yylval->str = ident; + return IDENT; + } +<xui>{dquote} { + BEGIN(INITIAL); + if (yyextra->literallen == 0) + yyerror("zero-length delimited identifier"); + /* can't truncate till after we de-escape the ident */ + yylval->str = litbufdup(yyscanner); + return UIDENT; + } +<xd,xui>{xddouble} { + addlitchar('"', yyscanner); + } +<xd,xui>{xdinside} { + addlit(yytext, yyleng, yyscanner); + } +<xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); } + +{xufailed} { + char *ident; + + SET_YYLLOC(); + /* throw back all but the initial u/U */ + yyless(1); + /* and treat it as {identifier} */ + ident = downcase_truncate_identifier(yytext, yyleng, true); + yylval->str = ident; + return IDENT; + } + +{typecast} { + SET_YYLLOC(); + return TYPECAST; + } + +{dot_dot} { + SET_YYLLOC(); + return DOT_DOT; + } + +{colon_equals} { + SET_YYLLOC(); + return COLON_EQUALS; + } + +{equals_greater} { + SET_YYLLOC(); + return EQUALS_GREATER; + } + +{less_equals} { + SET_YYLLOC(); + return LESS_EQUALS; + } + +{greater_equals} { + SET_YYLLOC(); + return GREATER_EQUALS; + } + +{less_greater} { + /* We accept both "<>" and "!=" as meaning NOT_EQUALS */ + SET_YYLLOC(); + return NOT_EQUALS; + } + +{not_equals} { + /* We accept both "<>" and "!=" as meaning NOT_EQUALS */ + SET_YYLLOC(); + return NOT_EQUALS; + } + +{self} { + SET_YYLLOC(); + return yytext[0]; + } + +{operator} { + /* + * Check for embedded slash-star or dash-dash; those + * are comment starts, so operator must stop there. + * Note that slash-star or dash-dash at the first + * character will match a prior rule, not this one. + */ + int nchars = yyleng; + char *slashstar = strstr(yytext, "/*"); + char *dashdash = strstr(yytext, "--"); + + if (slashstar && dashdash) + { + /* if both appear, take the first one */ + if (slashstar > dashdash) + slashstar = dashdash; + } + else if (!slashstar) + slashstar = dashdash; + if (slashstar) + nchars = slashstar - yytext; + + /* + * For SQL compatibility, '+' and '-' cannot be the + * last char of a multi-char operator unless the operator + * contains chars that are not in SQL operators. + * The idea is to lex '=-' as two operators, but not + * to forbid operator names like '?-' that could not be + * sequences of SQL operators. + */ + if (nchars > 1 && + (yytext[nchars - 1] == '+' || + yytext[nchars - 1] == '-')) + { + int ic; + + for (ic = nchars - 2; ic >= 0; ic--) + { + char c = yytext[ic]; + if (c == '~' || c == '!' || c == '@' || + c == '#' || c == '^' || c == '&' || + c == '|' || c == '`' || c == '?' || + c == '%') + break; + } + if (ic < 0) + { + /* + * didn't find a qualifying character, so remove + * all trailing [+-] + */ + do { + nchars--; + } while (nchars > 1 && + (yytext[nchars - 1] == '+' || + yytext[nchars - 1] == '-')); + } + } + + SET_YYLLOC(); + + if (nchars < yyleng) + { + /* Strip the unwanted chars from the token */ + yyless(nchars); + /* + * If what we have left is only one char, and it's + * one of the characters matching "self", then + * return it as a character token the same way + * that the "self" rule would have. + */ + if (nchars == 1 && + strchr(",()[].;:+-*/%^<>=", yytext[0])) + return yytext[0]; + /* + * Likewise, if what we have left is two chars, and + * those match the tokens ">=", "<=", "=>", "<>" or + * "!=", then we must return the appropriate token + * rather than the generic Op. + */ + if (nchars == 2) + { + if (yytext[0] == '=' && yytext[1] == '>') + return EQUALS_GREATER; + if (yytext[0] == '>' && yytext[1] == '=') + return GREATER_EQUALS; + if (yytext[0] == '<' && yytext[1] == '=') + return LESS_EQUALS; + if (yytext[0] == '<' && yytext[1] == '>') + return NOT_EQUALS; + if (yytext[0] == '!' && yytext[1] == '=') + return NOT_EQUALS; + } + } + + /* + * Complain if operator is too long. Unlike the case + * for identifiers, we make this an error not a notice- + * and-truncate, because the odds are we are looking at + * a syntactic mistake anyway. + */ + if (nchars >= NAMEDATALEN) + yyerror("operator too long"); + + yylval->str = pstrdup(yytext); + return Op; + } + +{param} { + SET_YYLLOC(); + yylval->ival = atol(yytext + 1); + return PARAM; + } + +{integer} { + SET_YYLLOC(); + return process_integer_literal(yytext, yylval); + } +{decimal} { + SET_YYLLOC(); + yylval->str = pstrdup(yytext); + return FCONST; + } +{decimalfail} { + /* throw back the .., and treat as integer */ + yyless(yyleng - 2); + SET_YYLLOC(); + return process_integer_literal(yytext, yylval); + } +{real} { + SET_YYLLOC(); + yylval->str = pstrdup(yytext); + return FCONST; + } +{realfail1} { + /* + * throw back the [Ee], and figure out whether what + * remains is an {integer} or {decimal}. + */ + yyless(yyleng - 1); + SET_YYLLOC(); + return process_integer_literal(yytext, yylval); + } +{realfail2} { + /* throw back the [Ee][+-], and proceed as above */ + yyless(yyleng - 2); + SET_YYLLOC(); + return process_integer_literal(yytext, yylval); + } + + +{identifier} { + int kwnum; + char *ident; + + SET_YYLLOC(); + + /* Is it a keyword? */ + kwnum = ScanKeywordLookup(yytext, + yyextra->keywordlist); + if (kwnum >= 0) + { + yylval->keyword = GetScanKeyword(kwnum, + yyextra->keywordlist); + return yyextra->keyword_tokens[kwnum]; + } + + /* + * No. Convert the identifier to lower case, and truncate + * if necessary. + */ + ident = downcase_truncate_identifier(yytext, yyleng, true); + yylval->str = ident; + return IDENT; + } + +{other} { + SET_YYLLOC(); + return yytext[0]; + } + +<<EOF>> { + SET_YYLLOC(); + yyterminate(); + } + +%% + +/* LCOV_EXCL_STOP */ + +/* + * Arrange access to yyextra for subroutines of the main yylex() function. + * We expect each subroutine to have a yyscanner parameter. Rather than + * use the yyget_xxx functions, which might or might not get inlined by the + * compiler, we cheat just a bit and cast yyscanner to the right type. + */ +#undef yyextra +#define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r) + +/* Likewise for a couple of other things we need. */ +#undef yylloc +#define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r) +#undef yyleng +#define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r) + + +/* + * scanner_errposition + * Report a lexer or grammar error cursor position, if possible. + * + * This is expected to be used within an ereport() call, or via an error + * callback such as setup_scanner_errposition_callback(). The return value + * is a dummy (always 0, in fact). + * + * Note that this can only be used for messages emitted during raw parsing + * (essentially, scan.l, parser.c, and gram.y), since it requires the + * yyscanner struct to still be available. + */ +int +scanner_errposition(int location, core_yyscan_t yyscanner) +{ + int pos; + + if (location < 0) + return 0; /* no-op if location is unknown */ + + /* Convert byte offset to character number */ + pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1; + /* And pass it to the ereport mechanism */ + return errposition(pos); +} + +/* + * Error context callback for inserting scanner error location. + * + * Note that this will be called for *any* error occurring while the + * callback is installed. We avoid inserting an irrelevant error location + * if the error is a query cancel --- are there any other important cases? + */ +static void +scb_error_callback(void *arg) +{ + ScannerCallbackState *scbstate = (ScannerCallbackState *) arg; + + if (geterrcode() != ERRCODE_QUERY_CANCELED) + (void) scanner_errposition(scbstate->location, scbstate->yyscanner); +} + +/* + * setup_scanner_errposition_callback + * Arrange for non-scanner errors to report an error position + * + * Sometimes the scanner calls functions that aren't part of the scanner + * subsystem and can't reasonably be passed the yyscanner pointer; yet + * we would like any errors thrown in those functions to be tagged with an + * error location. Use this function to set up an error context stack + * entry that will accomplish that. Usage pattern: + * + * declare a local variable "ScannerCallbackState scbstate" + * ... + * setup_scanner_errposition_callback(&scbstate, yyscanner, location); + * call function that might throw error; + * cancel_scanner_errposition_callback(&scbstate); + */ +void +setup_scanner_errposition_callback(ScannerCallbackState *scbstate, + core_yyscan_t yyscanner, + int location) +{ + /* Setup error traceback support for ereport() */ + scbstate->yyscanner = yyscanner; + scbstate->location = location; + scbstate->errcallback.callback = scb_error_callback; + scbstate->errcallback.arg = (void *) scbstate; + scbstate->errcallback.previous = error_context_stack; + error_context_stack = &scbstate->errcallback; +} + +/* + * Cancel a previously-set-up errposition callback. + */ +void +cancel_scanner_errposition_callback(ScannerCallbackState *scbstate) +{ + /* Pop the error context stack */ + error_context_stack = scbstate->errcallback.previous; +} + +/* + * scanner_yyerror + * Report a lexer or grammar error. + * + * The message's cursor position is whatever YYLLOC was last set to, + * ie, the start of the current token if called within yylex(), or the + * most recently lexed token if called from the grammar. + * This is OK for syntax error messages from the Bison parser, because Bison + * parsers report error as soon as the first unparsable token is reached. + * Beware of using yyerror for other purposes, as the cursor position might + * be misleading! + */ +void +scanner_yyerror(const char *message, core_yyscan_t yyscanner) +{ + const char *loc = yyextra->scanbuf + *yylloc; + + if (*loc == YY_END_OF_BUFFER_CHAR) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + /* translator: %s is typically the translation of "syntax error" */ + errmsg("%s at end of input", _(message)), + lexer_errposition())); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + /* translator: first %s is typically the translation of "syntax error" */ + errmsg("%s at or near \"%s\"", _(message), loc), + lexer_errposition())); + } +} + + +/* + * Called before any actual parsing is done + */ +core_yyscan_t +scanner_init(const char *str, + core_yy_extra_type *yyext, + const ScanKeywordList *keywordlist, + const uint16 *keyword_tokens) +{ + Size slen = strlen(str); + yyscan_t scanner; + + if (yylex_init(&scanner) != 0) + elog(ERROR, "yylex_init() failed: %m"); + + core_yyset_extra(yyext, scanner); + + yyext->keywordlist = keywordlist; + yyext->keyword_tokens = keyword_tokens; + + yyext->backslash_quote = backslash_quote; + yyext->escape_string_warning = escape_string_warning; + yyext->standard_conforming_strings = standard_conforming_strings; + + /* + * Make a scan buffer with special termination needed by flex. + */ + yyext->scanbuf = (char *) palloc(slen + 2); + yyext->scanbuflen = slen; + memcpy(yyext->scanbuf, str, slen); + yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR; + yy_scan_buffer(yyext->scanbuf, slen + 2, scanner); + + /* initialize literal buffer to a reasonable but expansible size */ + yyext->literalalloc = 1024; + yyext->literalbuf = (char *) palloc(yyext->literalalloc); + yyext->literallen = 0; + + return scanner; +} + + +/* + * Called after parsing is done to clean up after scanner_init() + */ +void +scanner_finish(core_yyscan_t yyscanner) +{ + /* + * We don't bother to call yylex_destroy(), because all it would do is + * pfree a small amount of control storage. It's cheaper to leak the + * storage until the parsing context is destroyed. The amount of space + * involved is usually negligible compared to the output parse tree + * anyway. + * + * We do bother to pfree the scanbuf and literal buffer, but only if they + * represent a nontrivial amount of space. The 8K cutoff is arbitrary. + */ + if (yyextra->scanbuflen >= 8192) + pfree(yyextra->scanbuf); + if (yyextra->literalalloc >= 8192) + pfree(yyextra->literalbuf); +} + + +static void +addlit(char *ytext, int yleng, core_yyscan_t yyscanner) +{ + /* enlarge buffer if needed */ + if ((yyextra->literallen + yleng) >= yyextra->literalalloc) + { + do + { + yyextra->literalalloc *= 2; + } while ((yyextra->literallen + yleng) >= yyextra->literalalloc); + yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf, + yyextra->literalalloc); + } + /* append new data */ + memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng); + yyextra->literallen += yleng; +} + + +static void +addlitchar(unsigned char ychar, core_yyscan_t yyscanner) +{ + /* enlarge buffer if needed */ + if ((yyextra->literallen + 1) >= yyextra->literalalloc) + { + yyextra->literalalloc *= 2; + yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf, + yyextra->literalalloc); + } + /* append new data */ + yyextra->literalbuf[yyextra->literallen] = ychar; + yyextra->literallen += 1; +} + + +/* + * Create a palloc'd copy of literalbuf, adding a trailing null. + */ +static char * +litbufdup(core_yyscan_t yyscanner) +{ + int llen = yyextra->literallen; + char *new; + + new = palloc(llen + 1); + memcpy(new, yyextra->literalbuf, llen); + new[llen] = '\0'; + return new; +} + +/* + * Process {integer}. Note this will also do the right thing with {decimal}, + * ie digits and a decimal point. + */ +static int +process_integer_literal(const char *token, YYSTYPE *lval) +{ + int val; + char *endptr; + + errno = 0; + val = strtoint(token, &endptr, 10); + if (*endptr != '\0' || errno == ERANGE) + { + /* integer too large (or contains decimal pt), treat it as a float */ + lval->str = pstrdup(token); + return FCONST; + } + lval->ival = val; + return ICONST; +} + +static void +addunicode(pg_wchar c, core_yyscan_t yyscanner) +{ + ScannerCallbackState scbstate; + char buf[MAX_UNICODE_EQUIVALENT_STRING + 1]; + + if (!is_valid_unicode_codepoint(c)) + yyerror("invalid Unicode escape value"); + + /* + * We expect that pg_unicode_to_server() will complain about any + * unconvertible code point, so we don't have to set saw_non_ascii. + */ + setup_scanner_errposition_callback(&scbstate, yyscanner, *(yylloc)); + pg_unicode_to_server(c, (unsigned char *) buf); + cancel_scanner_errposition_callback(&scbstate); + addlit(buf, strlen(buf), yyscanner); +} + +static unsigned char +unescape_single_char(unsigned char c, core_yyscan_t yyscanner) +{ + switch (c) + { + case 'b': + return '\b'; + case 'f': + return '\f'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + default: + /* check for backslash followed by non-7-bit-ASCII */ + if (c == '\0' || IS_HIGHBIT_SET(c)) + yyextra->saw_non_ascii = true; + + return c; + } +} + +static void +check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner) +{ + if (ychar == '\'') + { + if (yyextra->warn_on_first_escape && yyextra->escape_string_warning) + ereport(WARNING, + (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), + errmsg("nonstandard use of \\' in a string literal"), + errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."), + lexer_errposition())); + yyextra->warn_on_first_escape = false; /* warn only once per string */ + } + else if (ychar == '\\') + { + if (yyextra->warn_on_first_escape && yyextra->escape_string_warning) + ereport(WARNING, + (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), + errmsg("nonstandard use of \\\\ in a string literal"), + errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."), + lexer_errposition())); + yyextra->warn_on_first_escape = false; /* warn only once per string */ + } + else + check_escape_warning(yyscanner); +} + +static void +check_escape_warning(core_yyscan_t yyscanner) +{ + if (yyextra->warn_on_first_escape && yyextra->escape_string_warning) + ereport(WARNING, + (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), + errmsg("nonstandard use of escape in a string literal"), + errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."), + lexer_errposition())); + yyextra->warn_on_first_escape = false; /* warn only once per string */ +} + +/* + * Interface functions to make flex use palloc() instead of malloc(). + * It'd be better to make these static, but flex insists otherwise. + */ + +void * +core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner) +{ + return palloc(bytes); +} + +void * +core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner) +{ + if (ptr) + return repalloc(ptr, bytes); + else + return palloc(bytes); +} + +void +core_yyfree(void *ptr, core_yyscan_t yyscanner) +{ + if (ptr) + pfree(ptr); +} |