summaryrefslogtreecommitdiffstats
path: root/src/backend/parser/scan.l
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/parser/scan.l')
-rw-r--r--src/backend/parser/scan.l1429
1 files changed, 1429 insertions, 0 deletions
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
new file mode 100644
index 0000000..9f9d8a1
--- /dev/null
+++ b/src/backend/parser/scan.l
@@ -0,0 +1,1429 @@
+%top{
+/*-------------------------------------------------------------------------
+ *
+ * scan.l
+ * lexical scanner for PostgreSQL
+ *
+ * NOTE NOTE NOTE:
+ *
+ * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l
+ * and src/interfaces/ecpg/preproc/pgc.l!
+ *
+ * The rules are designed so that the scanner never has to backtrack,
+ * in the sense that there is always a rule that can match the input
+ * consumed so far (the rule action may internally throw back some input
+ * with yyless(), however). As explained in the flex manual, this makes
+ * for a useful speed increase --- several percent faster when measuring
+ * raw parsing (Flex + Bison). The extra complexity is mostly in the rules
+ * for handling float numbers and continued string literals. If you change
+ * the lexical rules, verify that you haven't broken the no-backtrack
+ * property by running flex with the "-b" option and checking that the
+ * resulting "lex.backup" file says that no backing up is needed. (As of
+ * Postgres 9.2, this check is made automatically by the Makefile.)
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/parser/scan.l
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+#include <unistd.h>
+
+#include "common/string.h"
+#include "parser/gramparse.h"
+#include "parser/parser.h" /* only needed for GUC variables */
+#include "parser/scansup.h"
+#include "mb/pg_wchar.h"
+}
+
+%{
+
+/* LCOV_EXCL_START */
+
+/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
+#undef fprintf
+#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
+
+/*
+ * Replacement target for the fprintf macro defined just above: instead of
+ * printing, raise the given message as an ERROR.  The format string is
+ * accepted only so that the call shape matches; it is never used.
+ */
+static void
+fprintf_to_ereport(const char *fmt, const char *msg)
+{
+	(void) fmt;					/* deliberately ignored */
+
+	ereport(ERROR,
+			(errmsg_internal("%s", msg)));
+}
+
+/*
+ * GUC variables. This is a DIRECT violation of the warning given at the
+ * head of gram.y, ie flex/bison code must not depend on any GUC variables;
+ * as such, changing their values can induce very unintuitive behavior.
+ * But we shall have to live with it until we can remove these variables.
+ */
+int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
+bool escape_string_warning = true;
+bool standard_conforming_strings = true;
+
+/*
+ * Constant data exported from this file. This array maps from the
+ * zero-based keyword numbers returned by ScanKeywordLookup to the
+ * Bison token numbers needed by gram.y. This is exported because
+ * callers need to pass it to scanner_init, if they are using the
+ * standard keyword list ScanKeywords.
+ */
+#define PG_KEYWORD(kwname, value, category, collabel) value,
+
+const uint16 ScanKeywordTokens[] = {
+#include "parser/kwlist.h"
+};
+
+#undef PG_KEYWORD
+
+/*
+ * Set the type of YYSTYPE.
+ */
+#define YYSTYPE core_YYSTYPE
+
+/*
+ * Set the type of yyextra. All state variables used by the scanner should
+ * be in yyextra, *not* statically allocated.
+ */
+#define YY_EXTRA_TYPE core_yy_extra_type *
+
+/*
+ * Each call to yylex must set yylloc to the location of the found token
+ * (expressed as a byte offset from the start of the input text).
+ * When we parse a token that requires multiple lexer rules to process,
+ * this should be done in the first such rule, else yylloc will point
+ * into the middle of the token.
+ */
+#define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
+
+/*
+ * Advance yylloc by the given number of bytes.
+ */
+#define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
+
+/*
+ * Sometimes, we do want yylloc to point into the middle of a token; this is
+ * useful for instance to throw an error about an escape sequence within a
+ * string literal. But if we find no error there, we want to revert yylloc
+ * to the token start, so that that's the location reported to the parser.
+ * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
+ * (Currently the implied "stack" is just one location, but someday we might
+ * need to nest these.)
+ */
+#define PUSH_YYLLOC() (yyextra->save_yylloc = *(yylloc))
+#define POP_YYLLOC() (*(yylloc) = yyextra->save_yylloc)
+
+#define startlit() ( yyextra->literallen = 0 )
+static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
+static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
+static char *litbufdup(core_yyscan_t yyscanner);
+static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
+static int process_integer_literal(const char *token, YYSTYPE *lval);
+static void addunicode(pg_wchar c, yyscan_t yyscanner);
+
+#define yyerror(msg) scanner_yyerror(msg, yyscanner)
+
+#define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
+
+static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
+static void check_escape_warning(core_yyscan_t yyscanner);
+
+/*
+ * Work around a bug in flex 2.5.35: it emits a couple of functions that
+ * it forgets to emit declarations for. Since we use -Wmissing-prototypes,
+ * this would cause warnings. Providing our own declarations should be
+ * harmless even when the bug gets fixed.
+ */
+extern int core_yyget_column(yyscan_t yyscanner);
+extern void core_yyset_column(int column_no, yyscan_t yyscanner);
+
+%}
+
+%option reentrant
+%option bison-bridge
+%option bison-locations
+%option 8bit
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyyalloc
+%option noyyrealloc
+%option noyyfree
+%option warn
+%option prefix="core_yy"
+
+/*
+ * OK, here is a short description of lex/flex rules behavior.
+ * The longest pattern which matches an input string is always chosen.
+ * For equal-length patterns, the first occurring in the rules list is chosen.
+ * INITIAL is the starting state, to which all non-conditional rules apply.
+ * Exclusive states change parsing rules while the state is active. When in
+ * an exclusive state, only those rules defined for that state apply.
+ *
+ * We use exclusive states for quoted strings, extended comments,
+ * and to eliminate parsing troubles for numeric strings.
+ * Exclusive states:
+ * <xb> bit string literal
+ * <xc> extended C-style comments
+ * <xd> delimited identifiers (double-quoted identifiers)
+ * <xh> hexadecimal numeric string
+ * <xq> standard quoted strings
+ * <xqs> quote stop (detect continued strings)
+ * <xe> extended quoted strings (support backslash escape sequences)
+ * <xdolq> $foo$ quoted strings
+ * <xui> quoted identifier with Unicode escapes
+ * <xus> quoted string with Unicode escapes
+ * <xeu> Unicode surrogate pair in extended quoted string
+ *
+ * Remember to add an <<EOF>> case whenever you add a new exclusive state!
+ * The default one is probably not the right thing.
+ */
+
+%x xb
+%x xc
+%x xd
+%x xh
+%x xq
+%x xqs
+%x xe
+%x xdolq
+%x xui
+%x xus
+%x xeu
+
+/*
+ * In order to make the world safe for Windows and Mac clients as well as
+ * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
+ * sequence will be seen as two successive newlines, but that doesn't cause
+ * any problems. Comments that start with -- and extend to the next
+ * newline are treated as equivalent to a single whitespace character.
+ *
+ * NOTE a fine point: if there is no newline following --, we will absorb
+ * everything to the end of the input as a comment. This is correct. Older
+ * versions of Postgres failed to recognize -- as a comment if the input
+ * did not end with a newline.
+ *
+ * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ *
+ * XXX if you change the set of whitespace characters, fix scanner_isspace()
+ * to agree.
+ */
+
+space [ \t\n\r\f]
+horiz_space [ \t\f]
+newline [\n\r]
+non_newline [^\n\r]
+
+comment ("--"{non_newline}*)
+
+whitespace ({space}+|{comment})
+
+/*
+ * SQL requires at least one newline in the whitespace separating
+ * string literals that are to be concatenated. Silly, but who are we
+ * to argue? Note that {whitespace_with_newline} should not have * after
+ * it, whereas {whitespace} should generally have a * after it...
+ */
+
+special_whitespace ({space}+|{comment}{newline})
+horiz_whitespace ({horiz_space}|{comment})
+whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
+
+quote '
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue {whitespace_with_newline}{quote}
+
+/*
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}. It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
+ */
+quotecontinuefail {whitespace}*"-"?
+
+/* Bit string
+ * It is tempting to scan the string for only those characters
+ * which are allowed. However, this leads to silently swallowed
+ * characters if illegal characters are included in the string.
+ * For example, if xbinside is [01] then B'ABCD' is interpreted
+ * as a zero-length string, and the ABCD' is lost!
+ * Better to pass the string forward and let the input routines
+ * validate the contents.
+ */
+xbstart [bB]{quote}
+xbinside [^']*
+
+/* Hexadecimal number */
+xhstart [xX]{quote}
+xhinside [^']*
+
+/* National character */
+xnstart [nN]{quote}
+
+/* Quoted string that allows backslash escapes */
+xestart [eE]{quote}
+xeinside [^\\']+
+xeescape [\\][^0-7]
+xeoctesc [\\][0-7]{1,3}
+xehexesc [\\]x[0-9A-Fa-f]{1,2}
+xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
+xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
+
+/* Extended quote
+ * xqdouble implements embedded quote, ''''
+ */
+xqstart {quote}
+xqdouble {quote}{quote}
+xqinside [^']+
+
+/* $foo$ style quotes ("dollar quoting")
+ * The quoted string starts with $foo$ where "foo" is an optional string
+ * in the form of an identifier, except that it may not contain "$",
+ * and extends to the first occurrence of an identical string.
+ * There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
+ */
+dolq_start [A-Za-z\200-\377_]
+dolq_cont [A-Za-z\200-\377_0-9]
+dolqdelim \$({dolq_start}{dolq_cont}*)?\$
+dolqfailed \${dolq_start}{dolq_cont}*
+dolqinside [^$]+
+
+/* Double quote
+ * Allows embedded spaces and other special characters into identifiers.
+ */
+dquote \"
+xdstart {dquote}
+xdstop {dquote}
+xddouble {dquote}{dquote}
+xdinside [^"]+
+
+/* Quoted identifier with Unicode escapes */
+xuistart [uU]&{dquote}
+
+/* Quoted string with Unicode escapes */
+xusstart [uU]&{quote}
+
+/* error rule to avoid backup */
+xufailed [uU]&
+
+
+/* C-style comments
+ *
+ * The "extended comment" syntax closely resembles allowable operator syntax.
+ * The tricky part here is to get lex to recognize a string starting with
+ * slash-star as a comment, when interpreting it as an operator would produce
+ * a longer match --- remember lex will prefer a longer match! Also, if we
+ * have something like plus-slash-star, lex will think this is a 3-character
+ * operator whereas we want to see it as a + operator and a comment start.
+ * The solution is two-fold:
+ * 1. append {op_chars}* to xcstart so that it matches as much text as
+ * {operator} would. Then the tie-breaker (first matching rule of same
+ * length) ensures xcstart wins. We put back the extra stuff with yyless()
+ * in case it contains a star-slash that should terminate the comment.
+ * 2. In the operator rule, check for slash-star within the operator, and
+ * if found throw it back with yyless(). This handles the plus-slash-star
+ * problem.
+ * Dash-dash comments have similar interactions with the operator rule.
+ */
+xcstart \/\*{op_chars}*
+xcstop \*+\/
+xcinside [^*/]+
+
+digit [0-9]
+ident_start [A-Za-z\200-\377_]
+ident_cont [A-Za-z\200-\377_0-9\$]
+
+identifier {ident_start}{ident_cont}*
+
+/* Assorted special-case operators and operator-like tokens */
+typecast "::"
+dot_dot \.\.
+colon_equals ":="
+
+/*
+ * These operator-like tokens (unlike the above ones) also match the {operator}
+ * rule, which means that they might be overridden by a longer match if they
+ * are followed by a comment start or a + or - character. Accordingly, if you
+ * add to this list, you must also add corresponding code to the {operator}
+ * block to return the correct token in such cases. (This is not needed in
+ * psqlscan.l since the token value is ignored there.)
+ */
+equals_greater "=>"
+less_equals "<="
+greater_equals ">="
+less_greater "<>"
+not_equals "!="
+
+/*
+ * "self" is the set of chars that should be returned as single-character
+ * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
+ * which can be one or more characters long (but if a single-char token
+ * appears in the "self" set, it is not to be returned as an Op). Note
+ * that the sets overlap, but each has some chars that are not in the other.
+ *
+ * If you change either set, adjust the character lists appearing in the
+ * rule for "operator"!
+ */
+self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
+op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
+operator {op_chars}+
+
+/* We no longer allow unary minus in numbers.
+ * Instead we pass it separately to the parser, where it gets
+ * coerced via doNegate() -- Leon Aug 20 1999
+ *
+ * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
+ */
+
+integer {digit}+
+decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
+decimalfail {digit}+\.\.
+real ({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1 ({integer}|{decimal})[Ee]
+realfail2 ({integer}|{decimal})[Ee][-+]
+
+param \${integer}
+
+other .
+
+/*
+ * Dollar quoted strings are totally opaque, and no escaping is done on them.
+ * Other quoted strings must allow some special characters such as single-quote
+ * and newline.
+ * Embedded single-quotes are implemented both in the SQL standard
+ * style of two adjacent single quotes "''" and in the Postgres/Java style
+ * of escaped-quote "\'".
+ * Other embedded escaped characters are matched explicitly and the leading
+ * backslash is dropped from the string.
+ * Note that xcstart must appear before operator, as explained above!
+ * Also whitespace (comment) must appear before operator.
+ */
+
+%%
+
+{whitespace} {
+ /* ignore */
+ }
+
+{xcstart} {
+ /* Set location in case of syntax error in comment */
+ SET_YYLLOC();
+ yyextra->xcdepth = 0;
+ BEGIN(xc);
+ /* Put back any characters past slash-star; see above */
+ yyless(2);
+ }
+
+<xc>{
+{xcstart} {
+ (yyextra->xcdepth)++;
+ /* Put back any characters past slash-star; see above */
+ yyless(2);
+ }
+
+{xcstop}		{
+					/*
+					 * A star-slash closes the innermost open comment.  If
+					 * nested comments are still open (xcdepth > 0), just
+					 * pop one nesting level; otherwise the outermost
+					 * comment has ended and normal scanning resumes.
+					 */
+					if (yyextra->xcdepth > 0)
+						(yyextra->xcdepth)--;
+					else
+						BEGIN(INITIAL);
+				}
+
+{xcinside} {
+ /* ignore */
+ }
+
+{op_chars} {
+ /* ignore */
+ }
+
+\*+ {
+ /* ignore */
+ }
+
+<<EOF>> {
+ yyerror("unterminated /* comment");
+ }
+} /* <xc> */
+
+{xbstart} {
+ /* Binary bit type.
+ * At some point we should simply pass the string
+ * forward to the parser and label it there.
+ * In the meantime, place a leading "b" on the string
+ * to mark it for the input routine as a binary string.
+ */
+ SET_YYLLOC();
+ BEGIN(xb);
+ startlit();
+ addlitchar('b', yyscanner);
+ }
+<xh>{xhinside} |
+<xb>{xbinside} {
+ addlit(yytext, yyleng, yyscanner);
+ }
+<xb><<EOF>> { yyerror("unterminated bit string literal"); }
+
+{xhstart} {
+ /* Hexadecimal bit type.
+ * At some point we should simply pass the string
+ * forward to the parser and label it there.
+ * In the meantime, place a leading "x" on the string
+ * to mark it for the input routine as a hex string.
+ */
+ SET_YYLLOC();
+ BEGIN(xh);
+ startlit();
+ addlitchar('x', yyscanner);
+ }
+<xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
+
+{xnstart}		{
+					/*
+					 * National character literal.
+					 * The string itself is passed along as a normal
+					 * character string, but we emit an internally
+					 * generated "NCHAR" token in front of it.
+					 */
+					int			nchar_kw;
+
+					SET_YYLLOC();
+					/* consume only the 'n'; everything after it is thrown back */
+					yyless(1);
+
+					nchar_kw = ScanKeywordLookup("nchar", yyextra->keywordlist);
+					if (nchar_kw < 0)
+					{
+						/* If NCHAR isn't a keyword, just return "n" */
+						yylval->str = pstrdup("n");
+						return IDENT;
+					}
+
+					/* NCHAR is a keyword in the active list: return its token */
+					yylval->keyword = GetScanKeyword(nchar_kw,
+													 yyextra->keywordlist);
+					return yyextra->keyword_tokens[nchar_kw];
+				}
+
+{xqstart} {
+ /*
+ * Start of an ordinary quoted string. Reset the per-literal
+ * flags, record the token start, and empty the literal buffer.
+ * NOTE(review): warn_on_first_escape is presumably consumed by
+ * check_escape_warning(), which is not shown here -- confirm.
+ */
+ yyextra->warn_on_first_escape = true;
+ yyextra->saw_non_ascii = false;
+ SET_YYLLOC();
+ /*
+ * With standard_conforming_strings on, the body is scanned in
+ * <xq> (standard quoted strings); otherwise it is scanned in
+ * <xe>, the state that supports backslash escape sequences.
+ */
+ if (yyextra->standard_conforming_strings)
+ BEGIN(xq);
+ else
+ BEGIN(xe);
+ startlit();
+ }
+{xestart} {
+ /*
+ * Start of an extended quoted string (introduced by e or E
+ * before the quote). Unlike the plain-quote rule, this one
+ * clears warn_on_first_escape and always enters <xe>,
+ * regardless of standard_conforming_strings.
+ */
+ yyextra->warn_on_first_escape = false;
+ yyextra->saw_non_ascii = false;
+ SET_YYLLOC();
+ BEGIN(xe);
+ startlit();
+ }
+{xusstart} {
+ /*
+ * Start of a quoted string with Unicode escapes (u& or U&
+ * followed by a quote). Set the location first so that the
+ * error below points at the start of this token.
+ */
+ SET_YYLLOC();
+ /* Such strings are rejected when standard_conforming_strings is off */
+ if (!yyextra->standard_conforming_strings)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("unsafe use of string constant with Unicode escapes"),
+ errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
+ lexer_errposition()));
+ BEGIN(xus);
+ startlit();
+ }
+
+<xb,xh,xq,xe,xus>{quote} {
+ /*
+ * When we are scanning a quoted string and see an end
+ * quote, we must look ahead for a possible continuation.
+ * If we don't see one, we know the end quote was in fact
+ * the end of the string. To reduce the lexer table size,
+ * we use a single "xqs" state to do the lookahead for all
+ * types of strings.
+ */
+ yyextra->state_before_str_stop = YYSTATE;
+ BEGIN(xqs);
+ }
+<xqs>{quotecontinue} {
+ /*
+ * Found a quote continuation, so return to the in-quote
+ * state and continue scanning the literal. Nothing is
+ * added to the literal's contents.
+ */
+ BEGIN(yyextra->state_before_str_stop);
+ }
+<xqs>{quotecontinuefail} |
+<xqs>{other} |
+<xqs><<EOF>> {
+ /*
+ * Failed to see a quote continuation. Throw back
+ * everything after the end quote, and handle the string
+ * according to the state we were in previously.
+ */
+ yyless(0);
+ BEGIN(INITIAL);
+
+ switch (yyextra->state_before_str_stop)
+ {
+ case xb:
+ yylval->str = litbufdup(yyscanner);
+ return BCONST;
+ case xh:
+ yylval->str = litbufdup(yyscanner);
+ return XCONST;
+ case xq:
+ case xe:
+ /*
+ * Check that the data remains valid, if it might
+ * have been made invalid by unescaping any chars.
+ */
+ if (yyextra->saw_non_ascii)
+ pg_verifymbstr(yyextra->literalbuf,
+ yyextra->literallen,
+ false);
+ yylval->str = litbufdup(yyscanner);
+ return SCONST;
+ case xus:
+ yylval->str = litbufdup(yyscanner);
+ return USCONST;
+ default:
+ yyerror("unhandled previous state in xqs");
+ }
+ }
+
+<xq,xe,xus>{xqdouble} {
+ addlitchar('\'', yyscanner);
+ }
+<xq,xus>{xqinside} {
+ addlit(yytext, yyleng, yyscanner);
+ }
+<xe>{xeinside} {
+ addlit(yytext, yyleng, yyscanner);
+ }
+<xe>{xeunicode} {
+ pg_wchar c = strtoul(yytext + 2, NULL, 16);
+
+ /*
+ * For consistency with other productions, issue any
+ * escape warning with cursor pointing to start of string.
+ * We might want to change that, someday.
+ */
+ check_escape_warning(yyscanner);
+
+ /* Remember start of overall string token ... */
+ PUSH_YYLLOC();
+ /* ... and set the error cursor to point at this esc seq */
+ SET_YYLLOC();
+
+ if (is_utf16_surrogate_first(c))
+ {
+ yyextra->utf16_first_part = c;
+ BEGIN(xeu);
+ }
+ else if (is_utf16_surrogate_second(c))
+ yyerror("invalid Unicode surrogate pair");
+ else
+ addunicode(c, yyscanner);
+
+ /* Restore yylloc to be start of string token */
+ POP_YYLLOC();
+ }
+<xeu>{xeunicode} {
+ pg_wchar c = strtoul(yytext + 2, NULL, 16);
+
+ /* Remember start of overall string token ... */
+ PUSH_YYLLOC();
+ /* ... and set the error cursor to point at this esc seq */
+ SET_YYLLOC();
+
+ if (!is_utf16_surrogate_second(c))
+ yyerror("invalid Unicode surrogate pair");
+
+ c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
+
+ addunicode(c, yyscanner);
+
+ /* Restore yylloc to be start of string token */
+ POP_YYLLOC();
+
+ BEGIN(xe);
+ }
+<xeu>. |
+<xeu>\n |
+<xeu><<EOF>> {
+ /* Set the error cursor to point at missing esc seq */
+ SET_YYLLOC();
+ yyerror("invalid Unicode surrogate pair");
+ }
+<xe,xeu>{xeunicodefail} {
+ /* Set the error cursor to point at malformed esc seq */
+ SET_YYLLOC();
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+ errmsg("invalid Unicode escape"),
+ errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
+ lexer_errposition()));
+ }
+<xe>{xeescape} {
+ /*
+ * Backslash followed by one character that is not an octal
+ * digit; yytext[1] is that character.
+ */
+ if (yytext[1] == '\'')
+ {
+ /*
+ * Backslash-quote is rejected outright when backslash_quote
+ * is OFF, and also under SAFE_ENCODING when the client
+ * encoding is a client-only one.
+ */
+ if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
+ (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
+ PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
+ ereport(ERROR,
+ (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
+ errmsg("unsafe use of \\' in a string literal"),
+ errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
+ lexer_errposition()));
+ }
+ /* Possibly warn about the escape, then append the unescaped character */
+ check_string_escape_warning(yytext[1], yyscanner);
+ addlitchar(unescape_single_char(yytext[1], yyscanner),
+ yyscanner);
+ }
+<xe>{xeoctesc} {
+ /*
+ * Octal escape: backslash plus one to three octal digits.
+ * yytext + 1 skips the backslash; the converted value is
+ * stored in an unsigned char.
+ */
+ unsigned char c = strtoul(yytext + 1, NULL, 8);
+
+ check_escape_warning(yyscanner);
+ addlitchar(c, yyscanner);
+ /*
+ * A zero byte or a byte with the high bit set is flagged via
+ * saw_non_ascii; the string-end code calls pg_verifymbstr()
+ * on the literal when that flag is set.
+ */
+ if (c == '\0' || IS_HIGHBIT_SET(c))
+ yyextra->saw_non_ascii = true;
+ }
+<xe>{xehexesc} {
+ /*
+ * Hex escape: backslash, "x", then one or two hex digits.
+ * yytext + 2 skips the backslash and the "x".
+ */
+ unsigned char c = strtoul(yytext + 2, NULL, 16);
+
+ check_escape_warning(yyscanner);
+ addlitchar(c, yyscanner);
+ /*
+ * A zero byte or a byte with the high bit set is flagged via
+ * saw_non_ascii; the string-end code calls pg_verifymbstr()
+ * on the literal when that flag is set.
+ */
+ if (c == '\0' || IS_HIGHBIT_SET(c))
+ yyextra->saw_non_ascii = true;
+ }
+<xe>. {
+ /* This is only needed for \ just before EOF */
+ addlitchar(yytext[0], yyscanner);
+ }
+<xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
+
+{dolqdelim} {
+ /*
+ * Opening delimiter of a dollar-quoted string. Keep a private
+ * copy of the delimiter text in dolqstart: the <xdolq> rule for
+ * {dolqdelim} compares each candidate closing delimiter against
+ * it and frees it once the matching one is found.
+ */
+ SET_YYLLOC();
+ yyextra->dolqstart = pstrdup(yytext);
+ BEGIN(xdolq);
+ startlit();
+ }
+{dolqfailed} {
+ SET_YYLLOC();
+ /* throw back all but the initial "$" */
+ yyless(1);
+ /* and treat it as {other} */
+ return yytext[0];
+ }
+<xdolq>{dolqdelim} {
+ if (strcmp(yytext, yyextra->dolqstart) == 0)
+ {
+ pfree(yyextra->dolqstart);
+ yyextra->dolqstart = NULL;
+ BEGIN(INITIAL);
+ yylval->str = litbufdup(yyscanner);
+ return SCONST;
+ }
+ else
+ {
+ /*
+ * When we fail to match $...$ to dolqstart, transfer
+ * the $... part to the output, but put back the final
+ * $ for rescanning. Consider $delim$...$junk$delim$
+ */
+ addlit(yytext, yyleng - 1, yyscanner);
+ yyless(yyleng - 1);
+ }
+ }
+<xdolq>{dolqinside} {
+ addlit(yytext, yyleng, yyscanner);
+ }
+<xdolq>{dolqfailed} {
+ addlit(yytext, yyleng, yyscanner);
+ }
+<xdolq>. {
+ /* This is only needed for $ inside the quoted text */
+ addlitchar(yytext[0], yyscanner);
+ }
+<xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
+
+{xdstart} {
+ SET_YYLLOC();
+ BEGIN(xd);
+ startlit();
+ }
+{xuistart} {
+ SET_YYLLOC();
+ BEGIN(xui);
+ startlit();
+ }
+<xd>{xdstop} {
+ /*
+ * Closing double quote of a delimited identifier: return the
+ * accumulated literal as an IDENT.
+ */
+ char *ident;
+
+ BEGIN(INITIAL);
+ /* An empty "" identifier is an error */
+ if (yyextra->literallen == 0)
+ yyerror("zero-length delimited identifier");
+ ident = litbufdup(yyscanner);
+ /* Identifiers of NAMEDATALEN bytes or more are handed to truncate_identifier() */
+ if (yyextra->literallen >= NAMEDATALEN)
+ truncate_identifier(ident, yyextra->literallen, true);
+ yylval->str = ident;
+ return IDENT;
+ }
+<xui>{dquote} {
+ BEGIN(INITIAL);
+ if (yyextra->literallen == 0)
+ yyerror("zero-length delimited identifier");
+ /* can't truncate till after we de-escape the ident */
+ yylval->str = litbufdup(yyscanner);
+ return UIDENT;
+ }
+<xd,xui>{xddouble} {
+ addlitchar('"', yyscanner);
+ }
+<xd,xui>{xdinside} {
+ addlit(yytext, yyleng, yyscanner);
+ }
+<xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
+
+{xufailed} {
+ char *ident;
+
+ SET_YYLLOC();
+ /* throw back all but the initial u/U */
+ yyless(1);
+ /* and treat it as {identifier} */
+ ident = downcase_truncate_identifier(yytext, yyleng, true);
+ yylval->str = ident;
+ return IDENT;
+ }
+
+{typecast} {
+ SET_YYLLOC();
+ return TYPECAST;
+ }
+
+{dot_dot} {
+ SET_YYLLOC();
+ return DOT_DOT;
+ }
+
+{colon_equals} {
+ SET_YYLLOC();
+ return COLON_EQUALS;
+ }
+
+{equals_greater} {
+ SET_YYLLOC();
+ return EQUALS_GREATER;
+ }
+
+{less_equals} {
+ SET_YYLLOC();
+ return LESS_EQUALS;
+ }
+
+{greater_equals} {
+ SET_YYLLOC();
+ return GREATER_EQUALS;
+ }
+
+{less_greater} {
+ /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
+ SET_YYLLOC();
+ return NOT_EQUALS;
+ }
+
+{not_equals} {
+ /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
+ SET_YYLLOC();
+ return NOT_EQUALS;
+ }
+
+{self} {
+ SET_YYLLOC();
+ return yytext[0];
+ }
+
+{operator} {
+ /*
+ * Check for embedded slash-star or dash-dash; those
+ * are comment starts, so operator must stop there.
+ * Note that slash-star or dash-dash at the first
+ * character will match a prior rule, not this one.
+ */
+ int nchars = yyleng;
+ char *slashstar = strstr(yytext, "/*");
+ char *dashdash = strstr(yytext, "--");
+
+ if (slashstar && dashdash)
+ {
+ /* if both appear, take the first one */
+ if (slashstar > dashdash)
+ slashstar = dashdash;
+ }
+ else if (!slashstar)
+ slashstar = dashdash;
+ if (slashstar)
+ nchars = slashstar - yytext;
+
+ /*
+ * For SQL compatibility, '+' and '-' cannot be the
+ * last char of a multi-char operator unless the operator
+ * contains chars that are not in SQL operators.
+ * The idea is to lex '=-' as two operators, but not
+ * to forbid operator names like '?-' that could not be
+ * sequences of SQL operators.
+ */
+ if (nchars > 1 &&
+ (yytext[nchars - 1] == '+' ||
+ yytext[nchars - 1] == '-'))
+ {
+ int ic;
+
+ for (ic = nchars - 2; ic >= 0; ic--)
+ {
+ char c = yytext[ic];
+ if (c == '~' || c == '!' || c == '@' ||
+ c == '#' || c == '^' || c == '&' ||
+ c == '|' || c == '`' || c == '?' ||
+ c == '%')
+ break;
+ }
+ if (ic < 0)
+ {
+ /*
+ * didn't find a qualifying character, so remove
+ * all trailing [+-]
+ */
+ do {
+ nchars--;
+ } while (nchars > 1 &&
+ (yytext[nchars - 1] == '+' ||
+ yytext[nchars - 1] == '-'));
+ }
+ }
+
+ SET_YYLLOC();
+
+ if (nchars < yyleng)
+ {
+ /* Strip the unwanted chars from the token */
+ yyless(nchars);
+ /*
+ * If what we have left is only one char, and it's
+ * one of the characters matching "self", then
+ * return it as a character token the same way
+ * that the "self" rule would have.
+ */
+ if (nchars == 1 &&
+ strchr(",()[].;:+-*/%^<>=", yytext[0]))
+ return yytext[0];
+ /*
+ * Likewise, if what we have left is two chars, and
+ * those match the tokens ">=", "<=", "=>", "<>" or
+ * "!=", then we must return the appropriate token
+ * rather than the generic Op.
+ */
+ if (nchars == 2)
+ {
+ if (yytext[0] == '=' && yytext[1] == '>')
+ return EQUALS_GREATER;
+ if (yytext[0] == '>' && yytext[1] == '=')
+ return GREATER_EQUALS;
+ if (yytext[0] == '<' && yytext[1] == '=')
+ return LESS_EQUALS;
+ if (yytext[0] == '<' && yytext[1] == '>')
+ return NOT_EQUALS;
+ if (yytext[0] == '!' && yytext[1] == '=')
+ return NOT_EQUALS;
+ }
+ }
+
+ /*
+ * Complain if operator is too long. Unlike the case
+ * for identifiers, we make this an error not a notice-
+ * and-truncate, because the odds are we are looking at
+ * a syntactic mistake anyway.
+ */
+ if (nchars >= NAMEDATALEN)
+ yyerror("operator too long");
+
+ yylval->str = pstrdup(yytext);
+ return Op;
+ }
+
+{param}			{
+					/*
+					 * Positional parameter reference: "$" followed by
+					 * digits.  yytext + 1 skips the "$".
+					 *
+					 * The number must be parsed with range checking.
+					 * atol() gives no overflow indication, so an
+					 * over-long digit string could silently end up as
+					 * some other parameter number instead of being
+					 * rejected.  strtoint() sets errno to ERANGE when
+					 * the value does not fit in an int, which lets us
+					 * report a syntax error at the token instead.
+					 */
+					char	   *endptr;
+					int			paramno;
+
+					SET_YYLLOC();
+					errno = 0;
+					paramno = strtoint(yytext + 1, &endptr, 10);
+					if (*endptr != '\0' || errno == ERANGE)
+						yyerror("parameter number too large");
+					yylval->ival = paramno;
+					return PARAM;
+				}
+
+{integer} {
+ SET_YYLLOC();
+ return process_integer_literal(yytext, yylval);
+ }
+{decimal} {
+ SET_YYLLOC();
+ yylval->str = pstrdup(yytext);
+ return FCONST;
+ }
+{decimalfail} {
+ /* throw back the .., and treat as integer */
+ yyless(yyleng - 2);
+ SET_YYLLOC();
+ return process_integer_literal(yytext, yylval);
+ }
+{real} {
+ SET_YYLLOC();
+ yylval->str = pstrdup(yytext);
+ return FCONST;
+ }
+{realfail1} {
+ /*
+ * throw back the [Ee], and figure out whether what
+ * remains is an {integer} or {decimal}.
+ */
+ yyless(yyleng - 1);
+ SET_YYLLOC();
+ return process_integer_literal(yytext, yylval);
+ }
+{realfail2} {
+ /* throw back the [Ee][+-], and proceed as above */
+ yyless(yyleng - 2);
+ SET_YYLLOC();
+ return process_integer_literal(yytext, yylval);
+ }
+
+
+{identifier} {
+ int kwnum;
+ char *ident;
+
+ SET_YYLLOC();
+
+ /* Is it a keyword? */
+ kwnum = ScanKeywordLookup(yytext,
+ yyextra->keywordlist);
+ if (kwnum >= 0)
+ {
+ yylval->keyword = GetScanKeyword(kwnum,
+ yyextra->keywordlist);
+ return yyextra->keyword_tokens[kwnum];
+ }
+
+ /*
+ * No. Convert the identifier to lower case, and truncate
+ * if necessary.
+ */
+ ident = downcase_truncate_identifier(yytext, yyleng, true);
+ yylval->str = ident;
+ return IDENT;
+ }
+
+{other} {
+ SET_YYLLOC();
+ return yytext[0];
+ }
+
+<<EOF>> {
+ SET_YYLLOC();
+ yyterminate();
+ }
+
+%%
+
+/* LCOV_EXCL_STOP */
+
+/*
+ * Arrange access to yyextra for subroutines of the main yylex() function.
+ * We expect each subroutine to have a yyscanner parameter. Rather than
+ * use the yyget_xxx functions, which might or might not get inlined by the
+ * compiler, we cheat just a bit and cast yyscanner to the right type.
+ */
+#undef yyextra
+#define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r)
+
+/* Likewise for a couple of other things we need. */
+#undef yylloc
+#define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r)
+#undef yyleng
+#define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r)
+
+
+/*
+ * scanner_errposition
+ *		Attach a cursor position to the error being reported, when known.
+ *
+ * "location" is a byte offset into the scan buffer; a negative value means
+ * the location is unknown, and then nothing is reported.  Intended for use
+ * inside an ereport() call, or from an error callback such as the one
+ * installed by setup_scanner_errposition_callback().  The result is a dummy
+ * value (always 0, in fact).
+ *
+ * This only works for messages emitted during raw parsing (essentially
+ * scan.l, parser.c, and gram.y), because the yyscanner struct must still
+ * be available.
+ */
+int
+scanner_errposition(int location, core_yyscan_t yyscanner)
+{
+	if (location >= 0)
+	{
+		/*
+		 * Count the characters preceding the byte offset and add one to
+		 * obtain the character number, then hand it to ereport.
+		 */
+		int			charno;
+
+		charno = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
+		return errposition(charno);
+	}
+
+	/* unknown location: nothing to do */
+	return 0;
+}
+
+/*
+ * Error context callback that adds the saved scanner location to an error.
+ *
+ * While installed, this fires for *every* error, related to the scanned
+ * text or not.  Query cancel is the one case we filter out, since a cursor
+ * position would be irrelevant there --- are there any other important
+ * cases?
+ */
+static void
+scb_error_callback(void *arg)
+{
+	ScannerCallbackState *state = (ScannerCallbackState *) arg;
+
+	if (geterrcode() == ERRCODE_QUERY_CANCELED)
+		return;
+
+	(void) scanner_errposition(state->location, state->yyscanner);
+}
+
+/*
+ * setup_scanner_errposition_callback
+ *		Make errors raised outside the scanner carry an error position
+ *
+ * The scanner sometimes calls functions that are not part of the scanner
+ * subsystem and cannot reasonably be handed the yyscanner pointer, yet
+ * any error they throw should still be tagged with a location.  This
+ * function pushes an error context stack entry that does so.  Usage:
+ *
+ *		declare a local variable "ScannerCallbackState scbstate"
+ *		...
+ *		setup_scanner_errposition_callback(&scbstate, yyscanner, location);
+ *		call function that might throw error;
+ *		cancel_scanner_errposition_callback(&scbstate);
+ */
+void
+setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
+								   core_yyscan_t yyscanner,
+								   int location)
+{
+	/* What the callback will need in order to report a position */
+	scbstate->location = location;
+	scbstate->yyscanner = yyscanner;
+
+	/* The callback entry itself, pointing back at this state struct */
+	scbstate->errcallback.arg = (void *) scbstate;
+	scbstate->errcallback.callback = scb_error_callback;
+
+	/* Push it onto the error context stack, remembering the old top */
+	scbstate->errcallback.previous = error_context_stack;
+	error_context_stack = &scbstate->errcallback;
+}
+
+/*
+ * Cancel a previously-set-up errposition callback.
+ *
+ * scbstate must be the state that was passed to
+ * setup_scanner_errposition_callback(); the stack entry that function saved
+ * in scbstate->errcallback.previous is reinstated as the top of the error
+ * context stack. Nothing here verifies that scbstate's entry is currently
+ * the topmost one.
+ */
+void
+cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
+{
+ /* Pop the error context stack by restoring the saved predecessor */
+ error_context_stack = scbstate->errcallback.previous;
+}
+
+/*
+ * scanner_yyerror
+ *		Report a lexer or grammar error.
+ *
+ * The cursor position attached to the message is whatever YYLLOC was last
+ * set to: the start of the current token when called from within yylex(),
+ * or the most recently lexed token when called from the grammar.
+ * That suits syntax errors coming from the Bison parser, since Bison
+ * parsers report an error as soon as the first unparsable token is reached.
+ * Be careful about using yyerror for anything else, because the cursor
+ * position might be misleading!
+ */
+void
+scanner_yyerror(const char *message, core_yyscan_t yyscanner)
+{
+	const char *tokstart = yyextra->scanbuf + *yylloc;
+
+	if (*tokstart != YY_END_OF_BUFFER_CHAR)
+	{
+		/* there is still text at the error position, so quote it */
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+		/* translator: first %s is typically the translation of "syntax error" */
+				 errmsg("%s at or near \"%s\"", _(message), tokstart),
+				 lexer_errposition()));
+	}
+	else
+	{
+		/* the error position is the end-of-buffer marker */
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+		/* translator: %s is typically the translation of "syntax error" */
+				 errmsg("%s at end of input", _(message)),
+				 lexer_errposition()));
+	}
+}
+
+
+/*
+ * Called before any actual parsing is done
+ *
+ * str is the NUL-terminated input text; it is copied into a private scan
+ * buffer, so the caller's string is only read. yyext is caller-provided
+ * storage that is registered as the scanner's "extra" data and filled in
+ * here, including the keywordlist and keyword_tokens pointers.
+ *
+ * Returns the new scanner handle. A failure of yylex_init() is reported
+ * with elog(ERROR).
+ */
+core_yyscan_t
+scanner_init(const char *str,
+ core_yy_extra_type *yyext,
+ const ScanKeywordList *keywordlist,
+ const uint16 *keyword_tokens)
+{
+ Size slen = strlen(str);
+ yyscan_t scanner;
+
+ if (yylex_init(&scanner) != 0)
+ elog(ERROR, "yylex_init() failed: %m");
+
+ core_yyset_extra(yyext, scanner); /* attach yyext to the new scanner */
+
+ yyext->keywordlist = keywordlist;
+ yyext->keyword_tokens = keyword_tokens;
+
+ yyext->backslash_quote = backslash_quote; /* copy the same-named settings into yyext */
+ yyext->escape_string_warning = escape_string_warning;
+ yyext->standard_conforming_strings = standard_conforming_strings;
+
+ /*
+ * Make a scan buffer with special termination needed by flex: the input
+ * bytes followed by two YY_END_OF_BUFFER_CHAR bytes, hence slen + 2.
+ * scanbuflen records the input length only, without the two terminators.
+ */
+ yyext->scanbuf = (char *) palloc(slen + 2);
+ yyext->scanbuflen = slen;
+ memcpy(yyext->scanbuf, str, slen);
+ yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
+ yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
+
+ /* initialize literal buffer to a reasonable but expansible size */
+ yyext->literalalloc = 1024;
+ yyext->literalbuf = (char *) palloc(yyext->literalalloc);
+ yyext->literallen = 0; /* buffer starts out empty */
+
+ return scanner;
+}
+
+
+/*
+ * Called after parsing is done to clean up after scanner_init()
+ *
+ * yyscanner is the scanner whose scan buffer and literal buffer may be
+ * released here.
+ */
+void
+scanner_finish(core_yyscan_t yyscanner)
+{
+ /*
+ * We don't bother to call yylex_destroy(), because all it would do is
+ * pfree a small amount of control storage. It's cheaper to leak the
+ * storage until the parsing context is destroyed. The amount of space
+ * involved is usually negligible compared to the output parse tree
+ * anyway.
+ *
+ * We do bother to pfree the scanbuf and literal buffer, but only if they
+ * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
+ * Note that the two tests use different measures: scanbuflen for the scan
+ * buffer, but the allocated size (literalalloc) for the literal buffer.
+ */
+ if (yyextra->scanbuflen >= 8192)
+ pfree(yyextra->scanbuf);
+ if (yyextra->literalalloc >= 8192)
+ pfree(yyextra->literalbuf);
+}
+
+
+/*
+ * Append yleng bytes starting at ytext to the literal buffer.
+ *
+ * The buffer is enlarged first whenever the appended data would not leave
+ * at least one unused byte in it.
+ */
+static void
+addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
+{
+	/* grow by repeated doubling until the new data fits, then reallocate once */
+	if (yyextra->literalalloc <= (yyextra->literallen + yleng))
+	{
+		while (yyextra->literalalloc <= (yyextra->literallen + yleng))
+			yyextra->literalalloc *= 2;
+
+		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
+												yyextra->literalalloc);
+	}
+
+	/* copy the new bytes in after the existing contents */
+	memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
+	yyextra->literallen += yleng;
+}
+
+
+/*
+ * Append the single byte ychar to the literal buffer, doubling the buffer
+ * first if that byte would not leave at least one unused byte in it.
+ */
+static void
+addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
+{
+	if (yyextra->literalalloc <= (yyextra->literallen + 1))
+	{
+		yyextra->literalalloc *= 2;
+		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
+												yyextra->literalalloc);
+	}
+
+	/* store the byte and advance the length */
+	yyextra->literalbuf[yyextra->literallen++] = ychar;
+}
+
+
+/*
+ * Return a freshly palloc'd copy of the current contents of literalbuf,
+ * with a terminating null byte appended.
+ */
+static char *
+litbufdup(core_yyscan_t yyscanner)
+{
+	int			nbytes = yyextra->literallen;
+	char	   *copy = palloc(nbytes + 1);
+
+	memcpy(copy, yyextra->literalbuf, nbytes);
+	copy[nbytes] = '\0';
+
+	return copy;
+}
+
+/*
+ * Process {integer}.  This also does the right thing with {decimal}, ie
+ * digits and a decimal point.
+ *
+ * If the whole token converts to an int, the value is stored in lval->ival
+ * and ICONST is returned.  Otherwise a copy of the token text is stored in
+ * lval->str and FCONST is returned.
+ */
+static int
+process_integer_literal(const char *token, YYSTYPE *lval)
+{
+	char	   *stop;
+	int			parsed;
+
+	errno = 0;
+	parsed = strtoint(token, &stop, 10);
+
+	if (*stop == '\0' && errno != ERANGE)
+	{
+		/* the entire token was consumed and the value was in range */
+		lval->ival = parsed;
+		return ICONST;
+	}
+
+	/* integer too large (or contains decimal pt), treat it as a float */
+	lval->str = pstrdup(token);
+	return FCONST;
+}
+
+/*
+ * Append the Unicode code point c to the literal buffer, after passing it
+ * through pg_unicode_to_server().  An invalid code point is reported via
+ * yyerror().
+ */
+static void
+addunicode(pg_wchar c, core_yyscan_t yyscanner)
+{
+	char		converted[MAX_UNICODE_EQUIVALENT_STRING + 1];
+	ScannerCallbackState errstate;
+
+	if (!is_valid_unicode_codepoint(c))
+		yyerror("invalid Unicode escape value");
+
+	/*
+	 * pg_unicode_to_server() is expected to complain about any code point
+	 * it cannot convert, so there is no need to set saw_non_ascii here.
+	 * The errposition callback is installed around the call so that such a
+	 * complaint carries the current token location.
+	 */
+	setup_scanner_errposition_callback(&errstate, yyscanner, *(yylloc));
+	pg_unicode_to_server(c, (unsigned char *) converted);
+	cancel_scanner_errposition_callback(&errstate);
+
+	addlit(converted, strlen(converted), yyscanner);
+}
+
+/*
+ * Translate the character following a backslash into the byte it denotes.
+ *
+ * b, f, n, r and t map to the corresponding control characters; any other
+ * character is returned unchanged.
+ */
+static unsigned char
+unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
+{
+	if (c == 'b')
+		return '\b';
+	if (c == 'f')
+		return '\f';
+	if (c == 'n')
+		return '\n';
+	if (c == 'r')
+		return '\r';
+	if (c == 't')
+		return '\t';
+
+	/* check for backslash followed by non-7-bit-ASCII */
+	if (c == '\0' || IS_HIGHBIT_SET(c))
+		yyextra->saw_non_ascii = true;
+
+	return c;
+}
+
+/*
+ * Possibly warn about a backslash escape, with a message specific to the
+ * escaped character ychar.
+ *
+ * \' and \\ get dedicated messages; every other character is passed on to
+ * check_escape_warning().  In all cases warn_on_first_escape ends up false.
+ */
+static void
+check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
+{
+	switch (ychar)
+	{
+		case '\'':
+			if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
+				ereport(WARNING,
+						(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
+						 errmsg("nonstandard use of \\' in a string literal"),
+						 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
+						 lexer_errposition()));
+			yyextra->warn_on_first_escape = false;	/* warn only once per string */
+			break;
+
+		case '\\':
+			if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
+				ereport(WARNING,
+						(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
+						 errmsg("nonstandard use of \\\\ in a string literal"),
+						 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
+						 lexer_errposition()));
+			yyextra->warn_on_first_escape = false;	/* warn only once per string */
+			break;
+
+		default:
+			check_escape_warning(yyscanner);
+			break;
+	}
+}
+
+static void
+check_escape_warning(core_yyscan_t yyscanner)
+{
+ if (yyextra->warn_on_first_escape && yyextra->escape_string_warning) /* both flags must be set to warn */
+ ereport(WARNING,
+ (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
+ errmsg("nonstandard use of escape in a string literal"),
+ errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
+ lexer_errposition()));
+ yyextra->warn_on_first_escape = false; /* warn only once per string; cleared whether or not we warned */
+}
+
+/*
+ * Interface functions to make flex use palloc() instead of malloc().
+ * It'd be better to make these static, but flex insists otherwise.
+ *
+ * None of these functions makes any use of its yyscanner argument.
+ */
+
+void *
+core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
+{
+ return palloc(bytes); /* plain pass-through to palloc() */
+}
+
+void *
+core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
+{
+	/* with no existing chunk to resize, this is just a fresh allocation */
+	if (ptr == NULL)
+		return palloc(bytes);
+
+	return repalloc(ptr, bytes);
+}
+
+void
+core_yyfree(void *ptr, core_yyscan_t yyscanner)
+{
+	/* a null pointer is quietly ignored rather than handed to pfree() */
+	if (ptr == NULL)
+		return;
+
+	pfree(ptr);
+}