1 files changed, 501 insertions, 0 deletions
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
new file mode 100644
index 0000000..875de7b
--- /dev/null
+++ b/src/backend/parser/parser.c
@@ -0,0 +1,501 @@
+/*-------------------------------------------------------------------------
+ *
+ * parser.c
+ *		Main entry point/driver for PostgreSQL grammar
+ *
+ * Note that the grammar is not allowed to perform any table access
+ * (since we need to be able to do basic parsing even while inside an
+ * aborted transaction).  Therefore, the data structures returned by
+ * the grammar are "raw" parsetrees that still need to be analyzed by
+ * analyze.c and related files.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/parser/parser.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "mb/pg_wchar.h"
+#include "parser/gramparse.h"
+#include "parser/parser.h"
+#include "parser/scansup.h"
+
+/* File-local helpers for Unicode-escape handling; defined further below */
+static bool check_uescapechar(unsigned char escape);
+static char *str_udeescape(const char *str, char escape,
+						   int position, core_yyscan_t yyscanner);
+
+/*
+ * raw_parser
+ *		Run lexical and grammatical analysis over a query string.
+ *
+ * The result is a list of raw (not yet analyzed) parse trees whose
+ * contents take the form that the given RawParseMode calls for.  NIL is
+ * returned when base_yyparse() reports failure.
+ */
+List *
+raw_parser(const char *str, RawParseMode mode)
+{
+	/* initial lookahead token per mode; indexed by RawParseMode enum */
+	static const int mode_token[] = {
+		0,						/* RAW_PARSE_DEFAULT */
+		MODE_TYPE_NAME,			/* RAW_PARSE_TYPE_NAME */
+		MODE_PLPGSQL_EXPR,		/* RAW_PARSE_PLPGSQL_EXPR */
+		MODE_PLPGSQL_ASSIGN1,	/* RAW_PARSE_PLPGSQL_ASSIGN1 */
+		MODE_PLPGSQL_ASSIGN2,	/* RAW_PARSE_PLPGSQL_ASSIGN2 */
+		MODE_PLPGSQL_ASSIGN3	/* RAW_PARSE_PLPGSQL_ASSIGN3 */
+	};
+	base_yy_extra_type yyextra;
+	core_yyscan_t yyscanner;
+	int			parse_status;
+
+	/* set up the flex scanner over the input string */
+	yyscanner = scanner_init(str, &yyextra.core_yy_extra,
+							 &ScanKeywords, ScanKeywordTokens);
+
+	/*
+	 * The lookahead state is the only part of yyextra that base_yylex()
+	 * needs from us.  Non-default modes start with a mode-selecting token
+	 * already queued up as the lookahead.
+	 */
+	yyextra.have_lookahead = (mode != RAW_PARSE_DEFAULT);
+	if (yyextra.have_lookahead)
+	{
+		yyextra.lookahead_token = mode_token[mode];
+		yyextra.lookahead_yylloc = 0;
+		yyextra.lookahead_end = NULL;
+	}
+
+	/* set up the bison parser */
+	parser_init(&yyextra);
+
+	/* run the grammar */
+	parse_status = base_yyparse(yyscanner);
+
+	/* release the scanner's memory before returning */
+	scanner_finish(yyscanner);
+
+	/* a nonzero status from base_yyparse() means the parse failed */
+	return (parse_status != 0) ? NIL : yyextra.parsetree;
+}
+
+
+/*
+ * Intermediate filter between parser and core lexer (core_yylex in scan.l).
+ *
+ * This filter is needed because in some cases the standard SQL grammar
+ * requires more than one token lookahead.  We reduce these cases to one-token
+ * lookahead by replacing tokens here (NOT -> NOT_LA, NULLS_P -> NULLS_LA,
+ * WITH -> WITH_LA), in order to keep the grammar LALR(1).
+ *
+ * Using a filter is simpler than trying to recognize multiword tokens
+ * directly in scan.l, because we'd have to allow for comments between the
+ * words.  Furthermore it's not clear how to do that without re-introducing
+ * scanner backtrack, which would cost more performance than this filter does.
+ *
+ * We also use this filter to convert UIDENT and USCONST sequences (with an
+ * optional trailing UESCAPE 'c' clause) into plain IDENT and SCONST tokens.
+ * Extra grammar productions could do that, but this is more efficient.
+ *
+ * lvalp and llocp receive the semantic value and location of the returned
+ * token; the result is the token code.  The filter also translates between
+ * core_YYSTYPE and YYSTYPE (really the same thing, notationally different).
+ */
+int
+base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
+{
+	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
+	int			cur_token;
+	int			next_token;
+	int			cur_token_length;
+	YYLTYPE		cur_yylloc;
+
+	/* Get next token --- we might already have it as saved lookahead */
+	if (yyextra->have_lookahead)
+	{
+		cur_token = yyextra->lookahead_token;
+		lvalp->core_yystype = yyextra->lookahead_yylval;
+		*llocp = yyextra->lookahead_yylloc;
+		if (yyextra->lookahead_end)
+			*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
+		yyextra->have_lookahead = false;
+	}
+	else
+		cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
+
+	/*
+	 * If this token isn't one that requires lookahead, just return it.  If it
+	 * does, determine the token length.  For the fixed-length keywords we
+	 * hardwire the length, which is cheaper than strlen(); for UIDENT/USCONST
+	 * we rely on the '\0' that core_yylex() stored just past the token's end.
+	 */
+	switch (cur_token)
+	{
+		case NOT:
+			cur_token_length = 3;
+			break;
+		case NULLS_P:
+			cur_token_length = 5;
+			break;
+		case WITH:
+			cur_token_length = 4;
+			break;
+		case UIDENT:
+		case USCONST:
+			cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
+			break;
+		default:
+			return cur_token;
+	}
+
+	/*
+	 * Identify end+1 of current token.  core_yylex() has temporarily stored a
+	 * '\0' here, and will undo that when we call it again.  We need to redo
+	 * it to fully revert the lookahead call for error reporting purposes.
+	 */
+	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
+		*llocp + cur_token_length;
+	Assert(*(yyextra->lookahead_end) == '\0');
+
+	/*
+	 * Save and restore *llocp around the call.  It might look like we could
+	 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
+	 * does not work because flex actually holds onto the last-passed pointer
+	 * internally, and will use that for error reporting.  We need any error
+	 * reports to point to the current token, not the next one.
+	 */
+	cur_yylloc = *llocp;
+
+	/* Get next token, saving outputs into lookahead variables */
+	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
+	yyextra->lookahead_token = next_token;
+	yyextra->lookahead_yylloc = *llocp;
+
+	*llocp = cur_yylloc;
+
+	/* Now revert the un-truncation of the current token */
+	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
+	*(yyextra->lookahead_end) = '\0';
+
+	yyextra->have_lookahead = true;
+
+	/* Replace cur_token if needed, based on lookahead */
+	switch (cur_token)
+	{
+		case NOT:
+			/* NOT becomes NOT_LA before BETWEEN, IN, LIKE, ILIKE or SIMILAR */
+			switch (next_token)
+			{
+				case BETWEEN:
+				case IN_P:
+				case LIKE:
+				case ILIKE:
+				case SIMILAR:
+					cur_token = NOT_LA;
+					break;
+			}
+			break;
+
+		case NULLS_P:
+			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
+			switch (next_token)
+			{
+				case FIRST_P:
+				case LAST_P:
+					cur_token = NULLS_LA;
+					break;
+			}
+			break;
+
+		case WITH:
+			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
+			switch (next_token)
+			{
+				case TIME:
+				case ORDINALITY:
+					cur_token = WITH_LA;
+					break;
+			}
+			break;
+
+		case UIDENT:
+		case USCONST:
+			/* Look ahead for UESCAPE */
+			if (next_token == UESCAPE)
+			{
+				/* Yup, so get third token, which had better be SCONST */
+				const char *escstr;
+
+				/* Again save and restore *llocp */
+				cur_yylloc = *llocp;
+
+				/* Un-truncate current token so errors point to third token */
+				*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
+
+				/* Get third token */
+				next_token = core_yylex(&(yyextra->lookahead_yylval),
+										llocp, yyscanner);
+
+				/* If we throw error here, it will point to third token */
+				if (next_token != SCONST)
+					scanner_yyerror("UESCAPE must be followed by a simple string literal",
+									yyscanner);
+
+				escstr = yyextra->lookahead_yylval.str;
+				if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
+					scanner_yyerror("invalid Unicode escape character",
+									yyscanner);
+
+				/* Now restore *llocp; errors will point to first token */
+				*llocp = cur_yylloc;
+
+				/* Apply Unicode conversion */
+				lvalp->core_yystype.str =
+					str_udeescape(lvalp->core_yystype.str,
+								  escstr[0],
+								  *llocp,
+								  yyscanner);
+
+				/*
+				 * We don't need to revert the un-truncation of UESCAPE.  What
+				 * we do want to do is clear have_lookahead, thereby consuming
+				 * all three tokens.
+				 */
+				yyextra->have_lookahead = false;
+			}
+			else
+			{
+				/* No UESCAPE, so convert using default escape character */
+				lvalp->core_yystype.str =
+					str_udeescape(lvalp->core_yystype.str,
+								  '\\',
+								  *llocp,
+								  yyscanner);
+			}
+
+			if (cur_token == UIDENT)
+			{
+				/* It's an identifier, so truncate as appropriate */
+				truncate_identifier(lvalp->core_yystype.str,
+									strlen(lvalp->core_yystype.str),
+									true);
+				cur_token = IDENT;
+			}
+			else if (cur_token == USCONST)
+			{
+				cur_token = SCONST; /* string constants are not truncated */
+			}
+			break;
+	}
+
+	return cur_token;
+}
+
+/* Turn one hex digit character (pre-validated by the caller) into 0..15 */
+static unsigned int
+hexval(unsigned char c)
+{
+	if ('0' <= c && c <= '9')
+		return c - '0';
+	else if ('a' <= c && c <= 'f')
+		return 10 + (c - 'a');
+	else if ('A' <= c && c <= 'F')
+		return 10 + (c - 'A');
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* never reached */
+}
+
+/* Raise a syntax error unless 'c' is a valid Unicode code point */
+static void
+check_unicode_value(pg_wchar c)
+{
+	if (is_valid_unicode_codepoint(c))
+		return;					/* nothing to complain about */
+	ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
+					errmsg("invalid Unicode escape value")));
+}
+
+/* Report whether 'escape' may serve as the escape character in UESCAPE */
+static bool
+check_uescapechar(unsigned char escape)
+{
+	/* hex digits, '+', quote characters and whitespace are all disallowed */
+	if (isxdigit(escape))
+		return false;
+	if (escape == '+' || escape == '\'' || escape == '"')
+		return false;
+	if (scanner_isspace(escape))
+		return false;
+	return true;
+}
+
+/*
+ * Process Unicode escapes in "str", producing a palloc'd plain string.
+ * Invalid escapes or surrogate pairs raise a syntax error via ereport().
+ *
+ * escape: the escape character to use
+ * position: start of the U&'' or U&"" token; yyscanner: error-report context
+ */
+static char *
+str_udeescape(const char *str, char escape,
+			  int position, core_yyscan_t yyscanner)
+{
+	const char *in;
+	char	   *new,
+			   *out;
+	size_t		new_len;
+	pg_wchar	pair_first = 0; /* pending first surrogate, else 0 */
+	ScannerCallbackState scbstate;
+
+	/*
+	 * Guesstimate that result will be no longer than input, but allow enough
+	 * padding for Unicode conversion.
+	 */
+	new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
+	new = palloc(new_len);
+
+	in = str;
+	out = new;
+	while (*in)
+	{
+		/* Enlarge string if needed */
+		size_t		out_dist = out - new;
+
+		if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
+		{
+			new_len *= 2;
+			new = repalloc(new, new_len);
+			out = new + out_dist;
+		}
+
+		if (in[0] == escape)
+		{
+			/*
+			 * Any errors reported while processing this escape sequence will
+			 * have an error cursor pointing at the escape.
+			 */
+			setup_scanner_errposition_callback(&scbstate, yyscanner,
+											   in - str + position + 3);	/* 3 for U&" */
+			if (in[1] == escape)
+			{
+				if (pair_first)
+					goto invalid_pair;
+				*out++ = escape;
+				in += 2;		/* consumed both escape characters */
+			}
+			else if (isxdigit((unsigned char) in[1]) &&
+					 isxdigit((unsigned char) in[2]) &&
+					 isxdigit((unsigned char) in[3]) &&
+					 isxdigit((unsigned char) in[4]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(in[1]) << 12) +
+					(hexval(in[2]) << 8) +
+					(hexval(in[3]) << 4) +
+					hexval(in[4]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;	/* hold until its partner arrives */
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) out);
+					out += strlen(out); /* skip over what was just written */
+				}
+				in += 5;		/* escape char plus 4 hex digits */
+			}
+			else if (in[1] == '+' &&
+					 isxdigit((unsigned char) in[2]) &&
+					 isxdigit((unsigned char) in[3]) &&
+					 isxdigit((unsigned char) in[4]) &&
+					 isxdigit((unsigned char) in[5]) &&
+					 isxdigit((unsigned char) in[6]) &&
+					 isxdigit((unsigned char) in[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(in[2]) << 20) +
+					(hexval(in[3]) << 16) +
+					(hexval(in[4]) << 12) +
+					(hexval(in[5]) << 8) +
+					(hexval(in[6]) << 4) +
+					hexval(in[7]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;	/* hold until its partner arrives */
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) out);
+					out += strlen(out); /* skip over what was just written */
+				}
+				in += 8;		/* escape char, '+', and 6 hex digits */
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+
+			cancel_scanner_errposition_callback(&scbstate);
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			*out++ = *in++;		/* ordinary character: copy as-is */
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	*out = '\0';
+	return new;
+
+	/*
+	 * We might get here with the error callback active, or not.  Call
+	 * scanner_errposition to make sure an error cursor appears; if the
+	 * callback is active, this is duplicative but harmless.
+	 */
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair"),
+			 scanner_errposition(in - str + position + 3,	/* 3 for U&" */
+								 yyscanner)));
+	return NULL;				/* keep compiler quiet */
+}