1 files changed, 623 insertions, 0 deletions
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
new file mode 100644
index 0000000..72d4c5e
--- /dev/null
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -0,0 +1,623 @@
+%{
+/*-------------------------------------------------------------------------
+ *
+ * jsonpath_scan.l
+ *	Lexical parser for jsonpath datatype
+ *
+ * Splits jsonpath string into tokens represented as JsonPathString structs.
+ * Decodes unicode and hex escaped strings.
+ *
+ * Copyright (c) 2019-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	src/backend/utils/adt/jsonpath_scan.l
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "mb/pg_wchar.h"
+#include "nodes/pg_list.h"
+
+/* Buffer in which the text of the token currently being scanned is built */
+static JsonPathString scanstring;
+
+/* Handles to the buffer that the lexer uses internally */
+static YY_BUFFER_STATE scanbufhandle;
+static char *scanbuf;
+static int	scanbuflen;
+
+/* Helpers used by the rules below; they are defined after the rules section */
+static void addstring(bool init, char *s, int l);
+static void addchar(bool init, char c);
+static enum yytokentype checkKeyword(void);
+static void parseUnicode(char *s, int l);
+static void parseHexChar(char *s);
+
+/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
+#undef fprintf
+#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
+
+/*
+ * Replacement target of the fprintf() macro above: raises "msg" as an ERROR
+ * through ereport().  "fmt" is accepted only so that the signature lines up
+ * with the macro's arguments; it is not used.
+ */
+static void
+fprintf_to_ereport(const char *fmt, const char *msg)
+{
+	ereport(ERROR, (errmsg_internal("%s", msg)));
+}
+
+/* LCOV_EXCL_START */
+
+%}
+
+%option 8bit
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option warn
+%option prefix="jsonpath_yy"
+%option bison-bridge
+%option noyyalloc
+%option noyyrealloc
+%option noyyfree
+
+/*
+ * We use exclusive states for quoted and non-quoted strings,
+ * quoted variable names and C-style comments.
+ * Exclusive states:
+ *  <xq> - quoted strings
+ *  <xnq> - non-quoted strings
+ *  <xvq> - quoted variable names
+ *  <xc> - C-style comment
+ */
+
+%x xq
+%x xnq
+%x xvq
+%x xc
+
+/* Punctuation that is returned as a token by itself or that starts an operator */
+special		[\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
+blank		[ \t\n\r\f]
+/* "other" means anything that's not special, blank, or '\' or '"' */
+other		[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]
+
+/*
+ * Numeric literals.  The "fail" patterns match a number that stops short:
+ * "decimalfail" is an integer followed by a dot with no digits after it,
+ * "realfail1" and "realfail2" are an exponent marker (optionally signed)
+ * with no exponent digits.
+ */
+digit		[0-9]
+integer		(0|[1-9]{digit}*)
+decimal		{integer}\.{digit}+
+decimalfail	{integer}\.
+real		({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1	({integer}|{decimal})[Ee]
+realfail2	({integer}|{decimal})[Ee][-+]
+
+/*
+ * Escape sequences.  "unicode" is \u followed by exactly four hex digits, or
+ * \u{...} holding one to six hex digits; "hex_char" is \x followed by two
+ * hex digits.  "unicodefail" and "hex_fail" match the same openers with too
+ * few digits or, for the brace form, without the closing brace.
+ */
+hex_dig		[0-9A-Fa-f]
+unicode		\\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
+unicodefail	\\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
+hex_char	\\x{hex_dig}{2}
+hex_fail	\\x{hex_dig}{0,1}
+
+%%
+
+	/*
+	 * <xnq>: inside a non-quoted string.  Runs of ordinary characters are
+	 * appended to scanstring.
+	 */
+<xnq>{other}+					{
+									addstring(false, yytext, yyleng);
+								}
+
+	/*
+	 * Whitespace ends the non-quoted string.  The accumulated text is handed
+	 * over and checkKeyword() decides whether it is a keyword or IDENT_P.
+	 */
+<xnq>{blank}+					{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
+
+<xnq>\/\*						{
+									/*
+									 * A comment opener ends the non-quoted
+									 * string just like any other special
+									 * character does.  Return the accumulated
+									 * token now and push the opener back, so
+									 * that the INITIAL-state rule for it starts
+									 * the comment on the next call.  Switching
+									 * straight to <xc> here would lose the
+									 * token, because nothing returns it once
+									 * the comment has been skipped.
+									 */
+									yylval->str = scanstring;
+									yyless(0);
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
+
+	/*
+	 * A special character or a double quote also ends the non-quoted string.
+	 * The character is pushed back with yyless(0) so that it is scanned again
+	 * in the INITIAL state.
+	 */
+<xnq>({special}|\")				{
+									yylval->str = scanstring;
+									yyless(0);
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
+
+	/* End of input ends the non-quoted string as well */
+<xnq><<EOF>>					{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
+
+	/*
+	 * Backslash escapes.  These rules are shared by non-quoted strings,
+	 * quoted strings and quoted variable names.  First the single-letter
+	 * escapes, each appended as the control character it names.
+	 */
+<xnq,xq,xvq>\\b				{ addchar(false, '\b'); }
+
+<xnq,xq,xvq>\\f				{ addchar(false, '\f'); }
+
+<xnq,xq,xvq>\\n				{ addchar(false, '\n'); }
+
+<xnq,xq,xvq>\\r				{ addchar(false, '\r'); }
+
+<xnq,xq,xvq>\\t				{ addchar(false, '\t'); }
+
+<xnq,xq,xvq>\\v				{ addchar(false, '\v'); }
+
+	/*
+	 * A whole run of adjacent unicode escapes goes to parseUnicode() in one
+	 * call, which lets it combine a high surrogate with the low surrogate
+	 * that follows it.
+	 */
+<xnq,xq,xvq>{unicode}+		{ parseUnicode(yytext, yyleng); }
+
+<xnq,xq,xvq>{hex_char}		{ parseHexChar(yytext); }
+
+	/* Truncated unicode or hex escapes are reported as errors */
+<xnq,xq,xvq>{unicode}*{unicodefail}	{ yyerror(NULL, "invalid unicode sequence"); }
+
+<xnq,xq,xvq>{hex_fail}		{ yyerror(NULL, "invalid hex character sequence"); }
+
+<xnq,xq,xvq>{unicode}+\\	{
+								/* throw back the \\, and treat as unicode */
+								yyless(yyleng - 1);
+								parseUnicode(yytext, yyleng);
+							}
+
+	/* Any other escaped character is appended as itself */
+<xnq,xq,xvq>\\.				{ addchar(false, yytext[1]); }
+
+	/* A backslash that none of the rules above could complete */
+<xnq,xq,xvq>\\				{ yyerror(NULL, "unexpected end after backslash"); }
+
+	/* Input ended before the closing double quote was seen */
+<xq,xvq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }
+
+	/* Closing quote of a quoted string: return the accumulated text */
+<xq>\"							{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return STRING_P;
+								}
+
+	/* Closing quote of a quoted variable name */
+<xvq>\"							{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return VARIABLE_P;
+								}
+
+	/* Ordinary text inside quotes, up to the next backslash or quote */
+<xq,xvq>[^\\\"]+				{ addstring(false, yytext, yyleng); }
+
+	/*
+	 * <xc>: inside a C-style comment.  Everything is discarded until the
+	 * closing marker, after which scanning resumes in the INITIAL state.
+	 */
+<xc>\*\/						{ BEGIN INITIAL; }
+
+<xc>[^\*]+						{ }
+
+<xc>\*							{ }
+
+<xc><<EOF>>						{ yyerror(NULL, "unexpected end of comment"); }
+
+	/*
+	 * Operators.  Both "<>" and "!=" are returned as NOTEQUAL_P.
+	 */
+\&\&							{ return AND_P; }
+
+\|\|							{ return OR_P; }
+
+\!								{ return NOT_P; }
+
+\*\*							{ return ANY_P; }
+
+\<								{ return LESS_P; }
+
+\<\=							{ return LESSEQUAL_P; }
+
+\=\=							{ return EQUAL_P; }
+
+\<\>							{ return NOTEQUAL_P; }
+
+\!\=							{ return NOTEQUAL_P; }
+
+\>\=							{ return GREATEREQUAL_P; }
+
+\>								{ return GREATER_P; }
+
+	/*
+	 * "$" directly followed by ordinary characters is a variable.  The text
+	 * after the "$" is stored, followed by a terminating NUL byte.
+	 */
+\${other}+						{
+									addstring(true, yytext + 1, yyleng - 1);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return VARIABLE_P;
+								}
+
+	/* "$" followed by a double quote opens a quoted variable name */
+\$\"							{
+									addchar(true, '\0');
+									BEGIN xvq;
+								}
+
+	/* Any other special character is returned as its own character code */
+{special}						{ return *yytext; }
+
+{blank}+						{ /* ignore */ }
+
+	/* Start of a C-style comment; scanstring is reset to an empty string */
+\/\*							{
+									addchar(true, '\0');
+									BEGIN xc;
+								}
+
+	/*
+	 * Numbers.  The matched text is stored with a terminating NUL byte.
+	 * Reals and decimals are returned as NUMERIC_P, integers as INT_P.
+	 */
+{real}							{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return NUMERIC_P;
+								}
+
+{decimal}						{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return NUMERIC_P;
+								}
+
+{integer}						{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return INT_P;
+								}
+
+{decimalfail}					{
+									/* throw back the ., and treat as integer */
+									yyless(yyleng - 1);
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return INT_P;
+								}
+
+	/* An exponent marker without exponent digits is an error */
+({realfail1}|{realfail2})		{ yyerror(NULL, "invalid floating point number"); }
+
+	/* Opening double quote: start a quoted string with an empty buffer */
+\"								{
+									addchar(true, '\0');
+									BEGIN xq;
+								}
+
+	/*
+	 * A backslash starts a non-quoted string.  It is pushed back so that the
+	 * escape rules of the <xnq> state get to process it.
+	 */
+\\								{
+									yyless(0);
+									addchar(true, '\0');
+									BEGIN xnq;
+								}
+
+	/* Ordinary characters start a non-quoted string */
+{other}+						{
+									addstring(true, yytext, yyleng);
+									BEGIN xnq;
+								}
+
+<<EOF>>							{ yyterminate(); }
+
+%%
+
+/* LCOV_EXCL_STOP */
+
+/*
+ * Report a scanner or parser problem as an ERROR with a syntax-error code.
+ *
+ * "result" is not used here.  "message" is translated and combined either
+ * with the text of the current token or, when the scanner is sitting on the
+ * end-of-buffer marker, with a note that the input has ended.
+ */
+void
+jsonpath_yyerror(JsonPathParseResult **result, const char *message)
+{
+	if (*yytext != YY_END_OF_BUFFER_CHAR)
+	{
+		/* there is a current token to point at */
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 /* translator: first %s is typically "syntax error" */
+				 errmsg("%s at or near \"%s\" of jsonpath input",
+						_(message), yytext)));
+	}
+	else
+	{
+		/* nothing left to show: the input ran out */
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 /* translator: %s is typically "syntax error" */
+				 errmsg("%s at end of jsonpath input", _(message))));
+	}
+}
+
+/* One entry of the keyword table that checkKeyword() searches */
+typedef struct JsonPathKeyword
+{
+	int16		len;			/* length of "keyword"; compared before the text */
+	bool		lowercase;		/* if true, only an exact-case match is accepted */
+	int			val;			/* token code returned for this keyword */
+	const char *keyword;		/* keyword text */
+} JsonPathKeyword;
+
+/*
+ * Keyword table.  The entries must be sorted by length and then in
+ * alphabetical order, because checkKeyword() binary-searches the array
+ * using exactly that ordering.
+ */
+static const JsonPathKeyword keywords[] = {
+	{ 2, false,	IS_P,		"is"},
+	{ 2, false,	TO_P,		"to"},
+	{ 3, false,	ABS_P,		"abs"},
+	{ 3, false,	LAX_P,		"lax"},
+	{ 4, false,	FLAG_P,		"flag"},
+	{ 4, false,	LAST_P,		"last"},
+	{ 4, true,	NULL_P,		"null"},
+	{ 4, false,	SIZE_P,		"size"},
+	{ 4, true,	TRUE_P,		"true"},
+	{ 4, false,	TYPE_P,		"type"},
+	{ 4, false,	WITH_P,		"with"},
+	{ 5, true,	FALSE_P,	"false"},
+	{ 5, false,	FLOOR_P,	"floor"},
+	{ 6, false,	DOUBLE_P,	"double"},
+	{ 6, false,	EXISTS_P,	"exists"},
+	{ 6, false,	STARTS_P,	"starts"},
+	{ 6, false,	STRICT_P,	"strict"},
+	{ 7, false,	CEILING_P,	"ceiling"},
+	{ 7, false,	UNKNOWN_P,	"unknown"},
+	{ 8, false,	DATETIME_P,	"datetime"},
+	{ 8, false,	KEYVALUE_P,	"keyvalue"},
+	{ 10,false, LIKE_REGEX_P, "like_regex"},
+};
+
+/*
+ * Check if current scanstring value is a keyword.
+ *
+ * Binary-searches keywords[], ordering candidates by length first and by
+ * case-insensitive text second.  When an entry matches that way and is
+ * flagged "lowercase", it must also match case-sensitively to count.
+ *
+ * Returns the keyword's token code, or IDENT_P when nothing matches.
+ */
+static enum yytokentype
+checkKeyword(void)
+{
+	int						res = IDENT_P;
+	int						diff;
+	const JsonPathKeyword  *StopLow = keywords,
+						   *StopHigh = keywords + lengthof(keywords),
+						   *StopMiddle;
+
+	/* The last entry is the longest one, so anything longer is no keyword */
+	if (scanstring.len > keywords[lengthof(keywords) - 1].len)
+		return res;
+
+	while (StopLow < StopHigh)
+	{
+		StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+
+		/* Compare the text only when the lengths agree */
+		if (StopMiddle->len == scanstring.len)
+			diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
+								  scanstring.len);
+		else
+			diff = StopMiddle->len - scanstring.len;
+
+		if (diff < 0)
+			StopLow = StopMiddle + 1;
+		else if (diff > 0)
+			StopHigh = StopMiddle;
+		else
+		{
+			/* Case-insensitive hit; some keywords insist on exact case */
+			if (StopMiddle->lowercase)
+				diff = strncmp(StopMiddle->keyword, scanstring.val,
+							   scanstring.len);
+
+			if (diff == 0)
+				res = StopMiddle->val;
+
+			break;
+		}
+	}
+
+	return res;
+}
+
+/*
+ * Prepare the scanner to read "str"; must be called before any parsing.
+ *
+ * A "slen" of zero or less means the length is taken with strlen().
+ */
+static void
+jsonpath_scanner_init(const char *str, int slen)
+{
+	char	   *buf;
+
+	if (slen <= 0)
+		slen = strlen(str);
+
+	/* Reset scanner state that might be left over after ereport() */
+	yy_init_globals();
+
+	/*
+	 * Copy the input into a scan buffer that carries the special
+	 * two-byte termination needed by flex.
+	 */
+	scanbuflen = slen;
+	buf = palloc(slen + 2);
+	scanbuf = buf;
+	memcpy(buf, str, slen);
+	buf[slen + 1] = YY_END_OF_BUFFER_CHAR;
+	buf[slen] = buf[slen + 1];
+
+	scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
+
+	BEGIN(INITIAL);
+}
+
+
+/*
+ * Called after parsing is done to clean up after jsonpath_scanner_init():
+ * deletes the flex buffer handle and frees the scan buffer allocated there.
+ */
+static void
+jsonpath_scanner_finish(void)
+{
+	yy_delete_buffer(scanbufhandle);
+	pfree(scanbuf);
+}
+
+/*
+ * Make sure scanstring can take "appendLen" more bytes.
+ *
+ * With "init" set, scanstring is started over: a new buffer of at least 32
+ * bytes is allocated and the length is reset to zero.  Otherwise the
+ * existing buffer is grown by doubling until its size exceeds the current
+ * length plus "appendLen".
+ */
+static void
+resizeString(bool init, int appendLen)
+{
+	if (init)
+	{
+		scanstring.total = Max(32, appendLen);
+		scanstring.val = (char *) palloc(scanstring.total);
+		scanstring.len = 0;
+		return;
+	}
+
+	/* Still strictly more room than required?  Then nothing to do. */
+	if (scanstring.len + appendLen < scanstring.total)
+		return;
+
+	do
+	{
+		scanstring.total *= 2;
+	} while (scanstring.len + appendLen >= scanstring.total);
+
+	scanstring.val = repalloc(scanstring.val, scanstring.total);
+}
+
+/*
+ * Append the "l" bytes starting at "s" to scanstring, starting a new string
+ * first when "init" is set.  Room for one byte more than "l" is requested,
+ * so a single byte can still be stored after the copied data.
+ */
+static void
+addstring(bool init, char *s, int l)
+{
+	char	   *dest;
+
+	resizeString(init, l + 1);
+
+	dest = scanstring.val + scanstring.len;
+	memcpy(dest, s, l);
+	scanstring.len += l;
+}
+
+/*
+ * Store the single byte "c" at the end of scanstring, starting a new string
+ * first when "init" is set.  A NUL byte is written but not counted in the
+ * length, so it acts as a terminator that later appends overwrite.
+ */
+static void
+addchar(bool init, char c)
+{
+	resizeString(init, 1);
+
+	scanstring.val[scanstring.len] = c;
+
+	if (c == '\0')
+		return;
+
+	scanstring.len++;
+}
+
+/*
+ * Interface to jsonpath parser.
+ *
+ * Sets up the scanner on "str" ("len" is passed through to
+ * jsonpath_scanner_init()), runs the parser, releases the scanner again and
+ * returns the result the parser stored.
+ */
+JsonPathParseResult *
+parsejsonpath(const char *str, int len)
+{
+	JsonPathParseResult	*parseresult;
+	int			status;
+
+	jsonpath_scanner_init(str, len);
+
+	status = jsonpath_yyparse((void *) &parseresult);
+	if (status != 0)
+		jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
+
+	jsonpath_scanner_finish();
+
+	return parseresult;
+}
+
+/*
+ * Return the value (0..15) of the hexadecimal digit "c", accepting both
+ * letter cases.  Any other character is reported through
+ * jsonpath_yyerror().
+ */
+static int
+hexval(char c)
+{
+	if ('0' <= c && c <= '9')
+		return c - '0';
+	else if ('a' <= c && c <= 'f')
+		return 0xA + (c - 'a');
+	else if ('A' <= c && c <= 'F')
+		return 0xA + (c - 'A');
+
+	jsonpath_yyerror(NULL, "invalid hexadecimal digit");
+	return 0; /* not reached */
+}
+
+/*
+ * Append the character with code point "ch" to scanstring, converted with
+ * pg_unicode_to_server().  Code point zero is rejected with an error.
+ */
+static void
+addUnicodeChar(int ch)
+{
+	if (ch != 0)
+	{
+		char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+		int			cbuflen;
+
+		pg_unicode_to_server(ch, (unsigned char *) cbuf);
+		cbuflen = strlen(cbuf);
+		addstring(false, cbuf, cbuflen);
+	}
+	else
+	{
+		/* We can't allow this, since our TEXT type doesn't */
+		ereport(ERROR,
+				(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+				 errmsg("unsupported Unicode escape sequence"),
+				 errdetail("\\u0000 cannot be converted to text.")));
+	}
+}
+
+/*
+ * Add one decoded escape value to scanstring, pairing up UTF-16 surrogates.
+ *
+ * "*hi_surrogate" carries a pending high surrogate between calls; -1 means
+ * none is pending.  A high surrogate is only remembered, and the low
+ * surrogate that follows completes the pair, which is then added as one
+ * character.  Any other ordering of surrogates is an error.
+ */
+static void
+addUnicode(int ch, int *hi_surrogate)
+{
+	bool		pending = (*hi_surrogate != -1);
+
+	if (is_utf16_surrogate_first(ch))
+	{
+		if (pending)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type %s", "jsonpath"),
+					 errdetail("Unicode high surrogate must not follow "
+							   "a high surrogate.")));
+
+		/* remember it and wait for the second half */
+		*hi_surrogate = ch;
+		return;
+	}
+
+	if (is_utf16_surrogate_second(ch))
+	{
+		if (!pending)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type %s", "jsonpath"),
+					 errdetail("Unicode low surrogate must follow a high "
+							   "surrogate.")));
+
+		ch = surrogate_pair_to_codepoint(*hi_surrogate, ch);
+		*hi_surrogate = -1;
+	}
+	else if (pending)
+	{
+		/* an ordinary character arrived while a high surrogate was pending */
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type %s", "jsonpath"),
+				 errdetail("Unicode low surrogate must follow a high "
+						   "surrogate.")));
+	}
+
+	addUnicodeChar(ch);
+}
+
+/*
+ * Decode a run of unicode escapes and append the result to scanstring.
+ *
+ * "s" points at "l" bytes made up of one or more escapes, each written
+ * either as \uXXXX or as \u{X...}.  Surrogate pairs inside the run are
+ * combined; a high surrogate left without its partner at the end of the
+ * run is an error.
+ *
+ * parseUnicode was adapted from json_lex_string() in
+ * src/backend/utils/adt/json.c
+ */
+static void
+parseUnicode(char *s, int l)
+{
+	int			i;
+	int			hi_surrogate = -1;
+
+	for (i = 2; i < l; i += 2)	/* skip '\u' */
+	{
+		int			ch = 0;
+
+		if (s[i] == '{')	/* parse '\u{XX...}' */
+		{
+			/* check the bound before looking at the byte */
+			while (++i < l && s[i] != '}')
+				ch = (ch << 4) | hexval(s[i]);
+			i++;	/* skip '}' */
+		}
+		else		/* parse '\uXXXX' */
+		{
+			int			j;
+
+			for (j = 0; j < 4 && i < l; j++)
+				ch = (ch << 4) | hexval(s[i++]);
+		}
+
+		addUnicode(ch, &hi_surrogate);
+	}
+
+	/* the run must not end in the middle of a surrogate pair */
+	if (hi_surrogate != -1)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type %s", "jsonpath"),
+				 errdetail("Unicode low surrogate must follow a high "
+						   "surrogate.")));
+	}
+}
+
+/*
+ * Decode a single \xXX escape and append the resulting character to
+ * scanstring.  "s" points at the backslash, so the two hex digits are
+ * s[2] and s[3].
+ */
+static void
+parseHexChar(char *s)
+{
+	int			high = hexval(s[2]);
+	int			low = hexval(s[3]);
+
+	addUnicodeChar((high << 4) | low);
+}
+
+/*
+ * Interface functions to make flex use palloc() instead of malloc().
+ * It'd be better to make these static, but flex insists otherwise.
+ */
+
+/* Allocation hook for the generated scanner: hands out "bytes" via palloc() */
+void *
+jsonpath_yyalloc(yy_size_t bytes)
+{
+	return palloc(bytes);
+}
+
+/*
+ * Reallocation hook for the generated scanner.  A NULL "ptr" is treated as
+ * a request for a fresh allocation of "bytes".
+ */
+void *
+jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
+{
+	if (ptr == NULL)
+		return palloc(bytes);
+
+	return repalloc(ptr, bytes);
+}
+
+/*
+ * Release hook for the generated scanner.  A NULL "ptr" is ignored rather
+ * than passed on to pfree().
+ */
+void
+jsonpath_yyfree(void *ptr)
+{
+	if (ptr == NULL)
+		return;
+
+	pfree(ptr);
+}