1 files changed, 749 insertions, 0 deletions
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
new file mode 100644
index 0000000..29c26af
--- /dev/null
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -0,0 +1,749 @@
+%top{
+/*-------------------------------------------------------------------------
+ *
+ * jsonpath_scan.l
+ *	Lexical parser for jsonpath datatype
+ *
+ * Splits jsonpath string into tokens represented as JsonPathString structs.
+ * Decodes unicode and hex escaped strings.
+ *
+ * Copyright (c) 2019-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	src/backend/utils/adt/jsonpath_scan.l
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+/*
+ * NB: include jsonpath_gram.h only AFTER including jsonpath_internal.h,
+ * because jsonpath_internal.h contains the declaration for JsonPathString.
+ */
+#include "jsonpath_internal.h"
+#include "jsonpath_gram.h"
+
+#include "mb/pg_wchar.h"
+#include "nodes/miscnodes.h"
+#include "nodes/pg_list.h"
+}
+
+%{
+static JsonPathString scanstring;
+
+/* Handles to the buffer that the lexer uses internally */
+static YY_BUFFER_STATE scanbufhandle;
+static char *scanbuf;
+static int	scanbuflen;
+
+static void addstring(bool init, char *s, int l);
+static void addchar(bool init, char c);
+static enum yytokentype checkKeyword(void);
+static bool parseUnicode(char *s, int l, struct Node *escontext);
+static bool parseHexChar(char *s, struct Node *escontext);
+
+/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
+#undef fprintf
+#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
+
+static void
+fprintf_to_ereport(const char *fmt, const char *msg)
+{
+	ereport(ERROR, (errmsg_internal("%s", msg)));
+}
+
+/* LCOV_EXCL_START */
+
+%}
+
+%option 8bit
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option warn
+%option prefix="jsonpath_yy"
+%option bison-bridge
+%option noyyalloc
+%option noyyrealloc
+%option noyyfree
+
+/*
+ * We use exclusive states for quoted and non-quoted strings,
+ * quoted variable names and C-style comments.
+ * Exclusive states:
+ *  <xq> - quoted strings
+ *  <xnq> - non-quoted strings
+ *  <xvq> - quoted variable names
+ *  <xc> - C-style comment
+ */
+
+%x xq
+%x xnq
+%x xvq
+%x xc
+
+special		[\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
+blank		[ \t\n\r\f]
+/* "other" means anything that's not special, blank, or '\' or '"' */
+other		[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]
+
+decdigit	[0-9]
+hexdigit	[0-9A-Fa-f]
+octdigit	[0-7]
+bindigit	[0-1]
+
+/* DecimalInteger in ECMAScript; must not start with 0 unless it's exactly 0 */
+decinteger	(0|[1-9](_?{decdigit})*)
+/* DecimalDigits in ECMAScript; only used as part of other rules */
+decdigits	{decdigit}(_?{decdigit})*
+/* Non-decimal integers; in ECMAScript, these must not have underscore after prefix */
+hexinteger	0[xX]{hexdigit}(_?{hexdigit})*
+octinteger	0[oO]{octdigit}(_?{octdigit})*
+bininteger	0[bB]{bindigit}(_?{bindigit})*
+
+decimal		({decinteger}\.{decdigits}?|\.{decdigits})
+real		({decinteger}|{decimal})[Ee][-+]?{decdigits}
+realfail	({decinteger}|{decimal})[Ee][-+]
+
+decinteger_junk	{decinteger}{other}
+decimal_junk	{decimal}{other}
+real_junk		{real}{other}
+
+unicode		\\u({hexdigit}{4}|\{{hexdigit}{1,6}\})
+unicodefail	\\u({hexdigit}{0,3}|\{{hexdigit}{0,6})
+hex_char	\\x{hexdigit}{2}
+hex_fail	\\x{hexdigit}{0,1}
+
+%%
+
+<xnq>{other}+					{
+									addstring(false, yytext, yyleng);
+								}
+
+<xnq>{blank}+					{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
+
+<xnq>\/\*						{
+									yylval->str = scanstring;
+									BEGIN xc;
+								}
+
+<xnq>({special}|\")				{
+									yylval->str = scanstring;
+									yyless(0);
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
+
+<xnq><<EOF>>					{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
+
+<xnq,xq,xvq>\\b				{ addchar(false, '\b'); }
+
+<xnq,xq,xvq>\\f				{ addchar(false, '\f'); }
+
+<xnq,xq,xvq>\\n				{ addchar(false, '\n'); }
+
+<xnq,xq,xvq>\\r				{ addchar(false, '\r'); }
+
+<xnq,xq,xvq>\\t				{ addchar(false, '\t'); }
+
+<xnq,xq,xvq>\\v				{ addchar(false, '\v'); }
+
+<xnq,xq,xvq>{unicode}+		{
+								if (!parseUnicode(yytext, yyleng, escontext))
+									yyterminate();
+							}
+
+<xnq,xq,xvq>{hex_char}		{
+								if (!parseHexChar(yytext, escontext))
+									yyterminate();
+							}
+
+<xnq,xq,xvq>{unicode}*{unicodefail} {
+								jsonpath_yyerror(NULL, escontext,
+												 "invalid Unicode escape sequence");
+								yyterminate();
+							}
+
+<xnq,xq,xvq>{hex_fail}		{
+								jsonpath_yyerror(NULL, escontext,
+												 "invalid hexadecimal character sequence");
+								yyterminate();
+							}
+
+<xnq,xq,xvq>{unicode}+\\	{
+								/* throw back the \\, and treat as unicode */
+								yyless(yyleng - 1);
+								if (!parseUnicode(yytext, yyleng, escontext))
+									yyterminate();
+							}
+
+<xnq,xq,xvq>\\.				{ addchar(false, yytext[1]); }
+
+<xnq,xq,xvq>\\				{
+							  jsonpath_yyerror(NULL, escontext,
+											   "unexpected end after backslash");
+							  yyterminate();
+							}
+
+<xq,xvq><<EOF>>				{
+							  jsonpath_yyerror(NULL, escontext,
+											   "unterminated quoted string");
+							  yyterminate();
+							}
+
+<xq>\"							{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return STRING_P;
+								}
+
+<xvq>\"							{
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return VARIABLE_P;
+								}
+
+<xq,xvq>[^\\\"]+				{ addstring(false, yytext, yyleng); }
+
+<xc>\*\/						{ BEGIN INITIAL; }
+
+<xc>[^\*]+						{ }
+
+<xc>\*							{ }
+
+<xc><<EOF>>						{
+									jsonpath_yyerror(
+										NULL, escontext,
+										"unexpected end of comment");
+									yyterminate();
+								}
+\&\&							{ return AND_P; }
+
+\|\|							{ return OR_P; }
+
+\!								{ return NOT_P; }
+
+\*\*							{ return ANY_P; }
+
+\<								{ return LESS_P; }
+
+\<\=							{ return LESSEQUAL_P; }
+
+\=\=							{ return EQUAL_P; }
+
+\<\>							{ return NOTEQUAL_P; }
+
+\!\=							{ return NOTEQUAL_P; }
+
+\>\=							{ return GREATEREQUAL_P; }
+
+\>								{ return GREATER_P; }
+
+\${other}+						{
+									addstring(true, yytext + 1, yyleng - 1);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return VARIABLE_P;
+								}
+
+\$\"							{
+									addchar(true, '\0');
+									BEGIN xvq;
+								}
+
+{special}						{ return *yytext; }
+
+{blank}+						{ /* ignore */ }
+
+\/\*							{
+									addchar(true, '\0');
+									BEGIN xc;
+								}
+
+{real}							{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return NUMERIC_P;
+								}
+
+{decimal}						{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return NUMERIC_P;
+								}
+
+{decinteger}					{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return INT_P;
+								}
+
+{hexinteger}					{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return INT_P;
+								}
+
+{octinteger}					{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return INT_P;
+								}
+
+{bininteger}					{
+									addstring(true, yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									return INT_P;
+								}
+
+{realfail}						{
+									jsonpath_yyerror(
+										NULL, escontext,
+										"invalid numeric literal");
+									yyterminate();
+								}
+{decinteger_junk}				{
+									jsonpath_yyerror(
+										NULL, escontext,
+										"trailing junk after numeric literal");
+									yyterminate();
+								}
+{decimal_junk}					{
+									jsonpath_yyerror(
+										NULL, escontext,
+										"trailing junk after numeric literal");
+									yyterminate();
+								}
+{real_junk}						{
+									jsonpath_yyerror(
+										NULL, escontext,
+										"trailing junk after numeric literal");
+									yyterminate();
+								}
+\"								{
+									addchar(true, '\0');
+									BEGIN xq;
+								}
+
+\\								{
+									yyless(0);
+									addchar(true, '\0');
+									BEGIN xnq;
+								}
+
+{other}+						{
+									addstring(true, yytext, yyleng);
+									BEGIN xnq;
+								}
+
+<<EOF>>							{ yyterminate(); }
+
+%%
+
+/* LCOV_EXCL_STOP */
+
+void
+jsonpath_yyerror(JsonPathParseResult **result, struct Node *escontext,
+				 const char *message)
+{
+	/* don't overwrite escontext if it's already been set */
+	if (SOFT_ERROR_OCCURRED(escontext))
+		return;
+
+	if (*yytext == YY_END_OF_BUFFER_CHAR)
+	{
+		errsave(escontext,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 /* translator: %s is typically "syntax error" */
+				 errmsg("%s at end of jsonpath input", _(message))));
+	}
+	else
+	{
+		errsave(escontext,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 /* translator: first %s is typically "syntax error" */
+				 errmsg("%s at or near \"%s\" of jsonpath input",
+						_(message), yytext)));
+	}
+}
+
+typedef struct JsonPathKeyword
+{
+	int16		len;
+	bool		lowercase;
+	int			val;
+	const char *keyword;
+} JsonPathKeyword;
+
+/*
+ * Array of key words should be sorted by length and then
+ * alphabetical order
+ */
+static const JsonPathKeyword keywords[] = {
+	{ 2, false,	IS_P,		"is"},
+	{ 2, false,	TO_P,		"to"},
+	{ 3, false,	ABS_P,		"abs"},
+	{ 3, false,	LAX_P,		"lax"},
+	{ 4, false,	FLAG_P,		"flag"},
+	{ 4, false,	LAST_P,		"last"},
+	{ 4, true,	NULL_P,		"null"},
+	{ 4, false,	SIZE_P,		"size"},
+	{ 4, true,	TRUE_P,		"true"},
+	{ 4, false,	TYPE_P,		"type"},
+	{ 4, false,	WITH_P,		"with"},
+	{ 5, true,	FALSE_P,	"false"},
+	{ 5, false,	FLOOR_P,	"floor"},
+	{ 6, false,	DOUBLE_P,	"double"},
+	{ 6, false,	EXISTS_P,	"exists"},
+	{ 6, false,	STARTS_P,	"starts"},
+	{ 6, false,	STRICT_P,	"strict"},
+	{ 7, false,	CEILING_P,	"ceiling"},
+	{ 7, false,	UNKNOWN_P,	"unknown"},
+	{ 8, false,	DATETIME_P,	"datetime"},
+	{ 8, false,	KEYVALUE_P,	"keyvalue"},
+	{ 10,false, LIKE_REGEX_P, "like_regex"},
+};
+
+/* Check if current scanstring value is a keyword */
+static enum yytokentype
+checkKeyword()
+{
+	int			res = IDENT_P;
+	int			diff;
+	const JsonPathKeyword  *StopLow = keywords,
+						   *StopHigh = keywords + lengthof(keywords),
+						   *StopMiddle;
+
+	if (scanstring.len > keywords[lengthof(keywords) - 1].len)
+		return res;
+
+	while (StopLow < StopHigh)
+	{
+		StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+
+		if (StopMiddle->len == scanstring.len)
+			diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
+								  scanstring.len);
+		else
+			diff = StopMiddle->len - scanstring.len;
+
+		if (diff < 0)
+			StopLow = StopMiddle + 1;
+		else if (diff > 0)
+			StopHigh = StopMiddle;
+		else
+		{
+			if (StopMiddle->lowercase)
+				diff = strncmp(StopMiddle->keyword, scanstring.val,
+							   scanstring.len);
+
+			if (diff == 0)
+				res = StopMiddle->val;
+
+			break;
+		}
+	}
+
+	return res;
+}
+
+/*
+ * Called before any actual parsing is done
+ */
+static void
+jsonpath_scanner_init(const char *str, int slen)
+{
+	if (slen <= 0)
+		slen = strlen(str);
+
+	/*
+	 * Might be left over after ereport()
+	 */
+	yy_init_globals();
+
+	/*
+	 * Make a scan buffer with special termination needed by flex.
+	 */
+
+	scanbuflen = slen;
+	scanbuf = palloc(slen + 2);
+	memcpy(scanbuf, str, slen);
+	scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
+	scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
+
+	BEGIN(INITIAL);
+}
+
+
+/*
+ * Called after parsing is done to clean up after jsonpath_scanner_init()
+ */
+static void
+jsonpath_scanner_finish(void)
+{
+	yy_delete_buffer(scanbufhandle);
+	pfree(scanbuf);
+}
+
+/*
+ * Resize scanstring so that it can append string of given length.
+ * Reinitialize if required.
+ */
+static void
+resizeString(bool init, int appendLen)
+{
+	if (init)
+	{
+		scanstring.total = Max(32, appendLen);
+		scanstring.val = (char *) palloc(scanstring.total);
+		scanstring.len = 0;
+	}
+	else
+	{
+		if (scanstring.len + appendLen >= scanstring.total)
+		{
+			while (scanstring.len + appendLen >= scanstring.total)
+				scanstring.total *= 2;
+			scanstring.val = repalloc(scanstring.val, scanstring.total);
+		}
+	}
+}
+
+/* Add set of bytes at "s" of length "l" to scanstring */
+static void
+addstring(bool init, char *s, int l)
+{
+	resizeString(init, l + 1);
+	memcpy(scanstring.val + scanstring.len, s, l);
+	scanstring.len += l;
+}
+
+/* Add single byte "c" to scanstring */
+static void
+addchar(bool init, char c)
+{
+	resizeString(init, 1);
+	scanstring.val[scanstring.len] = c;
+	if (c != '\0')
+		scanstring.len++;
+}
+
+/* Interface to jsonpath parser */
+JsonPathParseResult *
+parsejsonpath(const char *str, int len, struct Node *escontext)
+{
+	JsonPathParseResult	*parseresult;
+
+	jsonpath_scanner_init(str, len);
+
+	if (jsonpath_yyparse((void *) &parseresult, escontext) != 0)
+		jsonpath_yyerror(NULL, escontext, "invalid input"); /* shouldn't happen */
+
+	jsonpath_scanner_finish();
+
+	return parseresult;
+}
+
+/* Turn hex character into integer */
+static bool
+hexval(char c, int *result, struct Node *escontext)
+{
+	if (c >= '0' && c <= '9')
+	{
+		*result = c - '0';
+		return true;
+	}
+	if (c >= 'a' && c <= 'f')
+	{
+		*result = c - 'a' + 0xA;
+		return true;
+	}
+	if (c >= 'A' && c <= 'F')
+	{
+		*result = c - 'A' + 0xA;
+		return true;
+	}
+	jsonpath_yyerror(NULL, escontext, "invalid hexadecimal digit");
+	return false;
+}
+
+/* Add given unicode character to scanstring */
+static bool
+addUnicodeChar(int ch, struct Node *escontext)
+{
+	if (ch == 0)
+	{
+		/* We can't allow this, since our TEXT type doesn't */
+		ereturn(escontext, false,
+				(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+				 errmsg("unsupported Unicode escape sequence"),
+				  errdetail("\\u0000 cannot be converted to text.")));
+	}
+	else
+	{
+		char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+		/*
+		 * If we're trapping the error status, call the noerror form of the
+		 * conversion function. Otherwise call the normal form which provides
+		 * more detailed errors.
+		 */
+
+		if (! escontext  || ! IsA(escontext, ErrorSaveContext))
+			pg_unicode_to_server(ch, (unsigned char *) cbuf);
+		else if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
+			ereturn(escontext, false,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("could not convert Unicode to server encoding")));
+		addstring(false, cbuf, strlen(cbuf));
+	}
+	return true;
+}
+
+/* Add unicode character, processing any surrogate pairs */
+static bool
+addUnicode(int ch, int *hi_surrogate, struct Node *escontext)
+{
+	if (is_utf16_surrogate_first(ch))
+	{
+		if (*hi_surrogate != -1)
+			ereturn(escontext, false,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type %s", "jsonpath"),
+					 errdetail("Unicode high surrogate must not follow "
+							   "a high surrogate.")));
+		*hi_surrogate = ch;
+		return true;
+	}
+	else if (is_utf16_surrogate_second(ch))
+	{
+		if (*hi_surrogate == -1)
+			ereturn(escontext, false,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type %s", "jsonpath"),
+					 errdetail("Unicode low surrogate must follow a high "
+							   "surrogate.")));
+		ch = surrogate_pair_to_codepoint(*hi_surrogate, ch);
+		*hi_surrogate = -1;
+	}
+	else if (*hi_surrogate != -1)
+	{
+		ereturn(escontext, false,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type %s", "jsonpath"),
+				 errdetail("Unicode low surrogate must follow a high "
+						   "surrogate.")));
+	}
+
+	return addUnicodeChar(ch, escontext);
+}
+
+/*
+ * parseUnicode was adopted from json_lex_string() in
+ * src/backend/utils/adt/json.c
+ */
+static bool
+parseUnicode(char *s, int l, struct Node *escontext)
+{
+	int			i = 2;
+	int			hi_surrogate = -1;
+
+	for (i = 2; i < l; i += 2)	/* skip '\u' */
+	{
+		int			ch = 0;
+		int			j, si;
+
+		if (s[i] == '{')	/* parse '\u{XX...}' */
+		{
+			while (s[++i] != '}' && i < l)
+			{
+				if (!hexval(s[i], &si, escontext))
+					return false;
+				ch = (ch << 4) | si;
+			}
+			i++;	/* skip '}' */
+		}
+		else		/* parse '\uXXXX' */
+		{
+			for (j = 0; j < 4 && i < l; j++)
+			{
+				if (!hexval(s[i++], &si, escontext))
+					return false;
+				ch = (ch << 4) | si;
+			}
+		}
+
+		if (! addUnicode(ch, &hi_surrogate, escontext))
+			return false;
+	}
+
+	if (hi_surrogate != -1)
+	{
+		ereturn(escontext, false,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type %s", "jsonpath"),
+				 errdetail("Unicode low surrogate must follow a high "
+						   "surrogate.")));
+	}
+
+	return true;
+}
+
+/* Parse sequence of hex-encoded characters */
+static bool
+parseHexChar(char *s, struct Node *escontext)
+{
+	int s2, s3, ch;
+	if (!hexval(s[2], &s2, escontext))
+		return false;
+	if (!hexval(s[3], &s3, escontext))
+		return false;
+
+	ch = (s2 << 4) | s3;
+
+	return addUnicodeChar(ch, escontext);
+}
+
+/*
+ * Interface functions to make flex use palloc() instead of malloc().
+ * It'd be better to make these static, but flex insists otherwise.
+ */
+
+void *
+jsonpath_yyalloc(yy_size_t bytes)
+{
+	return palloc(bytes);
+}
+
+void *
+jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
+{
+	if (ptr)
+		return repalloc(ptr, bytes);
+	else
+		return palloc(bytes);
+}
+
+void
+jsonpath_yyfree(void *ptr)
+{
+	if (ptr)
+		pfree(ptr);
+}