summary refs log tree commit diff stats
path: root/src/backend/utils/adt/jsonpath_scan.l
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/jsonpath_scan.l')
-rw-r--r-- src/backend/utils/adt/jsonpath_scan.l 749
1 files changed, 749 insertions, 0 deletions
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
new file mode 100644
index 0000000..29c26af
--- /dev/null
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -0,0 +1,749 @@
+%top{
+/*-------------------------------------------------------------------------
+ *
+ * jsonpath_scan.l
+ * Lexical parser for jsonpath datatype
+ *
+ * Splits jsonpath string into tokens represented as JsonPathString structs.
+ * Decodes unicode and hex escaped strings.
+ *
+ * Copyright (c) 2019-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/jsonpath_scan.l
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+/*
+ * NB: include jsonpath_gram.h only AFTER including jsonpath_internal.h,
+ * because jsonpath_internal.h contains the declaration for JsonPathString.
+ */
+#include "jsonpath_internal.h"
+#include "jsonpath_gram.h"
+
+#include "mb/pg_wchar.h"
+#include "nodes/miscnodes.h"
+#include "nodes/pg_list.h"
+}
+
+%{
+static JsonPathString scanstring; /* text of the token being accumulated by
+ * addstring()/addchar(); copied into
+ * yylval->str when a token is returned */
+
+/*
+ * Handles to the buffer that the lexer uses internally.
+ *
+ * scanbufhandle is the flex buffer created by yy_scan_buffer(); scanbuf is
+ * the palloc'd copy of the input that it scans, terminated by two
+ * YY_END_OF_BUFFER_CHAR bytes; scanbuflen is the input length without
+ * those terminators.  All three are set in jsonpath_scanner_init().
+ */
+static YY_BUFFER_STATE scanbufhandle;
+static char *scanbuf;
+static int scanbuflen;
+
+static void addstring(bool init, char *s, int l); /* append l bytes to scanstring */
+static void addchar(bool init, char c); /* append one byte to scanstring */
+static enum yytokentype checkKeyword(void); /* classify scanstring: keyword token or IDENT_P */
+static bool parseUnicode(char *s, int l, struct Node *escontext); /* decode a run of \u escapes */
+static bool parseHexChar(char *s, struct Node *escontext); /* decode one \xHH escape */
+
+/*
+ * Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error).
+ *
+ * fprintf is redefined so that a three-argument fprintf(file, fmt, msg)
+ * call in the generated scanner is routed to fprintf_to_ereport(), which
+ * raises an ERROR carrying "msg".  The "file" argument is dropped by the
+ * macro and "fmt" is ignored by the function.
+ */
+#undef fprintf
+#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
+
+static void
+fprintf_to_ereport(const char *fmt, const char *msg)
+{
+ ereport(ERROR, (errmsg_internal("%s", msg))); /* msg is reported verbatim; fmt is unused */
+}
+
+/* LCOV_EXCL_START */
+
+%}
+
+%option 8bit
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option warn
+%option prefix="jsonpath_yy"
+%option bison-bridge
+%option noyyalloc
+%option noyyrealloc
+%option noyyfree
+
+/*
+ * Notes on the options above: "bison-bridge" makes yylval a pointer, which
+ * is why the actions below write yylval->str; "prefix" renames the yy*
+ * symbols to jsonpath_yy*; noyyalloc/noyyrealloc/noyyfree suppress flex's
+ * default allocation routines because this file defines
+ * jsonpath_yyalloc(), jsonpath_yyrealloc() and jsonpath_yyfree() itself.
+ *
+ * We use exclusive states for quoted and non-quoted strings,
+ * quoted variable names and C-style comments.
+ * Exclusive states:
+ * <xq> - quoted strings
+ * <xnq> - non-quoted strings
+ * <xvq> - quoted variable names
+ * <xc> - C-style comment
+ *
+ * In every string state the token text is accumulated in "scanstring".
+ */
+
+%x xq
+%x xnq
+%x xvq
+%x xc
+
+special [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
+blank [ \t\n\r\f]
+/*
+ * "other" means anything that's not special, blank, or '\' or '"'.
+ * It is the character class that unquoted identifiers and variable names
+ * are built from; the same characters as in "special" and "blank" are
+ * listed again here, negated.
+ */
+other [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]
+
+decdigit [0-9]
+hexdigit [0-9A-Fa-f]
+octdigit [0-7]
+bindigit [0-1]
+
+/* DecimalInteger in ECMAScript; must not start with 0 unless it's exactly 0 */
+decinteger (0|[1-9](_?{decdigit})*)
+/* DecimalDigits in ECMAScript; only used as part of other rules */
+decdigits {decdigit}(_?{decdigit})*
+/* Non-decimal integers; in ECMAScript, these must not have underscore after prefix */
+hexinteger 0[xX]{hexdigit}(_?{hexdigit})*
+octinteger 0[oO]{octdigit}(_?{octdigit})*
+bininteger 0[bB]{bindigit}(_?{bindigit})*
+
+/*
+ * "realfail" is a number followed by an exponent marker and sign but no
+ * exponent digits; its rule reports "invalid numeric literal".
+ */
+decimal ({decinteger}\.{decdigits}?|\.{decdigits})
+real ({decinteger}|{decimal})[Ee][-+]?{decdigits}
+realfail ({decinteger}|{decimal})[Ee][-+]
+
+/*
+ * A numeric literal immediately followed by an "other" character; the
+ * rules for these report "trailing junk after numeric literal".
+ */
+decinteger_junk {decinteger}{other}
+decimal_junk {decimal}{other}
+real_junk {real}{other}
+
+/*
+ * Escape sequences inside strings.  "unicode" is \uXXXX (exactly four hex
+ * digits) or \u{X...} (one to six hex digits in braces); "hex_char" is
+ * \xHH.  "unicodefail" and "hex_fail" match the truncated forms so that
+ * the scanner can report them as errors.
+ */
+unicode \\u({hexdigit}{4}|\{{hexdigit}{1,6}\})
+unicodefail \\u({hexdigit}{0,3}|\{{hexdigit}{0,6})
+hex_char \\x{hexdigit}{2}
+hex_fail \\x{hexdigit}{0,1}
+
+%%
+
+<xnq>{other}+ { /* more unquoted text: append it to the current token */
+ addstring(false, yytext, yyleng);
+ }
+
+<xnq>{blank}+ { /* whitespace ends the unquoted token; classify it as keyword or identifier */
+ yylval->str = scanstring;
+ BEGIN INITIAL;
+ return checkKeyword();
+ }
+
+<xnq>\/\* { /* comment opener directly after unquoted text: switch to the comment state.
+ * NOTE(review): unlike the other <xnq> terminators this action
+ * returns no token, so the accumulated text is not handed to
+ * the parser here -- confirm that this is intended. */
+ yylval->str = scanstring;
+ BEGIN xc;
+ }
+
+<xnq>({special}|\") { /* a special character or quote ends the token; yyless(0)
+ * pushes it back so it is rescanned in INITIAL */
+ yylval->str = scanstring;
+ yyless(0);
+ BEGIN INITIAL;
+ return checkKeyword();
+ }
+
+<xnq><<EOF>> { /* end of input also ends the unquoted token */
+ yylval->str = scanstring;
+ BEGIN INITIAL;
+ return checkKeyword();
+ }
+
+<xnq,xq,xvq>\\b { /* single-character escapes, accepted in all three string states */ addchar(false, '\b'); }
+
+<xnq,xq,xvq>\\f { addchar(false, '\f'); }
+
+<xnq,xq,xvq>\\n { addchar(false, '\n'); }
+
+<xnq,xq,xvq>\\r { addchar(false, '\r'); }
+
+<xnq,xq,xvq>\\t { addchar(false, '\t'); }
+
+<xnq,xq,xvq>\\v { addchar(false, '\v'); }
+
+<xnq,xq,xvq>{unicode}+ { /* a run of adjacent Unicode escapes is decoded in one call,
+ * so parseUnicode() can combine surrogate pairs;
+ * scanning stops if decoding fails */
+ if (!parseUnicode(yytext, yyleng, escontext))
+ yyterminate();
+ }
+
+<xnq,xq,xvq>{hex_char} { /* one \xHH escape; scanning stops if decoding fails */
+ if (!parseHexChar(yytext, escontext))
+ yyterminate();
+ }
+
+<xnq,xq,xvq>{unicode}*{unicodefail} { /* truncated Unicode escape, possibly after valid ones */
+ jsonpath_yyerror(NULL, escontext,
+ "invalid Unicode escape sequence");
+ yyterminate();
+ }
+
+<xnq,xq,xvq>{hex_fail} { /* \x followed by fewer than two hex digits */
+ jsonpath_yyerror(NULL, escontext,
+ "invalid hexadecimal character sequence");
+ yyterminate();
+ }
+
+<xnq,xq,xvq>{unicode}+\\ {
+ /* throw back the \\, and treat as unicode */
+ yyless(yyleng - 1);
+ if (!parseUnicode(yytext, yyleng, escontext))
+ yyterminate();
+ }
+
+<xnq,xq,xvq>\\. { /* any other escaped character stands for itself */ addchar(false, yytext[1]); }
+
+<xnq,xq,xvq>\\ { /* a backslash that none of the escape rules above could extend */
+ jsonpath_yyerror(NULL, escontext,
+ "unexpected end after backslash");
+ yyterminate();
+ }
+
+<xq,xvq><<EOF>> { /* input ended before the closing quote */
+ jsonpath_yyerror(NULL, escontext,
+ "unterminated quoted string");
+ yyterminate();
+ }
+
+<xq>\" { /* closing quote of a string literal */
+ yylval->str = scanstring;
+ BEGIN INITIAL;
+ return STRING_P;
+ }
+
+<xvq>\" { /* closing quote of a quoted variable name */
+ yylval->str = scanstring;
+ BEGIN INITIAL;
+ return VARIABLE_P;
+ }
+
+<xq,xvq>[^\\\"]+ { /* run of ordinary characters inside quotes */ addstring(false, yytext, yyleng); }
+
+<xc>\*\/ { /* comment terminator: resume normal scanning */ BEGIN INITIAL; }
+
+<xc>[^\*]+ { /* comment text is discarded */ }
+
+<xc>\* { /* a lone asterisk that does not close the comment */ }
+
+<xc><<EOF>> { /* input ended inside a comment */
+ jsonpath_yyerror(
+ NULL, escontext,
+ "unexpected end of comment");
+ yyterminate();
+ }
+\&\& { /* operators; the single-character ones not listed here fall through to {special} below */ return AND_P; }
+
+\|\| { return OR_P; }
+
+\! { return NOT_P; }
+
+\*\* { return ANY_P; }
+
+\< { return LESS_P; }
+
+\<\= { return LESSEQUAL_P; }
+
+\=\= { return EQUAL_P; }
+
+\<\> { /* both spellings of "not equal" yield the same token */ return NOTEQUAL_P; }
+
+\!\= { return NOTEQUAL_P; }
+
+\>\= { return GREATEREQUAL_P; }
+
+\> { return GREATER_P; }
+
+\${other}+ { /* unquoted variable: store the name without the leading '$';
+ * the '\0' added by addchar() is not counted in the length */
+ addstring(true, yytext + 1, yyleng - 1);
+ addchar(false, '\0');
+ yylval->str = scanstring;
+ return VARIABLE_P;
+ }
+
+\$\" { /* quoted variable: start an empty string and collect the name in <xvq> */
+ addchar(true, '\0');
+ BEGIN xvq;
+ }
+
+{special} { /* any other special character is its own token code */ return *yytext; }
+
+{blank}+ { /* ignore */ }
+
+\/\* { /* start of a C-style comment */
+ addchar(true, '\0');
+ BEGIN xc;
+ }
+
+{real} { /* each numeric rule copies the literal's text, NUL-terminates it,
+ * and returns it as a string for the parser to convert */
+ addstring(true, yytext, yyleng);
+ addchar(false, '\0');
+ yylval->str = scanstring;
+ return NUMERIC_P;
+ }
+
+{decimal} {
+ addstring(true, yytext, yyleng);
+ addchar(false, '\0');
+ yylval->str = scanstring;
+ return NUMERIC_P;
+ }
+
+{decinteger} { /* integers in any base are returned as INT_P */
+ addstring(true, yytext, yyleng);
+ addchar(false, '\0');
+ yylval->str = scanstring;
+ return INT_P;
+ }
+
+{hexinteger} {
+ addstring(true, yytext, yyleng);
+ addchar(false, '\0');
+ yylval->str = scanstring;
+ return INT_P;
+ }
+
+{octinteger} {
+ addstring(true, yytext, yyleng);
+ addchar(false, '\0');
+ yylval->str = scanstring;
+ return INT_P;
+ }
+
+{bininteger} {
+ addstring(true, yytext, yyleng);
+ addchar(false, '\0');
+ yylval->str = scanstring;
+ return INT_P;
+ }
+
+{realfail} { /* exponent marker and sign with no digits after them */
+ jsonpath_yyerror(
+ NULL, escontext,
+ "invalid numeric literal");
+ yyterminate();
+ }
+{decinteger_junk} { /* a number directly followed by an identifier character */
+ jsonpath_yyerror(
+ NULL, escontext,
+ "trailing junk after numeric literal");
+ yyterminate();
+ }
+{decimal_junk} {
+ jsonpath_yyerror(
+ NULL, escontext,
+ "trailing junk after numeric literal");
+ yyterminate();
+ }
+{real_junk} {
+ jsonpath_yyerror(
+ NULL, escontext,
+ "trailing junk after numeric literal");
+ yyterminate();
+ }
+\" { /* opening quote: start an empty string and collect it in <xq> */
+ addchar(true, '\0');
+ BEGIN xq;
+ }
+
+\\ { /* a backslash starts an unquoted string; yyless(0) pushes it back
+ * so that the escape rules of <xnq> process it */
+ yyless(0);
+ addchar(true, '\0');
+ BEGIN xnq;
+ }
+
+{other}+ { /* start of an unquoted string (identifier or keyword) */
+ addstring(true, yytext, yyleng);
+ BEGIN xnq;
+ }
+
+<<EOF>> { /* end of input outside any string or comment */ yyterminate(); }
+
+%%
+
+/* LCOV_EXCL_STOP */
+
+/*
+ * Report a scanner or parser error for the jsonpath input.
+ *
+ * "result" is not used.  The error is reported through errsave() into
+ * "escontext"; if that context already holds a soft error, the new one is
+ * dropped so the first error wins.  "message" is translated and combined
+ * with the current token text, or with an "end of input" note when the
+ * scanner is sitting on the end-of-buffer marker.
+ */
+void
+jsonpath_yyerror(JsonPathParseResult **result, struct Node *escontext,
+                 const char *message)
+{
+    bool        at_end;
+
+    /* don't overwrite escontext if it's already been set */
+    if (SOFT_ERROR_OCCURRED(escontext))
+        return;
+
+    at_end = (*yytext == YY_END_OF_BUFFER_CHAR);
+
+    if (!at_end)
+    {
+        errsave(escontext,
+                (errcode(ERRCODE_SYNTAX_ERROR),
+        /* translator: first %s is typically "syntax error" */
+                 errmsg("%s at or near \"%s\" of jsonpath input",
+                        _(message), yytext)));
+        return;
+    }
+
+    errsave(escontext,
+            (errcode(ERRCODE_SYNTAX_ERROR),
+    /* translator: %s is typically "syntax error" */
+             errmsg("%s at end of jsonpath input", _(message))));
+}
+
+typedef struct JsonPathKeyword /* one entry of the keywords[] lookup table */
+{
+ int16 len; /* length of "keyword" in bytes */
+ bool lowercase; /* if true, checkKeyword() accepts only the exact spelling
+ * stored in "keyword"; otherwise matching ignores case */
+ int val; /* token code returned when the keyword matches */
+ const char *keyword; /* keyword text */
+} JsonPathKeyword;
+
+/*
+ * Array of key words should be sorted by length and then
+ * alphabetical order, because checkKeyword() binary-searches it comparing
+ * the length first and then the text case-insensitively.  The last entry
+ * must be the longest keyword: checkKeyword() uses its length as an upper
+ * bound before searching.
+ */
+static const JsonPathKeyword keywords[] = {
+ { 2, false, IS_P, "is"},
+ { 2, false, TO_P, "to"},
+ { 3, false, ABS_P, "abs"},
+ { 3, false, LAX_P, "lax"},
+ { 4, false, FLAG_P, "flag"},
+ { 4, false, LAST_P, "last"},
+ { 4, true, NULL_P, "null"},
+ { 4, false, SIZE_P, "size"},
+ { 4, true, TRUE_P, "true"},
+ { 4, false, TYPE_P, "type"},
+ { 4, false, WITH_P, "with"},
+ { 5, true, FALSE_P, "false"},
+ { 5, false, FLOOR_P, "floor"},
+ { 6, false, DOUBLE_P, "double"},
+ { 6, false, EXISTS_P, "exists"},
+ { 6, false, STARTS_P, "starts"},
+ { 6, false, STRICT_P, "strict"},
+ { 7, false, CEILING_P, "ceiling"},
+ { 7, false, UNKNOWN_P, "unknown"},
+ { 8, false, DATETIME_P, "datetime"},
+ { 8, false, KEYVALUE_P, "keyvalue"},
+ { 10,false, LIKE_REGEX_P, "like_regex"},
+};
+
+/*
+ * Check if current scanstring value is a keyword.
+ *
+ * Binary-searches keywords[], which is ordered by length and then
+ * alphabetically.  Entries are compared by length first and, for equal
+ * lengths, case-insensitively by text.  A candidate whose "lowercase" flag
+ * is set must additionally match byte-for-byte.
+ *
+ * Returns the keyword's token code, or IDENT_P if scanstring is not a
+ * keyword.
+ */
+static enum yytokentype
+checkKeyword(void)
+{
+    const JsonPathKeyword *low = keywords;
+    const JsonPathKeyword *high = keywords + lengthof(keywords);
+
+    /* The last table entry is the longest keyword; anything longer can't match */
+    if (scanstring.len > keywords[lengthof(keywords) - 1].len)
+        return IDENT_P;
+
+    while (low < high)
+    {
+        const JsonPathKeyword *mid = low + ((high - low) >> 1);
+        int         diff;
+
+        if (mid->len == scanstring.len)
+            diff = pg_strncasecmp(mid->keyword, scanstring.val,
+                                  scanstring.len);
+        else
+            diff = mid->len - scanstring.len;
+
+        if (diff < 0)
+            low = mid + 1;
+        else if (diff > 0)
+            high = mid;
+        else
+        {
+            /* Case-insensitive hit; some keywords also require exact case */
+            if (mid->lowercase &&
+                strncmp(mid->keyword, scanstring.val, scanstring.len) != 0)
+                return IDENT_P;
+
+            return mid->val;
+        }
+    }
+
+    return IDENT_P;
+}
+
+/*
+ * Prepare the scanner to read "str"; called before any actual parsing is
+ * done.
+ *
+ * "slen" is the number of bytes to scan; a value <= 0 means "str" is
+ * NUL-terminated and its length is measured here.  The input is copied
+ * into a palloc'd buffer that jsonpath_scanner_finish() releases.
+ */
+static void
+jsonpath_scanner_init(const char *str, int slen)
+{
+    if (slen <= 0)
+        slen = strlen(str);
+
+    /* Reset flex's state, which might be left over after ereport() */
+    yy_init_globals();
+
+    /*
+     * Make a scan buffer with special termination needed by flex: the
+     * input followed by two end-of-buffer bytes.
+     */
+    scanbuflen = slen;
+    scanbuf = palloc(slen + 2);
+    memcpy(scanbuf, str, slen);
+    scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
+    scanbuf[slen] = YY_END_OF_BUFFER_CHAR;
+    scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
+
+    BEGIN(INITIAL);
+}
+
+
+/*
+ * Called after parsing is done to clean up after jsonpath_scanner_init().
+ *
+ * Deletes the flex buffer and frees the input copy, then clears the
+ * file-level handles so that no dangling pointers to the freed memory
+ * remain between parses.
+ */
+static void
+jsonpath_scanner_finish(void)
+{
+    yy_delete_buffer(scanbufhandle);
+    scanbufhandle = NULL;
+
+    pfree(scanbuf);
+    scanbuf = NULL;
+    scanbuflen = 0;
+}
+
+/*
+ * Make sure scanstring has room for "appendLen" more bytes.
+ *
+ * With "init" set, a fresh buffer of at least 32 bytes is allocated and the
+ * length is reset to zero.  Otherwise the existing buffer is doubled until
+ * the current length plus appendLen is strictly below the capacity.
+ */
+static void
+resizeString(bool init, int appendLen)
+{
+    int         needed;
+
+    if (init)
+    {
+        scanstring.total = Max(32, appendLen);
+        scanstring.val = (char *) palloc(scanstring.total);
+        scanstring.len = 0;
+        return;
+    }
+
+    needed = scanstring.len + appendLen;
+
+    /* Still fits: nothing to do */
+    if (needed < scanstring.total)
+        return;
+
+    do
+    {
+        scanstring.total *= 2;
+    } while (needed >= scanstring.total);
+
+    scanstring.val = repalloc(scanstring.val, scanstring.total);
+}
+
+/*
+ * Append the "l" bytes starting at "s" to scanstring.  When "init" is true
+ * scanstring is started afresh first.  Room for one extra byte is reserved.
+ */
+static void
+addstring(bool init, char *s, int l)
+{
+    char       *dest;
+
+    resizeString(init, l + 1);
+
+    dest = scanstring.val + scanstring.len;
+    memcpy(dest, s, l);
+    scanstring.len = scanstring.len + l;
+}
+
+/*
+ * Append the single byte "c" to scanstring.  When "init" is true scanstring
+ * is started afresh first.  A '\0' is stored as a terminator but is not
+ * counted in the string's length.
+ */
+static void
+addchar(bool init, char c)
+{
+    resizeString(init, 1);
+
+    scanstring.val[scanstring.len] = c;
+
+    if (c == '\0')
+        return;
+
+    scanstring.len++;
+}
+
+/*
+ * Interface to jsonpath parser.
+ *
+ * Scans and parses the "len" bytes at "str" (a len <= 0 means "str" is
+ * NUL-terminated) and returns the parse result.  Errors are reported
+ * through "escontext".
+ *
+ * The result starts out as NULL so that a defined value is returned even
+ * when jsonpath_yyparse() fails without storing a result, which happens
+ * when an error was saved in escontext rather than thrown.
+ */
+JsonPathParseResult *
+parsejsonpath(const char *str, int len, struct Node *escontext)
+{
+    JsonPathParseResult *parseresult = NULL;
+
+    jsonpath_scanner_init(str, len);
+
+    if (jsonpath_yyparse((void *) &parseresult, escontext) != 0)
+        jsonpath_yyerror(NULL, escontext, "invalid input"); /* shouldn't happen */
+
+    jsonpath_scanner_finish();
+
+    return parseresult;
+}
+
+/*
+ * Convert the hex digit "c" to its numeric value.
+ *
+ * On success the value (0..15) is stored in *result and true is returned.
+ * Otherwise an error is reported via jsonpath_yyerror(), *result is left
+ * untouched, and false is returned.
+ */
+static bool
+hexval(char c, int *result, struct Node *escontext)
+{
+    int         digit;
+
+    if ('0' <= c && c <= '9')
+        digit = c - '0';
+    else if ('a' <= c && c <= 'f')
+        digit = 0xA + (c - 'a');
+    else if ('A' <= c && c <= 'F')
+        digit = 0xA + (c - 'A');
+    else
+    {
+        jsonpath_yyerror(NULL, escontext, "invalid hexadecimal digit");
+        return false;
+    }
+
+    *result = digit;
+    return true;
+}
+
+/*
+ * Append the Unicode code point "ch", converted to the server encoding, to
+ * scanstring.
+ *
+ * Code point zero is rejected.  Returns true on success; on failure the
+ * error is reported through "escontext" and false is returned.
+ */
+static bool
+addUnicodeChar(int ch, struct Node *escontext)
+{
+    char        cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+    bool        trap_errors;
+
+    if (ch == 0)
+    {
+        /* We can't allow this, since our TEXT type doesn't */
+        ereturn(escontext, false,
+                (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+                 errmsg("unsupported Unicode escape sequence"),
+                 errdetail("\\u0000 cannot be converted to text.")));
+    }
+
+    /*
+     * If we're trapping the error status, call the noerror form of the
+     * conversion function.  Otherwise call the normal form which provides
+     * more detailed errors.
+     */
+    trap_errors = (escontext != NULL && IsA(escontext, ErrorSaveContext));
+
+    if (trap_errors)
+    {
+        if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
+            ereturn(escontext, false,
+                    (errcode(ERRCODE_SYNTAX_ERROR),
+                     errmsg("could not convert Unicode to server encoding")));
+    }
+    else
+        pg_unicode_to_server(ch, (unsigned char *) cbuf);
+
+    addstring(false, cbuf, strlen(cbuf));
+
+    return true;
+}
+
+/*
+ * Add one decoded escape value to scanstring, processing surrogate pairs.
+ *
+ * *hi_surrogate carries a pending UTF-16 high surrogate between calls
+ * (-1 when there is none).  A high surrogate is only remembered; the
+ * following low surrogate is combined with it into one code point, which
+ * is then appended.  Any other ordering is an error.
+ *
+ * Returns true on success, false after reporting an error via escontext.
+ */
+static bool
+addUnicode(int ch, int *hi_surrogate, struct Node *escontext)
+{
+    if (is_utf16_surrogate_first(ch))
+    {
+        if (*hi_surrogate != -1)
+            ereturn(escontext, false,
+                    (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                     errmsg("invalid input syntax for type %s", "jsonpath"),
+                     errdetail("Unicode high surrogate must not follow "
+                               "a high surrogate.")));
+
+        /* Remember it and wait for the low half */
+        *hi_surrogate = ch;
+        return true;
+    }
+
+    if (is_utf16_surrogate_second(ch))
+    {
+        int         combined;
+
+        if (*hi_surrogate == -1)
+            ereturn(escontext, false,
+                    (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                     errmsg("invalid input syntax for type %s", "jsonpath"),
+                     errdetail("Unicode low surrogate must follow a high "
+                               "surrogate.")));
+
+        combined = surrogate_pair_to_codepoint(*hi_surrogate, ch);
+        *hi_surrogate = -1;
+
+        return addUnicodeChar(combined, escontext);
+    }
+
+    /* An ordinary code point must not follow a pending high surrogate */
+    if (*hi_surrogate != -1)
+    {
+        ereturn(escontext, false,
+                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                 errmsg("invalid input syntax for type %s", "jsonpath"),
+                 errdetail("Unicode low surrogate must follow a high "
+                           "surrogate.")));
+    }
+
+    return addUnicodeChar(ch, escontext);
+}
+
+/*
+ * parseUnicode was adopted from json_lex_string() in
+ * src/backend/utils/adt/json.c
+ *
+ * Decode the run of Unicode escapes in the "l" bytes at "s" and append the
+ * resulting characters to scanstring.  Each escape has either the
+ * four-hex-digit form or the braced form; a high surrogate must be
+ * followed immediately by a low surrogate within the same run.
+ *
+ * Returns true on success, false after reporting an error via escontext.
+ */
+static bool
+parseUnicode(char *s, int l, struct Node *escontext)
+{
+    int         i;
+    int         hi_surrogate = -1;
+
+    for (i = 2; i < l; i += 2)  /* skip '\u' */
+    {
+        int         ch = 0;
+        int         j,
+                    si;
+
+        if (s[i] == '{')        /* parse '\u{XX...}' */
+        {
+            /* Check the bound before reading s[i], not after */
+            while (++i < l && s[i] != '}')
+            {
+                if (!hexval(s[i], &si, escontext))
+                    return false;
+                ch = (ch << 4) | si;
+            }
+            i++;                /* skip '}' */
+        }
+        else                    /* parse '\uXXXX' */
+        {
+            for (j = 0; j < 4 && i < l; j++)
+            {
+                if (!hexval(s[i++], &si, escontext))
+                    return false;
+                ch = (ch << 4) | si;
+            }
+        }
+
+        if (!addUnicode(ch, &hi_surrogate, escontext))
+            return false;
+    }
+
+    /* The run must not end with an unpaired high surrogate */
+    if (hi_surrogate != -1)
+    {
+        ereturn(escontext, false,
+                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                 errmsg("invalid input syntax for type %s", "jsonpath"),
+                 errdetail("Unicode low surrogate must follow a high "
+                           "surrogate.")));
+    }
+
+    return true;
+}
+
+/*
+ * Decode one hex escape: "s" points at the backslash, and the two hex
+ * digits are read from s[2] and s[3].  The resulting value is appended to
+ * scanstring via addUnicodeChar().
+ *
+ * Returns true on success, false after an error was reported.
+ */
+static bool
+parseHexChar(char *s, struct Node *escontext)
+{
+    int         hi_nibble;
+    int         lo_nibble;
+
+    if (!hexval(s[2], &hi_nibble, escontext) ||
+        !hexval(s[3], &lo_nibble, escontext))
+        return false;
+
+    return addUnicodeChar((hi_nibble << 4) | lo_nibble, escontext);
+}
+
+/*
+ * Interface functions to make flex use palloc() instead of malloc().
+ * It'd be better to make these static, but flex insists otherwise.
+ */
+
+/* Allocate "bytes" bytes for the scanner with palloc(). */
+void *
+jsonpath_yyalloc(yy_size_t bytes)
+{
+    void       *chunk = palloc(bytes);
+
+    return chunk;
+}
+
+/*
+ * Resize a scanner allocation to "bytes" bytes.  A NULL "ptr" is treated
+ * as a request for a fresh allocation.
+ */
+void *
+jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
+{
+    if (!ptr)
+        return palloc(bytes);
+
+    return repalloc(ptr, bytes);
+}
+
+/* Release a scanner allocation; a NULL "ptr" is silently ignored. */
+void
+jsonpath_yyfree(void *ptr)
+{
+    if (!ptr)
+        return;
+
+    pfree(ptr);
+}