diff options
Diffstat (limited to 'src/interfaces/ecpg/preproc/parser.c')
-rw-r--r-- | src/interfaces/ecpg/preproc/parser.c | 231 |
1 files changed, 231 insertions, 0 deletions
diff --git a/src/interfaces/ecpg/preproc/parser.c b/src/interfaces/ecpg/preproc/parser.c new file mode 100644 index 0000000..a8571a3 --- /dev/null +++ b/src/interfaces/ecpg/preproc/parser.c @@ -0,0 +1,231 @@ +/*------------------------------------------------------------------------- + * + * parser.c + * Main entry point/driver for PostgreSQL grammar + * + * This should match src/backend/parser/parser.c, except that we do not + * need to bother with re-entrant interfaces. + * + * Note: ECPG doesn't report error location like the backend does. + * This file will need work if we ever want it to. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/interfaces/ecpg/preproc/parser.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include "preproc_extern.h" +#include "preproc.h" + + +static bool have_lookahead; /* is lookahead info valid? */ +static int lookahead_token; /* one-token lookahead */ +static YYSTYPE lookahead_yylval; /* yylval for lookahead token */ +static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ +static char *lookahead_yytext; /* start current token */ + +static bool check_uescapechar(unsigned char escape); +static bool ecpg_isspace(char ch); + + +/* + * Intermediate filter between parser and base lexer (base_yylex in scan.l). + * + * This filter is needed because in some cases the standard SQL grammar + * requires more than one token lookahead. We reduce these cases to one-token + * lookahead by replacing tokens here, in order to keep the grammar LALR(1). + * + * Using a filter is simpler than trying to recognize multiword tokens + * directly in scan.l, because we'd have to allow for comments between the + * words. Furthermore it's not clear how to do that without re-introducing + * scanner backtrack, which would cost more performance than this filter + * layer does. + * + * We also use this filter to convert UIDENT and USCONST sequences into + * plain IDENT and SCONST tokens. While that could be handled by additional + * productions in the main grammar, it's more efficient to do it like this. + */ +int +filtered_base_yylex(void) +{ + int cur_token; + int next_token; + YYSTYPE cur_yylval; + YYLTYPE cur_yylloc; + char *cur_yytext; + + /* Get next token --- we might already have it */ + if (have_lookahead) + { + cur_token = lookahead_token; + base_yylval = lookahead_yylval; + base_yylloc = lookahead_yylloc; + base_yytext = lookahead_yytext; + have_lookahead = false; + } + else + cur_token = base_yylex(); + + /* + * If this token isn't one that requires lookahead, just return it. + */ + switch (cur_token) + { + case NOT: + case NULLS_P: + case WITH: + case UIDENT: + case USCONST: + break; + default: + return cur_token; + } + + /* Save and restore lexer output variables around the call */ + cur_yylval = base_yylval; + cur_yylloc = base_yylloc; + cur_yytext = base_yytext; + + /* Get next token, saving outputs into lookahead variables */ + next_token = base_yylex(); + + lookahead_token = next_token; + lookahead_yylval = base_yylval; + lookahead_yylloc = base_yylloc; + lookahead_yytext = base_yytext; + + base_yylval = cur_yylval; + base_yylloc = cur_yylloc; + base_yytext = cur_yytext; + + have_lookahead = true; + + /* Replace cur_token if needed, based on lookahead */ + switch (cur_token) + { + case NOT: + /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ + switch (next_token) + { + case BETWEEN: + case IN_P: + case LIKE: + case ILIKE: + case SIMILAR: + cur_token = NOT_LA; + break; + } + break; + + case NULLS_P: + /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ + switch (next_token) + { + case FIRST_P: + case LAST_P: + cur_token = NULLS_LA; + break; + } + break; + + case WITH: + /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ + switch (next_token) + { + case TIME: + case ORDINALITY: + cur_token = WITH_LA; + break; + } + break; + case UIDENT: + case USCONST: + /* Look ahead for UESCAPE */ + if (next_token == UESCAPE) + { + /* Yup, so get third token, which had better be SCONST */ + const char *escstr; + + /* + * Again save and restore lexer output variables around the + * call + */ + cur_yylval = base_yylval; + cur_yylloc = base_yylloc; + cur_yytext = base_yytext; + + /* Get third token */ + next_token = base_yylex(); + + if (next_token != SCONST) + mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal"); + + /* + * Save and check escape string, which the scanner returns + * with quotes + */ + escstr = base_yylval.str; + if (strlen(escstr) != 3 || !check_uescapechar(escstr[1])) + mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character"); + + base_yylval = cur_yylval; + base_yylloc = cur_yylloc; + base_yytext = cur_yytext; + + /* Combine 3 tokens into 1 */ + base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr); + + /* Clear have_lookahead, thereby consuming all three tokens */ + have_lookahead = false; + } + + if (cur_token == UIDENT) + cur_token = IDENT; + else if (cur_token == USCONST) + cur_token = SCONST; + break; + } + + return cur_token; +} + +/* + * check_uescapechar() and ecpg_isspace() should match their equivalents + * in pgc.l. + */ + +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ +static bool +check_uescapechar(unsigned char escape) +{ + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || ecpg_isspace(escape)) + return false; + else + return true; +} + +/* + * ecpg_isspace() --- return true if flex scanner considers char whitespace + */ +static bool +ecpg_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\f') + return true; + return false; +} |