diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/tsvector_parser.c | |
parent | Initial commit. (diff) | |
download | postgresql-14-upstream.tar.xz postgresql-14-upstream.zip |
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/adt/tsvector_parser.c')
-rw-r--r-- | src/backend/utils/adt/tsvector_parser.c | 367 |
1 files changed, 367 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c new file mode 100644 index 0000000..c2df409 --- /dev/null +++ b/src/backend/utils/adt/tsvector_parser.c @@ -0,0 +1,367 @@ +/*------------------------------------------------------------------------- + * + * tsvector_parser.c + * Parser for tsvector + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/adt/tsvector_parser.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" + + +/* + * Private state of tsvector parser. Note that tsquery also uses this code to + * parse its input, hence the boolean flags. The two flags are both true or + * both false in current usage, but we keep them separate for clarity. + * is_tsquery affects *only* the content of error messages. + */ +struct TSVectorParseStateData +{ + char *prsbuf; /* next input character */ + char *bufstart; /* whole string (used only for errors) */ + char *word; /* buffer to hold the current word */ + int len; /* size in bytes allocated for 'word' */ + int eml; /* max bytes per character */ + bool oprisdelim; /* treat ! | * ( ) as delimiters? */ + bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */ + bool is_web; /* we're in websearch_to_tsquery() */ +}; + + +/* + * Initializes parser for the input string. If oprisdelim is set, the + * following characters are treated as delimiters in addition to whitespace: + * ! | & ( ) + */ +TSVectorParseState +init_tsvector_parser(char *input, int flags) +{ + TSVectorParseState state; + + state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData)); + state->prsbuf = input; + state->bufstart = input; + state->len = 32; + state->word = (char *) palloc(state->len); + state->eml = pg_database_encoding_max_length(); + state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0; + state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0; + state->is_web = (flags & P_TSV_IS_WEB) != 0; + + return state; +} + +/* + * Reinitializes parser to parse 'input', instead of previous input. + */ +void +reset_tsvector_parser(TSVectorParseState state, char *input) +{ + state->prsbuf = input; +} + +/* + * Shuts down a tsvector parser. + */ +void +close_tsvector_parser(TSVectorParseState state) +{ + pfree(state->word); + pfree(state); +} + +/* increase the size of 'word' if needed to hold one more character */ +#define RESIZEPRSBUF \ +do { \ + int clen = curpos - state->word; \ + if ( clen + state->eml >= state->len ) \ + { \ + state->len *= 2; \ + state->word = (char *) repalloc(state->word, state->len); \ + curpos = state->word + clen; \ + } \ +} while (0) + +/* Fills gettoken_tsvector's output parameters, and returns true */ +#define RETURN_TOKEN \ +do { \ + if (pos_ptr != NULL) \ + { \ + *pos_ptr = pos; \ + *poslen = npos; \ + } \ + else if (pos != NULL) \ + pfree(pos); \ + \ + if (strval != NULL) \ + *strval = state->word; \ + if (lenval != NULL) \ + *lenval = curpos - state->word; \ + if (endptr != NULL) \ + *endptr = state->prsbuf; \ + return true; \ +} while(0) + + +/* State codes used in gettoken_tsvector */ +#define WAITWORD 1 +#define WAITENDWORD 2 +#define WAITNEXTCHAR 3 +#define WAITENDCMPLX 4 +#define WAITPOSINFO 5 +#define INPOSINFO 6 +#define WAITPOSDELIM 7 +#define WAITCHARCMPLX 8 + +#define PRSSYNTAXERROR prssyntaxerror(state) + +static void +prssyntaxerror(TSVectorParseState state) +{ + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + state->is_tsquery ? + errmsg("syntax error in tsquery: \"%s\"", state->bufstart) : + errmsg("syntax error in tsvector: \"%s\"", state->bufstart))); +} + + +/* + * Get next token from string being parsed. Returns true if successful, + * false if end of input string is reached. On success, these output + * parameters are filled in: + * + * *strval pointer to token + * *lenval length of *strval + * *pos_ptr pointer to a palloc'd array of positions and weights + * associated with the token. If the caller is not interested + * in the information, NULL can be supplied. Otherwise + * the caller is responsible for pfreeing the array. + * *poslen number of elements in *pos_ptr + * *endptr scan resumption point + * + * Pass NULL for unwanted output parameters. + */ +bool +gettoken_tsvector(TSVectorParseState state, + char **strval, int *lenval, + WordEntryPos **pos_ptr, int *poslen, + char **endptr) +{ + int oldstate = 0; + char *curpos = state->word; + int statecode = WAITWORD; + + /* + * pos is for collecting the comma delimited list of positions followed by + * the actual token. + */ + WordEntryPos *pos = NULL; + int npos = 0; /* elements of pos used */ + int posalen = 0; /* allocated size of pos */ + + while (1) + { + if (statecode == WAITWORD) + { + if (*(state->prsbuf) == '\0') + return false; + else if (!state->is_web && t_iseq(state->prsbuf, '\'')) + statecode = WAITENDCMPLX; + else if (!state->is_web && t_iseq(state->prsbuf, '\\')) + { + statecode = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) || + (state->is_web && t_iseq(state->prsbuf, '"'))) + PRSSYNTAXERROR; + else if (!t_isspace(state->prsbuf)) + { + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + statecode = WAITENDWORD; + } + } + else if (statecode == WAITNEXTCHAR) + { + if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("there is no escaped character: \"%s\"", + state->bufstart))); + else + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + Assert(oldstate != 0); + statecode = oldstate; + } + } + else if (statecode == WAITENDWORD) + { + if (!state->is_web && t_iseq(state->prsbuf, '\\')) + { + statecode = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + (state->oprisdelim && ISOPERATOR(state->prsbuf)) || + (state->is_web && t_iseq(state->prsbuf, '"'))) + { + RESIZEPRSBUF; + if (curpos == state->word) + PRSSYNTAXERROR; + *(curpos) = '\0'; + RETURN_TOKEN; + } + else if (t_iseq(state->prsbuf, ':')) + { + if (curpos == state->word) + PRSSYNTAXERROR; + *(curpos) = '\0'; + if (state->oprisdelim) + RETURN_TOKEN; + else + statecode = INPOSINFO; + } + else + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + } + } + else if (statecode == WAITENDCMPLX) + { + if (!state->is_web && t_iseq(state->prsbuf, '\'')) + { + statecode = WAITCHARCMPLX; + } + else if (!state->is_web && t_iseq(state->prsbuf, '\\')) + { + statecode = WAITNEXTCHAR; + oldstate = WAITENDCMPLX; + } + else if (*(state->prsbuf) == '\0') + PRSSYNTAXERROR; + else + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + } + } + else if (statecode == WAITCHARCMPLX) + { + if (!state->is_web && t_iseq(state->prsbuf, '\'')) + { + RESIZEPRSBUF; + COPYCHAR(curpos, state->prsbuf); + curpos += pg_mblen(state->prsbuf); + statecode = WAITENDCMPLX; + } + else + { + RESIZEPRSBUF; + *(curpos) = '\0'; + if (curpos == state->word) + PRSSYNTAXERROR; + if (state->oprisdelim) + { + /* state->prsbuf+=pg_mblen(state->prsbuf); */ + RETURN_TOKEN; + } + else + statecode = WAITPOSINFO; + continue; /* recheck current character */ + } + } + else if (statecode == WAITPOSINFO) + { + if (t_iseq(state->prsbuf, ':')) + statecode = INPOSINFO; + else + RETURN_TOKEN; + } + else if (statecode == INPOSINFO) + { + if (t_isdigit(state->prsbuf)) + { + if (posalen == 0) + { + posalen = 4; + pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen); + npos = 0; + } + else if (npos + 1 >= posalen) + { + posalen *= 2; + pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen); + } + npos++; + WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf))); + /* we cannot get here in tsquery, so no need for 2 errmsgs */ + if (WEP_GETPOS(pos[npos - 1]) == 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("wrong position info in tsvector: \"%s\"", + state->bufstart))); + WEP_SETWEIGHT(pos[npos - 1], 0); + statecode = WAITPOSDELIM; + } + else + PRSSYNTAXERROR; + } + else if (statecode == WAITPOSDELIM) + { + if (t_iseq(state->prsbuf, ',')) + statecode = INPOSINFO; + else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + PRSSYNTAXERROR; + WEP_SETWEIGHT(pos[npos - 1], 3); + } + else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + PRSSYNTAXERROR; + WEP_SETWEIGHT(pos[npos - 1], 2); + } + else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + PRSSYNTAXERROR; + WEP_SETWEIGHT(pos[npos - 1], 1); + } + else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) + { + if (WEP_GETWEIGHT(pos[npos - 1])) + PRSSYNTAXERROR; + WEP_SETWEIGHT(pos[npos - 1], 0); + } + else if (t_isspace(state->prsbuf) || + *(state->prsbuf) == '\0') + RETURN_TOKEN; + else if (!t_isdigit(state->prsbuf)) + PRSSYNTAXERROR; + } + else /* internal error */ + elog(ERROR, "unrecognized state in gettoken_tsvector: %d", + statecode); + + /* get next char */ + state->prsbuf += pg_mblen(state->prsbuf); + } +} |