/*-------------------------------------------------------------------------
 *
 * tsvector_parser.c
 *	  Parser for tsvector
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/tsvector_parser.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"


/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  oprisdelim and is_tsquery are
 * both true or both false in current usage, but we keep them separate for
 * clarity.  is_tsquery affects *only* the content of error messages.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! | & ( ) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
	bool		is_web;			/* we're in websearch_to_tsquery() */
};


/*
 * Initializes parser for the input string.
 *
 * 'input' is not copied; the parser keeps pointers into it, so it must stay
 * valid for as long as the parser is used on it.
 *
 * 'flags' is a bitmask of P_TSV_OPR_IS_DELIM, P_TSV_IS_TSQUERY and
 * P_TSV_IS_WEB, each of which sets the like-named field of the state.  If
 * oprisdelim is set, the following characters are treated as delimiters in
 * addition to whitespace: ! | & ( )
 *
 * Returns a palloc'd parser state; release it with close_tsvector_parser().
 */
TSVectorParseState
init_tsvector_parser(char *input, int flags)
{
	TSVectorParseState state;

	state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
	state->prsbuf = input;
	state->bufstart = input;
	state->len = 32;
	state->word = (char *) palloc(state->len);
	state->eml = pg_database_encoding_max_length();
	state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
	state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
	state->is_web = (flags & P_TSV_IS_WEB) != 0;

	return state;
}

/*
 * Reinitializes parser to parse 'input', instead of previous input.
 *
 * Only the scan position is changed; bufstart (the string quoted in error
 * messages) and the word buffer are left as they were.
 */
void
reset_tsvector_parser(TSVectorParseState state, char *input)
{
	state->prsbuf = input;
}

/*
 * Shuts down a tsvector parser, freeing the word buffer and the state itself.
 * The input string is not freed.
 */
void
close_tsvector_parser(TSVectorParseState state)
{
	pfree(state->word);
	pfree(state);
}

/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * The test leaves at least one spare byte after the widest possible
 * character, so a terminating '\0' can always be stored at curpos afterwards.
 * Uses and updates the caller's local variable 'curpos'.
 */
#define RESIZEPRSBUF \
do { \
	int			clen = curpos - state->word; \
	if ( clen + state->eml >= state->len ) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + clen; \
	} \
} while (0)

/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * If the caller did not ask for positions, any position array collected so
 * far is freed here rather than leaked.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)


/* State codes used in gettoken_tsvector */
#define WAITWORD		1		/* skipping whitespace before a token */
#define WAITENDWORD		2		/* inside an unquoted word */
#define WAITNEXTCHAR	3		/* after a backslash: copy next char as-is */
#define WAITENDCMPLX	4		/* inside a single-quoted word */
#define WAITPOSINFO		5		/* after a quoted word: is there a ':'? */
#define INPOSINFO		6		/* expecting the first digit of a position */
#define WAITPOSDELIM	7		/* after a position: digits, weight, ',' or end */
#define WAITCHARCMPLX	8		/* saw a quote inside a quoted word */

#define PRSSYNTAXERROR prssyntaxerror(state)

/*
 * Report a syntax error, quoting the whole input string.  The message names
 * tsquery or tsvector according to state->is_tsquery.
 */
static void
prssyntaxerror(TSVectorParseState state)
{
	ereport(ERROR,
			(errcode(ERRCODE_SYNTAX_ERROR),
			 state->is_tsquery ?
			 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
			 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
}


/*
 * Get next token from string being parsed. Returns true if successful,
 * false if end of input string is reached.  On success, these output
 * parameters are filled in:
 *
 * *strval		pointer to token
 * *lenval		length of *strval
 * *pos_ptr		pointer to a palloc'd array of positions and weights
 *				associated with the token. If the caller is not interested
 *				in the information, NULL can be supplied. Otherwise
 *				the caller is responsible for pfreeing the array.
 * *poslen		number of elements in *pos_ptr
 * *endptr		scan resumption point
 *
 * Pass NULL for unwanted output parameters.  (poslen is written whenever
 * pos_ptr is non-NULL, so pass both or neither.)
 *
 * *strval points into the parser's own word buffer, which is overwritten by
 * the next call and released by close_tsvector_parser(); copy the token if
 * it must live longer.
 *
 * When is_web is set, single quotes and backslashes get no special treatment
 * and a double quote acts as a delimiter.
 *
 * Syntax errors are reported with ereport(ERROR).
 */
bool
gettoken_tsvector(TSVectorParseState state,
				  char **strval, int *lenval,
				  WordEntryPos **pos_ptr, int *poslen,
				  char **endptr)
{
	int			oldstate = 0;	/* state to resume after WAITNEXTCHAR */
	char	   *curpos = state->word;	/* next free byte in word buffer */
	int			statecode = WAITWORD;

	/*
	 * pos collects the comma-delimited list of positions that follows the
	 * actual token.
	 */
	WordEntryPos *pos = NULL;
	int			npos = 0;		/* elements of pos used */
	int			posalen = 0;	/* allocated size of pos */

	while (1)
	{
		if (statecode == WAITWORD)
		{
			if (*(state->prsbuf) == '\0')
				return false;
			else if (!state->is_web && t_iseq(state->prsbuf, '\''))
				statecode = WAITENDCMPLX;
			else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
					 (state->is_web && t_iseq(state->prsbuf, '"')))
				PRSSYNTAXERROR;
			else if (!t_isspace(state->prsbuf))
			{
				/*
				 * First character of the word: the buffer is still empty and
				 * is never smaller than 32 bytes, so no resize is needed.
				 */
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				statecode = WAITENDWORD;
			}
		}
		else if (statecode == WAITNEXTCHAR)
		{
			if (*(state->prsbuf) == '\0')
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("there is no escaped character: \"%s\"",
								state->bufstart)));
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				Assert(oldstate != 0);
				statecode = oldstate;
			}
		}
		else if (statecode == WAITENDWORD)
		{
			if (!state->is_web && t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
					 (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
					 (state->is_web && t_iseq(state->prsbuf, '"')))
			{
				/* delimiter is left unconsumed for the next call */
				RESIZEPRSBUF;
				if (curpos == state->word)
					PRSSYNTAXERROR;
				*(curpos) = '\0';
				RETURN_TOKEN;
			}
			else if (t_iseq(state->prsbuf, ':'))
			{
				if (curpos == state->word)
					PRSSYNTAXERROR;
				*(curpos) = '\0';
				if (state->oprisdelim)
					RETURN_TOKEN;
				else
					statecode = INPOSINFO;
			}
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
			}
		}
		else if (statecode == WAITENDCMPLX)
		{
			if (!state->is_web && t_iseq(state->prsbuf, '\''))
			{
				statecode = WAITCHARCMPLX;
			}
			else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDCMPLX;
			}
			else if (*(state->prsbuf) == '\0')
				PRSSYNTAXERROR;
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
			}
		}
		else if (statecode == WAITCHARCMPLX)
		{
			if (!state->is_web && t_iseq(state->prsbuf, '\''))
			{
				/* doubled quote inside a quoted word: one literal quote */
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				statecode = WAITENDCMPLX;
			}
			else
			{
				/* the previous quote closed the word */
				RESIZEPRSBUF;
				*(curpos) = '\0';
				if (curpos == state->word)
					PRSSYNTAXERROR;
				if (state->oprisdelim)
				{
					/*
					 * The character after the closing quote is deliberately
					 * not consumed; the scan resumes at it.
					 */
					RETURN_TOKEN;
				}
				else
					statecode = WAITPOSINFO;
				continue;		/* recheck current character */
			}
		}
		else if (statecode == WAITPOSINFO)
		{
			if (t_iseq(state->prsbuf, ':'))
				statecode = INPOSINFO;
			else
				RETURN_TOKEN;
		}
		else if (statecode == INPOSINFO)
		{
			if (t_isdigit(state->prsbuf))
			{
				long		posval;

				if (posalen == 0)
				{
					posalen = 4;
					pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
					npos = 0;
				}
				else if (npos + 1 >= posalen)
				{
					posalen *= 2;
					pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
				}
				npos++;

				/*
				 * Parse the whole digit run here; the remaining digits are
				 * merely skipped in WAITPOSDELIM.  Use strtol() rather than
				 * atoi(): atoi's behavior is undefined when the value does
				 * not fit in an int, which in practice could turn a huge
				 * position into zero or a negative number.  strtol() returns
				 * LONG_MAX on overflow, so LIMITPOS() sees an over-large
				 * value in every such case.
				 */
				posval = strtol(state->prsbuf, NULL, 10);
				WEP_SETPOS(pos[npos - 1], LIMITPOS(posval));
				/* we cannot get here in tsquery, so no need for 2 errmsgs */
				if (WEP_GETPOS(pos[npos - 1]) == 0)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("wrong position info in tsvector: \"%s\"",
									state->bufstart)));
				WEP_SETWEIGHT(pos[npos - 1], 0);
				statecode = WAITPOSDELIM;
			}
			else
				PRSSYNTAXERROR;
		}
		else if (statecode == WAITPOSDELIM)
		{
			/*
			 * A weight may be given only while the stored weight is still
			 * zero, hence the WEP_GETWEIGHT checks below.
			 */
			if (t_iseq(state->prsbuf, ','))
				statecode = INPOSINFO;
			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 3);
			}
			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 2);
			}
			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 1);
			}
			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 0);
			}
			else if (t_isspace(state->prsbuf) ||
					 *(state->prsbuf) == '\0')
				RETURN_TOKEN;
			else if (!t_isdigit(state->prsbuf))
				PRSSYNTAXERROR;
			/* else: a further digit of the number already parsed; skip it */
		}
		else					/* internal error */
			elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
				 statecode);

		/* get next char */
		state->prsbuf += pg_mblen(state->prsbuf);
	}
}