Adding upstream version 14.5. (refs: upstream/14.5, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
commit: 46651ce6fe013220ed397add242004d764fc0153 (patch)
tree: 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/tsvector_parser.c
parent: Initial commit. (diff)
download: postgresql-14-upstream.tar.xz
postgresql-14-upstream.zip
1 files changed, 367 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c
new file mode 100644
index 0000000..c2df409
--- /dev/null
+++ b/src/backend/utils/adt/tsvector_parser.c
@@ -0,0 +1,367 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsvector_parser.c
+ *	  Parser for tsvector
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/tsvector_parser.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+
+/*
+ * Private state of tsvector parser.  Note that tsquery also uses this code to
+ * parse its input, hence the boolean flags.  The first two flags (oprisdelim
+ * and is_tsquery) are both true or both false in current usage, but we keep
+ * them separate for clarity.  is_tsquery affects *only* the content of error
+ * messages.
+ *
+ * prsbuf and bufstart point into the caller's input string; the parser does
+ * not copy it.  'word' is owned by the parser: it is allocated by
+ * init_tsvector_parser(), doubled on demand by RESIZEPRSBUF, and freed by
+ * close_tsvector_parser().
+ *
+ * When is_web is set, gettoken_tsvector() gives no special meaning to single
+ * quote or backslash, and a double quote ends a word (it is a syntax error
+ * where a word should start).
+ *
+ * NOTE(review): the oprisdelim delimiter set is whatever ISOPERATOR() accepts
+ * (defined outside this file).  The field comment below lists '*' while the
+ * header of init_tsvector_parser() lists '&' -- confirm which one is right.
+ */
+struct TSVectorParseStateData
+{
+	char	   *prsbuf;			/* next input character */
+	char	   *bufstart;		/* whole string (used only for errors) */
+	char	   *word;			/* buffer to hold the current word */
+	int			len;			/* size in bytes allocated for 'word' */
+	int			eml;			/* max bytes per character */
+	bool		oprisdelim;		/* treat ! | * ( ) as delimiters? */
+	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
+	bool		is_web;			/* we're in websearch_to_tsquery() */
+};
+
+
+/*
+ * Initializes parser for the input string. If oprisdelim is set, the
+ * following characters are treated as delimiters in addition to whitespace:
+ * ! | & ( )
+ *
+ * 'flags' is a bitmask: P_TSV_OPR_IS_DELIM, P_TSV_IS_TSQUERY and P_TSV_IS_WEB
+ * set the oprisdelim, is_tsquery and is_web fields respectively.
+ *
+ * 'input' is not copied; the parser keeps pointers into it (both as the scan
+ * position and as the string quoted in error messages), so it has to remain
+ * valid for as long as the parser is used.
+ *
+ * Returns a palloc'd parser state that owns a palloc'd word buffer; release
+ * both with close_tsvector_parser().
+ */
+TSVectorParseState
+init_tsvector_parser(char *input, int flags)
+{
+	TSVectorParseState state;
+
+	state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
+	state->prsbuf = input;
+	state->bufstart = input;	/* remembered for error messages */
+	state->len = 32;			/* initial word buffer size; doubled on demand */
+	state->word = (char *) palloc(state->len);
+	state->eml = pg_database_encoding_max_length();
+	state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
+	state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
+	state->is_web = (flags & P_TSV_IS_WEB) != 0;
+
+	return state;
+}
+
+/*
+ * Reinitializes parser to parse 'input', instead of previous input.
+ *
+ * Only the scan position (prsbuf) is changed.  bufstart is left untouched,
+ * so syntax error messages keep quoting the string that was passed to
+ * init_tsvector_parser(); the flags and the word buffer are retained as well.
+ */
+void
+reset_tsvector_parser(TSVectorParseState state, char *input)
+{
+	state->prsbuf = input;
+}
+
+/*
+ * Shuts down a tsvector parser.
+ *
+ * Frees the word buffer and then the state struct itself; the input string
+ * is not freed.  Token pointers handed out by gettoken_tsvector() through
+ * *strval point into the word buffer, so they must not be used after this
+ * call.
+ */
+void
+close_tsvector_parser(TSVectorParseState state)
+{
+	pfree(state->word);
+	pfree(state);
+}
+
+/*
+ * increase the size of 'word' if needed to hold one more character
+ *
+ * Relies on variables named 'state' and 'curpos' being in scope where it is
+ * expanded.  The buffer is doubled when fewer than eml + 1 bytes remain past
+ * curpos, i.e. when there might not be room for one maximum-length character
+ * plus a terminating NUL.  After repalloc, curpos is recomputed from the new
+ * state->word using the saved offset.
+ */
+#define RESIZEPRSBUF \
+do { \
+	int clen = curpos - state->word; \
+	if ( clen + state->eml >= state->len ) \
+	{ \
+		state->len *= 2; \
+		state->word = (char *) repalloc(state->word, state->len); \
+		curpos = state->word + clen; \
+	} \
+} while (0)
+
+/*
+ * Fills gettoken_tsvector's output parameters, and returns true
+ *
+ * Only usable inside gettoken_tsvector: it refers to that function's
+ * parameters and to its locals curpos, pos and npos, and it executes a
+ * 'return' statement.
+ *
+ * If pos_ptr is NULL, any position array collected for this token is freed
+ * here instead of being handed to the caller.  *strval is set to the
+ * parser's own word buffer (not a copy) and *lenval to the number of bytes
+ * written into it.  *endptr receives the current scan position.
+ */
+#define RETURN_TOKEN \
+do { \
+	if (pos_ptr != NULL) \
+	{ \
+		*pos_ptr = pos; \
+		*poslen = npos; \
+	} \
+	else if (pos != NULL) \
+		pfree(pos); \
+	\
+	if (strval != NULL) \
+		*strval = state->word; \
+	if (lenval != NULL) \
+		*lenval = curpos - state->word; \
+	if (endptr != NULL) \
+		*endptr = state->prsbuf; \
+	return true; \
+} while(0)
+
+
+/*
+ * State codes used in gettoken_tsvector
+ *
+ * Every call starts in WAITWORD.  WAITNEXTCHAR is a detour taken after a
+ * backslash; the state to come back to is remembered separately.
+ */
+#define WAITWORD		1	/* before a token, skipping whitespace */
+#define WAITENDWORD		2	/* inside an unquoted word */
+#define WAITNEXTCHAR	3	/* after a backslash: copy next char as-is */
+#define WAITENDCMPLX	4	/* inside a single-quoted word */
+#define WAITPOSINFO		5	/* after a quoted word: ':' or end of token */
+#define INPOSINFO		6	/* expecting the first digit of a position */
+#define WAITPOSDELIM	7	/* after the first digit of a position */
+#define WAITCHARCMPLX	8	/* after a quote seen inside a quoted word */
+
+#define PRSSYNTAXERROR prssyntaxerror(state)
+/*
+ * Reports a syntax error (ERRCODE_SYNTAX_ERROR) whose message quotes the
+ * whole input string, state->bufstart.  is_tsquery selects whether the
+ * message names tsquery or tsvector.  PRSSYNTAXERROR above is shorthand for
+ * calling this with the variable 'state' that is in scope.
+ *
+ * NOTE(review): the call sites carry on as if this never returns, i.e. they
+ * assume ereport(ERROR) does not return to its caller -- confirm.
+ */
+static void
+prssyntaxerror(TSVectorParseState state)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 state->is_tsquery ?
+			 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
+			 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
+}
+
+
+/*
+ * Get next token from string being parsed. Returns true if successful,
+ * false if end of input string is reached.  On success, these output
+ * parameters are filled in:
+ *
+ * *strval		pointer to token
+ * *lenval		length of *strval
+ * *pos_ptr		pointer to a palloc'd array of positions and weights
+ *				associated with the token. If the caller is not interested
+ *				in the information, NULL can be supplied. Otherwise
+ *				the caller is responsible for pfreeing the array.
+ * *poslen		number of elements in *pos_ptr
+ * *endptr		scan resumption point
+ *
+ * Pass NULL for unwanted output parameters.
+ *
+ * Additional notes:
+ *
+ * - *strval points into the parser's word buffer and is NUL-terminated;
+ *   *lenval does not count the terminator.  Each call writes the buffer
+ *   from its start again, so copy the token if it must outlive the next
+ *   call.
+ * - If the token carries no position list, *pos_ptr is set to NULL and
+ *   *poslen to 0.  poslen is written whenever pos_ptr is not NULL, so pass
+ *   either both or neither.
+ * - The character that ended the token is not consumed: the parser's scan
+ *   position, and therefore *endptr, is left pointing at it.
+ * - Malformed input is reported through ereport(ERROR) with
+ *   ERRCODE_SYNTAX_ERROR.
+ *
+ * Token syntax as implemented below: a token is either an unquoted word or
+ * a single-quoted word in which a doubled quote stands for one quote.  A
+ * backslash makes the following character literal.  Quotes and backslashes
+ * lose their special meaning when is_web is set.  Unless oprisdelim is set,
+ * the word may be followed by ':' and a comma-separated list of positions,
+ * each optionally followed by a weight letter A-D in either case ('*' is
+ * treated like A).
+ *
+ * NOTE(review): only the first digit of a position triggers atoi(); the
+ * remaining digits are merely skipped, and atoi() does no overflow checking.
+ * NOTE(review): the "weight already given" test looks at the stored weight,
+ * and D is stored as 0, so a weight letter following D is not rejected;
+ * digits following a weight letter are skipped without complaint as well.
+ * Confirm whether this leniency is intended.
+ */
+bool
+gettoken_tsvector(TSVectorParseState state,
+				  char **strval, int *lenval,
+				  WordEntryPos **pos_ptr, int *poslen,
+				  char **endptr)
+{
+	int			oldstate = 0;	/* state to resume after WAITNEXTCHAR */
+	char	   *curpos = state->word;	/* write position in word buffer */
+	int			statecode = WAITWORD;
+
+	/*
+	 * pos is for collecting the comma delimited list of positions that
+	 * follows the actual token.
+	 */
+	WordEntryPos *pos = NULL;
+	int			npos = 0;		/* elements of pos used */
+	int			posalen = 0;	/* allocated size of pos */
+
+	while (1)
+	{
+		if (statecode == WAITWORD)
+		{
+			if (*(state->prsbuf) == '\0')
+				return false;	/* end of input before any token started */
+			else if (!state->is_web && t_iseq(state->prsbuf, '\''))
+				statecode = WAITENDCMPLX;	/* opening quote, not copied */
+			else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
+			{
+				statecode = WAITNEXTCHAR;
+				oldstate = WAITENDWORD;
+			}
+			else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
+					 (state->is_web && t_iseq(state->prsbuf, '"')))
+				PRSSYNTAXERROR;
+			else if (!t_isspace(state->prsbuf))
+			{
+				/* word is still empty here; presumably why no RESIZEPRSBUF */
+				COPYCHAR(curpos, state->prsbuf);
+				curpos += pg_mblen(state->prsbuf);
+				statecode = WAITENDWORD;
+			}
+		}
+		else if (statecode == WAITNEXTCHAR)
+		{
+			if (*(state->prsbuf) == '\0')
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("there is no escaped character: \"%s\"",
+								state->bufstart)));
+			else
+			{
+				RESIZEPRSBUF;
+				COPYCHAR(curpos, state->prsbuf);
+				curpos += pg_mblen(state->prsbuf);
+				Assert(oldstate != 0);	/* set by every jump into this state */
+				statecode = oldstate;
+			}
+		}
+		else if (statecode == WAITENDWORD)
+		{
+			if (!state->is_web && t_iseq(state->prsbuf, '\\'))
+			{
+				statecode = WAITNEXTCHAR;
+				oldstate = WAITENDWORD;
+			}
+			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
+					 (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
+					 (state->is_web && t_iseq(state->prsbuf, '"')))
+			{
+				RESIZEPRSBUF;
+				if (curpos == state->word)
+					PRSSYNTAXERROR;
+				*(curpos) = '\0';
+				RETURN_TOKEN;	/* delimiter itself is left unconsumed */
+			}
+			else if (t_iseq(state->prsbuf, ':'))
+			{
+				if (curpos == state->word)
+					PRSSYNTAXERROR;
+				*(curpos) = '\0';
+				if (state->oprisdelim)
+					RETURN_TOKEN;	/* ':' is left for the caller */
+				else
+					statecode = INPOSINFO;
+			}
+			else
+			{
+				RESIZEPRSBUF;
+				COPYCHAR(curpos, state->prsbuf);
+				curpos += pg_mblen(state->prsbuf);
+			}
+		}
+		else if (statecode == WAITENDCMPLX)
+		{
+			if (!state->is_web && t_iseq(state->prsbuf, '\''))
+			{
+				statecode = WAITCHARCMPLX;	/* closing or doubled quote? */
+			}
+			else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
+			{
+				statecode = WAITNEXTCHAR;
+				oldstate = WAITENDCMPLX;
+			}
+			else if (*(state->prsbuf) == '\0')
+				PRSSYNTAXERROR; /* input ended inside the quoted word */
+			else
+			{
+				RESIZEPRSBUF;
+				COPYCHAR(curpos, state->prsbuf);
+				curpos += pg_mblen(state->prsbuf);
+			}
+		}
+		else if (statecode == WAITCHARCMPLX)
+		{
+			if (!state->is_web && t_iseq(state->prsbuf, '\''))
+			{
+				/* doubled quote: store one quote and stay in the word */
+				RESIZEPRSBUF;
+				COPYCHAR(curpos, state->prsbuf);
+				curpos += pg_mblen(state->prsbuf);
+				statecode = WAITENDCMPLX;
+			}
+			else
+			{
+				/* the previous quote closed the word */
+				RESIZEPRSBUF;
+				*(curpos) = '\0';
+				if (curpos == state->word)
+					PRSSYNTAXERROR;
+				if (state->oprisdelim)
+				{
+					/* state->prsbuf+=pg_mblen(state->prsbuf); */
+					RETURN_TOKEN;
+				}
+				else
+					statecode = WAITPOSINFO;
+				continue;		/* recheck current character */
+			}
+		}
+		else if (statecode == WAITPOSINFO)
+		{
+			if (t_iseq(state->prsbuf, ':'))
+				statecode = INPOSINFO;
+			else
+				RETURN_TOKEN;	/* quoted word without a position list */
+		}
+		else if (statecode == INPOSINFO)
+		{
+			if (t_isdigit(state->prsbuf))
+			{
+				if (posalen == 0)
+				{
+					posalen = 4;
+					pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
+					npos = 0;
+				}
+				else if (npos + 1 >= posalen)	/* grows with one slot spare */
+				{
+					posalen *= 2;
+					pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
+				}
+				npos++;
+				WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
+				/* we cannot get here in tsquery, so no need for 2 errmsgs */
+				if (WEP_GETPOS(pos[npos - 1]) == 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("wrong position info in tsvector: \"%s\"",
+									state->bufstart)));
+				WEP_SETWEIGHT(pos[npos - 1], 0);	/* default until a letter */
+				statecode = WAITPOSDELIM;
+			}
+			else
+				PRSSYNTAXERROR;
+		}
+		else if (statecode == WAITPOSDELIM)
+		{
+			if (t_iseq(state->prsbuf, ','))
+				statecode = INPOSINFO;	/* another position follows */
+			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
+			{
+				if (WEP_GETWEIGHT(pos[npos - 1]))
+					PRSSYNTAXERROR;
+				WEP_SETWEIGHT(pos[npos - 1], 3);
+			}
+			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
+			{
+				if (WEP_GETWEIGHT(pos[npos - 1]))
+					PRSSYNTAXERROR;
+				WEP_SETWEIGHT(pos[npos - 1], 2);
+			}
+			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
+			{
+				if (WEP_GETWEIGHT(pos[npos - 1]))
+					PRSSYNTAXERROR;
+				WEP_SETWEIGHT(pos[npos - 1], 1);
+			}
+			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
+			{
+				if (WEP_GETWEIGHT(pos[npos - 1]))
+					PRSSYNTAXERROR;
+				WEP_SETWEIGHT(pos[npos - 1], 0);
+			}
+			else if (t_isspace(state->prsbuf) ||
+					 *(state->prsbuf) == '\0')
+				RETURN_TOKEN;
+			else if (!t_isdigit(state->prsbuf)) /* digits are just skipped */
+				PRSSYNTAXERROR;
+		}
+		else					/* internal error */
+			elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
+				 statecode);
+
+		/* get next char (advance by one possibly multibyte character) */
+		state->prsbuf += pg_mblen(state->prsbuf);
+	}
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:15:05 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:15:05 +0000
commit	46651ce6fe013220ed397add242004d764fc0153 (patch)
tree	6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/tsvector_parser.c
parent	Initial commit. (diff)
download	postgresql-14-upstream.tar.xz postgresql-14-upstream.zip