1 files changed, 558 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
new file mode 100644
index 0000000..dff0bfe
--- /dev/null
+++ b/src/backend/utils/adt/tsvector.c
@@ -0,0 +1,558 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsvector.c
+ *	  I/O functions for tsvector
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/tsvector.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "libpq/pqformat.h"
+#include "nodes/miscnodes.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "varatt.h"
+
+typedef struct
+{
+	WordEntry	entry;			/* must be first! */
+	WordEntryPos *pos;
+	int			poslen;			/* number of elements in pos */
+} WordEntryIN;
+
+
+/* Compare two WordEntryPos values for qsort */
+int
+compareWordEntryPos(const void *a, const void *b)
+{
+	int			apos = WEP_GETPOS(*(const WordEntryPos *) a);
+	int			bpos = WEP_GETPOS(*(const WordEntryPos *) b);
+
+	if (apos == bpos)
+		return 0;
+	return (apos > bpos) ? 1 : -1;
+}
+
+/*
+ * Removes duplicate pos entries. If there's two entries with same pos but
+ * different weight, the higher weight is retained, so we can't use
+ * qunique here.
+ *
+ * Returns new length.
+ */
+static int
+uniquePos(WordEntryPos *a, int l)
+{
+	WordEntryPos *ptr,
+			   *res;
+
+	if (l <= 1)
+		return l;
+
+	qsort(a, l, sizeof(WordEntryPos), compareWordEntryPos);
+
+	res = a;
+	ptr = a + 1;
+	while (ptr - a < l)
+	{
+		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
+		{
+			res++;
+			*res = *ptr;
+			if (res - a >= MAXNUMPOS - 1 ||
+				WEP_GETPOS(*res) == MAXENTRYPOS - 1)
+				break;
+		}
+		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
+			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
+		ptr++;
+	}
+
+	return res + 1 - a;
+}
+
+/* Compare two WordEntryIN values for qsort */
+static int
+compareentry(const void *va, const void *vb, void *arg)
+{
+	const WordEntryIN *a = (const WordEntryIN *) va;
+	const WordEntryIN *b = (const WordEntryIN *) vb;
+	char	   *BufferStr = (char *) arg;
+
+	return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
+						   &BufferStr[b->entry.pos], b->entry.len,
+						   false);
+}
+
+/*
+ * Sort an array of WordEntryIN, remove duplicates.
+ * *outbuflen receives the amount of space needed for strings and positions.
+ */
+static int
+uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
+{
+	int			buflen;
+	WordEntryIN *ptr,
+			   *res;
+
+	Assert(l >= 1);
+
+	if (l > 1)
+		qsort_arg(a, l, sizeof(WordEntryIN), compareentry, buf);
+
+	buflen = 0;
+	res = a;
+	ptr = a + 1;
+	while (ptr - a < l)
+	{
+		if (!(ptr->entry.len == res->entry.len &&
+			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
+					  res->entry.len) == 0))
+		{
+			/* done accumulating data into *res, count space needed */
+			buflen += res->entry.len;
+			if (res->entry.haspos)
+			{
+				res->poslen = uniquePos(res->pos, res->poslen);
+				buflen = SHORTALIGN(buflen);
+				buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+			}
+			res++;
+			if (res != ptr)
+				memcpy(res, ptr, sizeof(WordEntryIN));
+		}
+		else if (ptr->entry.haspos)
+		{
+			if (res->entry.haspos)
+			{
+				/* append ptr's positions to res's positions */
+				int			newlen = ptr->poslen + res->poslen;
+
+				res->pos = (WordEntryPos *)
+					repalloc(res->pos, newlen * sizeof(WordEntryPos));
+				memcpy(&res->pos[res->poslen], ptr->pos,
+					   ptr->poslen * sizeof(WordEntryPos));
+				res->poslen = newlen;
+				pfree(ptr->pos);
+			}
+			else
+			{
+				/* just give ptr's positions to pos */
+				res->entry.haspos = 1;
+				res->pos = ptr->pos;
+				res->poslen = ptr->poslen;
+			}
+		}
+		ptr++;
+	}
+
+	/* count space needed for last item */
+	buflen += res->entry.len;
+	if (res->entry.haspos)
+	{
+		res->poslen = uniquePos(res->pos, res->poslen);
+		buflen = SHORTALIGN(buflen);
+		buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+	}
+
+	*outbuflen = buflen;
+	return res + 1 - a;
+}
+
+static int
+WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
+{
+	return compareentry(a, b, buf);
+}
+
+
+Datum
+tsvectorin(PG_FUNCTION_ARGS)
+{
+	char	   *buf = PG_GETARG_CSTRING(0);
+	Node	   *escontext = fcinfo->context;
+	TSVectorParseState state;
+	WordEntryIN *arr;
+	int			totallen;
+	int			arrlen;			/* allocated size of arr */
+	WordEntry  *inarr;
+	int			len = 0;
+	TSVector	in;
+	int			i;
+	char	   *token;
+	int			toklen;
+	WordEntryPos *pos;
+	int			poslen;
+	char	   *strbuf;
+	int			stroff;
+
+	/*
+	 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
+	 * space in tmpbuf.
+	 */
+	char	   *tmpbuf;
+	char	   *cur;
+	int			buflen = 256;	/* allocated size of tmpbuf */
+
+	state = init_tsvector_parser(buf, 0, escontext);
+
+	arrlen = 64;
+	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
+	cur = tmpbuf = (char *) palloc(buflen);
+
+	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
+	{
+		if (toklen >= MAXSTRLEN)
+			ereturn(escontext, (Datum) 0,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("word is too long (%ld bytes, max %ld bytes)",
+							(long) toklen,
+							(long) (MAXSTRLEN - 1))));
+
+		if (cur - tmpbuf > MAXSTRPOS)
+			ereturn(escontext, (Datum) 0,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
+							(long) (cur - tmpbuf), (long) MAXSTRPOS)));
+
+		/*
+		 * Enlarge buffers if needed
+		 */
+		if (len >= arrlen)
+		{
+			arrlen *= 2;
+			arr = (WordEntryIN *)
+				repalloc(arr, sizeof(WordEntryIN) * arrlen);
+		}
+		while ((cur - tmpbuf) + toklen >= buflen)
+		{
+			int			dist = cur - tmpbuf;
+
+			buflen *= 2;
+			tmpbuf = (char *) repalloc(tmpbuf, buflen);
+			cur = tmpbuf + dist;
+		}
+		arr[len].entry.len = toklen;
+		arr[len].entry.pos = cur - tmpbuf;
+		memcpy(cur, token, toklen);
+		cur += toklen;
+
+		if (poslen != 0)
+		{
+			arr[len].entry.haspos = 1;
+			arr[len].pos = pos;
+			arr[len].poslen = poslen;
+		}
+		else
+		{
+			arr[len].entry.haspos = 0;
+			arr[len].pos = NULL;
+			arr[len].poslen = 0;
+		}
+		len++;
+	}
+
+	close_tsvector_parser(state);
+
+	/* Did gettoken_tsvector fail? */
+	if (SOFT_ERROR_OCCURRED(escontext))
+		PG_RETURN_NULL();
+
+	if (len > 0)
+		len = uniqueentry(arr, len, tmpbuf, &buflen);
+	else
+		buflen = 0;
+
+	if (buflen > MAXSTRPOS)
+		ereturn(escontext, (Datum) 0,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
+
+	totallen = CALCDATASIZE(len, buflen);
+	in = (TSVector) palloc0(totallen);
+	SET_VARSIZE(in, totallen);
+	in->size = len;
+	inarr = ARRPTR(in);
+	strbuf = STRPTR(in);
+	stroff = 0;
+	for (i = 0; i < len; i++)
+	{
+		memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
+		arr[i].entry.pos = stroff;
+		stroff += arr[i].entry.len;
+		if (arr[i].entry.haspos)
+		{
+			/* This should be unreachable because of MAXNUMPOS restrictions */
+			if (arr[i].poslen > 0xFFFF)
+				elog(ERROR, "positions array too long");
+
+			/* Copy number of positions */
+			stroff = SHORTALIGN(stroff);
+			*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
+			stroff += sizeof(uint16);
+
+			/* Copy positions */
+			memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
+			stroff += arr[i].poslen * sizeof(WordEntryPos);
+
+			pfree(arr[i].pos);
+		}
+		inarr[i] = arr[i].entry;
+	}
+
+	Assert((strbuf + stroff - (char *) in) == totallen);
+
+	PG_RETURN_TSVECTOR(in);
+}
+
+Datum
+tsvectorout(PG_FUNCTION_ARGS)
+{
+	TSVector	out = PG_GETARG_TSVECTOR(0);
+	char	   *outbuf;
+	int32		i,
+				lenbuf = 0,
+				pp;
+	WordEntry  *ptr = ARRPTR(out);
+	char	   *curbegin,
+			   *curin,
+			   *curout;
+
+	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
+	for (i = 0; i < out->size; i++)
+	{
+		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
+		if (ptr[i].haspos)
+			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
+	}
+
+	curout = outbuf = (char *) palloc(lenbuf);
+	for (i = 0; i < out->size; i++)
+	{
+		curbegin = curin = STRPTR(out) + ptr->pos;
+		if (i != 0)
+			*curout++ = ' ';
+		*curout++ = '\'';
+		while (curin - curbegin < ptr->len)
+		{
+			int			len = pg_mblen(curin);
+
+			if (t_iseq(curin, '\''))
+				*curout++ = '\'';
+			else if (t_iseq(curin, '\\'))
+				*curout++ = '\\';
+
+			while (len--)
+				*curout++ = *curin++;
+		}
+
+		*curout++ = '\'';
+		if ((pp = POSDATALEN(out, ptr)) != 0)
+		{
+			WordEntryPos *wptr;
+
+			*curout++ = ':';
+			wptr = POSDATAPTR(out, ptr);
+			while (pp)
+			{
+				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
+				switch (WEP_GETWEIGHT(*wptr))
+				{
+					case 3:
+						*curout++ = 'A';
+						break;
+					case 2:
+						*curout++ = 'B';
+						break;
+					case 1:
+						*curout++ = 'C';
+						break;
+					case 0:
+					default:
+						break;
+				}
+
+				if (pp > 1)
+					*curout++ = ',';
+				pp--;
+				wptr++;
+			}
+		}
+		ptr++;
+	}
+
+	*curout = '\0';
+	PG_FREE_IF_COPY(out, 0);
+	PG_RETURN_CSTRING(outbuf);
+}
+
+/*
+ * Binary Input / Output functions. The binary format is as follows:
+ *
+ * uint32	number of lexemes
+ *
+ * for each lexeme:
+ *		lexeme text in client encoding, null-terminated
+ *		uint16	number of positions
+ *		for each position:
+ *			uint16 WordEntryPos
+ */
+
+Datum
+tsvectorsend(PG_FUNCTION_ARGS)
+{
+	TSVector	vec = PG_GETARG_TSVECTOR(0);
+	StringInfoData buf;
+	int			i,
+				j;
+	WordEntry  *weptr = ARRPTR(vec);
+
+	pq_begintypsend(&buf);
+
+	pq_sendint32(&buf, vec->size);
+	for (i = 0; i < vec->size; i++)
+	{
+		uint16		npos;
+
+		/*
+		 * the strings in the TSVector array are not null-terminated, so we
+		 * have to send the null-terminator separately
+		 */
+		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
+		pq_sendbyte(&buf, '\0');
+
+		npos = POSDATALEN(vec, weptr);
+		pq_sendint16(&buf, npos);
+
+		if (npos > 0)
+		{
+			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
+
+			for (j = 0; j < npos; j++)
+				pq_sendint16(&buf, wepptr[j]);
+		}
+		weptr++;
+	}
+
+	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
+}
+
+Datum
+tsvectorrecv(PG_FUNCTION_ARGS)
+{
+	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
+	TSVector	vec;
+	int			i;
+	int32		nentries;
+	int			datalen;		/* number of bytes used in the variable size
+								 * area after fixed size TSVector header and
+								 * WordEntries */
+	Size		hdrlen;
+	Size		len;			/* allocated size of vec */
+	bool		needSort = false;
+
+	nentries = pq_getmsgint(buf, sizeof(int32));
+	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
+		elog(ERROR, "invalid size of tsvector");
+
+	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
+
+	len = hdrlen * 2;			/* times two to make room for lexemes */
+	vec = (TSVector) palloc0(len);
+	vec->size = nentries;
+
+	datalen = 0;
+	for (i = 0; i < nentries; i++)
+	{
+		const char *lexeme;
+		uint16		npos;
+		size_t		lex_len;
+
+		lexeme = pq_getmsgstring(buf);
+		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
+
+		/* sanity checks */
+
+		lex_len = strlen(lexeme);
+		if (lex_len > MAXSTRLEN)
+			elog(ERROR, "invalid tsvector: lexeme too long");
+
+		if (datalen > MAXSTRPOS)
+			elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
+
+		if (npos > MAXNUMPOS)
+			elog(ERROR, "unexpected number of tsvector positions");
+
+		/*
+		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
+		 *
+		 * But make sure the buffer is large enough first.
+		 */
+		while (hdrlen + SHORTALIGN(datalen + lex_len) +
+			   sizeof(uint16) + npos * sizeof(WordEntryPos) >= len)
+		{
+			len *= 2;
+			vec = (TSVector) repalloc(vec, len);
+		}
+
+		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
+		vec->entries[i].len = lex_len;
+		vec->entries[i].pos = datalen;
+
+		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
+
+		datalen += lex_len;
+
+		if (i > 0 && WordEntryCMP(&vec->entries[i],
+								  &vec->entries[i - 1],
+								  STRPTR(vec)) <= 0)
+			needSort = true;
+
+		/* Receive positions */
+		if (npos > 0)
+		{
+			uint16		j;
+			WordEntryPos *wepptr;
+
+			/*
+			 * Pad to 2-byte alignment if necessary. Though we used palloc0
+			 * for the initial allocation, subsequent repalloc'd memory areas
+			 * are not initialized to zero.
+			 */
+			if (datalen != SHORTALIGN(datalen))
+			{
+				*(STRPTR(vec) + datalen) = '\0';
+				datalen = SHORTALIGN(datalen);
+			}
+
+			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
+
+			wepptr = POSDATAPTR(vec, &vec->entries[i]);
+			for (j = 0; j < npos; j++)
+			{
+				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
+				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
+					elog(ERROR, "position information is misordered");
+			}
+
+			datalen += sizeof(uint16) + npos * sizeof(WordEntryPos);
+		}
+	}
+
+	SET_VARSIZE(vec, hdrlen + datalen);
+
+	if (needSort)
+		qsort_arg(ARRPTR(vec), vec->size, sizeof(WordEntry),
+				  compareentry, STRPTR(vec));
+
+	PG_RETURN_TSVECTOR(vec);
+}