summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/adt/tsvector.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/tsvector.c')
-rw-r--r--src/backend/utils/adt/tsvector.c558
1 files changed, 558 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
new file mode 100644
index 0000000..dff0bfe
--- /dev/null
+++ b/src/backend/utils/adt/tsvector.c
@@ -0,0 +1,558 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsvector.c
+ * I/O functions for tsvector
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/tsvector.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "libpq/pqformat.h"
+#include "nodes/miscnodes.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "varatt.h"
+
+typedef struct
+{
+ WordEntry entry; /* must be first! */
+ WordEntryPos *pos;
+ int poslen; /* number of elements in pos */
+} WordEntryIN;
+
+
+/* Compare two WordEntryPos values for qsort */
+int
+compareWordEntryPos(const void *a, const void *b)
+{
+ int apos = WEP_GETPOS(*(const WordEntryPos *) a);
+ int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
+
+ if (apos == bpos)
+ return 0;
+ return (apos > bpos) ? 1 : -1;
+}
+
+/*
+ * Removes duplicate pos entries. If there's two entries with same pos but
+ * different weight, the higher weight is retained, so we can't use
+ * qunique here.
+ *
+ * Returns new length.
+ */
+static int
+uniquePos(WordEntryPos *a, int l)
+{
+ WordEntryPos *ptr,
+ *res;
+
+ if (l <= 1)
+ return l;
+
+ qsort(a, l, sizeof(WordEntryPos), compareWordEntryPos);
+
+ res = a;
+ ptr = a + 1;
+ while (ptr - a < l)
+ {
+ if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
+ {
+ res++;
+ *res = *ptr;
+ if (res - a >= MAXNUMPOS - 1 ||
+ WEP_GETPOS(*res) == MAXENTRYPOS - 1)
+ break;
+ }
+ else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
+ WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
+ ptr++;
+ }
+
+ return res + 1 - a;
+}
+
+/* Compare two WordEntryIN values for qsort */
+static int
+compareentry(const void *va, const void *vb, void *arg)
+{
+ const WordEntryIN *a = (const WordEntryIN *) va;
+ const WordEntryIN *b = (const WordEntryIN *) vb;
+ char *BufferStr = (char *) arg;
+
+ return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
+ &BufferStr[b->entry.pos], b->entry.len,
+ false);
+}
+
+/*
+ * Sort an array of WordEntryIN, remove duplicates.
+ * *outbuflen receives the amount of space needed for strings and positions.
+ */
+static int
+uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
+{
+ int buflen;
+ WordEntryIN *ptr,
+ *res;
+
+ Assert(l >= 1);
+
+ if (l > 1)
+ qsort_arg(a, l, sizeof(WordEntryIN), compareentry, buf);
+
+ buflen = 0;
+ res = a;
+ ptr = a + 1;
+ while (ptr - a < l)
+ {
+ if (!(ptr->entry.len == res->entry.len &&
+ strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
+ res->entry.len) == 0))
+ {
+ /* done accumulating data into *res, count space needed */
+ buflen += res->entry.len;
+ if (res->entry.haspos)
+ {
+ res->poslen = uniquePos(res->pos, res->poslen);
+ buflen = SHORTALIGN(buflen);
+ buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
+ res++;
+ if (res != ptr)
+ memcpy(res, ptr, sizeof(WordEntryIN));
+ }
+ else if (ptr->entry.haspos)
+ {
+ if (res->entry.haspos)
+ {
+ /* append ptr's positions to res's positions */
+ int newlen = ptr->poslen + res->poslen;
+
+ res->pos = (WordEntryPos *)
+ repalloc(res->pos, newlen * sizeof(WordEntryPos));
+ memcpy(&res->pos[res->poslen], ptr->pos,
+ ptr->poslen * sizeof(WordEntryPos));
+ res->poslen = newlen;
+ pfree(ptr->pos);
+ }
+ else
+ {
+ /* just give ptr's positions to pos */
+ res->entry.haspos = 1;
+ res->pos = ptr->pos;
+ res->poslen = ptr->poslen;
+ }
+ }
+ ptr++;
+ }
+
+ /* count space needed for last item */
+ buflen += res->entry.len;
+ if (res->entry.haspos)
+ {
+ res->poslen = uniquePos(res->pos, res->poslen);
+ buflen = SHORTALIGN(buflen);
+ buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
+
+ *outbuflen = buflen;
+ return res + 1 - a;
+}
+
+static int
+WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
+{
+ return compareentry(a, b, buf);
+}
+
+
+Datum
+tsvectorin(PG_FUNCTION_ARGS)
+{
+ char *buf = PG_GETARG_CSTRING(0);
+ Node *escontext = fcinfo->context;
+ TSVectorParseState state;
+ WordEntryIN *arr;
+ int totallen;
+ int arrlen; /* allocated size of arr */
+ WordEntry *inarr;
+ int len = 0;
+ TSVector in;
+ int i;
+ char *token;
+ int toklen;
+ WordEntryPos *pos;
+ int poslen;
+ char *strbuf;
+ int stroff;
+
+ /*
+ * Tokens are appended to tmpbuf, cur is a pointer to the end of used
+ * space in tmpbuf.
+ */
+ char *tmpbuf;
+ char *cur;
+ int buflen = 256; /* allocated size of tmpbuf */
+
+ state = init_tsvector_parser(buf, 0, escontext);
+
+ arrlen = 64;
+ arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
+ cur = tmpbuf = (char *) palloc(buflen);
+
+ while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
+ {
+ if (toklen >= MAXSTRLEN)
+ ereturn(escontext, (Datum) 0,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("word is too long (%ld bytes, max %ld bytes)",
+ (long) toklen,
+ (long) (MAXSTRLEN - 1))));
+
+ if (cur - tmpbuf > MAXSTRPOS)
+ ereturn(escontext, (Datum) 0,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
+ (long) (cur - tmpbuf), (long) MAXSTRPOS)));
+
+ /*
+ * Enlarge buffers if needed
+ */
+ if (len >= arrlen)
+ {
+ arrlen *= 2;
+ arr = (WordEntryIN *)
+ repalloc(arr, sizeof(WordEntryIN) * arrlen);
+ }
+ while ((cur - tmpbuf) + toklen >= buflen)
+ {
+ int dist = cur - tmpbuf;
+
+ buflen *= 2;
+ tmpbuf = (char *) repalloc(tmpbuf, buflen);
+ cur = tmpbuf + dist;
+ }
+ arr[len].entry.len = toklen;
+ arr[len].entry.pos = cur - tmpbuf;
+ memcpy(cur, token, toklen);
+ cur += toklen;
+
+ if (poslen != 0)
+ {
+ arr[len].entry.haspos = 1;
+ arr[len].pos = pos;
+ arr[len].poslen = poslen;
+ }
+ else
+ {
+ arr[len].entry.haspos = 0;
+ arr[len].pos = NULL;
+ arr[len].poslen = 0;
+ }
+ len++;
+ }
+
+ close_tsvector_parser(state);
+
+ /* Did gettoken_tsvector fail? */
+ if (SOFT_ERROR_OCCURRED(escontext))
+ PG_RETURN_NULL();
+
+ if (len > 0)
+ len = uniqueentry(arr, len, tmpbuf, &buflen);
+ else
+ buflen = 0;
+
+ if (buflen > MAXSTRPOS)
+ ereturn(escontext, (Datum) 0,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
+
+ totallen = CALCDATASIZE(len, buflen);
+ in = (TSVector) palloc0(totallen);
+ SET_VARSIZE(in, totallen);
+ in->size = len;
+ inarr = ARRPTR(in);
+ strbuf = STRPTR(in);
+ stroff = 0;
+ for (i = 0; i < len; i++)
+ {
+ memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
+ arr[i].entry.pos = stroff;
+ stroff += arr[i].entry.len;
+ if (arr[i].entry.haspos)
+ {
+ /* This should be unreachable because of MAXNUMPOS restrictions */
+ if (arr[i].poslen > 0xFFFF)
+ elog(ERROR, "positions array too long");
+
+ /* Copy number of positions */
+ stroff = SHORTALIGN(stroff);
+ *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
+ stroff += sizeof(uint16);
+
+ /* Copy positions */
+ memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
+ stroff += arr[i].poslen * sizeof(WordEntryPos);
+
+ pfree(arr[i].pos);
+ }
+ inarr[i] = arr[i].entry;
+ }
+
+ Assert((strbuf + stroff - (char *) in) == totallen);
+
+ PG_RETURN_TSVECTOR(in);
+}
+
+Datum
+tsvectorout(PG_FUNCTION_ARGS)
+{
+ TSVector out = PG_GETARG_TSVECTOR(0);
+ char *outbuf;
+ int32 i,
+ lenbuf = 0,
+ pp;
+ WordEntry *ptr = ARRPTR(out);
+ char *curbegin,
+ *curin,
+ *curout;
+
+ lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
+ for (i = 0; i < out->size; i++)
+ {
+ lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
+ if (ptr[i].haspos)
+ lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
+ }
+
+ curout = outbuf = (char *) palloc(lenbuf);
+ for (i = 0; i < out->size; i++)
+ {
+ curbegin = curin = STRPTR(out) + ptr->pos;
+ if (i != 0)
+ *curout++ = ' ';
+ *curout++ = '\'';
+ while (curin - curbegin < ptr->len)
+ {
+ int len = pg_mblen(curin);
+
+ if (t_iseq(curin, '\''))
+ *curout++ = '\'';
+ else if (t_iseq(curin, '\\'))
+ *curout++ = '\\';
+
+ while (len--)
+ *curout++ = *curin++;
+ }
+
+ *curout++ = '\'';
+ if ((pp = POSDATALEN(out, ptr)) != 0)
+ {
+ WordEntryPos *wptr;
+
+ *curout++ = ':';
+ wptr = POSDATAPTR(out, ptr);
+ while (pp)
+ {
+ curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
+ switch (WEP_GETWEIGHT(*wptr))
+ {
+ case 3:
+ *curout++ = 'A';
+ break;
+ case 2:
+ *curout++ = 'B';
+ break;
+ case 1:
+ *curout++ = 'C';
+ break;
+ case 0:
+ default:
+ break;
+ }
+
+ if (pp > 1)
+ *curout++ = ',';
+ pp--;
+ wptr++;
+ }
+ }
+ ptr++;
+ }
+
+ *curout = '\0';
+ PG_FREE_IF_COPY(out, 0);
+ PG_RETURN_CSTRING(outbuf);
+}
+
+/*
+ * Binary Input / Output functions. The binary format is as follows:
+ *
+ * uint32 number of lexemes
+ *
+ * for each lexeme:
+ * lexeme text in client encoding, null-terminated
+ * uint16 number of positions
+ * for each position:
+ * uint16 WordEntryPos
+ */
+
+Datum
+tsvectorsend(PG_FUNCTION_ARGS)
+{
+ TSVector vec = PG_GETARG_TSVECTOR(0);
+ StringInfoData buf;
+ int i,
+ j;
+ WordEntry *weptr = ARRPTR(vec);
+
+ pq_begintypsend(&buf);
+
+ pq_sendint32(&buf, vec->size);
+ for (i = 0; i < vec->size; i++)
+ {
+ uint16 npos;
+
+ /*
+ * the strings in the TSVector array are not null-terminated, so we
+ * have to send the null-terminator separately
+ */
+ pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
+ pq_sendbyte(&buf, '\0');
+
+ npos = POSDATALEN(vec, weptr);
+ pq_sendint16(&buf, npos);
+
+ if (npos > 0)
+ {
+ WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
+
+ for (j = 0; j < npos; j++)
+ pq_sendint16(&buf, wepptr[j]);
+ }
+ weptr++;
+ }
+
+ PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
+}
+
+Datum
+tsvectorrecv(PG_FUNCTION_ARGS)
+{
+ StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
+ TSVector vec;
+ int i;
+ int32 nentries;
+ int datalen; /* number of bytes used in the variable size
+ * area after fixed size TSVector header and
+ * WordEntries */
+ Size hdrlen;
+ Size len; /* allocated size of vec */
+ bool needSort = false;
+
+ nentries = pq_getmsgint(buf, sizeof(int32));
+ if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
+ elog(ERROR, "invalid size of tsvector");
+
+ hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
+
+ len = hdrlen * 2; /* times two to make room for lexemes */
+ vec = (TSVector) palloc0(len);
+ vec->size = nentries;
+
+ datalen = 0;
+ for (i = 0; i < nentries; i++)
+ {
+ const char *lexeme;
+ uint16 npos;
+ size_t lex_len;
+
+ lexeme = pq_getmsgstring(buf);
+ npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
+
+ /* sanity checks */
+
+ lex_len = strlen(lexeme);
+ if (lex_len > MAXSTRLEN)
+ elog(ERROR, "invalid tsvector: lexeme too long");
+
+ if (datalen > MAXSTRPOS)
+ elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
+
+ if (npos > MAXNUMPOS)
+ elog(ERROR, "unexpected number of tsvector positions");
+
+ /*
+ * Looks valid. Fill the WordEntry struct, and copy lexeme.
+ *
+ * But make sure the buffer is large enough first.
+ */
+ while (hdrlen + SHORTALIGN(datalen + lex_len) +
+ sizeof(uint16) + npos * sizeof(WordEntryPos) >= len)
+ {
+ len *= 2;
+ vec = (TSVector) repalloc(vec, len);
+ }
+
+ vec->entries[i].haspos = (npos > 0) ? 1 : 0;
+ vec->entries[i].len = lex_len;
+ vec->entries[i].pos = datalen;
+
+ memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
+
+ datalen += lex_len;
+
+ if (i > 0 && WordEntryCMP(&vec->entries[i],
+ &vec->entries[i - 1],
+ STRPTR(vec)) <= 0)
+ needSort = true;
+
+ /* Receive positions */
+ if (npos > 0)
+ {
+ uint16 j;
+ WordEntryPos *wepptr;
+
+ /*
+ * Pad to 2-byte alignment if necessary. Though we used palloc0
+ * for the initial allocation, subsequent repalloc'd memory areas
+ * are not initialized to zero.
+ */
+ if (datalen != SHORTALIGN(datalen))
+ {
+ *(STRPTR(vec) + datalen) = '\0';
+ datalen = SHORTALIGN(datalen);
+ }
+
+ memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
+
+ wepptr = POSDATAPTR(vec, &vec->entries[i]);
+ for (j = 0; j < npos; j++)
+ {
+ wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
+ if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
+ elog(ERROR, "position information is misordered");
+ }
+
+ datalen += sizeof(uint16) + npos * sizeof(WordEntryPos);
+ }
+ }
+
+ SET_VARSIZE(vec, hdrlen + datalen);
+
+ if (needSort)
+ qsort_arg(ARRPTR(vec), vec->size, sizeof(WordEntry),
+ compareentry, STRPTR(vec));
+
+ PG_RETURN_TSVECTOR(vec);
+}