summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/adt/tsquery_cleanup.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/tsquery_cleanup.c
parentInitial commit. (diff)
downloadpostgresql-14-upstream.tar.xz
postgresql-14-upstream.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/adt/tsquery_cleanup.c')
-rw-r--r--src/backend/utils/adt/tsquery_cleanup.c444
1 files changed, 444 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c
new file mode 100644
index 0000000..82ae284
--- /dev/null
+++ b/src/backend/utils/adt/tsquery_cleanup.c
@@ -0,0 +1,444 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsquery_cleanup.c
+ * Cleanup query from NOT values and/or stopword
+ * Utility functions to correct work.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/tsquery_cleanup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "tsearch/ts_utils.h"
+
+typedef struct NODE
+{
+ struct NODE *left;
+ struct NODE *right;
+ QueryItem *valnode;
+} NODE;
+
+/*
+ * make query tree from plain view of query
+ */
+static NODE *
+maketree(QueryItem *in)
+{
+ NODE *node = (NODE *) palloc(sizeof(NODE));
+
+ /* since this function recurses, it could be driven to stack overflow. */
+ check_stack_depth();
+
+ node->valnode = in;
+ node->right = node->left = NULL;
+ if (in->type == QI_OPR)
+ {
+ node->right = maketree(in + 1);
+ if (in->qoperator.oper != OP_NOT)
+ node->left = maketree(in + in->qoperator.left);
+ }
+ return node;
+}
+
+/*
+ * Internal state for plaintree and plainnode
+ */
+typedef struct
+{
+ QueryItem *ptr;
+ int len; /* allocated size of ptr */
+ int cur; /* number of elements in ptr */
+} PLAINTREE;
+
+static void
+plainnode(PLAINTREE *state, NODE *node)
+{
+ /* since this function recurses, it could be driven to stack overflow. */
+ check_stack_depth();
+
+ if (state->cur == state->len)
+ {
+ state->len *= 2;
+ state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem));
+ }
+ memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem));
+ if (node->valnode->type == QI_VAL)
+ state->cur++;
+ else if (node->valnode->qoperator.oper == OP_NOT)
+ {
+ state->ptr[state->cur].qoperator.left = 1;
+ state->cur++;
+ plainnode(state, node->right);
+ }
+ else
+ {
+ int cur = state->cur;
+
+ state->cur++;
+ plainnode(state, node->right);
+ state->ptr[cur].qoperator.left = state->cur - cur;
+ plainnode(state, node->left);
+ }
+ pfree(node);
+}
+
+/*
+ * make plain view of tree from a NODE-tree representation
+ */
+static QueryItem *
+plaintree(NODE *root, int *len)
+{
+ PLAINTREE pl;
+
+ pl.cur = 0;
+ pl.len = 16;
+ if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
+ {
+ pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
+ plainnode(&pl, root);
+ }
+ else
+ pl.ptr = NULL;
+ *len = pl.cur;
+ return pl.ptr;
+}
+
+static void
+freetree(NODE *node)
+{
+ /* since this function recurses, it could be driven to stack overflow. */
+ check_stack_depth();
+
+ if (!node)
+ return;
+ if (node->left)
+ freetree(node->left);
+ if (node->right)
+ freetree(node->right);
+ pfree(node);
+}
+
+/*
+ * clean tree for ! operator.
+ * It's useful for debug, but in
+ * other case, such view is used with search in index.
+ * Operator ! always return TRUE
+ */
+static NODE *
+clean_NOT_intree(NODE *node)
+{
+ /* since this function recurses, it could be driven to stack overflow. */
+ check_stack_depth();
+
+ if (node->valnode->type == QI_VAL)
+ return node;
+
+ if (node->valnode->qoperator.oper == OP_NOT)
+ {
+ freetree(node);
+ return NULL;
+ }
+
+ /* operator & or | */
+ if (node->valnode->qoperator.oper == OP_OR)
+ {
+ if ((node->left = clean_NOT_intree(node->left)) == NULL ||
+ (node->right = clean_NOT_intree(node->right)) == NULL)
+ {
+ freetree(node);
+ return NULL;
+ }
+ }
+ else
+ {
+ NODE *res = node;
+
+ Assert(node->valnode->qoperator.oper == OP_AND ||
+ node->valnode->qoperator.oper == OP_PHRASE);
+
+ node->left = clean_NOT_intree(node->left);
+ node->right = clean_NOT_intree(node->right);
+ if (node->left == NULL && node->right == NULL)
+ {
+ pfree(node);
+ res = NULL;
+ }
+ else if (node->left == NULL)
+ {
+ res = node->right;
+ pfree(node);
+ }
+ else if (node->right == NULL)
+ {
+ res = node->left;
+ pfree(node);
+ }
+ return res;
+ }
+ return node;
+}
+
+QueryItem *
+clean_NOT(QueryItem *ptr, int *len)
+{
+ NODE *root = maketree(ptr);
+
+ return plaintree(clean_NOT_intree(root), len);
+}
+
+
+/*
+ * Remove QI_VALSTOP (stopword) nodes from query tree.
+ *
+ * Returns NULL if the query degenerates to nothing. Input must not be NULL.
+ *
+ * When we remove a phrase operator due to removing one or both of its
+ * arguments, we might need to adjust the distance of a parent phrase
+ * operator. For example, 'a' is a stopword, so:
+ * (b <-> a) <-> c should become b <2> c
+ * b <-> (a <-> c) should become b <2> c
+ * (b <-> (a <-> a)) <-> c should become b <3> c
+ * b <-> ((a <-> a) <-> c) should become b <3> c
+ * To handle that, we define two output parameters:
+ * ladd: amount to add to a phrase distance to the left of this node
+ * radd: amount to add to a phrase distance to the right of this node
+ * We need two outputs because we could need to bubble up adjustments to two
+ * different parent phrase operators. Consider
+ * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
+ * After we've removed the two a's and are considering the <2> node (which is
+ * now just x <2> y), we have an ladd distance of 1 that needs to propagate
+ * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
+ * propagate to the rightmost <->, so that we'll end up with
+ * w <2> ((x <2> y) <4> z)
+ * Near the bottom of the tree, we may have subtrees consisting only of
+ * stopwords. The distances of any phrase operators within such a subtree are
+ * summed and propagated to both ladd and radd, since we don't know which side
+ * of the lowest surviving phrase operator we are in. The rule is that any
+ * subtree that degenerates to NULL must return equal values of ladd and radd,
+ * and the parent node dealing with it should incorporate only one of those.
+ *
+ * Currently, we only implement this adjustment for adjacent phrase operators.
+ * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
+ * isn't ideal, but there is no way to represent the really desired semantics
+ * without some redesign of the tsquery structure. Certainly it would not be
+ * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
+ * corner case, let it go for now. But we can fix it in cases where the
+ * intervening non-phrase operator also gets removed, for example
+ * '((x <-> a) | a) <-> y' will become 'x <2> y'.
+ */
+static NODE *
+clean_stopword_intree(NODE *node, int *ladd, int *radd)
+{
+ /* since this function recurses, it could be driven to stack overflow. */
+ check_stack_depth();
+
+ /* default output parameters indicate no change in parent distance */
+ *ladd = *radd = 0;
+
+ if (node->valnode->type == QI_VAL)
+ return node;
+ else if (node->valnode->type == QI_VALSTOP)
+ {
+ pfree(node);
+ return NULL;
+ }
+
+ Assert(node->valnode->type == QI_OPR);
+
+ if (node->valnode->qoperator.oper == OP_NOT)
+ {
+ /* NOT doesn't change pattern width, so just report child distances */
+ node->right = clean_stopword_intree(node->right, ladd, radd);
+ if (!node->right)
+ {
+ freetree(node);
+ return NULL;
+ }
+ }
+ else
+ {
+ NODE *res = node;
+ bool isphrase;
+ int ndistance,
+ lladd,
+ lradd,
+ rladd,
+ rradd;
+
+ /* First, recurse */
+ node->left = clean_stopword_intree(node->left, &lladd, &lradd);
+ node->right = clean_stopword_intree(node->right, &rladd, &rradd);
+
+ /* Check if current node is OP_PHRASE, get its distance */
+ isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
+ ndistance = isphrase ? node->valnode->qoperator.distance : 0;
+
+ if (node->left == NULL && node->right == NULL)
+ {
+ /*
+ * When we collapse out a phrase node entirely, propagate its own
+ * distance into both *ladd and *radd; it is the responsibility of
+ * the parent node to count it only once. Also, for a phrase
+ * node, distances coming from children are summed and propagated
+ * up to parent (we assume lladd == lradd and rladd == rradd, else
+ * rule was broken at a lower level). But if this isn't a phrase
+ * node, take the larger of the two child distances; that
+ * corresponds to what TS_execute will do in non-stopword cases.
+ */
+ if (isphrase)
+ *ladd = *radd = lladd + ndistance + rladd;
+ else
+ *ladd = *radd = Max(lladd, rladd);
+ freetree(node);
+ return NULL;
+ }
+ else if (node->left == NULL)
+ {
+ /* Removing this operator and left subnode */
+ /* lladd and lradd are equal/redundant, don't count both */
+ if (isphrase)
+ {
+ /* operator's own distance must propagate to left */
+ *ladd = lladd + ndistance + rladd;
+ *radd = rradd;
+ }
+ else
+ {
+ /* at non-phrase op, just forget the left subnode entirely */
+ *ladd = rladd;
+ *radd = rradd;
+ }
+ res = node->right;
+ pfree(node);
+ }
+ else if (node->right == NULL)
+ {
+ /* Removing this operator and right subnode */
+ /* rladd and rradd are equal/redundant, don't count both */
+ if (isphrase)
+ {
+ /* operator's own distance must propagate to right */
+ *ladd = lladd;
+ *radd = lradd + ndistance + rradd;
+ }
+ else
+ {
+ /* at non-phrase op, just forget the right subnode entirely */
+ *ladd = lladd;
+ *radd = lradd;
+ }
+ res = node->left;
+ pfree(node);
+ }
+ else if (isphrase)
+ {
+ /* Absorb appropriate corrections at this level */
+ node->valnode->qoperator.distance += lradd + rladd;
+ /* Propagate up any unaccounted-for corrections */
+ *ladd = lladd;
+ *radd = rradd;
+ }
+ else
+ {
+ /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
+ }
+
+ return res;
+ }
+ return node;
+}
+
+/*
+ * Number of elements in query tree
+ */
+static int32
+calcstrlen(NODE *node)
+{
+ int32 size = 0;
+
+ if (node->valnode->type == QI_VAL)
+ {
+ size = node->valnode->qoperand.length + 1;
+ }
+ else
+ {
+ Assert(node->valnode->type == QI_OPR);
+
+ size = calcstrlen(node->right);
+ if (node->valnode->qoperator.oper != OP_NOT)
+ size += calcstrlen(node->left);
+ }
+
+ return size;
+}
+
+/*
+ * Remove QI_VALSTOP (stopword) nodes from TSQuery.
+ */
+TSQuery
+cleanup_tsquery_stopwords(TSQuery in)
+{
+ int32 len,
+ lenstr,
+ commonlen,
+ i;
+ NODE *root;
+ int ladd,
+ radd;
+ TSQuery out;
+ QueryItem *items;
+ char *operands;
+
+ if (in->size == 0)
+ return in;
+
+ /* eliminate stop words */
+ root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
+ if (root == NULL)
+ {
+ ereport(NOTICE,
+ (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
+ out = palloc(HDRSIZETQ);
+ out->size = 0;
+ SET_VARSIZE(out, HDRSIZETQ);
+ return out;
+ }
+
+ /*
+ * Build TSQuery from plain view
+ */
+
+ lenstr = calcstrlen(root);
+ items = plaintree(root, &len);
+ commonlen = COMPUTESIZE(len, lenstr);
+
+ out = palloc(commonlen);
+ SET_VARSIZE(out, commonlen);
+ out->size = len;
+
+ memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
+
+ items = GETQUERY(out);
+ operands = GETOPERAND(out);
+ for (i = 0; i < out->size; i++)
+ {
+ QueryOperand *op = (QueryOperand *) &items[i];
+
+ if (op->type != QI_VAL)
+ continue;
+
+ memcpy(operands, GETOPERAND(in) + op->distance, op->length);
+ operands[op->length] = '\0';
+ op->distance = operands - GETOPERAND(out);
+ operands += op->length + 1;
+ }
+
+ return out;
+}