summaryrefslogtreecommitdiffstats
path: root/contrib/pg_trgm/trgm_gin.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /contrib/pg_trgm/trgm_gin.c
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/pg_trgm/trgm_gin.c')
-rw-r--r--contrib/pg_trgm/trgm_gin.c359
1 files changed, 359 insertions, 0 deletions
diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c
new file mode 100644
index 0000000..32fafef
--- /dev/null
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -0,0 +1,359 @@
+/*
+ * contrib/pg_trgm/trgm_gin.c
+ */
+#include "postgres.h"
+
+#include "access/gin.h"
+#include "access/stratnum.h"
+#include "fmgr.h"
+#include "trgm.h"
+
+PG_FUNCTION_INFO_V1(gin_extract_trgm);
+PG_FUNCTION_INFO_V1(gin_extract_value_trgm);
+PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
+PG_FUNCTION_INFO_V1(gin_trgm_consistent);
+PG_FUNCTION_INFO_V1(gin_trgm_triconsistent);
+
+/*
+ * This function can only be called if a pre-9.1 version of the GIN operator
+ * class definition is present in the catalogs (probably as a consequence
+ * of upgrade-in-place). Cope.
+ */
+Datum
+gin_extract_trgm(PG_FUNCTION_ARGS)
+{
+ if (PG_NARGS() == 3)
+ return gin_extract_value_trgm(fcinfo);
+ if (PG_NARGS() == 7)
+ return gin_extract_query_trgm(fcinfo);
+ elog(ERROR, "unexpected number of arguments to gin_extract_trgm");
+ PG_RETURN_NULL();
+}
+
+Datum
+gin_extract_value_trgm(PG_FUNCTION_ARGS)
+{
+ text *val = (text *) PG_GETARG_TEXT_PP(0);
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ Datum *entries = NULL;
+ TRGM *trg;
+ int32 trglen;
+
+ *nentries = 0;
+
+ trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+ trglen = ARRNELEM(trg);
+
+ if (trglen > 0)
+ {
+ trgm *ptr;
+ int32 i;
+
+ *nentries = trglen;
+ entries = (Datum *) palloc(sizeof(Datum) * trglen);
+
+ ptr = GETARR(trg);
+ for (i = 0; i < trglen; i++)
+ {
+ int32 item = trgm2int(ptr);
+
+ entries[i] = Int32GetDatum(item);
+ ptr++;
+ }
+ }
+
+ PG_RETURN_POINTER(entries);
+}
+
+Datum
+gin_extract_query_trgm(PG_FUNCTION_ARGS)
+{
+ text *val = (text *) PG_GETARG_TEXT_PP(0);
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = PG_GETARG_UINT16(2);
+
+ /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
+ Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4);
+
+ /* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
+ int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
+ Datum *entries = NULL;
+ TRGM *trg;
+ int32 trglen;
+ trgm *ptr;
+ TrgmPackedGraph *graph;
+ int32 i;
+
+ switch (strategy)
+ {
+ case SimilarityStrategyNumber:
+ case WordSimilarityStrategyNumber:
+ case StrictWordSimilarityStrategyNumber:
+ case EqualStrategyNumber:
+ trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
+ break;
+ case ILikeStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case LikeStrategyNumber:
+
+ /*
+ * For wildcard search we extract all the trigrams that every
+ * potentially-matching string must include.
+ */
+ trg = generate_wildcard_trgm(VARDATA_ANY(val),
+ VARSIZE_ANY_EXHDR(val));
+ break;
+ case RegExpICaseStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case RegExpStrategyNumber:
+ trg = createTrgmNFA(val, PG_GET_COLLATION(),
+ &graph, CurrentMemoryContext);
+ if (trg && ARRNELEM(trg) > 0)
+ {
+ /*
+ * Successful regex processing: store NFA-like graph as
+ * extra_data. GIN API requires an array of nentries
+ * Pointers, but we just put the same value in each element.
+ */
+ trglen = ARRNELEM(trg);
+ *extra_data = (Pointer *) palloc(sizeof(Pointer) * trglen);
+ for (i = 0; i < trglen; i++)
+ (*extra_data)[i] = (Pointer) graph;
+ }
+ else
+ {
+ /* No result: have to do full index scan. */
+ *nentries = 0;
+ *searchMode = GIN_SEARCH_MODE_ALL;
+ PG_RETURN_POINTER(entries);
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ trg = NULL; /* keep compiler quiet */
+ break;
+ }
+
+ trglen = ARRNELEM(trg);
+ *nentries = trglen;
+
+ if (trglen > 0)
+ {
+ entries = (Datum *) palloc(sizeof(Datum) * trglen);
+ ptr = GETARR(trg);
+ for (i = 0; i < trglen; i++)
+ {
+ int32 item = trgm2int(ptr);
+
+ entries[i] = Int32GetDatum(item);
+ ptr++;
+ }
+ }
+
+ /*
+ * If no trigram was extracted then we have to scan all the index.
+ */
+ if (trglen == 0)
+ *searchMode = GIN_SEARCH_MODE_ALL;
+
+ PG_RETURN_POINTER(entries);
+}
+
+Datum
+gin_trgm_consistent(PG_FUNCTION_ARGS)
+{
+ bool *check = (bool *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+
+ /* text *query = PG_GETARG_TEXT_PP(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+ Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
+ bool *recheck = (bool *) PG_GETARG_POINTER(5);
+ bool res;
+ int32 i,
+ ntrue;
+ double nlimit;
+
+ /* All cases served by this function are inexact */
+ *recheck = true;
+
+ switch (strategy)
+ {
+ case SimilarityStrategyNumber:
+ case WordSimilarityStrategyNumber:
+ case StrictWordSimilarityStrategyNumber:
+ nlimit = index_strategy_get_limit(strategy);
+
+ /* Count the matches */
+ ntrue = 0;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i])
+ ntrue++;
+ }
+
+ /*--------------------
+ * If DIVUNION is defined then similarity formula is:
+ * c / (len1 + len2 - c)
+ * where c is number of common trigrams and it stands as ntrue in
+ * this code. Here we don't know value of len2 but we can assume
+ * that c (ntrue) is a lower bound of len2, so upper bound of
+ * similarity is:
+ * c / (len1 + c - c) => c / len1
+ * If DIVUNION is not defined then similarity formula is:
+ * c / max(len1, len2)
+ * And again, c (ntrue) is a lower bound of len2, but c <= len1
+ * just by definition and, consequently, upper bound of
+ * similarity is just c / len1.
+ * So, independently on DIVUNION the upper bound formula is the same.
+ */
+ res = (nkeys == 0) ? false :
+ (((((float4) ntrue) / ((float4) nkeys))) >= nlimit);
+ break;
+ case ILikeStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case LikeStrategyNumber:
+ case EqualStrategyNumber:
+ /* Check if all extracted trigrams are presented. */
+ res = true;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!check[i])
+ {
+ res = false;
+ break;
+ }
+ }
+ break;
+ case RegExpICaseStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case RegExpStrategyNumber:
+ if (nkeys < 1)
+ {
+ /* Regex processing gave no result: do full index scan */
+ res = true;
+ }
+ else
+ res = trigramsMatchGraph((TrgmPackedGraph *) extra_data[0],
+ check);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ res = false; /* keep compiler quiet */
+ break;
+ }
+
+ PG_RETURN_BOOL(res);
+}
+
+/*
+ * In all cases, GIN_TRUE is at least as favorable to inclusion as
+ * GIN_MAYBE. If no better option is available, simply treat
+ * GIN_MAYBE as if it were GIN_TRUE and apply the same test as the binary
+ * consistent function.
+ */
+Datum
+gin_trgm_triconsistent(PG_FUNCTION_ARGS)
+{
+ GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+
+ /* text *query = PG_GETARG_TEXT_PP(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+ Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
+ GinTernaryValue res = GIN_MAYBE;
+ int32 i,
+ ntrue;
+ bool *boolcheck;
+ double nlimit;
+
+ switch (strategy)
+ {
+ case SimilarityStrategyNumber:
+ case WordSimilarityStrategyNumber:
+ case StrictWordSimilarityStrategyNumber:
+ nlimit = index_strategy_get_limit(strategy);
+
+ /* Count the matches */
+ ntrue = 0;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] != GIN_FALSE)
+ ntrue++;
+ }
+
+ /*
+ * See comment in gin_trgm_consistent() about * upper bound
+ * formula
+ */
+ res = (nkeys == 0)
+ ? GIN_FALSE : (((((float4) ntrue) / ((float4) nkeys)) >= nlimit)
+ ? GIN_MAYBE : GIN_FALSE);
+ break;
+ case ILikeStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case LikeStrategyNumber:
+ case EqualStrategyNumber:
+ /* Check if all extracted trigrams are presented. */
+ res = GIN_MAYBE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE)
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ }
+ break;
+ case RegExpICaseStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case RegExpStrategyNumber:
+ if (nkeys < 1)
+ {
+ /* Regex processing gave no result: do full index scan */
+ res = GIN_MAYBE;
+ }
+ else
+ {
+ /*
+ * As trigramsMatchGraph implements a monotonic boolean
+ * function, promoting all GIN_MAYBE keys to GIN_TRUE will
+ * give a conservative result.
+ */
+ boolcheck = (bool *) palloc(sizeof(bool) * nkeys);
+ for (i = 0; i < nkeys; i++)
+ boolcheck[i] = (check[i] != GIN_FALSE);
+ if (!trigramsMatchGraph((TrgmPackedGraph *) extra_data[0],
+ boolcheck))
+ res = GIN_FALSE;
+ pfree(boolcheck);
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ res = GIN_FALSE; /* keep compiler quiet */
+ break;
+ }
+
+ /* All cases served by this function are inexact */
+ Assert(res != GIN_TRUE);
+ PG_RETURN_GIN_TERNARY_VALUE(res);
+}