summaryrefslogtreecommitdiffstats
path: root/contrib/unaccent/unaccent.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/unaccent/unaccent.c')
-rw-r--r--contrib/unaccent/unaccent.c434
1 files changed, 434 insertions, 0 deletions
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
new file mode 100644
index 0000000..64c879e
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,434 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ * Text search unaccent dictionary
+ *
+ * Copyright (c) 2009-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/unaccent/unaccent.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_ts_dict.h"
+#include "commands/defrem.h"
+#include "lib/stringinfo.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * An unaccent dictionary uses a trie to find a string to replace. Each node
+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
+ * array corresponds to next byte value N. That element can contain both a
+ * replacement string (to be used if the source string ends with this byte)
+ * and a link to another trie node (to be followed if there are more bytes).
+ *
+ * Note that the trie search logic pays no attention to multibyte character
+ * boundaries. This is OK as long as both the data entered into the trie and
+ * the data we're trying to look up are validly encoded; no partial-character
+ * matches will occur.
+ */
+typedef struct TrieChar
+{
+ struct TrieChar *nextChar;
+ char *replaceTo;
+ int replacelen;
+} TrieChar;
+
+/*
+ * placeChar - put str into trie's structure, byte by byte.
+ *
+ * If node is NULL, we need to make a new node, which will be returned;
+ * otherwise the return value is the same as node.
+ */
+static TrieChar *
+placeChar(TrieChar *node, const unsigned char *str, int lenstr,
+ const char *replaceTo, int replacelen)
+{
+ TrieChar *curnode;
+
+ if (!node)
+ node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
+
+ Assert(lenstr > 0); /* else str[0] doesn't exist */
+
+ curnode = node + *str;
+
+ if (lenstr <= 1)
+ {
+ if (curnode->replaceTo)
+ ereport(WARNING,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("duplicate source strings, first one will be used")));
+ else
+ {
+ curnode->replacelen = replacelen;
+ curnode->replaceTo = (char *) palloc(replacelen);
+ memcpy(curnode->replaceTo, replaceTo, replacelen);
+ }
+ }
+ else
+ {
+ curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
+ replaceTo, replacelen);
+ }
+
+ return node;
+}
+
+/*
+ * initTrie - create trie from file.
+ *
+ * Function converts UTF8-encoded file into current encoding.
+ */
+static TrieChar *
+initTrie(const char *filename)
+{
+ TrieChar *volatile rootTrie = NULL;
+ MemoryContext ccxt = CurrentMemoryContext;
+ tsearch_readline_state trst;
+ volatile bool skip;
+
+ filename = get_tsearch_config_filename(filename, "rules");
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open unaccent file \"%s\": %m",
+ filename)));
+
+ do
+ {
+ /*
+ * pg_do_encoding_conversion() (called by tsearch_readline()) will
+ * emit exception if it finds untranslatable characters in current
+ * locale. We just skip such lines, continuing with the next.
+ */
+ skip = true;
+
+ PG_TRY();
+ {
+ char *line;
+
+ while ((line = tsearch_readline(&trst)) != NULL)
+ {
+ /*----------
+ * The format of each line must be "src" or "src trg", where
+ * src and trg are sequences of one or more non-whitespace
+ * characters, separated by whitespace. Whitespace at start
+ * or end of line is ignored. If trg is omitted, an empty
+ * string is used as the replacement.
+ *
+ * We use a simple state machine, with states
+ * 0 initial (before src)
+ * 1 in src
+ * 2 in whitespace after src
+ * 3 in trg
+ * 4 in whitespace after trg
+ * -1 syntax error detected
+ *----------
+ */
+ int state;
+ char *ptr;
+ char *src = NULL;
+ char *trg = NULL;
+ int ptrlen;
+ int srclen = 0;
+ int trglen = 0;
+
+ state = 0;
+ for (ptr = line; *ptr; ptr += ptrlen)
+ {
+ ptrlen = pg_mblen(ptr);
+ /* ignore whitespace, but end src or trg */
+ if (t_isspace(ptr))
+ {
+ if (state == 1)
+ state = 2;
+ else if (state == 3)
+ state = 4;
+ continue;
+ }
+ switch (state)
+ {
+ case 0:
+ /* start of src */
+ src = ptr;
+ srclen = ptrlen;
+ state = 1;
+ break;
+ case 1:
+ /* continue src */
+ srclen += ptrlen;
+ break;
+ case 2:
+ /* start of trg */
+ trg = ptr;
+ trglen = ptrlen;
+ state = 3;
+ break;
+ case 3:
+ /* continue trg */
+ trglen += ptrlen;
+ break;
+ default:
+ /* bogus line format */
+ state = -1;
+ break;
+ }
+ }
+
+ if (state == 1 || state == 2)
+ {
+ /* trg was omitted, so use "" */
+ trg = "";
+ trglen = 0;
+ }
+
+ if (state > 0)
+ rootTrie = placeChar(rootTrie,
+ (unsigned char *) src, srclen,
+ trg, trglen);
+ else if (state < 0)
+ ereport(WARNING,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid syntax: more than two strings in unaccent rule")));
+
+ pfree(line);
+ }
+ skip = false;
+ }
+ PG_CATCH();
+ {
+ ErrorData *errdata;
+ MemoryContext ecxt;
+
+ ecxt = MemoryContextSwitchTo(ccxt);
+ errdata = CopyErrorData();
+ if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+ {
+ FlushErrorState();
+ }
+ else
+ {
+ MemoryContextSwitchTo(ecxt);
+ PG_RE_THROW();
+ }
+ }
+ PG_END_TRY();
+ }
+ while (skip);
+
+ tsearch_readline_end(&trst);
+
+ return rootTrie;
+}
+
+/*
+ * findReplaceTo - find longest possible match in trie
+ *
+ * On success, returns pointer to ending subnode, plus length of matched
+ * source string in *p_matchlen. On failure, returns NULL.
+ */
+static TrieChar *
+findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
+ int *p_matchlen)
+{
+ TrieChar *result = NULL;
+ int matchlen = 0;
+
+ *p_matchlen = 0; /* prevent uninitialized-variable warnings */
+
+ while (node && matchlen < srclen)
+ {
+ node = node + src[matchlen];
+ matchlen++;
+
+ if (node->replaceTo)
+ {
+ result = node;
+ *p_matchlen = matchlen;
+ }
+
+ node = node->nextChar;
+ }
+
+ return result;
+}
+
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+ List *dictoptions = (List *) PG_GETARG_POINTER(0);
+ TrieChar *rootTrie = NULL;
+ bool fileloaded = false;
+ ListCell *l;
+
+ foreach(l, dictoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+
+ if (strcmp(defel->defname, "rules") == 0)
+ {
+ if (fileloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple Rules parameters")));
+ rootTrie = initTrie(defGetString(defel));
+ fileloaded = true;
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized Unaccent parameter: \"%s\"",
+ defel->defname)));
+ }
+ }
+
+ if (!fileloaded)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing Rules parameter")));
+ }
+
+ PG_RETURN_POINTER(rootTrie);
+}
+
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+ TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
+ char *srcchar = (char *) PG_GETARG_POINTER(1);
+ int32 len = PG_GETARG_INT32(2);
+ char *srcstart = srcchar;
+ TSLexeme *res;
+ StringInfoData buf;
+
+ /* we allocate storage for the buffer only if needed */
+ buf.data = NULL;
+
+ while (len > 0)
+ {
+ TrieChar *node;
+ int matchlen;
+
+ node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
+ &matchlen);
+ if (node && node->replaceTo)
+ {
+ if (buf.data == NULL)
+ {
+ /* initialize buffer */
+ initStringInfo(&buf);
+ /* insert any data we already skipped over */
+ if (srcchar != srcstart)
+ appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
+ }
+ appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
+ }
+ else
+ {
+ matchlen = pg_mblen(srcchar);
+ if (buf.data != NULL)
+ appendBinaryStringInfo(&buf, srcchar, matchlen);
+ }
+
+ srcchar += matchlen;
+ len -= matchlen;
+ }
+
+ /* return a result only if we made at least one substitution */
+ if (buf.data != NULL)
+ {
+ res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
+ res->lexeme = buf.data;
+ res->flags = TSL_FILTER;
+ }
+ else
+ res = NULL;
+
+ PG_RETURN_POINTER(res);
+}
+
+/*
+ * Function-like wrapper for dictionary
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+ text *str;
+ int strArg;
+ Oid dictOid;
+ TSDictionaryCacheEntry *dict;
+ TSLexeme *res;
+
+ if (PG_NARGS() == 1)
+ {
+ /*
+ * Use the "unaccent" dictionary that is in the same schema that this
+ * function is in.
+ */
+ Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
+ const char *dictname = "unaccent";
+
+ dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
+ PointerGetDatum(dictname),
+ ObjectIdGetDatum(procnspid));
+ if (!OidIsValid(dictOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("text search dictionary \"%s.%s\" does not exist",
+ get_namespace_name(procnspid), dictname)));
+ strArg = 0;
+ }
+ else
+ {
+ dictOid = PG_GETARG_OID(0);
+ strArg = 1;
+ }
+ str = PG_GETARG_TEXT_PP(strArg);
+
+ dict = lookup_ts_dictionary_cache(dictOid);
+
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(VARDATA_ANY(str)),
+ Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
+ PointerGetDatum(NULL)));
+
+ PG_FREE_IF_COPY(str, strArg);
+
+ if (res == NULL)
+ {
+ PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+ }
+ else if (res->lexeme == NULL)
+ {
+ pfree(res);
+ PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+ }
+ else
+ {
+ text *txt = cstring_to_text(res->lexeme);
+
+ pfree(res->lexeme);
+ pfree(res);
+
+ PG_RETURN_TEXT_P(txt);
+ }
+}