1 files changed, 434 insertions, 0 deletions
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
new file mode 100644
index 0000000..77ecd76
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,434 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ *	  Text search unaccent dictionary
+ *
+ * Copyright (c) 2009-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/unaccent/unaccent.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_ts_dict.h"
+#include "commands/defrem.h"
+#include "lib/stringinfo.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * An unaccent dictionary uses a trie to find a string to replace.  Each node
+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
+ * array corresponds to next byte value N.  That element can contain both a
+ * replacement string (to be used if the source string ends with this byte)
+ * and a link to another trie node (to be followed if there are more bytes).
+ *
+ * Note that the trie search logic pays no attention to multibyte character
+ * boundaries.  This is OK as long as both the data entered into the trie and
+ * the data we're trying to look up are validly encoded; no partial-character
+ * matches will occur.
+ */
+typedef struct TrieChar
+{
+	struct TrieChar *nextChar;
+	char	   *replaceTo;
+	int			replacelen;
+} TrieChar;
+
+/*
+ * placeChar - put str into trie's structure, byte by byte.
+ *
+ * If node is NULL, we need to make a new node, which will be returned;
+ * otherwise the return value is the same as node.
+ */
+static TrieChar *
+placeChar(TrieChar *node, const unsigned char *str, int lenstr,
+		  const char *replaceTo, int replacelen)
+{
+	TrieChar   *curnode;
+
+	if (!node)
+		node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
+
+	Assert(lenstr > 0);			/* else str[0] doesn't exist */
+
+	curnode = node + *str;
+
+	if (lenstr <= 1)
+	{
+		if (curnode->replaceTo)
+			ereport(WARNING,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("duplicate source strings, first one will be used")));
+		else
+		{
+			curnode->replacelen = replacelen;
+			curnode->replaceTo = (char *) palloc(replacelen);
+			memcpy(curnode->replaceTo, replaceTo, replacelen);
+		}
+	}
+	else
+	{
+		curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
+									  replaceTo, replacelen);
+	}
+
+	return node;
+}
+
+/*
+ * initTrie  - create trie from file.
+ *
+ * Function converts UTF8-encoded file into current encoding.
+ */
+static TrieChar *
+initTrie(const char *filename)
+{
+	TrieChar   *volatile rootTrie = NULL;
+	MemoryContext ccxt = CurrentMemoryContext;
+	tsearch_readline_state trst;
+	volatile bool skip;
+
+	filename = get_tsearch_config_filename(filename, "rules");
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open unaccent file \"%s\": %m",
+						filename)));
+
+	do
+	{
+		/*
+		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
+		 * emit exception if it finds untranslatable characters in current
+		 * locale. We just skip such lines, continuing with the next.
+		 */
+		skip = true;
+
+		PG_TRY();
+		{
+			char	   *line;
+
+			while ((line = tsearch_readline(&trst)) != NULL)
+			{
+				/*----------
+				 * The format of each line must be "src" or "src trg", where
+				 * src and trg are sequences of one or more non-whitespace
+				 * characters, separated by whitespace.  Whitespace at start
+				 * or end of line is ignored.  If trg is omitted, an empty
+				 * string is used as the replacement.
+				 *
+				 * We use a simple state machine, with states
+				 *	0	initial (before src)
+				 *	1	in src
+				 *	2	in whitespace after src
+				 *	3	in trg
+				 *	4	in whitespace after trg
+				 *	-1	syntax error detected
+				 *----------
+				 */
+				int			state;
+				char	   *ptr;
+				char	   *src = NULL;
+				char	   *trg = NULL;
+				int			ptrlen;
+				int			srclen = 0;
+				int			trglen = 0;
+
+				state = 0;
+				for (ptr = line; *ptr; ptr += ptrlen)
+				{
+					ptrlen = pg_mblen(ptr);
+					/* ignore whitespace, but end src or trg */
+					if (t_isspace(ptr))
+					{
+						if (state == 1)
+							state = 2;
+						else if (state == 3)
+							state = 4;
+						continue;
+					}
+					switch (state)
+					{
+						case 0:
+							/* start of src */
+							src = ptr;
+							srclen = ptrlen;
+							state = 1;
+							break;
+						case 1:
+							/* continue src */
+							srclen += ptrlen;
+							break;
+						case 2:
+							/* start of trg */
+							trg = ptr;
+							trglen = ptrlen;
+							state = 3;
+							break;
+						case 3:
+							/* continue trg */
+							trglen += ptrlen;
+							break;
+						default:
+							/* bogus line format */
+							state = -1;
+							break;
+					}
+				}
+
+				if (state == 1 || state == 2)
+				{
+					/* trg was omitted, so use "" */
+					trg = "";
+					trglen = 0;
+				}
+
+				if (state > 0)
+					rootTrie = placeChar(rootTrie,
+										 (unsigned char *) src, srclen,
+										 trg, trglen);
+				else if (state < 0)
+					ereport(WARNING,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid syntax: more than two strings in unaccent rule")));
+
+				pfree(line);
+			}
+			skip = false;
+		}
+		PG_CATCH();
+		{
+			ErrorData  *errdata;
+			MemoryContext ecxt;
+
+			ecxt = MemoryContextSwitchTo(ccxt);
+			errdata = CopyErrorData();
+			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+			{
+				FlushErrorState();
+			}
+			else
+			{
+				MemoryContextSwitchTo(ecxt);
+				PG_RE_THROW();
+			}
+		}
+		PG_END_TRY();
+	}
+	while (skip);
+
+	tsearch_readline_end(&trst);
+
+	return rootTrie;
+}
+
+/*
+ * findReplaceTo - find longest possible match in trie
+ *
+ * On success, returns pointer to ending subnode, plus length of matched
+ * source string in *p_matchlen.  On failure, returns NULL.
+ */
+static TrieChar *
+findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
+			  int *p_matchlen)
+{
+	TrieChar   *result = NULL;
+	int			matchlen = 0;
+
+	*p_matchlen = 0;			/* prevent uninitialized-variable warnings */
+
+	while (node && matchlen < srclen)
+	{
+		node = node + src[matchlen];
+		matchlen++;
+
+		if (node->replaceTo)
+		{
+			result = node;
+			*p_matchlen = matchlen;
+		}
+
+		node = node->nextChar;
+	}
+
+	return result;
+}
+
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	TrieChar   *rootTrie = NULL;
+	bool		fileloaded = false;
+	ListCell   *l;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (strcmp(defel->defname, "rules") == 0)
+		{
+			if (fileloaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Rules parameters")));
+			rootTrie = initTrie(defGetString(defel));
+			fileloaded = true;
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Unaccent parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	if (!fileloaded)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Rules parameter")));
+	}
+
+	PG_RETURN_POINTER(rootTrie);
+}
+
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+	TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
+	char	   *srcchar = (char *) PG_GETARG_POINTER(1);
+	int32		len = PG_GETARG_INT32(2);
+	char	   *srcstart = srcchar;
+	TSLexeme   *res;
+	StringInfoData buf;
+
+	/* we allocate storage for the buffer only if needed */
+	buf.data = NULL;
+
+	while (len > 0)
+	{
+		TrieChar   *node;
+		int			matchlen;
+
+		node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
+							 &matchlen);
+		if (node && node->replaceTo)
+		{
+			if (buf.data == NULL)
+			{
+				/* initialize buffer */
+				initStringInfo(&buf);
+				/* insert any data we already skipped over */
+				if (srcchar != srcstart)
+					appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
+			}
+			appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
+		}
+		else
+		{
+			matchlen = pg_mblen(srcchar);
+			if (buf.data != NULL)
+				appendBinaryStringInfo(&buf, srcchar, matchlen);
+		}
+
+		srcchar += matchlen;
+		len -= matchlen;
+	}
+
+	/* return a result only if we made at least one substitution */
+	if (buf.data != NULL)
+	{
+		res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
+		res->lexeme = buf.data;
+		res->flags = TSL_FILTER;
+	}
+	else
+		res = NULL;
+
+	PG_RETURN_POINTER(res);
+}
+
+/*
+ * Function-like wrapper for dictionary
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+	text	   *str;
+	int			strArg;
+	Oid			dictOid;
+	TSDictionaryCacheEntry *dict;
+	TSLexeme   *res;
+
+	if (PG_NARGS() == 1)
+	{
+		/*
+		 * Use the "unaccent" dictionary that is in the same schema that this
+		 * function is in.
+		 */
+		Oid			procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
+		const char *dictname = "unaccent";
+
+		dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
+								  PointerGetDatum(dictname),
+								  ObjectIdGetDatum(procnspid));
+		if (!OidIsValid(dictOid))
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("text search dictionary \"%s.%s\" does not exist",
+							get_namespace_name(procnspid), dictname)));
+		strArg = 0;
+	}
+	else
+	{
+		dictOid = PG_GETARG_OID(0);
+		strArg = 1;
+	}
+	str = PG_GETARG_TEXT_PP(strArg);
+
+	dict = lookup_ts_dictionary_cache(dictOid);
+
+	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+													 PointerGetDatum(dict->dictData),
+													 PointerGetDatum(VARDATA_ANY(str)),
+													 Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
+													 PointerGetDatum(NULL)));
+
+	PG_FREE_IF_COPY(str, strArg);
+
+	if (res == NULL)
+	{
+		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+	}
+	else if (res->lexeme == NULL)
+	{
+		pfree(res);
+		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+	}
+	else
+	{
+		text	   *txt = cstring_to_text(res->lexeme);
+
+		pfree(res->lexeme);
+		pfree(res);
+
+		PG_RETURN_TEXT_P(txt);
+	}
+}