diff options
Diffstat (limited to 'contrib/unaccent/unaccent.c')
-rw-r--r-- | contrib/unaccent/unaccent.c | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c new file mode 100644 index 0000000..64c879e --- /dev/null +++ b/contrib/unaccent/unaccent.c @@ -0,0 +1,434 @@ +/*------------------------------------------------------------------------- + * + * unaccent.c + * Text search unaccent dictionary + * + * Copyright (c) 2009-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/unaccent/unaccent.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/namespace.h" +#include "catalog/pg_ts_dict.h" +#include "commands/defrem.h" +#include "lib/stringinfo.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_public.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + +PG_MODULE_MAGIC; + +/* + * An unaccent dictionary uses a trie to find a string to replace. Each node + * of the trie is an array of 256 TrieChar structs; the N-th element of the + * array corresponds to next byte value N. That element can contain both a + * replacement string (to be used if the source string ends with this byte) + * and a link to another trie node (to be followed if there are more bytes). + * + * Note that the trie search logic pays no attention to multibyte character + * boundaries. This is OK as long as both the data entered into the trie and + * the data we're trying to look up are validly encoded; no partial-character + * matches will occur. + */ +typedef struct TrieChar +{ + struct TrieChar *nextChar; + char *replaceTo; + int replacelen; +} TrieChar; + +/* + * placeChar - put str into trie's structure, byte by byte. + * + * If node is NULL, we need to make a new node, which will be returned; + * otherwise the return value is the same as node. + */ +static TrieChar * +placeChar(TrieChar *node, const unsigned char *str, int lenstr, + const char *replaceTo, int replacelen) +{ + TrieChar *curnode; + + if (!node) + node = (TrieChar *) palloc0(sizeof(TrieChar) * 256); + + Assert(lenstr > 0); /* else str[0] doesn't exist */ + + curnode = node + *str; + + if (lenstr <= 1) + { + if (curnode->replaceTo) + ereport(WARNING, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("duplicate source strings, first one will be used"))); + else + { + curnode->replacelen = replacelen; + curnode->replaceTo = (char *) palloc(replacelen); + memcpy(curnode->replaceTo, replaceTo, replacelen); + } + } + else + { + curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, + replaceTo, replacelen); + } + + return node; +} + +/* + * initTrie - create trie from file. + * + * Function converts UTF8-encoded file into current encoding. + */ +static TrieChar * +initTrie(const char *filename) +{ + TrieChar *volatile rootTrie = NULL; + MemoryContext ccxt = CurrentMemoryContext; + tsearch_readline_state trst; + volatile bool skip; + + filename = get_tsearch_config_filename(filename, "rules"); + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open unaccent file \"%s\": %m", + filename))); + + do + { + /* + * pg_do_encoding_conversion() (called by tsearch_readline()) will + * emit exception if it finds untranslatable characters in current + * locale. We just skip such lines, continuing with the next. + */ + skip = true; + + PG_TRY(); + { + char *line; + + while ((line = tsearch_readline(&trst)) != NULL) + { + /*---------- + * The format of each line must be "src" or "src trg", where + * src and trg are sequences of one or more non-whitespace + * characters, separated by whitespace. Whitespace at start + * or end of line is ignored. If trg is omitted, an empty + * string is used as the replacement. + * + * We use a simple state machine, with states + * 0 initial (before src) + * 1 in src + * 2 in whitespace after src + * 3 in trg + * 4 in whitespace after trg + * -1 syntax error detected + *---------- + */ + int state; + char *ptr; + char *src = NULL; + char *trg = NULL; + int ptrlen; + int srclen = 0; + int trglen = 0; + + state = 0; + for (ptr = line; *ptr; ptr += ptrlen) + { + ptrlen = pg_mblen(ptr); + /* ignore whitespace, but end src or trg */ + if (t_isspace(ptr)) + { + if (state == 1) + state = 2; + else if (state == 3) + state = 4; + continue; + } + switch (state) + { + case 0: + /* start of src */ + src = ptr; + srclen = ptrlen; + state = 1; + break; + case 1: + /* continue src */ + srclen += ptrlen; + break; + case 2: + /* start of trg */ + trg = ptr; + trglen = ptrlen; + state = 3; + break; + case 3: + /* continue trg */ + trglen += ptrlen; + break; + default: + /* bogus line format */ + state = -1; + break; + } + } + + if (state == 1 || state == 2) + { + /* trg was omitted, so use "" */ + trg = ""; + trglen = 0; + } + + if (state > 0) + rootTrie = placeChar(rootTrie, + (unsigned char *) src, srclen, + trg, trglen); + else if (state < 0) + ereport(WARNING, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid syntax: more than two strings in unaccent rule"))); + + pfree(line); + } + skip = false; + } + PG_CATCH(); + { + ErrorData *errdata; + MemoryContext ecxt; + + ecxt = MemoryContextSwitchTo(ccxt); + errdata = CopyErrorData(); + if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) + { + FlushErrorState(); + } + else + { + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); + } + while (skip); + + tsearch_readline_end(&trst); + + return rootTrie; +} + +/* + * findReplaceTo - find longest possible match in trie + * + * On success, returns pointer to ending subnode, plus length of matched + * source string in *p_matchlen. On failure, returns NULL. + */ +static TrieChar * +findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, + int *p_matchlen) +{ + TrieChar *result = NULL; + int matchlen = 0; + + *p_matchlen = 0; /* prevent uninitialized-variable warnings */ + + while (node && matchlen < srclen) + { + node = node + src[matchlen]; + matchlen++; + + if (node->replaceTo) + { + result = node; + *p_matchlen = matchlen; + } + + node = node->nextChar; + } + + return result; +} + +PG_FUNCTION_INFO_V1(unaccent_init); +Datum +unaccent_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = (List *) PG_GETARG_POINTER(0); + TrieChar *rootTrie = NULL; + bool fileloaded = false; + ListCell *l; + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (strcmp(defel->defname, "rules") == 0) + { + if (fileloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple Rules parameters"))); + rootTrie = initTrie(defGetString(defel)); + fileloaded = true; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized Unaccent parameter: \"%s\"", + defel->defname))); + } + } + + if (!fileloaded) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing Rules parameter"))); + } + + PG_RETURN_POINTER(rootTrie); +} + +PG_FUNCTION_INFO_V1(unaccent_lexize); +Datum +unaccent_lexize(PG_FUNCTION_ARGS) +{ + TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0); + char *srcchar = (char *) PG_GETARG_POINTER(1); + int32 len = PG_GETARG_INT32(2); + char *srcstart = srcchar; + TSLexeme *res; + StringInfoData buf; + + /* we allocate storage for the buffer only if needed */ + buf.data = NULL; + + while (len > 0) + { + TrieChar *node; + int matchlen; + + node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len, + &matchlen); + if (node && node->replaceTo) + { + if (buf.data == NULL) + { + /* initialize buffer */ + initStringInfo(&buf); + /* insert any data we already skipped over */ + if (srcchar != srcstart) + appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart); + } + appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen); + } + else + { + matchlen = pg_mblen(srcchar); + if (buf.data != NULL) + appendBinaryStringInfo(&buf, srcchar, matchlen); + } + + srcchar += matchlen; + len -= matchlen; + } + + /* return a result only if we made at least one substitution */ + if (buf.data != NULL) + { + res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2); + res->lexeme = buf.data; + res->flags = TSL_FILTER; + } + else + res = NULL; + + PG_RETURN_POINTER(res); +} + +/* + * Function-like wrapper for dictionary + */ +PG_FUNCTION_INFO_V1(unaccent_dict); +Datum +unaccent_dict(PG_FUNCTION_ARGS) +{ + text *str; + int strArg; + Oid dictOid; + TSDictionaryCacheEntry *dict; + TSLexeme *res; + + if (PG_NARGS() == 1) + { + /* + * Use the "unaccent" dictionary that is in the same schema that this + * function is in. + */ + Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid); + const char *dictname = "unaccent"; + + dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid, + PointerGetDatum(dictname), + ObjectIdGetDatum(procnspid)); + if (!OidIsValid(dictOid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("text search dictionary \"%s.%s\" does not exist", + get_namespace_name(procnspid), dictname))); + strArg = 0; + } + else + { + dictOid = PG_GETARG_OID(0); + strArg = 1; + } + str = PG_GETARG_TEXT_PP(strArg); + + dict = lookup_ts_dictionary_cache(dictOid); + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), + PointerGetDatum(dict->dictData), + PointerGetDatum(VARDATA_ANY(str)), + Int32GetDatum(VARSIZE_ANY_EXHDR(str)), + PointerGetDatum(NULL))); + + PG_FREE_IF_COPY(str, strArg); + + if (res == NULL) + { + PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg)); + } + else if (res->lexeme == NULL) + { + pfree(res); + PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg)); + } + else + { + text *txt = cstring_to_text(res->lexeme); + + pfree(res->lexeme); + pfree(res); + + PG_RETURN_TEXT_P(txt); + } +} |