commit    46651ce6fe013220ed397add242004d764fc0153
tree      6e5299f990f88e60174a1d3ae6e48eedd2688b2b /contrib/unaccent
parent    Initial commit.
author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/unaccent')
-rw-r--r--  contrib/unaccent/.gitignore                 |    7
-rw-r--r--  contrib/unaccent/Makefile                   |   47
-rw-r--r--  contrib/unaccent/expected/unaccent.out      |   99
-rw-r--r--  contrib/unaccent/generate_unaccent_rules.py |  291
-rw-r--r--  contrib/unaccent/sql/unaccent.sql           |   24
-rw-r--r--  contrib/unaccent/unaccent--1.0--1.1.sql     |    9
-rw-r--r--  contrib/unaccent/unaccent--1.1.sql          |   34
-rw-r--r--  contrib/unaccent/unaccent.c                 |  434
-rw-r--r--  contrib/unaccent/unaccent.control           |    6
-rw-r--r--  contrib/unaccent/unaccent.rules             | 1613
10 files changed, 2564 insertions, 0 deletions
diff --git a/contrib/unaccent/.gitignore b/contrib/unaccent/.gitignore
new file mode 100644
index 0000000..bccda73
--- /dev/null
+++ b/contrib/unaccent/.gitignore
@@ -0,0 +1,7 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
+
+# Downloaded files
+/Latin-ASCII.xml
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile
new file mode 100644
index 0000000..b8307d1
--- /dev/null
+++ b/contrib/unaccent/Makefile
@@ -0,0 +1,47 @@
+# contrib/unaccent/Makefile
+
+MODULE_big = unaccent
+OBJS = \
+	$(WIN32RES) \
+	unaccent.o
+
+EXTENSION = unaccent
+DATA = unaccent--1.1.sql unaccent--1.0--1.1.sql
+DATA_TSEARCH = unaccent.rules
+PGFILEDESC = "unaccent - text search dictionary that removes accents"
+
+REGRESS = unaccent
+
+# We need a UTF8 database
+ENCODING = UTF8
+NO_LOCALE = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/unaccent
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+update-unicode: unaccent.rules
+
+# Allow running this even without --with-python
+PYTHON ?= python
+
+unaccent.rules: generate_unaccent_rules.py ../../src/common/unicode/UnicodeData.txt Latin-ASCII.xml
+	$(PYTHON) $< --unicode-data-file $(word 2,$^) --latin-ascii-file $(word 3,$^) >$@
+
+# Only download it once; dependencies must match src/common/unicode/
+../../src/common/unicode/UnicodeData.txt: $(top_builddir)/src/Makefile.global
+	$(MAKE) -C $(@D) $(@F)
+
+# Dependency on Makefile.global is for CLDR_VERSION
+Latin-ASCII.xml: $(top_builddir)/src/Makefile.global
+	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/cldr/release-$(subst .,-,$(CLDR_VERSION))/common/transforms/Latin-ASCII.xml
+
+distclean:
+	rm -f Latin-ASCII.xml
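
Aside: the Latin-ASCII.xml download rule above derives the CLDR release tag by replacing dots with dashes in CLDR_VERSION ($(subst .,-,$(CLDR_VERSION))). A minimal Python sketch of that URL construction, assuming a hypothetical version value of "43.1" (the real value comes from src/Makefile.global):

    # Sketch only: mirror the Makefile's $(subst .,-,$(CLDR_VERSION)) rewrite.
    cldr_version = "43.1"  # assumed example; not taken from the source tree
    tag = "release-" + cldr_version.replace(".", "-")
    url = ("https://raw.githubusercontent.com/unicode-org/cldr/"
           + tag + "/common/transforms/Latin-ASCII.xml")
    print(url)
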
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
new file mode 100644
index 0000000..c1bd7cd
--- /dev/null
+++ b/contrib/unaccent/expected/unaccent.out
@@ -0,0 +1,99 @@
+CREATE EXTENSION unaccent;
+-- must have a UTF8 database
+SELECT getdatabaseencoding();
+ getdatabaseencoding
+---------------------
+ UTF8
+(1 row)
+
+SET client_encoding TO 'UTF8';
+SELECT unaccent('foobar');
+ unaccent
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('ёлка');
+ unaccent
+----------
+ елка
+(1 row)
+
+SELECT unaccent('ЁЖИК');
+ unaccent
+----------
+ ЕЖИК
+(1 row)
+
+SELECT unaccent('˃˖˗˜');
+ unaccent
+----------
+ >+-~
+(1 row)
+
+SELECT unaccent('À'); -- Remove combining diacritical 0x0300
+ unaccent
+----------
+ A
+(1 row)
+
+SELECT unaccent('unaccent', 'foobar');
+ unaccent
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('unaccent', 'ёлка');
+ unaccent
+----------
+ елка
+(1 row)
+
+SELECT unaccent('unaccent', 'ЁЖИК');
+ unaccent
+----------
+ ЕЖИК
+(1 row)
+
+SELECT unaccent('unaccent', '˃˖˗˜');
+ unaccent
+----------
+ >+-~
+(1 row)
+
+SELECT unaccent('unaccent', 'À');
+ unaccent
+----------
+ A
+(1 row)
+
+SELECT ts_lexize('unaccent', 'foobar');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('unaccent', 'ёлка');
+ ts_lexize
+-----------
+ {елка}
+(1 row)
+
+SELECT ts_lexize('unaccent', 'ЁЖИК');
+ ts_lexize
+-----------
+ {ЕЖИК}
+(1 row)
+
+SELECT ts_lexize('unaccent', '˃˖˗˜');
+ ts_lexize
+-----------
+ {>+-~}
+(1 row)
+
+SELECT ts_lexize('unaccent', 'À');
+ ts_lexize
+-----------
+ {A}
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
new file mode 100644
index 0000000..a952de5
--- /dev/null
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -0,0 +1,291 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# This script builds unaccent.rules on standard output, given the
+# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] as
+# arguments.  Ligature expansion and the Unicode CLDR Latin-ASCII
+# transliterator are included by default; they can be disabled with
+# the "--no-ligatures-expansion" command line option.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+#
+# This approach handles most letters with diacritical marks and some
+# ligatures.  However, several characters (notably a majority of
+# ligatures) have no decomposition.  To handle all these cases, one can
+# use a standard Unicode transliterator available in the Common Locale
+# Data Repository (CLDR): Latin-ASCII.  This transliterator maps Unicode
+# characters to ASCII-range equivalents.  Unless the
+# "--no-ligatures-expansion" option is given, the XML file of this
+# transliterator [2] -- given as a command line argument -- is parsed and used.
+#
+# Ideally you should use the latest release for each data set.  This
+# script is compatible with at least CLDR release 29.
+#
+# [1] https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/UnicodeData.txt
+# [2] https://raw.githubusercontent.com/unicode-org/cldr/${TAG}/common/transforms/Latin-ASCII.xml
+
+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+# The approach is to be Python3 compatible with Python2 "backports".
+from __future__ import print_function
+from __future__ import unicode_literals
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+
+import argparse
+import codecs
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+if sys.version_info[0] <= 2:
+    # Encode stdout as UTF-8, so we can just print to it
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
+
+    # Map Python 2's chr to unichr
+    chr = unichr
+
+    # Python 2 and 3 compatible bytes call
+    def bytes(source, encoding='ascii', errors='strict'):
+        return source.encode(encoding=encoding, errors=errors)
+else:
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
+
+# The ranges of Unicode characters that we consider to be "plain letters".
+# For now we are being conservative by including only Latin and Greek.  This
+# could be extended in future based on feedback from people with relevant
+# language knowledge.
+PLAIN_LETTER_RANGES = ((ord('a'), ord('z')),  # Latin lower case
+                       (ord('A'), ord('Z')),  # Latin upper case
+                       (0x03b1, 0x03c9),  # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
+                       (0x0391, 0x03a9))  # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
+
+# Combining marks follow a "base" character, and result in a composite
+# character.  Example: "U&'A\0300'" produces "À".  There are three types of
+# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
+# combining (Mc).  We identify the ranges of marks we feel safe removing.
+# References:
+# https://en.wikipedia.org/wiki/Combining_character
+# https://www.unicode.org/charts/PDF/U0300.pdf
+# https://www.unicode.org/charts/PDF/U20D0.pdf
+COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
+                         (0x20dd, 0x20e0),  # Me: Symbols
+                         (0x20e2, 0x20e4),)  # Me: Screen, keycap, triangle
+
+def print_record(codepoint, letter):
+    if letter:
+        output = chr(codepoint) + "\t" + letter
+    else:
+        output = chr(codepoint)
+
+    print(output)
+
+class Codepoint:
+    def __init__(self, id, general_category, combining_ids):
+        self.id = id
+        self.general_category = general_category
+        self.combining_ids = combining_ids
+
+def is_mark_to_remove(codepoint):
+    """Return true if this is a combining mark to remove."""
+    if not is_mark(codepoint):
+        return False
+
+    for begin, end in COMBINING_MARK_RANGES:
+        if codepoint.id >= begin and codepoint.id <= end:
+            return True
+    return False
+
+def is_plain_letter(codepoint):
+    """Return true if codepoint represents a "plain letter"."""
+    for begin, end in PLAIN_LETTER_RANGES:
+        if codepoint.id >= begin and codepoint.id <= end:
+            return True
+    return False
+
+def is_mark(codepoint):
+    """Return true for diacritical marks (combining codepoints)."""
+    return codepoint.general_category in ("Mn", "Me", "Mc")
+
+def is_letter_with_marks(codepoint, table):
+    """Return true for letters combined with one or more marks."""
+    # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+
+    # A letter may have no combining characters, in which case it has
+    # no marks.
+    if len(codepoint.combining_ids) == 1:
+        return False
+
+    # At least one of the combining characters must be a mark.
+    if not any(is_mark(table[i]) for i in codepoint.combining_ids[1:]):
+        return False
+
+    # Check that the base letter is itself a letter (plain or with marks).
+    codepoint_base = codepoint.combining_ids[0]
+    if (not is_plain_letter(table[codepoint_base]) and
+            not is_letter_with_marks(table[codepoint_base], table)):
+        return False
+
+    return True
+
+def is_letter(codepoint, table):
+    """Return true for a letter with or without diacritical marks."""
+    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the base codepoint without marks.  If this codepoint has more
+    than one combining character, do a recursive lookup on the table to
+    find out its plain base letter."""
+    if is_letter_with_marks(codepoint, table):
+        if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
+            return get_plain_letter(table[codepoint.combining_ids[0]], table)
+        elif is_plain_letter(table[codepoint.combining_ids[0]]):
+            return table[codepoint.combining_ids[0]]
+
+        # Should not get here
+        assert False
+    elif is_plain_letter(codepoint):
+        return codepoint
+
+    # Should not get here
+    assert False
+
+def is_ligature(codepoint, table):
+    """Return true for letters combined with letters."""
+    return all(is_letter(table[i], table) for i in codepoint.combining_ids)
+
+def get_plain_letters(codepoint, table):
+    """Return a list of plain letters from a ligature."""
+    assert is_ligature(codepoint, table)
+    return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]
+
+def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
+    """Parse the XML file and return a set of tuples (src, trg), where "src"
+    is the original character and "trg" the substitute."""
+    charactersSet = set()
+
+    # RegEx to parse rules
+    rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+
+    # construct tree from XML
+    transliterationTree = ET.parse(latinAsciiFilePath)
+    transliterationTreeRoot = transliterationTree.getroot()
+
+    # Fetch all the transliteration rules.  Since release 29 of Latin-ASCII.xml,
+    # all the transliteration rules are located in a single tRule block,
+    # one rule per line.
+    blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule")
+    assert len(blockRules) == 1
+
+    # Split the block of rules into one element per line.
+    rules = blockRules[0].text.splitlines()
+
+    # And finish the processing of each individual rule.
+    for rule in rules:
+        matches = rulePattern.search(rule)
+
+        # The regular expression captures four groups corresponding
+        # to the characters.
+        #
+        # Group 1: plain "src" char.  Empty if group 2 is not.
+        # Group 2: unicode-escaped "src" char (e.g. "\u0110").  Empty if group 1 is not.
+        #
+        # Group 3: plain "trg" char.  Empty if group 4 is not.
+        # Group 4: plain "trg" char between quotes.  Empty if group 3 is not.
+        if matches is not None:
+            src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape')
+            trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
+
+            # Single quotes and double quotes are backslash-escaped in the rules
+            trg = trg.replace("\\'", "'").replace('\\"', '"')
+
+            # the unaccent parser only accepts non-whitespace characters
+            # for "src" and "trg" (see unaccent.c)
+            if not src.isspace() and not trg.isspace():
+                charactersSet.add((ord(src), trg))
+
+    return charactersSet
+
+def special_cases():
+    """Return the special cases which are not handled by other methods."""
+    charactersSet = set()
+
+    # Cyrillic
+    charactersSet.add((0x0401, u"\u0415"))  # CYRILLIC CAPITAL LETTER IO
+    charactersSet.add((0x0451, u"\u0435"))  # CYRILLIC SMALL LETTER IO
+
+    # Symbols of the "Letterlike Symbols" Unicode block (U+2100 to U+214F)
+    charactersSet.add((0x2103, u"\xb0C"))  # DEGREE CELSIUS
+    charactersSet.add((0x2109, u"\xb0F"))  # DEGREE FAHRENHEIT
+    charactersSet.add((0x2117, "(P)"))  # SOUND RECORDING COPYRIGHT
+
+    return charactersSet
+
+def main(args):
+    # https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    table = {}
+    all = []
+
+    # unordered set to ensure uniqueness
+    charactersSet = set()
+
+    # read file UnicodeData.txt
+    with codecs.open(
+            args.unicodeDataFilePath, mode='r', encoding='UTF-8',
+    ) as unicodeDataFile:
+        # read everything we need into memory
+        for line in unicodeDataFile:
+            fields = line.split(";")
+            if len(fields) > 5:
+                # https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+                general_category = fields[2]
+                decomposition = fields[5]
+                decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
+                id = int(fields[0], 16)
+                combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+                codepoint = Codepoint(id, general_category, combining_ids)
+                table[id] = codepoint
+                all.append(codepoint)
+
+    # walk through all the codepoints looking for interesting mappings
+    for codepoint in all:
+        if codepoint.general_category.startswith('L') and \
+           len(codepoint.combining_ids) > 1:
+            if is_letter_with_marks(codepoint, table):
+                charactersSet.add((codepoint.id,
+                                   chr(get_plain_letter(codepoint, table).id)))
+            elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
+                charactersSet.add((codepoint.id,
+                                   "".join(chr(combining_codepoint.id)
+                                           for combining_codepoint
+                                           in get_plain_letters(codepoint, table))))
+        elif is_mark_to_remove(codepoint):
+            charactersSet.add((codepoint.id, None))
+
+    # add CLDR Latin-ASCII characters
+    if not args.noLigaturesExpansion:
+        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
+        charactersSet |= special_cases()
+
+    # sort for more convenient display
+    charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0])
+
+    for characterPair in charactersList:
+        print_record(characterPair[0], characterPair[1])
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output, given the contents of UnicodeData.txt and Latin-ASCII.xml as arguments.')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from the Unicode Common Locale Data Repository (CLDR) corresponding to the Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use the Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and the \"--latin-ascii-file\" argument is required. If this option is enabled, the \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
+    args = parser.parse_args()
+
+    if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None:
+        sys.stderr.write('You must specify the path to the Latin-ASCII transliterator file with the "--latin-ascii-file" option, or use the "--no-ligatures-expansion" option. Use the "-h" option for help.\n')
+        sys.exit(1)
+
+    main(args)
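
Aside: the decomposition data described in the script's header comment can be inspected with Python's standard unicodedata module. A minimal sketch, separate from the generator itself:

    # Sketch only: show the Unicode decomposition data the generator relies on.
    import unicodedata

    # U+00C0 LATIN CAPITAL LETTER A WITH GRAVE decomposes into base letter
    # U+0041 ("A") plus combining mark U+0300, so the rules map it to "A".
    print(unicodedata.decomposition('\u00c0'))  # '0041 0300'
    print(unicodedata.category('\u0300'))       # 'Mn' (non-spacing mark)

    # U+00C6 LATIN CAPITAL LETTER AE has no decomposition, which is why the
    # CLDR Latin-ASCII transliterator is needed as a fallback for ligatures.
    print(unicodedata.decomposition('\u00c6'))  # ''
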
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
new file mode 100644
index 0000000..2ae097f
--- /dev/null
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -0,0 +1,24 @@
+CREATE EXTENSION unaccent;
+
+-- must have a UTF8 database
+SELECT getdatabaseencoding();
+
+SET client_encoding TO 'UTF8';
+
+SELECT unaccent('foobar');
+SELECT unaccent('ёлка');
+SELECT unaccent('ЁЖИК');
+SELECT unaccent('˃˖˗˜');
+SELECT unaccent('À'); -- Remove combining diacritical 0x0300
+
+SELECT unaccent('unaccent', 'foobar');
+SELECT unaccent('unaccent', 'ёлка');
+SELECT unaccent('unaccent', 'ЁЖИК');
+SELECT unaccent('unaccent', '˃˖˗˜');
+SELECT unaccent('unaccent', 'À');
+
+SELECT ts_lexize('unaccent', 'foobar');
+SELECT ts_lexize('unaccent', 'ёлка');
+SELECT ts_lexize('unaccent', 'ЁЖИК');
+SELECT ts_lexize('unaccent', '˃˖˗˜');
+SELECT ts_lexize('unaccent', 'À');
diff --git a/contrib/unaccent/unaccent--1.0--1.1.sql b/contrib/unaccent/unaccent--1.0--1.1.sql
new file mode 100644
index 0000000..8efa0d0
--- /dev/null
+++ b/contrib/unaccent/unaccent--1.0--1.1.sql
@@ -0,0 +1,9 @@
+/* contrib/unaccent/unaccent--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION unaccent UPDATE TO '1.1'" to load this file. \quit
+
+ALTER FUNCTION unaccent(regdictionary, text) PARALLEL SAFE;
+ALTER FUNCTION unaccent(text) PARALLEL SAFE;
+ALTER FUNCTION unaccent_init(internal) PARALLEL SAFE;
+ALTER FUNCTION unaccent_lexize(internal, internal, internal, internal) PARALLEL SAFE;
diff --git a/contrib/unaccent/unaccent--1.1.sql b/contrib/unaccent/unaccent--1.1.sql
new file mode 100644
index 0000000..ecc8651
--- /dev/null
+++ b/contrib/unaccent/unaccent--1.1.sql
@@ -0,0 +1,34 @@
+/* contrib/unaccent/unaccent--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION unaccent" to load this file. \quit
+
+CREATE FUNCTION unaccent(regdictionary, text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C STABLE STRICT PARALLEL SAFE;
+
+CREATE FUNCTION unaccent(text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C STABLE STRICT PARALLEL SAFE;
+
+CREATE FUNCTION unaccent_init(internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_init'
+	LANGUAGE C PARALLEL SAFE;
+
+CREATE FUNCTION unaccent_lexize(internal,internal,internal,internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_lexize'
+	LANGUAGE C PARALLEL SAFE;
+
+CREATE TEXT SEARCH TEMPLATE unaccent (
+	INIT = unaccent_init,
+	LEXIZE = unaccent_lexize
+);
+
+CREATE TEXT SEARCH DICTIONARY unaccent (
+	TEMPLATE = unaccent,
+	RULES = 'unaccent'
+);
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
new file mode 100644
index 0000000..2b3819f
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,434 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ * Text search unaccent dictionary
+ *
+ * Copyright (c) 2009-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/unaccent/unaccent.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_ts_dict.h"
+#include "commands/defrem.h"
+#include "lib/stringinfo.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * An unaccent dictionary uses a trie to find a string to replace. Each node
+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
+ * array corresponds to the next byte value N. That element can contain both a
+ * replacement string (to be used if the source string ends with this byte)
+ * and a link to another trie node (to be followed if there are more bytes).
+ *
+ * Note that the trie search logic pays no attention to multibyte character
+ * boundaries. This is OK as long as both the data entered into the trie and
+ * the data we're trying to look up are validly encoded; no partial-character
+ * matches will occur.
+ */
+typedef struct TrieChar
+{
+	struct TrieChar *nextChar;
+	char	   *replaceTo;
+	int			replacelen;
+} TrieChar;
+
+/*
+ * placeChar - put str into trie's structure, byte by byte.
+ *
+ * If node is NULL, we need to make a new node, which will be returned;
+ * otherwise the return value is the same as node.
+ */
+static TrieChar *
+placeChar(TrieChar *node, const unsigned char *str, int lenstr,
+		  const char *replaceTo, int replacelen)
+{
+	TrieChar   *curnode;
+
+	if (!node)
+		node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
+
+	Assert(lenstr > 0);			/* else str[0] doesn't exist */
+
+	curnode = node + *str;
+
+	if (lenstr <= 1)
+	{
+		if (curnode->replaceTo)
+			ereport(WARNING,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("duplicate source strings, first one will be used")));
+		else
+		{
+			curnode->replacelen = replacelen;
+			curnode->replaceTo = (char *) palloc(replacelen);
+			memcpy(curnode->replaceTo, replaceTo, replacelen);
+		}
+	}
+	else
+	{
+		curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
+									  replaceTo, replacelen);
+	}
+
+	return node;
+}
+
+/*
+ * initTrie - create trie from file.
+ *
+ * The function converts the UTF8-encoded file into the current encoding.
+ */
+static TrieChar *
+initTrie(const char *filename)
+{
+	TrieChar   *volatile rootTrie = NULL;
+	MemoryContext ccxt = CurrentMemoryContext;
+	tsearch_readline_state trst;
+	volatile bool skip;
+
+	filename = get_tsearch_config_filename(filename, "rules");
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open unaccent file \"%s\": %m",
+						filename)));
+
+	do
+	{
+		/*
+		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
+		 * throw an error if it finds untranslatable characters in the current
+		 * locale.  We just skip such lines, continuing with the next.
+		 */
+		skip = true;
+
+		PG_TRY();
+		{
+			char	   *line;
+
+			while ((line = tsearch_readline(&trst)) != NULL)
+			{
+				/*----------
+				 * The format of each line must be "src" or "src trg", where
+				 * src and trg are sequences of one or more non-whitespace
+				 * characters, separated by whitespace.  Whitespace at start
+				 * or end of line is ignored.  If trg is omitted, an empty
+				 * string is used as the replacement.
+				 *
+				 * We use a simple state machine, with states
+				 *	0	initial (before src)
+				 *	1	in src
+				 *	2	in whitespace after src
+				 *	3	in trg
+				 *	4	in whitespace after trg
+				 *	-1	syntax error detected
+				 *----------
+				 */
+				int			state;
+				char	   *ptr;
+				char	   *src = NULL;
+				char	   *trg = NULL;
+				int			ptrlen;
+				int			srclen = 0;
+				int			trglen = 0;
+
+				state = 0;
+				for (ptr = line; *ptr; ptr += ptrlen)
+				{
+					ptrlen = pg_mblen(ptr);
+					/* ignore whitespace, but end src or trg */
+					if (t_isspace(ptr))
+					{
+						if (state == 1)
+							state = 2;
+						else if (state == 3)
+							state = 4;
+						continue;
+					}
+					switch (state)
+					{
+						case 0:
+							/* start of src */
+							src = ptr;
+							srclen = ptrlen;
+							state = 1;
+							break;
+						case 1:
+							/* continue src */
+							srclen += ptrlen;
+							break;
+						case 2:
+							/* start of trg */
+							trg = ptr;
+							trglen = ptrlen;
+							state = 3;
+							break;
+						case 3:
+							/* continue trg */
+							trglen += ptrlen;
+							break;
+						default:
+							/* bogus line format */
+							state = -1;
+							break;
+					}
+				}
+
+				if (state == 1 || state == 2)
+				{
+					/* trg was omitted, so use "" */
+					trg = "";
+					trglen = 0;
+				}
+
+				if (state > 0)
+					rootTrie = placeChar(rootTrie,
+										 (unsigned char *) src, srclen,
+										 trg, trglen);
+				else if (state < 0)
+					ereport(WARNING,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid syntax: more than two strings in unaccent rule")));
+
+				pfree(line);
+			}
+			skip = false;
+		}
+		PG_CATCH();
+		{
+			ErrorData  *errdata;
+			MemoryContext ecxt;
+
+			ecxt = MemoryContextSwitchTo(ccxt);
+			errdata = CopyErrorData();
+			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+			{
+				FlushErrorState();
+			}
+			else
+			{
+				MemoryContextSwitchTo(ecxt);
+				PG_RE_THROW();
+			}
+		}
+		PG_END_TRY();
+	}
+	while (skip);
+
+	tsearch_readline_end(&trst);
+
+	return rootTrie;
+}
+
+/*
+ * findReplaceTo - find longest possible match in trie
+ *
+ * On success, returns pointer to ending subnode, plus length of matched
+ * source string in *p_matchlen. On failure, returns NULL.
+ */
+static TrieChar *
+findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
+			  int *p_matchlen)
+{
+	TrieChar   *result = NULL;
+	int			matchlen = 0;
+
+	*p_matchlen = 0;			/* prevent uninitialized-variable warnings */
+
+	while (node && matchlen < srclen)
+	{
+		node = node + src[matchlen];
+		matchlen++;
+
+		if (node->replaceTo)
+		{
+			result = node;
+			*p_matchlen = matchlen;
+		}
+
+		node = node->nextChar;
+	}
+
+	return result;
+}
+
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	TrieChar   *rootTrie = NULL;
+	bool		fileloaded = false;
+	ListCell   *l;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (strcmp(defel->defname, "rules") == 0)
+		{
+			if (fileloaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Rules parameters")));
+			rootTrie = initTrie(defGetString(defel));
+			fileloaded = true;
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Unaccent parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	if (!fileloaded)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Rules parameter")));
+	}
+
+	PG_RETURN_POINTER(rootTrie);
+}
+
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+	TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
+	char	   *srcchar = (char *) PG_GETARG_POINTER(1);
+	int32		len = PG_GETARG_INT32(2);
+	char	   *srcstart = srcchar;
+	TSLexeme   *res;
+	StringInfoData buf;
+
+	/* we allocate storage for the buffer only if needed */
+	buf.data = NULL;
+
+	while (len > 0)
+	{
+		TrieChar   *node;
+		int			matchlen;
+
+		node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
+							 &matchlen);
+		if (node && node->replaceTo)
+		{
+			if (buf.data == NULL)
+			{
+				/* initialize buffer */
+				initStringInfo(&buf);
+				/* insert any data we already skipped over */
+				if (srcchar != srcstart)
+					appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
+			}
+			appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
+		}
+		else
+		{
+			matchlen = pg_mblen(srcchar);
+			if (buf.data != NULL)
+				appendBinaryStringInfo(&buf, srcchar, matchlen);
+		}
+
+		srcchar += matchlen;
+		len -= matchlen;
+	}
+
+	/* return a result only if we made at least one substitution */
+	if (buf.data != NULL)
+	{
+		res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
+		res->lexeme = buf.data;
+		res->flags = TSL_FILTER;
+	}
+	else
+		res = NULL;
+
+	PG_RETURN_POINTER(res);
+}
+
+/*
+ * Function-like wrapper for dictionary
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+	text	   *str;
+	int			strArg;
+	Oid			dictOid;
+	TSDictionaryCacheEntry *dict;
+	TSLexeme   *res;
+
+	if (PG_NARGS() == 1)
+	{
+		/*
+		 * Use the "unaccent" dictionary that is in the same schema that this
+		 * function is in.
+		 */
+		Oid			procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
+		const char *dictname = "unaccent";
+
+		dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
+								  PointerGetDatum(dictname),
+								  ObjectIdGetDatum(procnspid));
+		if (!OidIsValid(dictOid))
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("text search dictionary \"%s.%s\" does not exist",
+							get_namespace_name(procnspid), dictname)));
+		strArg = 0;
+	}
+	else
+	{
+		dictOid = PG_GETARG_OID(0);
+		strArg = 1;
+	}
+	str = PG_GETARG_TEXT_PP(strArg);
+
+	dict = lookup_ts_dictionary_cache(dictOid);
+
+	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+													 PointerGetDatum(dict->dictData),
+													 PointerGetDatum(VARDATA_ANY(str)),
+													 Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
+													 PointerGetDatum(NULL)));
+
+	PG_FREE_IF_COPY(str, strArg);
+
+	if (res == NULL)
+	{
+		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+	}
+	else if (res->lexeme == NULL)
+	{
+		pfree(res);
+		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+	}
+	else
+	{
+		text	   *txt = cstring_to_text(res->lexeme);
+
+		pfree(res->lexeme);
+		pfree(res);
+
+		PG_RETURN_TEXT_P(txt);
+	}
+}
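
Aside: the trie built by placeChar() and searched by findReplaceTo() above can be summarized in a few lines of Python. A minimal sketch of the same byte-indexed, longest-match idea (illustrative only, not the module's actual code):

    # Sketch only: byte-indexed trie with longest-match lookup.
    def place(node, src, replace_to):
        """Insert byte string src into the trie with its replacement."""
        if node is None:
            node = [None] * 256                # one slot per next byte value
        if node[src[0]] is None:
            node[src[0]] = {"replace": None, "next": None}
        slot = node[src[0]]
        if len(src) == 1:
            if slot["replace"] is None:        # first definition wins, as in C
                slot["replace"] = replace_to
        else:
            slot["next"] = place(slot["next"], src[1:], replace_to)
        return node

    def find_replace(root, src):
        """Return (matchlen, replacement) for the longest match, or None."""
        best, node, i = None, root, 0
        while node is not None and i < len(src):
            slot = node[src[i]]
            i += 1
            if slot is None:
                break
            if slot["replace"] is not None:
                best = (i, slot["replace"])
            node = slot["next"]
        return best

    root = place(None, "À".encode("utf-8"), "A")
    print(find_replace(root, "À la carte".encode("utf-8")))  # (2, 'A')
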
diff --git a/contrib/unaccent/unaccent.control b/contrib/unaccent/unaccent.control
new file mode 100644
index 0000000..649cf68
--- /dev/null
+++ b/contrib/unaccent/unaccent.control
@@ -0,0 +1,6 @@
+# unaccent extension
+comment = 'text search dictionary that removes accents'
+default_version = '1.1'
+module_pathname = '$libdir/unaccent'
+relocatable = true
+trusted = true
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
new file mode 100644
index 0000000..1b5eb1b
--- /dev/null
+++ b/contrib/unaccent/unaccent.rules
@@ -0,0 +1,1613 @@
+¡ !
+© (C)
+« <<
+­ -
+® (R)
+± +/-
+» >>
+¼ 1/4
+½ 1/2
+¾ 3/4
+¿ ?
+À A
+Á A
+Â A
+Ã A
+Ä A
+Å A
+Æ AE
+Ç C
+È E
+É E
+Ê E
+Ë E
+Ì I
+Í I
+Î I
+Ï I
+Ð D
+Ñ N
+Ò O
+Ó O
+Ô O
+Õ O
+Ö O
+× *
+Ø O
+Ù U
+Ú U
+Û U
+Ü U
+Ý Y
+Þ TH
+ß ss
+à a
+á a
+â a
+ã a
+ä a
+å a
+æ ae
+ç c
+è e
+é e
+ê e
+ë e
+ì i
+í i
+î i
+ï i
+ð d
+ñ n
+ò o
+ó o
+ô o
+õ o
+ö o
+÷ /
+ø o
+ù u
+ú u
+û u
+ü u
+ý y
+þ th
+ÿ y
+Ā A
+ā a
+Ă A
+ă a
+Ą A
+ą a
+Ć C
+ć c
+Ĉ C
+ĉ c
+Ċ C
+ċ c
+Č C
+č c
+Ď D
+ď d
+Đ D
+đ d
+Ē E
+ē e
+Ĕ E
+ĕ e
+Ė E
+ė e
+Ę E
+ę e
+Ě E
+ě e
+Ĝ G
+ĝ g
+Ğ G
+ğ g
+Ġ G
+ġ g
+Ģ G
+ģ g
+Ĥ H
+ĥ h
+Ħ H
+ħ h
+Ĩ I
+ĩ i
+Ī I
+ī i
+Ĭ I
+ĭ i
+Į I
+į i
+İ I
+ı i
+IJ IJ
+ij ij
+Ĵ J
+ĵ j
+Ķ K
+ķ k
+ĸ q
+Ĺ L
+ĺ l
+Ļ L
+ļ l
+Ľ L
+ľ l
+Ŀ L
+ŀ l
+Ł L
+ł l
+Ń N
+ń n
+Ņ N
+ņ n
+Ň N
+ň n
+ʼn 'n
+Ŋ N
+ŋ n
+Ō O
+ō o
+Ŏ O
+ŏ o
+Ő O
+ő o
+Œ OE
+œ oe
+Ŕ R
+ŕ r
+Ŗ R
+ŗ r
+Ř R
+ř r
+Ś S
+ś s
+Ŝ S
+ŝ s
+Ş S
+ş s
+Š S
+š s
+Ţ T
+ţ t
+Ť T
+ť t
+Ŧ T
+ŧ t
+Ũ U
+ũ u
+Ū U
+ū u
+Ŭ U
+ŭ u
+Ů U
+ů u
+Ű U
+ű u
+Ų U
+ų u
+Ŵ W
+ŵ w
+Ŷ Y
+ŷ y
+Ÿ Y
+Ź Z
+ź z
+Ż Z
+ż z
+Ž Z
+ž z
+ſ s
+ƀ b
+Ɓ B
+Ƃ B
+ƃ b
+Ƈ C
+ƈ c
+Ɖ D
+Ɗ D
+Ƌ D
+ƌ d
+Ɛ E
+Ƒ F
+ƒ f
+Ɠ G
+ƕ hv
+Ɩ I
+Ɨ I
+Ƙ K
+ƙ k
+ƚ l
+Ɲ N
+ƞ n
+Ơ O
+ơ o
+Ƣ OI
+ƣ oi
+Ƥ P
+ƥ p
+ƫ t
+Ƭ T
+ƭ t
+Ʈ T
+Ư U
+ư u
+Ʋ V
+Ƴ Y
+ƴ y
+Ƶ Z
+ƶ z
+DŽ DZ
+Dž Dz
+dž dz
+LJ LJ
+Lj Lj
+lj lj
+NJ NJ
+Nj Nj
+nj nj
+Ǎ A
+ǎ a
+Ǐ I
+ǐ i
+Ǒ O
+ǒ o
+Ǔ U
+ǔ u
+Ǖ U
+ǖ u
+Ǘ U
+ǘ u
+Ǚ U
+ǚ u
+Ǜ U
+ǜ u
+Ǟ A
+ǟ a
+Ǡ A
+ǡ a
+Ǥ G
+ǥ g
+Ǧ G
+ǧ g
+Ǩ K
+ǩ k
+Ǫ O
+ǫ o
+Ǭ O
+ǭ o
+ǰ j
+DZ DZ
+Dz Dz
+dz dz
+Ǵ G
+ǵ g
+Ǹ N
+ǹ n
+Ǻ A
+ǻ a
+Ȁ A
+ȁ a
+Ȃ A
+ȃ a
+Ȅ E
+ȅ e
+Ȇ E
+ȇ e
+Ȉ I
+ȉ i
+Ȋ I
+ȋ i
+Ȍ O
+ȍ o
+Ȏ O
+ȏ o
+Ȑ R
+ȑ r
+Ȓ R
+ȓ r
+Ȕ U
+ȕ u
+Ȗ U
+ȗ u
+Ș S
+ș s
+Ț T
+ț t
+Ȟ H
+ȟ h
+ȡ d
+Ȥ Z
+ȥ z
+Ȧ A
+ȧ a
+Ȩ E
+ȩ e
+Ȫ O
+ȫ o
+Ȭ O
+ȭ o
+Ȯ O
+ȯ o
+Ȱ O
+ȱ o
+Ȳ Y
+ȳ y
+ȴ l
+ȵ n
+ȶ t
+ȷ j
+ȸ db
+ȹ qp
+Ⱥ A
+Ȼ C
+ȼ c
+Ƚ L
+Ⱦ T
+ȿ s
+ɀ z
+Ƀ B
+Ʉ U
+Ɇ E
+ɇ e
+Ɉ J
+ɉ j
+Ɍ R
+ɍ r
+Ɏ Y
+ɏ y
+ɓ b
+ɕ c
+ɖ d
+ɗ d
+ɛ e
+ɟ j
+ɠ g
+ɡ g
+ɢ G
+ɦ h
+ɧ h
+ɨ i
+ɪ I
+ɫ l
+ɬ l
+ɭ l
+ɱ m
+ɲ n
+ɳ n
+ɴ N
+ɶ OE
+ɼ r
+ɽ r
+ɾ r
+ʀ R
+ʂ s
+ʈ t
+ʉ u
+ʋ v
+ʏ Y
+ʐ z
+ʑ z
+ʙ B
+ʛ G
+ʜ H
+ʝ j
+ʟ L
+ʠ q
+ʣ dz
+ʥ dz
+ʦ ts
+ʪ ls
+ʫ lz
+ʹ '
+ʺ "
+ʻ '
+ʼ '
+ʽ '
+˂ <
+˃ >
+˄ ^
+ˆ ^
+ˈ '
+ˋ `
+ː :
+˖ +
+˗ -
+˜ ~
+̿
+Ά Α
+Έ Ε
+Ή Η
+Ί Ι
+Ό Ο
+Ύ Υ
+Ώ Ω
+ΐ ι
+Ϊ Ι
+Ϋ Υ
+ά α
+έ ε
+ή η
+ί ι
+ΰ υ
+ϊ ι
+ϋ υ
+ό ο
+ύ υ
+ώ ω
+Ё Е
+ё е
+ᴀ A
+ᴁ AE
+ᴃ B
+ᴄ C
+ᴅ D
+ᴆ D
+ᴇ E
+ᴊ J
+ᴋ K
+ᴌ L
+ᴍ M
+ᴏ O
+ᴘ P
+ᴛ T
+ᴜ U
+ᴠ V
+ᴡ W
+ᴢ Z
+ᵫ ue
+ᵬ b
+ᵭ d
+ᵮ f
+ᵯ m
+ᵰ n
+ᵱ p
+ᵲ r
+ᵳ r
+ᵴ s
+ᵵ t
+ᵶ z
+ᵺ th
+ᵻ I
+ᵽ p
+ᵾ U
+ᶀ b
+ᶁ d
+ᶂ f
+ᶃ g
+ᶄ k
+ᶅ l
+ᶆ m
+ᶇ n
+ᶈ p
+ᶉ r
+ᶊ s
+ᶌ v
+ᶍ x
+ᶎ z
+ᶏ a
+ᶑ d
+ᶒ e
+ᶓ e
+ᶖ i
+ᶙ u
+Ḁ A
+ḁ a
+Ḃ B
+ḃ b
+Ḅ B
+ḅ b
+Ḇ B
+ḇ b
+Ḉ C
+ḉ c
+Ḋ D
+ḋ d
+Ḍ D
+ḍ d
+Ḏ D
+ḏ d
+Ḑ D
+ḑ d
+Ḓ D
+ḓ d
+Ḕ E
+ḕ e
+Ḗ E
+ḗ e
+Ḙ E
+ḙ e
+Ḛ E
+ḛ e
+Ḝ E
+ḝ e
+Ḟ F
+ḟ f
+Ḡ G
+ḡ g
+Ḣ H
+ḣ h
+Ḥ H
+ḥ h
+Ḧ H
+ḧ h
+Ḩ H
+ḩ h
+Ḫ H
+ḫ h
+Ḭ I
+ḭ i
+Ḯ I
+ḯ i
+Ḱ K
+ḱ k
+Ḳ K
+ḳ k
+Ḵ K
+ḵ k
+Ḷ L
+ḷ l
+Ḹ L
+ḹ l
+Ḻ L
+ḻ l
+Ḽ L
+ḽ l
+Ḿ M
+ḿ m
+Ṁ M
+ṁ m
+Ṃ M
+ṃ m
+Ṅ N
+ṅ n
+Ṇ N
+ṇ n
+Ṉ N
+ṉ n
+Ṋ N
+ṋ n
+Ṍ O
+ṍ o
+Ṏ O
+ṏ o
+Ṑ O
+ṑ o
+Ṓ O
+ṓ o
+Ṕ P
+ṕ p
+Ṗ P
+ṗ p
+Ṙ R
+ṙ r
+Ṛ R
+ṛ r
+Ṝ R
+ṝ r
+Ṟ R
+ṟ r
+Ṡ S
+ṡ s
+Ṣ S
+ṣ s
+Ṥ S
+ṥ s
+Ṧ S
+ṧ s
+Ṩ S
+ṩ s
+Ṫ T
+ṫ t
+Ṭ T
+ṭ t
+Ṯ T
+ṯ t
+Ṱ T
+ṱ t
+Ṳ U
+ṳ u
+Ṵ U
+ṵ u
+Ṷ U
+ṷ u
+Ṹ U
+ṹ u
+Ṻ U
+ṻ u
+Ṽ V
+ṽ v
+Ṿ V
+ṿ v
+Ẁ W
+ẁ w
+Ẃ W
+ẃ w
+Ẅ W
+ẅ w
+Ẇ W
+ẇ w
+Ẉ W
+ẉ w
+Ẋ X
+ẋ x
+Ẍ X
+ẍ x
+Ẏ Y
+ẏ y
+Ẑ Z
+ẑ z
+Ẓ Z
+ẓ z
+Ẕ Z
+ẕ z
+ẖ h
+ẗ t
+ẘ w
+ẙ y
+ẚ a
+ẜ s
+ẝ s
+ẞ SS
+Ạ A
+ạ a
+Ả A
+ả a
+Ấ A
+ấ a
+Ầ A
+ầ a
+Ẩ A
+ẩ a
+Ẫ A
+ẫ a
+Ậ A
+ậ a
+Ắ A
+ắ a
+Ằ A
+ằ a
+Ẳ A
+ẳ a
+Ẵ A
+ẵ a
+Ặ A
+ặ a
+Ẹ E
+ẹ e
+Ẻ E
+ẻ e
+Ẽ E
+ẽ e
+Ế E
+ế e
+Ề E
+ề e
+Ể E
+ể e
+Ễ E
+ễ e
+Ệ E
+ệ e
+Ỉ I
+ỉ i
+Ị I
+ị i
+Ọ O
+ọ o
+Ỏ O
+ỏ o
+Ố O
+ố o
+Ồ O
+ồ o
+Ổ O
+ổ o
+Ỗ O
+ỗ o
+Ộ O
+ộ o
+Ớ O
+ớ o
+Ờ O
+ờ o
+Ở O
+ở o
+Ỡ O
+ỡ o
+Ợ O
+ợ o
+Ụ U
+ụ u
+Ủ U
+ủ u
+Ứ U
+ứ u
+Ừ U
+ừ u
+Ử U
+ử u
+Ữ U
+ữ u
+Ự U
+ự u
+Ỳ Y
+ỳ y
+Ỵ Y
+ỵ y
+Ỷ Y
+ỷ y
+Ỹ Y
+ỹ y
+Ỻ LL
+ỻ ll
+Ỽ V
+ỽ v
+Ỿ Y
+ỿ y
+ἀ α
+ἁ α
+ἂ α
+ἃ α
+ἄ α
+ἅ α
+ἆ α
+ἇ α
+Ἀ Α
+Ἁ Α
+Ἂ Α
+Ἃ Α
+Ἄ Α
+Ἅ Α
+Ἆ Α
+Ἇ Α
+ἐ ε
+ἑ ε
+ἒ ε
+ἓ ε
+ἔ ε
+ἕ ε
+Ἐ Ε
+Ἑ Ε
+Ἒ Ε
+Ἓ Ε
+Ἔ Ε
+Ἕ Ε
+ἠ η
+ἡ η
+ἢ η
+ἣ η
+ἤ η
+ἥ η
+ἦ η
+ἧ η
+Ἠ Η
+Ἡ Η
+Ἢ Η
+Ἣ Η
+Ἤ Η
+Ἥ Η
+Ἦ Η
+Ἧ Η
+ἰ ι
+ἱ ι
+ἲ ι
+ἳ ι
+ἴ ι
+ἵ ι
+ἶ ι
+ἷ ι
+Ἰ Ι
+Ἱ Ι
+Ἲ Ι
+Ἳ Ι
+Ἴ Ι
+Ἵ Ι
+Ἶ Ι
+Ἷ Ι
+ὀ ο
+ὁ ο
+ὂ ο
+ὃ ο
+ὄ ο
+ὅ ο
+Ὀ Ο
+Ὁ Ο
+Ὂ Ο
+Ὃ Ο
+Ὄ Ο
+Ὅ Ο
+ὐ υ
+ὑ υ
+ὒ υ
+ὓ υ
+ὔ υ
+ὕ υ
+ὖ υ
+ὗ υ
+Ὑ Υ
+Ὓ Υ
+Ὕ Υ
+Ὗ Υ
+ὠ ω
+ὡ ω
+ὢ ω
+ὣ ω
+ὤ ω
+ὥ ω
+ὦ ω
+ὧ ω
+Ὠ Ω
+Ὡ Ω
+Ὢ Ω
+Ὣ Ω
+Ὤ Ω
+Ὥ Ω
+Ὦ Ω
+Ὧ Ω
+ὰ α
+ὲ ε
+ὴ η
+ὶ ι
+ὸ ο
+ὺ υ
+ὼ ω
+ᾀ α
+ᾁ α
+ᾂ α
+ᾃ α
+ᾄ α
+ᾅ α
+ᾆ α
+ᾇ α
+ᾈ Α
+ᾉ Α
+ᾊ Α
+ᾋ Α
+ᾌ Α
+ᾍ Α
+ᾎ Α
+ᾏ Α
+ᾐ η
+ᾑ η
+ᾒ η
+ᾓ η
+ᾔ η
+ᾕ η
+ᾖ η
+ᾗ η
+ᾘ Η
+ᾙ Η
+ᾚ Η
+ᾛ Η
+ᾜ Η
+ᾝ Η
+ᾞ Η
+ᾟ Η
+ᾠ ω
+ᾡ ω
+ᾢ ω
+ᾣ ω
+ᾤ ω
+ᾥ ω
+ᾦ ω
+ᾧ ω
+ᾨ Ω
+ᾩ Ω
+ᾪ Ω
+ᾫ Ω
+ᾬ Ω
+ᾭ Ω
+ᾮ Ω
+ᾯ Ω
+ᾰ α
+ᾱ α
+ᾲ α
+ᾳ α
+ᾴ α
+ᾶ α
+ᾷ α
+Ᾰ Α
+Ᾱ Α
+Ὰ Α
+ᾼ Α
+ῂ η
+ῃ η
+ῄ η
+ῆ η
+ῇ η
+Ὲ Ε
+Ὴ Η
+ῌ Η
+ῐ ι
+ῑ ι
+ῒ ι
+ῖ ι
+ῗ ι
+Ῐ Ι
+Ῑ Ι
+Ὶ Ι
+ῠ υ
+ῡ υ
+ῢ υ
+ῤ ρ
+ῥ ρ
+ῦ υ
+ῧ υ
+Ῠ Υ
+Ῡ Υ
+Ὺ Υ
+Ῥ Ρ
+ῲ ω
+ῳ ω
+ῴ ω
+ῶ ω
+ῷ ω
+Ὸ Ο
+Ὼ Ω
+ῼ Ω
+‐ -
+‑ -
+‒ -
+– -
+— -
+― -
+‖ ||
+‘ '
+’ '
+‚ ,
+‛ '
+“ "
+” "
+„ ,,
+‟ "
+․ .
+‥ ..
+… ...
+′ '
+″ "
+‹ <
+› >
+‼ !!
+⁄ /
+⁅ [
+⁆ ]
+⁇ ??
+⁈ ?!
+⁉ !?
+⁎ *
+₠ CE
+₢ Cr
+₣ Fr.
+₤ L.
+₧ Pts
+₹ Rs
+₺ TL
+⃝
+⃞
+⃟
+⃠
+⃢
+⃣
+⃤
+℀ a/c
+℁ a/s
+ℂ C
+℃ °C
+℅ c/o
+℆ c/u
+℉ °F
+ℊ g
+ℋ H
+ℌ x
+ℍ H
+ℎ h
+ℐ I
+ℑ I
+ℒ L
+ℓ l
+ℕ N
+№ No
+℗ (P)
+℘ P
+ℙ P
+ℚ Q
+ℛ R
+ℜ R
+ℝ R
+℞ Rx
+℡ TEL
+ℤ Z
+ℨ Z
+ℬ B
+ℭ C
+ℯ e
+ℰ E
+ℱ F
+ℳ M
+ℴ o
+ℹ i
+℻ FAX
+ⅅ D
+ⅆ d
+ⅇ e
+ⅈ i
+ⅉ j
+⅐ 1/7
+⅑ 1/9
+⅒ 1/10
+⅓ 1/3
+⅔ 2/3
+⅕ 1/5
+⅖ 2/5
+⅗ 3/5
+⅘ 4/5
+⅙ 1/6
+⅚ 5/6
+⅛ 1/8
+⅜ 3/8
+⅝ 5/8
+⅞ 7/8
+⅟ 1/
+Ⅰ I
+Ⅱ II
+Ⅲ III
+Ⅳ IV
+Ⅴ V
+Ⅵ VI
+Ⅶ VII
+Ⅷ VIII
+Ⅸ IX
+Ⅹ X
+Ⅺ XI
+Ⅻ XII
+Ⅼ L
+Ⅽ C
+Ⅾ D
+Ⅿ M
+ⅰ i
+ⅱ ii
+ⅲ iii
+ⅳ iv
+ⅴ v
+ⅵ vi
+ⅶ vii
+ⅷ viii
+ⅸ ix
+ⅹ x
+ⅺ xi
+ⅻ xii
+ⅼ l
+ⅽ c
+ⅾ d
+ⅿ m
+↉ 0/3
+− -
+∕ /
+∖ \
+∣ |
+∥ ||
+≪ <<
+≫ >>
+⑴ (1)
+⑵ (2)
+⑶ (3)
+⑷ (4)
+⑸ (5)
+⑹ (6)
+⑺ (7)
+⑻ (8)
+⑼ (9)
+⑽ (10)
+⑾ (11)
+⑿ (12)
+⒀ (13)
+⒁ (14)
+⒂ (15)
+⒃ (16)
+⒄ (17)
+⒅ (18)
+⒆ (19)
+⒇ (20)
+⒈ 1.
+⒉ 2.
+⒊ 3.
+⒋ 4.
+⒌ 5.
+⒍ 6.
+⒎ 7.
+⒏ 8.
+⒐ 9.
+⒑ 10.
+⒒ 11.
+⒓ 12.
+⒔ 13.
+⒕ 14.
+⒖ 15.
+⒗ 16.
+⒘ 17.
+⒙ 18.
+⒚ 19.
+⒛ 20.
+⒜ (a)
+⒝ (b)
+⒞ (c)
+⒟ (d)
+⒠ (e)
+⒡ (f)
+⒢ (g)
+⒣ (h)
+⒤ (i)
+⒥ (j)
+⒦ (k)
+⒧ (l)
+⒨ (m)
+⒩ (n)
+⒪ (o)
+⒫ (p)
+⒬ (q)
+⒭ (r)
+⒮ (s)
+⒯ (t)
+⒰ (u)
+⒱ (v)
+⒲ (w)
+⒳ (x)
+⒴ (y)
+⒵ (z)
+⦅ ((
+⦆ ))
+⩴ ::=
+⩵ ==
+⩶ ===
+Ⱡ L
+ⱡ l
+Ɫ L
+Ᵽ P
+Ɽ R
+ⱥ a
+ⱦ t
+Ⱨ H
+ⱨ h
+Ⱪ K
+ⱪ k
+Ⱬ Z
+ⱬ z
+Ɱ M
+ⱱ v
+Ⱳ W
+ⱳ w
+ⱴ v
+ⱸ e
+ⱺ o
+Ȿ S
+Ɀ Z
+、 ,
+。 .
+〇 0
+〈 <
+〉 >
+《 <<
+》 >>
+〔 [
+〕 ]
+〘 [
+〙 ]
+〚 [
+〛 ]
+〝 "
+〞 "
+㍱ hPa
+㍲ da
+㍳ AU
+㍴ bar
+㍵ oV
+㍶ pc
+㍷ dm
+㍺ IU
+㎀ pA
+㎁ nA
+㎃ mA
+㎄ kA
+㎅ KB
+㎆ MB
+㎇ GB
+㎈ cal
+㎉ kcal
+㎊ pF
+㎋ nF
+㎎ mg
+㎏ kg
+㎐ Hz
+㎑ kHz
+㎒ MHz
+㎓ GHz
+㎔ THz
+㎙ fm
+㎚ nm
+㎜ mm
+㎝ cm
+㎞ km
+㎧ m/s
+㎩ Pa
+㎪ kPa
+㎫ MPa
+㎬ GPa
+㎭ rad
+㎮ rad/s
+㎰ ps
+㎱ ns
+㎳ ms
+㎴ pV
+㎵ nV
+㎷ mV
+㎸ kV
+㎹ MV
+㎺ pW
+㎻ nW
+㎽ mW
+㎾ kW
+㎿ MW
+㏂ a.m.
+㏃ Bq
+㏄ cc
+㏅ cd
+㏆ C/kg
+㏇ Co.
+㏈ dB
+㏉ Gy
+㏊ ha
+㏋ HP
+㏌ in
+㏍ KK
+㏎ KM
+㏏ kt
+㏐ lm
+㏑ ln
+㏒ log
+㏓ lx
+㏔ mb
+㏕ mil
+㏖ mol
+㏗ pH
+㏘ p.m.
+㏙ PPM
+㏚ PR
+㏛ sr
+㏜ Sv
+㏝ Wb
+㏞ V/m
+㏟ A/m
+ꜰ F
+ꜱ S
+Ꜳ AA
+ꜳ aa
+Ꜵ AO
+ꜵ ao
+Ꜷ AU
+ꜷ au
+Ꜹ AV
+ꜹ av
+Ꜻ AV
+ꜻ av
+Ꜽ AY
+ꜽ ay
+Ꝁ K
+ꝁ k
+Ꝃ K
+ꝃ k
+Ꝅ K
+ꝅ k
+Ꝇ L
+ꝇ l
+Ꝉ L
+ꝉ l
+Ꝋ O
+ꝋ o
+Ꝍ O
+ꝍ o
+Ꝏ OO
+ꝏ oo
+Ꝑ P
+ꝑ p
+Ꝓ P
+ꝓ p
+Ꝕ P
+ꝕ p
+Ꝗ Q
+ꝗ q
+Ꝙ Q
+ꝙ q
+Ꝟ V
+ꝟ v
+Ꝡ VY
+ꝡ vy
+Ꝥ TH
+ꝥ th
+Ꝧ TH
+ꝧ th
+ꝱ d
+ꝲ l
+ꝳ m
+ꝴ n
+ꝵ r
+ꝶ R
+ꝷ t
+Ꝺ D
+ꝺ d
+Ꝼ F
+ꝼ f
+Ꞇ T
+ꞇ t
+Ꞑ N
+ꞑ n
+Ꞓ C
+ꞓ c
+Ꞡ G
+ꞡ g
+Ꞣ K
+ꞣ k
+Ꞥ N
+ꞥ n
+Ꞧ R
+ꞧ r
+Ꞩ S
+ꞩ s
+Ɦ H
+ff ff
+fi fi
+fl fl
+ffi ffi
+ffl ffl
+ſt st
+st st
+︐ ,
+︑ ,
+︒ .
+︓ :
+︔ ;
+︕ !
+︖ ?
+︙ ...
+︰ ..
+︱ -
+︲ -
+︵ (
+︶ )
+︷ {
+︸ }
+︹ [
+︺ ]
+︽ <<
+︾ >>
+︿ <
+﹀ >
+﹇ [
+﹈ ]
+﹐ ,
+﹑ ,
+﹒ .
+﹔ ;
+﹕ :
+﹖ ?
+﹗ !
+﹘ -
+﹙ (
+﹚ )
+﹛ {
+﹜ }
+﹝ [
+﹞ ]
+﹟ #
+﹠ &
+﹡ *
+﹢ +
+﹣ -
+﹤ <
+﹥ >
+﹦ =
+﹨ \
+﹩ $
+﹪ %
+﹫ @
+! !
+" "
+# #
+$ $
+% %
+& &
+' '
+( (
+) )
+* *
++ +
+, ,
+- -
+. .
+/ /
+0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+: :
+; ;
+< <
+= =
+> >
+? ?
+@ @
+A A
+B B
+C C
+D D
+E E
+F F
+G G
+H H
+I I
+J J
+K K
+L L
+M M
+N N
+O O
+P P
+Q Q
+R R
+S S
+T T
+U U
+V V
+W W
+X X
+Y Y
+Z Z
+[ [
+\ \
+] ]
+^ ^
+_ _
+` `
+a a
+b b
+c c
+d d
+e e
+f f
+g g
+h h
+i i
+j j
+k k
+l l
+m m
+n n
+o o
+p p
+q q
+r r
+s s
+t t
+u u
+v v
+w w
+x x
+y y
+z z
+{ {
+| |
+} }
+~ ~
+⦅ ((
+⦆ ))
+。 .
+、 ,
+← <-
+→ ->