10 files changed, 2555 insertions, 0 deletions
diff --git a/contrib/unaccent/.gitignore b/contrib/unaccent/.gitignore
new file mode 100644
index 0000000..bccda73
--- /dev/null
+++ b/contrib/unaccent/.gitignore
@@ -0,0 +1,7 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
+
+# Downloaded files
+/Latin-ASCII.xml
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile
new file mode 100644
index 0000000..b8307d1
--- /dev/null
+++ b/contrib/unaccent/Makefile
@@ -0,0 +1,47 @@
+# contrib/unaccent/Makefile
+
+MODULE_big = unaccent
+OBJS = \
+	$(WIN32RES) \
+	unaccent.o
+
+EXTENSION = unaccent
+DATA = unaccent--1.1.sql unaccent--1.0--1.1.sql
+DATA_TSEARCH = unaccent.rules
+PGFILEDESC = "unaccent - text search dictionary that removes accents"
+
+REGRESS = unaccent
+
+# The regression test feeds non-ASCII input, so it needs a UTF8 database
+ENCODING = UTF8
+NO_LOCALE = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/unaccent
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+update-unicode: unaccent.rules
+# Allow running this even without --with-python
+PYTHON ?= python
+
+# Write to a temporary file first: a failed run must not truncate unaccent.rules
+unaccent.rules: generate_unaccent_rules.py ../../src/common/unicode/UnicodeData.txt Latin-ASCII.xml
+	$(PYTHON) $< --unicode-data-file $(word 2,$^) --latin-ascii-file $(word 3,$^) >$@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }
+
+# Only download it once; dependencies must match src/common/unicode/
+../../src/common/unicode/UnicodeData.txt: $(top_builddir)/src/Makefile.global
+	$(MAKE) -C $(@D) $(@F)
+
+# Dependency on Makefile.global is for CLDR_VERSION
+Latin-ASCII.xml: $(top_builddir)/src/Makefile.global
+	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/cldr/release-$(subst .,-,$(CLDR_VERSION))/common/transforms/Latin-ASCII.xml
+
+distclean:
+	rm -f Latin-ASCII.xml unaccent.rules.tmp
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
new file mode 100644
index 0000000..c1bd7cd
--- /dev/null
+++ b/contrib/unaccent/expected/unaccent.out
@@ -0,0 +1,99 @@
+CREATE EXTENSION unaccent;
+-- must have a UTF8 database
+SELECT getdatabaseencoding();
+ getdatabaseencoding 
+---------------------
+ UTF8
+(1 row)
+
+SET client_encoding TO 'UTF8';
+SELECT unaccent('foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('ёлка');
+ unaccent 
+----------
+ елка
+(1 row)
+
+SELECT unaccent('ЁЖИК');
+ unaccent 
+----------
+ ЕЖИК
+(1 row)
+
+SELECT unaccent('˃˖˗˜');
+ unaccent 
+----------
+ >+-~
+(1 row)
+
+SELECT unaccent('À');  -- Remove combining diacritical 0x0300
+ unaccent 
+----------
+ A
+(1 row)
+
+SELECT unaccent('unaccent', 'foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('unaccent', 'ёлка');
+ unaccent 
+----------
+ елка
+(1 row)
+
+SELECT unaccent('unaccent', 'ЁЖИК');
+ unaccent 
+----------
+ ЕЖИК
+(1 row)
+
+SELECT unaccent('unaccent', '˃˖˗˜');
+ unaccent 
+----------
+ >+-~
+(1 row)
+
+SELECT unaccent('unaccent', 'À');
+ unaccent 
+----------
+ A
+(1 row)
+
+SELECT ts_lexize('unaccent', 'foobar');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('unaccent', 'ёлка');
+ ts_lexize 
+-----------
+ {елка}
+(1 row)
+
+SELECT ts_lexize('unaccent', 'ЁЖИК');
+ ts_lexize 
+-----------
+ {ЕЖИК}
+(1 row)
+
+SELECT ts_lexize('unaccent', '˃˖˗˜');
+ ts_lexize 
+-----------
+ {>+-~}
+(1 row)
+
+SELECT ts_lexize('unaccent', 'À');
+ ts_lexize 
+-----------
+ {A}
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
new file mode 100644
index 0000000..a952de5
--- /dev/null
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -0,0 +1,291 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# This script writes unaccent.rules to standard output, given the paths of
+# UnicodeData.txt [1] and Latin-ASCII.xml [2] as command line arguments.
+# By default it also expands ligatures and applies the Unicode CLDR
+# Latin-ASCII transliterator; both can be disabled with the
+# "--no-ligatures-expansion" command line option.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+#
+# This approach handles most letters with diacritical marks and some
+# ligatures.  However, several characters (notably a majority of
+# ligatures) don't have decomposition. To handle all these cases, one can
+# use a standard Unicode transliterator available in Common Locale Data
+# Repository (CLDR): Latin-ASCII.  This transliterator associates Unicode
+# characters to ASCII-range equivalent.  Unless "--no-ligatures-expansion"
+# option is enabled, the XML file of this transliterator [2] -- given as a
+# command line argument -- will be parsed and used.
+#
+# Ideally you should use the latest release for each data set.  This
+# script is compatible with at least CLDR release 29.
+#
+# [1] https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/UnicodeData.txt
+# [2] https://raw.githubusercontent.com/unicode-org/cldr/${TAG}/common/transforms/Latin-ASCII.xml
+
+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+# The approach is to be Python3 compatible with Python2 "backports".
+from __future__ import print_function
+from __future__ import unicode_literals
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+
+import argparse
+import codecs
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+if sys.version_info[0] <= 2:
+    # Encode stdout as UTF-8, so we can just print to it
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
+
+    # Rebind chr to unichr, so chr() is usable for every code point we print
+    chr = unichr
+
+    # Shim so that bytes(text, encoding) can be called the Python 3 way
+    def bytes(source, encoding='ascii', errors='strict'):
+        return source.encode(encoding=encoding, errors=errors)
+else:
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
+
+# The ranges of Unicode characters that we consider to be "plain letters".
+# For now we are being conservative by including only Latin and Greek.  This
+# could be extended in future based on feedback from people with relevant
+# language knowledge.
+PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
+                       (ord('A'), ord('Z')), # Latin upper case
+                       (0x03b1, 0x03c9),     # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
+                       (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
+
+# Combining marks follow a "base" character, and result in a composite
+# character. Example: "U&'A\0300'" produces "À". There are three types of
+# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
+# combining (Mc). We identify the ranges of marks we feel safe removing.
+# References:
+#   https://en.wikipedia.org/wiki/Combining_character
+#   https://www.unicode.org/charts/PDF/U0300.pdf
+#   https://www.unicode.org/charts/PDF/U20D0.pdf
+COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
+                         (0x20dd, 0x20E0),  # Me: Symbols
+                         (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
+
+def print_record(codepoint, letter):
+    """Print one rule line: the character, then a tab and its replacement."""
+    record = chr(codepoint)
+    if letter:
+        record += "\t" + letter
+
+    print(record)
+
+class Codepoint:
+    """One UnicodeData.txt entry: code point, category and decomposition."""
+    def __init__(self, id, general_category, combining_ids):
+        self.id, self.general_category = id, general_category
+        self.combining_ids = combining_ids
+
+def is_mark_to_remove(codepoint):
+    """Tell whether codepoint is a combining mark that is safe to strip."""
+    # Being inside a listed range is not enough; it must really be a mark.
+    if not is_mark(codepoint):
+        return False
+
+    # COMBINING_MARK_RANGES holds inclusive (first, last) pairs.
+    return any(first <= codepoint.id <= last
+               for first, last in COMBINING_MARK_RANGES)
+
+def is_plain_letter(codepoint):
+    """Tell whether codepoint lies in one of the "plain letter" ranges."""
+    # PLAIN_LETTER_RANGES holds inclusive (first, last) pairs.
+    cp = codepoint.id
+    return any(first <= cp <= last
+               for first, last in PLAIN_LETTER_RANGES)
+
+def is_mark(codepoint):
+    """Tell whether codepoint's general category is a mark (Mn, Me or Mc)."""
+    return codepoint.general_category in {"Mn", "Me", "Mc"}
+
+def is_letter_with_marks(codepoint, table):
+    """Tell whether codepoint decomposes into a letter plus one or more marks."""
+    # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    parts = codepoint.combining_ids
+
+    # With fewer than two elements in the decomposition there is nothing
+    # after the base, hence no mark.
+    if len(parts) < 2:
+        return False
+
+    # At least one of the elements after the base has to be a mark.
+    if not any(is_mark(table[part]) for part in parts[1:]):
+        return False
+
+    # The base itself must be a plain letter, or (recursively) a letter
+    # with marks of its own.
+    base = table[parts[0]]
+    if is_plain_letter(base):
+        return True
+    return is_letter_with_marks(base, table)
+
+def is_letter(codepoint, table):
+    """Tell whether codepoint is a letter, bare or carrying diacritical marks."""
+    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the unaccented base codepoint of a letter.
+
+    A letter with marks is resolved through the first element of its
+    decomposition, recursively if that element decomposes further; a
+    plain letter is returned as is."""
+    if is_letter_with_marks(codepoint, table):
+        base = table[codepoint.combining_ids[0]]
+        if len(base.combining_ids) > 1:
+            return get_plain_letter(base, table)
+        if is_plain_letter(base):
+            return base
+    elif is_plain_letter(codepoint):
+        return codepoint
+
+    # Should not come here
+    assert False
+
+def is_ligature(codepoint, table):
+    """Tell whether every element of codepoint's decomposition is a letter."""
+    return all(is_letter(table[part], table) for part in codepoint.combining_ids)
+
+def get_plain_letters(codepoint, table):
+    """Return the plain base letter of each component of a ligature."""
+    assert is_ligature(codepoint, table)
+    return [get_plain_letter(table[part], table) for part in codepoint.combining_ids]
+
+def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
+    """Parse the XML file and return a set of tuples (src, trg), where "src"
+    is the code point of the original character and "trg" the substitute."""
+    charactersSet = set()
+
+    # RegEx to parse rules
+    rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+
+    # construct tree from XML
+    transliterationTree = ET.parse(latinAsciiFilePath)
+    transliterationTreeRoot = transliterationTree.getroot()
+
+    # Fetch all the transliteration rules.  Since release 29 of Latin-ASCII.xml
+    # all the transliteration rules are located in a single tRule block with
+    # all rules separated into separate lines.
+    blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule")
+    assert(len(blockRules) == 1)
+
+    # Split the block of rules into one element per line.
+    rules = blockRules[0].text.splitlines()
+
+    # And finish the processing of each individual rule.
+    for rule in rules:
+        matches = rulePattern.search(rule)
+
+        # The regular expression captures four groups corresponding
+        # to the characters.
+        #
+        # Group 1: plain "src" char. Empty if group 2 is not.
+        # Group 2: unicode-escaped "src" char (e.g. "\u0110"). Empty if group 1 is not.
+        #
+        # Group 3: "trg" string between quotes. Empty if group 4 is not.
+        # Group 4: plain (unquoted) "trg" string. Empty if group 3 is not.
+        if matches is not None:
+            src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape')
+            trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
+
+            # "'" and """ are escaped
+            trg = trg.replace("\\'", "'").replace('\\"', '"')
+
+            # the parser of unaccent only accepts non-whitespace characters
+            # for "src" and "trg" (see unaccent.c)
+            if not src.isspace() and not trg.isspace():
+                charactersSet.add((ord(src), trg))
+
+    return charactersSet
+
+def special_cases():
+    """Return the hand-maintained mappings that no other source provides."""
+    cyrillic = {
+        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
+        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO
+    }
+
+    # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
+    letterlike = {
+        (0x2103, u"\xb0C"),   # DEGREE CELSIUS
+        (0x2109, u"\xb0F"),   # DEGREE FAHRENHEIT
+        (0x2117, "(P)"),      # SOUND RECORDING COPYRIGHT
+    }
+    return cyrillic | letterlike
+
+def main(args):
+    """Collect all mappings from the input files and print them as rules."""
+    # https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    table = {}
+    codepoints = []
+
+    # a set, so that identical (source, replacement) pairs are kept only once
+    charactersSet = set()
+
+    # read everything we need from UnicodeData.txt into memory
+    with codecs.open(
+      args.unicodeDataFilePath, mode='r', encoding='UTF-8',
+      ) as unicodeDataFile:
+        for line in unicodeDataFile:
+            fields = line.split(";")
+            if len(fields) > 5:
+                # https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+                general_category = fields[2]
+                decomposition = fields[5]
+                decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
+                id = int(fields[0], 16)
+                combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+                codepoint = Codepoint(id, general_category, combining_ids)
+                table[id] = codepoint
+                codepoints.append(codepoint)
+
+    # walk through all the codepoints looking for interesting mappings
+    for codepoint in codepoints:
+        if codepoint.general_category.startswith('L') and \
+           len(codepoint.combining_ids) > 1:
+            if is_letter_with_marks(codepoint, table):
+                charactersSet.add((codepoint.id,
+                             chr(get_plain_letter(codepoint, table).id)))
+            elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
+                charactersSet.add((codepoint.id,
+                             "".join(chr(combining_codepoint.id)
+                                     for combining_codepoint \
+                                     in get_plain_letters(codepoint, table))))
+        elif is_mark_to_remove(codepoint):
+            charactersSet.add((codepoint.id, None))
+
+    # add CLDR Latin-ASCII characters
+    if not args.noLigaturesExpansion:
+        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
+        charactersSet |= special_cases()
+
+    # sort on (codepoint, replacement) so output never depends on set order
+    charactersList = sorted(charactersSet, key=lambda pair: (pair[0], pair[1] or ""))
+
+    for characterPair in charactersList:
+        print_record(characterPair[0], characterPair[1])
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
+    args = parser.parse_args()
+    # --latin-ascii-file is only required when ligature expansion is enabled
+    if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None:
+        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.\n')
+        sys.exit(1)
+
+    main(args)
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
new file mode 100644
index 0000000..2ae097f
--- /dev/null
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -0,0 +1,24 @@
+CREATE EXTENSION unaccent;
+
+-- must have a UTF8 database
+SELECT getdatabaseencoding();
+
+SET client_encoding TO 'UTF8';
+
+SELECT unaccent('foobar');
+SELECT unaccent('ёлка');
+SELECT unaccent('ЁЖИК');
+SELECT unaccent('˃˖˗˜');
+SELECT unaccent('À');  -- Remove combining diacritical 0x0300
+
+SELECT unaccent('unaccent', 'foobar');
+SELECT unaccent('unaccent', 'ёлка');
+SELECT unaccent('unaccent', 'ЁЖИК');
+SELECT unaccent('unaccent', '˃˖˗˜');
+SELECT unaccent('unaccent', 'À');
+
+SELECT ts_lexize('unaccent', 'foobar');
+SELECT ts_lexize('unaccent', 'ёлка');
+SELECT ts_lexize('unaccent', 'ЁЖИК');
+SELECT ts_lexize('unaccent', '˃˖˗˜');
+SELECT ts_lexize('unaccent', 'À');
diff --git a/contrib/unaccent/unaccent--1.0--1.1.sql b/contrib/unaccent/unaccent--1.0--1.1.sql
new file mode 100644
index 0000000..8efa0d0
--- /dev/null
+++ b/contrib/unaccent/unaccent--1.0--1.1.sql
@@ -0,0 +1,9 @@
+/* contrib/unaccent/unaccent--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION unaccent UPDATE TO '1.1'" to load this file. \quit
+-- This update script only marks the extension's functions PARALLEL SAFE
+ALTER FUNCTION unaccent(regdictionary, text) PARALLEL SAFE;
+ALTER FUNCTION unaccent(text) PARALLEL SAFE;
+ALTER FUNCTION unaccent_init(internal) PARALLEL SAFE;
+ALTER FUNCTION unaccent_lexize(internal, internal, internal, internal) PARALLEL SAFE;
diff --git a/contrib/unaccent/unaccent--1.1.sql b/contrib/unaccent/unaccent--1.1.sql
new file mode 100644
index 0000000..ecc8651
--- /dev/null
+++ b/contrib/unaccent/unaccent--1.1.sql
@@ -0,0 +1,34 @@
+/* contrib/unaccent/unaccent--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION unaccent" to load this file. \quit
+-- unaccent(dictionary, text): filter the text through the given dictionary
+CREATE FUNCTION unaccent(regdictionary, text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C STABLE STRICT PARALLEL SAFE;
+-- unaccent(text): same, using the "unaccent" dictionary in this function's schema
+CREATE FUNCTION unaccent(text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C STABLE STRICT PARALLEL SAFE;
+
+CREATE FUNCTION unaccent_init(internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_init'
+	LANGUAGE C PARALLEL SAFE;
+
+CREATE FUNCTION unaccent_lexize(internal,internal,internal,internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_lexize'
+	LANGUAGE C PARALLEL SAFE;
+
+CREATE TEXT SEARCH TEMPLATE unaccent (
+	INIT = unaccent_init,
+	LEXIZE = unaccent_lexize
+);
+-- dictionary built on that template; RULES is the rules file's base name
+CREATE TEXT SEARCH DICTIONARY unaccent (
+	TEMPLATE = unaccent,
+	RULES    = 'unaccent'
+);
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
new file mode 100644
index 0000000..0047efc
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,434 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ *	  Text search unaccent dictionary
+ *
+ * Copyright (c) 2009-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/unaccent/unaccent.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_ts_dict.h"
+#include "commands/defrem.h"
+#include "lib/stringinfo.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * An unaccent dictionary uses a trie to find a string to replace.  Each node
+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
+ * array corresponds to next byte value N.  That element can contain both a
+ * replacement string (to be used if the source string ends with this byte)
+ * and a link to another trie node (to be followed if there are more bytes).
+ *
+ * Note that the trie search logic pays no attention to multibyte character
+ * boundaries.  This is OK as long as both the data entered into the trie and
+ * the data we're trying to look up are validly encoded; no partial-character
+ * matches will occur.
+ */
+typedef struct TrieChar
+{
+	struct TrieChar *nextChar;	/* next trie node (256 entries), or NULL */
+	char	   *replaceTo;		/* replacement bytes, NULL if no rule ends here */
+	int			replacelen;		/* length of replaceTo in bytes */
+} TrieChar;
+
+/*
+ * placeChar - enter str into the trie, one byte per level.
+ *
+ * Missing levels are created while walking down.  Returns the root: a newly
+ * made node if node was NULL, otherwise node itself.
+ */
+static TrieChar *
+placeChar(TrieChar *node, const unsigned char *str, int lenstr,
+		  const char *replaceTo, int replacelen)
+{
+	TrieChar  **link = &node;
+	TrieChar   *slot;
+
+	for (;;)
+	{
+		if (*link == NULL)
+			*link = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
+		Assert(lenstr > 0);		/* else str[0] doesn't exist */
+		slot = *link + *str;
+		if (lenstr <= 1)
+			break;
+		link = &slot->nextChar;
+		str++;
+		lenstr--;
+	}
+
+	/* slot belongs to the last byte; an earlier rule for it is kept */
+	if (slot->replaceTo)
+		ereport(WARNING,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("duplicate source strings, first one will be used")));
+	else
+	{
+		slot->replacelen = replacelen;
+		slot->replaceTo = (char *) palloc(replacelen);
+		memcpy(slot->replaceTo, replaceTo, replacelen);
+	}
+
+	return node;
+}
+
+/*
+ * initTrie  - create trie from file.
+ *
+ * The UTF8-encoded file is converted to the current encoding line by line.
+ */
+static TrieChar *
+initTrie(const char *filename)
+{
+	TrieChar   *volatile rootTrie = NULL;
+	MemoryContext ccxt = CurrentMemoryContext;
+	tsearch_readline_state trst;
+	volatile bool skip;
+
+	filename = get_tsearch_config_filename(filename, "rules");
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open unaccent file \"%s\": %m",
+						filename)));
+
+	do
+	{
+		/*
+		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
+		 * raise an error if it finds characters that cannot be translated
+		 * to the current encoding. We skip such lines and go on to the next.
+		 */
+		skip = true;
+
+		PG_TRY();
+		{
+			char	   *line;
+
+			while ((line = tsearch_readline(&trst)) != NULL)
+			{
+				/*----------
+				 * The format of each line must be "src" or "src trg", where
+				 * src and trg are sequences of one or more non-whitespace
+				 * characters, separated by whitespace.  Whitespace at start
+				 * or end of line is ignored.  If trg is omitted, an empty
+				 * string is used as the replacement.
+				 *
+				 * We use a simple state machine, with states
+				 *	0	initial (before src)
+				 *	1	in src
+				 *	2	in whitespace after src
+				 *	3	in trg
+				 *	4	in whitespace after trg
+				 *	-1	syntax error detected
+				 *----------
+				 */
+				int			state;
+				char	   *ptr;
+				char	   *src = NULL;
+				char	   *trg = NULL;
+				int			ptrlen;
+				int			srclen = 0;
+				int			trglen = 0;
+
+				state = 0;
+				for (ptr = line; *ptr; ptr += ptrlen)
+				{
+					ptrlen = pg_mblen(ptr);
+					/* ignore whitespace, but end src or trg */
+					if (t_isspace(ptr))
+					{
+						if (state == 1)
+							state = 2;
+						else if (state == 3)
+							state = 4;
+						continue;
+					}
+					switch (state)
+					{
+						case 0:
+							/* start of src */
+							src = ptr;
+							srclen = ptrlen;
+							state = 1;
+							break;
+						case 1:
+							/* continue src */
+							srclen += ptrlen;
+							break;
+						case 2:
+							/* start of trg */
+							trg = ptr;
+							trglen = ptrlen;
+							state = 3;
+							break;
+						case 3:
+							/* continue trg */
+							trglen += ptrlen;
+							break;
+						default:
+							/* a third string (state 4), or already in error */
+							state = -1;
+							break;
+					}
+				}
+
+				if (state == 1 || state == 2)
+				{
+					/* trg was omitted, so use "" */
+					trg = "";
+					trglen = 0;
+				}
+
+				if (state > 0)
+					rootTrie = placeChar(rootTrie,
+										 (unsigned char *) src, srclen,
+										 trg, trglen);
+				else if (state < 0)
+					ereport(WARNING,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid syntax: more than two strings in unaccent rule")));
+
+				pfree(line);
+			}
+			skip = false;		/* reached end of file without an error */
+		}
+		PG_CATCH();
+		{
+			ErrorData  *errdata;
+			MemoryContext ecxt;
+
+			ecxt = MemoryContextSwitchTo(ccxt);
+			errdata = CopyErrorData();	/* NOTE(review): never released with FreeErrorData() */
+			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+			{
+				FlushErrorState();	/* skip is still true, so the loop reads on */
+			}
+			else
+			{
+				MemoryContextSwitchTo(ecxt);
+				PG_RE_THROW();
+			}
+		}
+		PG_END_TRY();
+	}
+	while (skip);
+
+	tsearch_readline_end(&trst);
+
+	return rootTrie;
+}
+
+/*
+ * findReplaceTo - look up the longest prefix of src that has a rule
+ *
+ * Returns the trie entry of the longest match and stores the number of
+ * source bytes it covers in *p_matchlen.  Returns NULL if nothing matches.
+ */
+static TrieChar *
+findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
+			  int *p_matchlen)
+{
+	TrieChar   *best = NULL;
+	int			pos;
+
+	*p_matchlen = 0;			/* prevent uninitialized-variable warnings */
+
+	for (pos = 0; node != NULL && pos < srclen; pos++)
+	{
+		TrieChar   *entry = &node[src[pos]];
+
+		/* remember the deepest entry seen so far that carries a rule */
+		if (entry->replaceTo != NULL)
+		{
+			best = entry;
+			*p_matchlen = pos + 1;
+		}
+		/* descend; a NULL link ends the search */
+		node = entry->nextChar;
+	}
+
+	return best;
+}
+
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	TrieChar   *rootTrie = NULL;
+	bool		fileloaded = false;
+	ListCell   *l;
+
+	/*
+	 * Dictionary init method: the only option accepted is "rules", which
+	 * must be given exactly once and names the rules file to load.
+	 */
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (strcmp(defel->defname, "rules") != 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Unaccent parameter: \"%s\"",
+							defel->defname)));
+
+		if (fileloaded)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("multiple Rules parameters")));
+
+		rootTrie = initTrie(defGetString(defel));
+		fileloaded = true;
+	}
+
+	if (!fileloaded)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Rules parameter")));
+
+	/* the trie root is what unaccent_lexize() receives as first argument */
+	PG_RETURN_POINTER(rootTrie);
+}
+
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+	TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
+	char	   *srcchar = (char *) PG_GETARG_POINTER(1);
+	int32		len = PG_GETARG_INT32(2);
+	char	   *srcstart = srcchar;
+	TSLexeme   *res = NULL;
+	StringInfoData buf;
+
+	/* buf stays unallocated (data == NULL) until a rule actually matches */
+	buf.data = NULL;
+
+	while (len > 0)
+	{
+		TrieChar   *node;
+		int			matchlen;
+
+		/* longest rule matching at the current position, if any */
+		node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
+							 &matchlen);
+		if (node && node->replaceTo)
+		{
+			if (buf.data == NULL)
+			{
+				/* initialize buffer */
+				initStringInfo(&buf);
+				/* insert any data we already skipped over */
+				if (srcchar != srcstart)
+					appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
+			}
+			appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
+		}
+		else
+		{
+			/* no rule: pass one character on, never reading past the input */
+			matchlen = Min(pg_mblen(srcchar), len);
+			if (buf.data != NULL)
+				appendBinaryStringInfo(&buf, srcchar, matchlen);
+		}
+
+		srcchar += matchlen;
+		len -= matchlen;
+	}
+
+	/* return a result only if we made at least one substitution */
+	if (buf.data != NULL)
+	{
+		res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
+		res->lexeme = buf.data;
+		res->flags = TSL_FILTER;
+	}
+
+	PG_RETURN_POINTER(res);
+}
+
+/*
+ * SQL-callable unaccent(): run a string through an unaccent dictionary
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+	text	   *str;
+	int			strArg;
+	Oid			dictOid;
+	TSDictionaryCacheEntry *dict;
+	TSLexeme   *res;
+
+	if (PG_NARGS() != 1)
+	{
+		/* the dictionary is given explicitly as the first argument */
+		dictOid = PG_GETARG_OID(0);
+		strArg = 1;
+	}
+	else
+	{
+		/*
+		 * One-argument form: use the dictionary called "unaccent" that
+		 * lives in the same schema as this function.
+		 */
+		Oid			procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
+		const char *dictname = "unaccent";
+
+		dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
+								  PointerGetDatum(dictname),
+								  ObjectIdGetDatum(procnspid));
+		if (!OidIsValid(dictOid))
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("text search dictionary \"%s.%s\" does not exist",
+							get_namespace_name(procnspid), dictname)));
+		strArg = 0;
+	}
+	str = PG_GETARG_TEXT_PP(strArg);
+
+	dict = lookup_ts_dictionary_cache(dictOid);
+
+	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+													 PointerGetDatum(dict->dictData),
+													 PointerGetDatum(VARDATA_ANY(str)),
+													 Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
+													 PointerGetDatum(NULL)));
+
+	PG_FREE_IF_COPY(str, strArg);
+
+	if (res != NULL && res->lexeme != NULL)
+	{
+		/* the dictionary changed something: hand back its output */
+		text	   *txt = cstring_to_text(res->lexeme);
+
+		pfree(res->lexeme);
+		pfree(res);
+
+		PG_RETURN_TEXT_P(txt);
+	}
+
+	/*
+	 * Nothing was replaced: return a copy of the input.
+	 */
+	if (res != NULL)
+		pfree(res);
+	PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+}
diff --git a/contrib/unaccent/unaccent.control b/contrib/unaccent/unaccent.control
new file mode 100644
index 0000000..649cf68
--- /dev/null
+++ b/contrib/unaccent/unaccent.control
@@ -0,0 +1,6 @@
+# unaccent extension
+comment = 'text search dictionary that removes accents'
+default_version = '1.1'
+module_pathname = '$libdir/unaccent'
+relocatable = true
+trusted = true
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
new file mode 100644
index 0000000..bf4c1bd
--- /dev/null
+++ b/contrib/unaccent/unaccent.rules
@@ -0,0 +1,1604 @@
+©	(C)
+«	<<
+	-
+®	(R)
+»	>>
+¼	 1/4
+½	 1/2
+¾	 3/4
+À	A
+Á	A
+Â	A
+Ã	A
+Ä	A
+Å	A
+Æ	AE
+Ç	C
+È	E
+É	E
+Ê	E
+Ë	E
+Ì	I
+Í	I
+Î	I
+Ï	I
+Ð	D
+Ñ	N
+Ò	O
+Ó	O
+Ô	O
+Õ	O
+Ö	O
+×	*
+Ø	O
+Ù	U
+Ú	U
+Û	U
+Ü	U
+Ý	Y
+Þ	TH
+ß	ss
+à	a
+á	a
+â	a
+ã	a
+ä	a
+å	a
+æ	ae
+ç	c
+è	e
+é	e
+ê	e
+ë	e
+ì	i
+í	i
+î	i
+ï	i
+ð	d
+ñ	n
+ò	o
+ó	o
+ô	o
+õ	o
+ö	o
+÷	/
+ø	o
+ù	u
+ú	u
+û	u
+ü	u
+ý	y
+þ	th
+ÿ	y
+Ā	A
+ā	a
+Ă	A
+ă	a
+Ą	A
+ą	a
+Ć	C
+ć	c
+Ĉ	C
+ĉ	c
+Ċ	C
+ċ	c
+Č	C
+č	c
+Ď	D
+ď	d
+Đ	D
+đ	d
+Ē	E
+ē	e
+Ĕ	E
+ĕ	e
+Ė	E
+ė	e
+Ę	E
+ę	e
+Ě	E
+ě	e
+Ĝ	G
+ĝ	g
+Ğ	G
+ğ	g
+Ġ	G
+ġ	g
+Ģ	G
+ģ	g
+Ĥ	H
+ĥ	h
+Ħ	H
+ħ	h
+Ĩ	I
+ĩ	i
+Ī	I
+ī	i
+Ĭ	I
+ĭ	i
+Į	I
+į	i
+İ	I
+ı	i
+Ĳ	IJ
+ĳ	ij
+Ĵ	J
+ĵ	j
+Ķ	K
+ķ	k
+ĸ	q
+Ĺ	L
+ĺ	l
+Ļ	L
+ļ	l
+Ľ	L
+ľ	l
+Ŀ	L
+ŀ	l
+Ł	L
+ł	l
+Ń	N
+ń	n
+Ņ	N
+ņ	n
+Ň	N
+ň	n
+ŉ	'n
+Ŋ	N
+ŋ	n
+Ō	O
+ō	o
+Ŏ	O
+ŏ	o
+Ő	O
+ő	o
+Œ	OE
+œ	oe
+Ŕ	R
+ŕ	r
+Ŗ	R
+ŗ	r
+Ř	R
+ř	r
+Ś	S
+ś	s
+Ŝ	S
+ŝ	s
+Ş	S
+ş	s
+Š	S
+š	s
+Ţ	T
+ţ	t
+Ť	T
+ť	t
+Ŧ	T
+ŧ	t
+Ũ	U
+ũ	u
+Ū	U
+ū	u
+Ŭ	U
+ŭ	u
+Ů	U
+ů	u
+Ű	U
+ű	u
+Ų	U
+ų	u
+Ŵ	W
+ŵ	w
+Ŷ	Y
+ŷ	y
+Ÿ	Y
+Ź	Z
+ź	z
+Ż	Z
+ż	z
+Ž	Z
+ž	z
+ſ	s
+ƀ	b
+Ɓ	B
+Ƃ	B
+ƃ	b
+Ƈ	C
+ƈ	c
+Ɖ	D
+Ɗ	D
+Ƌ	D
+ƌ	d
+Ɛ	E
+Ƒ	F
+ƒ	f
+Ɠ	G
+ƕ	hv
+Ɩ	I
+Ɨ	I
+Ƙ	K
+ƙ	k
+ƚ	l
+Ɲ	N
+ƞ	n
+Ơ	O
+ơ	o
+Ƣ	OI
+ƣ	oi
+Ƥ	P
+ƥ	p
+ƫ	t
+Ƭ	T
+ƭ	t
+Ʈ	T
+Ư	U
+ư	u
+Ʋ	V
+Ƴ	Y
+ƴ	y
+Ƶ	Z
+ƶ	z
+Ǆ	DZ
+ǅ	Dz
+ǆ	dz
+Ǉ	LJ
+ǈ	Lj
+ǉ	lj
+Ǌ	NJ
+ǋ	Nj
+ǌ	nj
+Ǎ	A
+ǎ	a
+Ǐ	I
+ǐ	i
+Ǒ	O
+ǒ	o
+Ǔ	U
+ǔ	u
+Ǖ	U
+ǖ	u
+Ǘ	U
+ǘ	u
+Ǚ	U
+ǚ	u
+Ǜ	U
+ǜ	u
+Ǟ	A
+ǟ	a
+Ǡ	A
+ǡ	a
+Ǥ	G
+ǥ	g
+Ǧ	G
+ǧ	g
+Ǩ	K
+ǩ	k
+Ǫ	O
+ǫ	o
+Ǭ	O
+ǭ	o
+ǰ	j
+Ǳ	DZ
+ǲ	Dz
+ǳ	dz
+Ǵ	G
+ǵ	g
+Ǹ	N
+ǹ	n
+Ǻ	A
+ǻ	a
+Ȁ	A
+ȁ	a
+Ȃ	A
+ȃ	a
+Ȅ	E
+ȅ	e
+Ȇ	E
+ȇ	e
+Ȉ	I
+ȉ	i
+Ȋ	I
+ȋ	i
+Ȍ	O
+ȍ	o
+Ȏ	O
+ȏ	o
+Ȑ	R
+ȑ	r
+Ȓ	R
+ȓ	r
+Ȕ	U
+ȕ	u
+Ȗ	U
+ȗ	u
+Ș	S
+ș	s
+Ț	T
+ț	t
+Ȟ	H
+ȟ	h
+ȡ	d
+Ȥ	Z
+ȥ	z
+Ȧ	A
+ȧ	a
+Ȩ	E
+ȩ	e
+Ȫ	O
+ȫ	o
+Ȭ	O
+ȭ	o
+Ȯ	O
+ȯ	o
+Ȱ	O
+ȱ	o
+Ȳ	Y
+ȳ	y
+ȴ	l
+ȵ	n
+ȶ	t
+ȷ	j
+ȸ	db
+ȹ	qp
+Ⱥ	A
+Ȼ	C
+ȼ	c
+Ƚ	L
+Ⱦ	T
+ȿ	s
+ɀ	z
+Ƀ	B
+Ʉ	U
+Ɇ	E
+ɇ	e
+Ɉ	J
+ɉ	j
+Ɍ	R
+ɍ	r
+Ɏ	Y
+ɏ	y
+ɓ	b
+ɕ	c
+ɖ	d
+ɗ	d
+ɛ	e
+ɟ	j
+ɠ	g
+ɡ	g
+ɢ	G
+ɦ	h
+ɧ	h
+ɨ	i
+ɪ	I
+ɫ	l
+ɬ	l
+ɭ	l
+ɱ	m
+ɲ	n
+ɳ	n
+ɴ	N
+ɶ	OE
+ɼ	r
+ɽ	r
+ɾ	r
+ʀ	R
+ʂ	s
+ʈ	t
+ʉ	u
+ʋ	v
+ʏ	Y
+ʐ	z
+ʑ	z
+ʙ	B
+ʛ	G
+ʜ	H
+ʝ	j
+ʟ	L
+ʠ	q
+ʣ	dz
+ʥ	dz
+ʦ	ts
+ʪ	ls
+ʫ	lz
+ʹ	'
+ʺ	"
+ʻ	'
+ʼ	'
+ʽ	'
+˂	<
+˃	>
+˄	^
+ˆ	^
+ˈ	'
+ˋ	`
+ː	:
+˖	+
+˗	-
+˜	~
+̀
+́
+̂
+̃
+̄
+̅
+̆
+̇
+̈
+̉
+̊
+̋
+̌
+̍
+̎
+̏
+̐
+̑
+̒
+̓
+̔
+̕
+̖
+̗
+̘
+̙
+̚
+̛
+̜
+̝
+̞
+̟
+̠
+̡
+̢
+̣
+̤
+̥
+̦
+̧
+̨
+̩
+̪
+̫
+̬
+̭
+̮
+̯
+̰
+̱
+̲
+̳
+̴
+̵
+̶
+̷
+̸
+̹
+̺
+̻
+̼
+̽
+̾
+̿
+̀
+́
+͂
+̓
+̈́
+ͅ
+͆
+͇
+͈
+͉
+͊
+͋
+͌
+͍
+͎
+͏
+͐
+͑
+͒
+͓
+͔
+͕
+͖
+͗
+͘
+͙
+͚
+͛
+͜
+͝
+͞
+͟
+͠
+͡
+͢
+Ά	Α
+Έ	Ε
+Ή	Η
+Ί	Ι
+Ό	Ο
+Ύ	Υ
+Ώ	Ω
+ΐ	ι
+Ϊ	Ι
+Ϋ	Υ
+ά	α
+έ	ε
+ή	η
+ί	ι
+ΰ	υ
+ϊ	ι
+ϋ	υ
+ό	ο
+ύ	υ
+ώ	ω
+Ё	Е
+ё	е
+ᴀ	A
+ᴁ	AE
+ᴃ	B
+ᴄ	C
+ᴅ	D
+ᴆ	D
+ᴇ	E
+ᴊ	J
+ᴋ	K
+ᴌ	L
+ᴍ	M
+ᴏ	O
+ᴘ	P
+ᴛ	T
+ᴜ	U
+ᴠ	V
+ᴡ	W
+ᴢ	Z
+ᵫ	ue
+ᵬ	b
+ᵭ	d
+ᵮ	f
+ᵯ	m
+ᵰ	n
+ᵱ	p
+ᵲ	r
+ᵳ	r
+ᵴ	s
+ᵵ	t
+ᵶ	z
+ᵺ	th
+ᵻ	I
+ᵽ	p
+ᵾ	U
+ᶀ	b
+ᶁ	d
+ᶂ	f
+ᶃ	g
+ᶄ	k
+ᶅ	l
+ᶆ	m
+ᶇ	n
+ᶈ	p
+ᶉ	r
+ᶊ	s
+ᶌ	v
+ᶍ	x
+ᶎ	z
+ᶏ	a
+ᶑ	d
+ᶒ	e
+ᶓ	e
+ᶖ	i
+ᶙ	u
+Ḁ	A
+ḁ	a
+Ḃ	B
+ḃ	b
+Ḅ	B
+ḅ	b
+Ḇ	B
+ḇ	b
+Ḉ	C
+ḉ	c
+Ḋ	D
+ḋ	d
+Ḍ	D
+ḍ	d
+Ḏ	D
+ḏ	d
+Ḑ	D
+ḑ	d
+Ḓ	D
+ḓ	d
+Ḕ	E
+ḕ	e
+Ḗ	E
+ḗ	e
+Ḙ	E
+ḙ	e
+Ḛ	E
+ḛ	e
+Ḝ	E
+ḝ	e
+Ḟ	F
+ḟ	f
+Ḡ	G
+ḡ	g
+Ḣ	H
+ḣ	h
+Ḥ	H
+ḥ	h
+Ḧ	H
+ḧ	h
+Ḩ	H
+ḩ	h
+Ḫ	H
+ḫ	h
+Ḭ	I
+ḭ	i
+Ḯ	I
+ḯ	i
+Ḱ	K
+ḱ	k
+Ḳ	K
+ḳ	k
+Ḵ	K
+ḵ	k
+Ḷ	L
+ḷ	l
+Ḹ	L
+ḹ	l
+Ḻ	L
+ḻ	l
+Ḽ	L
+ḽ	l
+Ḿ	M
+ḿ	m
+Ṁ	M
+ṁ	m
+Ṃ	M
+ṃ	m
+Ṅ	N
+ṅ	n
+Ṇ	N
+ṇ	n
+Ṉ	N
+ṉ	n
+Ṋ	N
+ṋ	n
+Ṍ	O
+ṍ	o
+Ṏ	O
+ṏ	o
+Ṑ	O
+ṑ	o
+Ṓ	O
+ṓ	o
+Ṕ	P
+ṕ	p
+Ṗ	P
+ṗ	p
+Ṙ	R
+ṙ	r
+Ṛ	R
+ṛ	r
+Ṝ	R
+ṝ	r
+Ṟ	R
+ṟ	r
+Ṡ	S
+ṡ	s
+Ṣ	S
+ṣ	s
+Ṥ	S
+ṥ	s
+Ṧ	S
+ṧ	s
+Ṩ	S
+ṩ	s
+Ṫ	T
+ṫ	t
+Ṭ	T
+ṭ	t
+Ṯ	T
+ṯ	t
+Ṱ	T
+ṱ	t
+Ṳ	U
+ṳ	u
+Ṵ	U
+ṵ	u
+Ṷ	U
+ṷ	u
+Ṹ	U
+ṹ	u
+Ṻ	U
+ṻ	u
+Ṽ	V
+ṽ	v
+Ṿ	V
+ṿ	v
+Ẁ	W
+ẁ	w
+Ẃ	W
+ẃ	w
+Ẅ	W
+ẅ	w
+Ẇ	W
+ẇ	w
+Ẉ	W
+ẉ	w
+Ẋ	X
+ẋ	x
+Ẍ	X
+ẍ	x
+Ẏ	Y
+ẏ	y
+Ẑ	Z
+ẑ	z
+Ẓ	Z
+ẓ	z
+Ẕ	Z
+ẕ	z
+ẖ	h
+ẗ	t
+ẘ	w
+ẙ	y
+ẚ	a
+ẜ	s
+ẝ	s
+ẞ	SS
+Ạ	A
+ạ	a
+Ả	A
+ả	a
+Ấ	A
+ấ	a
+Ầ	A
+ầ	a
+Ẩ	A
+ẩ	a
+Ẫ	A
+ẫ	a
+Ậ	A
+ậ	a
+Ắ	A
+ắ	a
+Ằ	A
+ằ	a
+Ẳ	A
+ẳ	a
+Ẵ	A
+ẵ	a
+Ặ	A
+ặ	a
+Ẹ	E
+ẹ	e
+Ẻ	E
+ẻ	e
+Ẽ	E
+ẽ	e
+Ế	E
+ế	e
+Ề	E
+ề	e
+Ể	E
+ể	e
+Ễ	E
+ễ	e
+Ệ	E
+ệ	e
+Ỉ	I
+ỉ	i
+Ị	I
+ị	i
+Ọ	O
+ọ	o
+Ỏ	O
+ỏ	o
+Ố	O
+ố	o
+Ồ	O
+ồ	o
+Ổ	O
+ổ	o
+Ỗ	O
+ỗ	o
+Ộ	O
+ộ	o
+Ớ	O
+ớ	o
+Ờ	O
+ờ	o
+Ở	O
+ở	o
+Ỡ	O
+ỡ	o
+Ợ	O
+ợ	o
+Ụ	U
+ụ	u
+Ủ	U
+ủ	u
+Ứ	U
+ứ	u
+Ừ	U
+ừ	u
+Ử	U
+ử	u
+Ữ	U
+ữ	u
+Ự	U
+ự	u
+Ỳ	Y
+ỳ	y
+Ỵ	Y
+ỵ	y
+Ỷ	Y
+ỷ	y
+Ỹ	Y
+ỹ	y
+Ỻ	LL
+ỻ	ll
+Ỽ	V
+ỽ	v
+Ỿ	Y
+ỿ	y
+ἀ	α
+ἁ	α
+ἂ	α
+ἃ	α
+ἄ	α
+ἅ	α
+ἆ	α
+ἇ	α
+Ἀ	Α
+Ἁ	Α
+Ἂ	Α
+Ἃ	Α
+Ἄ	Α
+Ἅ	Α
+Ἆ	Α
+Ἇ	Α
+ἐ	ε
+ἑ	ε
+ἒ	ε
+ἓ	ε
+ἔ	ε
+ἕ	ε
+Ἐ	Ε
+Ἑ	Ε
+Ἒ	Ε
+Ἓ	Ε
+Ἔ	Ε
+Ἕ	Ε
+ἠ	η
+ἡ	η
+ἢ	η
+ἣ	η
+ἤ	η
+ἥ	η
+ἦ	η
+ἧ	η
+Ἠ	Η
+Ἡ	Η
+Ἢ	Η
+Ἣ	Η
+Ἤ	Η
+Ἥ	Η
+Ἦ	Η
+Ἧ	Η
+ἰ	ι
+ἱ	ι
+ἲ	ι
+ἳ	ι
+ἴ	ι
+ἵ	ι
+ἶ	ι
+ἷ	ι
+Ἰ	Ι
+Ἱ	Ι
+Ἲ	Ι
+Ἳ	Ι
+Ἴ	Ι
+Ἵ	Ι
+Ἶ	Ι
+Ἷ	Ι
+ὀ	ο
+ὁ	ο
+ὂ	ο
+ὃ	ο
+ὄ	ο
+ὅ	ο
+Ὀ	Ο
+Ὁ	Ο
+Ὂ	Ο
+Ὃ	Ο
+Ὄ	Ο
+Ὅ	Ο
+ὐ	υ
+ὑ	υ
+ὒ	υ
+ὓ	υ
+ὔ	υ
+ὕ	υ
+ὖ	υ
+ὗ	υ
+Ὑ	Υ
+Ὓ	Υ
+Ὕ	Υ
+Ὗ	Υ
+ὠ	ω
+ὡ	ω
+ὢ	ω
+ὣ	ω
+ὤ	ω
+ὥ	ω
+ὦ	ω
+ὧ	ω
+Ὠ	Ω
+Ὡ	Ω
+Ὢ	Ω
+Ὣ	Ω
+Ὤ	Ω
+Ὥ	Ω
+Ὦ	Ω
+Ὧ	Ω
+ὰ	α
+ὲ	ε
+ὴ	η
+ὶ	ι
+ὸ	ο
+ὺ	υ
+ὼ	ω
+ᾀ	α
+ᾁ	α
+ᾂ	α
+ᾃ	α
+ᾄ	α
+ᾅ	α
+ᾆ	α
+ᾇ	α
+ᾈ	Α
+ᾉ	Α
+ᾊ	Α
+ᾋ	Α
+ᾌ	Α
+ᾍ	Α
+ᾎ	Α
+ᾏ	Α
+ᾐ	η
+ᾑ	η
+ᾒ	η
+ᾓ	η
+ᾔ	η
+ᾕ	η
+ᾖ	η
+ᾗ	η
+ᾘ	Η
+ᾙ	Η
+ᾚ	Η
+ᾛ	Η
+ᾜ	Η
+ᾝ	Η
+ᾞ	Η
+ᾟ	Η
+ᾠ	ω
+ᾡ	ω
+ᾢ	ω
+ᾣ	ω
+ᾤ	ω
+ᾥ	ω
+ᾦ	ω
+ᾧ	ω
+ᾨ	Ω
+ᾩ	Ω
+ᾪ	Ω
+ᾫ	Ω
+ᾬ	Ω
+ᾭ	Ω
+ᾮ	Ω
+ᾯ	Ω
+ᾰ	α
+ᾱ	α
+ᾲ	α
+ᾳ	α
+ᾴ	α
+ᾶ	α
+ᾷ	α
+Ᾰ	Α
+Ᾱ	Α
+Ὰ	Α
+ᾼ	Α
+ῂ	η
+ῃ	η
+ῄ	η
+ῆ	η
+ῇ	η
+Ὲ	Ε
+Ὴ	Η
+ῌ	Η
+ῐ	ι
+ῑ	ι
+ῒ	ι
+ῖ	ι
+ῗ	ι
+Ῐ	Ι
+Ῑ	Ι
+Ὶ	Ι
+ῠ	υ
+ῡ	υ
+ῢ	υ
+ῤ	ρ
+ῥ	ρ
+ῦ	υ
+ῧ	υ
+Ῠ	Υ
+Ῡ	Υ
+Ὺ	Υ
+Ῥ	Ρ
+ῲ	ω
+ῳ	ω
+ῴ	ω
+ῶ	ω
+ῷ	ω
+Ὸ	Ο
+Ὼ	Ω
+ῼ	Ω
+‐	-
+‑	-
+‒	-
+–	-
+—	-
+―	-
+‖	||
+‘	'
+’	'
+‚	,
+‛	'
+“	"
+”	"
+„	,,
+‟	"
+․	.
+‥	..
+…	...
+′	'
+″	"
+‹	<
+›	>
+‼	!!
+⁄	/
+⁅	[
+⁆	]
+⁇	??
+⁈	?!
+⁉	!?
+⁎	*
+₠	CE
+₢	Cr
+₣	Fr.
+₤	L.
+₧	Pts
+₹	Rs
+₺	TL
+⃝
+⃞
+⃟
+⃠
+⃢
+⃣
+⃤
+℀	a/c
+℁	a/s
+ℂ	C
+℃	°C
+℅	c/o
+℆	c/u
+℉	°F
+ℊ	g
+ℋ	H
+ℌ	x
+ℍ	H
+ℎ	h
+ℐ	I
+ℑ	I
+ℒ	L
+ℓ	l
+ℕ	N
+№	No
+℗	(P)
+℘	P
+ℙ	P
+ℚ	Q
+ℛ	R
+ℜ	R
+ℝ	R
+℞	Rx
+℡	TEL
+ℤ	Z
+ℨ	Z
+ℬ	B
+ℭ	C
+ℯ	e
+ℰ	E
+ℱ	F
+ℳ	M
+ℴ	o
+ℹ	i
+℻	FAX
+ⅅ	D
+ⅆ	d
+ⅇ	e
+ⅈ	i
+ⅉ	j
+⅓	 1/3
+⅔	 2/3
+⅕	 1/5
+⅖	 2/5
+⅗	 3/5
+⅘	 4/5
+⅙	 1/6
+⅚	 5/6
+⅛	 1/8
+⅜	 3/8
+⅝	 5/8
+⅞	 7/8
+⅟	 1/
+Ⅰ	I
+Ⅱ	II
+Ⅲ	III
+Ⅳ	IV
+Ⅴ	V
+Ⅵ	VI
+Ⅶ	VII
+Ⅷ	VIII
+Ⅸ	IX
+Ⅹ	X
+Ⅺ	XI
+Ⅻ	XII
+Ⅼ	L
+Ⅽ	C
+Ⅾ	D
+Ⅿ	M
+ⅰ	i
+ⅱ	ii
+ⅲ	iii
+ⅳ	iv
+ⅴ	v
+ⅵ	vi
+ⅶ	vii
+ⅷ	viii
+ⅸ	ix
+ⅹ	x
+ⅺ	xi
+ⅻ	xii
+ⅼ	l
+ⅽ	c
+ⅾ	d
+ⅿ	m
+−	-
+∕	/
+∖	\
+∣	|
+∥	||
+≪	<<
+≫	>>
+⑴	(1)
+⑵	(2)
+⑶	(3)
+⑷	(4)
+⑸	(5)
+⑹	(6)
+⑺	(7)
+⑻	(8)
+⑼	(9)
+⑽	(10)
+⑾	(11)
+⑿	(12)
+⒀	(13)
+⒁	(14)
+⒂	(15)
+⒃	(16)
+⒄	(17)
+⒅	(18)
+⒆	(19)
+⒇	(20)
+⒈	1.
+⒉	2.
+⒊	3.
+⒋	4.
+⒌	5.
+⒍	6.
+⒎	7.
+⒏	8.
+⒐	9.
+⒑	10.
+⒒	11.
+⒓	12.
+⒔	13.
+⒕	14.
+⒖	15.
+⒗	16.
+⒘	17.
+⒙	18.
+⒚	19.
+⒛	20.
+⒜	(a)
+⒝	(b)
+⒞	(c)
+⒟	(d)
+⒠	(e)
+⒡	(f)
+⒢	(g)
+⒣	(h)
+⒤	(i)
+⒥	(j)
+⒦	(k)
+⒧	(l)
+⒨	(m)
+⒩	(n)
+⒪	(o)
+⒫	(p)
+⒬	(q)
+⒭	(r)
+⒮	(s)
+⒯	(t)
+⒰	(u)
+⒱	(v)
+⒲	(w)
+⒳	(x)
+⒴	(y)
+⒵	(z)
+⦅	((
+⦆	))
+⩴	::=
+⩵	==
+⩶	===
+Ⱡ	L
+ⱡ	l
+Ɫ	L
+Ᵽ	P
+Ɽ	R
+ⱥ	a
+ⱦ	t
+Ⱨ	H
+ⱨ	h
+Ⱪ	K
+ⱪ	k
+Ⱬ	Z
+ⱬ	z
+Ɱ	M
+ⱱ	v
+Ⱳ	W
+ⱳ	w
+ⱴ	v
+ⱸ	e
+ⱺ	o
+Ȿ	S
+Ɀ	Z
+、	,
+。	.
+〇	0
+〈	<
+〉	>
+《	<<
+》	>>
+〔	[
+〕	]
+〘	[
+〙	]
+〚	[
+〛	]
+〝	"
+〞	"
+㍱	hPa
+㍲	da
+㍳	AU
+㍴	bar
+㍵	oV
+㍶	pc
+㍷	dm
+㍺	IU
+㎀	pA
+㎁	nA
+㎃	mA
+㎄	kA
+㎅	KB
+㎆	MB
+㎇	GB
+㎈	cal
+㎉	kcal
+㎊	pF
+㎋	nF
+㎎	mg
+㎏	kg
+㎐	Hz
+㎑	kHz
+㎒	MHz
+㎓	GHz
+㎔	THz
+㎙	fm
+㎚	nm
+㎜	mm
+㎝	cm
+㎞	km
+㎧	m/s
+㎩	Pa
+㎪	kPa
+㎫	MPa
+㎬	GPa
+㎭	rad
+㎮	rad/s
+㎰	ps
+㎱	ns
+㎳	ms
+㎴	pV
+㎵	nV
+㎷	mV
+㎸	kV
+㎹	MV
+㎺	pW
+㎻	nW
+㎽	mW
+㎾	kW
+㎿	MW
+㏂	a.m.
+㏃	Bq
+㏄	cc
+㏅	cd
+㏆	C/kg
+㏇	Co.
+㏈	dB
+㏉	Gy
+㏊	ha
+㏋	HP
+㏌	in
+㏍	KK
+㏎	KM
+㏏	kt
+㏐	lm
+㏑	ln
+㏒	log
+㏓	lx
+㏔	mb
+㏕	mil
+㏖	mol
+㏗	pH
+㏘	p.m.
+㏙	PPM
+㏚	PR
+㏛	sr
+㏜	Sv
+㏝	Wb
+㏞	V/m
+㏟	A/m
+ꜰ	F
+ꜱ	S
+Ꜳ	AA
+ꜳ	aa
+Ꜵ	AO
+ꜵ	ao
+Ꜷ	AU
+ꜷ	au
+Ꜹ	AV
+ꜹ	av
+Ꜻ	AV
+ꜻ	av
+Ꜽ	AY
+ꜽ	ay
+Ꝁ	K
+ꝁ	k
+Ꝃ	K
+ꝃ	k
+Ꝅ	K
+ꝅ	k
+Ꝇ	L
+ꝇ	l
+Ꝉ	L
+ꝉ	l
+Ꝋ	O
+ꝋ	o
+Ꝍ	O
+ꝍ	o
+Ꝏ	OO
+ꝏ	oo
+Ꝑ	P
+ꝑ	p
+Ꝓ	P
+ꝓ	p
+Ꝕ	P
+ꝕ	p
+Ꝗ	Q
+ꝗ	q
+Ꝙ	Q
+ꝙ	q
+Ꝟ	V
+ꝟ	v
+Ꝡ	VY
+ꝡ	vy
+Ꝥ	TH
+ꝥ	th
+Ꝧ	TH
+ꝧ	th
+ꝱ	d
+ꝲ	l
+ꝳ	m
+ꝴ	n
+ꝵ	r
+ꝶ	R
+ꝷ	t
+Ꝺ	D
+ꝺ	d
+Ꝼ	F
+ꝼ	f
+Ꞇ	T
+ꞇ	t
+Ꞑ	N
+ꞑ	n
+Ꞓ	C
+ꞓ	c
+Ꞡ	G
+ꞡ	g
+Ꞣ	K
+ꞣ	k
+Ꞥ	N
+ꞥ	n
+Ꞧ	R
+ꞧ	r
+Ꞩ	S
+ꞩ	s
+Ɦ	H
+ﬀ	ff
+ﬁ	fi
+ﬂ	fl
+ﬃ	ffi
+ﬄ	ffl
+ﬅ	st
+ﬆ	st
+︐	,
+︑	,
+︒	.
+︓	:
+︔	;
+︕	!
+︖	?
+︙	...
+︰	..
+︱	-
+︲	-
+︵	(
+︶	)
+︷	{
+︸	}
+︹	[
+︺	]
+︽	<<
+︾	>>
+︿	<
+﹀	>
+﹇	[
+﹈	]
+﹐	,
+﹑	,
+﹒	.
+﹔	;
+﹕	:
+﹖	?
+﹗	!
+﹘	-
+﹙	(
+﹚	)
+﹛	{
+﹜	}
+﹝	[
+﹞	]
+﹟	#
+﹠	&
+﹡	*
+﹢	+
+﹣	-
+﹤	<
+﹥	>
+﹦	=
+﹨	\
+﹩	$
+﹪	%
+﹫	@
+！	!
+＂	"
+＃	#
+＄	$
+％	%
+＆	&
+＇	'
+（	(
+）	)
+＊	*
+＋	+
+，	,
+－	-
+．	.
+／	/
+０	0
+１	1
+２	2
+３	3
+４	4
+５	5
+６	6
+７	7
+８	8
+９	9
+：	:
+；	;
+＜	<
+＝	=
+＞	>
+？	?
+＠	@
+Ａ	A
+Ｂ	B
+Ｃ	C
+Ｄ	D
+Ｅ	E
+Ｆ	F
+Ｇ	G
+Ｈ	H
+Ｉ	I
+Ｊ	J
+Ｋ	K
+Ｌ	L
+Ｍ	M
+Ｎ	N
+Ｏ	O
+Ｐ	P
+Ｑ	Q
+Ｒ	R
+Ｓ	S
+Ｔ	T
+Ｕ	U
+Ｖ	V
+Ｗ	W
+Ｘ	X
+Ｙ	Y
+Ｚ	Z
+［	[
+＼	\
+］	]
+＾	^
+＿	_
+｀	`
+ａ	a
+ｂ	b
+ｃ	c
+ｄ	d
+ｅ	e
+ｆ	f
+ｇ	g
+ｈ	h
+ｉ	i
+ｊ	j
+ｋ	k
+ｌ	l
+ｍ	m
+ｎ	n
+ｏ	o
+ｐ	p
+ｑ	q
+ｒ	r
+ｓ	s
+ｔ	t
+ｕ	u
+ｖ	v
+ｗ	w
+ｘ	x
+ｙ	y
+ｚ	z
+｛	{
+｜	|
+｝	}
+～	~
+｟	((
+｠	))
+｡	.
+､	,