diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /contrib/unaccent | |
parent | Initial commit. (diff) | |
download | postgresql-14-upstream.tar.xz postgresql-14-upstream.zip |
Adding upstream version 14.5. (refs: upstream/14.5, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/unaccent')
-rw-r--r-- | contrib/unaccent/.gitignore | 7 | ||||
-rw-r--r-- | contrib/unaccent/Makefile | 47 | ||||
-rw-r--r-- | contrib/unaccent/expected/unaccent.out | 99 | ||||
-rw-r--r-- | contrib/unaccent/generate_unaccent_rules.py | 291 | ||||
-rw-r--r-- | contrib/unaccent/sql/unaccent.sql | 24 | ||||
-rw-r--r-- | contrib/unaccent/unaccent--1.0--1.1.sql | 9 | ||||
-rw-r--r-- | contrib/unaccent/unaccent--1.1.sql | 34 | ||||
-rw-r--r-- | contrib/unaccent/unaccent.c | 434 | ||||
-rw-r--r-- | contrib/unaccent/unaccent.control | 6 | ||||
-rw-r--r-- | contrib/unaccent/unaccent.rules | 1613 |
10 files changed, 2564 insertions, 0 deletions
diff --git a/contrib/unaccent/.gitignore b/contrib/unaccent/.gitignore new file mode 100644 index 0000000..bccda73 --- /dev/null +++ b/contrib/unaccent/.gitignore @@ -0,0 +1,7 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ + +# Downloaded files +/Latin-ASCII.xml diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile new file mode 100644 index 0000000..b8307d1 --- /dev/null +++ b/contrib/unaccent/Makefile @@ -0,0 +1,47 @@ +# contrib/unaccent/Makefile + +MODULE_big = unaccent +OBJS = \ + $(WIN32RES) \ + unaccent.o + +EXTENSION = unaccent +DATA = unaccent--1.1.sql unaccent--1.0--1.1.sql +DATA_TSEARCH = unaccent.rules +PGFILEDESC = "unaccent - text search dictionary that removes accents" + +REGRESS = unaccent + +# We need a UTF8 database +ENCODING = UTF8 +NO_LOCALE = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/unaccent +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +update-unicode: unaccent.rules + +# Allow running this even without --with-python +PYTHON ?= python + +unaccent.rules: generate_unaccent_rules.py ../../src/common/unicode/UnicodeData.txt Latin-ASCII.xml + $(PYTHON) $< --unicode-data-file $(word 2,$^) --latin-ascii-file $(word 3,$^) >$@ + +# Only download it once; dependencies must match src/common/unicode/ +../../src/common/unicode/UnicodeData.txt: $(top_builddir)/src/Makefile.global + $(MAKE) -C $(@D) $(@F) + +# Dependency on Makefile.global is for CLDR_VERSION +Latin-ASCII.xml: $(top_builddir)/src/Makefile.global + $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/cldr/release-$(subst .,-,$(CLDR_VERSION))/common/transforms/Latin-ASCII.xml + +distclean: + rm -f Latin-ASCII.xml diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out new file mode 100644 index 0000000..c1bd7cd --- /dev/null +++ b/contrib/unaccent/expected/unaccent.out 
@@ -0,0 +1,99 @@ +CREATE EXTENSION unaccent; +-- must have a UTF8 database +SELECT getdatabaseencoding(); + getdatabaseencoding +--------------------- + UTF8 +(1 row) + +SET client_encoding TO 'UTF8'; +SELECT unaccent('foobar'); + unaccent +---------- + foobar +(1 row) + +SELECT unaccent('ёлка'); + unaccent +---------- + елка +(1 row) + +SELECT unaccent('ЁЖИК'); + unaccent +---------- + ЕЖИК +(1 row) + +SELECT unaccent('˃˖˗˜'); + unaccent +---------- + >+-~ +(1 row) + +SELECT unaccent('À'); -- Remove combining diacritical 0x0300 + unaccent +---------- + A +(1 row) + +SELECT unaccent('unaccent', 'foobar'); + unaccent +---------- + foobar +(1 row) + +SELECT unaccent('unaccent', 'ёлка'); + unaccent +---------- + елка +(1 row) + +SELECT unaccent('unaccent', 'ЁЖИК'); + unaccent +---------- + ЕЖИК +(1 row) + +SELECT unaccent('unaccent', '˃˖˗˜'); + unaccent +---------- + >+-~ +(1 row) + +SELECT unaccent('unaccent', 'À'); + unaccent +---------- + A +(1 row) + +SELECT ts_lexize('unaccent', 'foobar'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('unaccent', 'ёлка'); + ts_lexize +----------- + {елка} +(1 row) + +SELECT ts_lexize('unaccent', 'ЁЖИК'); + ts_lexize +----------- + {ЕЖИК} +(1 row) + +SELECT ts_lexize('unaccent', '˃˖˗˜'); + ts_lexize +----------- + {>+-~} +(1 row) + +SELECT ts_lexize('unaccent', 'À'); + ts_lexize +----------- + {A} +(1 row) + diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py new file mode 100644 index 0000000..a952de5 --- /dev/null +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -0,0 +1,291 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# This script builds unaccent.rules on standard output when given the +# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as +# arguments. Optionally includes ligature expansion and Unicode CLDR +# Latin-ASCII transliterator, enabled by default, this can be disabled +# with "--no-ligatures-expansion" command line option. 
+# +# The approach is to use the Unicode decomposition data to identify +# precomposed codepoints that are equivalent to a ligature of several +# letters, or a base letter with any number of diacritical marks. +# +# This approach handles most letters with diacritical marks and some +# ligatures. However, several characters (notably a majority of +# ligatures) don't have decomposition. To handle all these cases, one can +# use a standard Unicode transliterator available in Common Locale Data +# Repository (CLDR): Latin-ASCII. This transliterator associates Unicode +# characters to ASCII-range equivalent. Unless "--no-ligatures-expansion" +# option is enabled, the XML file of this transliterator [2] -- given as a +# command line argument -- will be parsed and used. +# +# Ideally you should use the latest release for each data set. This +# script is compatible with at least CLDR release 29. +# +# [1] https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/UnicodeData.txt +# [2] https://raw.githubusercontent.com/unicode-org/cldr/${TAG}/common/transforms/Latin-ASCII.xml + +# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped +# The approach is to be Python3 compatible with Python2 "backports". 
+from __future__ import print_function +from __future__ import unicode_literals +# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped + +import argparse +import codecs +import re +import sys +import xml.etree.ElementTree as ET + +# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped +if sys.version_info[0] <= 2: + # Encode stdout as UTF-8, so we can just print to it + sys.stdout = codecs.getwriter('utf8')(sys.stdout) + + # Map Python 2's chr to unichr + chr = unichr + + # Python 2 and 3 compatible bytes call + def bytes(source, encoding='ascii', errors='strict'): + return source.encode(encoding=encoding, errors=errors) +else: +# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped + sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer) + +# The ranges of Unicode characters that we consider to be "plain letters". +# For now we are being conservative by including only Latin and Greek. This +# could be extended in future based on feedback from people with relevant +# language knowledge. +PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case + (ord('A'), ord('Z')), # Latin upper case + (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA + (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA + +# Combining marks follow a "base" character, and result in a composite +# character. Example: "U&'A\0300'"produces "À".There are three types of +# combining marks: enclosing (Me), non-spacing combining (Mn), spacing +# combining (Mc). We identify the ranges of marks we feel safe removing. 
+# References: +# https://en.wikipedia.org/wiki/Combining_character +# https://www.unicode.org/charts/PDF/U0300.pdf +# https://www.unicode.org/charts/PDF/U20D0.pdf +COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA + (0x20dd, 0x20E0), # Me: Symbols + (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle + +def print_record(codepoint, letter): + if letter: + output = chr(codepoint) + "\t" + letter + else: + output = chr(codepoint) + + print(output) + +class Codepoint: + def __init__(self, id, general_category, combining_ids): + self.id = id + self.general_category = general_category + self.combining_ids = combining_ids + +def is_mark_to_remove(codepoint): + """Return true if this is a combining mark to remove.""" + if not is_mark(codepoint): + return False + + for begin, end in COMBINING_MARK_RANGES: + if codepoint.id >= begin and codepoint.id <= end: + return True + return False + +def is_plain_letter(codepoint): + """Return true if codepoint represents a "plain letter".""" + for begin, end in PLAIN_LETTER_RANGES: + if codepoint.id >= begin and codepoint.id <= end: + return True + return False + +def is_mark(codepoint): + """Returns true for diacritical marks (combining codepoints).""" + return codepoint.general_category in ("Mn", "Me", "Mc") + +def is_letter_with_marks(codepoint, table): + """Returns true for letters combined with one or more marks.""" + # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values + + # Letter may have no combining characters, in which case it has + # no marks. + if len(codepoint.combining_ids) == 1: + return False + + # A letter without diacritical marks has none of them. + if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False: + return False + + # Check if the base letter of this letter has marks. 
+ codepoint_base = codepoint.combining_ids[0] + if (is_plain_letter(table[codepoint_base]) is False and \ + is_letter_with_marks(table[codepoint_base], table) is False): + return False + + return True + +def is_letter(codepoint, table): + """Return true for letter with or without diacritical marks.""" + return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table) + +def get_plain_letter(codepoint, table): + """Return the base codepoint without marks. If this codepoint has more + than one combining character, do a recursive lookup on the table to + find out its plain base letter.""" + if is_letter_with_marks(codepoint, table): + if len(table[codepoint.combining_ids[0]].combining_ids) > 1: + return get_plain_letter(table[codepoint.combining_ids[0]], table) + elif is_plain_letter(table[codepoint.combining_ids[0]]): + return table[codepoint.combining_ids[0]] + + # Should not come here + assert(False) + elif is_plain_letter(codepoint): + return codepoint + + # Should not come here + assert(False) + +def is_ligature(codepoint, table): + """Return true for letters combined with letters.""" + return all(is_letter(table[i], table) for i in codepoint.combining_ids) + +def get_plain_letters(codepoint, table): + """Return a list of plain letters from a ligature.""" + assert(is_ligature(codepoint, table)) + return [get_plain_letter(table[id], table) for id in codepoint.combining_ids] + +def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): + """Parse the XML file and return a set of tuples (src, trg), where "src" + is the original character and "trg" the substitute.""" + charactersSet = set() + + # RegEx to parse rules + rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;') + + # construct tree from XML + transliterationTree = ET.parse(latinAsciiFilePath) + transliterationTreeRoot = transliterationTree.getroot() + + # Fetch all the transliteration rules. 
Since release 29 of Latin-ASCII.xml + # all the transliteration rules are located in a single tRule block with + # all rules separated into separate lines. + blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule") + assert(len(blockRules) == 1) + + # Split the block of rules into one element per line. + rules = blockRules[0].text.splitlines() + + # And finish the processing of each individual rule. + for rule in rules: + matches = rulePattern.search(rule) + + # The regular expression capture four groups corresponding + # to the characters. + # + # Group 1: plain "src" char. Empty if group 2 is not. + # Group 2: unicode-escaped "src" char (e.g. "\u0110"). Empty if group 1 is not. + # + # Group 3: plain "trg" char. Empty if group 4 is not. + # Group 4: plain "trg" char between quotes. Empty if group 3 is not. + if matches is not None: + src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape') + trg = matches.group(3) if matches.group(3) is not None else matches.group(4) + + # "'" and """ are escaped + trg = trg.replace("\\'", "'").replace('\\"', '"') + + # the parser of unaccent only accepts non-whitespace characters + # for "src" and "trg" (see unaccent.c) + if not src.isspace() and not trg.isspace(): + charactersSet.add((ord(src), trg)) + + return charactersSet + +def special_cases(): + """Returns the special cases which are not handled by other methods""" + charactersSet = set() + + # Cyrillic + charactersSet.add((0x0401, u"\u0415")) # CYRILLIC CAPITAL LETTER IO + charactersSet.add((0x0451, u"\u0435")) # CYRILLIC SMALL LETTER IO + + # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F) + charactersSet.add((0x2103, u"\xb0C")) # DEGREE CELSIUS + charactersSet.add((0x2109, u"\xb0F")) # DEGREE FAHRENHEIT + charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT + + return charactersSet + +def main(args): + # 
https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings + decomposition_type_pattern = re.compile(" *<[^>]*> *") + + table = {} + all = [] + + # unordered set for ensure uniqueness + charactersSet = set() + + # read file UnicodeData.txt + with codecs.open( + args.unicodeDataFilePath, mode='r', encoding='UTF-8', + ) as unicodeDataFile: + # read everything we need into memory + for line in unicodeDataFile: + fields = line.split(";") + if len(fields) > 5: + # https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt + general_category = fields[2] + decomposition = fields[5] + decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) + id = int(fields[0], 16) + combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] + codepoint = Codepoint(id, general_category, combining_ids) + table[id] = codepoint + all.append(codepoint) + + # walk through all the codepoints looking for interesting mappings + for codepoint in all: + if codepoint.general_category.startswith('L') and \ + len(codepoint.combining_ids) > 1: + if is_letter_with_marks(codepoint, table): + charactersSet.add((codepoint.id, + chr(get_plain_letter(codepoint, table).id))) + elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): + charactersSet.add((codepoint.id, + "".join(chr(combining_codepoint.id) + for combining_codepoint \ + in get_plain_letters(codepoint, table)))) + elif is_mark_to_remove(codepoint): + charactersSet.add((codepoint.id, None)) + + # add CLDR Latin-ASCII characters + if not args.noLigaturesExpansion: + charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath) + charactersSet |= special_cases() + + # sort for more convenient display + charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0]) + + for characterPair in charactersList: + print_record(characterPair[0], characterPair[1]) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='This 
script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.') + parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath') + parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath') + parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion') + args = parser.parse_args() + + if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None: + sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. 
Use \"-h\" option for help.') + sys.exit(1) + + main(args) diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql new file mode 100644 index 0000000..2ae097f --- /dev/null +++ b/contrib/unaccent/sql/unaccent.sql @@ -0,0 +1,24 @@ +CREATE EXTENSION unaccent; + +-- must have a UTF8 database +SELECT getdatabaseencoding(); + +SET client_encoding TO 'UTF8'; + +SELECT unaccent('foobar'); +SELECT unaccent('ёлка'); +SELECT unaccent('ЁЖИК'); +SELECT unaccent('˃˖˗˜'); +SELECT unaccent('À'); -- Remove combining diacritical 0x0300 + +SELECT unaccent('unaccent', 'foobar'); +SELECT unaccent('unaccent', 'ёлка'); +SELECT unaccent('unaccent', 'ЁЖИК'); +SELECT unaccent('unaccent', '˃˖˗˜'); +SELECT unaccent('unaccent', 'À'); + +SELECT ts_lexize('unaccent', 'foobar'); +SELECT ts_lexize('unaccent', 'ёлка'); +SELECT ts_lexize('unaccent', 'ЁЖИК'); +SELECT ts_lexize('unaccent', '˃˖˗˜'); +SELECT ts_lexize('unaccent', 'À'); diff --git a/contrib/unaccent/unaccent--1.0--1.1.sql b/contrib/unaccent/unaccent--1.0--1.1.sql new file mode 100644 index 0000000..8efa0d0 --- /dev/null +++ b/contrib/unaccent/unaccent--1.0--1.1.sql @@ -0,0 +1,9 @@ +/* contrib/unaccent/unaccent--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION unaccent UPDATE TO '1.1'" to load this file. \quit + +ALTER FUNCTION unaccent(regdictionary, text) PARALLEL SAFE; +ALTER FUNCTION unaccent(text) PARALLEL SAFE; +ALTER FUNCTION unaccent_init(internal) PARALLEL SAFE; +ALTER FUNCTION unaccent_lexize(internal, internal, internal, internal) PARALLEL SAFE; diff --git a/contrib/unaccent/unaccent--1.1.sql b/contrib/unaccent/unaccent--1.1.sql new file mode 100644 index 0000000..ecc8651 --- /dev/null +++ b/contrib/unaccent/unaccent--1.1.sql @@ -0,0 +1,34 @@ +/* contrib/unaccent/unaccent--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION unaccent" to load this file. 
\quit + +CREATE FUNCTION unaccent(regdictionary, text) + RETURNS text + AS 'MODULE_PATHNAME', 'unaccent_dict' + LANGUAGE C STABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION unaccent(text) + RETURNS text + AS 'MODULE_PATHNAME', 'unaccent_dict' + LANGUAGE C STABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION unaccent_init(internal) + RETURNS internal + AS 'MODULE_PATHNAME', 'unaccent_init' + LANGUAGE C PARALLEL SAFE; + +CREATE FUNCTION unaccent_lexize(internal,internal,internal,internal) + RETURNS internal + AS 'MODULE_PATHNAME', 'unaccent_lexize' + LANGUAGE C PARALLEL SAFE; + +CREATE TEXT SEARCH TEMPLATE unaccent ( + INIT = unaccent_init, + LEXIZE = unaccent_lexize +); + +CREATE TEXT SEARCH DICTIONARY unaccent ( + TEMPLATE = unaccent, + RULES = 'unaccent' +); diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c new file mode 100644 index 0000000..2b3819f --- /dev/null +++ b/contrib/unaccent/unaccent.c @@ -0,0 +1,434 @@ +/*------------------------------------------------------------------------- + * + * unaccent.c + * Text search unaccent dictionary + * + * Copyright (c) 2009-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/unaccent/unaccent.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/namespace.h" +#include "catalog/pg_ts_dict.h" +#include "commands/defrem.h" +#include "lib/stringinfo.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_public.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + +PG_MODULE_MAGIC; + +/* + * An unaccent dictionary uses a trie to find a string to replace. Each node + * of the trie is an array of 256 TrieChar structs; the N-th element of the + * array corresponds to next byte value N. 
That element can contain both a + * replacement string (to be used if the source string ends with this byte) + * and a link to another trie node (to be followed if there are more bytes). + * + * Note that the trie search logic pays no attention to multibyte character + * boundaries. This is OK as long as both the data entered into the trie and + * the data we're trying to look up are validly encoded; no partial-character + * matches will occur. + */ +typedef struct TrieChar +{ + struct TrieChar *nextChar; + char *replaceTo; + int replacelen; +} TrieChar; + +/* + * placeChar - put str into trie's structure, byte by byte. + * + * If node is NULL, we need to make a new node, which will be returned; + * otherwise the return value is the same as node. + */ +static TrieChar * +placeChar(TrieChar *node, const unsigned char *str, int lenstr, + const char *replaceTo, int replacelen) +{ + TrieChar *curnode; + + if (!node) + node = (TrieChar *) palloc0(sizeof(TrieChar) * 256); + + Assert(lenstr > 0); /* else str[0] doesn't exist */ + + curnode = node + *str; + + if (lenstr <= 1) + { + if (curnode->replaceTo) + ereport(WARNING, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("duplicate source strings, first one will be used"))); + else + { + curnode->replacelen = replacelen; + curnode->replaceTo = (char *) palloc(replacelen); + memcpy(curnode->replaceTo, replaceTo, replacelen); + } + } + else + { + curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, + replaceTo, replacelen); + } + + return node; +} + +/* + * initTrie - create trie from file. + * + * Function converts UTF8-encoded file into current encoding. 
+ */ +static TrieChar * +initTrie(const char *filename) +{ + TrieChar *volatile rootTrie = NULL; + MemoryContext ccxt = CurrentMemoryContext; + tsearch_readline_state trst; + volatile bool skip; + + filename = get_tsearch_config_filename(filename, "rules"); + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open unaccent file \"%s\": %m", + filename))); + + do + { + /* + * pg_do_encoding_conversion() (called by tsearch_readline()) will + * emit exception if it finds untranslatable characters in current + * locale. We just skip such lines, continuing with the next. + */ + skip = true; + + PG_TRY(); + { + char *line; + + while ((line = tsearch_readline(&trst)) != NULL) + { + /*---------- + * The format of each line must be "src" or "src trg", where + * src and trg are sequences of one or more non-whitespace + * characters, separated by whitespace. Whitespace at start + * or end of line is ignored. If trg is omitted, an empty + * string is used as the replacement. 
+ * + * We use a simple state machine, with states + * 0 initial (before src) + * 1 in src + * 2 in whitespace after src + * 3 in trg + * 4 in whitespace after trg + * -1 syntax error detected + *---------- + */ + int state; + char *ptr; + char *src = NULL; + char *trg = NULL; + int ptrlen; + int srclen = 0; + int trglen = 0; + + state = 0; + for (ptr = line; *ptr; ptr += ptrlen) + { + ptrlen = pg_mblen(ptr); + /* ignore whitespace, but end src or trg */ + if (t_isspace(ptr)) + { + if (state == 1) + state = 2; + else if (state == 3) + state = 4; + continue; + } + switch (state) + { + case 0: + /* start of src */ + src = ptr; + srclen = ptrlen; + state = 1; + break; + case 1: + /* continue src */ + srclen += ptrlen; + break; + case 2: + /* start of trg */ + trg = ptr; + trglen = ptrlen; + state = 3; + break; + case 3: + /* continue trg */ + trglen += ptrlen; + break; + default: + /* bogus line format */ + state = -1; + break; + } + } + + if (state == 1 || state == 2) + { + /* trg was omitted, so use "" */ + trg = ""; + trglen = 0; + } + + if (state > 0) + rootTrie = placeChar(rootTrie, + (unsigned char *) src, srclen, + trg, trglen); + else if (state < 0) + ereport(WARNING, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid syntax: more than two strings in unaccent rule"))); + + pfree(line); + } + skip = false; + } + PG_CATCH(); + { + ErrorData *errdata; + MemoryContext ecxt; + + ecxt = MemoryContextSwitchTo(ccxt); + errdata = CopyErrorData(); + if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) + { + FlushErrorState(); + } + else + { + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); + } + while (skip); + + tsearch_readline_end(&trst); + + return rootTrie; +} + +/* + * findReplaceTo - find longest possible match in trie + * + * On success, returns pointer to ending subnode, plus length of matched + * source string in *p_matchlen. On failure, returns NULL. 
+ */ +static TrieChar * +findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, + int *p_matchlen) +{ + TrieChar *result = NULL; + int matchlen = 0; + + *p_matchlen = 0; /* prevent uninitialized-variable warnings */ + + while (node && matchlen < srclen) + { + node = node + src[matchlen]; + matchlen++; + + if (node->replaceTo) + { + result = node; + *p_matchlen = matchlen; + } + + node = node->nextChar; + } + + return result; +} + +PG_FUNCTION_INFO_V1(unaccent_init); +Datum +unaccent_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = (List *) PG_GETARG_POINTER(0); + TrieChar *rootTrie = NULL; + bool fileloaded = false; + ListCell *l; + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (strcmp(defel->defname, "rules") == 0) + { + if (fileloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple Rules parameters"))); + rootTrie = initTrie(defGetString(defel)); + fileloaded = true; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized Unaccent parameter: \"%s\"", + defel->defname))); + } + } + + if (!fileloaded) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing Rules parameter"))); + } + + PG_RETURN_POINTER(rootTrie); +} + +PG_FUNCTION_INFO_V1(unaccent_lexize); +Datum +unaccent_lexize(PG_FUNCTION_ARGS) +{ + TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0); + char *srcchar = (char *) PG_GETARG_POINTER(1); + int32 len = PG_GETARG_INT32(2); + char *srcstart = srcchar; + TSLexeme *res; + StringInfoData buf; + + /* we allocate storage for the buffer only if needed */ + buf.data = NULL; + + while (len > 0) + { + TrieChar *node; + int matchlen; + + node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len, + &matchlen); + if (node && node->replaceTo) + { + if (buf.data == NULL) + { + /* initialize buffer */ + initStringInfo(&buf); + /* insert any data we already skipped over */ + if (srcchar != srcstart) + 
appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart); + } + appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen); + } + else + { + matchlen = pg_mblen(srcchar); + if (buf.data != NULL) + appendBinaryStringInfo(&buf, srcchar, matchlen); + } + + srcchar += matchlen; + len -= matchlen; + } + + /* return a result only if we made at least one substitution */ + if (buf.data != NULL) + { + res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2); + res->lexeme = buf.data; + res->flags = TSL_FILTER; + } + else + res = NULL; + + PG_RETURN_POINTER(res); +} + +/* + * Function-like wrapper for dictionary + */ +PG_FUNCTION_INFO_V1(unaccent_dict); +Datum +unaccent_dict(PG_FUNCTION_ARGS) +{ + text *str; + int strArg; + Oid dictOid; + TSDictionaryCacheEntry *dict; + TSLexeme *res; + + if (PG_NARGS() == 1) + { + /* + * Use the "unaccent" dictionary that is in the same schema that this + * function is in. + */ + Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid); + const char *dictname = "unaccent"; + + dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid, + PointerGetDatum(dictname), + ObjectIdGetDatum(procnspid)); + if (!OidIsValid(dictOid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("text search dictionary \"%s.%s\" does not exist", + get_namespace_name(procnspid), dictname))); + strArg = 0; + } + else + { + dictOid = PG_GETARG_OID(0); + strArg = 1; + } + str = PG_GETARG_TEXT_PP(strArg); + + dict = lookup_ts_dictionary_cache(dictOid); + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), + PointerGetDatum(dict->dictData), + PointerGetDatum(VARDATA_ANY(str)), + Int32GetDatum(VARSIZE_ANY_EXHDR(str)), + PointerGetDatum(NULL))); + + PG_FREE_IF_COPY(str, strArg); + + if (res == NULL) + { + PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg)); + } + else if (res->lexeme == NULL) + { + pfree(res); + PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg)); + } + else + { + text *txt = cstring_to_text(res->lexeme); + + pfree(res->lexeme); + 
pfree(res); + + PG_RETURN_TEXT_P(txt); + } +} diff --git a/contrib/unaccent/unaccent.control b/contrib/unaccent/unaccent.control new file mode 100644 index 0000000..649cf68 --- /dev/null +++ b/contrib/unaccent/unaccent.control @@ -0,0 +1,6 @@ +# unaccent extension +comment = 'text search dictionary that removes accents' +default_version = '1.1' +module_pathname = '$libdir/unaccent' +relocatable = true +trusted = true diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules new file mode 100644 index 0000000..1b5eb1b --- /dev/null +++ b/contrib/unaccent/unaccent.rules @@ -0,0 +1,1613 @@ +¡ ! +© (C) +« << + - +® (R) +± +/- +» >> +¼ 1/4 +½ 1/2 +¾ 3/4 +¿ ? +À A +Á A + A +à A +Ä A +Å A +Æ AE +Ç C +È E +É E +Ê E +Ë E +Ì I +Í I +Î I +Ï I +Ð D +Ñ N +Ò O +Ó O +Ô O +Õ O +Ö O +× * +Ø O +Ù U +Ú U +Û U +Ü U +Ý Y +Þ TH +ß ss +à a +á a +â a +ã a +ä a +å a +æ ae +ç c +è e +é e +ê e +ë e +ì i +í i +î i +ï i +ð d +ñ n +ò o +ó o +ô o +õ o +ö o +÷ / +ø o +ù u +ú u +û u +ü u +ý y +þ th +ÿ y +Ā A +ā a +Ă A +ă a +Ą A +ą a +Ć C +ć c +Ĉ C +ĉ c +Ċ C +ċ c +Č C +č c +Ď D +ď d +Đ D +đ d +Ē E +ē e +Ĕ E +ĕ e +Ė E +ė e +Ę E +ę e +Ě E +ě e +Ĝ G +ĝ g +Ğ G +ğ g +Ġ G +ġ g +Ģ G +ģ g +Ĥ H +ĥ h +Ħ H +ħ h +Ĩ I +ĩ i +Ī I +ī i +Ĭ I +ĭ i +Į I +į i +İ I +ı i +IJ IJ +ij ij +Ĵ J +ĵ j +Ķ K +ķ k +ĸ q +Ĺ L +ĺ l +Ļ L +ļ l +Ľ L +ľ l +Ŀ L +ŀ l +Ł L +ł l +Ń N +ń n +Ņ N +ņ n +Ň N +ň n +ʼn 'n +Ŋ N +ŋ n +Ō O +ō o +Ŏ O +ŏ o +Ő O +ő o +Œ OE +œ oe +Ŕ R +ŕ r +Ŗ R +ŗ r +Ř R +ř r +Ś S +ś s +Ŝ S +ŝ s +Ş S +ş s +Š S +š s +Ţ T +ţ t +Ť T +ť t +Ŧ T +ŧ t +Ũ U +ũ u +Ū U +ū u +Ŭ U +ŭ u +Ů U +ů u +Ű U +ű u +Ų U +ų u +Ŵ W +ŵ w +Ŷ Y +ŷ y +Ÿ Y +Ź Z +ź z +Ż Z +ż z +Ž Z +ž z +ſ s +ƀ b +Ɓ B +Ƃ B +ƃ b +Ƈ C +ƈ c +Ɖ D +Ɗ D +Ƌ D +ƌ d +Ɛ E +Ƒ F +ƒ f +Ɠ G +ƕ hv +Ɩ I +Ɨ I +Ƙ K +ƙ k +ƚ l +Ɲ N +ƞ n +Ơ O +ơ o +Ƣ OI +ƣ oi +Ƥ P +ƥ p +ƫ t +Ƭ T +ƭ t +Ʈ T +Ư U +ư u +Ʋ V +Ƴ Y +ƴ y +Ƶ Z +ƶ z +DŽ DZ +Dž Dz +dž dz +LJ LJ +Lj Lj +lj lj +NJ NJ +Nj Nj +nj nj +Ǎ A +ǎ a +Ǐ I +ǐ i +Ǒ O +ǒ o +Ǔ U +ǔ u +Ǖ U +ǖ u +Ǘ U +ǘ u +Ǚ U +ǚ u +Ǜ U +ǜ u +Ǟ 
A +ǟ a +Ǡ A +ǡ a +Ǥ G +ǥ g +Ǧ G +ǧ g +Ǩ K +ǩ k +Ǫ O +ǫ o +Ǭ O +ǭ o +ǰ j +DZ DZ +Dz Dz +dz dz +Ǵ G +ǵ g +Ǹ N +ǹ n +Ǻ A +ǻ a +Ȁ A +ȁ a +Ȃ A +ȃ a +Ȅ E +ȅ e +Ȇ E +ȇ e +Ȉ I +ȉ i +Ȋ I +ȋ i +Ȍ O +ȍ o +Ȏ O +ȏ o +Ȑ R +ȑ r +Ȓ R +ȓ r +Ȕ U +ȕ u +Ȗ U +ȗ u +Ș S +ș s +Ț T +ț t +Ȟ H +ȟ h +ȡ d +Ȥ Z +ȥ z +Ȧ A +ȧ a +Ȩ E +ȩ e +Ȫ O +ȫ o +Ȭ O +ȭ o +Ȯ O +ȯ o +Ȱ O +ȱ o +Ȳ Y +ȳ y +ȴ l +ȵ n +ȶ t +ȷ j +ȸ db +ȹ qp +Ⱥ A +Ȼ C +ȼ c +Ƚ L +Ⱦ T +ȿ s +ɀ z +Ƀ B +Ʉ U +Ɇ E +ɇ e +Ɉ J +ɉ j +Ɍ R +ɍ r +Ɏ Y +ɏ y +ɓ b +ɕ c +ɖ d +ɗ d +ɛ e +ɟ j +ɠ g +ɡ g +ɢ G +ɦ h +ɧ h +ɨ i +ɪ I +ɫ l +ɬ l +ɭ l +ɱ m +ɲ n +ɳ n +ɴ N +ɶ OE +ɼ r +ɽ r +ɾ r +ʀ R +ʂ s +ʈ t +ʉ u +ʋ v +ʏ Y +ʐ z +ʑ z +ʙ B +ʛ G +ʜ H +ʝ j +ʟ L +ʠ q +ʣ dz +ʥ dz +ʦ ts +ʪ ls +ʫ lz +ʹ ' +ʺ " +ʻ ' +ʼ ' +ʽ ' +˂ < +˃ > +˄ ^ +ˆ ^ +ˈ ' +ˋ ` +ː : +˖ + +˗ - +˜ ~ +̀ +́ +̂ +̃ +̄ +̅ +̆ +̇ +̈ +̉ +̊ +̋ +̌ +̍ +̎ +̏ +̐ +̑ +̒ +̓ +̔ +̕ +̖ +̗ +̘ +̙ +̚ +̛ +̜ +̝ +̞ +̟ +̠ +̡ +̢ +̣ +̤ +̥ +̦ +̧ +̨ +̩ +̪ +̫ +̬ +̭ +̮ +̯ +̰ +̱ +̲ +̳ +̴ +̵ +̶ +̷ +̸ +̹ +̺ +̻ +̼ +̽ +̾ +̿ +̀ +́ +͂ +̓ +̈́ +ͅ +͆ +͇ +͈ +͉ +͊ +͋ +͌ +͍ +͎ +͏ +͐ +͑ +͒ +͓ +͔ +͕ +͖ +͗ +͘ +͙ +͚ +͛ +͜ +͝ +͞ +͟ +͠ +͡ +͢ +Ά Α +Έ Ε +Ή Η +Ί Ι +Ό Ο +Ύ Υ +Ώ Ω +ΐ ι +Ϊ Ι +Ϋ Υ +ά α +έ ε +ή η +ί ι +ΰ υ +ϊ ι +ϋ υ +ό ο +ύ υ +ώ ω +Ё Е +ё е +ᴀ A +ᴁ AE +ᴃ B +ᴄ C +ᴅ D +ᴆ D +ᴇ E +ᴊ J +ᴋ K +ᴌ L +ᴍ M +ᴏ O +ᴘ P +ᴛ T +ᴜ U +ᴠ V +ᴡ W +ᴢ Z +ᵫ ue +ᵬ b +ᵭ d +ᵮ f +ᵯ m +ᵰ n +ᵱ p +ᵲ r +ᵳ r +ᵴ s +ᵵ t +ᵶ z +ᵺ th +ᵻ I +ᵽ p +ᵾ U +ᶀ b +ᶁ d +ᶂ f +ᶃ g +ᶄ k +ᶅ l +ᶆ m +ᶇ n +ᶈ p +ᶉ r +ᶊ s +ᶌ v +ᶍ x +ᶎ z +ᶏ a +ᶑ d +ᶒ e +ᶓ e +ᶖ i +ᶙ u +Ḁ A +ḁ a +Ḃ B +ḃ b +Ḅ B +ḅ b +Ḇ B +ḇ b +Ḉ C +ḉ c +Ḋ D +ḋ d +Ḍ D +ḍ d +Ḏ D +ḏ d +Ḑ D +ḑ d +Ḓ D +ḓ d +Ḕ E +ḕ e +Ḗ E +ḗ e +Ḙ E +ḙ e +Ḛ E +ḛ e +Ḝ E +ḝ e +Ḟ F +ḟ f +Ḡ G +ḡ g +Ḣ H +ḣ h +Ḥ H +ḥ h +Ḧ H +ḧ h +Ḩ H +ḩ h +Ḫ H +ḫ h +Ḭ I +ḭ i +Ḯ I +ḯ i +Ḱ K +ḱ k +Ḳ K +ḳ k +Ḵ K +ḵ k +Ḷ L +ḷ l +Ḹ L +ḹ l +Ḻ L +ḻ l +Ḽ L +ḽ l +Ḿ M +ḿ m +Ṁ M +ṁ m +Ṃ M +ṃ m +Ṅ N +ṅ n +Ṇ N +ṇ n +Ṉ N +ṉ n +Ṋ N +ṋ n +Ṍ O +ṍ o +Ṏ O +ṏ o +Ṑ O +ṑ o +Ṓ O +ṓ o +Ṕ P +ṕ p +Ṗ P +ṗ p +Ṙ R +ṙ r +Ṛ R +ṛ r +Ṝ R +ṝ r +Ṟ R +ṟ r +Ṡ S +ṡ s +Ṣ S +ṣ s +Ṥ S +ṥ s +Ṧ S +ṧ s +Ṩ S +ṩ s +Ṫ T +ṫ t +Ṭ T +ṭ 
t +Ṯ T +ṯ t +Ṱ T +ṱ t +Ṳ U +ṳ u +Ṵ U +ṵ u +Ṷ U +ṷ u +Ṹ U +ṹ u +Ṻ U +ṻ u +Ṽ V +ṽ v +Ṿ V +ṿ v +Ẁ W +ẁ w +Ẃ W +ẃ w +Ẅ W +ẅ w +Ẇ W +ẇ w +Ẉ W +ẉ w +Ẋ X +ẋ x +Ẍ X +ẍ x +Ẏ Y +ẏ y +Ẑ Z +ẑ z +Ẓ Z +ẓ z +Ẕ Z +ẕ z +ẖ h +ẗ t +ẘ w +ẙ y +ẚ a +ẜ s +ẝ s +ẞ SS +Ạ A +ạ a +Ả A +ả a +Ấ A +ấ a +Ầ A +ầ a +Ẩ A +ẩ a +Ẫ A +ẫ a +Ậ A +ậ a +Ắ A +ắ a +Ằ A +ằ a +Ẳ A +ẳ a +Ẵ A +ẵ a +Ặ A +ặ a +Ẹ E +ẹ e +Ẻ E +ẻ e +Ẽ E +ẽ e +Ế E +ế e +Ề E +ề e +Ể E +ể e +Ễ E +ễ e +Ệ E +ệ e +Ỉ I +ỉ i +Ị I +ị i +Ọ O +ọ o +Ỏ O +ỏ o +Ố O +ố o +Ồ O +ồ o +Ổ O +ổ o +Ỗ O +ỗ o +Ộ O +ộ o +Ớ O +ớ o +Ờ O +ờ o +Ở O +ở o +Ỡ O +ỡ o +Ợ O +ợ o +Ụ U +ụ u +Ủ U +ủ u +Ứ U +ứ u +Ừ U +ừ u +Ử U +ử u +Ữ U +ữ u +Ự U +ự u +Ỳ Y +ỳ y +Ỵ Y +ỵ y +Ỷ Y +ỷ y +Ỹ Y +ỹ y +Ỻ LL +ỻ ll +Ỽ V +ỽ v +Ỿ Y +ỿ y +ἀ α +ἁ α +ἂ α +ἃ α +ἄ α +ἅ α +ἆ α +ἇ α +Ἀ Α +Ἁ Α +Ἂ Α +Ἃ Α +Ἄ Α +Ἅ Α +Ἆ Α +Ἇ Α +ἐ ε +ἑ ε +ἒ ε +ἓ ε +ἔ ε +ἕ ε +Ἐ Ε +Ἑ Ε +Ἒ Ε +Ἓ Ε +Ἔ Ε +Ἕ Ε +ἠ η +ἡ η +ἢ η +ἣ η +ἤ η +ἥ η +ἦ η +ἧ η +Ἠ Η +Ἡ Η +Ἢ Η +Ἣ Η +Ἤ Η +Ἥ Η +Ἦ Η +Ἧ Η +ἰ ι +ἱ ι +ἲ ι +ἳ ι +ἴ ι +ἵ ι +ἶ ι +ἷ ι +Ἰ Ι +Ἱ Ι +Ἲ Ι +Ἳ Ι +Ἴ Ι +Ἵ Ι +Ἶ Ι +Ἷ Ι +ὀ ο +ὁ ο +ὂ ο +ὃ ο +ὄ ο +ὅ ο +Ὀ Ο +Ὁ Ο +Ὂ Ο +Ὃ Ο +Ὄ Ο +Ὅ Ο +ὐ υ +ὑ υ +ὒ υ +ὓ υ +ὔ υ +ὕ υ +ὖ υ +ὗ υ +Ὑ Υ +Ὓ Υ +Ὕ Υ +Ὗ Υ +ὠ ω +ὡ ω +ὢ ω +ὣ ω +ὤ ω +ὥ ω +ὦ ω +ὧ ω +Ὠ Ω +Ὡ Ω +Ὢ Ω +Ὣ Ω +Ὤ Ω +Ὥ Ω +Ὦ Ω +Ὧ Ω +ὰ α +ὲ ε +ὴ η +ὶ ι +ὸ ο +ὺ υ +ὼ ω +ᾀ α +ᾁ α +ᾂ α +ᾃ α +ᾄ α +ᾅ α +ᾆ α +ᾇ α +ᾈ Α +ᾉ Α +ᾊ Α +ᾋ Α +ᾌ Α +ᾍ Α +ᾎ Α +ᾏ Α +ᾐ η +ᾑ η +ᾒ η +ᾓ η +ᾔ η +ᾕ η +ᾖ η +ᾗ η +ᾘ Η +ᾙ Η +ᾚ Η +ᾛ Η +ᾜ Η +ᾝ Η +ᾞ Η +ᾟ Η +ᾠ ω +ᾡ ω +ᾢ ω +ᾣ ω +ᾤ ω +ᾥ ω +ᾦ ω +ᾧ ω +ᾨ Ω +ᾩ Ω +ᾪ Ω +ᾫ Ω +ᾬ Ω +ᾭ Ω +ᾮ Ω +ᾯ Ω +ᾰ α +ᾱ α +ᾲ α +ᾳ α +ᾴ α +ᾶ α +ᾷ α +Ᾰ Α +Ᾱ Α +Ὰ Α +ᾼ Α +ῂ η +ῃ η +ῄ η +ῆ η +ῇ η +Ὲ Ε +Ὴ Η +ῌ Η +ῐ ι +ῑ ι +ῒ ι +ῖ ι +ῗ ι +Ῐ Ι +Ῑ Ι +Ὶ Ι +ῠ υ +ῡ υ +ῢ υ +ῤ ρ +ῥ ρ +ῦ υ +ῧ υ +Ῠ Υ +Ῡ Υ +Ὺ Υ +Ῥ Ρ +ῲ ω +ῳ ω +ῴ ω +ῶ ω +ῷ ω +Ὸ Ο +Ὼ Ω +ῼ Ω +‐ - +‑ - +‒ - +– - +— - +― - +‖ || +‘ ' +’ ' +‚ , +‛ ' +“ " +” " +„ ,, +‟ " +․ . +‥ .. +… ... +′ ' +″ " +‹ < +› > +‼ !! +⁄ / +⁅ [ +⁆ ] +⁇ ?? +⁈ ?! +⁉ !? +⁎ * +₠ CE +₢ Cr +₣ Fr. +₤ L. 
+₧ Pts +₹ Rs +₺ TL +⃝ +⃞ +⃟ +⃠ +⃢ +⃣ +⃤ +℀ a/c +℁ a/s +ℂ C +℃ °C +℅ c/o +℆ c/u +℉ °F +ℊ g +ℋ H +ℌ x +ℍ H +ℎ h +ℐ I +ℑ I +ℒ L +ℓ l +ℕ N +№ No +℗ (P) +℘ P +ℙ P +ℚ Q +ℛ R +ℜ R +ℝ R +℞ Rx +℡ TEL +ℤ Z +ℨ Z +ℬ B +ℭ C +ℯ e +ℰ E +ℱ F +ℳ M +ℴ o +ℹ i +℻ FAX +ⅅ D +ⅆ d +ⅇ e +ⅈ i +ⅉ j +⅐ 1/7 +⅑ 1/9 +⅒ 1/10 +⅓ 1/3 +⅔ 2/3 +⅕ 1/5 +⅖ 2/5 +⅗ 3/5 +⅘ 4/5 +⅙ 1/6 +⅚ 5/6 +⅛ 1/8 +⅜ 3/8 +⅝ 5/8 +⅞ 7/8 +⅟ 1/ +Ⅰ I +Ⅱ II +Ⅲ III +Ⅳ IV +Ⅴ V +Ⅵ VI +Ⅶ VII +Ⅷ VIII +Ⅸ IX +Ⅹ X +Ⅺ XI +Ⅻ XII +Ⅼ L +Ⅽ C +Ⅾ D +Ⅿ M +ⅰ i +ⅱ ii +ⅲ iii +ⅳ iv +ⅴ v +ⅵ vi +ⅶ vii +ⅷ viii +ⅸ ix +ⅹ x +ⅺ xi +ⅻ xii +ⅼ l +ⅽ c +ⅾ d +ⅿ m +↉ 0/3 +− - +∕ / +∖ \ +∣ | +∥ || +≪ << +≫ >> +⑴ (1) +⑵ (2) +⑶ (3) +⑷ (4) +⑸ (5) +⑹ (6) +⑺ (7) +⑻ (8) +⑼ (9) +⑽ (10) +⑾ (11) +⑿ (12) +⒀ (13) +⒁ (14) +⒂ (15) +⒃ (16) +⒄ (17) +⒅ (18) +⒆ (19) +⒇ (20) +⒈ 1. +⒉ 2. +⒊ 3. +⒋ 4. +⒌ 5. +⒍ 6. +⒎ 7. +⒏ 8. +⒐ 9. +⒑ 10. +⒒ 11. +⒓ 12. +⒔ 13. +⒕ 14. +⒖ 15. +⒗ 16. +⒘ 17. +⒙ 18. +⒚ 19. +⒛ 20. +⒜ (a) +⒝ (b) +⒞ (c) +⒟ (d) +⒠ (e) +⒡ (f) +⒢ (g) +⒣ (h) +⒤ (i) +⒥ (j) +⒦ (k) +⒧ (l) +⒨ (m) +⒩ (n) +⒪ (o) +⒫ (p) +⒬ (q) +⒭ (r) +⒮ (s) +⒯ (t) +⒰ (u) +⒱ (v) +⒲ (w) +⒳ (x) +⒴ (y) +⒵ (z) +⦅ (( +⦆ )) +⩴ ::= +⩵ == +⩶ === +Ⱡ L +ⱡ l +Ɫ L +Ᵽ P +Ɽ R +ⱥ a +ⱦ t +Ⱨ H +ⱨ h +Ⱪ K +ⱪ k +Ⱬ Z +ⱬ z +Ɱ M +ⱱ v +Ⱳ W +ⱳ w +ⱴ v +ⱸ e +ⱺ o +Ȿ S +Ɀ Z +、 , +。 . +〇 0 +〈 < +〉 > +《 << +》 >> +〔 [ +〕 ] +〘 [ +〙 ] +〚 [ +〛 ] +〝 " +〞 " +㍱ hPa +㍲ da +㍳ AU +㍴ bar +㍵ oV +㍶ pc +㍷ dm +㍺ IU +㎀ pA +㎁ nA +㎃ mA +㎄ kA +㎅ KB +㎆ MB +㎇ GB +㎈ cal +㎉ kcal +㎊ pF +㎋ nF +㎎ mg +㎏ kg +㎐ Hz +㎑ kHz +㎒ MHz +㎓ GHz +㎔ THz +㎙ fm +㎚ nm +㎜ mm +㎝ cm +㎞ km +㎧ m/s +㎩ Pa +㎪ kPa +㎫ MPa +㎬ GPa +㎭ rad +㎮ rad/s +㎰ ps +㎱ ns +㎳ ms +㎴ pV +㎵ nV +㎷ mV +㎸ kV +㎹ MV +㎺ pW +㎻ nW +㎽ mW +㎾ kW +㎿ MW +㏂ a.m. +㏃ Bq +㏄ cc +㏅ cd +㏆ C/kg +㏇ Co. +㏈ dB +㏉ Gy +㏊ ha +㏋ HP +㏌ in +㏍ KK +㏎ KM +㏏ kt +㏐ lm +㏑ ln +㏒ log +㏓ lx +㏔ mb +㏕ mil +㏖ mol +㏗ pH +㏘ p.m. 
+㏙ PPM +㏚ PR +㏛ sr +㏜ Sv +㏝ Wb +㏞ V/m +㏟ A/m +ꜰ F +ꜱ S +Ꜳ AA +ꜳ aa +Ꜵ AO +ꜵ ao +Ꜷ AU +ꜷ au +Ꜹ AV +ꜹ av +Ꜻ AV +ꜻ av +Ꜽ AY +ꜽ ay +Ꝁ K +ꝁ k +Ꝃ K +ꝃ k +Ꝅ K +ꝅ k +Ꝇ L +ꝇ l +Ꝉ L +ꝉ l +Ꝋ O +ꝋ o +Ꝍ O +ꝍ o +Ꝏ OO +ꝏ oo +Ꝑ P +ꝑ p +Ꝓ P +ꝓ p +Ꝕ P +ꝕ p +Ꝗ Q +ꝗ q +Ꝙ Q +ꝙ q +Ꝟ V +ꝟ v +Ꝡ VY +ꝡ vy +Ꝥ TH +ꝥ th +Ꝧ TH +ꝧ th +ꝱ d +ꝲ l +ꝳ m +ꝴ n +ꝵ r +ꝶ R +ꝷ t +Ꝺ D +ꝺ d +Ꝼ F +ꝼ f +Ꞇ T +ꞇ t +Ꞑ N +ꞑ n +Ꞓ C +ꞓ c +Ꞡ G +ꞡ g +Ꞣ K +ꞣ k +Ꞥ N +ꞥ n +Ꞧ R +ꞧ r +Ꞩ S +ꞩ s +Ɦ H +ff ff +fi fi +fl fl +ffi ffi +ffl ffl +ſt st +st st +︐ , +︑ , +︒ . +︓ : +︔ ; +︕ ! +︖ ? +︙ ... +︰ .. +︱ - +︲ - +︵ ( +︶ ) +︷ { +︸ } +︹ [ +︺ ] +︽ << +︾ >> +︿ < +﹀ > +﹇ [ +﹈ ] +﹐ , +﹑ , +﹒ . +﹔ ; +﹕ : +﹖ ? +﹗ ! +﹘ - +﹙ ( +﹚ ) +﹛ { +﹜ } +﹝ [ +﹞ ] +﹟ # +﹠ & +﹡ * +﹢ + +﹣ - +﹤ < +﹥ > +﹦ = +﹨ \ +﹩ $ +﹪ % +﹫ @ +! ! +" " +# # +$ $ +% % +& & +' ' +( ( +) ) +* * ++ + +, , +- - +. . +/ / +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +: : +; ; +< < += = +> > +? ? +@ @ +A A +B B +C C +D D +E E +F F +G G +H H +I I +J J +K K +L L +M M +N N +O O +P P +Q Q +R R +S S +T T +U U +V V +W W +X X +Y Y +Z Z +[ [ +\ \ +] ] +^ ^ +_ _ +` ` +a a +b b +c c +d d +e e +f f +g g +h h +i i +j j +k k +l l +m m +n n +o o +p p +q q +r r +s s +t t +u u +v v +w w +x x +y y +z z +{ { +| | +} } +~ ~ +⦅ (( +⦆ )) +。 . +、 , +← <- +→ -> |