diff options
Diffstat (limited to 'src/common/unicode_norm.c')
-rw-r--r-- | src/common/unicode_norm.c | 634 |
1 files changed, 634 insertions, 0 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c new file mode 100644 index 0000000..06bf921 --- /dev/null +++ b/src/common/unicode_norm.c @@ -0,0 +1,634 @@ +/*------------------------------------------------------------------------- + * unicode_norm.c + * Normalize a Unicode string + * + * This implements Unicode normalization, per the documentation at + * https://www.unicode.org/reports/tr15/. + * + * Portions Copyright (c) 2017-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode_norm.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "common/unicode_norm.h" +#ifndef FRONTEND +#include "common/unicode_norm_hashfunc.h" +#include "common/unicode_normprops_table.h" +#include "port/pg_bswap.h" +#else +#include "common/unicode_norm_table.h" +#endif + +#ifndef FRONTEND +#define ALLOC(size) palloc(size) +#define FREE(size) pfree(size) +#else +#define ALLOC(size) malloc(size) +#define FREE(size) free(size) +#endif + +/* Constants for calculations with Hangul characters */ +#define SBASE 0xAC00 /* U+AC00 */ +#define LBASE 0x1100 /* U+1100 */ +#define VBASE 0x1161 /* U+1161 */ +#define TBASE 0x11A7 /* U+11A7 */ +#define LCOUNT 19 +#define VCOUNT 21 +#define TCOUNT 28 +#define NCOUNT VCOUNT * TCOUNT +#define SCOUNT LCOUNT * NCOUNT + +#ifdef FRONTEND +/* comparison routine for bsearch() of decomposition lookup table. */ +static int +conv_compare(const void *p1, const void *p2) +{ + uint32 v1, + v2; + + v1 = *(const uint32 *) p1; + v2 = ((const pg_unicode_decomposition *) p2)->codepoint; + return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); +} + +#endif + +/* + * get_code_entry + * + * Get the entry corresponding to code in the decomposition lookup table. + * The backend version of this code uses a perfect hash function for the + * lookup, while the frontend version uses a binary search. + */ +static const pg_unicode_decomposition * +get_code_entry(pg_wchar code) +{ +#ifndef FRONTEND + int h; + uint32 hashkey; + pg_unicode_decompinfo decompinfo = UnicodeDecompInfo; + + /* + * Compute the hash function. The hash key is the codepoint with the bytes + * in network order. + */ + hashkey = pg_hton32(code); + h = decompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= decompinfo.num_decomps) + return NULL; + + /* + * Since it's a perfect hash, we need only match to the specific codepoint + * it identifies. + */ + if (code != decompinfo.decomps[h].codepoint) + return NULL; + + /* Success! */ + return &decompinfo.decomps[h]; +#else + return bsearch(&(code), + UnicodeDecompMain, + lengthof(UnicodeDecompMain), + sizeof(pg_unicode_decomposition), + conv_compare); +#endif +} + +/* + * Get the combining class of the given codepoint. + */ +static uint8 +get_canonical_class(pg_wchar code) +{ + const pg_unicode_decomposition *entry = get_code_entry(code); + + /* + * If no entries are found, the character used is either an Hangul + * character or a character with a class of 0 and no decompositions. + */ + if (!entry) + return 0; + else + return entry->comb_class; +} + +/* + * Given a decomposition entry looked up earlier, get the decomposed + * characters. + * + * Note: the returned pointer can point to statically allocated buffer, and + * is only valid until next call to this function! + */ +static const pg_wchar * +get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) +{ + static pg_wchar x; + + if (DECOMPOSITION_IS_INLINE(entry)) + { + Assert(DECOMPOSITION_SIZE(entry) == 1); + x = (pg_wchar) entry->dec_index; + *dec_size = 1; + return &x; + } + else + { + *dec_size = DECOMPOSITION_SIZE(entry); + return &UnicodeDecomp_codepoints[entry->dec_index]; + } +} + +/* + * Calculate how many characters a given character will decompose to. + * + * This needs to recurse, if the character decomposes into characters that + * are, in turn, decomposable. + */ +static int +get_decomposed_size(pg_wchar code, bool compat) +{ + const pg_unicode_decomposition *entry; + int size = 0; + int i; + const uint32 *decomp; + int dec_size; + + /* + * Fast path for Hangul characters not stored in tables to save memory as + * decomposition is algorithmic. See + * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details + * on the matter. + */ + if (code >= SBASE && code < SBASE + SCOUNT) + { + uint32 tindex, + sindex; + + sindex = code - SBASE; + tindex = sindex % TCOUNT; + + if (tindex != 0) + return 3; + return 2; + } + + entry = get_code_entry(code); + + /* + * Just count current code if no other decompositions. A NULL entry is + * equivalent to a character with class 0 and no decompositions. + */ + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) + return 1; + + /* + * If this entry has other decomposition codes look at them as well. First + * get its decomposition in the list of tables available. + */ + decomp = get_code_decomposition(entry, &dec_size); + for (i = 0; i < dec_size; i++) + { + uint32 lcode = decomp[i]; + + size += get_decomposed_size(lcode, compat); + } + + return size; +} + +/* + * Recompose a set of characters. For hangul characters, the calculation + * is algorithmic. For others, an inverse lookup at the decomposition + * table is necessary. Returns true if a recomposition can be done, and + * false otherwise. + */ +static bool +recompose_code(uint32 start, uint32 code, uint32 *result) +{ + /* + * Handle Hangul characters algorithmically, per the Unicode spec. + * + * Check if two current characters are L and V. + */ + if (start >= LBASE && start < LBASE + LCOUNT && + code >= VBASE && code < VBASE + VCOUNT) + { + /* make syllable of form LV */ + uint32 lindex = start - LBASE; + uint32 vindex = code - VBASE; + + *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT; + return true; + } + /* Check if two current characters are LV and T */ + else if (start >= SBASE && start < (SBASE + SCOUNT) && + ((start - SBASE) % TCOUNT) == 0 && + code >= TBASE && code < (TBASE + TCOUNT)) + { + /* make syllable of form LVT */ + uint32 tindex = code - TBASE; + + *result = start + tindex; + return true; + } + else + { + const pg_unicode_decomposition *entry; + + /* + * Do an inverse lookup of the decomposition tables to see if anything + * matches. The comparison just needs to be a perfect match on the + * sub-table of size two, because the start character has already been + * recomposed partially. This lookup uses a perfect hash function for + * the backend code. + */ +#ifndef FRONTEND + + int h, + inv_lookup_index; + uint64 hashkey; + pg_unicode_recompinfo recompinfo = UnicodeRecompInfo; + + /* + * Compute the hash function. The hash key is formed by concatenating + * bytes of the two codepoints in network order. See also + * src/common/unicode/generate-unicode_norm_table.pl. + */ + hashkey = pg_hton64(((uint64) start << 32) | (uint64) code); + h = recompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= recompinfo.num_recomps) + return false; + + inv_lookup_index = recompinfo.inverse_lookup[h]; + entry = &UnicodeDecompMain[inv_lookup_index]; + + if (start == UnicodeDecomp_codepoints[entry->dec_index] && + code == UnicodeDecomp_codepoints[entry->dec_index + 1]) + { + *result = entry->codepoint; + return true; + } + +#else + + int i; + + for (i = 0; i < lengthof(UnicodeDecompMain); i++) + { + entry = &UnicodeDecompMain[i]; + + if (DECOMPOSITION_SIZE(entry) != 2) + continue; + + if (DECOMPOSITION_NO_COMPOSE(entry)) + continue; + + if (start == UnicodeDecomp_codepoints[entry->dec_index] && + code == UnicodeDecomp_codepoints[entry->dec_index + 1]) + { + *result = entry->codepoint; + return true; + } + } +#endif /* !FRONTEND */ + } + + return false; +} + +/* + * Decompose the given code into the array given by caller. The + * decomposition begins at the position given by caller, saving one + * lookup on the decomposition table. The current position needs to be + * updated here to let the caller know from where to continue filling + * in the array result. + */ +static void +decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) +{ + const pg_unicode_decomposition *entry; + int i; + const uint32 *decomp; + int dec_size; + + /* + * Fast path for Hangul characters not stored in tables to save memory as + * decomposition is algorithmic. See + * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details + * on the matter. + */ + if (code >= SBASE && code < SBASE + SCOUNT) + { + uint32 l, + v, + tindex, + sindex; + pg_wchar *res = *result; + + sindex = code - SBASE; + l = LBASE + sindex / (VCOUNT * TCOUNT); + v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT; + tindex = sindex % TCOUNT; + + res[*current] = l; + (*current)++; + res[*current] = v; + (*current)++; + + if (tindex != 0) + { + res[*current] = TBASE + tindex; + (*current)++; + } + + return; + } + + entry = get_code_entry(code); + + /* + * Just fill in with the current decomposition if there are no + * decomposition codes to recurse to. A NULL entry is equivalent to a + * character with class 0 and no decompositions, so just leave also in + * this case. + */ + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) + { + pg_wchar *res = *result; + + res[*current] = code; + (*current)++; + return; + } + + /* + * If this entry has other decomposition codes look at them as well. + */ + decomp = get_code_decomposition(entry, &dec_size); + for (i = 0; i < dec_size; i++) + { + pg_wchar lcode = (pg_wchar) decomp[i]; + + /* Leave if no more decompositions */ + decompose_code(lcode, compat, result, current); + } +} + +/* + * unicode_normalize - Normalize a Unicode string to the specified form. + * + * The input is a 0-terminated array of codepoints. + * + * In frontend, returns a 0-terminated array of codepoints, allocated with + * malloc. Or NULL if we run out of memory. In backend, the returned + * string is palloc'd instead, and OOM is reported with ereport(). + */ +pg_wchar * +unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) +{ + bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD); + bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC); + pg_wchar *decomp_chars; + pg_wchar *recomp_chars; + int decomp_size, + current_size; + int count; + const pg_wchar *p; + + /* variables for recomposition */ + int last_class; + int starter_pos; + int target_pos; + uint32 starter_ch; + + /* First, do character decomposition */ + + /* + * Calculate how many characters long the decomposed version will be. + */ + decomp_size = 0; + for (p = input; *p; p++) + decomp_size += get_decomposed_size(*p, compat); + + decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); + if (decomp_chars == NULL) + return NULL; + + /* + * Now fill in each entry recursively. This needs a second pass on the + * decomposition table. + */ + current_size = 0; + for (p = input; *p; p++) + decompose_code(*p, compat, &decomp_chars, ¤t_size); + decomp_chars[decomp_size] = '\0'; + Assert(decomp_size == current_size); + + /* Leave if there is nothing to decompose */ + if (decomp_size == 0) + return decomp_chars; + + /* + * Now apply canonical ordering. + */ + for (count = 1; count < decomp_size; count++) + { + pg_wchar prev = decomp_chars[count - 1]; + pg_wchar next = decomp_chars[count]; + pg_wchar tmp; + const uint8 prevClass = get_canonical_class(prev); + const uint8 nextClass = get_canonical_class(next); + + /* + * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) + * annex 4, a sequence of two adjacent characters in a string is an + * exchangeable pair if the combining class (from the Unicode + * Character Database) for the first character is greater than the + * combining class for the second, and the second is not a starter. A + * character is a starter if its combining class is 0. + */ + if (prevClass == 0 || nextClass == 0) + continue; + + if (prevClass <= nextClass) + continue; + + /* exchange can happen */ + tmp = decomp_chars[count - 1]; + decomp_chars[count - 1] = decomp_chars[count]; + decomp_chars[count] = tmp; + + /* backtrack to check again */ + if (count > 1) + count -= 2; + } + + if (!recompose) + return decomp_chars; + + /* + * The last phase of NFC and NFKC is the recomposition of the reordered + * Unicode string using combining classes. The recomposed string cannot be + * longer than the decomposed one, so make the allocation of the output + * string based on that assumption. + */ + recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); + if (!recomp_chars) + { + FREE(decomp_chars); + return NULL; + } + + last_class = -1; /* this eliminates a special check */ + starter_pos = 0; + target_pos = 1; + starter_ch = recomp_chars[0] = decomp_chars[0]; + + for (count = 1; count < decomp_size; count++) + { + pg_wchar ch = decomp_chars[count]; + int ch_class = get_canonical_class(ch); + pg_wchar composite; + + if (last_class < ch_class && + recompose_code(starter_ch, ch, &composite)) + { + recomp_chars[starter_pos] = composite; + starter_ch = composite; + } + else if (ch_class == 0) + { + starter_pos = target_pos; + starter_ch = ch; + last_class = -1; + recomp_chars[target_pos++] = ch; + } + else + { + last_class = ch_class; + recomp_chars[target_pos++] = ch; + } + } + recomp_chars[target_pos] = (pg_wchar) '\0'; + + FREE(decomp_chars); + + return recomp_chars; +} + +/* + * Normalization "quick check" algorithm; see + * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms> + */ + +/* We only need this in the backend. */ +#ifndef FRONTEND + +static const pg_unicode_normprops * +qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo) +{ + int h; + uint32 hashkey; + + /* + * Compute the hash function. The hash key is the codepoint with the bytes + * in network order. + */ + hashkey = pg_hton32(ch); + h = norminfo->hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= norminfo->num_normprops) + return NULL; + + /* + * Since it's a perfect hash, we need only match to the specific codepoint + * it identifies. + */ + if (ch != norminfo->normprops[h].codepoint) + return NULL; + + /* Success! */ + return &norminfo->normprops[h]; +} + +/* + * Look up the normalization quick check character property + */ +static UnicodeNormalizationQC +qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch) +{ + const pg_unicode_normprops *found = NULL; + + switch (form) + { + case UNICODE_NFC: + found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC); + break; + case UNICODE_NFKC: + found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC); + break; + default: + Assert(false); + break; + } + + if (found) + return found->quickcheck; + else + return UNICODE_NORM_QC_YES; +} + +UnicodeNormalizationQC +unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input) +{ + uint8 lastCanonicalClass = 0; + UnicodeNormalizationQC result = UNICODE_NORM_QC_YES; + + /* + * For the "D" forms, we don't run the quickcheck. We don't include the + * lookup tables for those because they are huge, checking for these + * particular forms is less common, and running the slow path is faster + * for the "D" forms than the "C" forms because you don't need to + * recompose, which is slow. + */ + if (form == UNICODE_NFD || form == UNICODE_NFKD) + return UNICODE_NORM_QC_MAYBE; + + for (const pg_wchar *p = input; *p; p++) + { + pg_wchar ch = *p; + uint8 canonicalClass; + UnicodeNormalizationQC check; + + canonicalClass = get_canonical_class(ch); + if (lastCanonicalClass > canonicalClass && canonicalClass != 0) + return UNICODE_NORM_QC_NO; + + check = qc_is_allowed(form, ch); + if (check == UNICODE_NORM_QC_NO) + return UNICODE_NORM_QC_NO; + else if (check == UNICODE_NORM_QC_MAYBE) + result = UNICODE_NORM_QC_MAYBE; + + lastCanonicalClass = canonicalClass; + } + return result; +} + +#endif /* !FRONTEND */ |