diff options
Diffstat (limited to 'contrib/fuzzystrmatch/daitch_mokotoff.c')
-rw-r--r-- | contrib/fuzzystrmatch/daitch_mokotoff.c | 571 |
1 files changed, 571 insertions, 0 deletions
diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c new file mode 100644 index 0000000..162e32c --- /dev/null +++ b/contrib/fuzzystrmatch/daitch_mokotoff.c @@ -0,0 +1,571 @@ +/* + * Daitch-Mokotoff Soundex + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * This module was originally sponsored by Finance Norway / + * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no> + * + * The implementation of the Daitch-Mokotoff Soundex System aims at correctness + * and high performance, and can be summarized as follows: + * + * - The processing of each phoneme is initiated by an O(1) table lookup. + * - For phonemes containing more than one character, a coding tree is traversed + * to process the complete phoneme. + * - The (alternate) soundex codes are produced digit by digit in-place in + * another tree structure. + * + * References: + * + * https://www.avotaynu.com/soundex.htm + * https://www.jewishgen.org/InfoFiles/Soundex.html + * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex + * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php) + * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java) + * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm) + * + * A few notes on other implementations: + * + * - All other known implementations have the same unofficial rules for "UE", + * these are also adapted by this implementation (0, 1, NC). + * - The only other known implementation which is capable of generating all + * correct soundex codes in all cases is the JOS Soundex Calculator at + * https://www.jewishgen.org/jos/jossound.htm + * - "J" is considered (only) a vowel in dmlat.php + * - The official rules for "RS" are commented out in dmlat.php + * - Identical code digits for adjacent letters are not collapsed correctly in + * dmsoundex.php when double digit codes are involved. E.g. "BESST" yields + * 744300 instead of 743000 as for "BEST". + * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java + * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java +*/ + +#include "postgres.h" + +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + + +/* + * The soundex coding chart table is adapted from + * https://www.jewishgen.org/InfoFiles/Soundex.html + * See daitch_mokotoff_header.pl for details. +*/ + +/* Generated coding chart table */ +#include "daitch_mokotoff.h" + +#define DM_CODE_DIGITS 6 + +/* Node in soundex code tree */ +typedef struct dm_node +{ + int soundex_length; /* Length of generated soundex code */ + char soundex[DM_CODE_DIGITS]; /* Soundex code */ + int is_leaf; /* Candidate for complete soundex code */ + int last_update; /* Letter number for last update of node */ + char code_digit; /* Last code digit, 0 - 9 */ + + /* + * One or two alternate code digits leading to this node. If there are two + * digits, one of them is always an 'X'. Repeated code digits and 'X' lead + * back to the same node. + */ + char prev_code_digits[2]; + /* One or two alternate code digits moving forward. */ + char next_code_digits[2]; + /* ORed together code index(es) used to reach current node. */ + int prev_code_index; + int next_code_index; + /* Possible nodes branching out from this node - digits 0-9. */ + struct dm_node *children[10]; + /* Next node in linked list. Alternating index for each iteration. */ + struct dm_node *next[2]; +} dm_node; + +/* Template for new node in soundex code tree. */ +static const dm_node start_node = { + .soundex_length = 0, + .soundex = "000000", /* Six digits */ + .is_leaf = 0, + .last_update = 0, + .code_digit = '\0', + .prev_code_digits = {'\0', '\0'}, + .next_code_digits = {'\0', '\0'}, + .prev_code_index = 0, + .next_code_index = 0, + .children = {NULL}, + .next = {NULL} +}; + +/* Dummy soundex codes at end of input. */ +static const dm_codes end_codes[2] = +{ + { + "X", "X", "X" + } +}; + +/* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */ +static const char iso8859_1_to_ascii_upper[] = +"`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~ ! ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY"; + +/* Internal C implementation */ +static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex); + + +PG_FUNCTION_INFO_V1(daitch_mokotoff); + +Datum +daitch_mokotoff(PG_FUNCTION_ARGS) +{ + text *arg = PG_GETARG_TEXT_PP(0); + Datum retval; + char *string; + ArrayBuildState *soundex; + MemoryContext old_ctx, + tmp_ctx; + + /* Work in a temporary context to simplify cleanup. */ + tmp_ctx = AllocSetContextCreate(CurrentMemoryContext, + "daitch_mokotoff temporary context", + ALLOCSET_DEFAULT_SIZES); + old_ctx = MemoryContextSwitchTo(tmp_ctx); + + /* We must convert the string to UTF-8 if it isn't already. */ + string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg), + PG_UTF8); + + /* The result is built in this ArrayBuildState. */ + soundex = initArrayResult(TEXTOID, tmp_ctx, false); + + if (!daitch_mokotoff_coding(string, soundex)) + { + /* No encodable characters in input */ + MemoryContextSwitchTo(old_ctx); + MemoryContextDelete(tmp_ctx); + PG_RETURN_NULL(); + } + + retval = makeArrayResult(soundex, old_ctx); + + MemoryContextSwitchTo(old_ctx); + MemoryContextDelete(tmp_ctx); + + PG_RETURN_DATUM(retval); +} + + +/* Initialize soundex code tree node for next code digit. */ +static void +initialize_node(dm_node *node, int last_update) +{ + if (node->last_update < last_update) + { + node->prev_code_digits[0] = node->next_code_digits[0]; + node->prev_code_digits[1] = node->next_code_digits[1]; + node->next_code_digits[0] = '\0'; + node->next_code_digits[1] = '\0'; + node->prev_code_index = node->next_code_index; + node->next_code_index = 0; + node->is_leaf = 0; + node->last_update = last_update; + } +} + + +/* Update soundex code tree node with next code digit. */ +static void +add_next_code_digit(dm_node *node, int code_index, char code_digit) +{ + /* OR in index 1 or 2. */ + node->next_code_index |= code_index; + + if (!node->next_code_digits[0]) + node->next_code_digits[0] = code_digit; + else if (node->next_code_digits[0] != code_digit) + node->next_code_digits[1] = code_digit; +} + + +/* Mark soundex code tree node as leaf. */ +static void +set_leaf(dm_node *first_node[2], dm_node *last_node[2], + dm_node *node, int ix_node) +{ + if (!node->is_leaf) + { + node->is_leaf = 1; + + if (first_node[ix_node] == NULL) + first_node[ix_node] = node; + else + last_node[ix_node]->next[ix_node] = node; + + last_node[ix_node] = node; + node->next[ix_node] = NULL; + } +} + + +/* Find next node corresponding to code digit, or create a new node. */ +static dm_node * +find_or_create_child_node(dm_node *parent, char code_digit, + ArrayBuildState *soundex) +{ + int i = code_digit - '0'; + dm_node **nodes = parent->children; + dm_node *node = nodes[i]; + + if (node) + { + /* Found existing child node. Skip completed nodes. */ + return node->soundex_length < DM_CODE_DIGITS ? node : NULL; + } + + /* Create new child node. */ + node = palloc_object(dm_node); + nodes[i] = node; + + *node = start_node; + memcpy(node->soundex, parent->soundex, sizeof(parent->soundex)); + node->soundex_length = parent->soundex_length; + node->soundex[node->soundex_length++] = code_digit; + node->code_digit = code_digit; + node->next_code_index = node->prev_code_index; + + if (node->soundex_length < DM_CODE_DIGITS) + { + return node; + } + else + { + /* Append completed soundex code to output array. */ + text *out = cstring_to_text_with_len(node->soundex, + DM_CODE_DIGITS); + + accumArrayResult(soundex, + PointerGetDatum(out), + false, + TEXTOID, + CurrentMemoryContext); + return NULL; + } +} + + +/* Update node for next code digit(s). */ +static void +update_node(dm_node *first_node[2], dm_node *last_node[2], + dm_node *node, int ix_node, + int letter_no, int prev_code_index, int next_code_index, + const char *next_code_digits, int digit_no, + ArrayBuildState *soundex) +{ + int i; + char next_code_digit = next_code_digits[digit_no]; + int num_dirty_nodes = 0; + dm_node *dirty_nodes[2]; + + initialize_node(node, letter_no); + + if (node->prev_code_index && !(node->prev_code_index & prev_code_index)) + { + /* + * If the sound (vowel / consonant) of this letter encoding doesn't + * correspond to the coding index of the previous letter, we skip this + * letter encoding. Note that currently, only "J" can be either a + * vowel or a consonant. + */ + return; + } + + if (next_code_digit == 'X' || + (digit_no == 0 && + (node->prev_code_digits[0] == next_code_digit || + node->prev_code_digits[1] == next_code_digit))) + { + /* The code digit is the same as one of the previous (i.e. not added). */ + dirty_nodes[num_dirty_nodes++] = node; + } + + if (next_code_digit != 'X' && + (digit_no > 0 || + node->prev_code_digits[0] != next_code_digit || + node->prev_code_digits[1])) + { + /* The code digit is different from one of the previous (i.e. added). */ + node = find_or_create_child_node(node, next_code_digit, soundex); + if (node) + { + initialize_node(node, letter_no); + dirty_nodes[num_dirty_nodes++] = node; + } + } + + for (i = 0; i < num_dirty_nodes; i++) + { + /* Add code digit leading to the current node. */ + add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit); + + if (next_code_digits[++digit_no]) + { + update_node(first_node, last_node, dirty_nodes[i], ix_node, + letter_no, prev_code_index, next_code_index, + next_code_digits, digit_no, + soundex); + } + else + { + /* Add incomplete leaf node to linked list. */ + set_leaf(first_node, last_node, dirty_nodes[i], ix_node); + } + } +} + + +/* Update soundex tree leaf nodes. */ +static void +update_leaves(dm_node *first_node[2], int *ix_node, int letter_no, + const dm_codes *codes, const dm_codes *next_codes, + ArrayBuildState *soundex) +{ + int i, + j, + code_index; + dm_node *node, + *last_node[2]; + const dm_code *code, + *next_code; + int ix_node_next = (*ix_node + 1) & 1; /* Alternating index: 0, 1 */ + + /* Initialize for new linked list of leaves. */ + first_node[ix_node_next] = NULL; + last_node[ix_node_next] = NULL; + + /* Process all nodes. */ + for (node = first_node[*ix_node]; node; node = node->next[*ix_node]) + { + /* One or two alternate code sequences. */ + for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++) + { + /* Coding for previous letter - before vowel: 1, all other: 2 */ + int prev_code_index = (code[0][0] > '1') + 1; + + /* One or two alternate next code sequences. */ + for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++) + { + /* Determine which code to use. */ + if (letter_no == 0) + { + /* This is the first letter. */ + code_index = 0; + } + else if (next_code[0][0] <= '1') + { + /* The next letter is a vowel. */ + code_index = 1; + } + else + { + /* All other cases. */ + code_index = 2; + } + + /* One or two sequential code digits. */ + update_node(first_node, last_node, node, ix_node_next, + letter_no, prev_code_index, code_index, + code[code_index], 0, + soundex); + } + } + } + + *ix_node = ix_node_next; +} + + +/* + * Return next character, converted from UTF-8 to uppercase ASCII. + * *ix is the current string index and is incremented by the character length. + */ +static char +read_char(const unsigned char *str, int *ix) +{ + /* Substitute character for skipped code points. */ + const char na = '\x1a'; + pg_wchar c; + + /* Decode UTF-8 character to ISO 10646 code point. */ + str += *ix; + c = utf8_to_unicode(str); + + /* Advance *ix, but (for safety) not if we've reached end of string. */ + if (c) + *ix += pg_utf_mblen(str); + + /* Convert. */ + if (c >= (unsigned char) '[' && c <= (unsigned char) ']') + { + /* ASCII characters [, \, and ] are reserved for conversions below. */ + return na; + } + else if (c < 0x60) + { + /* Other non-lowercase ASCII characters can be used as-is. */ + return (char) c; + } + else if (c < 0x100) + { + /* ISO-8859-1 code point; convert to upper-case ASCII via table. */ + return iso8859_1_to_ascii_upper[c - 0x60]; + } + else + { + /* Conversion of non-ASCII characters in the coding chart. */ + switch (c) + { + case 0x0104: /* LATIN CAPITAL LETTER A WITH OGONEK */ + case 0x0105: /* LATIN SMALL LETTER A WITH OGONEK */ + return '['; + case 0x0118: /* LATIN CAPITAL LETTER E WITH OGONEK */ + case 0x0119: /* LATIN SMALL LETTER E WITH OGONEK */ + return '\\'; + case 0x0162: /* LATIN CAPITAL LETTER T WITH CEDILLA */ + case 0x0163: /* LATIN SMALL LETTER T WITH CEDILLA */ + case 0x021A: /* LATIN CAPITAL LETTER T WITH COMMA BELOW */ + case 0x021B: /* LATIN SMALL LETTER T WITH COMMA BELOW */ + return ']'; + default: + return na; + } + } +} + + +/* Read next ASCII character, skipping any characters not in [A-\]]. */ +static char +read_valid_char(const char *str, int *ix) +{ + char c; + + while ((c = read_char((const unsigned char *) str, ix)) != '\0') + { + if (c >= 'A' && c <= ']') + break; + } + + return c; +} + + +/* Return sound coding for "letter" (letter sequence) */ +static const dm_codes * +read_letter(const char *str, int *ix) +{ + char c, + cmp; + int i, + j; + const dm_letter *letters; + const dm_codes *codes; + + /* First letter in sequence. */ + if ((c = read_valid_char(str, ix)) == '\0') + return NULL; + + letters = &letter_[c - 'A']; + codes = letters->codes; + i = *ix; + + /* Any subsequent letters in sequence. */ + while ((letters = letters->letters) && (c = read_valid_char(str, &i))) + { + for (j = 0; (cmp = letters[j].letter); j++) + { + if (cmp == c) + { + /* Letter found. */ + letters = &letters[j]; + if (letters->codes) + { + /* Coding for letter sequence found. */ + codes = letters->codes; + *ix = i; + } + break; + } + } + if (!cmp) + { + /* The sequence of letters has no coding. */ + break; + } + } + + return codes; +} + + +/* + * Generate all Daitch-Mokotoff soundex codes for word, + * adding them to the "soundex" ArrayBuildState. + * Returns false if string has no encodable characters, else true. + */ +static bool +daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex) +{ + int i = 0; + int letter_no = 0; + int ix_node = 0; + const dm_codes *codes, + *next_codes; + dm_node *first_node[2], + *node; + + /* First letter. */ + if (!(codes = read_letter(word, &i))) + { + /* No encodable character in input. */ + return false; + } + + /* Starting point. */ + first_node[ix_node] = palloc_object(dm_node); + *first_node[ix_node] = start_node; + + /* + * Loop until either the word input is exhausted, or all generated soundex + * codes are completed to six digits. + */ + while (codes && first_node[ix_node]) + { + next_codes = read_letter(word, &i); + + /* Update leaf nodes. */ + update_leaves(first_node, &ix_node, letter_no, + codes, next_codes ? next_codes : end_codes, + soundex); + + codes = next_codes; + letter_no++; + } + + /* Append all remaining (incomplete) soundex codes to output array. */ + for (node = first_node[ix_node]; node; node = node->next[ix_node]) + { + text *out = cstring_to_text_with_len(node->soundex, + DM_CODE_DIGITS); + + accumArrayResult(soundex, + PointerGetDatum(out), + false, + TEXTOID, + CurrentMemoryContext); + } + + return true; +} |