summaryrefslogtreecommitdiffstats
path: root/contrib/fuzzystrmatch/daitch_mokotoff.c
blob: 162e32c11543fdcc278ace74db836a303331c0f3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
/*
 * Daitch-Mokotoff Soundex
 *
 * Copyright (c) 2023, PostgreSQL Global Development Group
 *
 * This module was originally sponsored by Finance Norway /
 * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
 *
 * The implementation of the Daitch-Mokotoff Soundex System aims at correctness
 * and high performance, and can be summarized as follows:
 *
 * - The processing of each phoneme is initiated by an O(1) table lookup.
 * - For phonemes containing more than one character, a coding tree is traversed
 *   to process the complete phoneme.
 * - The (alternate) soundex codes are produced digit by digit in-place in
 *   another tree structure.
 *
 * References:
 *
 * https://www.avotaynu.com/soundex.htm
 * https://www.jewishgen.org/InfoFiles/Soundex.html
 * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex
 * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)
 * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)
 * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)
 *
 * A few notes on other implementations:
 *
 * - All other known implementations have the same unofficial rules for "UE",
 *   these are also adapted by this implementation (0, 1, NC).
 * - The only other known implementation which is capable of generating all
 *   correct soundex codes in all cases is the JOS Soundex Calculator at
 *   https://www.jewishgen.org/jos/jossound.htm
 * - "J" is considered (only) a vowel in dmlat.php
 * - The official rules for "RS" are commented out in dmlat.php
 * - Identical code digits for adjacent letters are not collapsed correctly in
 *   dmsoundex.php when double digit codes are involved. E.g. "BESST" yields
 *   744300 instead of 743000 as for "BEST".
 * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java
 * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java
*/

#include "postgres.h"

#include "catalog/pg_type.h"
#include "mb/pg_wchar.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/memutils.h"


/*
 * The soundex coding chart table is adapted from
 * https://www.jewishgen.org/InfoFiles/Soundex.html
 * See daitch_mokotoff_header.pl for details.
*/

/* Generated coding chart table */
#include "daitch_mokotoff.h"

#define DM_CODE_DIGITS 6

/* Node in soundex code tree */
typedef struct dm_node
{
	int			soundex_length; /* Length of generated soundex code */
	char		soundex[DM_CODE_DIGITS];	/* Soundex code */
	int			is_leaf;		/* Candidate for complete soundex code */
	int			last_update;	/* Letter number for last update of node */
	char		code_digit;		/* Last code digit, 0 - 9 */

	/*
	 * One or two alternate code digits leading to this node. If there are two
	 * digits, one of them is always an 'X'. Repeated code digits and 'X' lead
	 * back to the same node.
	 */
	char		prev_code_digits[2];
	/* One or two alternate code digits moving forward. */
	char		next_code_digits[2];
	/* ORed together code index(es) used to reach current node. */
	int			prev_code_index;
	int			next_code_index;
	/* Possible nodes branching out from this node - digits 0-9. */
	struct dm_node *children[10];
	/* Next node in linked list. Alternating index for each iteration. */
	struct dm_node *next[2];
} dm_node;

/* Template for new node in soundex code tree. */
static const dm_node start_node = {
	.soundex_length = 0,
	.soundex = "000000",		/* Six digits */
	.is_leaf = 0,
	.last_update = 0,
	.code_digit = '\0',
	.prev_code_digits = {'\0', '\0'},
	.next_code_digits = {'\0', '\0'},
	.prev_code_index = 0,
	.next_code_index = 0,
	.children = {NULL},
	.next = {NULL}
};

/* Dummy soundex codes at end of input. */
static const dm_codes end_codes[2] =
{
	{
		"X", "X", "X"
	}
};

/* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */
static const char iso8859_1_to_ascii_upper[] =
"`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~                                  !                             ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY";

/* Internal C implementation */
static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex);


PG_FUNCTION_INFO_V1(daitch_mokotoff);

Datum
daitch_mokotoff(PG_FUNCTION_ARGS)
{
	text	   *arg = PG_GETARG_TEXT_PP(0);
	Datum		retval;
	char	   *string;
	ArrayBuildState *soundex;
	MemoryContext old_ctx,
				tmp_ctx;

	/* Work in a temporary context to simplify cleanup. */
	tmp_ctx = AllocSetContextCreate(CurrentMemoryContext,
									"daitch_mokotoff temporary context",
									ALLOCSET_DEFAULT_SIZES);
	old_ctx = MemoryContextSwitchTo(tmp_ctx);

	/* We must convert the string to UTF-8 if it isn't already. */
	string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg),
							  PG_UTF8);

	/* The result is built in this ArrayBuildState. */
	soundex = initArrayResult(TEXTOID, tmp_ctx, false);

	if (!daitch_mokotoff_coding(string, soundex))
	{
		/* No encodable characters in input */
		MemoryContextSwitchTo(old_ctx);
		MemoryContextDelete(tmp_ctx);
		PG_RETURN_NULL();
	}

	retval = makeArrayResult(soundex, old_ctx);

	MemoryContextSwitchTo(old_ctx);
	MemoryContextDelete(tmp_ctx);

	PG_RETURN_DATUM(retval);
}


/* Initialize soundex code tree node for next code digit. */
static void
initialize_node(dm_node *node, int last_update)
{
	if (node->last_update < last_update)
	{
		node->prev_code_digits[0] = node->next_code_digits[0];
		node->prev_code_digits[1] = node->next_code_digits[1];
		node->next_code_digits[0] = '\0';
		node->next_code_digits[1] = '\0';
		node->prev_code_index = node->next_code_index;
		node->next_code_index = 0;
		node->is_leaf = 0;
		node->last_update = last_update;
	}
}


/* Update soundex code tree node with next code digit. */
static void
add_next_code_digit(dm_node *node, int code_index, char code_digit)
{
	/* OR in index 1 or 2. */
	node->next_code_index |= code_index;

	if (!node->next_code_digits[0])
		node->next_code_digits[0] = code_digit;
	else if (node->next_code_digits[0] != code_digit)
		node->next_code_digits[1] = code_digit;
}


/* Mark soundex code tree node as leaf. */
static void
set_leaf(dm_node *first_node[2], dm_node *last_node[2],
		 dm_node *node, int ix_node)
{
	if (!node->is_leaf)
	{
		node->is_leaf = 1;

		if (first_node[ix_node] == NULL)
			first_node[ix_node] = node;
		else
			last_node[ix_node]->next[ix_node] = node;

		last_node[ix_node] = node;
		node->next[ix_node] = NULL;
	}
}


/* Find next node corresponding to code digit, or create a new node. */
static dm_node *
find_or_create_child_node(dm_node *parent, char code_digit,
						  ArrayBuildState *soundex)
{
	int			i = code_digit - '0';
	dm_node   **nodes = parent->children;
	dm_node    *node = nodes[i];

	if (node)
	{
		/* Found existing child node. Skip completed nodes. */
		return node->soundex_length < DM_CODE_DIGITS ? node : NULL;
	}

	/* Create new child node. */
	node = palloc_object(dm_node);
	nodes[i] = node;

	*node = start_node;
	memcpy(node->soundex, parent->soundex, sizeof(parent->soundex));
	node->soundex_length = parent->soundex_length;
	node->soundex[node->soundex_length++] = code_digit;
	node->code_digit = code_digit;
	node->next_code_index = node->prev_code_index;

	if (node->soundex_length < DM_CODE_DIGITS)
	{
		return node;
	}
	else
	{
		/* Append completed soundex code to output array. */
		text	   *out = cstring_to_text_with_len(node->soundex,
												   DM_CODE_DIGITS);

		accumArrayResult(soundex,
						 PointerGetDatum(out),
						 false,
						 TEXTOID,
						 CurrentMemoryContext);
		return NULL;
	}
}


/* Update node for next code digit(s). */
static void
update_node(dm_node *first_node[2], dm_node *last_node[2],
			dm_node *node, int ix_node,
			int letter_no, int prev_code_index, int next_code_index,
			const char *next_code_digits, int digit_no,
			ArrayBuildState *soundex)
{
	int			i;
	char		next_code_digit = next_code_digits[digit_no];
	int			num_dirty_nodes = 0;
	dm_node    *dirty_nodes[2];

	initialize_node(node, letter_no);

	if (node->prev_code_index && !(node->prev_code_index & prev_code_index))
	{
		/*
		 * If the sound (vowel / consonant) of this letter encoding doesn't
		 * correspond to the coding index of the previous letter, we skip this
		 * letter encoding. Note that currently, only "J" can be either a
		 * vowel or a consonant.
		 */
		return;
	}

	if (next_code_digit == 'X' ||
		(digit_no == 0 &&
		 (node->prev_code_digits[0] == next_code_digit ||
		  node->prev_code_digits[1] == next_code_digit)))
	{
		/* The code digit is the same as one of the previous (i.e. not added). */
		dirty_nodes[num_dirty_nodes++] = node;
	}

	if (next_code_digit != 'X' &&
		(digit_no > 0 ||
		 node->prev_code_digits[0] != next_code_digit ||
		 node->prev_code_digits[1]))
	{
		/* The code digit is different from one of the previous (i.e. added). */
		node = find_or_create_child_node(node, next_code_digit, soundex);
		if (node)
		{
			initialize_node(node, letter_no);
			dirty_nodes[num_dirty_nodes++] = node;
		}
	}

	for (i = 0; i < num_dirty_nodes; i++)
	{
		/* Add code digit leading to the current node. */
		add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit);

		if (next_code_digits[++digit_no])
		{
			update_node(first_node, last_node, dirty_nodes[i], ix_node,
						letter_no, prev_code_index, next_code_index,
						next_code_digits, digit_no,
						soundex);
		}
		else
		{
			/* Add incomplete leaf node to linked list. */
			set_leaf(first_node, last_node, dirty_nodes[i], ix_node);
		}
	}
}


/* Update soundex tree leaf nodes. */
static void
update_leaves(dm_node *first_node[2], int *ix_node, int letter_no,
			  const dm_codes *codes, const dm_codes *next_codes,
			  ArrayBuildState *soundex)
{
	int			i,
				j,
				code_index;
	dm_node    *node,
			   *last_node[2];
	const dm_code *code,
			   *next_code;
	int			ix_node_next = (*ix_node + 1) & 1;	/* Alternating index: 0, 1 */

	/* Initialize for new linked list of leaves. */
	first_node[ix_node_next] = NULL;
	last_node[ix_node_next] = NULL;

	/* Process all nodes. */
	for (node = first_node[*ix_node]; node; node = node->next[*ix_node])
	{
		/* One or two alternate code sequences. */
		for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++)
		{
			/* Coding for previous letter - before vowel: 1, all other: 2 */
			int			prev_code_index = (code[0][0] > '1') + 1;

			/* One or two alternate next code sequences. */
			for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++)
			{
				/* Determine which code to use. */
				if (letter_no == 0)
				{
					/* This is the first letter. */
					code_index = 0;
				}
				else if (next_code[0][0] <= '1')
				{
					/* The next letter is a vowel. */
					code_index = 1;
				}
				else
				{
					/* All other cases. */
					code_index = 2;
				}

				/* One or two sequential code digits. */
				update_node(first_node, last_node, node, ix_node_next,
							letter_no, prev_code_index, code_index,
							code[code_index], 0,
							soundex);
			}
		}
	}

	*ix_node = ix_node_next;
}


/*
 * Return next character, converted from UTF-8 to uppercase ASCII.
 * *ix is the current string index and is incremented by the character length.
 */
static char
read_char(const unsigned char *str, int *ix)
{
	/* Substitute character for skipped code points. */
	const char	na = '\x1a';
	pg_wchar	c;

	/* Decode UTF-8 character to ISO 10646 code point. */
	str += *ix;
	c = utf8_to_unicode(str);

	/* Advance *ix, but (for safety) not if we've reached end of string. */
	if (c)
		*ix += pg_utf_mblen(str);

	/* Convert. */
	if (c >= (unsigned char) '[' && c <= (unsigned char) ']')
	{
		/* ASCII characters [, \, and ] are reserved for conversions below. */
		return na;
	}
	else if (c < 0x60)
	{
		/* Other non-lowercase ASCII characters can be used as-is. */
		return (char) c;
	}
	else if (c < 0x100)
	{
		/* ISO-8859-1 code point; convert to upper-case ASCII via table. */
		return iso8859_1_to_ascii_upper[c - 0x60];
	}
	else
	{
		/* Conversion of non-ASCII characters in the coding chart. */
		switch (c)
		{
			case 0x0104:		/* LATIN CAPITAL LETTER A WITH OGONEK */
			case 0x0105:		/* LATIN SMALL LETTER A WITH OGONEK */
				return '[';
			case 0x0118:		/* LATIN CAPITAL LETTER E WITH OGONEK */
			case 0x0119:		/* LATIN SMALL LETTER E WITH OGONEK */
				return '\\';
			case 0x0162:		/* LATIN CAPITAL LETTER T WITH CEDILLA */
			case 0x0163:		/* LATIN SMALL LETTER T WITH CEDILLA */
			case 0x021A:		/* LATIN CAPITAL LETTER T WITH COMMA BELOW */
			case 0x021B:		/* LATIN SMALL LETTER T WITH COMMA BELOW */
				return ']';
			default:
				return na;
		}
	}
}


/* Read next ASCII character, skipping any characters not in [A-\]]. */
static char
read_valid_char(const char *str, int *ix)
{
	char		c;

	while ((c = read_char((const unsigned char *) str, ix)) != '\0')
	{
		if (c >= 'A' && c <= ']')
			break;
	}

	return c;
}


/* Return sound coding for "letter" (letter sequence) */
static const dm_codes *
read_letter(const char *str, int *ix)
{
	char		c,
				cmp;
	int			i,
				j;
	const dm_letter *letters;
	const dm_codes *codes;

	/* First letter in sequence. */
	if ((c = read_valid_char(str, ix)) == '\0')
		return NULL;

	letters = &letter_[c - 'A'];
	codes = letters->codes;
	i = *ix;

	/* Any subsequent letters in sequence. */
	while ((letters = letters->letters) && (c = read_valid_char(str, &i)))
	{
		for (j = 0; (cmp = letters[j].letter); j++)
		{
			if (cmp == c)
			{
				/* Letter found. */
				letters = &letters[j];
				if (letters->codes)
				{
					/* Coding for letter sequence found. */
					codes = letters->codes;
					*ix = i;
				}
				break;
			}
		}
		if (!cmp)
		{
			/* The sequence of letters has no coding. */
			break;
		}
	}

	return codes;
}


/*
 * Generate all Daitch-Mokotoff soundex codes for word,
 * adding them to the "soundex" ArrayBuildState.
 * Returns false if string has no encodable characters, else true.
 */
static bool
daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex)
{
	int			i = 0;
	int			letter_no = 0;
	int			ix_node = 0;
	const dm_codes *codes,
			   *next_codes;
	dm_node    *first_node[2],
			   *node;

	/* First letter. */
	if (!(codes = read_letter(word, &i)))
	{
		/* No encodable character in input. */
		return false;
	}

	/* Starting point. */
	first_node[ix_node] = palloc_object(dm_node);
	*first_node[ix_node] = start_node;

	/*
	 * Loop until either the word input is exhausted, or all generated soundex
	 * codes are completed to six digits.
	 */
	while (codes && first_node[ix_node])
	{
		next_codes = read_letter(word, &i);

		/* Update leaf nodes. */
		update_leaves(first_node, &ix_node, letter_no,
					  codes, next_codes ? next_codes : end_codes,
					  soundex);

		codes = next_codes;
		letter_no++;
	}

	/* Append all remaining (incomplete) soundex codes to output array. */
	for (node = first_node[ix_node]; node; node = node->next[ix_node])
	{
		text	   *out = cstring_to_text_with_len(node->soundex,
												   DM_CODE_DIGITS);

		accumArrayResult(soundex,
						 PointerGetDatum(out),
						 false,
						 TEXTOID,
						 CurrentMemoryContext);
	}

	return true;
}