Adding upstream version 1:2.3.19.1+dfsg1.upstream/1%2.3.19.1+dfsg1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 09:51:24 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 09:51:24 +0000
commit: f7548d6d28c313cf80e6f3ef89aed16a19815df1 (patch)
tree: a3f6f2a3f247293bee59ecd28e8cd8ceb6ca064a /src/lib-fts/test-fts-tokenizer.c
parent: Initial commit. (diff)
download: dovecot-upstream.tar.xz
dovecot-upstream.zip
1 files changed, 690 insertions, 0 deletions
diff --git a/src/lib-fts/test-fts-tokenizer.c b/src/lib-fts/test-fts-tokenizer.c
new file mode 100644
index 0000000..955dca4
--- /dev/null
+++ b/src/lib-fts/test-fts-tokenizer.c
@@ -0,0 +1,690 @@
+/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "str.h"
+#include "test-common.h"
+#include "fts-tokenizer.h"
+#include "fts-tokenizer-common.h"
+#include "fts-tokenizer-private.h"
+#include "fts-tokenizer-generic-private.h"
+
+/*there should be a trailing space ' ' at the end of each string except the last one*/
+#define TEST_INPUT_ADDRESS \
+	"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
+	"Bar Baz <bar@example.org>" \
+	"Foo Bar (comment)foo.bar@host.example.org " \
+	"foo, foo@domain " \
+	"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@1bcdefghijklmnopqrstuvxy1.2bcdefghijklmnopqrstuvxy2.3bcdefghijklmnopqrstuvxy3.4bcdefghijklmnopqrstuvxy4.5bcdefghijklmnopqrstuvxy5.6bcdefghijklmnopqrstuvxy6.7bcdefghijklmnopqrstuvxy7.8bcdefghijklmnopqrstuvxy8.9bcdefghijklmnopqrstuvxy9.0bcdefghijklmnopqrstuvxy0.tld " \
+	"trailing, period@blue.com. " \
+	"multi-trialing, mul@trail.com..... " \
+	"m@s " \
+	"hypen@hypen-hypen.com " \
+	"hypen@hypen-hypen-sick.com.-"
+
+static const char *test_inputs[] = {
+	/* generic things and word truncation: */
+	"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
+	"abc@example.com, "
+	"Bar Baz <bar@example.org>, "
+	"foo@domain "
+	"1234567890123456789012345678\xC3\xA4,"
+	"12345678901234567890123456789\xC3\xA4,"
+	"123456789012345678901234567890\xC3\xA4,"
+	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
+	"(\"Hello world\")3.14 3,14 last",
+
+	"1.",
+
+	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
+
+	"'1234567890123456789012345678\xC3\xA4,"
+	"123456789012345678901234567x'\xC3\xA4,"
+	"1234567890123456789012345678x're,"
+	"1234567890123456789012345678x',"
+	"1234567890123456789012345678x'',"
+	"12345678901234567890123456789x',"
+	"12345678901234567890123456789x'',"
+	"123456789012345678901234567890x',"
+	"123456789012345678901234567890x'',"
+
+	/* \xe28099 = U+2019 is a smart quote, sometimes used as an apostrophe */
+#define SQ "\xE2\x80\x99"
+	SQ " " SQ " " SQ SQ " " SQ SQ SQ " " SQ "quoted text" SQ SQ "word" SQ " " SQ "hlo words" SQ " you" SQ "re78901234567890123456789012 bad" SQ SQ SQ "word" SQ SQ SQ "pre post" SQ SQ SQ,
+
+	"you" SQ "re" SQ "xyz",
+
+	/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
+	   U+205A(e2 81 9a) and U+205F(e2 81 9f) */
+	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
+	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
+
+	/* TR29 MinNumLet U+FF0E at end: u+FF0E is EF BC 8E  */
+	"hello world\xEF\xBC\x8E",
+
+	/* TR29 WB5a */
+	"l" SQ "homme l" SQ "humanit\xC3\xA9 d" SQ "immixtions qu" SQ "il aujourd'hui que'euq"
+};
+
+static void test_fts_tokenizer_find(void)
+{
+	test_begin("fts tokenizer find");
+	test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
+	test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
+	test_end();
+}
+
+static unsigned int
+test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
+			   const char *const *expected_output,
+			   unsigned int first_outi)
+{
+	const unsigned char *input = (const unsigned char *)_input;
+	const char *token, *error;
+	unsigned int i, outi, max, char_len;
+	size_t input_len = strlen(_input);
+
+	/* test all input at once */
+	outi = first_outi;
+	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
+		test_assert_strcmp(token, expected_output[outi]);
+		outi++;
+	}
+	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
+		test_assert_strcmp(token, expected_output[outi]);
+		outi++;
+	}
+	test_assert_idx(expected_output[outi] == NULL, outi);
+
+	/* test input one byte at a time */
+	outi = first_outi;
+	for (i = 0; i < input_len; i += char_len) {
+		char_len = uni_utf8_char_bytes(input[i]);
+		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
+			test_assert_strcmp(token, expected_output[outi]);
+			outi++;
+		}
+	}
+	while (fts_tokenizer_final(tok, &token, &error) > 0) {
+		test_assert_strcmp(token, expected_output[outi]);
+		outi++;
+	}
+	test_assert_idx(expected_output[outi] == NULL, outi);
+
+	/* test input in random chunks */
+	outi = first_outi;
+	for (i = 0; i < input_len; i += char_len) {
+		max = i_rand_minmax(1, input_len - i);
+		for (char_len = 0; char_len < max; )
+			char_len += uni_utf8_char_bytes(input[i+char_len]);
+		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
+			test_assert_strcmp(token, expected_output[outi]);
+			outi++;
+		}
+	}
+	while (fts_tokenizer_final(tok, &token, &error) > 0) {
+		test_assert_strcmp(token, expected_output[outi]);
+		outi++;
+	}
+	test_assert_idx(expected_output[outi] == NULL, outi);
+
+	return outi+1;
+}
+
+static void
+test_tokenizer_inputs(struct fts_tokenizer *tok,
+		      const char *const *inputs, unsigned int count,
+		      const char *const *expected_output)
+{
+	unsigned int i, outi = 0;
+
+	for (i = 0; i < count; i++) {
+		outi = test_tokenizer_inputoutput(tok, inputs[i],
+						  expected_output, outi);
+	}
+	test_assert_idx(expected_output[outi] == NULL, outi);
+}
+
+static void test_fts_tokenizer_generic_only(void)
+{
+	static const char *const expected_output[] = {
+		"hello", "world", "And",
+		"there", "was", "text", "galor\xC3\xA9",
+		"abc", "example", "com", "Bar", "Baz",
+		"bar", "example", "org", "foo", "domain",
+		"1234567890123456789012345678\xC3\xA4",
+		"12345678901234567890123456789",
+		"123456789012345678901234567890",
+		"and", "longlonglongabcdefghijklmnopqr",
+		"more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
+
+		"1", NULL,
+
+		"quoted", "text", "word", "hlo", "words", "you're", "bad",
+		"word", "pre", "post", NULL,
+
+		"1234567890123456789012345678\xC3\xA4",
+		"123456789012345678901234567x'",
+		"1234567890123456789012345678x'",
+		"1234567890123456789012345678x",
+		"1234567890123456789012345678x",
+		"12345678901234567890123456789x",
+		"12345678901234567890123456789x",
+		"123456789012345678901234567890",
+		"123456789012345678901234567890",
+
+		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
+		"word", "pre", "post", NULL,
+
+		"you're'xyz", NULL,
+
+		"hello", "world", "And",
+		"there", "was", "text", "galore",
+		"and", "more", NULL,
+
+		"hello", "world", NULL,
+
+		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
+
+		NULL
+	};
+	struct fts_tokenizer *tok;
+	const char *error;
+
+	test_begin("fts tokenizer generic simple");
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
+	test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
+
+	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
+	fts_tokenizer_unref(&tok);
+	test_end();
+}
+
+const char *const tr29_settings[] = {"algorithm", "tr29", NULL};
+
+/* TODO: U+206F is in "Format" and therefore currently not word break.
+   This definitely needs to be remapped. */
+static void test_fts_tokenizer_generic_tr29_only(void)
+{
+	static const char *const expected_output[] = {
+		"hello", "world", "And",
+		"there", "was", "text", "galor\xC3\xA9",
+		"abc", "example", "com", "Bar", "Baz",
+		"bar", "example", "org", "foo", "domain",
+		"1234567890123456789012345678\xC3\xA4",
+		"12345678901234567890123456789",
+		"123456789012345678901234567890",
+		"and", "longlonglongabcdefghijklmnopqr",
+		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,
+
+		"1", NULL,
+
+		"quoted", "text", "word", "hlo", "words", "you're", "bad",
+		"word", "pre", "post", NULL,
+
+		"1234567890123456789012345678\xC3\xA4",
+		"123456789012345678901234567x'",
+		"1234567890123456789012345678x'",
+		"1234567890123456789012345678x",
+		"1234567890123456789012345678x",
+		"12345678901234567890123456789x",
+		"12345678901234567890123456789x",
+		"123456789012345678901234567890",
+		"123456789012345678901234567890",
+
+		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
+		"word", "pre", "post", NULL,
+
+		"you're'xyz", NULL,
+
+		"hello", "world", "And",
+		"there", "was", "text", "galore",
+		"and", "more", NULL,
+
+		"hello", "world", NULL,
+
+		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
+		NULL
+	};
+	struct fts_tokenizer *tok;
+	const char *error;
+
+	test_begin("fts tokenizer generic TR29");
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
+	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
+	fts_tokenizer_unref(&tok);
+	test_end();
+}
+
+const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
+
+/* TODO: U+206F is in "Format" and therefore currently not word break.
+   This definitely needs to be remapped. */
+static void test_fts_tokenizer_generic_tr29_wb5a(void)
+{
+	static const char *const expected_output[] = {
+		"hello", "world", "And",
+		"there", "was", "text", "galor\xC3\xA9",
+		"abc", "example", "com", "Bar", "Baz",
+		"bar", "example", "org", "foo", "domain",
+		"1234567890123456789012345678\xC3\xA4",
+		"12345678901234567890123456789",
+		"123456789012345678901234567890",
+		"and", "longlonglongabcdefghijklmnopqr",
+		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,
+
+		"1", NULL,
+
+		"quoted", "text", "word", "hlo", "words", "you're", "bad",
+		"word", "pre", "post", NULL,
+
+		"1234567890123456789012345678\xC3\xA4",
+		"123456789012345678901234567x'",
+		"1234567890123456789012345678x'",
+		"1234567890123456789012345678x",
+		"1234567890123456789012345678x",
+		"12345678901234567890123456789x",
+		"12345678901234567890123456789x",
+		"123456789012345678901234567890",
+		"123456789012345678901234567890",
+
+		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
+		"word", "pre", "post", NULL,
+
+		"you're'xyz", NULL,
+
+		"hello", "world", "And",
+		"there", "was", "text", "galore",
+		"and", "more", NULL,
+
+		"hello", "world", NULL,
+
+		"l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,
+
+		NULL
+	};
+	struct fts_tokenizer *tok;
+	const char *error;
+
+	test_begin("fts tokenizer generic TR29 with WB5a");
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
+	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
+	fts_tokenizer_unref(&tok);
+	test_end();
+}
+
+static void test_fts_tokenizer_address_only(void)
+{
+	static const char input[] = TEST_INPUT_ADDRESS;
+	static const char *const expected_output[] = {
+		"abc.dfg@example.com", "bar@example.org",
+		"foo.bar@host.example.org", "foo@domain",
+		"period@blue.com", /*trailing period '.' in email */
+		"mul@trail.com",
+		"m@s", /*one letter local-part and domain name */
+		"hypen@hypen-hypen.com",
+		"hypen@hypen-hypen-sick.com",
+		NULL
+	};
+	struct fts_tokenizer *tok;
+	const char *error;
+
+	test_begin("fts tokenizer email address only");
+	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
+	test_tokenizer_inputoutput(tok, input, expected_output, 0);
+	fts_tokenizer_unref(&tok);
+	test_end();
+}
+
+static void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
+{
+	static const char input[] = TEST_INPUT_ADDRESS;
+	static const char *const expected_output[] = {
+		"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "abc.dfg@example.com", "com",
+		"Bar", "Baz", "bar", "example", "bar@example.org", "org",
+		"Foo", "Bar", "comment", "foo", "bar", "host", "example", "foo.bar@host.example.org", "org",
+		"foo", "foo", "foo@domain", "domain",
+		"abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde",
+		"1bcdefghijklmnopqrstuvxy1",
+		"2bcdefghijklmnopqrstuvxy2",
+		"3bcdefghijklmnopqrstuvxy3",
+		"4bcdefghijklmnopqrstuvxy4",
+		"5bcdefghijklmnopqrstuvxy5",
+		"6bcdefghijklmnopqrstuvxy6",
+		"7bcdefghijklmnopqrstuvxy7",
+		"8bcdefghijklmnopqrstuvxy8",
+		"9bcdefghijklmnopqrstuvxy9",
+		"0bcdefghijklmnopqrstuvxy0", "tld",
+		"trailing", "period", "blue", "com", "period@blue.com",
+		"multi", "trialing", "mul", "trail", "com", "mul@trail.com",
+		"m", "m@s", "s",
+		"hypen", "hypen", "hypen", "hypen@hypen-hypen.com", "com",
+		"hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
+		NULL
+	};
+	struct fts_tokenizer *tok, *gen_tok;
+	const char *error;
+
+	test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
+	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
+	test_tokenizer_inputoutput(tok, input, expected_output, 0);
+	fts_tokenizer_unref(&tok);
+	fts_tokenizer_unref(&gen_tok);
+	test_end();
+}
+
+const char *const simple_settings[] = {"algorithm", "simple", NULL};
+static void test_fts_tokenizer_address_parent_simple(void)
+{
+	test_fts_tokenizer_address_parent("simple", simple_settings);
+}
+
+static void test_fts_tokenizer_address_parent_tr29(void)
+{
+	test_fts_tokenizer_address_parent("tr29", tr29_settings);
+}
+
+static void test_fts_tokenizer_address_search(void)
+{
+	static const char input[] = TEST_INPUT_ADDRESS;
+	static const char *const expected_output[] = {
+		"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
+		"Bar", "Baz", "bar@example.org",
+		"Foo", "Bar", "comment", "foo.bar@host.example.org",
+		"foo", "foo@domain",
+		"abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde",
+		"1bcdefghijklmnopqrstuvxy1",
+		"2bcdefghijklmnopqrstuvxy2",
+		"3bcdefghijklmnopqrstuvxy3",
+		"4bcdefghijklmnopqrstuvxy4",
+		"5bcdefghijklmnopqrstuvxy5",
+		"6bcdefghijklmnopqrstuvxy6",
+		"7bcdefghijklmnopqrstuvxy7",
+		"8bcdefghijklmnopqrstuvxy8",
+		"9bcdefghijklmnopqrstuvxy9",
+		"0bcdefghijklmnopqrstuvxy0", "tld",
+		"trailing", "period@blue.com",
+		"multi", "trialing", "mul@trail.com",
+		"m@s",
+		"hypen@hypen-hypen.com",
+		"hypen@hypen-hypen-sick.com",
+		NULL
+	};
+	static const char *const settings[] = { "search", "", NULL };
+	struct fts_tokenizer *tok, *gen_tok;
+	const char *token, *error;
+
+	test_begin("fts tokenizer search email address + parent");
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
+	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
+	test_tokenizer_inputoutput(tok, input, expected_output, 0);
+
+	/* make sure state is forgotten at EOF */
+	test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+		    strcmp(token, "foo") == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+
+	test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+		    strcmp(token, "bar@baz") == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+
+	test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+		    strcmp(token, "foo") == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+
+	/* test reset explicitly */
+	test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
+	fts_tokenizer_reset(tok);
+	test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+		    strcmp(token, "b@c") == 0);
+	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+
+	fts_tokenizer_unref(&tok);
+	fts_tokenizer_unref(&gen_tok);
+	test_end();
+}
+
+static void test_fts_tokenizer_delete_trailing_partial_char(void)
+{
+	static const struct {
+		const char *str;
+		unsigned int truncated_len;
+	} tests[] = {
+		/* non-truncated */
+		{ "\x7f", 1 },
+		{ "\xC2\x80", 2 },
+		{ "\xE0\x80\x80", 3 },
+		{ "\xF0\x80\x80\x80", 4 },
+
+		/* truncated */
+		{ "\xF0\x80\x80", 0 },
+		{ "x\xF0\x80\x80", 1 },
+	};
+	unsigned int i;
+	size_t size;
+
+	test_begin("fts tokenizer delete trailing partial char");
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		size = strlen(tests[i].str);
+		fts_tokenizer_delete_trailing_partial_char((const unsigned char *)tests[i].str, &size);
+		test_assert(size == tests[i].truncated_len);
+	}
+	test_end();
+}
+
+static void test_fts_tokenizer_address_maxlen(void)
+{
+	const char *const settings[] = {"maxlen", "5", NULL};
+	const char *input = "...\357\277\275@a";
+	struct fts_tokenizer *tok;
+	const char *token, *error;
+
+	test_begin("fts tokenizer address maxlen");
+	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
+
+	while (fts_tokenizer_next(tok, (const unsigned char *)input,
+				  strlen(input), &token, &error) > 0) ;
+	while (fts_tokenizer_final(tok, &token, &error) > 0) ;
+	fts_tokenizer_unref(&tok);
+	test_end();
+}
+
+static void test_fts_tokenizer_random(void)
+{
+	const unsigned char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
+	const char *const settings[] = {"algorithm", "simple", NULL};
+	const char *const email_settings[] = {"maxlen", "9", NULL};
+	unsigned int i;
+	unsigned char addr[10] = { 0 };
+	string_t *str = t_str_new(20);
+	struct fts_tokenizer *tok, *gen_tok;
+	const char *token, *error;
+
+	test_begin("fts tokenizer random");
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
+	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
+
+	for (i = 0; i < 10000; i++) T_BEGIN {
+		for (unsigned int j = 0; j < sizeof(addr); j++)
+			addr[j] = test_chars[i_rand_limit(N_ELEMENTS(test_chars))];
+		str_truncate(str, 0);
+		if (uni_utf8_get_valid_data(addr, sizeof(addr), str))
+			str_append_data(str, addr, sizeof(addr));
+		while (fts_tokenizer_next(tok, str_data(str), str_len(str),
+					  &token, &error) > 0) ;
+		while (fts_tokenizer_final(tok, &token, &error) > 0) ;
+	} T_END;
+	fts_tokenizer_unref(&tok);
+	fts_tokenizer_unref(&gen_tok);
+	test_end();
+}
+
+
+static void
+test_fts_tokenizer_explicit_prefix(void)
+{
+	const char *input = "* ** "
+		"*pre *both* post* "
+		"mid*dle *mid*dle* "
+		"**twopre **twoboth** twopost**";
+	const char *const expected_star[] = { "pre", "both*", "post*",
+					      "mid*", "dle", "mid*", "dle*",
+					      "twopre", "twoboth*", "twopost*",
+					      NULL, NULL };
+	const char *const expected_nostar[] = { "pre", "both", "post",
+						"mid", "dle", "mid", "dle",
+						"twopre", "twoboth", "twopost",
+						NULL, NULL };
+
+	const char *settings[9] = { "algorithm", "tr29", "wb5a", "yes" };
+	const char **setptr;
+
+	const char *algos[] = { ALGORITHM_SIMPLE_NAME,
+				ALGORITHM_TR29_NAME,
+				ALGORITHM_TR29_NAME "+wb5a" };
+	const char *searches[] = { "indexing", "searching" };
+	const char *prefixes[] = { "fixed", "prefix" };
+
+	for (int algo = 2; algo >= 0; algo--) { /* We overwrite the settings over time */
+		setptr = &settings[algo*2]; /* 4, 2, or 0 settings strings preserved */
+
+		for (int search = 0; search < 2; search++) {
+			const char **setptr2 = setptr;
+			if (search > 0) { *setptr2++ = "search"; *setptr2++ = "yes"; }
+
+			for (int explicitprefix = 0; explicitprefix < 2; explicitprefix++) {
+				const char **setptr3 = setptr2;
+				if (explicitprefix > 0) { *setptr3++ = "explicitprefix"; *setptr3++ = "y"; }
+
+				*setptr3++ = NULL;
+
+				test_begin(t_strdup_printf("prefix search %s:%s:%s",
+							   algos[algo],
+							   searches[search],
+							   prefixes[explicitprefix]));
+				struct fts_tokenizer *tok;
+				const char *error;
+
+				test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings,
+								 &tok, &error) == 0);
+				test_tokenizer_inputs(
+					tok, &input, 1,
+					(search!=0) && (explicitprefix!=0)
+					? expected_star : expected_nostar);
+
+				fts_tokenizer_unref(&tok);
+				test_end();
+			}
+		}
+	}
+}
+
+static void test_fts_tokenizer_skip_base64(void)
+{
+	/* The skip_base64 works on the data already available in the buffer
+	   of the tokenizer, it does not pull more data to see if a base64
+	   sequence long enough would match or not. This is why it does not
+	   use test_tokenizer_inputoutput that also tests with one-byte-at-once
+	   or random chunking, as those are known to fail with the current
+	   implementation */
+	struct fts_tokenizer *tok;
+	const char *error;
+	const char *token;
+
+	static const char *input =
+		",/dirtyleader/456789012345678901234567890123456789/\r\n"
+
+		" /cleanleader/456789012345678901234567890123456789/\r\n"
+		"\t/cleanleader/456789012345678901234567890123456789/\r\n"
+		"\r/cleanleader/456789012345678901234567890123456789/\r\n"
+		"\n/cleanleader/456789012345678901234567890123456789/\r\n"
+		"=/cleanleader/456789012345678901234567890123456789/\r\n"
+		";/cleanleader/456789012345678901234567890123456789/\r\n"
+		":/cleanleader/456789012345678901234567890123456789/\r\n"
+		";/cleanleader/456789012345678901234567890123456789/\r\n"
+
+		"/23456789012345678901234567890123456/dirtytrailer/,\r\n"
+
+		"/23456789012345678901234567890123456/cleantrailer/ \r\n"
+		"/23456789012345678901234567890123456/cleantrailer/\t\r\n"
+		"/23456789012345678901234567890123456/cleantrailer/\r\r\n"
+		"/23456789012345678901234567890123456/cleantrailer/\n\r\n"
+		"/23456789012345678901234567890123456/cleantrailer/=\r\n"
+		"/23456789012345678901234567890123456/cleantrailer/;\r\n"
+		"/23456789012345678901234567890123456/cleantrailer/:\r\n"
+		"/23456789012345678901234567890123456/cleantrailer/?\r\n"
+
+		"J1RrDrZSWxIAphKpYckeKNs10iTeiGMY0hNI32SMoSqCTgH96\r\n" // 49
+		"MziUaLMK6FAOQws3OIuX0tgvQcyhu06ILAWWB1nGPy/bSEAEYg\r\n" // 50
+		"ljWSJo8kxsm4/CiZBpwFfWkd64y+5ZytnKqgkQD87UbQ7FcpZgj\r\n" // 51
+		"pTXUOBszCfdAgfZpWpPiOEQSthPxN9XMaS7HnOTyXtRBPVt96vw=\r\n" // 51=
+		"MJmsWlDKXo7NCSt1wvazf9Xad18qOzpLJkVs/sxKsvLYyPD/zv=\r\n" // 50=
+		"CBLsZ5dUybAEWcDkQwytSL348U/2lvadma7lF4wdNOc8sjUL8=\r\n" // 49=
+
+		"4HWw7lJ15ZW3G1GtH9/NQbylcThN2IJo1kr83Fa2c9z2GFK1/NF+DpAkjbhDA3Al\r\n"
+
+		"alpha bravo charlie delta echo foxtrot golf hotel india\r\n"
+		"=juliet=kilo=lima=mike=november=oscar=papa=qebec=romeo=\r\n";
+
+	static const char *const expected_output[] = {
+		"dirtyleader", "456789012345678901234567890123",
+		"234567890123456789012345678901", "dirtytrailer",
+		"J1RrDrZSWxIAphKpYckeKNs10iTeiG", // 49
+		"CBLsZ5dUybAEWcDkQwytSL348U", "2lvadma7lF4wdNOc8sjUL8", // 49=
+		"alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", "india",
+		"juliet", "kilo", "lima", "mike", "november", "oscar", "papa", "qebec", "romeo",
+		NULL
+	};
+
+	test_begin("fts tokenizer skip base64");
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
+
+	size_t index = 0;
+	while (fts_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) {
+		i_assert(index < N_ELEMENTS(expected_output));
+		test_assert_strcmp(token, expected_output[index]);
+		++index;
+	}
+	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
+		i_assert(index < N_ELEMENTS(expected_output));
+		test_assert_strcmp(token, expected_output[index]);
+		++index;
+	}
+	i_assert(index < N_ELEMENTS(expected_output));
+	test_assert_idx(expected_output[index] == NULL, index);
+
+	fts_tokenizer_unref(&tok);
+	test_end();
+}
+
+int main(void)
+{
+	static void (*const test_functions[])(void) = {
+		test_fts_tokenizer_skip_base64,
+		test_fts_tokenizer_find,
+		test_fts_tokenizer_generic_only,
+		test_fts_tokenizer_generic_tr29_only,
+		test_fts_tokenizer_generic_tr29_wb5a,
+		test_fts_tokenizer_address_only,
+		test_fts_tokenizer_address_parent_simple,
+		test_fts_tokenizer_address_parent_tr29,
+		test_fts_tokenizer_address_maxlen,
+		test_fts_tokenizer_address_search,
+		test_fts_tokenizer_delete_trailing_partial_char,
+		test_fts_tokenizer_random,
+		test_fts_tokenizer_explicit_prefix,
+		NULL
+	};
+	int ret;
+
+	fts_tokenizers_init();
+	ret = test_run(test_functions);
+	fts_tokenizers_deinit();
+
+	return ret;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-28 09:51:24 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-28 09:51:24 +0000
commit	f7548d6d28c313cf80e6f3ef89aed16a19815df1 (patch)
tree	a3f6f2a3f247293bee59ecd28e8cd8ceb6ca064a /src/lib-fts/test-fts-tokenizer.c
parent	Initial commit. (diff)
download	dovecot-upstream.tar.xz dovecot-upstream.zip