summaryrefslogtreecommitdiffstats
path: root/src/lib-fts/test-fts-language.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 09:51:24 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 09:51:24 +0000
commitf7548d6d28c313cf80e6f3ef89aed16a19815df1 (patch)
treea3f6f2a3f247293bee59ecd28e8cd8ceb6ca064a /src/lib-fts/test-fts-language.c
parentInitial commit. (diff)
downloaddovecot-f7548d6d28c313cf80e6f3ef89aed16a19815df1.tar.xz
dovecot-f7548d6d28c313cf80e6f3ef89aed16a19815df1.zip
Adding upstream version 1:2.3.19.1+dfsg1.upstream/1%2.3.19.1+dfsg1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/lib-fts/test-fts-language.c')
-rw-r--r--src/lib-fts/test-fts-language.c323
1 files changed, 323 insertions, 0 deletions
diff --git a/src/lib-fts/test-fts-language.c b/src/lib-fts/test-fts-language.c
new file mode 100644
index 0000000..e5968e9
--- /dev/null
+++ b/src/lib-fts/test-fts-language.c
@@ -0,0 +1,323 @@
+/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "test-common.h"
+#include "fts-language.h"
+/* TODO: These checks will not work without proper libtextcat configuration.
+ As such, they are not really unit test to be coupled with the build. */
+
+const char *const settings[] =
+ {"fts_language_config", TEXTCAT_DATADIR"/fpdb.conf",
+ "fts_language_data", TEXTCAT_DATADIR"/", NULL};
+
+/* Detect Finnish. fi--utf8 */
+static void test_fts_language_detect_finnish(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char finnish[] =
+ "Yhdistyneiden kansakuntien kolmas yleiskokous hyv\xC3\xA4ksyi "\
+ "ja julkisti ihmisoikeuksien yleismaailmallisen julistuksen "\
+ "joulukuun 10. p\xC3\xA4iv\xC3\xA4n\xC3\xA4 1948. Julistuksen "\
+ "hyv\xC3\xA4ksymisen puolesta \xC3\xA4\xC3\xA4nesti 48 maata. "\
+ "Mik\xC3\xA4\xC3\xA4n maa ei \xC3\xA4\xC3\xA4nest\xC3\xA4nyt "\
+ "vastaan. Kahdeksan maata pid\xC3\xA4ttyi "\
+ "\xC3\xA4\xC3\xA4nest\xC3\xA4m\xC3\xA4st\xC3\xA4.";
+ const char names[] = "de, fi, en";
+ const char *unknown, *error;
+ test_begin("fts language detect Finnish");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, finnish, sizeof(finnish)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "fi") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Detect English */
+static void test_fts_language_detect_english(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char english[] = "Whereas recognition of the inherent dignity and"\
+ " of the equal and inalienable rights of all members of the human"\
+ "family is the foundation of freedom, justice and peace in the "\
+ "world,\n Whereas disregard and contempt for human rights have "\
+ "resulted in barbarous acts which have outraged the conscience"\
+ "of mankind, and the advent of a world in which human beings"\
+ "shall enjoy freedom of speech and belief and freedom from "\
+ "fear and want has been proclaimed as the highest aspiration"\
+ "of the common people, ";
+
+ const char names[] = "fi, de, fr, en";
+ const char *unknown, *error;
+ test_begin("fts language detect English");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, english, sizeof(english)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "en") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Detect French */
+static void test_fts_language_detect_french(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char french[] =
+ "D\xC3\xA9""claration universelle des droits de l\xE2\x80\x99"
+ "homme Pr\xC3\xA9""ambule Consid\xC3\xA9rant que la "\
+ "reconnaissance de la dignit\xC3\xA9 inh\xC3\xA9rente \xC3\xA0"\
+ " tous les membres de la famille humaine et de leurs droits "\
+ "\xC3\xA9gaux et inali\xC3\xA9nables constitue le fondement de"\
+ " la libert\xC3\xA9, de la justice et de la paix dans le monde,"\
+ " Consid\xC3\xA9rant que la m\xC3\xA9""connaissance et le "\
+ "m\xC3\xA9pris des droits de l\xE2\x80\x99homme ont conduit "\
+ "\xC3\xA0 des actes de barbarie qui r\xC3\xA9voltent la "\
+ "conscience de l\xE2\x80\x99humanit\xC3\xA9 et que "\
+ "l\xE2\x80\x99""av\xC3\xA8nement d\xE2\x80\x99un monde o\xC3\xB9"\
+ " les \xC3\xAAtres humains seront libres de parler et de "\
+ "croire, lib\xC3\xA9r\xC3\xA9s de la terreur et de la "\
+ "mis\xC3\xA8re, a \xC3\xA9t\xC3\xA9 proclam\xC3\xA9 comme la "\
+ "plus haute aspiration de l\xE2\x80\x99homme,";
+
+
+ const char names[] = "de, fi, fr, en";
+ const char *unknown, *error;
+ test_begin("fts language detect French");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, french, sizeof(french)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "fr") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+/* Detect German */
+static void test_fts_language_detect_german(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char german[] =
+ "Artikel 1"\
+ "Alle Menschen sind frei und gleich an W\xC3\xBCrde und Rechten "\
+ "geboren. Sie sind mit Vernunft und Gewissen begabt und sollen "\
+ "einander im Geist der Br\xC3\xBC""derlichkeit begegnen." \
+ "Artikel 2 Jeder hat Anspruch auf die in dieser "\
+ "Erkl\xC3\xA4rung verk\xC3\xBCndeten Rechte und Freiheiten ohne"\
+ "irgendeinen Unterschied, etwa nach Rasse, Hautfarbe, "\
+ "Geschlecht, Sprache, Religion, politischer oder sonstiger "\
+ "\xC3\x9C""berzeugung, nationaler oder sozialer Herkunft, "\
+ "Verm\xC3\xB6gen, Geburt oder sonstigem Stand. Des weiteren "\
+ "darf kein Unterschied gemacht werden auf Grund der "\
+ "politischen, rechtlichen oder internationalen Stellung des "\
+ "Landes oder Gebiets, dem eine Person angeh\xC3\xB6rt, "\
+ "gleichg\xC3\xBCltig, ob dieses unabh\xC3\xA4ngig ist, unter "\
+ "Treuhandschaft steht, keine Selbstregierung besitzt oder "\
+ "sonst in seiner Souver\xC3\xA4nit\xC3\xA4t "\
+ "eingeschr\xC3\xA4nkt ist.";
+
+
+
+ const char names[] = "fi, de, fr, en";
+ const char *unknown, *error;
+ test_begin("fts language detect German");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, german, sizeof(german)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "de") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Detect Swedish */
+static void test_fts_language_detect_swedish(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char swedish[] =
+ "Artikel 1."\
+ "Alla m\xC3\xA4nniskor \xC3\xA4ro f\xC3\xB6""dda fria och lika"\
+ " i v\xC3\xA4rde och r\xC3\xA4ttigheter. De \xC3\xA4ro "\
+ "utrustade med f\xC3\xB6rnuft och samvete och b\xC3\xB6ra "\
+ "handla gentemot varandra i en anda av broderskap.";
+
+
+
+ const char names[] = "fi, de, sv, fr, en";
+ const char *unknown, *error;
+ test_begin("fts language detect Swedish");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, swedish, sizeof(swedish)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "sv") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Detect Bokmal */
+static void test_fts_language_detect_bokmal(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char bokmal[] =
+ "Artikkel 1.\n"\
+ "Alle mennesker er f\xC3\xB8""dt frie og med samme menneskeverd"\
+ " og menneskerettigheter. De er utstyrt med fornuft og "\
+ "samvittighet og b\xC3\xB8r handle mot hverandre i "\
+ "brorskapets \xC3\xA5nd";
+
+ const char names[] = "fi, de, sv, no, fr, en";
+ const char *unknown, *error;
+ test_begin("fts language detect Bokmal as Norwegian");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, bokmal, sizeof(bokmal)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "no") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Detect Nynorsk */
+static void test_fts_language_detect_nynorsk(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char nynorsk[] =
+ "Artikkel 1.\n"\
+ "Alle menneske er f\xC3\xB8""dde til fridom og med same "\
+ "menneskeverd og menneskerettar. Dei har f\xC3\xA5tt fornuft "\
+ "og samvit og skal leve med kvarandre som br\xC3\xB8r.";
+
+
+ const char names[] = "fi, de, sv, no, fr, en";
+ const char *unknown, *error;
+ test_begin("fts language detect Nynorsk as Norwegian");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, nynorsk, sizeof(nynorsk)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "no") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Detect Finnish as English */
+static void test_fts_language_detect_finnish_as_english(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char finnish[] =
+ "Yhdistyneiden kansakuntien kolmas yleiskokous hyv\xC3\xA4ksyi "\
+ "ja julkisti ihmisoikeuksien yleismaailmallisen julistuksen "\
+ "joulukuun 10. p\xC3\xA4iv\xC3\xA4n\xC3\xA4 1948. Julistuksen "\
+ "hyv\xC3\xA4ksymisen puolesta \xC3\xA4\xC3\xA4nesti 48 maata. "\
+ "Mik\xC3\xA4\xC3\xA4n maa ei \xC3\xA4\xC3\xA4nest\xC3\xA4nyt "\
+ "vastaan. Kahdeksan maata pid\xC3\xA4ttyi "\
+ "\xC3\xA4\xC3\xA4nest\xC3\xA4m\xC3\xA4st\xC3\xA4.";
+ const char names[] = "en";
+ const char *unknown, *error;
+ test_begin("fts language detect Finnish as English");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, finnish, sizeof(finnish)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "en") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Successfully avoid detecting English, when en is not in language list. */
+static void test_fts_language_detect_na(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char english[] = "Whereas recognition of the inherent dignity and"\
+ " of the equal and inalienable rights of all members of the human"\
+ "family is the foundation of freedom, justice and peace in the "\
+ "world,\n Whereas disregard and contempt for human rights have "\
+ "resulted in barbarous acts which have outraged the conscience"\
+ "of mankind, and the advent of a world in which human beings"\
+ "shall enjoy freedom of speech and belief and freedom from "\
+ "fear and want has been proclaimed as the highest aspiration"\
+ "of the common people, ";
+
+ const char names[] = "fi, de, fr";
+ const char *unknown, *error;
+ test_begin("fts language detect not available");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, english, sizeof(english)-1, &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_UNKNOWN);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
+/* Successfully detect, that Klingon is unknown. */
+static void test_fts_language_detect_unknown(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char klingon[] = "nobwI''a'pu'qoqvam'e' "\
+ "nuHegh'eghrupqa'moHlaHbe'law'lI'neS "\
+ "SeH'eghtaHghach'a'na'chajmo'.";
+
+ const char names[] = "fi, de, fr";
+ const char *unknown, *error;
+ test_begin("fts language detect unknown");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, klingon, sizeof(klingon), &lang_r, &error)
+ == FTS_LANGUAGE_RESULT_UNKNOWN);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+static void test_fts_language_find_builtin(void)
+{
+ const struct fts_language *lp;
+ test_begin("fts language find built-in");
+ lp = fts_language_find("en");
+ i_assert(lp != NULL);
+ test_assert(strcmp(lp->name, "en") == 0);
+ test_end();
+}
+static void test_fts_language_register(void)
+{
+ const struct fts_language *lp;
+ test_begin("fts language register");
+ fts_language_register("jp");
+ lp = fts_language_find("jp");
+ i_assert(lp != NULL);
+ test_assert(strcmp(lp->name, "jp") == 0);
+ test_end();
+}
+
+int main(void)
+{
+ int ret;
+ static void (*const test_functions[])(void) = {
+ test_fts_language_detect_finnish,
+ test_fts_language_detect_english,
+ test_fts_language_detect_french,
+ test_fts_language_detect_german,
+ test_fts_language_detect_swedish,
+ test_fts_language_detect_bokmal,
+ test_fts_language_detect_nynorsk,
+ test_fts_language_detect_finnish_as_english,
+ test_fts_language_detect_na,
+ test_fts_language_detect_unknown,
+ test_fts_language_find_builtin,
+ test_fts_language_register,
+ NULL
+ };
+ fts_languages_init();
+ ret = test_run(test_functions);
+ fts_languages_deinit();
+ return ret;
+}