diff options
Diffstat (limited to '')
-rw-r--r-- | src/lib-fts/fts-language.c | 368 |
1 files changed, 368 insertions, 0 deletions
diff --git a/src/lib-fts/fts-language.c b/src/lib-fts/fts-language.c new file mode 100644 index 0000000..7874d3a --- /dev/null +++ b/src/lib-fts/fts-language.c @@ -0,0 +1,368 @@ +/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "array.h" +#include "llist.h" +#include "fts-language.h" + + +#ifdef HAVE_LIBEXTTEXTCAT_TEXTCAT_H +# include <libexttextcat/textcat.h> +#elif defined (HAVE_FTS_EXTTEXTCAT) +# include <textcat.h> +#endif + +#ifndef TEXTCAT_RESULT_UNKNOWN /* old textcat.h has typos */ +# ifdef TEXTCAT_RESULT_UNKOWN +# define TEXTCAT_RESULT_UNKNOWN TEXTCAT_RESULT_UNKOWN +# endif +#endif + +#define DETECT_STR_MAX_LEN 200 + +struct fts_textcat { + int refcount; + void *handle; + char *config_path, *data_dir, *failed; +}; + +struct fts_language_list { + pool_t pool; + ARRAY_TYPE(fts_language) languages; + struct fts_textcat *textcat; + const char *textcat_config; + const char *textcat_datadir; +}; + +pool_t fts_languages_pool; +ARRAY_TYPE(fts_language) fts_languages; +#ifdef HAVE_FTS_EXTTEXTCAT +static struct fts_textcat *fts_textcat_cache = NULL; +#endif + +/* ISO 639-1 alpha 2 codes for languages */ +const struct fts_language fts_languages_builtin [] = { + { "da" }, /* Danish */ + { "de" }, /* German */ + { "en" }, /* English */ + { "es" }, /* Spanish */ + { "fi" }, /* Finnish */ + { "fr" }, /* French */ + { "it" }, /* Italian */ + { "nl" }, /* Dutch */ + { "no" }, /* Both Bokmal and Nynorsk are detected as Norwegian */ + { "pt" }, /* Portuguese */ + { "ro" }, /* Romanian */ + { "ru" }, /* Russian */ + { "sv" }, /* Swedish */ + { "tr" }, /* Turkish */ +}; + +const struct fts_language fts_language_data = { + "data" +}; + +#ifdef HAVE_FTS_EXTTEXTCAT +static void fts_textcat_unref(struct fts_textcat *textcat) +{ + i_assert(textcat->refcount > 0); + if (--textcat->refcount > 0) + return; + + if (textcat == fts_textcat_cache) + fts_textcat_cache = NULL; + + i_free(textcat->config_path); + i_free(textcat->data_dir); + i_free(textcat->failed); + if (textcat->handle != NULL) + textcat_Done(textcat->handle); + i_free(textcat); +} +#endif + +void fts_languages_init(void) +{ + unsigned int i; + const struct fts_language *lp; + + fts_languages_pool = pool_alloconly_create("fts_language", + sizeof(fts_languages_builtin)); + p_array_init(&fts_languages, fts_languages_pool, + N_ELEMENTS(fts_languages_builtin)); + for (i = 0; i < N_ELEMENTS(fts_languages_builtin); i++){ + lp = &fts_languages_builtin[i]; + array_push_back(&fts_languages, &lp); + } +} + +void fts_languages_deinit(void) +{ +#ifdef HAVE_FTS_EXTTEXTCAT + if (fts_textcat_cache != NULL) + fts_textcat_unref(fts_textcat_cache); +#endif + pool_unref(&fts_languages_pool); +} + +void fts_language_register(const char *name) +{ + struct fts_language *lang; + + if (fts_language_find(name) != NULL) + return; + + lang = p_new(fts_languages_pool, struct fts_language, 1); + lang->name = p_strdup(fts_languages_pool, name); + array_push_back(&fts_languages, (const struct fts_language **)&lang); +} + +const struct fts_language *fts_language_find(const char *name) +{ + const struct fts_language *lang; + + array_foreach_elem(&fts_languages, lang) { + if (strcmp(lang->name, name) == 0) + return lang; + } + return NULL; +} + +int fts_language_list_init(const char *const *settings, + struct fts_language_list **list_r, + const char **error_r) +{ + struct fts_language_list *lp; + pool_t pool; + unsigned int i; + const char *conf = NULL, *data = NULL; + + for (i = 0; settings[i] != NULL; i += 2) { + const char *key = settings[i], *value = settings[i+1]; + + if (strcmp(key, "fts_language_config") == 0) + conf = value; + else if (strcmp(key, "fts_language_data") == 0) + data = value; + else { + *error_r = t_strdup_printf("Unknown setting: %s", key); + return -1; + } + } + + pool = pool_alloconly_create("fts_language_list", 128); + lp = p_new(pool, struct fts_language_list, 1); + lp->pool = pool; + if (conf != NULL) + lp->textcat_config = p_strdup(pool, conf); + else + lp->textcat_config = NULL; + if (data != NULL) + lp->textcat_datadir = p_strdup(pool, data); + else + lp->textcat_datadir = NULL; + p_array_init(&lp->languages, pool, 32); + *list_r = lp; + return 0; +} + +void fts_language_list_deinit(struct fts_language_list **list) +{ + struct fts_language_list *lp = *list; + + *list = NULL; +#ifdef HAVE_FTS_EXTTEXTCAT + if (lp->textcat != NULL) + fts_textcat_unref(lp->textcat); +#endif + pool_unref(&lp->pool); +} + +static const struct fts_language * +fts_language_list_find(struct fts_language_list *list, const char *name) +{ + const struct fts_language *lang; + + array_foreach_elem(&list->languages, lang) { + if (strcmp(lang->name, name) == 0) + return lang; + } + return NULL; +} + +void fts_language_list_add(struct fts_language_list *list, + const struct fts_language *lang) +{ + i_assert(fts_language_list_find(list, lang->name) == NULL); + array_push_back(&list->languages, &lang); +} + +bool fts_language_list_add_names(struct fts_language_list *list, + const char *names, + const char **unknown_name_r) +{ + const char *const *langs; + const struct fts_language *lang; + + for (langs = t_strsplit_spaces(names, ", "); *langs != NULL; langs++) { + lang = fts_language_find(*langs); + if (lang == NULL) { + /* unknown language */ + *unknown_name_r = *langs; + return FALSE; + } + if (fts_language_list_find(list, lang->name) == NULL) + fts_language_list_add(list, lang); + } + return TRUE; +} + +const ARRAY_TYPE(fts_language) * +fts_language_list_get_all(struct fts_language_list *list) +{ + return &list->languages; +} + +const struct fts_language * +fts_language_list_get_first(struct fts_language_list *list) +{ + const struct fts_language *const *langp; + + langp = array_front(&list->languages); + return *langp; +} + +#ifdef HAVE_FTS_EXTTEXTCAT +static bool fts_language_match_lists(struct fts_language_list *list, + candidate_t *candp, int candp_len, + const struct fts_language **lang_r) +{ + const char *name; + + for (int i = 0; i < candp_len; i++) { + /* name is <lang>-<optional country or characterset>-<encoding> + eg, fi--utf8 or pt-PT-utf8 */ + name = t_strcut(candp[i].name, '-'); + + /* For Norwegian we treat both bokmal and nynorsk as "no". */ + if (strcmp(name, "nb") == 0 || strcmp(name, "nn") == 0) + name = "no"; + if ((*lang_r = fts_language_list_find(list, name)) != NULL) + return TRUE; + } + return FALSE; +} +#endif + +#ifdef HAVE_FTS_EXTTEXTCAT +static int fts_language_textcat_init(struct fts_language_list *list, + const char **error_r) +{ + const char *config_path; + const char *data_dir; + + if (list->textcat != NULL) { + if (list->textcat->failed != NULL) { + *error_r = list->textcat->failed; + return -1; + } + i_assert(list->textcat->handle != NULL); + return 0; + } + + config_path = list->textcat_config != NULL ? list->textcat_config : + TEXTCAT_DATADIR"/fpdb.conf"; + data_dir = list->textcat_datadir != NULL ? list->textcat_datadir : + TEXTCAT_DATADIR"/"; + if (fts_textcat_cache != NULL) { + if (strcmp(fts_textcat_cache->config_path, config_path) == 0 && + strcmp(fts_textcat_cache->data_dir, data_dir) == 0) { + list->textcat = fts_textcat_cache; + list->textcat->refcount++; + return 0; + } + fts_textcat_unref(fts_textcat_cache); + } + + fts_textcat_cache = list->textcat = i_new(struct fts_textcat, 1); + fts_textcat_cache->refcount = 2; + fts_textcat_cache->config_path = i_strdup(config_path); + fts_textcat_cache->data_dir = i_strdup(data_dir); + fts_textcat_cache->handle = special_textcat_Init(config_path, data_dir); + if (fts_textcat_cache->handle == NULL) { + fts_textcat_cache->failed = i_strdup_printf( + "special_textcat_Init(%s, %s) failed", + config_path, data_dir); + *error_r = fts_textcat_cache->failed; + return -1; + } + /* The textcat minimum document size could be set here. It + currently defaults to 3. UTF8 is enabled by default. */ + return 0; +} +#endif + +static enum fts_language_result +fts_language_detect_textcat(struct fts_language_list *list ATTR_UNUSED, + const unsigned char *text ATTR_UNUSED, + size_t size ATTR_UNUSED, + const struct fts_language **lang_r ATTR_UNUSED, + const char **error_r ATTR_UNUSED) +{ +#ifdef HAVE_FTS_EXTTEXTCAT + candidate_t *candp; /* textcat candidate result array pointer */ + int cnt; + bool match = FALSE; + + if (fts_language_textcat_init(list, error_r) < 0) + return FTS_LANGUAGE_RESULT_ERROR; + + candp = textcat_GetClassifyFullOutput(list->textcat->handle); + if (candp == NULL) + i_fatal_status(FATAL_OUTOFMEM, "textcat_GetCLassifyFullOutput failed: malloc() returned NULL"); + cnt = textcat_ClassifyFull(list->textcat->handle, (const void *)text, + I_MIN(size, DETECT_STR_MAX_LEN), candp); + if (cnt > 0) { + T_BEGIN { + match = fts_language_match_lists(list, candp, cnt, lang_r); + } T_END; + textcat_ReleaseClassifyFullOutput(list->textcat->handle, candp); + if (match) + return FTS_LANGUAGE_RESULT_OK; + else + return FTS_LANGUAGE_RESULT_UNKNOWN; + } else { + textcat_ReleaseClassifyFullOutput(list->textcat->handle, candp); + switch (cnt) { + case TEXTCAT_RESULT_SHORT: + i_assert(size < DETECT_STR_MAX_LEN); + return FTS_LANGUAGE_RESULT_SHORT; + case TEXTCAT_RESULT_UNKNOWN: + return FTS_LANGUAGE_RESULT_UNKNOWN; + default: + i_unreached(); + } + } +#else + return FTS_LANGUAGE_RESULT_UNKNOWN; +#endif +} + +enum fts_language_result +fts_language_detect(struct fts_language_list *list, + const unsigned char *text ATTR_UNUSED, + size_t size ATTR_UNUSED, + const struct fts_language **lang_r, + const char **error_r) +{ + i_assert(array_count(&list->languages) > 0); + + /* if there's only a single wanted language, return it always. */ + if (array_count(&list->languages) == 1) { + const struct fts_language *const *langp = + array_front(&list->languages); + *lang_r = *langp; + return FTS_LANGUAGE_RESULT_OK; + } + return fts_language_detect_textcat(list, text, size, lang_r, error_r); +} |