From 0441d265f2bb9da249c7abf333f0f771fadb4ab5 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 15 Apr 2024 19:36:47 +0200 Subject: Adding upstream version 1:2.3.21+dfsg1. Signed-off-by: Daniel Baumann --- src/lib-fts/fts-tokenizer-generic.c | 906 ++++++++++++++++++++++++++++++++++++ 1 file changed, 906 insertions(+) create mode 100644 src/lib-fts/fts-tokenizer-generic.c (limited to 'src/lib-fts/fts-tokenizer-generic.c') diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c new file mode 100644 index 0000000..be72aa3 --- /dev/null +++ b/src/lib-fts/fts-tokenizer-generic.c @@ -0,0 +1,906 @@ +/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "base64.h" +#include "buffer.h" +#include "str.h" +#include "unichar.h" +#include "bsearch-insert-pos.h" +#include "fts-common.h" +#include "fts-tokenizer-private.h" +#include "fts-tokenizer-generic-private.h" +#include "fts-tokenizer-common.h" +#include "word-boundary-data.c" +#include "word-break-data.c" + +/* see comments below between is_base64() and skip_base64() */ +#define FTS_SKIP_BASE64_MIN_SEQUENCES 1 +#define FTS_SKIP_BASE64_MIN_CHARS 50 + +#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30 +#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */ + +static unsigned char fts_ascii_word_breaks[128] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */ + + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: !"#$%&()*+,-./ */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ */ +}; + +static int +fts_tokenizer_generic_create(const char *const *settings, + struct fts_tokenizer **tokenizer_r, + const char **error_r) +{ + struct generic_fts_tokenizer *tok; + unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH; + enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE; + bool wb5a = FALSE; + bool search = FALSE; + bool explicitprefix = FALSE; + unsigned int i; + + for (i = 0; settings[i] != NULL; i += 2) { + const char *key = settings[i], *value = settings[i+1]; + + if (strcmp(key, "maxlen") == 0) { + if (str_to_uint(value, &max_length) < 0 || + max_length == 0) { + *error_r = t_strdup_printf( + "Invalid maxlen setting: %s", value); + return -1; + } + } else if (strcmp(key, "algorithm") == 0) { + if (strcmp(value, ALGORITHM_TR29_NAME) == 0) + algo = BOUNDARY_ALGORITHM_TR29; + else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0) + ; + else { + *error_r = t_strdup_printf( + "Invalid algorithm: %s", value); + return -1; + } + } else if (strcmp(key, "search") == 0) { + /* tokenizing a search string - + makes no difference to us */ + search = TRUE; + } else if (strcasecmp(key, "wb5a") == 0) { + if (strcasecmp(value, "no") == 0) + wb5a = FALSE; + else + wb5a = TRUE; + } else if (strcasecmp(key, "explicitprefix") == 0) { + explicitprefix = TRUE; + } else { + *error_r = t_strdup_printf("Unknown setting: %s", key); + return -1; + } + } + + /* Tokenise normally unless tokenising an explicit prefix query */ + if (!search) + explicitprefix = FALSE; + + if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) { + *error_r = "Can not use WB5a for algorithms other than TR29."; + return -1; + } + + tok = i_new(struct generic_fts_tokenizer, 1); + if (algo == BOUNDARY_ALGORITHM_TR29) + tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29; + else + tok->tokenizer.v = &generic_tokenizer_vfuncs_simple; + tok->max_length = max_length; + tok->algorithm = algo; + tok->wb5a = wb5a; + tok->prefixsplat = explicitprefix; + tok->token = buffer_create_dynamic(default_pool, 64); + + *tokenizer_r = &tok->tokenizer; + return 0; +} + +static void +fts_tokenizer_generic_destroy(struct fts_tokenizer *_tok) +{ + struct generic_fts_tokenizer *tok = + container_of(_tok, struct generic_fts_tokenizer, tokenizer); + + buffer_free(&tok->token); + i_free(tok); +} + +static inline void +shift_prev_type(struct generic_fts_tokenizer *tok, enum letter_type lt) +{ + tok->prev_prev_type = tok->prev_type; + tok->prev_type = lt; +} + +static inline void +add_prev_type(struct generic_fts_tokenizer *tok, enum letter_type lt) +{ + if(tok->prev_type != LETTER_TYPE_NONE) + tok->prev_prev_type = tok->prev_type; + tok->prev_type = lt; +} + +static inline void +add_letter(struct generic_fts_tokenizer *tok, unichar_t c) +{ + if(tok->letter != 0) + tok->prev_letter = tok->letter; + tok->letter = c; +} + +static bool +fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok, + const char **token_r) +{ + const unsigned char *data = tok->token->data; + size_t len = tok->token->used; + + if (tok->untruncated_length <= tok->max_length) { + /* Remove the trailing apostrophe - it was made + into U+0027 earlier. There can be only a single such + apostrophe, because otherwise the token would have already + been split. We also want to remove the trailing apostrophe + only if it's the the last character in the nontruncated + token - a truncated token may end with apostrophe. */ + if (len > 0 && data[len-1] == '\'') { + len--; + i_assert(len > 0 && data[len-1] != '\''); + } + if (len > 0 && data[len-1] == '*' && !tok->prefixsplat) { + len--; + i_assert(len > 0 && data[len-1] != '*'); + } + } else { + fts_tokenizer_delete_trailing_partial_char(data, &len); + } + i_assert(len <= tok->max_length); + + *token_r = len == 0 ? "" : + t_strndup(tok->token->data, len); + buffer_set_used_size(tok->token, 0); + tok->untruncated_length = 0; + return len > 0; +} + +static bool uint32_find(const uint32_t *data, unsigned int count, + uint32_t value, unsigned int *idx_r) +{ + BINARY_NUMBER_SEARCH(data, count, value, idx_r); +} + +static bool fts_uni_word_break(unichar_t c) +{ + unsigned int idx; + + /* Unicode General Punctuation, including deprecated characters. */ + if (c >= 0x2000 && c <= 0x206f) + return TRUE; + /* From word-break-data.c, which is generated from PropList.txt. */ + if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx)) + return TRUE; + if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx)) + return TRUE; + if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx)) + return TRUE; + if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx)) + return TRUE; + if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx)) + return TRUE; + if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx)) + return TRUE; + return FALSE; +} + +enum fts_break_type { + FTS_FROM_STOP = 0, + FTS_FROM_WORD = 2, + FTS_TO_STOP= 0, + FTS_TO_WORD = 1, +#define FROM_TO(f,t) FTS_##f##_TO_##t = FTS_FROM_##f | FTS_TO_##t + FROM_TO(STOP,STOP), + FROM_TO(STOP,WORD), + FROM_TO(WORD,STOP), + FROM_TO(WORD,WORD), +}; +static inline enum fts_break_type +fts_simple_is_word_break(const struct generic_fts_tokenizer *tok, + unichar_t c, bool apostrophe) +{ + /* Until we know better, a letter followed by an apostrophe is continuation of the word. + However, if we see non-word letters afterwards, we'll reverse that decision. */ + if (apostrophe) + return tok->prev_type == LETTER_TYPE_ALETTER ? FTS_WORD_TO_WORD : FTS_STOP_TO_STOP; + + bool new_breakiness = (c < 0x80) ? (fts_ascii_word_breaks[c] != 0) : fts_uni_word_break(c); + + return (new_breakiness ? FTS_TO_STOP : FTS_TO_WORD) + + (tok->prev_type == LETTER_TYPE_ALETTER || + tok->prev_type == LETTER_TYPE_SINGLE_QUOTE + ? FTS_FROM_WORD : FTS_FROM_STOP); +} + +static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok) +{ + struct generic_fts_tokenizer *tok = + container_of(_tok, struct generic_fts_tokenizer, tokenizer); + + tok->prev_type = LETTER_TYPE_NONE; + tok->prev_prev_type = LETTER_TYPE_NONE; + tok->untruncated_length = 0; + buffer_set_used_size(tok->token, 0); +} + +static void tok_append_truncated(struct generic_fts_tokenizer *tok, + const unsigned char *data, size_t size) +{ + buffer_append(tok->token, data, + I_MIN(size, tok->max_length - tok->token->used)); + tok->untruncated_length += size; +} + +inline static bool +is_base64(const unsigned char ch) +{ + return base64_scheme.decmap[ch] != 0xff; +} + +/* So far the following rule seems give good results in avoid indexing base64 + as keywords. It also seems to run well against against base64 embedded + headers, like ARC-Seal, DKIM-Signature, X-SG-EID, X-SG-ID, including + encoded parts (e.g. =?us-ascii?Q?...?= sequences). + + leader characters : [ \t\r\n=:;?]* + matching characters : base64_scheme.decmap[ch] != 0xff + trailing characters : none or [ \t\r\n=:;?] (other characters cause + the run to be ignored) + minimum run length : 50 + minimum runs count : 1 + + i.e. (single or multiple) 50-chars runs of characters in the base64 set + - excluded the trailing '=' - are recognized as base64 and ignored + in indexing. */ + +#define allowed_base64_trailers allowed_base64_leaders +static unsigned char allowed_base64_leaders[] = { + ' ', '\t', '\r', '\n', '=', ';', ':', '?' +}; + +/* skip_base64() works doing lookahead on the data available in the tokenizer + buffer, .i.e. it is not able to see "what will come next" to perform more + extensive matches. This implies that a very long base64 sequence, which is + split halfway into two different chunks while feeding tokenizer, will be + matched separately as the trailing part of first buffer and as the leading + part of the second. Each of these two segments must fulfill the match + criteria on its own to be discarded. What we pay is we will fail to reject + small base64 chunks segments instead of rejecting the whole sequence. + + When skip_base64() is invoked in fts_tokenizer_generic_XX_next(), we know + that we are not halfway the collection of a token. + + As (after the previous token) the buffer will contain non-token characters + (i.e. token separators of some kind), we try to move forward among those + until we find a base64 character. If we don't find one, there's nothing we + can skip in the buffer and the skip phase terminates. + + If we found a base64 character, we check that the previous one is in + allowed_base64_leaders[]; otherwise, the skip phase terminates. + + Now we try to determine how long the base64 sequence is. If it is too short, + the skip phase terminates. It also terminates if there's a character + in the buffer after the sequence and this is not in + allowed_base64_trailers[]. + + At this point we know that we have: + - possibly a skipped sequence of non base64 characters ending with an + allowed leader character, followed by: + - a skipped sequence of base64 characters, possibly followed by an allowed + trailed character + we advance the start pointer to after the last skipped base64 character, + and scan again to see if we can skip further chunks in the same way. */ + +static size_t +skip_base64(const unsigned char *data, size_t size) +{ + if (data == NULL) { + i_assert(size == 0); + return 0; + } + + const unsigned char *start, *end = data + size; + unsigned int matches = 0; + for (start = data; start < end; ) { + const unsigned char *first; + for (first = start; first < end && !is_base64(*first); first++); + if (first > start && memchr(allowed_base64_leaders, *(first - 1), + N_ELEMENTS(allowed_base64_leaders)) == NULL) + break; + + const unsigned char *past; + for (past = first; past < end && is_base64(*past); past++); + if (past - first < FTS_SKIP_BASE64_MIN_CHARS) + break; + if (past < end && memchr(allowed_base64_trailers, *past, + N_ELEMENTS(allowed_base64_trailers)) == NULL) + break; + start = past; + matches++; + } + return matches < FTS_SKIP_BASE64_MIN_SEQUENCES ? 0 : start - data; +} + +static int +fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok, + const unsigned char *data, size_t size, + size_t *skip_r, const char **token_r, + const char **error_r ATTR_UNUSED) +{ + struct generic_fts_tokenizer *tok = + container_of(_tok, struct generic_fts_tokenizer, tokenizer); + size_t i, start; + int char_size; + unichar_t c; + bool apostrophe; + enum fts_break_type break_type; + + start = tok->token->used > 0 ? 0 : skip_base64(data, size); + for (i = start; i < size; i += char_size) { + char_size = uni_utf8_get_char_n(data + i, size - i, &c); + i_assert(char_size > 0); + + apostrophe = IS_APOSTROPHE(c); + if ((tok->prefixsplat && IS_PREFIX_SPLAT(c)) && + (tok->prev_type == LETTER_TYPE_ALETTER)) { + /* this might be a prefix-mathing query */ + shift_prev_type(tok, LETTER_TYPE_PREFIXSPLAT); + } else if ((break_type = fts_simple_is_word_break(tok, c, apostrophe)) + != FTS_WORD_TO_WORD) { + tok_append_truncated(tok, data + start, i - start); + shift_prev_type(tok, (break_type & FTS_TO_WORD) != 0 + ? LETTER_TYPE_ALETTER : LETTER_TYPE_NONE); + if (fts_tokenizer_generic_simple_current_token(tok, token_r)) { + *skip_r = i; + if (break_type != FTS_STOP_TO_WORD) /* therefore *_TO_STOP */ + *skip_r += char_size; + return 1; + } + if ((break_type & FTS_TO_WORD) == 0) + start = i + char_size; + } else if (apostrophe) { + /* all apostrophes require special handling */ + const unsigned char apostrophe_char = '\''; + + tok_append_truncated(tok, data + start, i - start); + if (tok->token->used > 0) + tok_append_truncated(tok, &apostrophe_char, 1); + start = i + char_size; + shift_prev_type(tok, LETTER_TYPE_SINGLE_QUOTE); + } else { + /* Lie slightly about the type. This is anything that + we're not skipping or cutting on and are prepared to + search for - it's "as good as" a letter. */ + shift_prev_type(tok, LETTER_TYPE_ALETTER); + } + } + /* word boundary not found yet */ + if (i > start) + tok_append_truncated(tok, data + start, i - start); + *skip_r = i; + + /* return the last token */ + if (size == 0) { + shift_prev_type(tok, LETTER_TYPE_NONE); + if (fts_tokenizer_generic_simple_current_token(tok, token_r)) + return 1; + } + + return 0; +} + +/* TODO: Arrange array searches roughly in order of likelihood of a match. + TODO: Make some array of the arrays, so this can be a foreach loop. + TODO: Check for Hangul. + TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also + U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE + HYPHEN. + TODO +*/ +static enum letter_type letter_type(unichar_t c) +{ + unsigned int idx; + + if (IS_APOSTROPHE(c)) + return LETTER_TYPE_APOSTROPHE; + if (uint32_find(CR, N_ELEMENTS(CR), c, &idx)) + return LETTER_TYPE_CR; + if (uint32_find(LF, N_ELEMENTS(LF), c, &idx)) + return LETTER_TYPE_LF; + if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx)) + return LETTER_TYPE_NEWLINE; + if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx)) + return LETTER_TYPE_EXTEND; + if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx)) + return LETTER_TYPE_REGIONAL_INDICATOR; + if (uint32_find(Format, N_ELEMENTS(Format), c, &idx)) + return LETTER_TYPE_FORMAT; + if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx)) + return LETTER_TYPE_KATAKANA; + if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx)) + return LETTER_TYPE_HEBREW_LETTER; + if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx)) + return LETTER_TYPE_ALETTER; + if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx)) + return LETTER_TYPE_SINGLE_QUOTE; + if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx)) + return LETTER_TYPE_DOUBLE_QUOTE; + if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx)) + return LETTER_TYPE_MIDNUMLET; + if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx)) + return LETTER_TYPE_MIDLETTER; + if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx)) + return LETTER_TYPE_MIDNUM; + if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx)) + return LETTER_TYPE_NUMERIC; + if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx)) + return LETTER_TYPE_EXTENDNUMLET; + if (IS_PREFIX_SPLAT(c)) /* prioritise appropriately */ + return LETTER_TYPE_PREFIXSPLAT; + return LETTER_TYPE_OTHER; +} + +static bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED) +{ + i_panic("Letter type should not be used."); +} + +/* WB3, WB3a and WB3b, but really different since we try to eat + whitespace between words. */ +static bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED) +{ + return TRUE; +} + +static bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED) +{ + /* WB4 */ + return FALSE; +} + +static bool letter_regional_indicator(struct generic_fts_tokenizer *tok) +{ + /* WB13c */ + if (tok->prev_type == LETTER_TYPE_REGIONAL_INDICATOR) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_katakana(struct generic_fts_tokenizer *tok) +{ + /* WB13 */ + if (tok->prev_type == LETTER_TYPE_KATAKANA) + return FALSE; + + /* WB13b */ + if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_hebrew(struct generic_fts_tokenizer *tok) +{ + /* WB5 */ + if (tok->prev_type == LETTER_TYPE_HEBREW_LETTER) + return FALSE; + + /* WB7 WB7c, except MidNumLet */ + if (tok->prev_prev_type == LETTER_TYPE_HEBREW_LETTER && + (tok->prev_type == LETTER_TYPE_SINGLE_QUOTE || + tok->prev_type == LETTER_TYPE_APOSTROPHE || + tok->prev_type == LETTER_TYPE_MIDLETTER || + tok->prev_type == LETTER_TYPE_DOUBLE_QUOTE)) + return FALSE; + + /* WB10 */ + if (tok->prev_type == LETTER_TYPE_NUMERIC) + return FALSE; + + /* WB13b */ + if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_aletter(struct generic_fts_tokenizer *tok) +{ + + /* WB5a */ + if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH) + if (IS_WB5A_APOSTROPHE(tok->prev_letter) && IS_VOWEL(tok->letter)) { + tok->seen_wb5a = TRUE; + return TRUE; + } + + /* WB5 */ + if (tok->prev_type == LETTER_TYPE_ALETTER) + return FALSE; + + /* WB7, except MidNumLet */ + if (tok->prev_prev_type == LETTER_TYPE_ALETTER && + (tok->prev_type == LETTER_TYPE_SINGLE_QUOTE || + tok->prev_type == LETTER_TYPE_APOSTROPHE || + tok->prev_type == LETTER_TYPE_MIDLETTER)) + return FALSE; + + /* WB10 */ + if (tok->prev_type == LETTER_TYPE_NUMERIC) + return FALSE; + + /* WB13b */ + if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET) + return FALSE; + + + return TRUE; /* Any / Any */ +} + +static bool letter_single_quote(struct generic_fts_tokenizer *tok) +{ + /* WB6 */ + if (tok->prev_type == LETTER_TYPE_ALETTER || + tok->prev_type == LETTER_TYPE_HEBREW_LETTER) + return FALSE; + + /* WB12 */ + if (tok->prev_type == LETTER_TYPE_NUMERIC) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_double_quote(struct generic_fts_tokenizer *tok) +{ + + if (tok->prev_type == LETTER_TYPE_DOUBLE_QUOTE) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED) +{ + + /* Break at MidNumLet, non-conformant with WB6/WB7 */ + return TRUE; +} + +static bool letter_midletter(struct generic_fts_tokenizer *tok) +{ + /* WB6 */ + if (tok->prev_type == LETTER_TYPE_ALETTER || + tok->prev_type == LETTER_TYPE_HEBREW_LETTER) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_midnum(struct generic_fts_tokenizer *tok) +{ + /* WB12 */ + if (tok->prev_type == LETTER_TYPE_NUMERIC) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_numeric(struct generic_fts_tokenizer *tok) +{ + /* WB8 */ + if (tok->prev_type == LETTER_TYPE_NUMERIC) + return FALSE; + + /* WB9 */ + if (tok->prev_type == LETTER_TYPE_ALETTER || + tok->prev_type == LETTER_TYPE_HEBREW_LETTER) + return FALSE; + + /* WB11 */ + if(tok->prev_prev_type == LETTER_TYPE_NUMERIC && + (tok->prev_type == LETTER_TYPE_MIDNUM || + tok->prev_type == LETTER_TYPE_MIDNUMLET || + tok->prev_type == LETTER_TYPE_SINGLE_QUOTE)) + return FALSE; + + /* WB13b */ + if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_extendnumlet(struct generic_fts_tokenizer *tok) +{ + + /* WB13a */ + if (tok->prev_type == LETTER_TYPE_ALETTER || + tok->prev_type == LETTER_TYPE_HEBREW_LETTER || + tok->prev_type == LETTER_TYPE_NUMERIC || + tok->prev_type == LETTER_TYPE_KATAKANA || + tok->prev_type == LETTER_TYPE_EXTENDNUMLET) + return FALSE; + + return TRUE; /* Any / Any */ +} + +static bool letter_apostrophe(struct generic_fts_tokenizer *tok) +{ + + if (tok->prev_type == LETTER_TYPE_ALETTER || + tok->prev_type == LETTER_TYPE_HEBREW_LETTER) + return FALSE; + + return TRUE; /* Any / Any */ +} +static bool letter_prefixsplat(struct generic_fts_tokenizer *tok ATTR_UNUSED) +{ + /* Dovecot explicit-prefix specific */ + return TRUE; /* Always induces a word break - but with special handling */ +} +static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED) +{ + return TRUE; /* Any / Any */ +} + +/* + TODO: Define what to skip between words. + TODO: Include double quotation marks? Messes up parsing? + TODO: Does this "reverse approach" include too much in "whitespace"? + TODO: Possibly use is_word_break()? + */ +static bool is_nontoken(enum letter_type lt) +{ + if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA || + lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER || + lt == LETTER_TYPE_NUMERIC) + return FALSE; + + return TRUE; +} + +/* The way things are done WB6/7 and WB11/12 "false positives" can + leave trailing unwanted chars. They are searched for here. This is + very kludgy and should be coded into the rules themselves + somehow. +*/ +static bool is_one_past_end(struct generic_fts_tokenizer *tok) +{ + /* WB6/7 false positive detected at one past end. */ + if (tok->prev_type == LETTER_TYPE_MIDLETTER || + tok->prev_type == LETTER_TYPE_MIDNUMLET || + tok->prev_type == LETTER_TYPE_APOSTROPHE || + tok->prev_type == LETTER_TYPE_SINGLE_QUOTE ) + return TRUE; + + /* WB11/12 false positive detected at one past end. */ + if (tok->prev_type == LETTER_TYPE_MIDNUM || + tok->prev_type == LETTER_TYPE_MIDNUMLET || + tok->prev_type == LETTER_TYPE_APOSTROPHE || + tok->prev_type == LETTER_TYPE_SINGLE_QUOTE) + return TRUE; + + return FALSE; +} + +static void +fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok, + const char **token_r) +{ + const unsigned char *data = tok->token->data; + size_t len = tok->token->used; + + if (is_one_past_end(tok) && + tok->untruncated_length <= tok->max_length) { + /* delete the last character */ + while (!UTF8_IS_START_SEQ(data[len-1])) + len--; + i_assert(len > 0); + len--; + } else if (tok->untruncated_length > tok->max_length) { + fts_tokenizer_delete_trailing_partial_char(data, &len); + } + /* we're skipping all non-token chars at the beginning of the word, + so by this point we must have something here - even if we just + deleted the last character */ + i_assert(len > 0); + i_assert(len <= tok->max_length); + + tok->prev_prev_type = LETTER_TYPE_NONE; + tok->prev_type = LETTER_TYPE_NONE; + *token_r = t_strndup(data, len); + buffer_set_used_size(tok->token, 0); + tok->untruncated_length = 0; +} + +static void wb5a_reinsert(struct generic_fts_tokenizer *tok) +{ + string_t *utf8_str = t_str_new(6); + + uni_ucs4_to_utf8_c(tok->letter, utf8_str); + buffer_insert(tok->token, 0, str_data(utf8_str), str_len(utf8_str)); + tok->prev_type = letter_type(tok->letter); + tok->letter = 0; + tok->prev_letter = 0; + tok->seen_wb5a = FALSE; +} + +struct letter_fn { + bool (*fn)(struct generic_fts_tokenizer *tok); +}; +static struct letter_fn letter_fns[] = { + {letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline}, + {letter_cr_lf_newline}, {letter_extend_format}, + {letter_regional_indicator}, {letter_extend_format}, + {letter_katakana}, {letter_hebrew}, {letter_aletter}, + {letter_single_quote}, {letter_double_quote}, + {letter_midnumlet}, {letter_midletter}, {letter_midnum}, + {letter_numeric}, {letter_extendnumlet}, {letter_panic}, + {letter_panic}, {letter_apostrophe}, {letter_prefixsplat}, + {letter_other} +}; + +/* + Find word boundaries in input text. Based on Unicode standard annex + #29, but tailored for FTS purposes. + http://www.unicode.org/reports/tr29/ + + Note: The text of tr29 is a living standard, so it keeps + changing. In newer specs some characters are combined, like AHLetter + (ALetter | Hebrew_Letter) and MidNumLetQ (MidNumLet | Single_Quote). + + Adaptions: + * Added optional WB5a as a configurable option. The cut of prefix is + max FTS_WB5A_PREFIX chars. + * No word boundary at Start-Of-Text or End-of-Text (Wb1 and WB2). + * Break just once, not before and after. + * Break at MidNumLet, except apostrophes (diverging from WB6/WB7). + * Other things also (e.g. is_nontoken(), not really pure tr29. Meant + to assist in finding individual words. +*/ +static bool +uni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt) +{ + /* No rule knows what to do with just one char, except the linebreaks + we eat away (above) anyway. */ + if (tok->prev_type != LETTER_TYPE_NONE) { + if (letter_fns[lt].fn(tok)) + return TRUE; + } + + if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) { + /* These types are completely ignored. */ + } else { + add_prev_type(tok,lt); + } + return FALSE; +} + +static int +fts_tokenizer_generic_tr29_next(struct fts_tokenizer *_tok, + const unsigned char *data, size_t size, + size_t *skip_r, const char **token_r, + const char **error_r ATTR_UNUSED) +{ + struct generic_fts_tokenizer *tok = + container_of(_tok, struct generic_fts_tokenizer, tokenizer); + unichar_t c; + size_t i, char_start_i, start_pos; + enum letter_type lt; + int char_size; + + start_pos = tok->token->used > 0 ? 0 : skip_base64(data, size); + for (i = start_pos; i < size; ) { + char_start_i = i; + char_size = uni_utf8_get_char_n(data + i, size - i, &c); + i_assert(char_size > 0); + i += char_size; + lt = letter_type(c); + + /* The WB5a break is detected only when the "after + break" char is inspected. That char needs to be + reinserted as the "previous char". */ + if (tok->seen_wb5a) + wb5a_reinsert(tok); + + if (tok->prev_type == LETTER_TYPE_NONE && is_nontoken(lt)) { + /* Skip non-token chars at the beginning of token */ + i_assert(tok->token->used == 0); + start_pos = i; + continue; + } + + if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH) + add_letter(tok, c); + + if (uni_found_word_boundary(tok, lt)) { + i_assert(char_start_i >= start_pos && size >= start_pos); + tok_append_truncated(tok, data + start_pos, + char_start_i - start_pos); + if (lt == LETTER_TYPE_PREFIXSPLAT && tok->prefixsplat) { + const unsigned char prefix_char = FTS_PREFIX_SPLAT_CHAR; + tok_append_truncated(tok, &prefix_char, 1); + } + *skip_r = i; + fts_tokenizer_generic_tr29_current_token(tok, token_r); + return 1; + } else if (lt == LETTER_TYPE_APOSTROPHE || + lt == LETTER_TYPE_SINGLE_QUOTE) { + /* all apostrophes require special handling */ + const unsigned char apostrophe_char = '\''; + tok_append_truncated(tok, data + start_pos, + char_start_i - start_pos); + tok_append_truncated(tok, &apostrophe_char, 1); + start_pos = i; + } + } + i_assert(i >= start_pos && size >= start_pos); + if (i > start_pos) + tok_append_truncated(tok, data + start_pos, i - start_pos); + *skip_r = i; + + if (size == 0 && tok->token->used > 0) { + /* return the last token */ + *skip_r = 0; + fts_tokenizer_generic_tr29_current_token(tok, token_r); + return 1; + } + return 0; +} + +static int +fts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED, + const unsigned char *data ATTR_UNUSED, + size_t size ATTR_UNUSED, + size_t *skip_r ATTR_UNUSED, + const char **token_r ATTR_UNUSED, + const char **error_r ATTR_UNUSED) +{ + i_unreached(); +} + +static const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = { + fts_tokenizer_generic_create, + fts_tokenizer_generic_destroy, + fts_tokenizer_generic_reset, + fts_tokenizer_generic_next +}; + +static const struct fts_tokenizer fts_tokenizer_generic_real = { + .name = "generic", + .v = &generic_tokenizer_vfuncs +}; +const struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real; + +const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = { + fts_tokenizer_generic_create, + fts_tokenizer_generic_destroy, + fts_tokenizer_generic_reset, + fts_tokenizer_generic_simple_next +}; +const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = { + fts_tokenizer_generic_create, + fts_tokenizer_generic_destroy, + fts_tokenizer_generic_reset, + fts_tokenizer_generic_tr29_next +}; -- cgit v1.2.3