summaryrefslogtreecommitdiffstats
path: root/src/lib-fts/fts-tokenizer-generic.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib-fts/fts-tokenizer-generic.c')
-rw-r--r--src/lib-fts/fts-tokenizer-generic.c906
1 files changed, 906 insertions, 0 deletions
diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c
new file mode 100644
index 0000000..be72aa3
--- /dev/null
+++ b/src/lib-fts/fts-tokenizer-generic.c
@@ -0,0 +1,906 @@
+/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "base64.h"
+#include "buffer.h"
+#include "str.h"
+#include "unichar.h"
+#include "bsearch-insert-pos.h"
+#include "fts-common.h"
+#include "fts-tokenizer-private.h"
+#include "fts-tokenizer-generic-private.h"
+#include "fts-tokenizer-common.h"
+#include "word-boundary-data.c"
+#include "word-break-data.c"
+
+/* see comments below between is_base64() and skip_base64() */
+#define FTS_SKIP_BASE64_MIN_SEQUENCES 1
+#define FTS_SKIP_BASE64_MIN_CHARS 50
+
+#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
+#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
+
/* Word-break lookup for 7-bit ASCII used by the "simple" algorithm:
   a non-zero entry means the character separates tokens. The trailing
   comment on each row lists the break characters in that range. Note
   that apostrophe ('), '_' and alphanumerics are NOT breaks, while
   '@' and '`' are. The table is read-only, hence const. */
static const unsigned char fts_ascii_word_breaks[128] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */

	1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: !"#$%&()*+,-./ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0  /* 112-127: {|}~ */
};
+
/* Create a generic tokenizer from a NULL-terminated list of key/value
   setting pairs. Supported settings:
     maxlen         - maximum token length in bytes (default 30)
     algorithm      - "simple" (default) or "tr29"
     search         - tokenizing a search string rather than indexed text
     wb5a           - enable the WB5a prefix rule (TR29 only)
     explicitprefix - honor a trailing '*' as prefix-match marker
                      (effective only together with "search")
   Returns 0 and sets *tokenizer_r on success, or -1 and *error_r. */
static int
fts_tokenizer_generic_create(const char *const *settings,
			     struct fts_tokenizer **tokenizer_r,
			     const char **error_r)
{
	struct generic_fts_tokenizer *tok;
	unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
	enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
	bool wb5a = FALSE;
	bool search = FALSE;
	bool explicitprefix = FALSE;
	unsigned int i;

	for (i = 0; settings[i] != NULL; i += 2) {
		const char *key = settings[i], *value = settings[i+1];

		if (strcmp(key, "maxlen") == 0) {
			if (str_to_uint(value, &max_length) < 0 ||
			    max_length == 0) {
				*error_r = t_strdup_printf(
					"Invalid maxlen setting: %s", value);
				return -1;
			}
		} else if (strcmp(key, "algorithm") == 0) {
			if (strcmp(value, ALGORITHM_TR29_NAME) == 0)
				algo = BOUNDARY_ALGORITHM_TR29;
			else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
				; /* simple is already the default */
			else {
				*error_r = t_strdup_printf(
					"Invalid algorithm: %s", value);
				return -1;
			}
		} else if (strcmp(key, "search") == 0) {
			/* tokenizing a search string -
			   makes no difference to us */
			search = TRUE;
		} else if (strcasecmp(key, "wb5a") == 0) {
			if (strcasecmp(value, "no") == 0)
				wb5a = FALSE;
			else
				wb5a = TRUE;
		} else if (strcasecmp(key, "explicitprefix") == 0) {
			explicitprefix = TRUE;
		} else {
			*error_r = t_strdup_printf("Unknown setting: %s", key);
			return -1;
		}
	}

	/* Tokenise normally unless tokenising an explicit prefix query */
	if (!search)
		explicitprefix = FALSE;

	if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) {
		*error_r = "Can not use WB5a for algorithms other than TR29.";
		return -1;
	}

	tok = i_new(struct generic_fts_tokenizer, 1);
	/* pick the algorithm-specific vtable right away */
	if (algo == BOUNDARY_ALGORITHM_TR29)
		tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
	else
		tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
	tok->max_length = max_length;
	tok->algorithm = algo;
	tok->wb5a = wb5a;
	tok->prefixsplat = explicitprefix;
	tok->token = buffer_create_dynamic(default_pool, 64);

	*tokenizer_r = &tok->tokenizer;
	return 0;
}
+
/* Free the tokenizer and its token buffer. */
static void
fts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
{
	struct generic_fts_tokenizer *tok =
		container_of(_tok, struct generic_fts_tokenizer, tokenizer);

	buffer_free(&tok->token);
	i_free(tok);
}
+
/* Unconditionally push the new letter type into the two-deep
   prev/prev_prev type history. */
static inline void
shift_prev_type(struct generic_fts_tokenizer *tok, enum letter_type lt)
{
	tok->prev_prev_type = tok->prev_type;
	tok->prev_type = lt;
}
+
/* Like shift_prev_type(), but leaves prev_prev_type untouched while
   prev_type is still LETTER_TYPE_NONE (i.e. at the start of input). */
static inline void
add_prev_type(struct generic_fts_tokenizer *tok, enum letter_type lt)
{
	if(tok->prev_type != LETTER_TYPE_NONE)
		tok->prev_prev_type = tok->prev_type;
	tok->prev_type = lt;
}
+
/* Remember the current character, pushing the previous one into
   prev_letter; this pair feeds the WB5a checks in letter_aletter()
   and wb5a_reinsert(). */
static inline void
add_letter(struct generic_fts_tokenizer *tok, unichar_t c)
{
	if(tok->letter != 0)
		tok->prev_letter = tok->letter;
	tok->letter = c;
}
+
/* Finalize the collected token into *token_r and reset the buffer.
   For a non-truncated token, strip one trailing apostrophe and (unless
   prefix matching is enabled) one trailing '*'; for a truncated token,
   drop a trailing partial UTF-8 character instead.
   Returns TRUE if a non-empty token was produced. */
static bool
fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
					   const char **token_r)
{
	const unsigned char *data = tok->token->data;
	size_t len = tok->token->used;

	if (tok->untruncated_length <= tok->max_length) {
		/* Remove the trailing apostrophe - it was made
		   into U+0027 earlier. There can be only a single such
		   apostrophe, because otherwise the token would have already
		   been split. We also want to remove the trailing apostrophe
		   only if it's the last character in the nontruncated
		   token - a truncated token may end with apostrophe. */
		if (len > 0 && data[len-1] == '\'') {
			len--;
			i_assert(len > 0 && data[len-1] != '\'');
		}
		if (len > 0 && data[len-1] == '*' && !tok->prefixsplat) {
			len--;
			i_assert(len > 0 && data[len-1] != '*');
		}
	} else {
		fts_tokenizer_delete_trailing_partial_char(data, &len);
	}
	i_assert(len <= tok->max_length);

	*token_r = len == 0 ? "" :
		t_strndup(tok->token->data, len);
	buffer_set_used_size(tok->token, 0);
	tok->untruncated_length = 0;
	return len > 0;
}
+
/* Binary-search a sorted uint32_t array; returns TRUE and sets *idx_r
   if value is present. */
static bool uint32_find(const uint32_t *data, unsigned int count,
			uint32_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
+
/* Returns TRUE if codepoint c counts as a word break for the simple
   algorithm: General Punctuation, or any of the generated Unicode
   whitespace/dash/quote/punctuation property tables. */
static bool fts_uni_word_break(unichar_t c)
{
	unsigned int idx;

	/* Unicode General Punctuation, including deprecated characters. */
	if (c >= 0x2000 && c <= 0x206f)
		return TRUE;
	/* From word-break-data.c, which is generated from PropList.txt. */
	if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
		return TRUE;
	if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
		return TRUE;
	if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
		return TRUE;
	if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
		return TRUE;
	if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
		return TRUE;
	if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
		return TRUE;
	return FALSE;
}
+
/* Transition classification for the simple algorithm. Encoded as two
   non-overlapping bits: bit 0 = the current char belongs to a word
   (TO_WORD), bit 1 = the previous char belonged to a word (FROM_WORD).
   The FROM_TO macro builds the four combinations. */
enum fts_break_type {
	FTS_FROM_STOP = 0,
	FTS_FROM_WORD = 2,
	FTS_TO_STOP = 0,
	FTS_TO_WORD = 1,
#define FROM_TO(f,t) FTS_##f##_TO_##t = FTS_FROM_##f | FTS_TO_##t
	FROM_TO(STOP,STOP),
	FROM_TO(STOP,WORD),
	FROM_TO(WORD,STOP),
	FROM_TO(WORD,WORD),
};
/* Classify the transition at character c: whether we were inside a
   word before it, and whether c itself is word material. */
static inline enum fts_break_type
fts_simple_is_word_break(const struct generic_fts_tokenizer *tok,
			 unichar_t c, bool apostrophe)
{
	/* Until we know better, a letter followed by an apostrophe is
	   continuation of the word. However, if we see non-word letters
	   afterwards, we'll reverse that decision. */
	if (apostrophe)
		return tok->prev_type == LETTER_TYPE_ALETTER ? FTS_WORD_TO_WORD : FTS_STOP_TO_STOP;

	bool new_breakiness = (c < 0x80) ? (fts_ascii_word_breaks[c] != 0) : fts_uni_word_break(c);

	/* "+" acts as "|" here: the TO and FROM bits don't overlap. */
	return (new_breakiness ? FTS_TO_STOP : FTS_TO_WORD)
	       + (tok->prev_type == LETTER_TYPE_ALETTER ||
		  tok->prev_type == LETTER_TYPE_SINGLE_QUOTE
		  ? FTS_FROM_WORD : FTS_FROM_STOP);
}
+
/* Reset state between inputs: clear the letter-type history and any
   partially collected token. */
static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
{
	struct generic_fts_tokenizer *tok =
		container_of(_tok, struct generic_fts_tokenizer, tokenizer);

	tok->prev_type = LETTER_TYPE_NONE;
	tok->prev_prev_type = LETTER_TYPE_NONE;
	tok->untruncated_length = 0;
	buffer_set_used_size(tok->token, 0);
}
+
/* Append data to the token buffer, capping the buffer at max_length
   bytes. untruncated_length still counts every byte offered, so the
   finalizers can tell whether truncation happened. */
static void tok_append_truncated(struct generic_fts_tokenizer *tok,
				 const unsigned char *data, size_t size)
{
	buffer_append(tok->token, data,
		      I_MIN(size, tok->max_length - tok->token->used));
	tok->untruncated_length += size;
}
+
/* TRUE if ch is in the base64 alphabet according to base64_scheme's
   decode map (0xff marks non-alphabet bytes). */
inline static bool
is_base64(const unsigned char ch)
{
	return base64_scheme.decmap[ch] != 0xff;
}
+
+/* So far the following rule seems give good results in avoid indexing base64
+ as keywords. It also seems to run well against against base64 embedded
+ headers, like ARC-Seal, DKIM-Signature, X-SG-EID, X-SG-ID, including
+ encoded parts (e.g. =?us-ascii?Q?...?= sequences).
+
+ leader characters : [ \t\r\n=:;?]*
+ matching characters : base64_scheme.decmap[ch] != 0xff
+ trailing characters : none or [ \t\r\n=:;?] (other characters cause
+ the run to be ignored)
+ minimum run length : 50
+ minimum runs count : 1
+
+ i.e. (single or multiple) 50-chars runs of characters in the base64 set
+ - excluded the trailing '=' - are recognized as base64 and ignored
+ in indexing. */
+
/* Characters that may legitimately precede a base64 run; the identical
   set is accepted after a run (trailers alias leaders). */
#define allowed_base64_trailers allowed_base64_leaders
static unsigned char allowed_base64_leaders[] = {
	' ', '\t', '\r', '\n', '=', ';', ':', '?'
};
+
+/* skip_base64() works doing lookahead on the data available in the tokenizer
+ buffer, .i.e. it is not able to see "what will come next" to perform more
+ extensive matches. This implies that a very long base64 sequence, which is
+ split halfway into two different chunks while feeding tokenizer, will be
+ matched separately as the trailing part of first buffer and as the leading
+ part of the second. Each of these two segments must fulfill the match
+ criteria on its own to be discarded. What we pay is we will fail to reject
+ small base64 chunks segments instead of rejecting the whole sequence.
+
+ When skip_base64() is invoked in fts_tokenizer_generic_XX_next(), we know
+ that we are not halfway the collection of a token.
+
+ As (after the previous token) the buffer will contain non-token characters
+ (i.e. token separators of some kind), we try to move forward among those
+ until we find a base64 character. If we don't find one, there's nothing we
+ can skip in the buffer and the skip phase terminates.
+
+ If we found a base64 character, we check that the previous one is in
+ allowed_base64_leaders[]; otherwise, the skip phase terminates.
+
+ Now we try to determine how long the base64 sequence is. If it is too short,
+ the skip phase terminates. It also terminates if there's a character
+ in the buffer after the sequence and this is not in
+ allowed_base64_trailers[].
+
+ At this point we know that we have:
+ - possibly a skipped sequence of non base64 characters ending with an
+ allowed leader character, followed by:
+ - a skipped sequence of base64 characters, possibly followed by an allowed
+ trailed character
+ we advance the start pointer to after the last skipped base64 character,
+ and scan again to see if we can skip further chunks in the same way. */
+
/* Returns the number of leading bytes of data that can be skipped as
   base64 noise per the rules described above, or 0 if no sufficiently
   long run was found. */
static size_t
skip_base64(const unsigned char *data, size_t size)
{
	if (data == NULL) {
		i_assert(size == 0);
		return 0;
	}

	const unsigned char *start, *end = data + size;
	unsigned int matches = 0;
	for (start = data; start < end; ) {
		/* advance to the first base64 character */
		const unsigned char *first;
		for (first = start; first < end && !is_base64(*first); first++);
		/* a run must follow an allowed leader (or buffer start) */
		if (first > start && memchr(allowed_base64_leaders, *(first - 1),
					    N_ELEMENTS(allowed_base64_leaders)) == NULL)
			break;

		/* measure the length of the base64 run */
		const unsigned char *past;
		for (past = first; past < end && is_base64(*past); past++);
		if (past - first < FTS_SKIP_BASE64_MIN_CHARS)
			break;
		/* a disallowed character right after the run rejects it */
		if (past < end && memchr(allowed_base64_trailers, *past,
					 N_ELEMENTS(allowed_base64_trailers)) == NULL)
			break;
		start = past;
		matches++;
	}
	return matches < FTS_SKIP_BASE64_MIN_SEQUENCES ? 0 : start - data;
}
+
/* v.next for the "simple" algorithm: scan input for the next word
   boundary. Returns 1 with a token in *token_r and the number of
   consumed input bytes in *skip_r, or 0 when more input is needed.
   A size==0 call flushes the final pending token. */
static int
fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
				  const unsigned char *data, size_t size,
				  size_t *skip_r, const char **token_r,
				  const char **error_r ATTR_UNUSED)
{
	struct generic_fts_tokenizer *tok =
		container_of(_tok, struct generic_fts_tokenizer, tokenizer);
	size_t i, start;
	int char_size;
	unichar_t c;
	bool apostrophe;
	enum fts_break_type break_type;

	/* only try to skip base64 when not mid-token */
	start = tok->token->used > 0 ? 0 : skip_base64(data, size);
	for (i = start; i < size; i += char_size) {
		char_size = uni_utf8_get_char_n(data + i, size - i, &c);
		i_assert(char_size > 0);

		apostrophe = IS_APOSTROPHE(c);
		if ((tok->prefixsplat && IS_PREFIX_SPLAT(c)) &&
		    (tok->prev_type == LETTER_TYPE_ALETTER)) {
			/* this might be a prefix-matching query */
			shift_prev_type(tok, LETTER_TYPE_PREFIXSPLAT);
		} else if ((break_type = fts_simple_is_word_break(tok, c, apostrophe))
			   != FTS_WORD_TO_WORD) {
			tok_append_truncated(tok, data + start, i - start);
			shift_prev_type(tok, (break_type & FTS_TO_WORD) != 0
					? LETTER_TYPE_ALETTER : LETTER_TYPE_NONE);
			if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
				*skip_r = i;
				/* consume the break char too, unless it
				   starts the next word */
				if (break_type != FTS_STOP_TO_WORD) /* therefore *_TO_STOP */
					*skip_r += char_size;
				return 1;
			}
			if ((break_type & FTS_TO_WORD) == 0)
				start = i + char_size;
		} else if (apostrophe) {
			/* all apostrophes require special handling:
			   normalized to ASCII U+0027 */
			const unsigned char apostrophe_char = '\'';

			tok_append_truncated(tok, data + start, i - start);
			if (tok->token->used > 0)
				tok_append_truncated(tok, &apostrophe_char, 1);
			start = i + char_size;
			shift_prev_type(tok, LETTER_TYPE_SINGLE_QUOTE);
		} else {
			/* Lie slightly about the type. This is anything that
			   we're not skipping or cutting on and are prepared to
			   search for - it's "as good as" a letter. */
			shift_prev_type(tok, LETTER_TYPE_ALETTER);
		}
	}
	/* word boundary not found yet */
	if (i > start)
		tok_append_truncated(tok, data + start, i - start);
	*skip_r = i;

	/* return the last token */
	if (size == 0) {
		shift_prev_type(tok, LETTER_TYPE_NONE);
		if (fts_tokenizer_generic_simple_current_token(tok, token_r))
			return 1;
	}

	return 0;
}
+
/* TODO: Arrange array searches roughly in order of likelihood of a match.
   TODO: Make some array of the arrays, so this can be a foreach loop.
   TODO: Check for Hangul.
   TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
   U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
   HYPHEN.
*/
/* Map codepoint c to its (tailored) TR29 letter type. Check order
   matters: apostrophes are claimed before the Unicode property tables
   get a chance, and the prefix splat is deliberately checked last. */
static enum letter_type letter_type(unichar_t c)
{
	unsigned int idx;

	if (IS_APOSTROPHE(c))
		return LETTER_TYPE_APOSTROPHE;
	if (uint32_find(CR, N_ELEMENTS(CR), c, &idx))
		return LETTER_TYPE_CR;
	if (uint32_find(LF, N_ELEMENTS(LF), c, &idx))
		return LETTER_TYPE_LF;
	if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
		return LETTER_TYPE_NEWLINE;
	if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
		return LETTER_TYPE_EXTEND;
	if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
		return LETTER_TYPE_REGIONAL_INDICATOR;
	if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
		return LETTER_TYPE_FORMAT;
	if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
		return LETTER_TYPE_KATAKANA;
	if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
		return LETTER_TYPE_HEBREW_LETTER;
	if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
		return LETTER_TYPE_ALETTER;
	if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
		return LETTER_TYPE_SINGLE_QUOTE;
	if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
		return LETTER_TYPE_DOUBLE_QUOTE;
	if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
		return LETTER_TYPE_MIDNUMLET;
	if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
		return LETTER_TYPE_MIDLETTER;
	if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
		return LETTER_TYPE_MIDNUM;
	if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
		return LETTER_TYPE_NUMERIC;
	if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
		return LETTER_TYPE_EXTENDNUMLET;
	if (IS_PREFIX_SPLAT(c)) /* prioritise appropriately */
		return LETTER_TYPE_PREFIXSPLAT;
	return LETTER_TYPE_OTHER;
}
+
/* Placeholder for letter_fns[] slots that must never be dispatched. */
static bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	i_panic("Letter type should not be used.");
}
+
/* WB3, WB3a and WB3b, but really different since we try to eat
   whitespace between words: CR/LF/Newline always breaks. */
static bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	return TRUE;
}
+
static bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	/* WB4: Extend and Format characters never cause a break. */
	return FALSE;
}
+
+static bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
+{
+ /* WB13c */
+ if (tok->prev_type == LETTER_TYPE_REGIONAL_INDICATOR)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_katakana(struct generic_fts_tokenizer *tok)
+{
+ /* WB13 */
+ if (tok->prev_type == LETTER_TYPE_KATAKANA)
+ return FALSE;
+
+ /* WB13b */
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_hebrew(struct generic_fts_tokenizer *tok)
+{
+ /* WB5 */
+ if (tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
+ return FALSE;
+
+ /* WB7 WB7c, except MidNumLet */
+ if (tok->prev_prev_type == LETTER_TYPE_HEBREW_LETTER &&
+ (tok->prev_type == LETTER_TYPE_SINGLE_QUOTE ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_MIDLETTER ||
+ tok->prev_type == LETTER_TYPE_DOUBLE_QUOTE))
+ return FALSE;
+
+ /* WB10 */
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
+ return FALSE;
+
+ /* WB13b */
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_aletter(struct generic_fts_tokenizer *tok)
+{
+
+ /* WB5a */
+ if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
+ if (IS_WB5A_APOSTROPHE(tok->prev_letter) && IS_VOWEL(tok->letter)) {
+ tok->seen_wb5a = TRUE;
+ return TRUE;
+ }
+
+ /* WB5 */
+ if (tok->prev_type == LETTER_TYPE_ALETTER)
+ return FALSE;
+
+ /* WB7, except MidNumLet */
+ if (tok->prev_prev_type == LETTER_TYPE_ALETTER &&
+ (tok->prev_type == LETTER_TYPE_SINGLE_QUOTE ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_MIDLETTER))
+ return FALSE;
+
+ /* WB10 */
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
+ return FALSE;
+
+ /* WB13b */
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
+ return FALSE;
+
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_single_quote(struct generic_fts_tokenizer *tok)
+{
+ /* WB6 */
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
+ return FALSE;
+
+ /* WB12 */
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_double_quote(struct generic_fts_tokenizer *tok)
+{
+
+ if (tok->prev_type == LETTER_TYPE_DOUBLE_QUOTE)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
static bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	/* Break at MidNumLet, non-conformant with WB6/WB7 */
	return TRUE;
}
+
+static bool letter_midletter(struct generic_fts_tokenizer *tok)
+{
+ /* WB6 */
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_midnum(struct generic_fts_tokenizer *tok)
+{
+ /* WB12 */
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_numeric(struct generic_fts_tokenizer *tok)
+{
+ /* WB8 */
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
+ return FALSE;
+
+ /* WB9 */
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
+ return FALSE;
+
+ /* WB11 */
+ if(tok->prev_prev_type == LETTER_TYPE_NUMERIC &&
+ (tok->prev_type == LETTER_TYPE_MIDNUM ||
+ tok->prev_type == LETTER_TYPE_MIDNUMLET ||
+ tok->prev_type == LETTER_TYPE_SINGLE_QUOTE))
+ return FALSE;
+
+ /* WB13b */
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
+{
+
+ /* WB13a */
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER ||
+ tok->prev_type == LETTER_TYPE_NUMERIC ||
+ tok->prev_type == LETTER_TYPE_KATAKANA ||
+ tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
+
+static bool letter_apostrophe(struct generic_fts_tokenizer *tok)
+{
+
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
+ return FALSE;
+
+ return TRUE; /* Any / Any */
+}
static bool letter_prefixsplat(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	/* Dovecot explicit-prefix specific */
	return TRUE; /* Always induces a word break - but with special handling */
}
/* Fallback: any unclassified character always breaks. */
static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	return TRUE; /* Any / Any */
}
+
+/*
+ TODO: Define what to skip between words.
+ TODO: Include double quotation marks? Messes up parsing?
+ TODO: Does this "reverse approach" include too much in "whitespace"?
+ TODO: Possibly use is_word_break()?
+ */
+static bool is_nontoken(enum letter_type lt)
+{
+ if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
+ lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
+ lt == LETTER_TYPE_NUMERIC)
+ return FALSE;
+
+ return TRUE;
+}
+
+/* The way things are done WB6/7 and WB11/12 "false positives" can
+ leave trailing unwanted chars. They are searched for here. This is
+ very kludgy and should be coded into the rules themselves
+ somehow.
+*/
+static bool is_one_past_end(struct generic_fts_tokenizer *tok)
+{
+ /* WB6/7 false positive detected at one past end. */
+ if (tok->prev_type == LETTER_TYPE_MIDLETTER ||
+ tok->prev_type == LETTER_TYPE_MIDNUMLET ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_SINGLE_QUOTE )
+ return TRUE;
+
+ /* WB11/12 false positive detected at one past end. */
+ if (tok->prev_type == LETTER_TYPE_MIDNUM ||
+ tok->prev_type == LETTER_TYPE_MIDNUMLET ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_SINGLE_QUOTE)
+ return TRUE;
+
+ return FALSE;
+}
+
/* Finalize the collected TR29 token into *token_r and reset state.
   If a WB6/7 or WB11/12 rule kept a trailing separator that turned out
   to be a false positive, delete that last (possibly multibyte)
   character; if the token was truncated at max_length, drop a trailing
   partial UTF-8 sequence instead. */
static void
fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
					 const char **token_r)
{
	const unsigned char *data = tok->token->data;
	size_t len = tok->token->used;

	if (is_one_past_end(tok) &&
	    tok->untruncated_length <= tok->max_length) {
		/* delete the last character; back up over any UTF-8
		   continuation bytes first. NOTE(review): presumes
		   len > 0 on entry - the callers skip non-token chars
		   at the token start, so the buffer should never be
		   empty here; verify if changing the callers. */
		while (!UTF8_IS_START_SEQ(data[len-1]))
			len--;
		i_assert(len > 0);
		len--;
	} else if (tok->untruncated_length > tok->max_length) {
		fts_tokenizer_delete_trailing_partial_char(data, &len);
	}
	/* we're skipping all non-token chars at the beginning of the word,
	   so by this point we must have something here - even if we just
	   deleted the last character */
	i_assert(len > 0);
	i_assert(len <= tok->max_length);

	tok->prev_prev_type = LETTER_TYPE_NONE;
	tok->prev_type = LETTER_TYPE_NONE;
	*token_r = t_strndup(data, len);
	buffer_set_used_size(tok->token, 0);
	tok->untruncated_length = 0;
}
+
/* Undo side effects of a WB5a split: the character that followed the
   break was already consumed, so re-encode it as UTF-8 and insert it
   at the start of the token buffer, making it the new "previous"
   character; then clear the WB5a bookkeeping. */
static void wb5a_reinsert(struct generic_fts_tokenizer *tok)
{
	string_t *utf8_str = t_str_new(6);

	uni_ucs4_to_utf8_c(tok->letter, utf8_str);
	buffer_insert(tok->token, 0, str_data(utf8_str), str_len(utf8_str));
	tok->prev_type = letter_type(tok->letter);
	tok->letter = 0;
	tok->prev_letter = 0;
	tok->seen_wb5a = FALSE;
}
+
/* Per-letter-type boundary handler; returns TRUE when a word boundary
   precedes the current character. */
struct letter_fn {
	bool (*fn)(struct generic_fts_tokenizer *tok);
};
/* Indexed by enum letter_type. NOTE(review): the entry order must stay
   in sync with the enum in the private header - verify when adding or
   reordering letter types. */
static struct letter_fn letter_fns[] = {
	{letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
	{letter_cr_lf_newline}, {letter_extend_format},
	{letter_regional_indicator}, {letter_extend_format},
	{letter_katakana}, {letter_hebrew}, {letter_aletter},
	{letter_single_quote}, {letter_double_quote},
	{letter_midnumlet}, {letter_midletter}, {letter_midnum},
	{letter_numeric}, {letter_extendnumlet}, {letter_panic},
	{letter_panic}, {letter_apostrophe}, {letter_prefixsplat},
	{letter_other}
};
+
+/*
+ Find word boundaries in input text. Based on Unicode standard annex
+ #29, but tailored for FTS purposes.
+ http://www.unicode.org/reports/tr29/
+
+ Note: The text of tr29 is a living standard, so it keeps
+ changing. In newer specs some characters are combined, like AHLetter
+ (ALetter | Hebrew_Letter) and MidNumLetQ (MidNumLet | Single_Quote).
+
+ Adaptions:
+ * Added optional WB5a as a configurable option. The cut of prefix is
+ max FTS_WB5A_PREFIX chars.
+ * No word boundary at Start-Of-Text or End-of-Text (Wb1 and WB2).
+ * Break just once, not before and after.
+ * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
+ * Other things also (e.g. is_nontoken(), not really pure tr29. Meant
+ to assist in finding individual words.
+*/
/* Dispatch lt to its boundary handler. Returns TRUE if a word boundary
   precedes the current character; otherwise records lt in the type
   history (except Extend/Format, which WB4 ignores entirely). */
static bool
uni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
{
	/* No rule knows what to do with just one char, except the linebreaks
	   we eat away (above) anyway. */
	if (tok->prev_type != LETTER_TYPE_NONE) {
		if (letter_fns[lt].fn(tok))
			return TRUE;
	}

	if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) {
		/* These types are completely ignored. */
	} else {
		add_prev_type(tok,lt);
	}
	return FALSE;
}
+
/* v.next for the TR29 algorithm: find the next word boundary per the
   tailored UAX #29 rules. Returns 1 with a token in *token_r and the
   consumed byte count in *skip_r, or 0 when more input is needed.
   A size==0 call flushes the final pending token. */
static int
fts_tokenizer_generic_tr29_next(struct fts_tokenizer *_tok,
				const unsigned char *data, size_t size,
				size_t *skip_r, const char **token_r,
				const char **error_r ATTR_UNUSED)
{
	struct generic_fts_tokenizer *tok =
		container_of(_tok, struct generic_fts_tokenizer, tokenizer);
	unichar_t c;
	size_t i, char_start_i, start_pos;
	enum letter_type lt;
	int char_size;

	/* only try to skip base64 when not mid-token */
	start_pos = tok->token->used > 0 ? 0 : skip_base64(data, size);
	for (i = start_pos; i < size; ) {
		char_start_i = i;
		char_size = uni_utf8_get_char_n(data + i, size - i, &c);
		i_assert(char_size > 0);
		i += char_size;
		lt = letter_type(c);

		/* The WB5a break is detected only when the "after
		   break" char is inspected. That char needs to be
		   reinserted as the "previous char". */
		if (tok->seen_wb5a)
			wb5a_reinsert(tok);

		if (tok->prev_type == LETTER_TYPE_NONE && is_nontoken(lt)) {
			/* Skip non-token chars at the beginning of token */
			i_assert(tok->token->used == 0);
			start_pos = i;
			continue;
		}

		/* track characters for WB5a only while the prefix is
		   still short enough to matter */
		if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
			add_letter(tok, c);

		if (uni_found_word_boundary(tok, lt)) {
			i_assert(char_start_i >= start_pos && size >= start_pos);
			tok_append_truncated(tok, data + start_pos,
					     char_start_i - start_pos);
			if (lt == LETTER_TYPE_PREFIXSPLAT && tok->prefixsplat) {
				/* keep the '*' marker in prefix queries */
				const unsigned char prefix_char = FTS_PREFIX_SPLAT_CHAR;
				tok_append_truncated(tok, &prefix_char, 1);
			}
			*skip_r = i;
			fts_tokenizer_generic_tr29_current_token(tok, token_r);
			return 1;
		} else if (lt == LETTER_TYPE_APOSTROPHE ||
			   lt == LETTER_TYPE_SINGLE_QUOTE) {
			/* all apostrophes require special handling:
			   normalized to ASCII U+0027 */
			const unsigned char apostrophe_char = '\'';
			tok_append_truncated(tok, data + start_pos,
					     char_start_i - start_pos);
			tok_append_truncated(tok, &apostrophe_char, 1);
			start_pos = i;
		}
	}
	i_assert(i >= start_pos && size >= start_pos);
	if (i > start_pos)
		tok_append_truncated(tok, data + start_pos, i - start_pos);
	*skip_r = i;

	if (size == 0 && tok->token->used > 0) {
		/* return the last token */
		*skip_r = 0;
		fts_tokenizer_generic_tr29_current_token(tok, token_r);
		return 1;
	}
	return 0;
}
+
/* Never called: create() installs either the simple or the tr29 vtable
   before any input is fed, so this placeholder next() is unreachable. */
static int
fts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
			   const unsigned char *data ATTR_UNUSED,
			   size_t size ATTR_UNUSED,
			   size_t *skip_r ATTR_UNUSED,
			   const char **token_r ATTR_UNUSED,
			   const char **error_r ATTR_UNUSED)
{
	i_unreached();
}
+
/* Base vtable referenced by the exported tokenizer definition; its
   next() is a placeholder that create() replaces with one of the
   algorithm-specific vtables below. */
static const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_reset,
	fts_tokenizer_generic_next
};

/* Public "generic" tokenizer definition. */
static const struct fts_tokenizer fts_tokenizer_generic_real = {
	.name = "generic",
	.v = &generic_tokenizer_vfuncs
};
const struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;

/* Algorithm-specific vtables installed by create(). */
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_reset,
	fts_tokenizer_generic_simple_next
};
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_reset,
	fts_tokenizer_generic_tr29_next
};