diff options
Diffstat (limited to 'src/lib-fts/fts-tokenizer-common.c')
-rw-r--r-- | src/lib-fts/fts-tokenizer-common.c | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/src/lib-fts/fts-tokenizer-common.c b/src/lib-fts/fts-tokenizer-common.c new file mode 100644 index 0000000..2763cdf --- /dev/null +++ b/src/lib-fts/fts-tokenizer-common.c @@ -0,0 +1,35 @@ +/* Copyright (c) 2016-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "unichar.h" +#include "fts-tokenizer-common.h" +void +fts_tokenizer_delete_trailing_partial_char(const unsigned char *data, + size_t *len) +{ + size_t pos; + unsigned int char_bytes; + + /* the token is truncated - make sure the last character + exists entirely in the token */ + for (pos = *len-1; pos > 0; pos--) { + if (UTF8_IS_START_SEQ(data[pos])) + break; + } + char_bytes = uni_utf8_char_bytes(data[pos]); + if (char_bytes != *len-pos) { + i_assert(char_bytes > *len-pos); + *len = pos; + } +} +void fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data, + size_t *len) +{ + size_t pos = *len; + + /* the token may contain '.' in the end - remove all of them. */ + while (pos > 0 && + (data[pos-1] == '.' || data[pos-1] == '-')) + pos--; + *len = pos; +} |