summary | refs | log | tree | commit | diff | stats
path: root/src/lib-fts/fts-tokenizer-common.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib-fts/fts-tokenizer-common.c')
-rw-r--r-- src/lib-fts/fts-tokenizer-common.c 35
1 files changed, 35 insertions, 0 deletions
diff --git a/src/lib-fts/fts-tokenizer-common.c b/src/lib-fts/fts-tokenizer-common.c
new file mode 100644
index 0000000..2763cdf
--- /dev/null
+++ b/src/lib-fts/fts-tokenizer-common.c
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "fts-tokenizer-common.h"
+/* Shrink *len so that data[0..*len) does not end in the middle of a
+   multi-byte character.
+
+   data: token bytes; only read, never modified.
+   len:  in/out - number of bytes in data. Left untouched when the last
+         character is complete (or the token is empty), otherwise set to
+         the offset of the incomplete character's first byte.
+
+   NOTE(review): presumably uni_utf8_char_bytes() returns the full encoded
+   length announced by a lead byte, and callers pass valid UTF-8 that was
+   merely cut short - confirm against unichar.h and the call sites. */
+void
+fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
+					   size_t *len)
+{
+	size_t pos;
+	unsigned int char_bytes;
+
+	/* an empty token has nothing to trim; returning here also keeps
+	   *len-1 below from wrapping around to SIZE_MAX */
+	if (*len == 0)
+		return;
+
+	/* the token is truncated - make sure the last character
+	   exists entirely in the token. scan backwards for the byte that
+	   starts the last character; stops at offset 0 if none is found. */
+	for (pos = *len-1; pos > 0; pos--) {
+		if (UTF8_IS_START_SEQ(data[pos]))
+			break;
+	}
+	char_bytes = uni_utf8_char_bytes(data[pos]);
+	if (char_bytes != *len-pos) {
+		/* fewer bytes are present than the character needs. having
+		   more bytes than needed is treated as a programming error. */
+		i_assert(char_bytes > *len-pos);
+		*len = pos;
+	}
+}
+/* Drop every trailing '.' and '-' byte from the token by shrinking *len.
+
+   data: token bytes; only read, never modified.
+   len:  in/out - number of bytes in data. On return it is the length of
+         the token without its trailing run of '.' / '-' bytes, which is
+         0 when the token consists solely of those bytes. */
+void fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
+						size_t *len)
+{
+	size_t remaining;
+
+	/* walk backwards until the last kept byte is neither '.' nor '-' */
+	for (remaining = *len; remaining > 0; remaining--) {
+		unsigned char last = data[remaining - 1];
+
+		if (last != '.' && last != '-')
+			break;
+	}
+	*len = remaining;
+}