summaryrefslogtreecommitdiffstats
path: root/src/lib-fts/fts-tokenizer-common.c
blob: 2763cdffeb2c189ecb5f3ce4b264a83e80dc7196 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* Copyright (c) 2016-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "unichar.h"
#include "fts-tokenizer-common.h"
/* Drop an incomplete UTF-8 character from the end of a truncated token.

   data holds the token's bytes (only read here) and *len is its length in
   bytes. The bytes are scanned backwards from the end for the nearest byte
   accepted by UTF8_IS_START_SEQ(), falling back to data[0] if none is
   found. If the length uni_utf8_char_bytes() reports for that byte differs
   from the number of bytes remaining in the token, *len is reduced so the
   token ends just before that byte. An empty token is left untouched. */
void
fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
					   size_t *len)
{
	size_t pos;
	unsigned int char_bytes;

	/* nothing to trim - and *len-1 below would wrap around to SIZE_MAX
	   and read far outside the buffer */
	if (*len == 0)
		return;

	/* the token is truncated - make sure the last character
	   exists entirely in the token */
	for (pos = *len-1; pos > 0; pos--) {
		if (UTF8_IS_START_SEQ(data[pos]))
			break;
	}
	char_bytes = uni_utf8_char_bytes(data[pos]);
	if (char_bytes != *len-pos) {
		/* Having fewer bytes than the lead byte announces is the
		   expected mismatch for a truncated token.
		   NOTE(review): having more bytes than announced trips the
		   assertion; presumably callers only pass data that was
		   well-formed UTF-8 before truncation - confirm. */
		i_assert(char_bytes > *len-pos);
		*len = pos;
	}
}
/* Strip every trailing '.' and '-' byte from the token.

   data holds the token's bytes (only read here); *len is the token length
   in bytes on entry and the trimmed length on return. A token made up
   entirely of '.' and '-' bytes ends up with *len == 0. */
void fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
		   size_t *len)
{
	size_t remaining;

	/* walk back from the end until a byte that must be kept is found */
	for (remaining = *len; remaining > 0; remaining--) {
		const unsigned char last = data[remaining - 1];

		if (last != '.' && last != '-')
			break;
	}
	*len = remaining;
}