diff options
Diffstat (limited to 'src/lib-fts/fts-tokenizer.h')
-rw-r--r-- | src/lib-fts/fts-tokenizer.h | 87 |
1 files changed, 87 insertions, 0 deletions
diff --git a/src/lib-fts/fts-tokenizer.h b/src/lib-fts/fts-tokenizer.h new file mode 100644 index 0000000..c202cf7 --- /dev/null +++ b/src/lib-fts/fts-tokenizer.h @@ -0,0 +1,87 @@ +#ifndef FTS_TOKENIZER_H +#define FTS_TOKENIZER_H + +/* + Settings are given in the form of a const char * const *settings = + {"key, "value", "key2", "value2", NULL} array of string pairs. Some + keys, like "no_parent" and "search" are a sort of boolean and the + value does not matter, just mentioning the key enables the functionality. + The array has to be NULL terminated. +*/ +/* Email address header tokenizer that returns "user@domain.org" input as + "user@domain.org" token as well as passing it through to the parent + (generic) tokenizer, which also returns "user", "domain" and "org". + This allows searching the mails with their individual components, but also + allows doing an explicit "user@domain" search, which returns only mails + matching that exact address (instead of e.g. a mail with both user@domain2 + and user2@domain words). */ +/* Settings: + "no_parent", Return only our tokens, no data for parent to process. + Defaults to disabled. Should normally not be needed. + + "search" Remove addresses from parent data stream, so they are not processed + further. Defaults to disabled. Enable by defining the keyword (and any + value). */ +extern const struct fts_tokenizer *fts_tokenizer_email_address; + +/* Generic email content tokenizer. Cuts text into tokens. */ +/* Settings: + "maxlen" Maximum length of token, before an arbitrary cut off is made. + Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH. + + "algorithm", accepted values are "simple" or "tr29". Defines the + method for looking for word boundaries. Simple is faster and will + work for many texts, especially those using latin alphabets, but + leaves corner cases. The tr29 implements a version of Unicode + technical report 29 word boundary lookup. It might work better with + e.g. texts containing Katakana or hebrew characters, but it is not + possible to use a single algorithm for all existing languages. It + is also significantly slower than simple. The algorithms also + differ in some details, e.g. simple will cut "a.b" and tr29 will + not. The default is "simple" */ +extern const struct fts_tokenizer *fts_tokenizer_generic; + +/* + Tokenizing workflow, find --> create --> filter --> destroy. + Do init before first use and deinit after all done. + */ + +/* Register all built-in tokenizers. */ +void fts_tokenizers_init(void); +void fts_tokenizers_deinit(void); + +const struct fts_tokenizer *fts_tokenizer_find(const char *name); + +/* Create a new tokenizer. The settings are described above. */ +int fts_tokenizer_create(const struct fts_tokenizer *tok_class, + struct fts_tokenizer *parent, + const char *const *settings, + struct fts_tokenizer **tokenizer_r, + const char **error_r); +void fts_tokenizer_ref(struct fts_tokenizer *tok); +void fts_tokenizer_unref(struct fts_tokenizer **tok); + +/* Reset FTS tokenizer state */ +void fts_tokenizer_reset(struct fts_tokenizer *tok); + +/* + Returns 1 if *token_r was returned, 0 if more data is needed, -1 on error. + + This function should be called with the same data+size until it + returns 0. After that fts_tokenizer_final() should be called until it + returns 0 to flush out the final token(s). + + data must contain only valid complete UTF-8 sequences, but otherwise it + may be broken into however small pieces. (Input to this function typically + comes from message-decoder, which returns only complete UTF-8 sequences.) */ + +int fts_tokenizer_next(struct fts_tokenizer *tok, + const unsigned char *data, size_t size, + const char **token_r, const char **error_r); +/* Returns same as fts_tokenizer_next(). */ +int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r, + const char **error_r); + +const char *fts_tokenizer_name(const struct fts_tokenizer *tok); + +#endif |