summaryrefslogtreecommitdiffstats
path: root/src/lib-fts/fts-filter.h
blob: 89060b79716f8d720722b63d33b0876d47cf4ec7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#ifndef FTS_FILTER_H
#define FTS_FILTER_H

/* Forward declarations: this header only passes these types around by
   pointer, so their definitions are not needed here. */
struct fts_language;
struct fts_filter;
/*
 Settings are given in the form of a const char * const *settings =
 {"key", "value", "key2", "value2", NULL} array of string pairs.
 The array has to be NULL terminated.
*/
/*
 Stopword filter class.
 Settings: "stopwords_dir", path to the directory containing stopword files.
 Stopword files are looked up in <path>/stopwords_<lang>.txt
 */
extern const struct fts_filter *fts_filter_stopwords;

/*
 Snowball stemmer filter class.
 Settings: "lang", language of the text to be stemmed.
 */
extern const struct fts_filter *fts_filter_stemmer_snowball;

/*
 ICU normalizer filter class.
 Settings: "id", description of the normalizing/transliterating rules
 to use.  See
 http://userguide.icu-project.org/transforms/general#TOC-Transliterator-Identifiers
 for syntax. Defaults to "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC"

 "maxlen", maximum length of tokens that ICU normalizer will output.
 Defaults to 250.
 */
extern const struct fts_filter *fts_filter_normalizer_icu;

/* Lowercases the input. UTF-8 input is supported if libicu is available. */
extern const struct fts_filter *fts_filter_lowercase;

/* Removes the English possessive <'s> suffix from words. */
extern const struct fts_filter *fts_filter_english_possessive;

/* Removes prefixing contractions from words.
   NOTE(review): which contractions/languages are handled is not visible
   in this header - confirm against the implementation. */
extern const struct fts_filter *fts_filter_contractions;

/* Register all built-in filters. */
void fts_filters_init(void);
/* Counterpart of fts_filters_init().
   NOTE(review): presumably unregisters the filter classes and releases the
   registry - confirm against the implementation. */
void fts_filters_deinit(void);

/* Register a new filter class explicitly. Built-in classes are
   automatically registered, so this is only needed for additional
   classes.
   NOTE(review): presumably a registered class becomes available via
   fts_filter_find() - confirm against the implementation. */
void fts_filter_register(const struct fts_filter *filter_class);

/*
 Filtering workflow, find --> create --> filter --> destroy.
 */

/* Looks up a filter class by name.
   NOTE(review): the result for an unknown name is not documented here
   (presumably NULL) - confirm against the implementation. */
const struct fts_filter *fts_filter_find(const char *name);

/* Creates a filter instance of the given class.
     filter_class - class to instantiate.
     parent       - another filter instance. NOTE(review): presumably the
                    filter chained before/after this one, and possibly
                    NULL - confirm.
     lang         - language the filter is created for.
     settings     - NULL-terminated array of "key", "value" string pairs;
                    the supported keys are listed with each filter class
                    in this header.
     filter_r     - NOTE(review): presumably receives the created filter
                    on success - confirm.
     error_r      - NOTE(review): presumably receives an error message on
                    failure - confirm.
   NOTE(review): the meaning of the int return value is not documented
   here (presumably 0 on success, -1 on error) - confirm. */
int fts_filter_create(const struct fts_filter *filter_class,
                      struct fts_filter *parent,
                      const struct fts_language *lang,
                      const char *const *settings,
                      struct fts_filter **filter_r,
                      const char **error_r);

/* NOTE(review): presumably reference counting, where the final
   fts_filter_unref() is the "destroy" step of the workflow and clears
   *filter - confirm against the implementation. */
void fts_filter_ref(struct fts_filter *filter);
void fts_filter_unref(struct fts_filter **filter);

/* Runs one token through the filter.
   Returns 1 if token is returned in *token, 0 if token was filtered
   out (*token is also set to NULL) and -1 on error.
   Input is also given via *token.
   NOTE(review): error_r presumably receives an error message when -1 is
   returned; the lifetime of the string returned in *token is not
   documented here - confirm against the implementation.
*/
int fts_filter_filter(struct fts_filter *filter, const char **token,
		      const char **error_r);

#endif