1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
use crate::pipeline::PipelineFn;
use regex::Regex;
use std::collections::HashSet;
/// Pipeline stage that discards tokens contained in a fixed set of stop words.
///
/// Matching is an exact `HashSet` lookup on the token as given; this type does
/// no case folding or other normalisation of its own.
#[derive(Clone)]
pub struct StopWordFilter {
// Identifier reported through `PipelineFn::name`.
name: String,
// Owned copies of the words that `filter` rejects.
stop_words: HashSet<String>,
}
impl StopWordFilter {
    /// Builds a filter called `name` that rejects every word in `stop_words`.
    ///
    /// Each word is copied into an owned `String`; duplicate entries collapse
    /// because the words are stored in a set.
    pub fn new(name: &str, stop_words: &[&str]) -> Self {
        let stop_words = stop_words.iter().copied().map(String::from).collect();
        Self {
            name: name.to_owned(),
            stop_words,
        }
    }
}
impl PipelineFn for StopWordFilter {
    /// Returns an owned copy of the name supplied at construction.
    fn name(&self) -> String {
        self.name.as_str().to_owned()
    }

    /// Yields `None` when `token` is one of the stop words; otherwise hands the
    /// token back unchanged.
    fn filter(&self, token: String) -> Option<String> {
        Some(token).filter(|candidate| !self.stop_words.contains(candidate))
    }
}
/// Pipeline stage that strips unwanted characters from both ends of a token
/// using a precompiled regular expression.
#[derive(Clone)]
pub struct RegexTrimmer {
// Identifier reported through `PipelineFn::name`.
name: String,
// Compiled pattern whose matches are removed from each token.
trimmer: Regex,
}
impl RegexTrimmer {
    /// Builds a trimmer called `name` that removes leading and trailing runs of
    /// characters *not* described by `word_chars`.
    ///
    /// `word_chars` is spliced verbatim into a negated character class
    /// (`[^...]`), so it must be valid inside one.
    ///
    /// # Panics
    ///
    /// Panics if the resulting pattern is not a valid regular expression.
    pub fn new(name: &str, word_chars: &str) -> Self {
        let pattern = format!("^[^{0}]+|[^{0}]+$", word_chars);
        Self {
            name: name.to_owned(),
            trimmer: Regex::new(&pattern).unwrap(),
        }
    }
}
impl PipelineFn for RegexTrimmer {
    /// Returns an owned copy of the name supplied at construction.
    fn name(&self) -> String {
        self.name.as_str().to_owned()
    }

    /// Removes every match of the trimming pattern from `token`.
    ///
    /// Returns `None` when nothing is left, the original `String` when the
    /// pattern changed nothing, and a newly owned trimmed string otherwise.
    fn filter(&self, token: String) -> Option<String> {
        let trimmed = self.trimmer.replace_all(&token, "");
        if trimmed.is_empty() {
            return None;
        }
        if trimmed == token {
            // Nothing was stripped: reuse the caller's allocation.
            return Some(token);
        }
        Some(trimmed.into_owned())
    }
}
/// Pipeline stage that stems tokens with a `rust_stemmers::Stemmer`.
///
/// Only compiled when the `rust-stemmers` feature is enabled.
#[cfg(feature = "rust-stemmers")]
pub struct RustStemmer {
// Identifier reported through `PipelineFn::name`.
name: String,
// Stemmer instance created for the algorithm passed to `new`.
stemmer: rust_stemmers::Stemmer,
}
#[cfg(feature = "rust-stemmers")]
impl RustStemmer {
    /// Builds a stemming stage called `name` backed by a stemmer created for
    /// `algo`.
    pub fn new(name: &str, algo: rust_stemmers::Algorithm) -> Self {
        let stemmer = rust_stemmers::Stemmer::create(algo);
        Self {
            name: name.to_owned(),
            stemmer,
        }
    }
}
#[cfg(feature = "rust-stemmers")]
impl PipelineFn for RustStemmer {
    /// Returns an owned copy of the name supplied at construction.
    fn name(&self) -> String {
        self.name.as_str().to_owned()
    }

    /// Runs `token` through the wrapped stemmer.
    ///
    /// Returns `None` when the stem is empty, the original `String` when
    /// stemming changed nothing, and a newly owned stem otherwise.
    fn filter(&self, token: String) -> Option<String> {
        let stem = self.stemmer.stem(&token);
        if stem.is_empty() {
            return None;
        }
        if stem == token {
            // Stem equals the input: reuse the caller's allocation.
            return Some(token);
        }
        Some(stem.into_owned())
    }
}
|