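//! Japanese `Language` implementation backed by the lindera morphological
//! tokenizer.
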
use super::{common::RegexTrimmer, Language};
use crate::pipeline::{FnWrapper, Pipeline};
use lindera::tokenizer::{Tokenizer, TokenizerConfig};
use lindera_core::viterbi::Mode;
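
/// Japanese language analyzer backed by lindera's morphological tokenizer.
///
/// A minimal usage sketch, assuming this crate is `elasticlunr` and this type
/// is re-exported as `elasticlunr::lang::Japanese` (neither is shown in this
/// file), and that a bundled dictionary is available at runtime:
///
/// ```no_run
/// use elasticlunr::lang::{Japanese, Language};
///
/// let ja = Japanese::new();
/// // Particles, auxiliary verbs, and symbols are filtered out of the result.
/// let tokens = ja.tokenize("日本語のテキストを検索する");
/// assert!(!tokens.is_empty());
/// ```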
#[derive(Clone)]
pub struct Japanese {
    tokenizer: Tokenizer,
}

impl Japanese {
    /// Creates a tokenizer with the default dictionary in decompose mode.
    pub fn new() -> Self {
        let config = TokenizerConfig {
            mode: Mode::Decompose(Default::default()),
            ..Default::default()
        };
        Self::with_config(config)
    }

    /// Creates a tokenizer from a custom lindera `TokenizerConfig`.
    pub fn with_config(config: TokenizerConfig) -> Self {
        // NB: unwrap() is okay since the errors are only related to user-supplied dictionaries.
        let tokenizer = Tokenizer::with_config(config).unwrap();
        Self { tokenizer }
    }
}

impl Language for Japanese {
    fn name(&self) -> String {
        "Japanese".into()
    }

    fn code(&self) -> String {
        "ja".into()
    }

    fn tokenize(&self, text: &str) -> Vec<String> {
        self.tokenizer
            .tokenize(text)
            .unwrap()
            .into_iter()
            // Drop tokens tagged as 助詞 (particle), 助動詞 (auxiliary verb),
            // or 記号 (symbol), as well as unknown ("UNK") tokens.
            .filter_map(|tok| match tok.detail.get(0).map(|d| d.as_str()) {
                Some("助詞") | Some("助動詞") | Some("記号") | Some("UNK") => None,
                _ => Some(tok.text.to_string()),
            })
            .collect()
    }

    fn make_pipeline(&self) -> Pipeline {
        Pipeline {
            queue: vec![
                Box::new(RegexTrimmer::new("trimmer-ja", WORD_CHARS)),
                Box::new(FnWrapper("stemmer-ja".into(), stemmer)),
            ],
        }
    }
}

/// Characters the trimmer keeps as word characters: ASCII alphanumerics,
/// hiragana, katakana, and unified CJK ideographs (kanji).
const WORD_CHARS: &str = r"0-9A-Za-z\p{Hiragana}\p{Katakana}\p{Unified_Ideograph}";

/// The stemmer is a no-op: every token passes through unchanged.
fn stemmer(token: String) -> Option<String> {
    Some(token)
}

#[cfg(test)]
mod tests {
    use crate::pipeline::PipelineFn;
    use super::*;

    #[test]
    fn test_trimmer() {
        // Pass the name as in `make_pipeline`; the redundant `.into()` on the
        // string literal is dropped for consistency.
        let trimmer = RegexTrimmer::new("trimmer-ja", WORD_CHARS);
        assert_eq!(
            trimmer.filter(" こんにちは、世界!".to_string()),
            Some("こんにちは、世界".to_string())
        );
    }
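
    // A hedged sketch rather than a guaranteed test: it assumes the default
    // `TokenizerConfig` resolves to a bundled lindera dictionary, so that
    // `Japanese::new()` succeeds without user-supplied dictionary files.
    #[test]
    fn test_tokenize_filters_function_words() {
        let ja = Japanese::new();
        let tokens = ja.tokenize("東京は日本の首都です");
        // The particle "は" and the auxiliary verb "です" should be filtered out.
        assert!(!tokens.contains(&"は".to_string()));
        assert!(!tokens.contains(&"です".to_string()));
    }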
}