diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/elasticlunr-rs/src/lang/zh.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1. (ref: upstream/1.64.0+dfsg1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/elasticlunr-rs/src/lang/zh.rs')
-rw-r--r-- | vendor/elasticlunr-rs/src/lang/zh.rs | 55 |
1 files changed, 55 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lang/zh.rs b/vendor/elasticlunr-rs/src/lang/zh.rs new file mode 100644 index 000000000..aa10d758f --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/zh.rs @@ -0,0 +1,55 @@ +use super::{common::RegexTrimmer, Language}; +use crate::pipeline::{FnWrapper, Pipeline}; + +#[derive(Clone)] +pub struct Chinese { + jieba: jieba_rs::Jieba, +} + +impl Chinese { + pub fn new() -> Self { + Self { + jieba: jieba_rs::Jieba::new(), + } + } +} + +impl Language for Chinese { + fn name(&self) -> String { + "Chinese".into() + } + fn code(&self) -> String { + "zh".into() + } + + fn tokenize(&self, text: &str) -> Vec<String> { + self.jieba + .cut_for_search(text, false) + .iter() + .map(|s| s.to_string()) + .collect() + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-zh", r"\p{Unified_Ideograph}\p{Latin}")), + Box::new(FnWrapper("stopWordFilter-zh".into(), stop_word_filter)), + Box::new(FnWrapper("stemmer-zh".into(), stemmer)), + ], + } + } +} + +// TODO: lunr.zh.js has a much larger set of stop words +fn stop_word_filter(token: String) -> Option<String> { + match token.as_str() { + "的" | "了" => None, + _ => Some(token), + } +} + +// lunr.zh.js has an empty stemmer as well +fn stemmer(token: String) -> Option<String> { + Some(token) +} |