summaryrefslogtreecommitdiffstats
path: root/vendor/elasticlunr-rs/src/lang/zh.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/elasticlunr-rs/src/lang/zh.rs')
-rw-r--r--vendor/elasticlunr-rs/src/lang/zh.rs55
1 files changed, 55 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lang/zh.rs b/vendor/elasticlunr-rs/src/lang/zh.rs
new file mode 100644
index 000000000..aa10d758f
--- /dev/null
+++ b/vendor/elasticlunr-rs/src/lang/zh.rs
@@ -0,0 +1,55 @@
+use super::{common::RegexTrimmer, Language};
+use crate::pipeline::{FnWrapper, Pipeline};
+
+#[derive(Clone)]
+pub struct Chinese {
+ jieba: jieba_rs::Jieba,
+}
+
+impl Chinese {
+ pub fn new() -> Self {
+ Self {
+ jieba: jieba_rs::Jieba::new(),
+ }
+ }
+}
+
+impl Language for Chinese {
+ fn name(&self) -> String {
+ "Chinese".into()
+ }
+ fn code(&self) -> String {
+ "zh".into()
+ }
+
+ fn tokenize(&self, text: &str) -> Vec<String> {
+ self.jieba
+ .cut_for_search(text, false)
+ .iter()
+ .map(|s| s.to_string())
+ .collect()
+ }
+
+ fn make_pipeline(&self) -> Pipeline {
+ Pipeline {
+ queue: vec![
+ Box::new(RegexTrimmer::new("trimmer-zh", r"\p{Unified_Ideograph}\p{Latin}")),
+ Box::new(FnWrapper("stopWordFilter-zh".into(), stop_word_filter)),
+ Box::new(FnWrapper("stemmer-zh".into(), stemmer)),
+ ],
+ }
+ }
+}
+
+// TODO: lunr.zh.js has a much larger set of stop words
+fn stop_word_filter(token: String) -> Option<String> {
+ match token.as_str() {
+ "的" | "了" => None,
+ _ => Some(token),
+ }
+}
+
+// lunr.zh.js has an empty stemmer as well
+fn stemmer(token: String) -> Option<String> {
+ Some(token)
+}