summaryrefslogtreecommitdiffstats
path: root/vendor/elasticlunr-rs/src/lang/ja.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
commit698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree173a775858bd501c378080a10dca74132f05bc50 /vendor/elasticlunr-rs/src/lang/ja.rs
parentInitial commit. (diff)
downloadrustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/elasticlunr-rs/src/lang/ja.rs')
-rw-r--r--vendor/elasticlunr-rs/src/lang/ja.rs76
1 file changed, 76 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lang/ja.rs b/vendor/elasticlunr-rs/src/lang/ja.rs
new file mode 100644
index 000000000..e38fcde9f
--- /dev/null
+++ b/vendor/elasticlunr-rs/src/lang/ja.rs
@@ -0,0 +1,76 @@
+use super::{common::RegexTrimmer, Language};
+use crate::pipeline::{FnWrapper, Pipeline};
+use lindera::tokenizer::{Tokenizer, TokenizerConfig};
+use lindera_core::viterbi::Mode;
+
/// A [`Language`] implementation for Japanese, backed by the Lindera
/// morphological tokenizer.
#[derive(Clone)]
pub struct Japanese {
    // Lindera tokenizer used to segment Japanese text into tokens.
    tokenizer: Tokenizer,
}
+
+impl Japanese {
+ pub fn new() -> Self {
+ let config = TokenizerConfig {
+ mode: Mode::Decompose(Default::default()),
+ ..Default::default()
+ };
+ Self::with_config(config)
+ }
+
+ pub fn with_config(config: TokenizerConfig) -> Self {
+ // NB: unwrap() is okay since the errors are only related to user-supplied dictionaries.
+ let tokenizer = Tokenizer::with_config(config).unwrap();
+ Self { tokenizer }
+ }
+}
+
+impl Language for Japanese {
+ fn name(&self) -> String {
+ "Japanese".into()
+ }
+ fn code(&self) -> String {
+ "ja".into()
+ }
+
+ fn tokenize(&self, text: &str) -> Vec<String> {
+ self.tokenizer
+ .tokenize(text)
+ .unwrap()
+ .into_iter()
+ .filter_map(|tok| match tok.detail.get(0).map(|d| d.as_str()) {
+ Some("助詞") | Some("助動詞") | Some("記号") | Some("UNK") => None,
+ _ => Some(tok.text.to_string()),
+ })
+ .collect()
+ }
+
+ fn make_pipeline(&self) -> Pipeline {
+ Pipeline {
+ queue: vec![
+ Box::new(RegexTrimmer::new("trimmer-ja", WORD_CHARS)),
+ Box::new(FnWrapper("stemmer-ja".into(), stemmer)),
+ ],
+ }
+ }
+}
+
+const WORD_CHARS: &str = r"0-9A-Za-z\p{Hiragana}\p{Katakana}\p{Unified_Ideograph}";
+
/// Identity "stemmer": Japanese tokens are not stemmed, so every token
/// passes through unchanged.
fn stemmer(token: String) -> Option<String> {
    // Always keep the token — returning None would drop it from the pipeline.
    Some(token)
}
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pipeline::PipelineFn;

    #[test]
    fn test_trimmer() {
        // The leading space and trailing "!" fall outside WORD_CHARS and are
        // trimmed from the ends; the interior "、" is kept.
        // Note: the redundant `.into()` on the name was removed — it was an
        // identity conversion, and `make_pipeline` passes the &str directly.
        let trimmer = RegexTrimmer::new("trimmer-ja", WORD_CHARS);
        assert_eq!(
            trimmer.filter(" こんにちは、世界!".to_string()),
            Some("こんにちは、世界".to_string())
        );
    }
}