diff options
Diffstat (limited to 'vendor/elasticlunr-rs/src/lang/ar.rs')
-rw-r--r-- | vendor/elasticlunr-rs/src/lang/ar.rs | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lang/ar.rs b/vendor/elasticlunr-rs/src/lang/ar.rs new file mode 100644 index 000000000..d0a640edf --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/ar.rs @@ -0,0 +1,66 @@ +use super::Language; +use crate::pipeline::{Pipeline, PipelineFn}; +use regex::Regex; + +/// Arabic Language +/// +/// Designed to be compatibile with the included Javascript implementation. See `js/lunr.ar.js`. +pub struct Arabic {} + +impl Arabic { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Arabic { + fn name(&self) -> String { + "Arabic".into() + } + fn code(&self) -> String { + "ar".into() + } + + fn tokenize(&self, text: &str) -> Vec<String> { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![Box::new(Stemmer::new())], + } + } +} + +struct Stemmer { + diacritics: Regex, + alefs: Regex, +} + +impl Stemmer { + pub fn new() -> Self { + let diacritics = Regex::new("[\u{0640}\u{064b}-\u{065b}]").unwrap(); + let alefs = Regex::new("[\u{0622}\u{0623}\u{0625}\u{0671}\u{0649}]").unwrap(); + Self { diacritics, alefs } + } +} + +impl PipelineFn for Stemmer { + fn name(&self) -> String { + "stemmer-ar".into() + } + + fn filter(&self, token: String) -> Option<String> { + // remove diacritics and elongating character + let result = self.diacritics.replace(&token, ""); + // replace all variations of alef (آأإٱى) to a plain alef (ا) + let result = self.alefs.replace(&result, "\u{0627}"); + if result.is_empty() { + None + } else if result == token { + Some(token) + } else { + Some(result.into()) + } + } +} |