diff options
Diffstat (limited to 'vendor/elasticlunr-rs/src/lang/en.rs')
-rw-r--r-- | vendor/elasticlunr-rs/src/lang/en.rs | 458 |
1 files changed, 458 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lang/en.rs b/vendor/elasticlunr-rs/src/lang/en.rs new file mode 100644 index 000000000..f133ed7c9 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/en.rs @@ -0,0 +1,458 @@ +use super::{common::StopWordFilter, Language}; +use crate::pipeline::{FnWrapper, Pipeline, PipelineFn}; +use regex::Regex; + +const WORDS: &[&str] = &[ + "", "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", + "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", + "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", + "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", + "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", + "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", + "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", + "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", + "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", + "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", +]; + +#[derive(Clone)] +pub struct English { + stemmer: Stemmer, +} + +impl English { + pub fn new() -> Self { + let stemmer = Stemmer::new(); + Self { stemmer } + } +} + +impl Language for English { + fn name(&self) -> String { + "English".into() + } + fn code(&self) -> String { + "en".into() + } + + fn tokenize(&self, text: &str) -> Vec<String> { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(FnWrapper("trimmer".into(), trimmer)), + Box::new(StopWordFilter::new("stopWordFilter", WORDS)), + Box::new(self.stemmer.clone()), + ], + } + } +} + +fn trimmer(token: String) -> Option<String> { + Some( + token + .trim_matches(|c: char| !c.is_digit(36) && c != '_') + .into(), + ) +} + +static STEP_2: &[(&str, &str)] = &[ + ("ational", "ate"), + ("tional", "tion"), + ("enci", "ence"), + ("anci", "ance"), + ("izer", "ize"), + ("bli", "ble"), + ("alli", "al"), + ("entli", "ent"), + ("eli", "e"), + ("ousli", "ous"), + ("ization", "ize"), + ("ation", "ate"), + ("ator", "ate"), + ("alism", "al"), + ("iveness", "ive"), + ("fulness", "ful"), + ("ousness", "ous"), + ("aliti", "al"), + ("iviti", "ive"), + ("biliti", "ble"), + ("logi", "log"), +]; + +static STEP_3: &[(&str, &str)] = &[ + ("icate", "ic"), + ("ative", ""), + ("alize", "al"), + ("iciti", "ic"), + ("ical", "ic"), + ("ful", ""), + ("ness", ""), +]; + +// This is a direct port of the stemmer from elasticlunr.js +// It's not very efficient and very not-rusty, but it +// generates identical output. + +#[derive(Clone)] +struct Stemmer { + re_mgr0: Regex, + re_mgr1: Regex, + re_meq1: Regex, + re_s_v: Regex, + + re_1a: Regex, + re2_1a: Regex, + re_1b: Regex, + re2_1b: Regex, + re2_1b_2: Regex, + re3_1b_2: Regex, + re4_1b_2: Regex, + + re_1c: Regex, + re_2: Regex, + + re_3: Regex, + + re_4: Regex, + re2_4: Regex, + + re_5: Regex, + re3_5: Regex, +} + +impl PipelineFn for Stemmer { + fn name(&self) -> String { + "stemmer".into() + } + + fn filter(&self, token: String) -> Option<String> { + Some(self.stem(token)) + } +} + +// vowel +macro_rules! V { + () => { + "[aeiouy]" + }; +} + +// consonant sequence +macro_rules! CS { + () => { + "[^aeiou][^aeiouy]*" + }; +} + +// vowel sequence +macro_rules! VS { + () => { + "[aeiouy][aeiou]*" + }; +} + +#[inline] +fn concat_string(strs: &[&str]) -> String { + strs.iter().cloned().collect() +} + +impl Stemmer { + fn new() -> Self { + let mgr0 = concat!("^(", CS!(), ")?", VS!(), CS!()); + let meq1 = concat!("^(", CS!(), ")?", VS!(), CS!(), "(", VS!(), ")?$"); + let mgr1 = concat!("^(", CS!(), ")?", VS!(), CS!(), VS!(), CS!()); + let s_v = concat!("^(", CS!(), ")?", V!()); + + let re_mgr0 = Regex::new(mgr0).unwrap(); + let re_mgr1 = Regex::new(mgr1).unwrap(); + let re_meq1 = Regex::new(meq1).unwrap(); + let re_s_v = Regex::new(s_v).unwrap(); + + let re_1a = Regex::new("^(.+?)(ss|i)es$").unwrap(); + let re2_1a = Regex::new("^(.+?)([^s])s$").unwrap(); + let re_1b = Regex::new("^(.+?)eed$").unwrap(); + let re2_1b = Regex::new("^(.+?)(ed|ing)$").unwrap(); + let re2_1b_2 = Regex::new("(at|bl|iz)$").unwrap(); + let re3_1b_2 = Regex::new("([^aeiouylsz]{2})$").unwrap(); + let re4_1b_2 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap(); + + let re_1c = Regex::new("^(.+?[^aeiou])y$").unwrap(); + let re_2 = Regex::new( + "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\ + ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$", + ) + .unwrap(); + + let re_3 = Regex::new("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$").unwrap(); + + let re_4 = Regex::new( + "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$", + ) + .unwrap(); + let re2_4 = Regex::new("^(.+?)(s|t)(ion)$").unwrap(); + + let re_5 = Regex::new("^(.+?)e$").unwrap(); + let re3_5 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap(); + + Stemmer { + re_mgr0, + re_mgr1, + re_meq1, + re_s_v, + re_1a, + re2_1a, + re_1b, + re2_1b, + re2_1b_2, + re3_1b_2, + re4_1b_2, + re_1c, + re_2, + re_3, + re_4, + re2_4, + re_5, + re3_5, + } + } + + /// Implements the Porter stemming algorithm + pub fn stem(&self, mut w: String) -> String { + if w.len() < 3 { + return w; + } + + let starts_with_y = w.as_bytes()[0] == b'y'; + if starts_with_y { + w.remove(0); + w.insert(0, 'Y'); + } + + // TODO: There's probably a better way to handle the + // borrowchecker than cloning w a million times + + // Step 1a + if let Some(caps) = self.re_1a.captures(&w.clone()) { + w = concat_string(&[&caps[1], &caps[2]]); + } + if let Some(caps) = self.re2_1a.captures(&w.clone()) { + w = concat_string(&[&caps[1], &caps[2]]); + } + + // Step 1b + if let Some(caps) = self.re_1b.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_mgr0.is_match(stem) { + w.pop(); + } + } else if let Some(caps) = self.re2_1b.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_s_v.is_match(stem) { + w = stem.into(); + + let mut re3_1b_2_matched = false; + + if self.re2_1b_2.is_match(&w) { + w.push('e'); + } else if let Some(m) = self.re3_1b_2.find(&w.clone()) { + let mut suffix = m.as_str().chars(); + // Make sure the two characters are the same since we can't use backreferences + if suffix.next() == suffix.next() { + re3_1b_2_matched = true; + w.pop(); + } + } + + // re4_1b_2 still runs if re3_1b_2 matches but + // the matched chcaracters are not the same + if !re3_1b_2_matched && self.re4_1b_2.is_match(&w) { + w.push('e'); + } + } + } + + // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first + // letter of the word (so cry -> cri, by -> by, say -> say) + if let Some(caps) = self.re_1c.captures(&w.clone()) { + let stem = &caps[1]; + w = concat_string(&[stem, "i"]); + } + + // Step 2 + if let Some(caps) = self.re_2.captures(&w.clone()) { + let stem = &caps[1]; + let suffix = &caps[2]; + if self.re_mgr0.is_match(stem) { + w = concat_string(&[stem, STEP_2.iter().find(|&&(k, _)| k == suffix).unwrap().1]); + } + } + + // Step 3 + if let Some(caps) = self.re_3.captures(&w.clone()) { + let stem = &caps[1]; + let suffix = &caps[2]; + if self.re_mgr0.is_match(stem) { + w = concat_string(&[stem, STEP_3.iter().find(|&&(k, _)| k == suffix).unwrap().1]); + } + } + + // Step 4 + if let Some(caps) = self.re_4.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_mgr1.is_match(stem) { + w = stem.into(); + } + } else if let Some(caps) = self.re2_4.captures(&w.clone()) { + let stem = concat_string(&[&caps[1], &caps[2]]); + if self.re_mgr1.is_match(&stem) { + w = stem; + } + } + + // Step 5 + if let Some(caps) = self.re_5.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_mgr1.is_match(stem) + || (self.re_meq1.is_match(stem) && !(self.re3_5.is_match(stem))) + { + w = stem.into(); + } + } + + if w.ends_with("ll") && self.re_mgr1.is_match(&w) { + w.pop(); + } + + // replace the original 'y' + if starts_with_y { + w.remove(0); + w.insert(0, 'y'); + } + + w + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! pipeline_eq { + ($func:expr, $input:expr, $output:expr) => { + assert_eq!(&$func($input.to_string()).unwrap(), $output); + }; + } + + #[test] + fn latin_characters() { + pipeline_eq!(trimmer, "hello", "hello"); + } + + #[test] + fn removing_punctuation() { + pipeline_eq!(trimmer, "hello.", "hello"); + pipeline_eq!(trimmer, "it's", "it's"); + pipeline_eq!(trimmer, "james'", "james"); + pipeline_eq!(trimmer, "stop!", "stop"); + pipeline_eq!(trimmer, "first,", "first"); + pipeline_eq!(trimmer, "", ""); + pipeline_eq!(trimmer, "[tag]", "tag"); + pipeline_eq!(trimmer, "[[[tag]]]", "tag"); + pipeline_eq!(trimmer, "[[!@#@!hello]]]}}}", "hello"); + pipeline_eq!(trimmer, "~!@@@hello***()()()]]", "hello"); + } + + #[test] + fn test_stemmer() { + let cases = [ + ("consign", "consign"), + ("consigned", "consign"), + ("consigning", "consign"), + ("consignment", "consign"), + ("consist", "consist"), + ("consisted", "consist"), + ("consistency", "consist"), + ("consistent", "consist"), + ("consistently", "consist"), + ("consisting", "consist"), + ("consists", "consist"), + ("consolation", "consol"), + ("consolations", "consol"), + ("consolatory", "consolatori"), + ("console", "consol"), + ("consoled", "consol"), + ("consoles", "consol"), + ("consolidate", "consolid"), + ("consolidated", "consolid"), + ("consolidating", "consolid"), + ("consoling", "consol"), + ("consols", "consol"), + ("consonant", "conson"), + ("consort", "consort"), + ("consorted", "consort"), + ("consorting", "consort"), + ("conspicuous", "conspicu"), + ("conspicuously", "conspicu"), + ("conspiracy", "conspiraci"), + ("conspirator", "conspir"), + ("conspirators", "conspir"), + ("conspire", "conspir"), + ("conspired", "conspir"), + ("conspiring", "conspir"), + ("constable", "constabl"), + ("constables", "constabl"), + ("constance", "constanc"), + ("constancy", "constanc"), + ("constant", "constant"), + ("knack", "knack"), + ("knackeries", "knackeri"), + ("knacks", "knack"), + ("knag", "knag"), + ("knave", "knave"), + ("knaves", "knave"), + ("knavish", "knavish"), + ("kneaded", "knead"), + ("kneading", "knead"), + ("knee", "knee"), + ("kneel", "kneel"), + ("kneeled", "kneel"), + ("kneeling", "kneel"), + ("kneels", "kneel"), + ("knees", "knee"), + ("knell", "knell"), + ("knelt", "knelt"), + ("knew", "knew"), + ("knick", "knick"), + ("knif", "knif"), + ("knife", "knife"), + ("knight", "knight"), + ("knights", "knight"), + ("knit", "knit"), + ("knits", "knit"), + ("knitted", "knit"), + ("knitting", "knit"), + ("knives", "knive"), + ("knob", "knob"), + ("knobs", "knob"), + ("knock", "knock"), + ("knocked", "knock"), + ("knocker", "knocker"), + ("knockers", "knocker"), + ("knocking", "knock"), + ("knocks", "knock"), + ("knopp", "knopp"), + ("knot", "knot"), + ("knots", "knot"), + ("lay", "lay"), + ("try", "tri"), + ]; + + let stemmer = Stemmer::new(); + for &(input, output) in cases.iter() { + assert_eq!(&stemmer.stem(input.into()), output); + } + } +} |