Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
commit: 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree: 173a775858bd501c378080a10dca74132f05bc50 /vendor/elasticlunr-rs/src/lang/en.rs
parent: Initial commit. (diff)
download: rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
1 files changed, 458 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lang/en.rs b/vendor/elasticlunr-rs/src/lang/en.rs
new file mode 100644
index 000000000..f133ed7c9
--- /dev/null
+++ b/vendor/elasticlunr-rs/src/lang/en.rs
@@ -0,0 +1,458 @@
+use super::{common::StopWordFilter, Language};
+use crate::pipeline::{FnWrapper, Pipeline, PipelineFn};
+use regex::Regex;
+
+/// English stop words handed to the `StopWordFilter` stage in
+/// `English::make_pipeline`.
+///
+/// The list also contains the empty string — presumably so that tokens the
+/// trimmer reduces to nothing are dropped; confirm against `StopWordFilter`.
+const WORDS: &[&str] = &[
+    "", "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an",
+    "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot",
+    "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get",
+    "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if",
+    "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
+    "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on",
+    "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since",
+    "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this",
+    "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where",
+    "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your",
+];
+
+/// English language support: whitespace tokenization plus a
+/// trimmer / stop-word / stemmer pipeline.
+#[derive(Clone)]
+pub struct English {
+    // Holds the pre-compiled stemmer; it is cloned into every pipeline
+    // produced by `make_pipeline`.
+    stemmer: Stemmer,
+}
+
+impl English {
+    /// Creates the English language definition, compiling the stemmer's
+    /// regular expressions once up front.
+    pub fn new() -> Self {
+        let stemmer = Stemmer::new();
+        Self { stemmer }
+    }
+}
+
+impl Default for English {
+    /// Equivalent to [`English::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Language for English {
+    /// Returns the display name, `"English"`.
+    fn name(&self) -> String {
+        String::from("English")
+    }
+
+    /// Returns the language code, `"en"`.
+    fn code(&self) -> String {
+        String::from("en")
+    }
+
+    /// Tokenizes `text` by delegating to the shared `tokenize_whitespace` helper.
+    fn tokenize(&self, text: &str) -> Vec<String> {
+        super::tokenize_whitespace(text)
+    }
+
+    /// Builds the token pipeline: trimmer, then stop-word filter, then stemmer.
+    fn make_pipeline(&self) -> Pipeline {
+        let trim_stage = FnWrapper("trimmer".into(), trimmer);
+        let stop_word_stage = StopWordFilter::new("stopWordFilter", WORDS);
+        let stem_stage = self.stemmer.clone();
+
+        Pipeline {
+            queue: vec![
+                Box::new(trim_stage),
+                Box::new(stop_word_stage),
+                Box::new(stem_stage),
+            ],
+        }
+    }
+}
+
+/// Strips leading and trailing characters that are neither ASCII
+/// alphanumerics nor `_`. Characters inside the token are left alone, and the
+/// result is always `Some`, even when nothing remains.
+fn trimmer(token: String) -> Option<String> {
+    let is_word_char = |c: char| c.is_ascii_alphanumeric() || c == '_';
+    let trimmed = token.trim_matches(|c: char| !is_word_char(c));
+    Some(trimmed.to_owned())
+}
+
+/// Step 2 suffix table: `(matched suffix, replacement)`.
+///
+/// `Stemmer::stem` looks up the suffix captured by `re_2` here and unwraps the
+/// result, so every alternative in that regex must have an entry in this table.
+static STEP_2: &[(&str, &str)] = &[
+    ("ational", "ate"),
+    ("tional", "tion"),
+    ("enci", "ence"),
+    ("anci", "ance"),
+    ("izer", "ize"),
+    ("bli", "ble"),
+    ("alli", "al"),
+    ("entli", "ent"),
+    ("eli", "e"),
+    ("ousli", "ous"),
+    ("ization", "ize"),
+    ("ation", "ate"),
+    ("ator", "ate"),
+    ("alism", "al"),
+    ("iveness", "ive"),
+    ("fulness", "ful"),
+    ("ousness", "ous"),
+    ("aliti", "al"),
+    ("iviti", "ive"),
+    ("biliti", "ble"),
+    ("logi", "log"),
+];
+
+/// Step 3 suffix table: `(matched suffix, replacement)`.
+///
+/// `Stemmer::stem` looks up the suffix captured by `re_3` here and unwraps the
+/// result, so every alternative in that regex must have an entry in this table.
+/// An empty replacement removes the suffix outright.
+static STEP_3: &[(&str, &str)] = &[
+    ("icate", "ic"),
+    ("ative", ""),
+    ("alize", "al"),
+    ("iciti", "ic"),
+    ("ical", "ic"),
+    ("ful", ""),
+    ("ness", ""),
+];
+
+// This is a direct port of the stemmer from elasticlunr.js
+// It's not very efficient and very not-rusty, but it
+// generates identical output.
+
+/// Holds every regular expression the stemmer needs, compiled once in
+/// `Stemmer::new` and reused for each token.
+#[derive(Clone)]
+struct Stemmer {
+    // Conditions tested against a candidate stem before a suffix is rewritten.
+    re_mgr0: Regex,
+    re_mgr1: Regex,
+    re_meq1: Regex,
+    re_s_v: Regex,
+
+    // Steps 1a and 1b: "-es"/"-s" and "-eed"/"-ed"/"-ing" endings, plus the
+    // clean-up patterns applied after "-ed"/"-ing" is removed.
+    re_1a: Regex,
+    re2_1a: Regex,
+    re_1b: Regex,
+    re2_1b: Regex,
+    re2_1b_2: Regex,
+    re3_1b_2: Regex,
+    re4_1b_2: Regex,
+
+    // Step 1c (trailing "y") and step 2 (suffixes listed in `STEP_2`).
+    re_1c: Regex,
+    re_2: Regex,
+
+    // Step 3 (suffixes listed in `STEP_3`).
+    re_3: Regex,
+
+    // Step 4: suffixes that are removed outright, and "-sion"/"-tion".
+    re_4: Regex,
+    re2_4: Regex,
+
+    // Step 5: trailing "e".
+    re_5: Regex,
+    re3_5: Regex,
+}
+
+impl PipelineFn for Stemmer {
+    /// Returns the stage name, `"stemmer"`.
+    fn name(&self) -> String {
+        String::from("stemmer")
+    }
+
+    /// Stems `token`. The result is always `Some`, so this stage never drops
+    /// a token.
+    fn filter(&self, token: String) -> Option<String> {
+        let stemmed = self.stem(token);
+        Some(stemmed)
+    }
+}
+
+// vowel
+// Expands to a regex character class matching one of a, e, i, o, u or y.
+// It is a macro rather than a constant so it can be spliced into `concat!`.
+macro_rules! V {
+    () => {
+        "[aeiouy]"
+    };
+}
+
+// consonant sequence
+// Expands to a regex fragment: one character that is not a, e, i, o or u
+// (so it may be y), followed by any number of characters that are neither
+// one of those vowels nor y.
+macro_rules! CS {
+    () => {
+        "[^aeiou][^aeiouy]*"
+    };
+}
+
+// vowel sequence
+// Expands to a regex fragment: one of a, e, i, o, u or y, followed by any
+// number of a, e, i, o or u (y is only accepted in the first position).
+macro_rules! VS {
+    () => {
+        "[aeiouy][aeiou]*"
+    };
+}
+
+/// Joins `strs` end to end into one freshly allocated `String`.
+///
+/// Uses the standard-library slice `concat`, which sizes the output once
+/// instead of growing it piece by piece.
+#[inline]
+fn concat_string(strs: &[&str]) -> String {
+    strs.concat()
+}
+
+impl Stemmer {
+    /// Compiles every pattern used by [`Stemmer::stem`].
+    ///
+    /// All patterns are literals assembled at compile time, so a failure to
+    /// compile one is a programming error and panics.
+    fn new() -> Self {
+        let compile = |pattern: &str| Regex::new(pattern).unwrap();
+
+        // Shared by the step 1b clean-up and step 5: a consonant sequence, one
+        // vowel, then a single character that is not a vowel, w, x or y.
+        let cvc = concat!("^", CS!(), V!(), "[^aeiouwxy]$");
+
+        Stemmer {
+            // Stem conditions.
+            re_mgr0: compile(concat!("^(", CS!(), ")?", VS!(), CS!())),
+            re_mgr1: compile(concat!("^(", CS!(), ")?", VS!(), CS!(), VS!(), CS!())),
+            re_meq1: compile(concat!("^(", CS!(), ")?", VS!(), CS!(), "(", VS!(), ")?$")),
+            re_s_v: compile(concat!("^(", CS!(), ")?", V!())),
+
+            // Steps 1a and 1b.
+            re_1a: compile("^(.+?)(ss|i)es$"),
+            re2_1a: compile("^(.+?)([^s])s$"),
+            re_1b: compile("^(.+?)eed$"),
+            re2_1b: compile("^(.+?)(ed|ing)$"),
+            re2_1b_2: compile("(at|bl|iz)$"),
+            re3_1b_2: compile("([^aeiouylsz]{2})$"),
+            re4_1b_2: compile(cvc),
+
+            // Steps 1c and 2.
+            re_1c: compile("^(.+?[^aeiou])y$"),
+            re_2: compile(
+                "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
+                 ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$",
+            ),
+
+            // Step 3.
+            re_3: compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$"),
+
+            // Step 4.
+            re_4: compile(
+                "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$",
+            ),
+            re2_4: compile("^(.+?)(s|t)(ion)$"),
+
+            // Step 5.
+            re_5: compile("^(.+?)e$"),
+            re3_5: compile(cvc),
+        }
+    }
+
+    /// Implements the Porter stemming algorithm.
+    ///
+    /// Takes ownership of the word `w` and returns its stem. Words shorter
+    /// than three bytes are returned unchanged.
+    ///
+    /// Regex captures borrow `w`, so each step first derives its replacement
+    /// (if any) as an owned value and only then assigns it back to `w`. That
+    /// keeps the borrow checker satisfied without cloning the whole word
+    /// before every match.
+    pub fn stem(&self, mut w: String) -> String {
+        if w.len() < 3 {
+            return w;
+        }
+
+        // A leading 'y' is upper-cased while the steps run so that the
+        // lower-case vowel class does not match it; it is restored at the end.
+        let starts_with_y = w.as_bytes()[0] == b'y';
+        if starts_with_y {
+            w.remove(0);
+            w.insert(0, 'Y');
+        }
+
+        // Step 1a
+        let rewritten = self
+            .re_1a
+            .captures(&w)
+            .map(|caps| concat_string(&[&caps[1], &caps[2]]));
+        if let Some(rewritten) = rewritten {
+            w = rewritten;
+        }
+        let rewritten = self
+            .re2_1a
+            .captures(&w)
+            .map(|caps| concat_string(&[&caps[1], &caps[2]]));
+        if let Some(rewritten) = rewritten {
+            w = rewritten;
+        }
+
+        // Step 1b
+        // `Some(_)` means the word ends in "eed"; the flag says whether the
+        // stem in front of it satisfies `re_mgr0`.
+        let eed_stem_ok = self
+            .re_1b
+            .captures(&w)
+            .map(|caps| self.re_mgr0.is_match(&caps[1]));
+        if let Some(stem_ok) = eed_stem_ok {
+            if stem_ok {
+                w.pop();
+            }
+        } else {
+            // "-ed" / "-ing": only stripped when the remaining stem has a vowel.
+            let stripped = self.re2_1b.captures(&w).and_then(|caps| {
+                let stem = &caps[1];
+                if self.re_s_v.is_match(stem) {
+                    Some(stem.to_owned())
+                } else {
+                    None
+                }
+            });
+
+            if let Some(stem) = stripped {
+                w = stem;
+
+                let mut re3_1b_2_matched = false;
+
+                if self.re2_1b_2.is_match(&w) {
+                    w.push('e');
+                } else {
+                    // Make sure the two characters are the same since we can't use backreferences
+                    let ends_in_double = self.re3_1b_2.find(&w).map_or(false, |m| {
+                        let mut suffix = m.as_str().chars();
+                        let first = suffix.next();
+                        let second = suffix.next();
+                        first == second
+                    });
+                    if ends_in_double {
+                        re3_1b_2_matched = true;
+                        w.pop();
+                    }
+                }
+
+                // re4_1b_2 still runs if re3_1b_2 matches but
+                // the matched characters are not the same
+                if !re3_1b_2_matched && self.re4_1b_2.is_match(&w) {
+                    w.push('e');
+                }
+            }
+        }
+
+        // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first
+        // letter of the word (so cry -> cri, by -> by, say -> say)
+        let rewritten = self
+            .re_1c
+            .captures(&w)
+            .map(|caps| concat_string(&[&caps[1], "i"]));
+        if let Some(rewritten) = rewritten {
+            w = rewritten;
+        }
+
+        // Step 2
+        let rewritten = self.re_2.captures(&w).and_then(|caps| {
+            let stem = &caps[1];
+            let suffix = &caps[2];
+            if self.re_mgr0.is_match(stem) {
+                // Every alternative in `re_2` has an entry in `STEP_2`.
+                let replacement = STEP_2.iter().find(|&&(k, _)| k == suffix).unwrap().1;
+                Some(concat_string(&[stem, replacement]))
+            } else {
+                None
+            }
+        });
+        if let Some(rewritten) = rewritten {
+            w = rewritten;
+        }
+
+        // Step 3
+        let rewritten = self.re_3.captures(&w).and_then(|caps| {
+            let stem = &caps[1];
+            let suffix = &caps[2];
+            if self.re_mgr0.is_match(stem) {
+                // Every alternative in `re_3` has an entry in `STEP_3`.
+                let replacement = STEP_3.iter().find(|&&(k, _)| k == suffix).unwrap().1;
+                Some(concat_string(&[stem, replacement]))
+            } else {
+                None
+            }
+        });
+        if let Some(rewritten) = rewritten {
+            w = rewritten;
+        }
+
+        // Step 4
+        // Outer `Some` means `re_4` matched (so `re2_4` must not be tried);
+        // inner `Some` carries the stem when it also satisfies `re_mgr1`.
+        let step_4 = self.re_4.captures(&w).map(|caps| {
+            let stem = &caps[1];
+            if self.re_mgr1.is_match(stem) {
+                Some(stem.to_owned())
+            } else {
+                None
+            }
+        });
+        match step_4 {
+            Some(Some(stem)) => w = stem,
+            Some(None) => {}
+            None => {
+                let rewritten = self
+                    .re2_4
+                    .captures(&w)
+                    .map(|caps| concat_string(&[&caps[1], &caps[2]]));
+                if let Some(stem) = rewritten {
+                    if self.re_mgr1.is_match(&stem) {
+                        w = stem;
+                    }
+                }
+            }
+        }
+
+        // Step 5
+        let rewritten = self.re_5.captures(&w).and_then(|caps| {
+            let stem = &caps[1];
+            let drop_e = self.re_mgr1.is_match(stem)
+                || (self.re_meq1.is_match(stem) && !self.re3_5.is_match(stem));
+            if drop_e {
+                Some(stem.to_owned())
+            } else {
+                None
+            }
+        });
+        if let Some(rewritten) = rewritten {
+            w = rewritten;
+        }
+
+        if w.ends_with("ll") && self.re_mgr1.is_match(&w) {
+            w.pop();
+        }
+
+        // replace the original 'y'
+        if starts_with_y {
+            w.remove(0);
+            w.insert(0, 'y');
+        }
+
+        w
+    }
+}
+
+// Unit tests for the trimmer stage and the stemmer.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Runs a pipeline function on `$input` and asserts that it yields
+    // `Some($output)`.
+    macro_rules! pipeline_eq {
+        ($func:expr, $input:expr, $output:expr) => {
+            assert_eq!(&$func($input.to_string()).unwrap(), $output);
+        };
+    }
+
+    // Plain alphabetic tokens pass through the trimmer unchanged.
+    #[test]
+    fn latin_characters() {
+        pipeline_eq!(trimmer, "hello", "hello");
+    }
+
+    // Punctuation is stripped from both ends of a token but kept inside it.
+    #[test]
+    fn removing_punctuation() {
+        pipeline_eq!(trimmer, "hello.", "hello");
+        pipeline_eq!(trimmer, "it's", "it's");
+        pipeline_eq!(trimmer, "james'", "james");
+        pipeline_eq!(trimmer, "stop!", "stop");
+        pipeline_eq!(trimmer, "first,", "first");
+        pipeline_eq!(trimmer, "", "");
+        pipeline_eq!(trimmer, "[tag]", "tag");
+        pipeline_eq!(trimmer, "[[[tag]]]", "tag");
+        pipeline_eq!(trimmer, "[[!@#@!hello]]]}}}", "hello");
+        pipeline_eq!(trimmer, "~!@@@hello***()()()]]", "hello");
+    }
+
+    // Checks the stemmer against a table of (input word, expected stem) pairs.
+    #[test]
+    fn test_stemmer() {
+        let cases = [
+            ("consign", "consign"),
+            ("consigned", "consign"),
+            ("consigning", "consign"),
+            ("consignment", "consign"),
+            ("consist", "consist"),
+            ("consisted", "consist"),
+            ("consistency", "consist"),
+            ("consistent", "consist"),
+            ("consistently", "consist"),
+            ("consisting", "consist"),
+            ("consists", "consist"),
+            ("consolation", "consol"),
+            ("consolations", "consol"),
+            ("consolatory", "consolatori"),
+            ("console", "consol"),
+            ("consoled", "consol"),
+            ("consoles", "consol"),
+            ("consolidate", "consolid"),
+            ("consolidated", "consolid"),
+            ("consolidating", "consolid"),
+            ("consoling", "consol"),
+            ("consols", "consol"),
+            ("consonant", "conson"),
+            ("consort", "consort"),
+            ("consorted", "consort"),
+            ("consorting", "consort"),
+            ("conspicuous", "conspicu"),
+            ("conspicuously", "conspicu"),
+            ("conspiracy", "conspiraci"),
+            ("conspirator", "conspir"),
+            ("conspirators", "conspir"),
+            ("conspire", "conspir"),
+            ("conspired", "conspir"),
+            ("conspiring", "conspir"),
+            ("constable", "constabl"),
+            ("constables", "constabl"),
+            ("constance", "constanc"),
+            ("constancy", "constanc"),
+            ("constant", "constant"),
+            ("knack", "knack"),
+            ("knackeries", "knackeri"),
+            ("knacks", "knack"),
+            ("knag", "knag"),
+            ("knave", "knave"),
+            ("knaves", "knave"),
+            ("knavish", "knavish"),
+            ("kneaded", "knead"),
+            ("kneading", "knead"),
+            ("knee", "knee"),
+            ("kneel", "kneel"),
+            ("kneeled", "kneel"),
+            ("kneeling", "kneel"),
+            ("kneels", "kneel"),
+            ("knees", "knee"),
+            ("knell", "knell"),
+            ("knelt", "knelt"),
+            ("knew", "knew"),
+            ("knick", "knick"),
+            ("knif", "knif"),
+            ("knife", "knife"),
+            ("knight", "knight"),
+            ("knights", "knight"),
+            ("knit", "knit"),
+            ("knits", "knit"),
+            ("knitted", "knit"),
+            ("knitting", "knit"),
+            ("knives", "knive"),
+            ("knob", "knob"),
+            ("knobs", "knob"),
+            ("knock", "knock"),
+            ("knocked", "knock"),
+            ("knocker", "knocker"),
+            ("knockers", "knocker"),
+            ("knocking", "knock"),
+            ("knocks", "knock"),
+            ("knopp", "knopp"),
+            ("knot", "knot"),
+            ("knots", "knot"),
+            ("lay", "lay"),
+            ("try", "tri"),
+        ];
+
+        let stemmer = Stemmer::new();
+        for &(input, output) in cases.iter() {
+            assert_eq!(&stemmer.stem(input.into()), output);
+        }
+    }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:02:58 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:02:58 +0000
commit	698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree	173a775858bd501c378080a10dca74132f05bc50 /vendor/elasticlunr-rs/src/lang/en.rs
parent	Initial commit. (diff)
download	rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip