From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/elasticlunr-rs/src/config.rs | 128 ++++++++ vendor/elasticlunr-rs/src/document_store.rs | 330 ++++++++++++++++++++ vendor/elasticlunr-rs/src/inverted_index.rs | 379 +++++++++++++++++++++++ vendor/elasticlunr-rs/src/lang/ar.rs | 66 ++++ vendor/elasticlunr-rs/src/lang/common.rs | 97 ++++++ vendor/elasticlunr-rs/src/lang/da.rs | 49 +++ vendor/elasticlunr-rs/src/lang/de.rs | 273 ++++++++++++++++ vendor/elasticlunr-rs/src/lang/du.rs | 50 +++ vendor/elasticlunr-rs/src/lang/en.rs | 458 +++++++++++++++++++++++++++ vendor/elasticlunr-rs/src/lang/es.rs | 350 +++++++++++++++++++++ vendor/elasticlunr-rs/src/lang/fi.rs | 277 +++++++++++++++++ vendor/elasticlunr-rs/src/lang/fr.rs | 56 ++++ vendor/elasticlunr-rs/src/lang/it.rs | 321 +++++++++++++++++++ vendor/elasticlunr-rs/src/lang/ja.rs | 76 +++++ vendor/elasticlunr-rs/src/lang/mod.rs | 138 +++++++++ vendor/elasticlunr-rs/src/lang/no.rs | 218 +++++++++++++ vendor/elasticlunr-rs/src/lang/pt.rs | 245 +++++++++++++++ vendor/elasticlunr-rs/src/lang/ro.rs | 323 +++++++++++++++++++ vendor/elasticlunr-rs/src/lang/ru.rs | 463 ++++++++++++++++++++++++++++ vendor/elasticlunr-rs/src/lang/sv.rs | 51 +++ vendor/elasticlunr-rs/src/lang/tr.rs | 251 +++++++++++++++ vendor/elasticlunr-rs/src/lang/zh.rs | 55 ++++ vendor/elasticlunr-rs/src/lib.rs | 413 +++++++++++++++++++++++++ vendor/elasticlunr-rs/src/pipeline.rs | 65 ++++ 24 files changed, 5132 insertions(+) create mode 100644 vendor/elasticlunr-rs/src/config.rs create mode 100644 vendor/elasticlunr-rs/src/document_store.rs create mode 100644 vendor/elasticlunr-rs/src/inverted_index.rs create mode 100644 vendor/elasticlunr-rs/src/lang/ar.rs create mode 100644 vendor/elasticlunr-rs/src/lang/common.rs create mode 100644 vendor/elasticlunr-rs/src/lang/da.rs create mode 100644 
vendor/elasticlunr-rs/src/lang/de.rs create mode 100644 vendor/elasticlunr-rs/src/lang/du.rs create mode 100644 vendor/elasticlunr-rs/src/lang/en.rs create mode 100644 vendor/elasticlunr-rs/src/lang/es.rs create mode 100644 vendor/elasticlunr-rs/src/lang/fi.rs create mode 100644 vendor/elasticlunr-rs/src/lang/fr.rs create mode 100644 vendor/elasticlunr-rs/src/lang/it.rs create mode 100644 vendor/elasticlunr-rs/src/lang/ja.rs create mode 100644 vendor/elasticlunr-rs/src/lang/mod.rs create mode 100644 vendor/elasticlunr-rs/src/lang/no.rs create mode 100644 vendor/elasticlunr-rs/src/lang/pt.rs create mode 100644 vendor/elasticlunr-rs/src/lang/ro.rs create mode 100644 vendor/elasticlunr-rs/src/lang/ru.rs create mode 100644 vendor/elasticlunr-rs/src/lang/sv.rs create mode 100644 vendor/elasticlunr-rs/src/lang/tr.rs create mode 100644 vendor/elasticlunr-rs/src/lang/zh.rs create mode 100644 vendor/elasticlunr-rs/src/lib.rs create mode 100644 vendor/elasticlunr-rs/src/pipeline.rs (limited to 'vendor/elasticlunr-rs/src') diff --git a/vendor/elasticlunr-rs/src/config.rs b/vendor/elasticlunr-rs/src/config.rs new file mode 100644 index 000000000..304bcb227 --- /dev/null +++ b/vendor/elasticlunr-rs/src/config.rs @@ -0,0 +1,128 @@ +//! These types are not used for generating `Index`es. They are provided to help with +//! creating compatible JSON structures for configuring the JavaScript search +//! function. +//! +//! *Reference:* +//! + +use std::collections::BTreeMap; + +/// Used to set the search configuration for a specific field. +/// When `expand` or `bool` is `None`, elasticlunr.js will use the value from +/// the global configuration. The `boost` field, if present, +/// increases the importance of this field when ordering search results. 
+#[derive(Serialize, Deserialize, Default, Debug, Copy, Clone, Eq, PartialEq)] +pub struct SearchOptionsField { + #[serde(skip_serializing_if = "Option::is_none")] + pub boost: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub bool: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub expand: Option, +} + +/// Sets which boolean model is used for searching with +/// multiple terms. Defaults to `Or`. +/// +/// - *AND* requires every search term to be present in results +/// - *OR* accepts results which have at least one term +/// +#[derive(Serialize, Deserialize, Debug, Copy, Clone, Eq, PartialEq)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum SearchBool { + Or, + And, +} + +impl Default for SearchBool { + fn default() -> Self { + SearchBool::Or + } +} + +/// The search configuration map which is passed to the +/// elasticlunr.js `Index.search()` function. +/// +/// |Key |Default| +/// |--------|-------| +/// |`bool` |`OR` | +/// |`expand`|`false`| +#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] +pub struct SearchOptions { + pub bool: SearchBool, + pub expand: bool, + pub fields: BTreeMap, +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json; + + #[test] + fn test_normal_config() { + let options = SearchOptions { + fields: btreemap![ + "title".into() => SearchOptionsField { + boost: Some(5), + ..Default::default() + }, + "body".into() => SearchOptionsField { + boost: Some(1), + ..Default::default() + }, + ], + ..Default::default() + }; + let stringed = serde_json::to_string(&options).unwrap(); + + assert_eq!( + stringed, + r#"{"bool":"OR","expand":false,"fields":{"body":{"boost":1},"title":{"boost":5}}}"# + ); + } + + #[test] + fn test_complex_config() { + let options = SearchOptions { + fields: btreemap! 
{ + "title".into() => SearchOptionsField { + expand: Some(true), + ..Default::default() + }, + "body".into() => SearchOptionsField { + bool: Some(SearchBool::Or), + ..Default::default() + }, + "breadcrumbs".into() => SearchOptionsField { + bool: Some(SearchBool::default()), + boost: Some(200), + ..Default::default() + }, + }, + expand: false, + bool: SearchBool::And, + }; + let stringed = serde_json::to_string_pretty(&options).unwrap(); + + assert_eq!( + stringed, + r#"{ + "bool": "AND", + "expand": false, + "fields": { + "body": { + "bool": "OR" + }, + "breadcrumbs": { + "boost": 200, + "bool": "OR" + }, + "title": { + "expand": true + } + } +}"# + ); + } +} diff --git a/vendor/elasticlunr-rs/src/document_store.rs b/vendor/elasticlunr-rs/src/document_store.rs new file mode 100644 index 000000000..5b745d2ee --- /dev/null +++ b/vendor/elasticlunr-rs/src/document_store.rs @@ -0,0 +1,330 @@ +//! Implements an elasticlunr.js document store. Most users do not need to use this module directly. + +use std::collections::BTreeMap; + +/// The document store saves the complete text of each item saved to the index, if enabled. +/// Most users do not need to use this type directly. 
+#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct DocumentStore { + pub save: bool, + pub docs: BTreeMap<String, BTreeMap<String, String>>, + pub doc_info: BTreeMap<String, BTreeMap<String, usize>>, + // Redundant with docs.len(), but needed for serialization + pub length: usize, +} + +impl DocumentStore { + pub fn new(save: bool) -> Self { + DocumentStore { + save, + docs: BTreeMap::new(), + doc_info: BTreeMap::new(), + length: 0, + } + } + + pub fn len(&self) -> usize { + self.docs.len() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn is_stored(&self) -> bool { + self.save + } + + pub fn has_doc(&self, doc_ref: &str) -> bool { + self.docs.contains_key(doc_ref) + } + + pub fn add_doc(&mut self, doc_ref: &str, doc: BTreeMap<String, String>) { + if !self.has_doc(doc_ref) { + self.length += 1; + } + + self.docs.insert( + doc_ref.into(), + if self.save { doc } else { BTreeMap::new() }, + ); + } + + pub fn get_doc(&self, doc_ref: &str) -> Option<BTreeMap<String, String>> { + self.docs.get(doc_ref).cloned() + } + + pub fn remove_doc(&mut self, doc_ref: &str) { + if self.docs.remove(doc_ref).is_some() { + self.length -= 1; + // Drop the stored field lengths too, as elasticlunr.js `removeDoc` does. + self.doc_info.remove(doc_ref); + } + } + + pub fn add_field_length(&mut self, doc_ref: &str, field: &str, length: usize) { + self.doc_info + .entry(doc_ref.into()) + .or_insert_with(BTreeMap::new) + .insert(field.into(), length); + } + + pub fn get_field_length(&self, doc_ref: &str, field: &str) -> usize { + if self.has_doc(doc_ref) { + self.doc_info + .get(doc_ref) + .and_then(|e| e.get(field)) + .cloned() + .unwrap_or(0) + } else { + 0 + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn add_doc_tokens() { + let mut store = DocumentStore::new(true); + let doc = btreemap! { "title".into() => "eggs bread".into() }; + + store.add_doc("1", doc.clone()); + assert_eq!(store.get_doc("1").unwrap(), doc); + } + + #[test] + fn create_doc_no_store() { + let mut store = DocumentStore::new(false); + let doc = btreemap! 
{ "title".into() => "eggs bread".into() }; + + store.add_doc("1", doc); + assert_eq!(store.len(), 1); + assert_eq!(store.is_stored(), false); + assert_eq!(store.has_doc("1"), true); + } + + #[test] + fn add_doc_no_store() { + let mut store = DocumentStore::new(false); + let doc1 = btreemap! { "title".into() => "eggs bread".into() }; + let doc2 = btreemap! { "title".into() => "hello world".into() }; + + store.add_doc("1", doc1); + store.add_doc("2", doc2); + assert_eq!(store.len(), 2); + assert_eq!(store.is_stored(), false); + assert_eq!(store.has_doc("1"), true); + assert_eq!(store.has_doc("2"), true); + } + + #[test] + fn is_stored_true() { + let store = DocumentStore::new(true); + assert_eq!(store.is_stored(), true); + } + + #[test] + fn is_stored_false() { + let store = DocumentStore::new(false); + assert_eq!(store.is_stored(), false); + } + + #[test] + fn get_doc_no_store() { + let mut store = DocumentStore::new(false); + let doc1 = btreemap! { "title".into() => "eggs bread".into() }; + let doc2 = btreemap! { "title".into() => "hello world".into() }; + + store.add_doc("1", doc1); + store.add_doc("2", doc2); + assert_eq!(store.len(), 2); + assert_eq!(store.is_stored(), false); + assert_eq!(store.get_doc("1").unwrap(), BTreeMap::new()); + assert_eq!(store.get_doc("2").unwrap(), BTreeMap::new()); + } + + #[test] + fn get_nonexistant_doc_no_store() { + let mut store = DocumentStore::new(false); + let doc1 = btreemap! { "title".into() => "eggs bread".into() }; + let doc2 = btreemap! { "title".into() => "hello world".into() }; + + store.add_doc("1", doc1); + store.add_doc("2", doc2); + assert_eq!(store.len(), 2); + assert_eq!(store.is_stored(), false); + assert_eq!(store.get_doc("6"), None); + assert_eq!(store.get_doc("2").unwrap(), BTreeMap::new()); + } + + #[test] + fn remove_doc_no_store() { + let mut store = DocumentStore::new(false); + let doc1 = btreemap! { "title".into() => "eggs bread".into() }; + let doc2 = btreemap! 
{ "title".into() => "hello world".into() }; + + store.add_doc("1", doc1); + store.add_doc("2", doc2); + store.remove_doc("1"); + assert_eq!(store.len(), 1); + assert_eq!(store.is_stored(), false); + assert_eq!(store.get_doc("2").unwrap(), BTreeMap::new()); + assert_eq!(store.get_doc("1"), None); + } + + #[test] + fn remove_nonexistant_doc() { + let mut store = DocumentStore::new(false); + let doc1 = btreemap! { "title".into() => "eggs bread".into() }; + let doc2 = btreemap! { "title".into() => "hello world".into() }; + + store.add_doc("1", doc1); + store.add_doc("2", doc2); + store.remove_doc("8"); + assert_eq!(store.len(), 2); + assert_eq!(store.is_stored(), false); + assert_eq!(store.get_doc("2").unwrap(), BTreeMap::new()); + assert_eq!(store.get_doc("1").unwrap(), BTreeMap::new()); + } + + #[test] + fn get_num_docs() { + let mut store = DocumentStore::new(true); + + assert_eq!(store.len(), 0); + store.add_doc("1", btreemap! { "title".into() => "eggs bread".into() }); + assert_eq!(store.len(), 1); + } + + #[test] + fn get_doc() { + let mut store = DocumentStore::new(true); + + assert_eq!(store.len(), 0); + store.add_doc("1", btreemap! { "title".into() => "eggs bread".into() }); + assert_eq!( + store.get_doc("1").unwrap(), + btreemap! { "title".into() => "eggs bread".into() } + ); + } + + #[test] + fn get_doc_many_fields() { + let mut store = DocumentStore::new(true); + + assert_eq!(store.len(), 0); + store.add_doc( + "1", + btreemap! { + "title".into() => "eggs bread".into() + }, + ); + store.add_doc( + "2", + btreemap! { + "title".into() => "boo bar".into() + }, + ); + store.add_doc( + "3", + btreemap! { + "title".into() => "oracle".into(), + "body".into() => "Oracle is demonspawn".into() + }, + ); + assert_eq!( + store.get_doc("3").unwrap(), + btreemap! 
{ + "title".into() => "oracle".into(), + "body".into() => "Oracle is demonspawn".into() + } + ); + assert_eq!(store.len(), 3); + } + + #[test] + fn get_nonexistant_doc() { + let mut store = DocumentStore::new(true); + + assert_eq!(store.len(), 0); + store.add_doc( + "1", + btreemap! { + "title".into() => "eggs bread".into() + }, + ); + store.add_doc( + "2", + btreemap! { + "title".into() => "boo bar".into() + }, + ); + store.add_doc( + "3", + btreemap! { + "title".into() => "oracle".into(), + "body".into() => "Oracle is demonspawn".into() + }, + ); + assert_eq!(store.get_doc("4"), None); + assert_eq!(store.get_doc("0"), None); + assert_eq!(store.len(), 3); + } + + #[test] + fn check_store_has_key() { + let mut store = DocumentStore::new(true); + + assert!(!store.has_doc("foo")); + store.add_doc("foo", btreemap! { "title".into() => "eggs bread".into() }); + assert!(store.has_doc("foo")); + } + + #[test] + fn remove_doc() { + let mut store = DocumentStore::new(true); + + store.add_doc("foo", btreemap! { "title".into() => "eggs bread".into() }); + assert!(store.has_doc("foo")); + assert_eq!(store.len(), 1); + store.remove_doc("foo"); + assert!(!store.has_doc("foo")); + assert_eq!(store.len(), 0); + } + + #[test] + fn remove_nonexistant_store() { + let mut store = DocumentStore::new(true); + + store.add_doc("foo", btreemap! { "title".into() => "eggs bread".into() }); + assert!(store.has_doc("foo")); + assert_eq!(store.len(), 1); + store.remove_doc("bar"); + assert!(store.has_doc("foo")); + assert_eq!(store.len(), 1); + } + + #[test] + fn add_field_len() { + let mut store = DocumentStore::new(true); + + store.add_doc("foo", btreemap! { "title".into() => "eggs bread".into() }); + store.add_field_length("foo", "title", 2); + assert_eq!(store.get_field_length("foo", "title"), 2); + } + + #[test] + fn add_field_length_multiple() { + let mut store = DocumentStore::new(true); + + store.add_doc("foo", btreemap! 
{ "title".into() => "eggs bread".into() }); + store.add_field_length("foo", "title", 2); + store.add_field_length("foo", "body", 10); + assert_eq!(store.get_field_length("foo", "title"), 2); + assert_eq!(store.get_field_length("foo", "body"), 10); + } +} diff --git a/vendor/elasticlunr-rs/src/inverted_index.rs b/vendor/elasticlunr-rs/src/inverted_index.rs new file mode 100644 index 000000000..be4c4c677 --- /dev/null +++ b/vendor/elasticlunr-rs/src/inverted_index.rs @@ -0,0 +1,379 @@ +//! Implements an elasticlunr.js inverted index. Most users do not need to use this module directly. + +use std::collections::BTreeMap; + +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq)] +struct TermFrequency { + #[serde(rename = "tf")] + pub term_freq: f64, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Default)] +struct IndexItem { + pub docs: BTreeMap, + #[serde(rename = "df")] + pub doc_freq: i64, + #[serde(flatten, serialize_with = "IndexItem::serialize")] + pub children: BTreeMap, +} + +impl IndexItem { + fn new() -> Self { + Default::default() + } + + fn serialize(map: &BTreeMap, ser: S) -> Result + where + S: ::serde::Serializer, + { + use serde::ser::SerializeMap; + + let mut ser_map = ser.serialize_map(Some(map.len()))?; + let mut buf = [0u8; 4]; + for (key, value) in map { + let key = key.encode_utf8(&mut buf); + ser_map.serialize_entry(key, value)?; + } + ser_map.end() + } + + fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) { + let mut iter = token.chars(); + if let Some(character) = iter.next() { + let mut item = self + .children + .entry(character) + .or_insert_with(IndexItem::new); + + for character in iter { + let tmp = item; + item = tmp.children.entry(character).or_insert_with(IndexItem::new); + } + + if !item.docs.contains_key(doc_ref) { + item.doc_freq += 1; + } + item.docs + .insert(doc_ref.into(), TermFrequency { term_freq }); + } + } + + fn get_node(&self, token: &str) -> Option<&IndexItem> { + let mut root = 
self; + for ch in token.chars() { + if let Some(item) = root.children.get(&ch) { + root = item; + } else { + return None; + } + } + + Some(root) + } + + fn remove_token(&mut self, doc_ref: &str, token: &str) { + let mut iter = token.char_indices(); + if let Some((_, ch)) = iter.next() { + if let Some(item) = self.children.get_mut(&ch) { + if let Some((idx, _)) = iter.next() { + item.remove_token(doc_ref, &token[idx..]); + } else if item.docs.contains_key(doc_ref) { + item.docs.remove(doc_ref); + item.doc_freq -= 1; + } + } + } + } +} + +/// Implements an elasticlunr.js inverted index. Most users do not need to use this type directly. +#[derive(Serialize, Deserialize, Debug, PartialEq, Default)] +pub struct InvertedIndex { + root: IndexItem, +} + +impl InvertedIndex { + pub fn new() -> Self { + Default::default() + } + + pub fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) { + self.root.add_token(doc_ref, token, term_freq) + } + + pub fn has_token(&self, token: &str) -> bool { + self.root.get_node(token).map_or(false, |_| true) + } + + pub fn remove_token(&mut self, doc_ref: &str, token: &str) { + self.root.remove_token(doc_ref, token) + } + + pub fn get_docs(&self, token: &str) -> Option> { + self.root.get_node(token).map(|node| { + node.docs + .iter() + .map(|(k, &v)| (k.clone(), v.term_freq)) + .collect() + }) + } + + pub fn get_term_frequency(&self, doc_ref: &str, token: &str) -> f64 { + self.root + .get_node(token) + .and_then(|node| node.docs.get(doc_ref)) + .map_or(0., |docs| docs.term_freq) + } + + pub fn get_doc_frequency(&self, token: &str) -> i64 { + self.root.get_node(token).map_or(0, |node| node.doc_freq) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn adding_token() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + assert_eq!(inverted_index.get_doc_frequency("foo"), 1); + assert_eq!(inverted_index.get_term_frequency("123", "foo"), 1.); + } + 
+ #[test] + fn has_token() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + assert!(inverted_index.has_token(token)); + assert!(inverted_index.has_token("fo")); + assert!(inverted_index.has_token("f")); + + assert!(!inverted_index.has_token("bar")); + assert!(!inverted_index.has_token("foo ")); + assert!(!inverted_index.has_token("foo ")) + } + + #[test] + fn adding_another_document_to_the_token() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + inverted_index.add_token("456", token, 1.); + + assert_eq!(inverted_index.get_term_frequency("123", "foo"), 1.); + assert_eq!(inverted_index.get_term_frequency("456", "foo"), 1.); + assert_eq!(inverted_index.get_doc_frequency("foo"), 2); + } + + #[test] + fn df_of_nonexistant_token() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + inverted_index.add_token("456", token, 1.); + + assert_eq!(inverted_index.get_doc_frequency("foo"), 2); + assert_eq!(inverted_index.get_doc_frequency("fox"), 0); + } + + #[test] + fn adding_existing_doc() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + inverted_index.add_token("456", token, 1.); + inverted_index.add_token("456", token, 100.); + + assert_eq!(inverted_index.get_term_frequency("456", "foo"), 100.); + assert_eq!(inverted_index.get_doc_frequency("foo"), 2); + } + + #[test] + fn checking_token_exists_in() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + + assert!(inverted_index.has_token(token)); + } + + #[test] + fn checking_if_a_token_does_not_exist() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + assert!(!inverted_index.has_token("fooo")); + 
assert!(!inverted_index.has_token("bar")); + assert!(!inverted_index.has_token("fof")); + } + + #[test] + fn retrieving_items() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 1.); + assert_eq!( + inverted_index.get_docs(token).unwrap(), + btreemap! { + "123".into() => 1. + } + ); + + assert_eq!(inverted_index.get_docs(""), Some(BTreeMap::new())); + + inverted_index.add_token("234", "boo", 100.); + inverted_index.add_token("345", "too", 101.); + + assert_eq!( + inverted_index.get_docs(token).unwrap(), + btreemap! { + "123".into() => 1. + } + ); + + inverted_index.add_token("234", token, 100.); + inverted_index.add_token("345", token, 101.); + + assert_eq!( + inverted_index.get_docs(token).unwrap(), + btreemap! { + "123".into() => 1., + "234".into() => 100., + "345".into() => 101., + } + ); + } + + #[test] + fn retrieving_nonexistant_items() { + let inverted_index = InvertedIndex::new(); + + assert_eq!(inverted_index.get_docs("foo"), None); + assert_eq!(inverted_index.get_docs("fox"), None); + } + + #[test] + fn df_of_items() { + let mut inverted_index = InvertedIndex::new(); + + inverted_index.add_token("123", "foo", 1.); + inverted_index.add_token("456", "foo", 1.); + inverted_index.add_token("789", "bar", 1.); + + assert_eq!(inverted_index.get_doc_frequency("foo"), 2); + assert_eq!(inverted_index.get_doc_frequency("bar"), 1); + assert_eq!(inverted_index.get_doc_frequency("baz"), 0); + assert_eq!(inverted_index.get_doc_frequency("ba"), 0); + assert_eq!(inverted_index.get_doc_frequency("b"), 0); + assert_eq!(inverted_index.get_doc_frequency("fo"), 0); + assert_eq!(inverted_index.get_doc_frequency("f"), 0); + } + + #[test] + fn removing_document_from_token() { + let mut inverted_index = InvertedIndex::new(); + assert_eq!(inverted_index.get_docs("foo"), None); + + inverted_index.add_token("123", "foo", 1.); + assert_eq!( + inverted_index.get_docs("foo").unwrap(), + btreemap! 
{ + "123".into() => 1., + } + ); + + inverted_index.remove_token("123", "foo"); + assert_eq!(inverted_index.get_docs("foo"), Some(BTreeMap::new())); + assert_eq!(inverted_index.get_doc_frequency("foo"), 0); + assert_eq!(inverted_index.has_token("foo"), true); + } + + #[test] + fn removing_nonexistant_document() { + let mut inverted_index = InvertedIndex::new(); + + inverted_index.add_token("123", "foo", 1.); + inverted_index.add_token("567", "bar", 1.); + inverted_index.remove_token("foo", "456"); + + assert_eq!( + inverted_index.get_docs("foo").unwrap(), + btreemap! { + "123".into() => 1. + } + ); + assert_eq!(inverted_index.get_doc_frequency("foo"), 1); + } + + #[test] + fn removing_documet_nonexistant_key() { + let mut inverted_index = InvertedIndex::new(); + + inverted_index.remove_token("123", "foo"); + assert!(!inverted_index.has_token("foo")); + assert_eq!(inverted_index.get_doc_frequency("foo"), 0); + } + + #[test] + fn get_term_frequency() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 2.); + inverted_index.add_token("456", token, 3.); + + assert_eq!(inverted_index.get_term_frequency("123", token), 2.); + assert_eq!(inverted_index.get_term_frequency("456", token), 3.); + assert_eq!(inverted_index.get_term_frequency("789", token), 0.); + } + + #[test] + fn get_term_frequency_nonexistant_token() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 2.); + inverted_index.add_token("456", token, 3.); + + assert_eq!(inverted_index.get_term_frequency("123", "ken"), 0.); + assert_eq!(inverted_index.get_term_frequency("456", "ken"), 0.); + } + + #[test] + fn get_term_frequency_nonexistant_docref() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 2.); + inverted_index.add_token("456", token, 3.); + + assert_eq!(inverted_index.get_term_frequency(token, "12"), 0.); + 
assert_eq!(inverted_index.get_term_frequency(token, "23"), 0.); + assert_eq!(inverted_index.get_term_frequency(token, "45"), 0.); + } + + #[test] + fn get_term_frequency_nonexistant_token_and_docref() { + let mut inverted_index = InvertedIndex::new(); + let token = "foo"; + + inverted_index.add_token("123", token, 2.); + inverted_index.add_token("456", token, 3.); + + assert_eq!(inverted_index.get_term_frequency("token", "1"), 0.); + assert_eq!(inverted_index.get_term_frequency("abc", "2"), 0.); + assert_eq!(inverted_index.get_term_frequency("fo", "123"), 0.); + } +} diff --git a/vendor/elasticlunr-rs/src/lang/ar.rs b/vendor/elasticlunr-rs/src/lang/ar.rs new file mode 100644 index 000000000..d0a640edf --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/ar.rs @@ -0,0 +1,66 @@ +use super::Language; +use crate::pipeline::{Pipeline, PipelineFn}; +use regex::Regex; + +/// Arabic Language +/// +/// Designed to be compatible with the included JavaScript implementation. See `js/lunr.ar.js`. +pub struct Arabic {} + +impl Arabic { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Arabic { + fn name(&self) -> String { + "Arabic".into() + } + fn code(&self) -> String { + "ar".into() + } + + fn tokenize(&self, text: &str) -> Vec<String> { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![Box::new(Stemmer::new())], + } + } +} + +struct Stemmer { + diacritics: Regex, + alefs: Regex, +} + +impl Stemmer { + pub fn new() -> Self { + let diacritics = Regex::new("[\u{0640}\u{064b}-\u{065b}]").unwrap(); + let alefs = Regex::new("[\u{0622}\u{0623}\u{0625}\u{0671}\u{0649}]").unwrap(); + Self { diacritics, alefs } + } +} + +impl PipelineFn for Stemmer { + fn name(&self) -> String { + "stemmer-ar".into() + } + + fn filter(&self, token: String) -> Option<String> { + // remove every diacritic and elongating character, not just the first match + let result = self.diacritics.replace_all(&token, ""); + // replace all variations of alef (آأإٱى) to a plain alef (ا) + let 
result = self.alefs.replace_all(&result, "\u{0627}"); + if result.is_empty() { + None + } else if result == token { + Some(token) + } else { + Some(result.into()) + } + } +} diff --git a/vendor/elasticlunr-rs/src/lang/common.rs b/vendor/elasticlunr-rs/src/lang/common.rs new file mode 100644 index 000000000..5616f0138 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/common.rs @@ -0,0 +1,97 @@ +use crate::pipeline::PipelineFn; +use regex::Regex; +use std::collections::HashSet; + +#[derive(Clone)] +pub struct StopWordFilter { + name: String, + stop_words: HashSet<String>, +} + +impl StopWordFilter { + pub fn new(name: &str, stop_words: &[&str]) -> Self { + Self { + name: name.into(), + stop_words: stop_words.iter().map(|s| s.to_string()).collect(), + } + } +} + +impl PipelineFn for StopWordFilter { + fn name(&self) -> String { + self.name.clone() + } + + fn filter(&self, token: String) -> Option<String> { + if self.stop_words.contains(&token) { + None + } else { + Some(token) + } + } +} + +#[derive(Clone)] +pub struct RegexTrimmer { + name: String, + trimmer: Regex, +} + +impl RegexTrimmer { + pub fn new(name: &str, word_chars: &str) -> Self { + let name = name.into(); + let trimmer = Regex::new(&format!("^[^{0}]+|[^{0}]+$", word_chars)).unwrap(); + Self { name, trimmer } + } +} + +impl PipelineFn for RegexTrimmer { + fn name(&self) -> String { + self.name.clone() + } + + fn filter(&self, token: String) -> Option<String> { + let result = self.trimmer.replace_all(&token, ""); + if result.is_empty() { + None + } else if result == token { + Some(token) + } else { + Some(result.into()) + } + } +} + +#[cfg(feature = "rust-stemmers")] +pub struct RustStemmer { + name: String, + stemmer: rust_stemmers::Stemmer, +} + +#[cfg(feature = "rust-stemmers")] +impl RustStemmer { + pub fn new(name: &str, algo: rust_stemmers::Algorithm) -> Self { + Self { + name: name.into(), + stemmer: rust_stemmers::Stemmer::create(algo), + } + } +} + +#[cfg(feature = "rust-stemmers")] +impl PipelineFn for RustStemmer { + fn 
name(&self) -> String { + self.name.clone() + } + + fn filter(&self, token: String) -> Option { + let result = self.stemmer.stem(&token); + if result.is_empty() { + None + } else if result == token { + Some(token) + } else { + Some(result.into()) + } + } +} diff --git a/vendor/elasticlunr-rs/src/lang/da.rs b/vendor/elasticlunr-rs/src/lang/da.rs new file mode 100644 index 000000000..ab3b7dffe --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/da.rs @@ -0,0 +1,49 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Danish {} + +impl Danish { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Danish { + fn name(&self) -> String { + "Danish".into() + } + fn code(&self) -> String { + "da".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-da", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-da", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-da", Algorithm::Danish)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", "ad", "af", "alle", "alt", "anden", "at", "blev", "blive", "bliver", "da", "de", "dem", + "den", "denne", "der", "deres", "det", "dette", "dig", "din", "disse", "dog", "du", "efter", + "eller", "en", "end", "er", "et", "for", "fra", "ham", "han", "hans", "har", "havde", "have", + "hende", "hendes", "her", "hos", "hun", "hvad", "hvis", "hvor", "i", "ikke", "ind", "jeg", + "jer", "jo", "kunne", "man", "mange", "med", "meget", "men", "mig", "min", "mine", "mit", + "mod", "ned", "noget", "nogle", "nu", "når", "og", "også", "om", "op", "os", "over", "på", + "selv", "sig", "sin", "sine", "sit", "skal", "skulle", "som", "sådan", "thi", "til", "ud", + "under", "var", "vi", "vil", "ville", "vor", "være", "været", +]; diff --git 
a/vendor/elasticlunr-rs/src/lang/de.rs b/vendor/elasticlunr-rs/src/lang/de.rs new file mode 100644 index 000000000..244685ae9 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/de.rs @@ -0,0 +1,273 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct German {} + +impl German { + pub fn new() -> Self { + Self {} + } +} + +impl Language for German { + fn name(&self) -> String { + "German".into() + } + fn code(&self) -> String { + "de".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-de", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-de", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-de", Algorithm::German)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "aber", + "alle", + "allem", + "allen", + "aller", + "alles", + "als", + "also", + "am", + "an", + "ander", + "andere", + "anderem", + "anderen", + "anderer", + "anderes", + "anderm", + "andern", + "anderr", + "anders", + "auch", + "auf", + "aus", + "bei", + "bin", + "bis", + "bist", + "da", + "damit", + "dann", + "das", + "dasselbe", + "dazu", + "daß", + "dein", + "deine", + "deinem", + "deinen", + "deiner", + "deines", + "dem", + "demselben", + "den", + "denn", + "denselben", + "der", + "derer", + "derselbe", + "derselben", + "des", + "desselben", + "dessen", + "dich", + "die", + "dies", + "diese", + "dieselbe", + "dieselben", + "diesem", + "diesen", + "dieser", + "dieses", + "dir", + "doch", + "dort", + "du", + "durch", + "ein", + "eine", + "einem", + "einen", + "einer", + "eines", + "einig", + "einige", + "einigem", + "einigen", + "einiger", + "einiges", + "einmal", + "er", + "es", + "etwas", + "euch", + "euer", + "eure", + "eurem", + "euren", + "eurer", + "eures", + "für", + "gegen", + "gewesen", + 
"hab", + "habe", + "haben", + "hat", + "hatte", + "hatten", + "hier", + "hin", + "hinter", + "ich", + "ihm", + "ihn", + "ihnen", + "ihr", + "ihre", + "ihrem", + "ihren", + "ihrer", + "ihres", + "im", + "in", + "indem", + "ins", + "ist", + "jede", + "jedem", + "jeden", + "jeder", + "jedes", + "jene", + "jenem", + "jenen", + "jener", + "jenes", + "jetzt", + "kann", + "kein", + "keine", + "keinem", + "keinen", + "keiner", + "keines", + "können", + "könnte", + "machen", + "man", + "manche", + "manchem", + "manchen", + "mancher", + "manches", + "mein", + "meine", + "meinem", + "meinen", + "meiner", + "meines", + "mich", + "mir", + "mit", + "muss", + "musste", + "nach", + "nicht", + "nichts", + "noch", + "nun", + "nur", + "ob", + "oder", + "ohne", + "sehr", + "sein", + "seine", + "seinem", + "seinen", + "seiner", + "seines", + "selbst", + "sich", + "sie", + "sind", + "so", + "solche", + "solchem", + "solchen", + "solcher", + "solches", + "soll", + "sollte", + "sondern", + "sonst", + "um", + "und", + "uns", + "unse", + "unsem", + "unsen", + "unser", + "unses", + "unter", + "viel", + "vom", + "von", + "vor", + "war", + "waren", + "warst", + "was", + "weg", + "weil", + "weiter", + "welche", + "welchem", + "welchen", + "welcher", + "welches", + "wenn", + "werde", + "werden", + "wie", + "wieder", + "will", + "wir", + "wird", + "wirst", + "wo", + "wollen", + "wollte", + "während", + "würde", + "würden", + "zu", + "zum", + "zur", + "zwar", + "zwischen", + "über", +]; diff --git a/vendor/elasticlunr-rs/src/lang/du.rs b/vendor/elasticlunr-rs/src/lang/du.rs new file mode 100644 index 000000000..73a6d3cf7 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/du.rs @@ -0,0 +1,50 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Dutch {} + +impl Dutch { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Dutch { + fn name(&self) -> String { + 
"Dutch".into() + } + fn code(&self) -> String { + "du".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-du", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-du", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-du", Algorithm::Dutch)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", "aan", "al", "alles", "als", "altijd", "andere", "ben", "bij", "daar", "dan", "dat", "de", + "der", "deze", "die", "dit", "doch", "doen", "door", "dus", "een", "eens", "en", "er", "ge", + "geen", "geweest", "haar", "had", "heb", "hebben", "heeft", "hem", "het", "hier", "hij", "hoe", + "hun", "iemand", "iets", "ik", "in", "is", "ja", "je", "kan", "kon", "kunnen", "maar", "me", + "meer", "men", "met", "mij", "mijn", "moet", "na", "naar", "niet", "niets", "nog", "nu", "of", + "om", "omdat", "onder", "ons", "ook", "op", "over", "reeds", "te", "tegen", "toch", "toen", + "tot", "u", "uit", "uw", "van", "veel", "voor", "want", "waren", "was", "wat", "werd", "wezen", + "wie", "wil", "worden", "wordt", "zal", "ze", "zelf", "zich", "zij", "zijn", "zo", "zonder", + "zou", +]; diff --git a/vendor/elasticlunr-rs/src/lang/en.rs b/vendor/elasticlunr-rs/src/lang/en.rs new file mode 100644 index 000000000..f133ed7c9 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/en.rs @@ -0,0 +1,458 @@ +use super::{common::StopWordFilter, Language}; +use crate::pipeline::{FnWrapper, Pipeline, PipelineFn}; +use regex::Regex; + +const WORDS: &[&str] = &[ + "", "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", + "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", + "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", + "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", + "in", "into", "is", 
/// Strips every leading and trailing character that is not an ASCII letter, an ASCII digit
/// or `_`. Characters in the middle of the token (e.g. the apostrophe in `it's`) are kept.
///
/// Always returns `Some`; a token made up only of stripped characters becomes `Some("")`.
fn trimmer(token: String) -> Option<String> {
    let trimmed = token.trim_matches(|c: char| !(c.is_ascii_alphanumeric() || c == '_'));
    if trimmed.len() == token.len() {
        // Nothing was stripped, so hand the original allocation back instead of copying it.
        Some(token)
    } else {
        Some(trimmed.to_owned())
    }
}
and very not-rusty, but it +// generates identical output. + +#[derive(Clone)] +struct Stemmer { + re_mgr0: Regex, + re_mgr1: Regex, + re_meq1: Regex, + re_s_v: Regex, + + re_1a: Regex, + re2_1a: Regex, + re_1b: Regex, + re2_1b: Regex, + re2_1b_2: Regex, + re3_1b_2: Regex, + re4_1b_2: Regex, + + re_1c: Regex, + re_2: Regex, + + re_3: Regex, + + re_4: Regex, + re2_4: Regex, + + re_5: Regex, + re3_5: Regex, +} + +impl PipelineFn for Stemmer { + fn name(&self) -> String { + "stemmer".into() + } + + fn filter(&self, token: String) -> Option { + Some(self.stem(token)) + } +} + +// vowel +macro_rules! V { + () => { + "[aeiouy]" + }; +} + +// consonant sequence +macro_rules! CS { + () => { + "[^aeiou][^aeiouy]*" + }; +} + +// vowel sequence +macro_rules! VS { + () => { + "[aeiouy][aeiou]*" + }; +} + +#[inline] +fn concat_string(strs: &[&str]) -> String { + strs.iter().cloned().collect() +} + +impl Stemmer { + fn new() -> Self { + let mgr0 = concat!("^(", CS!(), ")?", VS!(), CS!()); + let meq1 = concat!("^(", CS!(), ")?", VS!(), CS!(), "(", VS!(), ")?$"); + let mgr1 = concat!("^(", CS!(), ")?", VS!(), CS!(), VS!(), CS!()); + let s_v = concat!("^(", CS!(), ")?", V!()); + + let re_mgr0 = Regex::new(mgr0).unwrap(); + let re_mgr1 = Regex::new(mgr1).unwrap(); + let re_meq1 = Regex::new(meq1).unwrap(); + let re_s_v = Regex::new(s_v).unwrap(); + + let re_1a = Regex::new("^(.+?)(ss|i)es$").unwrap(); + let re2_1a = Regex::new("^(.+?)([^s])s$").unwrap(); + let re_1b = Regex::new("^(.+?)eed$").unwrap(); + let re2_1b = Regex::new("^(.+?)(ed|ing)$").unwrap(); + let re2_1b_2 = Regex::new("(at|bl|iz)$").unwrap(); + let re3_1b_2 = Regex::new("([^aeiouylsz]{2})$").unwrap(); + let re4_1b_2 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap(); + + let re_1c = Regex::new("^(.+?[^aeiou])y$").unwrap(); + let re_2 = Regex::new( + "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\ + ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$", + ) + 
.unwrap(); + + let re_3 = Regex::new("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$").unwrap(); + + let re_4 = Regex::new( + "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$", + ) + .unwrap(); + let re2_4 = Regex::new("^(.+?)(s|t)(ion)$").unwrap(); + + let re_5 = Regex::new("^(.+?)e$").unwrap(); + let re3_5 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap(); + + Stemmer { + re_mgr0, + re_mgr1, + re_meq1, + re_s_v, + re_1a, + re2_1a, + re_1b, + re2_1b, + re2_1b_2, + re3_1b_2, + re4_1b_2, + re_1c, + re_2, + re_3, + re_4, + re2_4, + re_5, + re3_5, + } + } + + /// Implements the Porter stemming algorithm + pub fn stem(&self, mut w: String) -> String { + if w.len() < 3 { + return w; + } + + let starts_with_y = w.as_bytes()[0] == b'y'; + if starts_with_y { + w.remove(0); + w.insert(0, 'Y'); + } + + // TODO: There's probably a better way to handle the + // borrowchecker than cloning w a million times + + // Step 1a + if let Some(caps) = self.re_1a.captures(&w.clone()) { + w = concat_string(&[&caps[1], &caps[2]]); + } + if let Some(caps) = self.re2_1a.captures(&w.clone()) { + w = concat_string(&[&caps[1], &caps[2]]); + } + + // Step 1b + if let Some(caps) = self.re_1b.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_mgr0.is_match(stem) { + w.pop(); + } + } else if let Some(caps) = self.re2_1b.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_s_v.is_match(stem) { + w = stem.into(); + + let mut re3_1b_2_matched = false; + + if self.re2_1b_2.is_match(&w) { + w.push('e'); + } else if let Some(m) = self.re3_1b_2.find(&w.clone()) { + let mut suffix = m.as_str().chars(); + // Make sure the two characters are the same since we can't use backreferences + if suffix.next() == suffix.next() { + re3_1b_2_matched = true; + w.pop(); + } + } + + // re4_1b_2 still runs if re3_1b_2 matches but + // the matched chcaracters are not the same + if !re3_1b_2_matched && self.re4_1b_2.is_match(&w) { + w.push('e'); + } + } + } + 
+ // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first + // letter of the word (so cry -> cri, by -> by, say -> say) + if let Some(caps) = self.re_1c.captures(&w.clone()) { + let stem = &caps[1]; + w = concat_string(&[stem, "i"]); + } + + // Step 2 + if let Some(caps) = self.re_2.captures(&w.clone()) { + let stem = &caps[1]; + let suffix = &caps[2]; + if self.re_mgr0.is_match(stem) { + w = concat_string(&[stem, STEP_2.iter().find(|&&(k, _)| k == suffix).unwrap().1]); + } + } + + // Step 3 + if let Some(caps) = self.re_3.captures(&w.clone()) { + let stem = &caps[1]; + let suffix = &caps[2]; + if self.re_mgr0.is_match(stem) { + w = concat_string(&[stem, STEP_3.iter().find(|&&(k, _)| k == suffix).unwrap().1]); + } + } + + // Step 4 + if let Some(caps) = self.re_4.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_mgr1.is_match(stem) { + w = stem.into(); + } + } else if let Some(caps) = self.re2_4.captures(&w.clone()) { + let stem = concat_string(&[&caps[1], &caps[2]]); + if self.re_mgr1.is_match(&stem) { + w = stem; + } + } + + // Step 5 + if let Some(caps) = self.re_5.captures(&w.clone()) { + let stem = &caps[1]; + if self.re_mgr1.is_match(stem) + || (self.re_meq1.is_match(stem) && !(self.re3_5.is_match(stem))) + { + w = stem.into(); + } + } + + if w.ends_with("ll") && self.re_mgr1.is_match(&w) { + w.pop(); + } + + // replace the original 'y' + if starts_with_y { + w.remove(0); + w.insert(0, 'y'); + } + + w + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! 
pipeline_eq { + ($func:expr, $input:expr, $output:expr) => { + assert_eq!(&$func($input.to_string()).unwrap(), $output); + }; + } + + #[test] + fn latin_characters() { + pipeline_eq!(trimmer, "hello", "hello"); + } + + #[test] + fn removing_punctuation() { + pipeline_eq!(trimmer, "hello.", "hello"); + pipeline_eq!(trimmer, "it's", "it's"); + pipeline_eq!(trimmer, "james'", "james"); + pipeline_eq!(trimmer, "stop!", "stop"); + pipeline_eq!(trimmer, "first,", "first"); + pipeline_eq!(trimmer, "", ""); + pipeline_eq!(trimmer, "[tag]", "tag"); + pipeline_eq!(trimmer, "[[[tag]]]", "tag"); + pipeline_eq!(trimmer, "[[!@#@!hello]]]}}}", "hello"); + pipeline_eq!(trimmer, "~!@@@hello***()()()]]", "hello"); + } + + #[test] + fn test_stemmer() { + let cases = [ + ("consign", "consign"), + ("consigned", "consign"), + ("consigning", "consign"), + ("consignment", "consign"), + ("consist", "consist"), + ("consisted", "consist"), + ("consistency", "consist"), + ("consistent", "consist"), + ("consistently", "consist"), + ("consisting", "consist"), + ("consists", "consist"), + ("consolation", "consol"), + ("consolations", "consol"), + ("consolatory", "consolatori"), + ("console", "consol"), + ("consoled", "consol"), + ("consoles", "consol"), + ("consolidate", "consolid"), + ("consolidated", "consolid"), + ("consolidating", "consolid"), + ("consoling", "consol"), + ("consols", "consol"), + ("consonant", "conson"), + ("consort", "consort"), + ("consorted", "consort"), + ("consorting", "consort"), + ("conspicuous", "conspicu"), + ("conspicuously", "conspicu"), + ("conspiracy", "conspiraci"), + ("conspirator", "conspir"), + ("conspirators", "conspir"), + ("conspire", "conspir"), + ("conspired", "conspir"), + ("conspiring", "conspir"), + ("constable", "constabl"), + ("constables", "constabl"), + ("constance", "constanc"), + ("constancy", "constanc"), + ("constant", "constant"), + ("knack", "knack"), + ("knackeries", "knackeri"), + ("knacks", "knack"), + ("knag", "knag"), + ("knave", 
"knave"), + ("knaves", "knave"), + ("knavish", "knavish"), + ("kneaded", "knead"), + ("kneading", "knead"), + ("knee", "knee"), + ("kneel", "kneel"), + ("kneeled", "kneel"), + ("kneeling", "kneel"), + ("kneels", "kneel"), + ("knees", "knee"), + ("knell", "knell"), + ("knelt", "knelt"), + ("knew", "knew"), + ("knick", "knick"), + ("knif", "knif"), + ("knife", "knife"), + ("knight", "knight"), + ("knights", "knight"), + ("knit", "knit"), + ("knits", "knit"), + ("knitted", "knit"), + ("knitting", "knit"), + ("knives", "knive"), + ("knob", "knob"), + ("knobs", "knob"), + ("knock", "knock"), + ("knocked", "knock"), + ("knocker", "knocker"), + ("knockers", "knocker"), + ("knocking", "knock"), + ("knocks", "knock"), + ("knopp", "knopp"), + ("knot", "knot"), + ("knots", "knot"), + ("lay", "lay"), + ("try", "tri"), + ]; + + let stemmer = Stemmer::new(); + for &(input, output) in cases.iter() { + assert_eq!(&stemmer.stem(input.into()), output); + } + } +} diff --git a/vendor/elasticlunr-rs/src/lang/es.rs b/vendor/elasticlunr-rs/src/lang/es.rs new file mode 100644 index 000000000..b6c4b5bcf --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/es.rs @@ -0,0 +1,350 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Spanish {} + +impl Spanish { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Spanish { + fn name(&self) -> String { + "Spanish".into() + } + fn code(&self) -> String { + "es".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-es", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-es", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-es", Algorithm::Spanish)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "a", + "al", + "algo", + "algunas", + 
"algunos", + "ante", + "antes", + "como", + "con", + "contra", + "cual", + "cuando", + "de", + "del", + "desde", + "donde", + "durante", + "e", + "el", + "ella", + "ellas", + "ellos", + "en", + "entre", + "era", + "erais", + "eran", + "eras", + "eres", + "es", + "esa", + "esas", + "ese", + "eso", + "esos", + "esta", + "estaba", + "estabais", + "estaban", + "estabas", + "estad", + "estada", + "estadas", + "estado", + "estados", + "estamos", + "estando", + "estar", + "estaremos", + "estará", + "estarán", + "estarás", + "estaré", + "estaréis", + "estaría", + "estaríais", + "estaríamos", + "estarían", + "estarías", + "estas", + "este", + "estemos", + "esto", + "estos", + "estoy", + "estuve", + "estuviera", + "estuvierais", + "estuvieran", + "estuvieras", + "estuvieron", + "estuviese", + "estuvieseis", + "estuviesen", + "estuvieses", + "estuvimos", + "estuviste", + "estuvisteis", + "estuviéramos", + "estuviésemos", + "estuvo", + "está", + "estábamos", + "estáis", + "están", + "estás", + "esté", + "estéis", + "estén", + "estés", + "fue", + "fuera", + "fuerais", + "fueran", + "fueras", + "fueron", + "fuese", + "fueseis", + "fuesen", + "fueses", + "fui", + "fuimos", + "fuiste", + "fuisteis", + "fuéramos", + "fuésemos", + "ha", + "habida", + "habidas", + "habido", + "habidos", + "habiendo", + "habremos", + "habrá", + "habrán", + "habrás", + "habré", + "habréis", + "habría", + "habríais", + "habríamos", + "habrían", + "habrías", + "habéis", + "había", + "habíais", + "habíamos", + "habían", + "habías", + "han", + "has", + "hasta", + "hay", + "haya", + "hayamos", + "hayan", + "hayas", + "hayáis", + "he", + "hemos", + "hube", + "hubiera", + "hubierais", + "hubieran", + "hubieras", + "hubieron", + "hubiese", + "hubieseis", + "hubiesen", + "hubieses", + "hubimos", + "hubiste", + "hubisteis", + "hubiéramos", + "hubiésemos", + "hubo", + "la", + "las", + "le", + "les", + "lo", + "los", + "me", + "mi", + "mis", + "mucho", + "muchos", + "muy", + "más", + "mí", + "mía", + "mías", + 
"mío", + "míos", + "nada", + "ni", + "no", + "nos", + "nosotras", + "nosotros", + "nuestra", + "nuestras", + "nuestro", + "nuestros", + "o", + "os", + "otra", + "otras", + "otro", + "otros", + "para", + "pero", + "poco", + "por", + "porque", + "que", + "quien", + "quienes", + "qué", + "se", + "sea", + "seamos", + "sean", + "seas", + "seremos", + "será", + "serán", + "serás", + "seré", + "seréis", + "sería", + "seríais", + "seríamos", + "serían", + "serías", + "seáis", + "sido", + "siendo", + "sin", + "sobre", + "sois", + "somos", + "son", + "soy", + "su", + "sus", + "suya", + "suyas", + "suyo", + "suyos", + "sí", + "también", + "tanto", + "te", + "tendremos", + "tendrá", + "tendrán", + "tendrás", + "tendré", + "tendréis", + "tendría", + "tendríais", + "tendríamos", + "tendrían", + "tendrías", + "tened", + "tenemos", + "tenga", + "tengamos", + "tengan", + "tengas", + "tengo", + "tengáis", + "tenida", + "tenidas", + "tenido", + "tenidos", + "teniendo", + "tenéis", + "tenía", + "teníais", + "teníamos", + "tenían", + "tenías", + "ti", + "tiene", + "tienen", + "tienes", + "todo", + "todos", + "tu", + "tus", + "tuve", + "tuviera", + "tuvierais", + "tuvieran", + "tuvieras", + "tuvieron", + "tuviese", + "tuvieseis", + "tuviesen", + "tuvieses", + "tuvimos", + "tuviste", + "tuvisteis", + "tuviéramos", + "tuviésemos", + "tuvo", + "tuya", + "tuyas", + "tuyo", + "tuyos", + "tú", + "un", + "una", + "uno", + "unos", + "vosotras", + "vosotros", + "vuestra", + "vuestras", + "vuestro", + "vuestros", + "y", + "ya", + "yo", + "él", + "éramos", +]; diff --git a/vendor/elasticlunr-rs/src/lang/fi.rs b/vendor/elasticlunr-rs/src/lang/fi.rs new file mode 100644 index 000000000..91cfaa571 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/fi.rs @@ -0,0 +1,277 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Finnish {} + +impl Finnish { + pub fn new() -> Self { 
+ Self {} + } +} + +impl Language for Finnish { + fn name(&self) -> String { + "Finnish".into() + } + fn code(&self) -> String { + "fi".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-fi", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-fi", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-fi", Algorithm::Finnish)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "ei", + "eivät", + "emme", + "en", + "et", + "ette", + "että", + "he", + "heidän", + "heidät", + "heihin", + "heille", + "heillä", + "heiltä", + "heissä", + "heistä", + "heitä", + "hän", + "häneen", + "hänelle", + "hänellä", + "häneltä", + "hänen", + "hänessä", + "hänestä", + "hänet", + "häntä", + "itse", + "ja", + "johon", + "joiden", + "joihin", + "joiksi", + "joilla", + "joille", + "joilta", + "joina", + "joissa", + "joista", + "joita", + "joka", + "joksi", + "jolla", + "jolle", + "jolta", + "jona", + "jonka", + "jos", + "jossa", + "josta", + "jota", + "jotka", + "kanssa", + "keiden", + "keihin", + "keiksi", + "keille", + "keillä", + "keiltä", + "keinä", + "keissä", + "keistä", + "keitä", + "keneen", + "keneksi", + "kenelle", + "kenellä", + "keneltä", + "kenen", + "kenenä", + "kenessä", + "kenestä", + "kenet", + "ketkä", + "ketkä", + "ketä", + "koska", + "kuin", + "kuka", + "kun", + "me", + "meidän", + "meidät", + "meihin", + "meille", + "meillä", + "meiltä", + "meissä", + "meistä", + "meitä", + "mihin", + "miksi", + "mikä", + "mille", + "millä", + "miltä", + "minkä", + "minkä", + "minua", + "minulla", + "minulle", + "minulta", + "minun", + "minussa", + "minusta", + "minut", + "minuun", + "minä", + "minä", + "missä", + "mistä", + "mitkä", + "mitä", + "mukaan", + "mutta", + "ne", + "niiden", + "niihin", + "niiksi", + "niille", + "niillä", + "niiltä", + "niin", + "niin", + "niinä", + "niissä", + "niistä", + "niitä", + "noiden", 
+ "noihin", + "noiksi", + "noilla", + "noille", + "noilta", + "noin", + "noina", + "noissa", + "noista", + "noita", + "nuo", + "nyt", + "näiden", + "näihin", + "näiksi", + "näille", + "näillä", + "näiltä", + "näinä", + "näissä", + "näistä", + "näitä", + "nämä", + "ole", + "olemme", + "olen", + "olet", + "olette", + "oli", + "olimme", + "olin", + "olisi", + "olisimme", + "olisin", + "olisit", + "olisitte", + "olisivat", + "olit", + "olitte", + "olivat", + "olla", + "olleet", + "ollut", + "on", + "ovat", + "poikki", + "se", + "sekä", + "sen", + "siihen", + "siinä", + "siitä", + "siksi", + "sille", + "sillä", + "sillä", + "siltä", + "sinua", + "sinulla", + "sinulle", + "sinulta", + "sinun", + "sinussa", + "sinusta", + "sinut", + "sinuun", + "sinä", + "sinä", + "sitä", + "tai", + "te", + "teidän", + "teidät", + "teihin", + "teille", + "teillä", + "teiltä", + "teissä", + "teistä", + "teitä", + "tuo", + "tuohon", + "tuoksi", + "tuolla", + "tuolle", + "tuolta", + "tuon", + "tuona", + "tuossa", + "tuosta", + "tuota", + "tähän", + "täksi", + "tälle", + "tällä", + "tältä", + "tämä", + "tämän", + "tänä", + "tässä", + "tästä", + "tätä", + "vaan", + "vai", + "vaikka", + "yli", +]; diff --git a/vendor/elasticlunr-rs/src/lang/fr.rs b/vendor/elasticlunr-rs/src/lang/fr.rs new file mode 100644 index 000000000..ec41f307a --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/fr.rs @@ -0,0 +1,56 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct French {} + +impl French { + pub fn new() -> Self { + Self {} + } +} + +impl Language for French { + fn name(&self) -> String { + "French".into() + } + fn code(&self) -> String { + "fr".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-fr", r"\p{Latin}")), + 
Box::new(StopWordFilter::new("stopWordFilter-fr", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-fr", Algorithm::French)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", "ai", "aie", "aient", "aies", "ait", "as", "au", "aura", "aurai", "auraient", "aurais", + "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aux", "avaient", "avais", + "avait", "avec", "avez", "aviez", "avions", "avons", "ayant", "ayez", "ayons", "c", "ce", + "ceci", "celà", "ces", "cet", "cette", "d", "dans", "de", "des", "du", "elle", "en", "es", + "est", "et", "eu", "eue", "eues", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", + "eussions", "eut", "eux", "eûmes", "eût", "eûtes", "furent", "fus", "fusse", "fussent", + "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "ici", "il", "ils", "j", "je", + "l", "la", "le", "les", "leur", "leurs", "lui", "m", "ma", "mais", "me", "mes", "moi", "mon", + "même", "n", "ne", "nos", "notre", "nous", "on", "ont", "ou", "par", "pas", "pour", "qu", + "que", "quel", "quelle", "quelles", "quels", "qui", "s", "sa", "sans", "se", "sera", "serai", + "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", + "ses", "soi", "soient", "sois", "soit", "sommes", "son", "sont", "soyez", "soyons", "suis", + "sur", "t", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", "y", + "à", "étaient", "étais", "était", "étant", "étiez", "étions", "été", "étée", "étées", "étés", + "êtes", +]; diff --git a/vendor/elasticlunr-rs/src/lang/it.rs b/vendor/elasticlunr-rs/src/lang/it.rs new file mode 100644 index 000000000..78d7e4454 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/it.rs @@ -0,0 +1,321 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Italian {} + +impl Italian { + pub fn new() -> Self { + Self {} + } +} + +impl Language for 
Italian { + fn name(&self) -> String { + "Italian".into() + } + fn code(&self) -> String { + "it".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-it", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-it", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-it", Algorithm::Italian)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "a", + "abbia", + "abbiamo", + "abbiano", + "abbiate", + "ad", + "agl", + "agli", + "ai", + "al", + "all", + "alla", + "alle", + "allo", + "anche", + "avemmo", + "avendo", + "avesse", + "avessero", + "avessi", + "avessimo", + "aveste", + "avesti", + "avete", + "aveva", + "avevamo", + "avevano", + "avevate", + "avevi", + "avevo", + "avrai", + "avranno", + "avrebbe", + "avrebbero", + "avrei", + "avremmo", + "avremo", + "avreste", + "avresti", + "avrete", + "avrà", + "avrò", + "avuta", + "avute", + "avuti", + "avuto", + "c", + "che", + "chi", + "ci", + "coi", + "col", + "come", + "con", + "contro", + "cui", + "da", + "dagl", + "dagli", + "dai", + "dal", + "dall", + "dalla", + "dalle", + "dallo", + "degl", + "degli", + "dei", + "del", + "dell", + "della", + "delle", + "dello", + "di", + "dov", + "dove", + "e", + "ebbe", + "ebbero", + "ebbi", + "ed", + "era", + "erano", + "eravamo", + "eravate", + "eri", + "ero", + "essendo", + "faccia", + "facciamo", + "facciano", + "facciate", + "faccio", + "facemmo", + "facendo", + "facesse", + "facessero", + "facessi", + "facessimo", + "faceste", + "facesti", + "faceva", + "facevamo", + "facevano", + "facevate", + "facevi", + "facevo", + "fai", + "fanno", + "farai", + "faranno", + "farebbe", + "farebbero", + "farei", + "faremmo", + "faremo", + "fareste", + "faresti", + "farete", + "farà", + "farò", + "fece", + "fecero", + "feci", + "fosse", + "fossero", + "fossi", + "fossimo", + "foste", + "fosti", + "fu", + "fui", + "fummo", + 
"furono", + "gli", + "ha", + "hai", + "hanno", + "ho", + "i", + "il", + "in", + "io", + "l", + "la", + "le", + "lei", + "li", + "lo", + "loro", + "lui", + "ma", + "mi", + "mia", + "mie", + "miei", + "mio", + "ne", + "negl", + "negli", + "nei", + "nel", + "nell", + "nella", + "nelle", + "nello", + "noi", + "non", + "nostra", + "nostre", + "nostri", + "nostro", + "o", + "per", + "perché", + "più", + "quale", + "quanta", + "quante", + "quanti", + "quanto", + "quella", + "quelle", + "quelli", + "quello", + "questa", + "queste", + "questi", + "questo", + "sarai", + "saranno", + "sarebbe", + "sarebbero", + "sarei", + "saremmo", + "saremo", + "sareste", + "saresti", + "sarete", + "sarà", + "sarò", + "se", + "sei", + "si", + "sia", + "siamo", + "siano", + "siate", + "siete", + "sono", + "sta", + "stai", + "stando", + "stanno", + "starai", + "staranno", + "starebbe", + "starebbero", + "starei", + "staremmo", + "staremo", + "stareste", + "staresti", + "starete", + "starà", + "starò", + "stava", + "stavamo", + "stavano", + "stavate", + "stavi", + "stavo", + "stemmo", + "stesse", + "stessero", + "stessi", + "stessimo", + "steste", + "stesti", + "stette", + "stettero", + "stetti", + "stia", + "stiamo", + "stiano", + "stiate", + "sto", + "su", + "sua", + "sue", + "sugl", + "sugli", + "sui", + "sul", + "sull", + "sulla", + "sulle", + "sullo", + "suo", + "suoi", + "ti", + "tra", + "tu", + "tua", + "tue", + "tuo", + "tuoi", + "tutti", + "tutto", + "un", + "una", + "uno", + "vi", + "voi", + "vostra", + "vostre", + "vostri", + "vostro", + "è", +]; diff --git a/vendor/elasticlunr-rs/src/lang/ja.rs b/vendor/elasticlunr-rs/src/lang/ja.rs new file mode 100644 index 000000000..e38fcde9f --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/ja.rs @@ -0,0 +1,76 @@ +use super::{common::RegexTrimmer, Language}; +use crate::pipeline::{FnWrapper, Pipeline}; +use lindera::tokenizer::{Tokenizer, TokenizerConfig}; +use lindera_core::viterbi::Mode; + +#[derive(Clone)] +pub struct Japanese { + 
tokenizer: Tokenizer, +} + +impl Japanese { + pub fn new() -> Self { + let config = TokenizerConfig { + mode: Mode::Decompose(Default::default()), + ..Default::default() + }; + Self::with_config(config) + } + + pub fn with_config(config: TokenizerConfig) -> Self { + // NB: unwrap() is okay since the errors are only related to user-supplied dictionaries. + let tokenizer = Tokenizer::with_config(config).unwrap(); + Self { tokenizer } + } +} + +impl Language for Japanese { + fn name(&self) -> String { + "Japanese".into() + } + fn code(&self) -> String { + "ja".into() + } + + fn tokenize(&self, text: &str) -> Vec { + self.tokenizer + .tokenize(text) + .unwrap() + .into_iter() + .filter_map(|tok| match tok.detail.get(0).map(|d| d.as_str()) { + Some("助詞") | Some("助動詞") | Some("記号") | Some("UNK") => None, + _ => Some(tok.text.to_string()), + }) + .collect() + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-ja", WORD_CHARS)), + Box::new(FnWrapper("stemmer-ja".into(), stemmer)), + ], + } + } +} + +const WORD_CHARS: &str = r"0-9A-Za-z\p{Hiragana}\p{Katakana}\p{Unified_Ideograph}"; + +fn stemmer(token: String) -> Option { + Some(token) +} + +#[cfg(test)] +mod tests { + use crate::pipeline::PipelineFn; + use super::*; + + #[test] + fn test_trimmer() { + let trimmer = RegexTrimmer::new("trimmer-ja".into(), WORD_CHARS); + assert_eq!( + trimmer.filter(" こんにちは、世界!".to_string()), + Some("こんにちは、世界".to_string()) + ); + } +} diff --git a/vendor/elasticlunr-rs/src/lang/mod.rs b/vendor/elasticlunr-rs/src/lang/mod.rs new file mode 100644 index 000000000..81966e1b2 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/mod.rs @@ -0,0 +1,138 @@ +//! Intended to be compatible with . Each supported +//! language has a trimmer, a stop word filter, and a stemmer. Most users will not need to use +//! these modules directly. 
/// Splits a text string into a vector of individual tokens.
///
/// The text is split at every Unicode whitespace character and at every `-`. Empty pieces
/// (from runs of separators, or separators at either end) are dropped, and each remaining
/// piece is lowercased.
pub fn tokenize_whitespace(text: &str) -> Vec<String> {
    // No `trim()` is needed on the pieces: every whitespace character is already a separator,
    // so a piece can never start or end with one.
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|piece| !piece.is_empty())
        .map(str::to_lowercase)
        .collect()
}
+ pub use $code::$name; + )+ + }; +} + +impl_language! { + (English, en), + (Arabic, ar, #[cfg(feature = "ar")]), + (Chinese, zh, #[cfg(feature = "zh")]), + (Danish, da, #[cfg(feature = "da")]), + (Dutch, du, #[cfg(feature = "du")]), + (Finnish, fi, #[cfg(feature = "fi")]), + (French, fr, #[cfg(feature = "fr")]), + (German, de, #[cfg(feature = "de")]), + (Italian, it, #[cfg(feature = "it")]), + (Japanese, ja, #[cfg(feature = "ja")]), + (Norwegian, no, #[cfg(feature = "no")]), + (Portuguese, pt, #[cfg(feature = "pt")]), + (Romanian, ro, #[cfg(feature = "ro")]), + (Russian, ru, #[cfg(feature = "ru")]), + (Spanish, es, #[cfg(feature = "es")]), + (Swedish, sv, #[cfg(feature = "sv")]), + (Turkish, tr, #[cfg(feature = "tr")]), +} + +#[cfg(test)] +mod tests { + use super::tokenize_whitespace; + + #[test] + fn split_simple_strings() { + let string = "this is a simple string"; + assert_eq!( + &tokenize_whitespace(string), + &["this", "is", "a", "simple", "string"] + ); + } + + #[test] + fn multiple_white_space() { + let string = " foo bar "; + assert_eq!(&tokenize_whitespace(string), &["foo", "bar"]); + } + + #[test] + fn hyphens() { + let string = "take the New York-San Francisco flight"; + assert_eq!( + &tokenize_whitespace(string), + &["take", "the", "new", "york", "san", "francisco", "flight"] + ); + } + + #[test] + fn splitting_strings_with_hyphens() { + let string = "Solve for A - B"; + assert_eq!(&tokenize_whitespace(string), &["solve", "for", "a", "b"]); + } +} diff --git a/vendor/elasticlunr-rs/src/lang/no.rs b/vendor/elasticlunr-rs/src/lang/no.rs new file mode 100644 index 000000000..710346fde --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/no.rs @@ -0,0 +1,218 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Norwegian {} + +impl Norwegian { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Norwegian { + fn 
name(&self) -> String { + "Norwegian".into() + } + fn code(&self) -> String { + "no".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-no", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-no", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-no", Algorithm::Norwegian)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "alle", + "at", + "av", + "bare", + "begge", + "ble", + "blei", + "bli", + "blir", + "blitt", + "både", + "båe", + "da", + "de", + "deg", + "dei", + "deim", + "deira", + "deires", + "dem", + "den", + "denne", + "der", + "dere", + "deres", + "det", + "dette", + "di", + "din", + "disse", + "ditt", + "du", + "dykk", + "dykkar", + "då", + "eg", + "ein", + "eit", + "eitt", + "eller", + "elles", + "en", + "enn", + "er", + "et", + "ett", + "etter", + "for", + "fordi", + "fra", + "før", + "ha", + "hadde", + "han", + "hans", + "har", + "hennar", + "henne", + "hennes", + "her", + "hjå", + "ho", + "hoe", + "honom", + "hoss", + "hossen", + "hun", + "hva", + "hvem", + "hver", + "hvilke", + "hvilken", + "hvis", + "hvor", + "hvordan", + "hvorfor", + "i", + "ikke", + "ikkje", + "ikkje", + "ingen", + "ingi", + "inkje", + "inn", + "inni", + "ja", + "jeg", + "kan", + "kom", + "korleis", + "korso", + "kun", + "kunne", + "kva", + "kvar", + "kvarhelst", + "kven", + "kvi", + "kvifor", + "man", + "mange", + "me", + "med", + "medan", + "meg", + "meget", + "mellom", + "men", + "mi", + "min", + "mine", + "mitt", + "mot", + "mykje", + "ned", + "no", + "noe", + "noen", + "noka", + "noko", + "nokon", + "nokor", + "nokre", + "nå", + "når", + "og", + "også", + "om", + "opp", + "oss", + "over", + "på", + "samme", + "seg", + "selv", + "si", + "si", + "sia", + "sidan", + "siden", + "sin", + "sine", + "sitt", + "sjøl", + "skal", + "skulle", + "slik", + "so", + "som", + "som", + "somme", + "somt", + "så", + 
"sånn", + "til", + "um", + "upp", + "ut", + "uten", + "var", + "vart", + "varte", + "ved", + "vere", + "verte", + "vi", + "vil", + "ville", + "vore", + "vors", + "vort", + "vår", + "være", + "være", + "vært", + "å", +]; diff --git a/vendor/elasticlunr-rs/src/lang/pt.rs b/vendor/elasticlunr-rs/src/lang/pt.rs new file mode 100644 index 000000000..5f36f4280 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/pt.rs @@ -0,0 +1,245 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Portuguese {} + +impl Portuguese { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Portuguese { + fn name(&self) -> String { + "Portuguese".into() + } + fn code(&self) -> String { + "pt".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-pt", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-pt", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-pt", Algorithm::Portuguese)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "a", + "ao", + "aos", + "aquela", + "aquelas", + "aquele", + "aqueles", + "aquilo", + "as", + "até", + "com", + "como", + "da", + "das", + "de", + "dela", + "delas", + "dele", + "deles", + "depois", + "do", + "dos", + "e", + "ela", + "elas", + "ele", + "eles", + "em", + "entre", + "era", + "eram", + "essa", + "essas", + "esse", + "esses", + "esta", + "estamos", + "estas", + "estava", + "estavam", + "este", + "esteja", + "estejam", + "estejamos", + "estes", + "esteve", + "estive", + "estivemos", + "estiver", + "estivera", + "estiveram", + "estiverem", + "estivermos", + "estivesse", + "estivessem", + "estivéramos", + "estivéssemos", + "estou", + "está", + "estávamos", + "estão", + "eu", + "foi", + "fomos", + "for", + "fora", + "foram", + "forem", + "formos", + 
"fosse", + "fossem", + "fui", + "fôramos", + "fôssemos", + "haja", + "hajam", + "hajamos", + "havemos", + "hei", + "houve", + "houvemos", + "houver", + "houvera", + "houveram", + "houverei", + "houverem", + "houveremos", + "houveria", + "houveriam", + "houvermos", + "houverá", + "houverão", + "houveríamos", + "houvesse", + "houvessem", + "houvéramos", + "houvéssemos", + "há", + "hão", + "isso", + "isto", + "já", + "lhe", + "lhes", + "mais", + "mas", + "me", + "mesmo", + "meu", + "meus", + "minha", + "minhas", + "muito", + "na", + "nas", + "nem", + "no", + "nos", + "nossa", + "nossas", + "nosso", + "nossos", + "num", + "numa", + "não", + "nós", + "o", + "os", + "ou", + "para", + "pela", + "pelas", + "pelo", + "pelos", + "por", + "qual", + "quando", + "que", + "quem", + "se", + "seja", + "sejam", + "sejamos", + "sem", + "serei", + "seremos", + "seria", + "seriam", + "será", + "serão", + "seríamos", + "seu", + "seus", + "somos", + "sou", + "sua", + "suas", + "são", + "só", + "também", + "te", + "tem", + "temos", + "tenha", + "tenham", + "tenhamos", + "tenho", + "terei", + "teremos", + "teria", + "teriam", + "terá", + "terão", + "teríamos", + "teu", + "teus", + "teve", + "tinha", + "tinham", + "tive", + "tivemos", + "tiver", + "tivera", + "tiveram", + "tiverem", + "tivermos", + "tivesse", + "tivessem", + "tivéramos", + "tivéssemos", + "tu", + "tua", + "tuas", + "tém", + "tínhamos", + "um", + "uma", + "você", + "vocês", + "vos", + "à", + "às", + "éramos", +]; diff --git a/vendor/elasticlunr-rs/src/lang/ro.rs b/vendor/elasticlunr-rs/src/lang/ro.rs new file mode 100644 index 000000000..8244fe967 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/ro.rs @@ -0,0 +1,323 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Romanian {} + +impl Romanian { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Romanian { + fn name(&self) -> 
String { + "Romanian".into() + } + fn code(&self) -> String { + "ro".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-ro", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-ro", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-ro", Algorithm::Romanian)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "acea", + "aceasta", + "această", + "aceea", + "acei", + "aceia", + "acel", + "acela", + "acele", + "acelea", + "acest", + "acesta", + "aceste", + "acestea", + "aceşti", + "aceştia", + "acolo", + "acord", + "acum", + "ai", + "aia", + "aibă", + "aici", + "al", + "ale", + "alea", + "altceva", + "altcineva", + "am", + "ar", + "are", + "asemenea", + "asta", + "astea", + "astăzi", + "asupra", + "au", + "avea", + "avem", + "aveţi", + "azi", + "aş", + "aşadar", + "aţi", + "bine", + "bucur", + "bună", + "ca", + "care", + "caut", + "ce", + "cel", + "ceva", + "chiar", + "cinci", + "cine", + "cineva", + "contra", + "cu", + "cum", + "cumva", + "curând", + "curînd", + "când", + "cât", + "câte", + "câtva", + "câţi", + "cînd", + "cît", + "cîte", + "cîtva", + "cîţi", + "că", + "căci", + "cărei", + "căror", + "cărui", + "către", + "da", + "dacă", + "dar", + "datorită", + "dată", + "dau", + "de", + "deci", + "deja", + "deoarece", + "departe", + "deşi", + "din", + "dinaintea", + "dintr-", + "dintre", + "doi", + "doilea", + "două", + "drept", + "după", + "dă", + "ea", + "ei", + "el", + "ele", + "eram", + "este", + "eu", + "eşti", + "face", + "fata", + "fi", + "fie", + "fiecare", + "fii", + "fim", + "fiu", + "fiţi", + "frumos", + "fără", + "graţie", + "halbă", + "iar", + "ieri", + "la", + "le", + "li", + "lor", + "lui", + "lângă", + "lîngă", + "mai", + "mea", + "mei", + "mele", + "mereu", + "meu", + "mi", + "mie", + "mine", + "mult", + "multă", + "mulţi", + "mulţumesc", + "mâine", + "mîine", + "mă", + "ne", + 
"nevoie", + "nici", + "nicăieri", + "nimeni", + "nimeri", + "nimic", + "nişte", + "noastre", + "noastră", + "noi", + "noroc", + "nostru", + "nouă", + "noştri", + "nu", + "opt", + "ori", + "oricare", + "orice", + "oricine", + "oricum", + "oricând", + "oricât", + "oricînd", + "oricît", + "oriunde", + "patra", + "patru", + "patrulea", + "pe", + "pentru", + "peste", + "pic", + "poate", + "pot", + "prea", + "prima", + "primul", + "prin", + "puţin", + "puţina", + "puţină", + "până", + "pînă", + "rog", + "sa", + "sale", + "sau", + "se", + "spate", + "spre", + "sub", + "sunt", + "suntem", + "sunteţi", + "sută", + "sînt", + "sîntem", + "sînteţi", + "să", + "săi", + "său", + "ta", + "tale", + "te", + "timp", + "tine", + "toate", + "toată", + "tot", + "totuşi", + "toţi", + "trei", + "treia", + "treilea", + "tu", + "tăi", + "tău", + "un", + "una", + "unde", + "undeva", + "unei", + "uneia", + "unele", + "uneori", + "unii", + "unor", + "unora", + "unu", + "unui", + "unuia", + "unul", + "vi", + "voastre", + "voastră", + "voi", + "vostru", + "vouă", + "voştri", + "vreme", + "vreo", + "vreun", + "vă", + "zece", + "zero", + "zi", + "zice", + "îi", + "îl", + "îmi", + "împotriva", + "în", + "înainte", + "înaintea", + "încotro", + "încât", + "încît", + "între", + "întrucât", + "întrucît", + "îţi", + "ăla", + "ălea", + "ăsta", + "ăstea", + "ăştia", + "şapte", + "şase", + "şi", + "ştiu", + "ţi", + "ţie", +]; diff --git a/vendor/elasticlunr-rs/src/lang/ru.rs b/vendor/elasticlunr-rs/src/lang/ru.rs new file mode 100644 index 000000000..6b210d540 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/ru.rs @@ -0,0 +1,463 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Russian {} + +impl Russian { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Russian { + fn name(&self) -> String { + "Russian".into() + } + fn code(&self) -> String { + "ru".into() + 
} + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-ru", r"\p{Cyrillic}")), + Box::new(StopWordFilter::new("stopWordFilter-ru", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-ru", Algorithm::Russian)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "алло", + "без", + "близко", + "более", + "больше", + "будем", + "будет", + "будете", + "будешь", + "будто", + "буду", + "будут", + "будь", + "бы", + "бывает", + "бывь", + "был", + "была", + "были", + "было", + "быть", + "в", + "важная", + "важное", + "важные", + "важный", + "вам", + "вами", + "вас", + "ваш", + "ваша", + "ваше", + "ваши", + "вверх", + "вдали", + "вдруг", + "ведь", + "везде", + "весь", + "вниз", + "внизу", + "во", + "вокруг", + "вон", + "восемнадцатый", + "восемнадцать", + "восемь", + "восьмой", + "вот", + "впрочем", + "времени", + "время", + "все", + "всегда", + "всего", + "всем", + "всеми", + "всему", + "всех", + "всею", + "всю", + "всюду", + "вся", + "всё", + "второй", + "вы", + "г", + "где", + "говорил", + "говорит", + "год", + "года", + "году", + "да", + "давно", + "даже", + "далеко", + "дальше", + "даром", + "два", + "двадцатый", + "двадцать", + "две", + "двенадцатый", + "двенадцать", + "двух", + "девятнадцатый", + "девятнадцать", + "девятый", + "девять", + "действительно", + "дел", + "день", + "десятый", + "десять", + "для", + "до", + "довольно", + "долго", + "должно", + "другая", + "другие", + "других", + "друго", + "другое", + "другой", + "е", + "его", + "ее", + "ей", + "ему", + "если", + "есть", + "еще", + "ещё", + "ею", + "её", + "ж", + "же", + "жизнь", + "за", + "занят", + "занята", + "занято", + "заняты", + "затем", + "зато", + "зачем", + "здесь", + "значит", + "и", + "из", + "или", + "им", + "именно", + "иметь", + "ими", + "имя", + "иногда", + "их", + "к", + "каждая", + "каждое", + "каждые", + "каждый", + "кажется", + "как", + "какая", + 
"какой", + "кем", + "когда", + "кого", + "ком", + "кому", + "конечно", + "которая", + "которого", + "которой", + "которые", + "который", + "которых", + "кроме", + "кругом", + "кто", + "куда", + "лет", + "ли", + "лишь", + "лучше", + "люди", + "м", + "мало", + "между", + "меля", + "менее", + "меньше", + "меня", + "миллионов", + "мимо", + "мира", + "мне", + "много", + "многочисленная", + "многочисленное", + "многочисленные", + "многочисленный", + "мной", + "мною", + "мог", + "могут", + "мож", + "может", + "можно", + "можхо", + "мои", + "мой", + "мор", + "мочь", + "моя", + "моё", + "мы", + "на", + "наверху", + "над", + "надо", + "назад", + "наиболее", + "наконец", + "нам", + "нами", + "нас", + "начала", + "наш", + "наша", + "наше", + "наши", + "не", + "него", + "недавно", + "недалеко", + "нее", + "ней", + "нельзя", + "нем", + "немного", + "нему", + "непрерывно", + "нередко", + "несколько", + "нет", + "нею", + "неё", + "ни", + "нибудь", + "ниже", + "низко", + "никогда", + "никуда", + "ними", + "них", + "ничего", + "но", + "ну", + "нужно", + "нх", + "о", + "об", + "оба", + "обычно", + "один", + "одиннадцатый", + "одиннадцать", + "однажды", + "однако", + "одного", + "одной", + "около", + "он", + "она", + "они", + "оно", + "опять", + "особенно", + "от", + "отовсюду", + "отсюда", + "очень", + "первый", + "перед", + "по", + "под", + "пожалуйста", + "позже", + "пока", + "пор", + "пора", + "после", + "посреди", + "потом", + "потому", + "почему", + "почти", + "прекрасно", + "при", + "про", + "просто", + "против", + "процентов", + "пятнадцатый", + "пятнадцать", + "пятый", + "пять", + "раз", + "разве", + "рано", + "раньше", + "рядом", + "с", + "сам", + "сама", + "сами", + "самим", + "самими", + "самих", + "само", + "самого", + "самой", + "самом", + "самому", + "саму", + "свое", + "своего", + "своей", + "свои", + "своих", + "свою", + "сеаой", + "себе", + "себя", + "сегодня", + "седьмой", + "сейчас", + "семнадцатый", + "семнадцать", + "семь", + "сих", + "сказал", + "сказала", + 
"сказать", + "сколько", + "слишком", + "сначала", + "снова", + "со", + "собой", + "собою", + "совсем", + "спасибо", + "стал", + "суть", + "т", + "та", + "так", + "такая", + "также", + "такие", + "такое", + "такой", + "там", + "твой", + "твоя", + "твоё", + "те", + "тебе", + "тебя", + "тем", + "теми", + "теперь", + "тех", + "то", + "тобой", + "тобою", + "тогда", + "того", + "тоже", + "только", + "том", + "тому", + "тот", + "тою", + "третий", + "три", + "тринадцатый", + "тринадцать", + "ту", + "туда", + "тут", + "ты", + "тысяч", + "у", + "уж", + "уже", + "уметь", + "хорошо", + "хотеть", + "хоть", + "хотя", + "хочешь", + "часто", + "чаще", + "чего", + "человек", + "чем", + "чему", + "через", + "четвертый", + "четыре", + "четырнадцатый", + "четырнадцать", + "что", + "чтоб", + "чтобы", + "чуть", + "шестнадцатый", + "шестнадцать", + "шестой", + "шесть", + "эта", + "эти", + "этим", + "этими", + "этих", + "это", + "этого", + "этой", + "этом", + "этому", + "этот", + "эту", + "я", + "а", +]; diff --git a/vendor/elasticlunr-rs/src/lang/sv.rs b/vendor/elasticlunr-rs/src/lang/sv.rs new file mode 100644 index 000000000..29beeb7b1 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/sv.rs @@ -0,0 +1,51 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Swedish {} + +impl Swedish { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Swedish { + fn name(&self) -> String { + "Swedish".into() + } + fn code(&self) -> String { + "sv".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-sv", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-sv", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-sv", Algorithm::Swedish)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", "alla", 
"allt", "att", "av", "blev", "bli", "blir", "blivit", "de", "dem", "den", "denna", + "deras", "dess", "dessa", "det", "detta", "dig", "din", "dina", "ditt", "du", "där", "då", + "efter", "ej", "eller", "en", "er", "era", "ert", "ett", "från", "för", "ha", "hade", "han", + "hans", "har", "henne", "hennes", "hon", "honom", "hur", "här", "i", "icke", "ingen", "inom", + "inte", "jag", "ju", "kan", "kunde", "man", "med", "mellan", "men", "mig", "min", "mina", + "mitt", "mot", "mycket", "ni", "nu", "när", "någon", "något", "några", "och", "om", "oss", + "på", "samma", "sedan", "sig", "sin", "sina", "sitta", "själv", "skulle", "som", "så", "sådan", + "sådana", "sådant", "till", "under", "upp", "ut", "utan", "vad", "var", "vara", "varför", + "varit", "varje", "vars", "vart", "vem", "vi", "vid", "vilka", "vilkas", "vilken", "vilket", + "vår", "våra", "vårt", "än", "är", "åt", "över", +]; diff --git a/vendor/elasticlunr-rs/src/lang/tr.rs b/vendor/elasticlunr-rs/src/lang/tr.rs new file mode 100644 index 000000000..1aea580fa --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/tr.rs @@ -0,0 +1,251 @@ +use super::{ + common::{RustStemmer, StopWordFilter, RegexTrimmer}, + Language, +}; +use crate::pipeline::Pipeline; +use rust_stemmers::Algorithm; + +#[derive(Clone)] +pub struct Turkish {} + +impl Turkish { + pub fn new() -> Self { + Self {} + } +} + +impl Language for Turkish { + fn name(&self) -> String { + "Turkish".into() + } + fn code(&self) -> String { + "tr".into() + } + + fn tokenize(&self, text: &str) -> Vec { + super::tokenize_whitespace(text) + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-tr", r"\p{Latin}")), + Box::new(StopWordFilter::new("stopWordFilter-tr", STOP_WORDS)), + Box::new(RustStemmer::new("stemmer-tr", Algorithm::Turkish)), + ], + } + } +} + +const STOP_WORDS: &[&str] = &[ + "", + "acaba", + "altmış", + "altı", + "ama", + "ancak", + "arada", + "aslında", + "ayrıca", + "bana", + "bazı", + 
"belki", + "ben", + "benden", + "beni", + "benim", + "beri", + "beş", + "bile", + "bin", + "bir", + "biri", + "birkaç", + "birkez", + "birçok", + "birşey", + "birşeyi", + "biz", + "bizden", + "bize", + "bizi", + "bizim", + "bu", + "buna", + "bunda", + "bundan", + "bunlar", + "bunları", + "bunların", + "bunu", + "bunun", + "burada", + "böyle", + "böylece", + "da", + "daha", + "dahi", + "de", + "defa", + "değil", + "diye", + "diğer", + "doksan", + "dokuz", + "dolayı", + "dolayısıyla", + "dört", + "edecek", + "eden", + "ederek", + "edilecek", + "ediliyor", + "edilmesi", + "ediyor", + "elli", + "en", + "etmesi", + "etti", + "ettiği", + "ettiğini", + "eğer", + "gibi", + "göre", + "halen", + "hangi", + "hatta", + "hem", + "henüz", + "hep", + "hepsi", + "her", + "herhangi", + "herkesin", + "hiç", + "hiçbir", + "iki", + "ile", + "ilgili", + "ise", + "itibaren", + "itibariyle", + "için", + "işte", + "kadar", + "karşın", + "katrilyon", + "kendi", + "kendilerine", + "kendini", + "kendisi", + "kendisine", + "kendisini", + "kez", + "ki", + "kim", + "kimden", + "kime", + "kimi", + "kimse", + "kırk", + "milyar", + "milyon", + "mu", + "mü", + "mı", + "nasıl", + "ne", + "neden", + "nedenle", + "nerde", + "nerede", + "nereye", + "niye", + "niçin", + "o", + "olan", + "olarak", + "oldu", + "olduklarını", + "olduğu", + "olduğunu", + "olmadı", + "olmadığı", + "olmak", + "olması", + "olmayan", + "olmaz", + "olsa", + "olsun", + "olup", + "olur", + "olursa", + "oluyor", + "on", + "ona", + "ondan", + "onlar", + "onlardan", + "onları", + "onların", + "onu", + "onun", + "otuz", + "oysa", + "pek", + "rağmen", + "sadece", + "sanki", + "sekiz", + "seksen", + "sen", + "senden", + "seni", + "senin", + "siz", + "sizden", + "sizi", + "sizin", + "tarafından", + "trilyon", + "tüm", + "var", + "vardı", + "ve", + "veya", + "ya", + "yani", + "yapacak", + "yapmak", + "yaptı", + "yaptıkları", + "yaptığı", + "yaptığını", + "yapılan", + "yapılması", + "yapıyor", + "yedi", + "yerine", + "yetmiş", + "yine", + 
"yirmi", + "yoksa", + "yüz", + "zaten", + "çok", + "çünkü", + "öyle", + "üzere", + "üç", + "şey", + "şeyden", + "şeyi", + "şeyler", + "şu", + "şuna", + "şunda", + "şundan", + "şunları", + "şunu", + "şöyle", +]; diff --git a/vendor/elasticlunr-rs/src/lang/zh.rs b/vendor/elasticlunr-rs/src/lang/zh.rs new file mode 100644 index 000000000..aa10d758f --- /dev/null +++ b/vendor/elasticlunr-rs/src/lang/zh.rs @@ -0,0 +1,55 @@ +use super::{common::RegexTrimmer, Language}; +use crate::pipeline::{FnWrapper, Pipeline}; + +#[derive(Clone)] +pub struct Chinese { + jieba: jieba_rs::Jieba, +} + +impl Chinese { + pub fn new() -> Self { + Self { + jieba: jieba_rs::Jieba::new(), + } + } +} + +impl Language for Chinese { + fn name(&self) -> String { + "Chinese".into() + } + fn code(&self) -> String { + "zh".into() + } + + fn tokenize(&self, text: &str) -> Vec { + self.jieba + .cut_for_search(text, false) + .iter() + .map(|s| s.to_string()) + .collect() + } + + fn make_pipeline(&self) -> Pipeline { + Pipeline { + queue: vec![ + Box::new(RegexTrimmer::new("trimmer-zh", r"\p{Unified_Ideograph}\p{Latin}")), + Box::new(FnWrapper("stopWordFilter-zh".into(), stop_word_filter)), + Box::new(FnWrapper("stemmer-zh".into(), stemmer)), + ], + } + } +} + +// TODO: lunr.zh.js has a much larger set of stop words +fn stop_word_filter(token: String) -> Option { + match token.as_str() { + "的" | "了" => None, + _ => Some(token), + } +} + +// lunr.zh.js has an empty stemmer as well +fn stemmer(token: String) -> Option { + Some(token) +} diff --git a/vendor/elasticlunr-rs/src/lib.rs b/vendor/elasticlunr-rs/src/lib.rs new file mode 100644 index 000000000..3efcf4629 --- /dev/null +++ b/vendor/elasticlunr-rs/src/lib.rs @@ -0,0 +1,413 @@ +//!# elasticlunr-rs +//! +//! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs) +//! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs) +//! 
[![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs) +//! +//! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to +//! be used for generating compatible search indices. +//! +//! Access to all index-generating functionality is provided. Most users will only need to use the +//! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types. +//! +//! The [`Language`] trait can be used to implement a custom language. +//! +//! ## Example +//! +//! ``` +//! use std::fs::File; +//! use std::io::Write; +//! use elasticlunr::Index; +//! +//! let mut index = Index::new(&["title", "body"]); +//! index.add_doc("1", &["This is a title", "This is body text!"]); +//! // Add more docs... +//! let mut file = File::create("out.json").unwrap(); +//! file.write_all(index.to_json_pretty().as_bytes()); +//! ``` + +#[macro_use] +extern crate serde_derive; + +#[cfg(test)] +#[macro_use] +extern crate maplit; + +/// The version of elasticlunr.js this library was designed for. +pub const ELASTICLUNR_VERSION: &str = "0.9.5"; + +pub mod config; +pub mod document_store; +pub mod inverted_index; +pub mod lang; +pub mod pipeline; + +use std::collections::BTreeMap; + +use document_store::DocumentStore; +use inverted_index::InvertedIndex; +use lang::English; +pub use lang::Language; +pub use pipeline::Pipeline; + +type Tokenizer = Option Vec>>; + +/// A builder for an `Index` with custom parameters. 
+/// +/// # Example +/// ``` +/// # use elasticlunr::{Index, IndexBuilder}; +/// let mut index = IndexBuilder::new() +/// .save_docs(false) +/// .add_fields(&["title", "subtitle", "body"]) +/// .set_ref("doc_id") +/// .build(); +/// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]); +/// ``` +pub struct IndexBuilder { + save: bool, + fields: Vec, + field_tokenizers: Vec, + ref_field: String, + pipeline: Option, + language: Box, +} + +impl Default for IndexBuilder { + fn default() -> Self { + IndexBuilder { + save: true, + fields: Vec::new(), + field_tokenizers: Vec::new(), + ref_field: "id".into(), + pipeline: None, + language: Box::new(English::new()), + } + } +} + +impl IndexBuilder { + pub fn new() -> Self { + Default::default() + } + + pub fn with_language(language: Box) -> Self { + Self { + language, + ..Default::default() + } + } + + /// Set whether or not documents should be saved in the `Index`'s document store. + pub fn save_docs(mut self, save: bool) -> Self { + self.save = save; + self + } + + /// Add a document field to the `Index`. + /// + /// # Panics + /// + /// Panics if a field with the name already exists. + pub fn add_field(mut self, field: &str) -> Self { + let field = field.into(); + if self.fields.contains(&field) { + panic!("Duplicate fields in index: {}", field); + } + self.fields.push(field); + self.field_tokenizers.push(None); + self + } + + /// Add a document field to the `Index`, with a custom tokenizer for that field. + /// + /// # Panics + /// + /// Panics if a field with the name already exists. + pub fn add_field_with_tokenizer( + mut self, + field: &str, + tokenizer: Box Vec>, + ) -> Self { + let field = field.into(); + if self.fields.contains(&field) { + panic!("Duplicate fields in index: {}", field); + } + self.fields.push(field); + self.field_tokenizers.push(Some(tokenizer)); + self + } + + /// Add the document fields to the `Index`. + /// + /// # Panics + /// + /// Panics if two fields have the same name. 
+ pub fn add_fields(mut self, fields: I) -> Self + where + I: IntoIterator, + I::Item: AsRef, + { + for field in fields { + self = self.add_field(field.as_ref()) + } + self + } + + /// Set the key used to store the document reference field. + pub fn set_ref(mut self, ref_field: &str) -> Self { + self.ref_field = ref_field.into(); + self + } + + /// Build an `Index` from this builder. + pub fn build(self) -> Index { + let IndexBuilder { + save, + fields, + field_tokenizers, + ref_field, + pipeline, + language, + } = self; + + let index = fields + .iter() + .map(|f| (f.clone(), InvertedIndex::new())) + .collect(); + + let pipeline = pipeline.unwrap_or_else(|| language.make_pipeline()); + + Index { + index, + fields: fields, + field_tokenizers: field_tokenizers, + ref_field: ref_field, + document_store: DocumentStore::new(save), + pipeline, + version: crate::ELASTICLUNR_VERSION, + lang: language, + } + } +} + +/// An elasticlunr search index. +#[derive(Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Index { + fields: Vec, + #[serde(skip)] + field_tokenizers: Vec, + pipeline: Pipeline, + #[serde(rename = "ref")] + ref_field: String, + version: &'static str, + index: BTreeMap, + document_store: DocumentStore, + #[serde(with = "ser_lang")] + lang: Box, +} + +mod ser_lang { + use crate::Language; + use serde::de; + use serde::{Deserializer, Serializer}; + use std::fmt; + + pub fn serialize(lang: &Box, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&lang.name()) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> + where + D: Deserializer<'de>, + { + deserializer.deserialize_str(LanguageVisitor) + } + + struct LanguageVisitor; + + impl<'de> de::Visitor<'de> for LanguageVisitor { + type Value = Box; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a capitalized language name") + } + + fn visit_borrowed_str(self, v: &'de str) -> Result + where + E: 
de::Error, + { + match crate::lang::from_name(v) { + Some(l) => Ok(l), + None => Err(E::custom(format!("Unknown language name: {}", v))), + } + } + } +} + +impl Index { + /// Create a new index with the provided fields. + /// + /// # Example + /// + /// ``` + /// # use elasticlunr::{Index}; + /// let mut index = Index::new(&["title", "body"]); + /// index.add_doc("1", &["this is a title", "this is body text"]); + /// ``` + /// + /// # Panics + /// + /// Panics if a field with the name already exists. + pub fn new(fields: I) -> Self + where + I: IntoIterator, + I::Item: AsRef, + { + IndexBuilder::new().add_fields(fields).build() + } + + /// Create a new index with the provided fields for the given + /// [`Language`](lang/enum.Language.html). + /// + /// # Example + /// + /// ``` + /// use elasticlunr::{Index, lang::English}; + /// let mut index = Index::with_language(Box::new(English::new()), &["title", "body"]); + /// index.add_doc("1", &["this is a title", "this is body text"]); + /// ``` + /// + /// # Panics + /// + /// Panics if a field with the name already exists. + pub fn with_language(lang: Box, fields: I) -> Self + where + I: IntoIterator, + I::Item: AsRef, + { + IndexBuilder::with_language(lang).add_fields(fields).build() + } + + /// Add the data from a document to the index. 
+ /// + /// *NOTE: The elements of `data` should be provided in the same order as + /// the fields used to create the index.* + /// + /// # Example + /// ``` + /// # use elasticlunr::Index; + /// let mut index = Index::new(&["title", "body"]); + /// index.add_doc("1", &["this is a title", "this is body text"]); + /// ``` + pub fn add_doc(&mut self, doc_ref: &str, data: I) + where + I: IntoIterator, + I::Item: AsRef, + { + let mut doc = BTreeMap::new(); + doc.insert(self.ref_field.clone(), doc_ref.into()); + let mut token_freq = BTreeMap::new(); + + for (i, value) in data.into_iter().enumerate() { + let field = &self.fields[i]; + let tokenizer = self.field_tokenizers[i].as_ref(); + doc.insert(field.clone(), value.as_ref().to_string()); + + if field == &self.ref_field { + continue; + } + + let raw_tokens = if let Some(tokenizer) = tokenizer { + tokenizer(value.as_ref()) + } else { + self.lang.tokenize(value.as_ref()) + }; + + let tokens = self.pipeline.run(raw_tokens); + + self.document_store + .add_field_length(doc_ref, field, tokens.len()); + + for token in tokens { + *token_freq.entry(token).or_insert(0u64) += 1; + } + + for (token, count) in &token_freq { + let freq = (*count as f64).sqrt(); + + self.index + .get_mut(field) + .unwrap_or_else(|| panic!("InvertedIndex does not exist for field {}", field)) + .add_token(doc_ref, token, freq); + } + } + + self.document_store.add_doc(doc_ref, doc); + } + + pub fn get_fields(&self) -> &[String] { + &self.fields + } + + /// Returns the index, serialized to pretty-printed JSON. + pub fn to_json_pretty(&self) -> String { + serde_json::to_string_pretty(&self).unwrap() + } + + /// Returns the index, serialized to JSON. 
+    ///
+    /// # Panics
+    ///
+    /// Panics if serialization fails.
+    pub fn to_json(&self) -> String {
+        serde_json::to_string(self).unwrap()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn add_field_to_builder() {
+        let idx = IndexBuilder::new()
+            .add_fields(&["foo", "bar", "baz"])
+            .build();
+
+        let idx_fields = idx.get_fields();
+        for f in &["foo", "bar", "baz"] {
+            assert_eq!(idx_fields.iter().filter(|x| x == f).count(), 1);
+        }
+    }
+
+    #[test]
+    fn adding_document_to_index() {
+        let mut idx = Index::new(&["body"]);
+        idx.add_doc("1", &["this is a test"]);
+
+        assert_eq!(idx.document_store.len(), 1);
+        assert_eq!(
+            idx.document_store.get_doc("1").unwrap(),
+            btreemap! {
+                "id".into() => "1".into(),
+                "body".into() => "this is a test".into(),
+            }
+        );
+    }
+
+    #[test]
+    fn adding_document_with_empty_field() {
+        let mut idx = Index::new(&["title", "body"]);
+
+        idx.add_doc("1", &["", "test"]);
+        assert_eq!(idx.index["body"].get_doc_frequency("test"), 1);
+        assert_eq!(idx.index["body"].get_docs("test").unwrap()["1"], 1.);
+    }
+
+    #[test]
+    #[should_panic]
+    fn creating_index_with_identical_fields_panics() {
+        let _idx = Index::new(&["title", "body", "title"]);
+    }
+}
diff --git a/vendor/elasticlunr-rs/src/pipeline.rs b/vendor/elasticlunr-rs/src/pipeline.rs
new file mode 100644
index 000000000..a20de3f11
--- /dev/null
+++ b/vendor/elasticlunr-rs/src/pipeline.rs
@@ -0,0 +1,65 @@
+//! Defines the pipeline which processes text for inclusion in the index. Most users do not need
+//! to use this module directly.
+
+use serde::ser::{Serialize, SerializeSeq, Serializer};
+
+/// A single named step of a `Pipeline`.
+pub trait PipelineFn {
+    /// Returns the name of this step; this is the value written out when a
+    /// `Pipeline` is serialized.
+    fn name(&self) -> String;
+
+    /// Processes one token. Returns the (possibly modified) token, or `None` to
+    /// drop it from the pipeline's output.
+    fn filter(&self, token: String) -> Option<String>;
+}
+
+/// Adapts a plain function pointer to `PipelineFn` by pairing it with a name:
+/// field `0` is the name, field `1` is the function applied to each token.
+#[derive(Clone)]
+pub struct FnWrapper(pub String, pub fn(String) -> Option<String>);
+
+impl PipelineFn for FnWrapper {
+    fn name(&self) -> String {
+        self.0.clone()
+    }
+
+    fn filter(&self, token: String) -> Option<String> {
+        (self.1)(token)
+    }
+}
+
+/// A sequence of `PipelineFn`s which are run on tokens to prepare them for searching.
+#[derive(Deserialize)] +pub struct Pipeline { + #[serde(skip_deserializing)] + pub queue: Vec>, +} + +impl Serialize for Pipeline { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut seq = serializer.serialize_seq(Some(self.queue.len()))?; + for elem in &self.queue { + seq.serialize_element(&elem.name())?; + } + seq.end() + } +} + +impl Pipeline { + /// Run the Pipeline against the given vector of tokens. The returned vector may be shorter + /// than the input if a pipeline function returns `None` for a token. + pub fn run(&self, tokens: Vec) -> Vec { + let mut ret = vec![]; + for token in tokens { + let mut token = Some(token); + for func in &self.queue { + if let Some(t) = token { + token = func.filter(t); + } else { + break; + } + } + if let Some(t) = token { + ret.push(t); + } + } + ret + } +} -- cgit v1.2.3