summaryrefslogtreecommitdiffstats
path: root/vendor/elasticlunr-rs/src/lib.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
commit698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree173a775858bd501c378080a10dca74132f05bc50 /vendor/elasticlunr-rs/src/lib.rs
parentInitial commit. (diff)
downloadrustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/elasticlunr-rs/src/lib.rs')
-rw-r--r--vendor/elasticlunr-rs/src/lib.rs413
1 files changed, 413 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lib.rs b/vendor/elasticlunr-rs/src/lib.rs
new file mode 100644
index 000000000..3efcf4629
--- /dev/null
+++ b/vendor/elasticlunr-rs/src/lib.rs
@@ -0,0 +1,413 @@
+//!# elasticlunr-rs
+//!
+//! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs)
+//! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs)
+//! [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs)
+//!
+//! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to
+//! be used for generating compatible search indices.
+//!
+//! Access to all index-generating functionality is provided. Most users will only need to use the
+//! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types.
+//!
+//! The [`Language`] trait can be used to implement a custom language.
+//!
+//! ## Example
+//!
+//! ```
+//! use std::fs::File;
+//! use std::io::Write;
+//! use elasticlunr::Index;
+//!
+//! let mut index = Index::new(&["title", "body"]);
+//! index.add_doc("1", &["This is a title", "This is body text!"]);
+//! // Add more docs...
+//! let mut file = File::create("out.json").unwrap();
+//! file.write_all(index.to_json_pretty().as_bytes()).unwrap();
+//! ```
+
+#[macro_use]
+extern crate serde_derive;
+
+#[cfg(test)]
+#[macro_use]
+extern crate maplit;
+
+/// The version of elasticlunr.js this library was designed for.
+///
+/// `IndexBuilder::build` copies this value into the `version` field of every
+/// `Index` it creates.
+pub const ELASTICLUNR_VERSION: &str = "0.9.5";
+
+pub mod config;
+pub mod document_store;
+pub mod inverted_index;
+pub mod lang;
+pub mod pipeline;
+
+use std::collections::BTreeMap;
+
+use document_store::DocumentStore;
+use inverted_index::InvertedIndex;
+use lang::English;
+pub use lang::Language;
+pub use pipeline::Pipeline;
+
+/// An optional custom tokenizer for a single field: a boxed function that splits the
+/// field's text into tokens. When it is `None`, `Index::add_doc` falls back to the
+/// tokenizer of the index's language.
+type Tokenizer = Option<Box<dyn Fn(&str) -> Vec<String>>>;
+
+/// A builder for an `Index` with custom parameters.
+///
+/// # Example
+/// ```
+/// # use elasticlunr::{Index, IndexBuilder};
+/// let mut index = IndexBuilder::new()
+///     .save_docs(false)
+///     .add_fields(&["title", "subtitle", "body"])
+///     .set_ref("doc_id")
+///     .build();
+/// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]);
+/// ```
+pub struct IndexBuilder {
+ /// Passed to `DocumentStore::new` when the index is built; controls whether
+ /// documents are saved in the document store. Defaults to `true`.
+ save: bool,
+ /// Names of the document fields, in the order they were added.
+ fields: Vec<String>,
+ /// One entry per element of `fields` (both vectors are pushed to together):
+ /// the custom tokenizer for that field, or `None`.
+ field_tokenizers: Vec<Tokenizer>,
+ /// Key used to store the document reference. Defaults to `"id"`.
+ ref_field: String,
+ /// Explicit pipeline. When `None`, `build` asks `language` to make one.
+ pipeline: Option<Pipeline>,
+ /// Language of the index. Defaults to `English`.
+ language: Box<dyn Language>,
+}
+
+impl Default for IndexBuilder {
+    /// Returns a builder that saves documents, has no fields yet, stores the document
+    /// reference under `"id"`, has no explicit pipeline and uses the `English` language.
+    fn default() -> Self {
+        let language: Box<dyn Language> = Box::new(English::new());
+        Self {
+            language,
+            pipeline: None,
+            ref_field: String::from("id"),
+            field_tokenizers: Vec::new(),
+            fields: Vec::new(),
+            save: true,
+        }
+    }
+}
+
+impl IndexBuilder {
+    /// Creates a builder with the default settings (see the `Default` implementation).
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Creates a builder with the default settings, except that it uses `language`.
+    pub fn with_language(language: Box<dyn Language>) -> Self {
+        Self {
+            language,
+            ..Self::default()
+        }
+    }
+
+    /// Set whether or not documents should be saved in the `Index`'s document store.
+    pub fn save_docs(mut self, save: bool) -> Self {
+        self.save = save;
+        self
+    }
+
+    /// Registers `field` together with its (optional) tokenizer.
+    ///
+    /// Shared by `add_field` and `add_field_with_tokenizer` so the duplicate check and
+    /// the two parallel vectors (`fields` / `field_tokenizers`) are kept in sync in
+    /// exactly one place.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a field with the name already exists.
+    fn push_field(mut self, field: &str, tokenizer: Tokenizer) -> Self {
+        if self.fields.iter().any(|existing| existing.as_str() == field) {
+            panic!("Duplicate fields in index: {}", field);
+        }
+        self.fields.push(field.into());
+        self.field_tokenizers.push(tokenizer);
+        self
+    }
+
+    /// Add a document field to the `Index`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a field with the name already exists.
+    pub fn add_field(self, field: &str) -> Self {
+        self.push_field(field, None)
+    }
+
+    /// Add a document field to the `Index`, with a custom tokenizer for that field.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a field with the name already exists.
+    pub fn add_field_with_tokenizer(
+        self,
+        field: &str,
+        tokenizer: Box<dyn Fn(&str) -> Vec<String>>,
+    ) -> Self {
+        self.push_field(field, Some(tokenizer))
+    }
+
+    /// Add the document fields to the `Index`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if two fields have the same name.
+    pub fn add_fields<I>(self, fields: I) -> Self
+    where
+        I: IntoIterator,
+        I::Item: AsRef<str>,
+    {
+        fields
+            .into_iter()
+            .fold(self, |builder, field| builder.add_field(field.as_ref()))
+    }
+
+    /// Set the key used to store the document reference field.
+    pub fn set_ref(mut self, ref_field: &str) -> Self {
+        self.ref_field = ref_field.into();
+        self
+    }
+
+    /// Build an `Index` from this builder.
+    ///
+    /// One empty `InvertedIndex` is created per field. If no explicit pipeline was set,
+    /// the pipeline is obtained from the builder's language.
+    pub fn build(self) -> Index {
+        let IndexBuilder {
+            save,
+            fields,
+            field_tokenizers,
+            ref_field,
+            pipeline,
+            language,
+        } = self;
+
+        let index = fields
+            .iter()
+            .map(|f| (f.clone(), InvertedIndex::new()))
+            .collect();
+
+        let pipeline = pipeline.unwrap_or_else(|| language.make_pipeline());
+
+        Index {
+            index,
+            fields,
+            field_tokenizers,
+            ref_field,
+            document_store: DocumentStore::new(save),
+            pipeline,
+            version: crate::ELASTICLUNR_VERSION,
+            lang: language,
+        }
+    }
+}
+
+/// An elasticlunr search index.
+///
+/// Field names are serialized in camelCase (e.g. `document_store` becomes
+/// `documentStore`) unless a field carries its own `rename`.
+#[derive(Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Index {
+ /// Names of the indexed fields, in the order documents must supply their data.
+ fields: Vec<String>,
+ /// Per-field custom tokenizers, parallel to `fields`. Not (de)serialized.
+ #[serde(skip)]
+ field_tokenizers: Vec<Tokenizer>,
+ /// Pipeline that raw tokens are run through before being indexed.
+ pipeline: Pipeline,
+ /// Key under which a document's reference is stored; serialized as `ref`.
+ #[serde(rename = "ref")]
+ ref_field: String,
+ /// Set to `ELASTICLUNR_VERSION` by `IndexBuilder::build`.
+ version: &'static str,
+ /// One inverted index per field, keyed by field name.
+ index: BTreeMap<String, InvertedIndex>,
+ /// Store for the documents and their per-field token counts.
+ document_store: DocumentStore,
+ /// Language of the index; (de)serialized as the language's name via `ser_lang`.
+ #[serde(with = "ser_lang")]
+ lang: Box<dyn Language>,
+}
+
+/// Serde helpers for the `lang` field of `Index`: the language is written as its name
+/// and read back by looking that name up with `crate::lang::from_name`.
+mod ser_lang {
+    use crate::Language;
+    use serde::de;
+    use serde::{Deserializer, Serializer};
+    use std::fmt;
+
+    /// Serializes `lang` as a string containing its name.
+    // The `&Box<..>` parameter is dictated by `#[serde(with = "...")]`, which passes a
+    // reference to the field's declared type.
+    pub fn serialize<S>(lang: &Box<dyn Language>, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        serializer.serialize_str(&lang.name())
+    }
+
+    /// Deserializes a language from a string containing its name.
+    ///
+    /// Fails with a custom error if the name is not known to `crate::lang::from_name`.
+    pub fn deserialize<'de, D>(deserializer: D) -> Result<Box<dyn Language>, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        deserializer.deserialize_str(LanguageVisitor)
+    }
+
+    /// Visitor that turns a language name into the matching `Language` object.
+    struct LanguageVisitor;
+
+    impl<'de> de::Visitor<'de> for LanguageVisitor {
+        type Value = Box<dyn Language>;
+
+        fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+            formatter.write_str("a capitalized language name")
+        }
+
+        // Implement `visit_str` rather than only `visit_borrowed_str`: the result does
+        // not borrow from the input, and serde's default `visit_borrowed_str` /
+        // `visit_string` forward here. This way transient strings (e.g. input that
+        // contained escapes or came through a buffer) are accepted as well instead of
+        // being rejected as an invalid type.
+        fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+        where
+            E: de::Error,
+        {
+            crate::lang::from_name(v)
+                .ok_or_else(|| E::custom(format!("Unknown language name: {}", v)))
+        }
+    }
+}
+
+impl Index {
+    /// Create a new index with the provided fields.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use elasticlunr::{Index};
+    /// let mut index = Index::new(&["title", "body"]);
+    /// index.add_doc("1", &["this is a title", "this is body text"]);
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if a field with the name already exists.
+    pub fn new<I>(fields: I) -> Self
+    where
+        I: IntoIterator,
+        I::Item: AsRef<str>,
+    {
+        IndexBuilder::new().add_fields(fields).build()
+    }
+
+    /// Create a new index with the provided fields for the given [`Language`].
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use elasticlunr::{Index, lang::English};
+    /// let mut index = Index::with_language(Box::new(English::new()), &["title", "body"]);
+    /// index.add_doc("1", &["this is a title", "this is body text"]);
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if a field with the name already exists.
+    pub fn with_language<I>(lang: Box<dyn Language>, fields: I) -> Self
+    where
+        I: IntoIterator,
+        I::Item: AsRef<str>,
+    {
+        IndexBuilder::with_language(lang).add_fields(fields).build()
+    }
+
+    /// Add the data from a document to the index.
+    ///
+    /// *NOTE: The elements of `data` should be provided in the same order as
+    /// the fields used to create the index.*
+    ///
+    /// Each field is tokenized with its custom tokenizer if it has one, otherwise with
+    /// the tokenizer of the index's language, and the tokens are run through the
+    /// pipeline. Token frequencies are counted separately for every field, so a token
+    /// is only added to the inverted index of the field(s) it actually occurs in.
+    ///
+    /// A field whose name equals the reference key is stored in the document but is
+    /// not tokenized or indexed.
+    ///
+    /// # Example
+    /// ```
+    /// # use elasticlunr::Index;
+    /// let mut index = Index::new(&["title", "body"]);
+    /// index.add_doc("1", &["this is a title", "this is body text"]);
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if `data` yields more elements than the index has fields, or if no
+    /// inverted index exists for a field that produced tokens.
+    pub fn add_doc<I>(&mut self, doc_ref: &str, data: I)
+    where
+        I: IntoIterator,
+        I::Item: AsRef<str>,
+    {
+        let mut doc = BTreeMap::new();
+        doc.insert(self.ref_field.clone(), doc_ref.into());
+
+        for (i, value) in data.into_iter().enumerate() {
+            let text: &str = value.as_ref();
+            let field = &self.fields[i];
+            // `field_tokenizers` is skipped by serde and is therefore empty on an index
+            // obtained through deserialization; a missing entry means "no custom
+            // tokenizer" instead of an out-of-bounds panic.
+            let tokenizer = self.field_tokenizers.get(i).and_then(|t| t.as_ref());
+            doc.insert(field.clone(), text.to_string());
+
+            if field == &self.ref_field {
+                continue;
+            }
+
+            let raw_tokens = match tokenizer {
+                Some(tokenize) => tokenize(text),
+                None => self.lang.tokenize(text),
+            };
+
+            let tokens = self.pipeline.run(raw_tokens);
+
+            self.document_store
+                .add_field_length(doc_ref, field, tokens.len());
+
+            // The counts must start from scratch for every field (as elasticlunr.js
+            // does); sharing one map across fields would index the tokens of earlier
+            // fields under later fields as well and inflate their frequencies.
+            let mut token_freq = BTreeMap::new();
+            for token in tokens {
+                *token_freq.entry(token).or_insert(0u64) += 1;
+            }
+
+            for (token, count) in &token_freq {
+                // elasticlunr stores the square root of the raw count as term frequency.
+                let freq = (*count as f64).sqrt();
+
+                self.index
+                    .get_mut(field)
+                    .unwrap_or_else(|| panic!("InvertedIndex does not exist for field {}", field))
+                    .add_token(doc_ref, token, freq);
+            }
+        }
+
+        self.document_store.add_doc(doc_ref, doc);
+    }
+
+    /// Returns the names of the index's fields, in the order they were added.
+    pub fn get_fields(&self) -> &[String] {
+        &self.fields
+    }
+
+    /// Returns the index, serialized to pretty-printed JSON.
+    ///
+    /// # Panics
+    ///
+    /// Panics if serialization fails.
+    pub fn to_json_pretty(&self) -> String {
+        serde_json::to_string_pretty(&self).unwrap()
+    }
+
+    /// Returns the index, serialized to JSON.
+    ///
+    /// # Panics
+    ///
+    /// Panics if serialization fails.
+    pub fn to_json(&self) -> String {
+        serde_json::to_string(&self).unwrap()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Every field handed to the builder shows up exactly once in the built index.
+    #[test]
+    fn add_field_to_builder() {
+        let names = ["foo", "bar", "baz"];
+        let idx = IndexBuilder::new().add_fields(&names).build();
+
+        for expected in &names {
+            let occurrences = idx
+                .get_fields()
+                .iter()
+                .filter(|field| field.as_str() == *expected)
+                .count();
+            assert_eq!(occurrences, 1);
+        }
+    }
+
+    /// A document added to the index is stored with its reference and field values.
+    #[test]
+    fn adding_document_to_index() {
+        let mut idx = Index::new(&["body"]);
+        idx.add_doc("1", &["this is a test"]);
+
+        let stored = idx.document_store.get_doc("1").unwrap();
+
+        assert_eq!(idx.document_store.len(), 1);
+        assert_eq!(
+            stored,
+            btreemap! {
+                "id".into() => "1".into(),
+                "body".into() => "this is a test".into(),
+            }
+        );
+    }
+
+    /// An empty field does not prevent the other fields from being indexed.
+    #[test]
+    fn adding_document_with_empty_field() {
+        let mut idx = Index::new(&["title", "body"]);
+        idx.add_doc("1", &["", "test"]);
+
+        let body = &idx.index["body"];
+        assert_eq!(body.get_doc_frequency("test"), 1);
+        assert_eq!(body.get_docs("test").unwrap()["1"], 1.);
+    }
+
+    /// Passing the same field name twice is rejected with a panic.
+    #[test]
+    #[should_panic]
+    fn creating_index_with_identical_fields_panics() {
+        let _idx = Index::new(&["title", "body", "title"]);
+    }
+}