Adding upstream version 1.64.0+dfsg1. (tag: upstream/1.64.0+dfsg1)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
commit: 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree: 173a775858bd501c378080a10dca74132f05bc50 /vendor/elasticlunr-rs/src/lang/mod.rs
parent: Initial commit. (diff)
download: rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
1 file changed, 138 insertions, 0 deletions
diff --git a/vendor/elasticlunr-rs/src/lang/mod.rs b/vendor/elasticlunr-rs/src/lang/mod.rs
new file mode 100644
index 000000000..81966e1b2
--- /dev/null
+++ b/vendor/elasticlunr-rs/src/lang/mod.rs
@@ -0,0 +1,138 @@
+//! Intended to be compatible with <https://github.com/MihaiValentin/lunr-languages>. Each supported
+//! language has a trimmer, a stop word filter, and a stemmer. Most users will not need to use
+//! these modules directly.
+
+pub mod common;
+
+use crate::Pipeline;
+
+pub trait Language {
+    /// Returns the name of the language in English.
+    fn name(&self) -> String;
+
+    /// Returns the ISO 639-1 language code of the language.
+    fn code(&self) -> String;
+
+    /// Separates the input text into individual tokens. In most languages a token is a word, separated by whitespace.
+    fn tokenize(&self, text: &str) -> Vec<String>;
+
+    /// Returns the [`Pipeline`] used to process the tokens.
+    fn make_pipeline(&self) -> Pipeline;
+}
+
+/// Splits `text` on whitespace and hyphens into lowercased tokens, dropping empty pieces.
+pub fn tokenize_whitespace(text: &str) -> Vec<String> {
+    text.split(|c: char| c.is_whitespace() || c == '-')
+        .filter(|s| !s.is_empty()) // adjacent separators produce empty pieces; skip them
+        .map(|s| s.trim().to_lowercase())
+        .collect()
+}
+
+macro_rules! impl_language {
+    ($( ( $name:ident, $code:ident $(, #[$cfgs:meta] )? ), )+) => { // entries: (TypeName, module_name [, #[cfg(...)]]),
+        /// Returns every [`Language`] implementation compiled into the crate (per its `cfg` gate)
+        pub fn languages() -> Vec<Box<dyn Language>> {
+            vec![
+                $(
+                    $(#[$cfgs])?
+                    Box::new($code::$name::new()),
+                )+
+            ]
+        }
+
+        /// Returns the [`Language`] for the given two-character [ISO 639-1][iso] language code
+        /// (matched ASCII case-insensitively) if supported. Returns `None` if not supported.
+        ///
+        /// *Note:*
+        ///
+        /// The ISO 639-1 code for Dutch is "nl". However, "du" is used for the module name and
+        /// pipeline suffix to match lunr-languages, so this function expects "du" for Dutch.
+        ///
+        /// [iso]: https://en.wikipedia.org/wiki/ISO_639-1
+        pub fn from_code(code: &str) -> Option<Box<dyn Language>> {
+            match code.to_ascii_lowercase().as_str() {
+                $(
+                    $(#[$cfgs])?
+                    stringify!($code) => Some(Box::new($code::$name::new())),
+                )+
+                _ => None,
+            }
+        }
+
+        /// Returns the [`Language`] for the given English language name if the
+        /// language is supported. Returns `None` if not supported. The match is
+        /// case-sensitive, so the first letter must be capitalized (e.g. "English").
+        pub fn from_name(name: &str) -> Option<Box<dyn Language>> {
+            match name {
+                $(
+                    $(#[$cfgs])?
+                    stringify!($name) => Some(Box::new($code::$name::new())),
+                )+
+                _ => None,
+            }
+        }
+
+        $(
+            $(#[$cfgs])?
+            mod $code; // each language lives in a private module named after its code

+            $(#[$cfgs])?
+            pub use $code::$name;
+        )+
+    };
+}
+
+impl_language! {
+    (English, en), // no feature gate: always compiled in
+    (Arabic, ar, #[cfg(feature = "ar")]),
+    (Chinese, zh, #[cfg(feature = "zh")]),
+    (Danish, da, #[cfg(feature = "da")]),
+    (Dutch, du, #[cfg(feature = "du")]), // "du" (not ISO "nl") to match lunr-languages
+    (Finnish, fi, #[cfg(feature = "fi")]),
+    (French, fr, #[cfg(feature = "fr")]),
+    (German, de, #[cfg(feature = "de")]),
+    (Italian, it, #[cfg(feature = "it")]),
+    (Japanese, ja, #[cfg(feature = "ja")]),
+    (Norwegian, no, #[cfg(feature = "no")]),
+    (Portuguese, pt, #[cfg(feature = "pt")]),
+    (Romanian, ro, #[cfg(feature = "ro")]),
+    (Russian, ru, #[cfg(feature = "ru")]),
+    (Spanish, es, #[cfg(feature = "es")]),
+    (Swedish, sv, #[cfg(feature = "sv")]),
+    (Turkish, tr, #[cfg(feature = "tr")]),
+}
+
+#[cfg(test)]
+mod tests {
+    use super::tokenize_whitespace;
+
+    #[test]
+    fn split_simple_strings() {
+        let string = "this is a simple string"; // single spaces between words
+        assert_eq!(
+            &tokenize_whitespace(string),
+            &["this", "is", "a", "simple", "string"]
+        );
+    }
+
+    #[test]
+    fn multiple_white_space() {
+        let string = "  foo    bar  "; // leading, trailing and repeated spaces
+        assert_eq!(&tokenize_whitespace(string), &["foo", "bar"]);
+    }
+
+    #[test]
+    fn hyphens() {
+        let string = "take the New York-San Francisco flight"; // hyphen joining two words; mixed case
+        assert_eq!(
+            &tokenize_whitespace(string),
+            &["take", "the", "new", "york", "san", "francisco", "flight"]
+        );
+    }
+
+    #[test]
+    fn splitting_strings_with_hyphens() {
+        let string = "Solve for A - B"; // hyphen surrounded by spaces must not yield an empty token
+        assert_eq!(&tokenize_whitespace(string), &["solve", "for", "a", "b"]);
+    }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:02:58 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:02:58 +0000
commit	698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree	173a775858bd501c378080a10dca74132f05bc50 /vendor/elasticlunr-rs/src/lang/mod.rs
parent	Initial commit. (diff)
download	rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip