author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:44:51 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:44:51 +0000
commit     9e3c08db40b8916968b9f30096c7be3f00ce9647 (patch)
tree       a68f146d7fa01f0134297619fbe7e33db084e0aa  /third_party/rust/strsim
parent     Initial commit. (diff)
download   thunderbird-9e3c08db40b8916968b9f30096c7be3f00ce9647.tar.xz
           thunderbird-9e3c08db40b8916968b9f30096c7be3f00ce9647.zip
Adding upstream version 1:115.7.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/strsim')
-rw-r--r--  third_party/rust/strsim/.cargo-checksum.json |    1
-rw-r--r--  third_party/rust/strsim/CHANGELOG.md          |  203
-rw-r--r--  third_party/rust/strsim/Cargo.toml            |   24
-rw-r--r--  third_party/rust/strsim/LICENSE               |   23
-rw-r--r--  third_party/rust/strsim/README.md             |  102
-rw-r--r--  third_party/rust/strsim/benches/benches.rs    |  100
-rw-r--r--  third_party/rust/strsim/src/lib.rs            | 1005
-rw-r--r--  third_party/rust/strsim/tests/lib.rs          |   49
8 files changed, 1507 insertions(+), 0 deletions(-)
diff --git a/third_party/rust/strsim/.cargo-checksum.json b/third_party/rust/strsim/.cargo-checksum.json
new file mode 100644
index 0000000000..245ef42f91
--- /dev/null
+++ b/third_party/rust/strsim/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"CHANGELOG.md":"96553d0de79bf911b5ca66c999195f7f4ea6061564e4698d1adcb567060e1bcd","Cargo.toml":"3f0f1737ecbf9c7595b52585d54507f217e66b2b8dfa337934ca427022d810c8","LICENSE":"1e697ce8d21401fbf1bddd9b5c3fd4c4c79ae1e3bdf51f81761c85e11d5a89cd","README.md":"b9fc7a1ac69abed8055b824713bf9ebfb4a07e2b7a356b50d8ed55e7a9accd18","benches/benches.rs":"62c83a5a0948c06ffb54d0bf75a31ee5d9e5acde9e079c3b5cfb755bc634b72c","src/lib.rs":"1300ad81d4b682476e30d361a01a248a93e96426303ffde8bbd585258fa0b02f","tests/lib.rs":"de2b1181c379a0f55de7b86021a9afb77dbe81053a6acf99623bec3663f9b7c4"},"package":"73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"}
\ No newline at end of file diff --git a/third_party/rust/strsim/CHANGELOG.md b/third_party/rust/strsim/CHANGELOG.md new file mode 100644 index 0000000000..5b029ffb9b --- /dev/null +++ b/third_party/rust/strsim/CHANGELOG.md @@ -0,0 +1,203 @@ +# Change Log + +This project attempts to adhere to [Semantic Versioning](http://semver.org). + +## [Unreleased] + +## [0.10.0] - (2020-01-31) + +### Added + +- Sørensen-Dice implementation (thanks [@robjtede](https://github.com/robjtede)) + +## [0.9.3] - (2019-12-12) + +### Fixed + +- Fix Jaro and Jaro-Winkler when the arguments have lengths of 1 and are equal. + Previously, the functions would erroneously return 0 instead of 1. Thanks to + [@vvrably](https://github.com/vvrably) for pointing out the issue. + +## [0.9.2] - (2019-05-09) + +### Changed + +- Revert back to the standard library hashmap because it will use hashbrown very + soon +- Remove ndarray in favor of using a single vector to represent the 2d grid in + Damerau-Levenshtein + +## [0.9.1] - (2019-04-08) + +### Changed + +- Faster Damerau-Levenshtein implementation (thanks [@lovasoa](https://github.com/lovasoa)) + +## [0.9.0] - (2019-04-06) + +### Added + +- Generic distance functions (thanks [@lovasoa](https://github.com/lovasoa)) + +## [0.8.0] - (2018-08-19) + +### Added + +- Normalized versions of Levenshtein and Damerau-Levenshtein (thanks [@gentoid](https://github.com/gentoid)) + +## [0.7.0] - (2018-01-17) + +### Changed + +- Faster Levenshtein implementation (thanks [@wdv4758h](https://github.com/wdv4758h)) + +### Removed + +- Remove the "against_vec" functions. They are one-liners now, so they don't + seem to add enough value to justify making the API larger. I didn't find + anybody using them when I skimmed through a GitHub search. If you do use them, + you can change the calls to something like: +```rust +let distances = strings.iter().map(|a| jaro(target, a)).collect(); +``` + +## [0.6.0] - (2016-12-26) + +### Added + +- Add optimal string alignment distance + +### Fixed + +- Fix Damerau-Levenshtein implementation (previous implementation was actually + optimal string alignment; see this [Damerau-Levenshtein explanation]) + +## [0.5.2] - (2016-11-21) + +### Changed + +- Remove Cargo generated documentation in favor of a [docs.rs] link + +## [0.5.1] - (2016-08-23) + +### Added + +- Add Cargo generated documentation + +### Fixed + +- Fix panic when Jaro or Jaro-Winkler are given strings both with a length of + one + +## [0.5.0] - (2016-08-11) + +### Changed + +- Make Hamming faster (thanks @IBUzPE9) when the two strings have the same + length but slower when they have different lengths + +## [0.4.1] - (2016-04-18) + +### Added + +- Add Vagrant setup for development +- Add AppVeyor configuration for Windows CI + +### Fixed + +- Fix metrics when given strings with multibyte characters (thanks @WanzenBug) + +## [0.4.0] - (2015-06-10) + +### Added + +- For each metric, add a function that takes a vector of strings and returns a +vector of results (thanks @ovarene) + +## [0.3.0] - (2015-04-30) + +### Changed + +- Remove usage of unstable Rust features + +## [0.2.5] - (2015-04-24) + +### Fixed + +- Remove unnecessary `Float` import from doc tests + +## [0.2.4] - (2015-04-15) + +### Fixed + +- Remove unused `core` feature flag + +## [0.2.3] - (2015-04-01) + +### Fixed + +- Remove now unnecessary `Float` import + +## [0.2.2] - (2015-03-29) + +### Fixed + +- Remove usage of `char_at` (marked as unstable) + +## [0.2.1] - (2015-02-20) + +### Fixed + +- Update bit vector import to match Rust 
update + +## [0.2.0] - (2015-02-19) + +### Added + +- Implement Damerau-Levenshtein +- Add tests in docs + +## [0.1.1] - (2015-02-10) + +### Added + +- Configure Travis for CI +- Add rustdoc comments + +### Fixed + +- Limit Jaro-Winkler return value to a maximum of 1.0 +- Fix float comparisons in tests + +## [0.1.0] - (2015-02-09) + +### Added + +- Implement Hamming, Jaro, Jaro-Winkler, and Levenshtein + +[Unreleased]: https://github.com/dguo/strsim-rs/compare/0.10.0...HEAD +[0.10.0]: https://github.com/dguo/strsim-rs/compare/0.9.3...0.10.0 +[0.9.3]: https://github.com/dguo/strsim-rs/compare/0.9.2...0.9.3 +[0.9.2]: https://github.com/dguo/strsim-rs/compare/0.9.1...0.9.2 +[0.9.1]: https://github.com/dguo/strsim-rs/compare/0.9.0...0.9.1 +[0.9.0]: https://github.com/dguo/strsim-rs/compare/0.8.0...0.9.0 +[0.8.0]: https://github.com/dguo/strsim-rs/compare/0.7.0...0.8.0 +[0.7.0]: https://github.com/dguo/strsim-rs/compare/0.6.0...0.7.0 +[0.6.0]: https://github.com/dguo/strsim-rs/compare/0.5.2...0.6.0 +[0.5.2]: https://github.com/dguo/strsim-rs/compare/0.5.1...0.5.2 +[0.5.1]: https://github.com/dguo/strsim-rs/compare/0.5.0...0.5.1 +[0.5.0]: https://github.com/dguo/strsim-rs/compare/0.4.1...0.5.0 +[0.4.1]: https://github.com/dguo/strsim-rs/compare/0.4.0...0.4.1 +[0.4.0]: https://github.com/dguo/strsim-rs/compare/0.3.0...0.4.0 +[0.3.0]: https://github.com/dguo/strsim-rs/compare/0.2.5...0.3.0 +[0.2.5]: https://github.com/dguo/strsim-rs/compare/0.2.4...0.2.5 +[0.2.4]: https://github.com/dguo/strsim-rs/compare/0.2.3...0.2.4 +[0.2.3]: https://github.com/dguo/strsim-rs/compare/0.2.2...0.2.3 +[0.2.2]: https://github.com/dguo/strsim-rs/compare/0.2.1...0.2.2 +[0.2.1]: https://github.com/dguo/strsim-rs/compare/0.2.0...0.2.1 +[0.2.0]: https://github.com/dguo/strsim-rs/compare/0.1.1...0.2.0 +[0.1.1]: https://github.com/dguo/strsim-rs/compare/0.1.0...0.1.1 +[0.1.0]: https://github.com/dguo/strsim-rs/compare/fabad4...0.1.0 +[docs.rs]: https://docs.rs/strsim/ +[Damerau-Levenshtein explanation]: +http://scarcitycomputing.blogspot.com/2013/04/damerau-levenshtein-edit-distance.html diff --git a/third_party/rust/strsim/Cargo.toml b/third_party/rust/strsim/Cargo.toml new file mode 100644 index 0000000000..aca741a86a --- /dev/null +++ b/third_party/rust/strsim/Cargo.toml @@ -0,0 +1,24 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g. crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "strsim" +version = "0.10.0" +authors = ["Danny Guo <danny@dannyguo.com>"] +exclude = ["/.github", "/dev"] +description = "Implementations of string similarity metrics. 
Includes Hamming, Levenshtein,\nOSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice.\n" +homepage = "https://github.com/dguo/strsim-rs" +documentation = "https://docs.rs/strsim/" +readme = "README.md" +keywords = ["string", "similarity", "Hamming", "Levenshtein", "Jaro"] +license = "MIT" +repository = "https://github.com/dguo/strsim-rs" diff --git a/third_party/rust/strsim/LICENSE b/third_party/rust/strsim/LICENSE new file mode 100644 index 0000000000..8d1fbe1dce --- /dev/null +++ b/third_party/rust/strsim/LICENSE @@ -0,0 +1,23 @@ +The MIT License (MIT) + +Copyright (c) 2015 Danny Guo +Copyright (c) 2016 Titus Wormer <tituswormer@gmail.com> +Copyright (c) 2018 Akash Kurdekar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/third_party/rust/strsim/README.md b/third_party/rust/strsim/README.md new file mode 100644 index 0000000000..d8c9780d48 --- /dev/null +++ b/third_party/rust/strsim/README.md @@ -0,0 +1,102 @@ +# strsim-rs + +[![Crates.io](https://img.shields.io/crates/v/strsim.svg)](https://crates.io/crates/strsim) +[![Crates.io](https://img.shields.io/crates/l/strsim.svg?maxAge=2592000)](https://github.com/dguo/strsim-rs/blob/master/LICENSE) +[![CI status](https://github.com/dguo/strsim-rs/workflows/CI/badge.svg)](https://github.com/dguo/strsim-rs/actions?query=branch%3Amaster) +[![unsafe forbidden](https://img.shields.io/badge/unsafe-forbidden-success.svg)](https://github.com/rust-secure-code/safety-dance/) + +[Rust](https://www.rust-lang.org) implementations of [string similarity metrics]: + - [Hamming] + - [Levenshtein] - distance & normalized + - [Optimal string alignment] + - [Damerau-Levenshtein] - distance & normalized + - [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length + - [Sørensen-Dice] + +The normalized versions return values between `0.0` and `1.0`, where `1.0` means +an exact match. + +There are also generic versions of the functions for non-string inputs. + +## Installation + +`strsim` is available on [crates.io](https://crates.io/crates/strsim). Add it to +your `Cargo.toml`: +```toml +[dependencies] +strsim = "0.10.0" +``` + +## Usage + +Go to [Docs.rs](https://docs.rs/strsim/) for the full documentation. You can +also clone the repo, and run `$ cargo doc --open`. 
+ +### Examples + +```rust +extern crate strsim; + +use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, + damerau_levenshtein, normalized_damerau_levenshtein, jaro, + jaro_winkler, sorensen_dice}; + +fn main() { + match hamming("hamming", "hammers") { + Ok(distance) => assert_eq!(3, distance), + Err(why) => panic!("{:?}", why) + } + + assert_eq!(levenshtein("kitten", "sitting"), 3); + + assert!((normalized_levenshtein("kitten", "sitting") - 0.571).abs() < 0.001); + + assert_eq!(osa_distance("ac", "cba"), 3); + + assert_eq!(damerau_levenshtein("ac", "cba"), 2); + + assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.272).abs() < + 0.001); + + assert!((jaro("Friedrich Nietzsche", "Jean-Paul Sartre") - 0.392).abs() < + 0.001); + + assert!((jaro_winkler("cheeseburger", "cheese fries") - 0.911).abs() < + 0.001); + + assert_eq!(sorensen_dice("web applications", "applications of the web"), + 0.7878787878787878); +} +``` + +Using the generic versions of the functions: + +```rust +extern crate strsim; + +use strsim::generic_levenshtein; + +fn main() { + assert_eq!(2, generic_levenshtein(&[1, 2, 3], &[0, 2, 5])); +} +``` + +## Contributing + +If you don't want to install Rust itself, you can run `$ ./dev` for a +development CLI if you have [Docker] installed. + +Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`. + +## License + +[MIT](https://github.com/dguo/strsim-rs/blob/master/LICENSE) + +[string similarity metrics]:http://en.wikipedia.org/wiki/String_metric +[Damerau-Levenshtein]:http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance +[Jaro and Jaro-Winkler]:http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance +[Levenshtein]:http://en.wikipedia.org/wiki/Levenshtein_distance +[Hamming]:http://en.wikipedia.org/wiki/Hamming_distance +[Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance +[Sørensen-Dice]:http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient +[Docker]:https://docs.docker.com/engine/installation/ diff --git a/third_party/rust/strsim/benches/benches.rs b/third_party/rust/strsim/benches/benches.rs new file mode 100644 index 0000000000..b7ba829f74 --- /dev/null +++ b/third_party/rust/strsim/benches/benches.rs @@ -0,0 +1,100 @@ +//! Benchmarks for strsim. 
+ +#![feature(test)] + +extern crate strsim; + +mod benches { + use self::test::Bencher; + use super::*; + + extern crate test; + + #[bench] + fn bench_hamming(bencher: &mut Bencher) { + let a = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGG"; + let b = "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGC"; + bencher.iter(|| { + strsim::hamming(a, b).unwrap(); + }) + } + + #[bench] + fn bench_jaro(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::jaro(&a, &b); + }) + } + + #[bench] + fn bench_jaro_winkler(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::jaro_winkler(&a, &b); + }) + } + + #[bench] + fn bench_levenshtein(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::levenshtein(&a, &b); + }) + } + + #[bench] + fn bench_levenshtein_on_u8(bencher: &mut Bencher) { + bencher.iter(|| { + strsim::generic_levenshtein(&vec![0u8; 30], &vec![7u8; 31]); + }) + } + + #[bench] + fn bench_normalized_levenshtein(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::normalized_levenshtein(&a, &b); + }) + } + + #[bench] + fn bench_osa_distance(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::osa_distance(&a, &b); + }) + } + + #[bench] + fn bench_damerau_levenshtein(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::damerau_levenshtein(&a, &b); + }) + } + + #[bench] + fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::normalized_damerau_levenshtein(&a, &b); + }) + } + + #[bench] + fn bench_sorensen_dice(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::sorensen_dice(&a, &b); + }) + } +} diff --git a/third_party/rust/strsim/src/lib.rs b/third_party/rust/strsim/src/lib.rs new file mode 100644 index 0000000000..480a64243b --- /dev/null +++ b/third_party/rust/strsim/src/lib.rs @@ -0,0 +1,1005 @@ +//! This library implements string similarity metrics. + +#![forbid(unsafe_code)] + +use std::char; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::error::Error; +use std::fmt::{self, Display, Formatter}; +use std::hash::Hash; +use std::str::Chars; + +#[derive(Debug, PartialEq)] +pub enum StrSimError { + DifferentLengthArgs, +} + +impl Display for StrSimError { + fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> { + let text = match self { + StrSimError::DifferentLengthArgs => "Differing length arguments provided", + }; + + write!(fmt, "{}", text) + } +} + +impl Error for StrSimError {} + +pub type HammingResult = Result<usize, StrSimError>; + +/// Calculates the number of positions in the two sequences where the elements +/// differ. Returns an error if the sequences have different lengths. 
+pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult + where Iter1: IntoIterator<Item=Elem1>, + Iter2: IntoIterator<Item=Elem2>, + Elem1: PartialEq<Elem2> { + let (mut ita, mut itb) = (a.into_iter(), b.into_iter()); + let mut count = 0; + loop { + match (ita.next(), itb.next()){ + (Some(x), Some(y)) => if x != y { count += 1 }, + (None, None) => return Ok(count), + _ => return Err(StrSimError::DifferentLengthArgs), + } + } +} + +/// Calculates the number of positions in the two strings where the characters +/// differ. Returns an error if the strings have different lengths. +/// +/// ``` +/// use strsim::{hamming, StrSimError::DifferentLengthArgs}; +/// +/// assert_eq!(Ok(3), hamming("hamming", "hammers")); +/// +/// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham")); +/// ``` +pub fn hamming(a: &str, b: &str) -> HammingResult { + generic_hamming(a.chars(), b.chars()) +} + +/// Calculates the Jaro similarity between two sequences. The returned value +/// is between 0.0 and 1.0 (higher value means more similar). +pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 + where &'a Iter1: IntoIterator<Item=Elem1>, + &'b Iter2: IntoIterator<Item=Elem2>, + Elem1: PartialEq<Elem2> { + let a_len = a.into_iter().count(); + let b_len = b.into_iter().count(); + + // The check for lengths of one here is to prevent integer overflow when + // calculating the search range. + if a_len == 0 && b_len == 0 { + return 1.0; + } else if a_len == 0 || b_len == 0 { + return 0.0; + } else if a_len == 1 && b_len == 1 { + return if a.into_iter().eq(b.into_iter()) { 1.0} else { 0.0 }; + } + + let search_range = (max(a_len, b_len) / 2) - 1; + + let mut b_consumed = Vec::with_capacity(b_len); + for _ in 0..b_len { + b_consumed.push(false); + } + let mut matches = 0.0; + + let mut transpositions = 0.0; + let mut b_match_index = 0; + + for (i, a_elem) in a.into_iter().enumerate() { + let min_bound = + // prevent integer wrapping + if i > search_range { + max(0, i - search_range) + } else { + 0 + }; + + let max_bound = min(b_len - 1, i + search_range); + + if min_bound > max_bound { + continue; + } + + for (j, b_elem) in b.into_iter().enumerate() { + if min_bound <= j && j <= max_bound && a_elem == b_elem && + !b_consumed[j] { + b_consumed[j] = true; + matches += 1.0; + + if j < b_match_index { + transpositions += 1.0; + } + b_match_index = j; + + break; + } + } + } + + if matches == 0.0 { + 0.0 + } else { + (1.0 / 3.0) * ((matches / a_len as f64) + + (matches / b_len as f64) + + ((matches - transpositions) / matches)) + } +} + +struct StringWrapper<'a>(&'a str); + +impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { + type Item = char; + type IntoIter = Chars<'b>; + + fn into_iter(self) -> Self::IntoIter { + self.0.chars() + } +} + +/// Calculates the Jaro similarity between two strings. The returned value +/// is between 0.0 and 1.0 (higher value means more similar). +/// +/// ``` +/// use strsim::jaro; +/// +/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() < +/// 0.001); +/// ``` +pub fn jaro(a: &str, b: &str) -> f64 { + generic_jaro(&StringWrapper(a), &StringWrapper(b)) +} + +/// Like Jaro but gives a boost to sequences that have a common prefix. 
+pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 + where &'a Iter1: IntoIterator<Item=Elem1>, + &'b Iter2: IntoIterator<Item=Elem2>, + Elem1: PartialEq<Elem2> { + let jaro_distance = generic_jaro(a, b); + + // Don't limit the length of the common prefix + let prefix_length = a.into_iter() + .zip(b.into_iter()) + .take_while(|&(ref a_elem, ref b_elem)| a_elem == b_elem) + .count(); + + let jaro_winkler_distance = + jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance)); + + if jaro_winkler_distance <= 1.0 { + jaro_winkler_distance + } else { + 1.0 + } +} + +/// Like Jaro but gives a boost to strings that have a common prefix. +/// +/// ``` +/// use strsim::jaro_winkler; +/// +/// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() < +/// 0.001); +/// ``` +pub fn jaro_winkler(a: &str, b: &str) -> f64 { + generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b)) +} + +/// Calculates the minimum number of insertions, deletions, and substitutions +/// required to change one sequence into the other. +/// +/// ``` +/// use strsim::generic_levenshtein; +/// +/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6])); +/// ``` +pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize + where &'a Iter1: IntoIterator<Item=Elem1>, + &'b Iter2: IntoIterator<Item=Elem2>, + Elem1: PartialEq<Elem2> { + let b_len = b.into_iter().count(); + + if a.into_iter().next().is_none() { return b_len; } + + let mut cache: Vec<usize> = (1..b_len+1).collect(); + + let mut result = 0; + + for (i, a_elem) in a.into_iter().enumerate() { + result = i + 1; + let mut distance_b = i; + + for (j, b_elem) in b.into_iter().enumerate() { + let cost = if a_elem == b_elem { 0usize } else { 1usize }; + let distance_a = distance_b + cost; + distance_b = cache[j]; + result = min(result + 1, min(distance_a, distance_b + 1)); + cache[j] = result; + } + } + + result +} + +/// Calculates the minimum number of insertions, deletions, and substitutions +/// required to change one string into the other. +/// +/// ``` +/// use strsim::levenshtein; +/// +/// assert_eq!(3, levenshtein("kitten", "sitting")); +/// ``` +pub fn levenshtein(a: &str, b: &str) -> usize { + generic_levenshtein(&StringWrapper(a), &StringWrapper(b)) +} + +/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and +/// 1.0 (inclusive), where 1.0 means the strings are the same. +/// +/// ``` +/// use strsim::normalized_levenshtein; +/// +/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001); +/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001); +/// assert!(normalized_levenshtein("", "second").abs() < 0.00001); +/// assert!(normalized_levenshtein("first", "").abs() < 0.00001); +/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); +/// ``` +pub fn normalized_levenshtein(a: &str, b: &str) -> f64 { + if a.is_empty() && b.is_empty() { + return 1.0; + } + 1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) +} + +/// Like Levenshtein but allows for adjacent transpositions. Each substring can +/// only be edited once. 
+/// +/// ``` +/// use strsim::osa_distance; +/// +/// assert_eq!(3, osa_distance("ab", "bca")); +/// ``` +pub fn osa_distance(a: &str, b: &str) -> usize { + let a_len = a.chars().count(); + let b_len = b.chars().count(); + if a == b { return 0; } + else if a_len == 0 { return b_len; } + else if b_len == 0 { return a_len; } + + let mut prev_two_distances: Vec<usize> = Vec::with_capacity(b_len + 1); + let mut prev_distances: Vec<usize> = Vec::with_capacity(b_len + 1); + let mut curr_distances: Vec<usize> = Vec::with_capacity(b_len + 1); + + let mut prev_a_char = char::MAX; + let mut prev_b_char = char::MAX; + + for i in 0..(b_len + 1) { + prev_two_distances.push(i); + prev_distances.push(i); + curr_distances.push(0); + } + + for (i, a_char) in a.chars().enumerate() { + curr_distances[0] = i + 1; + + for (j, b_char) in b.chars().enumerate() { + let cost = if a_char == b_char { 0 } else { 1 }; + curr_distances[j + 1] = min(curr_distances[j] + 1, + min(prev_distances[j + 1] + 1, + prev_distances[j] + cost)); + if i > 0 && j > 0 && a_char != b_char && + a_char == prev_b_char && b_char == prev_a_char { + curr_distances[j + 1] = min(curr_distances[j + 1], + prev_two_distances[j - 1] + 1); + } + + prev_b_char = b_char; + } + + prev_two_distances.clone_from(&prev_distances); + prev_distances.clone_from(&curr_distances); + prev_a_char = a_char; + } + + curr_distances[b_len] + +} + +/* Returns the final index for a value in a single vector that represents a fixed + 2d grid */ +fn flat_index(i: usize, j: usize, width: usize) -> usize { + j * width + i +} + +/// Like optimal string alignment, but substrings can be edited an unlimited +/// number of times, and the triangle inequality holds. +/// +/// ``` +/// use strsim::generic_damerau_levenshtein; +/// +/// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1])); +/// ``` +pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize + where Elem: Eq + Hash + Clone { + let a_len = a_elems.len(); + let b_len = b_elems.len(); + + if a_len == 0 { return b_len; } + if b_len == 0 { return a_len; } + + let width = a_len + 2; + let mut distances = vec![0; (a_len + 2) * (b_len + 2)]; + let max_distance = a_len + b_len; + distances[0] = max_distance; + + for i in 0..(a_len + 1) { + distances[flat_index(i + 1, 0, width)] = max_distance; + distances[flat_index(i + 1, 1, width)] = i; + } + + for j in 0..(b_len + 1) { + distances[flat_index(0, j + 1, width)] = max_distance; + distances[flat_index(1, j + 1, width)] = j; + } + + let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64); + + for i in 1..(a_len + 1) { + let mut db = 0; + + for j in 1..(b_len + 1) { + let k = match elems.get(&b_elems[j - 1]) { + Some(&value) => value, + None => 0 + }; + + let insertion_cost = distances[flat_index(i, j + 1, width)] + 1; + let deletion_cost = distances[flat_index(i + 1, j, width)] + 1; + let transposition_cost = distances[flat_index(k, db, width)] + + (i - k - 1) + 1 + (j - db - 1); + + let mut substitution_cost = distances[flat_index(i, j, width)] + 1; + if a_elems[i - 1] == b_elems[j - 1] { + db = j; + substitution_cost -= 1; + } + + distances[flat_index(i + 1, j + 1, width)] = min(substitution_cost, + min(insertion_cost, min(deletion_cost, transposition_cost))); + } + + elems.insert(a_elems[i - 1].clone(), i); + } + + distances[flat_index(a_len + 1, b_len + 1, width)] +} + +/// Like optimal string alignment, but substrings can be edited an unlimited +/// number of times, and the triangle inequality holds. 
+/// +/// ``` +/// use strsim::damerau_levenshtein; +/// +/// assert_eq!(2, damerau_levenshtein("ab", "bca")); +/// ``` +pub fn damerau_levenshtein(a: &str, b: &str) -> usize { + let (x, y): (Vec<_>, Vec<_>) = (a.chars().collect(), b.chars().collect()); + generic_damerau_levenshtein(x.as_slice(), y.as_slice()) +} + +/// Calculates a normalized score of the Damerau–Levenshtein algorithm between +/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same. +/// +/// ``` +/// use strsim::normalized_damerau_levenshtein; +/// +/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001); +/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001); +/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001); +/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001); +/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001); +/// ``` +pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 { + if a.is_empty() && b.is_empty() { + return 1.0; + } + 1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) +} + +/// Returns an Iterator of char tuples. +fn bigrams(s: &str) -> impl Iterator<Item=(char, char)> + '_ { + s.chars().zip(s.chars().skip(1)) +} + + +/// Calculates a Sørensen-Dice similarity distance using bigrams. +/// See http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. +/// +/// ``` +/// use strsim::sorensen_dice; +/// +/// assert_eq!(1.0, sorensen_dice("", "")); +/// assert_eq!(0.0, sorensen_dice("", "a")); +/// assert_eq!(0.0, sorensen_dice("french", "quebec")); +/// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); +/// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); +/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris")); +/// ``` +pub fn sorensen_dice(a: &str, b: &str) -> f64 { + // implementation guided by + // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6 + + let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect(); + let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect(); + + if a.len() == 0 && b.len() == 0 { + return 1.0; + } + + if a.len() == 0 || b.len() == 0 { + return 0.0; + } + + if a == b { + return 1.0; + } + + if a.len() == 1 && b.len() == 1 { + return 0.0; + } + + if a.len() < 2 || b.len() < 2 { + return 0.0; + } + + let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new(); + + for bigram in bigrams(&a) { + *a_bigrams.entry(bigram).or_insert(0) += 1; + } + + let mut intersection_size = 0; + + for bigram in bigrams(&b) { + a_bigrams.entry(bigram).and_modify(|bi| { + if *bi > 0 { + *bi -= 1; + intersection_size += 1; + } + }); + } + + (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64 +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bigrams_iterator() { + let mut bi = bigrams("abcde"); + + assert_eq!(Some(('a', 'b')), bi.next()); + assert_eq!(Some(('b', 'c')), bi.next()); + assert_eq!(Some(('c', 'd')), bi.next()); + assert_eq!(Some(('d', 'e')), bi.next()); + assert_eq!(None, bi.next()); + } + + fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) { + assert_eq!(Ok(dist), hamming(str1, str2)); + } + + #[test] + fn hamming_empty() { + assert_hamming_dist(0, "", "") + } + + #[test] + fn hamming_same() { + assert_hamming_dist(0, "hamming", "hamming") + } + + #[test] + fn hamming_numbers() { + assert_eq!(Ok(1), 
generic_hamming(&[1, 2, 4], &[1, 2, 3])); + } + + #[test] + fn hamming_diff() { + assert_hamming_dist(3, "hamming", "hammers") + } + + #[test] + fn hamming_diff_multibyte() { + assert_hamming_dist(2, "hamming", "h香mmüng"); + } + + #[test] + fn hamming_unequal_length() { + assert_eq!( + Err(StrSimError::DifferentLengthArgs), + generic_hamming("ham".chars(), "hamming".chars()) + ); + } + + #[test] + fn hamming_names() { + assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre") + } + + #[test] + fn jaro_both_empty() { + assert_eq!(1.0, jaro("", "")); + } + + #[test] + fn jaro_first_empty() { + assert_eq!(0.0, jaro("", "jaro")); + } + + #[test] + fn jaro_second_empty() { + assert_eq!(0.0, jaro("distance", "")); + } + + #[test] + fn jaro_same() { + assert_eq!(1.0, jaro("jaro", "jaro")); + } + + #[test] + fn jaro_multibyte() { + assert!((0.818 - jaro("testabctest", "testöঙ香test")) < 0.001); + assert!((0.818 - jaro("testöঙ香test", "testabctest")) < 0.001); + } + + #[test] + fn jaro_diff_short() { + assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001); + } + + #[test] + fn jaro_diff_one_character() { + assert_eq!(0.0, jaro("a", "b")); + } + + #[test] + fn jaro_same_one_character() { + assert_eq!(1.0, jaro("a", "a")); + } + + #[test] + fn generic_jaro_diff() { + assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4])); + } + + #[test] + fn jaro_diff_one_and_two() { + assert!((0.83 - jaro("a", "ab")).abs() < 0.01); + } + + #[test] + fn jaro_diff_two_and_one() { + assert!((0.83 - jaro("ab", "a")).abs() < 0.01); + } + + #[test] + fn jaro_diff_no_transposition() { + assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001); + } + + #[test] + fn jaro_diff_with_transposition() { + assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001); + } + + #[test] + fn jaro_names() { + assert!((0.392 - jaro("Friedrich Nietzsche", + "Jean-Paul Sartre")).abs() < 0.001); + } + + #[test] + fn jaro_winkler_both_empty() { + assert_eq!(1.0, jaro_winkler("", "")); + } + + #[test] + fn jaro_winkler_first_empty() { + assert_eq!(0.0, jaro_winkler("", "jaro-winkler")); + } + + #[test] + fn jaro_winkler_second_empty() { + assert_eq!(0.0, jaro_winkler("distance", "")); + } + + #[test] + fn jaro_winkler_same() { + assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler")); + } + + #[test] + fn jaro_winkler_multibyte() { + assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() < + 0.001); + assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() < + 0.001); + } + + #[test] + fn jaro_winkler_diff_short() { + assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001); + assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001); + } + + #[test] + fn jaro_winkler_diff_one_character() { + assert_eq!(0.0, jaro_winkler("a", "b")); + } + + #[test] + fn jaro_winkler_same_one_character() { + assert_eq!(1.0, jaro_winkler("a", "a")); + } + + #[test] + fn jaro_winkler_diff_no_transposition() { + assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001); + } + + #[test] + fn jaro_winkler_diff_with_transposition() { + assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001); + } + + #[test] + fn jaro_winkler_names() { + assert!((0.562 - jaro_winkler("Friedrich Nietzsche", + "Fran-Paul Sartre")).abs() < 0.001); + } + + #[test] + fn jaro_winkler_long_prefix() { + assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() < + 0.001); + } + + #[test] + fn jaro_winkler_more_names() { + assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001); + } + + #[test] + fn 
jaro_winkler_length_of_one() { + assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001); + } + + #[test] + fn jaro_winkler_very_long_prefix() { + assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx", + "thequickbrownfoxjumpedovery")).abs() < + 0.001); + } + + #[test] + fn levenshtein_empty() { + assert_eq!(0, levenshtein("", "")); + } + + #[test] + fn levenshtein_same() { + assert_eq!(0, levenshtein("levenshtein", "levenshtein")); + } + + #[test] + fn levenshtein_diff_short() { + assert_eq!(3, levenshtein("kitten", "sitting")); + } + + #[test] + fn levenshtein_diff_with_space() { + assert_eq!(5, levenshtein("hello, world", "bye, world")); + } + + #[test] + fn levenshtein_diff_multibyte() { + assert_eq!(3, levenshtein("öঙ香", "abc")); + assert_eq!(3, levenshtein("abc", "öঙ香")); + } + + #[test] + fn levenshtein_diff_longer() { + let a = "The quick brown fox jumped over the angry dog."; + let b = "Lorem ipsum dolor sit amet, dicta latine an eam."; + assert_eq!(37, levenshtein(a, b)); + } + + #[test] + fn levenshtein_first_empty() { + assert_eq!(7, levenshtein("", "sitting")); + } + + #[test] + fn levenshtein_second_empty() { + assert_eq!(6, levenshtein("kitten", "")); + } + + #[test] + fn normalized_levenshtein_diff_short() { + assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001); + } + + #[test] + fn normalized_levenshtein_for_empty_strings() { + assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001); + } + + #[test] + fn normalized_levenshtein_first_empty() { + assert!(normalized_levenshtein("", "second").abs() < 0.00001); + } + + #[test] + fn normalized_levenshtein_second_empty() { + assert!(normalized_levenshtein("first", "").abs() < 0.00001); + } + + #[test] + fn normalized_levenshtein_identical_strings() { + assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001); + } + + #[test] + fn osa_distance_empty() { + assert_eq!(0, osa_distance("", "")); + } + + #[test] + fn osa_distance_same() { + assert_eq!(0, osa_distance("damerau", "damerau")); + } + + #[test] + fn osa_distance_first_empty() { + assert_eq!(7, osa_distance("", "damerau")); + } + + #[test] + fn osa_distance_second_empty() { + assert_eq!(7, osa_distance("damerau", "")); + } + + #[test] + fn osa_distance_diff() { + assert_eq!(3, osa_distance("ca", "abc")); + } + + #[test] + fn osa_distance_diff_short() { + assert_eq!(3, osa_distance("damerau", "aderua")); + } + + #[test] + fn osa_distance_diff_reversed() { + assert_eq!(3, osa_distance("aderua", "damerau")); + } + + #[test] + fn osa_distance_diff_multibyte() { + assert_eq!(3, osa_distance("öঙ香", "abc")); + assert_eq!(3, osa_distance("abc", "öঙ香")); + } + + #[test] + fn osa_distance_diff_unequal_length() { + assert_eq!(6, osa_distance("damerau", "aderuaxyz")); + } + + #[test] + fn osa_distance_diff_unequal_length_reversed() { + assert_eq!(6, osa_distance("aderuaxyz", "damerau")); + } + + #[test] + fn osa_distance_diff_comedians() { + assert_eq!(5, osa_distance("Stewart", "Colbert")); + } + + #[test] + fn osa_distance_many_transpositions() { + assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk")); + } + + #[test] + fn osa_distance_diff_longer() { + let a = "The quick brown fox jumped over the angry dog."; + let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; + assert_eq!(36, osa_distance(a, b)); + } + + #[test] + fn osa_distance_beginning_transposition() { + assert_eq!(1, osa_distance("foobar", "ofobar")); + } + + #[test] + fn osa_distance_end_transposition() { + assert_eq!(1, 
osa_distance("specter", "spectre")); + } + + #[test] + fn osa_distance_restricted_edit() { + assert_eq!(4, osa_distance("a cat", "an abct")); + } + + #[test] + fn damerau_levenshtein_empty() { + assert_eq!(0, damerau_levenshtein("", "")); + } + + #[test] + fn damerau_levenshtein_same() { + assert_eq!(0, damerau_levenshtein("damerau", "damerau")); + } + + #[test] + fn damerau_levenshtein_first_empty() { + assert_eq!(7, damerau_levenshtein("", "damerau")); + } + + #[test] + fn damerau_levenshtein_second_empty() { + assert_eq!(7, damerau_levenshtein("damerau", "")); + } + + #[test] + fn damerau_levenshtein_diff() { + assert_eq!(2, damerau_levenshtein("ca", "abc")); + } + + #[test] + fn damerau_levenshtein_diff_short() { + assert_eq!(3, damerau_levenshtein("damerau", "aderua")); + } + + #[test] + fn damerau_levenshtein_diff_reversed() { + assert_eq!(3, damerau_levenshtein("aderua", "damerau")); + } + + #[test] + fn damerau_levenshtein_diff_multibyte() { + assert_eq!(3, damerau_levenshtein("öঙ香", "abc")); + assert_eq!(3, damerau_levenshtein("abc", "öঙ香")); + } + + #[test] + fn damerau_levenshtein_diff_unequal_length() { + assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz")); + } + + #[test] + fn damerau_levenshtein_diff_unequal_length_reversed() { + assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau")); + } + + #[test] + fn damerau_levenshtein_diff_comedians() { + assert_eq!(5, damerau_levenshtein("Stewart", "Colbert")); + } + + #[test] + fn damerau_levenshtein_many_transpositions() { + assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk")); + } + + #[test] + fn damerau_levenshtein_diff_longer() { + let a = "The quick brown fox jumped over the angry dog."; + let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; + assert_eq!(36, damerau_levenshtein(a, b)); + } + + #[test] + fn damerau_levenshtein_beginning_transposition() { + assert_eq!(1, damerau_levenshtein("foobar", "ofobar")); + } + + #[test] + fn damerau_levenshtein_end_transposition() { + assert_eq!(1, damerau_levenshtein("specter", "spectre")); + } + + #[test] + fn damerau_levenshtein_unrestricted_edit() { + assert_eq!(3, damerau_levenshtein("a cat", "an abct")); + } + + #[test] + fn normalized_damerau_levenshtein_diff_short() { + assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001); + } + + #[test] + fn normalized_damerau_levenshtein_for_empty_strings() { + assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001); + } + + #[test] + fn normalized_damerau_levenshtein_first_empty() { + assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001); + } + + #[test] + fn normalized_damerau_levenshtein_second_empty() { + assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001); + } + + #[test] + fn normalized_damerau_levenshtein_identical_strings() { + assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001); + } + + #[test] + fn sorensen_dice_all() { + // test cases taken from + // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11 + + assert_eq!(1.0, sorensen_dice("a", "a")); + assert_eq!(0.0, sorensen_dice("a", "b")); + assert_eq!(1.0, sorensen_dice("", "")); + assert_eq!(0.0, sorensen_dice("a", "")); + assert_eq!(0.0, sorensen_dice("", "a")); + assert_eq!(1.0, sorensen_dice("apple event", "apple event")); + assert_eq!(0.9090909090909091, sorensen_dice("iphone", "iphone x")); + assert_eq!(0.0, sorensen_dice("french", "quebec")); + 
assert_eq!(1.0, sorensen_dice("france", "france")); + assert_eq!(0.2, sorensen_dice("fRaNce", "france")); + assert_eq!(0.8, sorensen_dice("healed", "sealed")); + assert_eq!( + 0.7878787878787878, + sorensen_dice("web applications", "applications of the web") + ); + assert_eq!( + 0.92, + sorensen_dice( + "this will have a typo somewhere", + "this will huve a typo somewhere" + ) + ); + assert_eq!( + 0.6060606060606061, + sorensen_dice( + "Olive-green table for sale, in extremely good condition.", + "For sale: table in very good condition, olive green in colour." + ) + ); + assert_eq!( + 0.2558139534883721, + sorensen_dice( + "Olive-green table for sale, in extremely good condition.", + "For sale: green Subaru Impreza, 210,000 miles" + ) + ); + assert_eq!( + 0.1411764705882353, + sorensen_dice( + "Olive-green table for sale, in extremely good condition.", + "Wanted: mountain bike with at least 21 gears." + ) + ); + assert_eq!( + 0.7741935483870968, + sorensen_dice("this has one extra word", "this has one word") + ); + } +} diff --git a/third_party/rust/strsim/tests/lib.rs b/third_party/rust/strsim/tests/lib.rs new file mode 100644 index 0000000000..ad7af513ae --- /dev/null +++ b/third_party/rust/strsim/tests/lib.rs @@ -0,0 +1,49 @@ +extern crate strsim; + +use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance,damerau_levenshtein, + normalized_damerau_levenshtein, jaro, jaro_winkler}; + +#[test] +fn hamming_works() { + match hamming("hamming", "hammers") { + Ok(distance) => assert_eq!(3, distance), + Err(why) => panic!("{:?}", why) + } +} + +#[test] +fn levenshtein_works() { + assert_eq!(3, levenshtein("kitten", "sitting")); +} + +#[test] +fn normalized_levenshtein_works() { + assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001); +} + +#[test] +fn osa_distance_works() { + assert_eq!(3, osa_distance("ac", "cba")); +} + +#[test] +fn damerau_levenshtein_works() { + assert_eq!(2, damerau_levenshtein("ac", "cba")); +} + +#[test] +fn normalized_damerau_levenshtein_works() { + assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001); +} + +#[test] +fn jaro_works() { + assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() < + 0.001); +} + +#[test] +fn jaro_winkler_works() { + assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() < + 0.001); +} |
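
The doctests and tests above pin down how the three edit distances in this crate relate: plain Levenshtein counts only insertions, deletions, and substitutions; optimal string alignment (`osa_distance`) additionally allows adjacent transpositions but may edit each substring only once; unrestricted Damerau-Levenshtein lifts that restriction. Below is a minimal, self-contained usage sketch, separate from the vendored files themselves, that replays values taken directly from the crate's own doctests to make the difference concrete:

```rust
// Minimal comparison of the three edit distances in strsim 0.10.
// All expected values come from the crate's own doctests above.
extern crate strsim;

use strsim::{damerau_levenshtein, levenshtein, osa_distance};

fn main() {
    // Levenshtein: insertions, deletions, and substitutions only.
    assert_eq!(3, levenshtein("kitten", "sitting"));

    // Optimal string alignment also allows adjacent transpositions, but a
    // transposed pair may not be edited again, so "ab" -> "bca" costs 3.
    assert_eq!(3, osa_distance("ab", "bca"));

    // Unrestricted Damerau-Levenshtein drops that restriction (and satisfies
    // the triangle inequality), so the same pair costs only 2.
    assert_eq!(2, damerau_levenshtein("ab", "bca"));

    println!("all edit-distance checks passed");
}
```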