diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch) | |
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/fluent-langneg | |
parent | Initial commit. (diff) | |
download | firefox-upstream.tar.xz firefox-upstream.zip |
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/fluent-langneg')
-rw-r--r-- | third_party/rust/fluent-langneg/.cargo-checksum.json | 1 | ||||
-rw-r--r-- | third_party/rust/fluent-langneg/Cargo.toml | 61 | ||||
-rw-r--r-- | third_party/rust/fluent-langneg/README.md | 113 | ||||
-rw-r--r-- | third_party/rust/fluent-langneg/benches/negotiate.rs | 40 | ||||
-rw-r--r-- | third_party/rust/fluent-langneg/src/accepted_languages.rs | 41 | ||||
-rw-r--r-- | third_party/rust/fluent-langneg/src/lib.rs | 49 | ||||
-rw-r--r-- | third_party/rust/fluent-langneg/src/negotiate/likely_subtags.rs | 39 | ||||
-rw-r--r-- | third_party/rust/fluent-langneg/src/negotiate/mod.rs | 233 |
8 files changed, 577 insertions, 0 deletions
diff --git a/third_party/rust/fluent-langneg/.cargo-checksum.json b/third_party/rust/fluent-langneg/.cargo-checksum.json new file mode 100644 index 0000000000..bf0abede56 --- /dev/null +++ b/third_party/rust/fluent-langneg/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"1b11d8d30fe978704012e27981f8d50a3462319594b54ed2e71eaf85284d61eb","README.md":"a4f17c795725dcb84cdf1e327a61306e82aaa2ca1908c9ea95c0fbe9d53216fd","benches/negotiate.rs":"f14c49d75413fb4b248f8f586c046340d61f0682eb0860db326f1f415e1bceb9","src/accepted_languages.rs":"74fe73bb8c3f36d3b8b85bfdc55731c234c20e92245b0f89eb1e8b68af47c17c","src/lib.rs":"529e3c9810688c3a5d216c977b968a775f83a85c2da90d669f2cfc5eb6c71361","src/negotiate/likely_subtags.rs":"44531e2bbf3a2155771f197f863dffdce403d3e8dd0e1d4f36f7178e52e5a3a3","src/negotiate/mod.rs":"e8aa5ecf08b866d83c957230586cb9c03880473406d7cca28cadf9e883310a15"},"package":"2c4ad0989667548f06ccd0e306ed56b61bd4d35458d54df5ec7587c0e8ed5e94"}
\ No newline at end of file diff --git a/third_party/rust/fluent-langneg/Cargo.toml b/third_party/rust/fluent-langneg/Cargo.toml new file mode 100644 index 0000000000..58aae3c672 --- /dev/null +++ b/third_party/rust/fluent-langneg/Cargo.toml @@ -0,0 +1,61 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "fluent-langneg" +version = "0.13.0" +authors = ["Zibi Braniecki <gandalf@mozilla.com>"] +include = ["src/**/*", "benches/*.rs", "Cargo.toml", "README.md"] +description = "A library for language and locale negotiation.\n" +homepage = "http://projectfluent.org/" +readme = "README.md" +categories = ["internationalization", "localization"] +license = "Apache-2.0" +repository = "https://github.com/projectfluent/fluent-langneg-rs" + +[[bench]] +name = "negotiate" +harness = false +[dependencies.unic-langid] +version = "0.9" +[dev-dependencies.criterion] +version = "0.3" + +[dev-dependencies.serde] +version = "1.0" +features = ["derive"] + +[dev-dependencies.serde_json] +version = "1.0" + +[dev-dependencies.unic-langid] +version = "0.9" +features = ["macros"] + +[dev-dependencies.unic-locale] +version = "0.9" +features = ["macros"] + +[features] +cldr = ["unic-langid/likelysubtags"] +default = [] +[badges.coveralls] +branch = "master" +repository = "projectfluent/fluent-langneg-rs" +service = "github" + +[badges.maintenance] +status = "actively-developed" + +[badges.travis-ci] +repository = "projectfluent/fluent-langneg-rs" diff --git 
a/third_party/rust/fluent-langneg/README.md b/third_party/rust/fluent-langneg/README.md new file mode 100644 index 0000000000..bdff7649f8 --- /dev/null +++ b/third_party/rust/fluent-langneg/README.md @@ -0,0 +1,113 @@ +# Fluent LangNeg + +**Fluent LangNeg is a library for language and locale identifier negotiation.** + +[![crates.io](http://meritbadge.herokuapp.com/fluent-langneg)](https://crates.io/crates/fluent-langneg) +[![Build Status](https://travis-ci.org/projectfluent/fluent-langneg-rs.svg?branch=master)](https://travis-ci.org/projectfluent/fluent-langneg-rs) +[![Coverage Status](https://coveralls.io/repos/github/projectfluent/fluent-langneg-rs/badge.svg?branch=master)](https://coveralls.io/github/projectfluent/fluent-langneg-rs?branch=master) + +Introduction +------------ + +This is a Rust implementation of the fluent-langneg library, which is a part of Project Fluent. + +The library uses [unic-langid](https://github.com/zbraniecki/unic-locale) and [unic-locale](https://github.com/zbraniecki/unic-locale) to retrieve and operate on Unicode Language and Locale Identifiers. +The library provides an algorithm for negotiating between lists of locales. + +Usage +----- + +```rust +use fluent_langneg::negotiate_languages; +use fluent_langneg::NegotiationStrategy; +use fluent_langneg::convert_vec_str_to_langids_lossy; +use unic_langid::LanguageIdentifier; + +// Since langid parsing from string is fallible, we'll use a helper +// function which strips any langids that failed to parse. 
+let requested = convert_vec_str_to_langids_lossy(&["de-DE", "fr-FR", "en-US"]); +let available = convert_vec_str_to_langids_lossy(&["it", "fr", "de-AT", "fr-CA", "en-US"]); +let default: LanguageIdentifier = "en-US".parse().expect("Parsing langid failed."); + +let supported = negotiate_languages( + &requested, + &available, + Some(&default), + NegotiationStrategy::Filtering +); + +let expected = convert_vec_str_to_langids_lossy(&["de-AT", "fr", "fr-CA", "en-US"]); +assert_eq!(supported, + expected.iter().map(|t| t.as_ref()).collect::<Vec<&LanguageIdentifier>>()); +``` + +See [docs.rs][] for more examples. + +[docs.rs]: https://docs.rs/fluent-langneg/ + +Status +------ + +The implementation is complete according to the fluent-langneg +corpus of tests, which means that it parses, serializes and negotiates as expected. + +The negotiation methods can operate on lists of `LanguageIdentifier` or `Locale`. + +The remaining work on the path to 1.0 is to gain in-field experience of using it, +add more tests and ensure that bad input is correctly handled. + +Compatibility +------------- + +The API is based on [UTS 35][] definition of [Unicode Locale Identifier][] and is aiming to +parse and serialize all locale identifiers according to that definition. + +*Note*: Unicode Locale Identifier is similar to, but different from, what [BCP47][] specifies under +the name Language Tag. +For most locale management and negotiation needs, the Unicode Locale Identifier used in this crate is likely a better choice, +but in some cases, like HTTP Accepted Headers, you may need the complete BCP47 Language Tag implementation which +this crate does not provide. + +Language negotiation algorithms are custom Project Fluent solutions, +based on [RFC4647][]. + +The language negotiation strategies aim to replicate the best-effort matches with +the most limited amount of data. 
The algorithm returns reasonable +results without any database, but the results can be improved with either a limited +or full [CLDR likely-subtags][] database. + +The result is a balance chosen for Project Fluent and may differ from other +implementations of language negotiation algorithms which may choose different +tradeoffs. + +[BCP47]: https://tools.ietf.org/html/bcp47 +[RFC6067]: https://www.ietf.org/rfc/rfc6067.txt +[UTS 35]: http://www.unicode.org/reports/tr35/#Locale_Extension_Key_and_Type_Data +[RFC4647]: https://tools.ietf.org/html/rfc4647 +[CLDR likely-subtags]: http://www.unicode.org/cldr/charts/latest/supplemental/likely_subtags.html +[Unicode Locale Identifier]: http://unicode.org/reports/tr35/#Identifiers + +Alternatives +------------ + +Although Fluent Locale aims to stay close to W3C Accepted Languages, it does not aim +to implement the full behavior and some aspects of the language negotiation strategy +recommended by W3C, such as weights, are not a target right now. + +For such purposes, the [rust-language-tags][] crate seems to be a better choice. + +[rust-language-tags]: https://github.com/pyfisch/rust-language-tags + +Performance +----------- + +The crate is considered to be fully optimized for production. 
+ + +Develop +------- + + cargo build + cargo test + cargo bench + diff --git a/third_party/rust/fluent-langneg/benches/negotiate.rs b/third_party/rust/fluent-langneg/benches/negotiate.rs new file mode 100644 index 0000000000..2ca70d59ec --- /dev/null +++ b/third_party/rust/fluent-langneg/benches/negotiate.rs @@ -0,0 +1,40 @@ +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::Criterion; + +use fluent_langneg::convert_vec_str_to_langids_lossy; +use fluent_langneg::negotiate_languages; + +use unic_langid::LanguageIdentifier; + +#[no_mangle] +#[inline(never)] +fn do_negotiate<'a>( + requested: &[LanguageIdentifier], + available: &'a [LanguageIdentifier], +) -> Vec<&'a LanguageIdentifier> { + negotiate_languages( + requested, + available, + None, + fluent_langneg::NegotiationStrategy::Filtering, + ) +} + +fn negotiate_bench(c: &mut Criterion) { + let requested = &["de", "it", "ru"]; + let available = &[ + "en-US", "fr", "de", "en-GB", "it", "pl", "ru", "sr-Cyrl", "sr-Latn", "zh-Hant", "zh-Hans", + "ja-JP", "he-IL", "de-DE", "de-IT", + ]; + + let requested = convert_vec_str_to_langids_lossy(requested); + let available = convert_vec_str_to_langids_lossy(available); + + c.bench_function("negotiate", |b| { + b.iter(|| do_negotiate(&requested, &available)) + }); +} + +criterion_group!(benches, negotiate_bench); +criterion_main!(benches); diff --git a/third_party/rust/fluent-langneg/src/accepted_languages.rs b/third_party/rust/fluent-langneg/src/accepted_languages.rs new file mode 100644 index 0000000000..58cf277703 --- /dev/null +++ b/third_party/rust/fluent-langneg/src/accepted_languages.rs @@ -0,0 +1,41 @@ +//! This function parses Accept-Language string into a list of language tags that +//! can be later passed to language negotiation functions. +//! +//! # Example: +//! +//! ``` +//! use fluent_langneg::negotiate_languages; +//! use fluent_langneg::NegotiationStrategy; +//! use fluent_langneg::parse_accepted_languages; +//! 
use fluent_langneg::convert_vec_str_to_langids_lossy; +//! use unic_langid::LanguageIdentifier; +//! +//! let requested = parse_accepted_languages("de-AT;0.9,de-DE;0.8,de;0.7,en-US;0.5"); +//! let available = convert_vec_str_to_langids_lossy(&["fr", "pl", "de", "en-US"]); +//! let default: LanguageIdentifier = "en-US".parse().expect("Failed to parse a langid."); +//! +//! let supported = negotiate_languages( +//! &requested, +//! &available, +//! Some(&default), +//! NegotiationStrategy::Filtering +//! ); +//! +//! let expected = convert_vec_str_to_langids_lossy(&["de", "en-US"]); +//! assert_eq!(supported, +//! expected.iter().map(|t| t.as_ref()).collect::<Vec<&LanguageIdentifier>>()); +//! ``` +//! +//! This function ignores the weights associated with the locales, since Fluent Locale +//! language negotiation only uses the order of locales, not the weights. +//! + +use unic_langid::LanguageIdentifier; + +pub fn parse(s: &str) -> Vec<LanguageIdentifier> { + s.split(',') + .map(|t| t.trim().split(';').nth(0).unwrap()) + .filter(|t| !t.is_empty()) + .filter_map(|t| t.parse().ok()) + .collect() +} diff --git a/third_party/rust/fluent-langneg/src/lib.rs b/third_party/rust/fluent-langneg/src/lib.rs new file mode 100644 index 0000000000..865bfc2758 --- /dev/null +++ b/third_party/rust/fluent-langneg/src/lib.rs @@ -0,0 +1,49 @@ +//! fluent-langneg is an API for operating on locales and language tags. +//! It's part of Project Fluent, a localization framework designed to unleash +//! the expressive power of the natural language. +//! +//! The primary use of fluent-langneg is to parse/modify/serialize language tags +//! and to perform language negotiation. +//! +//! fluent-langneg operates on a subset of [BCP47](http://tools.ietf.org/html/bcp47). +//! It can parse full BCP47 language tags, and will serialize them back, +//! but currently only allows for operations on primary subtags and +//! unicode extension keys. +//! +//! 
In result fluent-langneg is not suited to replace full implementations of +//! BCP47 like [rust-language-tags](https://github.com/pyfisch/rust-language-tags), +//! but is arguably a better option for use cases involving operations on +//! language tags and for language negotiation. + +pub mod accepted_languages; +pub mod negotiate; + +pub use accepted_languages::parse as parse_accepted_languages; +pub use negotiate::negotiate_languages; +pub use negotiate::NegotiationStrategy; + +use unic_langid::{LanguageIdentifier, LanguageIdentifierError}; + +pub fn convert_vec_str_to_langids<'a, I, J>( + input: I, +) -> Result<Vec<LanguageIdentifier>, LanguageIdentifierError> +where + I: IntoIterator<Item = J>, + J: AsRef<[u8]> + 'a, +{ + input + .into_iter() + .map(|s| LanguageIdentifier::from_bytes(s.as_ref())) + .collect() +} + +pub fn convert_vec_str_to_langids_lossy<'a, I, J>(input: I) -> Vec<LanguageIdentifier> +where + I: IntoIterator<Item = J>, + J: AsRef<[u8]> + 'a, +{ + input + .into_iter() + .filter_map(|t| LanguageIdentifier::from_bytes(t.as_ref()).ok()) + .collect() +} diff --git a/third_party/rust/fluent-langneg/src/negotiate/likely_subtags.rs b/third_party/rust/fluent-langneg/src/negotiate/likely_subtags.rs new file mode 100644 index 0000000000..60a7b7a525 --- /dev/null +++ b/third_party/rust/fluent-langneg/src/negotiate/likely_subtags.rs @@ -0,0 +1,39 @@ +use unic_langid::LanguageIdentifier; + +static REGION_MATCHING_KEYS: &[&str] = &[ + "az", "bg", "cs", "de", "es", "fi", "fr", "hu", "it", "lt", "lv", "nl", "pl", "ro", "ru", +]; + +pub trait MockLikelySubtags { + fn maximize(&mut self) -> bool; +} + +impl MockLikelySubtags for LanguageIdentifier { + fn maximize(&mut self) -> bool { + let extended = match self.to_string().as_str() { + "en" => "en-Latn-US", + "fr" => "fr-Latn-FR", + "sr" => "sr-Cyrl-SR", + "sr-RU" => "sr-Latn-SR", + "az-IR" => "az-Arab-IR", + "zh-GB" => "zh-Hant-GB", + "zh-US" => "zh-Hant-US", + _ => { + let lang = self.language; + + for subtag 
in REGION_MATCHING_KEYS { + if lang == *subtag { + self.region = Some(subtag.parse().unwrap()); + return true; + } + } + return false; + } + }; + let langid: LanguageIdentifier = extended.parse().expect("Failed to parse langid."); + self.language = langid.language; + self.script = langid.script; + self.region = langid.region; + true + } +} diff --git a/third_party/rust/fluent-langneg/src/negotiate/mod.rs b/third_party/rust/fluent-langneg/src/negotiate/mod.rs new file mode 100644 index 0000000000..4b3587fd40 --- /dev/null +++ b/third_party/rust/fluent-langneg/src/negotiate/mod.rs @@ -0,0 +1,233 @@ +//! Language Negotiation is a process in which locales from different +//! sources are filtered and sorted in an effort to produce the best +//! possible selection of them. +//! +//! There are multiple language negotiation strategies, the most popular is +//! described in [RFC4647](https://www.ietf.org/rfc/rfc4647.txt). +//! +//! The algorithm is based on the RFC4647 3.3.2 Extended Filtering algorithm, +//! with several modifications. +//! +//! # Example: +//! +//! ``` +//! use fluent_langneg::negotiate_languages; +//! use fluent_langneg::NegotiationStrategy; +//! use fluent_langneg::convert_vec_str_to_langids_lossy; +//! use unic_langid::LanguageIdentifier; +//! +//! let requested = convert_vec_str_to_langids_lossy(&["pl", "fr", "en-US"]); +//! let available = convert_vec_str_to_langids_lossy(&["it", "de", "fr", "en-GB", "en_US"]); +//! let default: LanguageIdentifier = "en-US".parse().expect("Parsing langid failed."); +//! +//! let supported = negotiate_languages( +//! &requested, +//! &available, +//! Some(&default), +//! NegotiationStrategy::Filtering +//! ); +//! +//! let expected = convert_vec_str_to_langids_lossy(&["fr", "en-US", "en-GB"]); +//! assert_eq!(supported, +//! expected.iter().map(|t| t.as_ref()).collect::<Vec<&LanguageIdentifier>>()); +//! ``` +//! +//! # The exact algorithm is custom, and consists of a 6 level strategy: +//! +//! 
### 1) Attempt to find an exact match for each requested locale in available locales. +//! +//! Example: +//! +//! ```text +//! // [requested] * [available] = [supported] +//! +//! ["en-US"] * ["en-US"] = ["en-US"] +//! ``` +//! +//! ### 2) Attempt to match a requested locale to an available locale treated as a locale range. +//! +//! Example: +//! +//! ```text +//! // [requested] * [available] = [supported] +//! +//! ["en-US"] * ["en"] = ["en"] +//! ^^ +//! |-- becomes "en-*-*-*" +//! ``` +//! +//! ### 3) Maximize the requested locale to find the best match in available locales. +//! +//! This part uses ICU's likelySubtags or similar database. +//! +//! Example: +//! +//! ```text +//! // [requested] * [available] = [supported] +//! +//! ["en"] * ["en-GB", "en-US"] = ["en-US"] +//! ^^ ^^^^^ ^^^^^ +//! | | | +//! | |----------- become "en-*-GB-*" and "en-*-US-*" +//! | +//! |-- ICU likelySubtags expands it to "en-Latn-US" +//! ``` +//! +//! ### 4) Attempt to look up for a different variant of the same locale. +//! +//! Example: +//! +//! ```text +//! // [requested] * [available] = [supported] +//! +//! ["ja-JP-win"] * ["ja-JP-mac"] = ["ja-JP-mac"] +//! ^^^^^^^^^ ^^^^^^^^^ +//! | |-- become "ja-*-JP-mac" +//! | +//! |----------- replace variant with range: "ja-JP-*" +//! ``` +//! +//! ### 5) Look up for a maximized version of the requested locale, stripped of the region code. +//! +//! Example: +//! +//! ```text +//! // [requested] * [available] = [supported] +//! +//! ["en-CA"] * ["en-ZA", "en-US"] = ["en-US", "en-ZA"] +//! ^^^^^ +//! | ^^^^^ ^^^^^ +//! | | | +//! | |----------- become "en-*-ZA-*" and "en-*-US-*" +//! | +//! |----------- strip region produces "en", then lookup likelySubtag: "en-Latn-US" +//! ``` +//! +//! +//! ### 6) Attempt to look up for a different region of the same locale. +//! +//! Example: +//! +//! ```text +//! // [requested] * [available] = [supported] +//! +//! ["en-GB"] * ["en-AU"] = ["en-AU"] +//! ^^^^^ ^^^^^ +//! 
| |-- become "en-*-AU-*" +//! | +//! |----- replace region with range: "en-*" +//! ``` +//! + +use unic_langid::LanguageIdentifier; + +#[cfg(not(feature = "cldr"))] +mod likely_subtags; +#[cfg(not(feature = "cldr"))] +use likely_subtags::MockLikelySubtags; + +#[derive(PartialEq, Debug, Clone, Copy)] +pub enum NegotiationStrategy { + Filtering, + Matching, + Lookup, +} + +pub fn filter_matches<'a, R: 'a + AsRef<LanguageIdentifier>, A: 'a + AsRef<LanguageIdentifier>>( + requested: &[R], + available: &'a [A], + strategy: NegotiationStrategy, +) -> Vec<&'a A> { + let mut supported_locales = vec![]; + + let mut available_locales: Vec<&A> = available.iter().collect(); + + for req in requested { + let mut req = req.as_ref().to_owned(); + macro_rules! test_strategy { + ($self_as_range:expr, $other_as_range:expr) => {{ + let mut match_found = false; + available_locales.retain(|locale| { + if strategy != NegotiationStrategy::Filtering && match_found { + return true; + } + + if locale + .as_ref() + .matches(&req, $self_as_range, $other_as_range) + { + match_found = true; + supported_locales.push(*locale); + return false; + } + true + }); + + if match_found { + match strategy { + NegotiationStrategy::Filtering => {} + NegotiationStrategy::Matching => continue, + NegotiationStrategy::Lookup => break, + } + } + }}; + } + + // 1) Try to find a simple (case-insensitive) string match for the request. + test_strategy!(false, false); + + // 2) Try to match against the available locales treated as ranges. + test_strategy!(true, false); + + // Per Unicode TR35, 4.4 Locale Matching, we don't add likely subtags to + // requested locales, so we'll skip it from the rest of the steps. 
+ if req.language.is_empty() { + continue; + } + + // 3) Try to match against a maximized version of the requested locale + if req.maximize() { + test_strategy!(true, false); + } + + // 4) Try to match against a variant as a range + req.clear_variants(); + test_strategy!(true, true); + + // 5) Try to match against the likely subtag without region + req.region = None; + if req.maximize() { + test_strategy!(true, false); + } + + // 6) Try to match against a region as a range + req.region = None; + test_strategy!(true, true); + } + + supported_locales +} + +pub fn negotiate_languages< + 'a, + R: 'a + AsRef<LanguageIdentifier>, + A: 'a + AsRef<LanguageIdentifier> + PartialEq, +>( + requested: &[R], + available: &'a [A], + default: Option<&'a A>, + strategy: NegotiationStrategy, +) -> Vec<&'a A> { + let mut supported = filter_matches(requested, available, strategy); + + if let Some(default) = default { + if strategy == NegotiationStrategy::Lookup { + if supported.is_empty() { + supported.push(default); + } + } else if !supported.contains(&default) { + supported.push(default); + } + } + supported +} |